"""
Data scraping module.
"""
import asyncio
from typing import Optional, Union

from sqlalchemy import select, desc

import dock
from dock.govc import govc_scrape_dept_feedback, govc_scrape_return_visit, govc_scrape_finish_info, govc_scrape_order
from models.govc_task import GovcTask
from paste.core.logging import echo_log
from paste.web import requests


async def fetch_govc_task(fetch_size: int = 60, task_id: Optional[Union[str, int, list]] = None):
    """
    Scrape the pending-task list, then scrape the detail data of selected tasks.

    The task list is always scraped first. Afterwards the ``GovcTask`` rows are
    queried (newest id first) and, for every selected row, three detail
    requests are queued: department feedback, return visit and finish info.
    The three queues are then drained concurrently.

    :param fetch_size: passed to the task-list request; also the number of
        most recent tasks whose details are scraped when ``task_id`` is falsy
    :param task_id: optional task id to restrict the detail scrape to; a list
        of ids is also accepted. Any falsy value (``None``, ``0``, ``""``,
        ``[]``) selects the latest ``fetch_size`` tasks instead.
    """
    echo_log("开始抓取待办数据...")
    task_request = await govc_scrape_order.get_task_request(fetch_size=fetch_size)
    request_queue = asyncio.Queue()
    await request_queue.put(task_request)
    await requests.async_concurrency(
        request_queue,
        retry=dock.MAX_RETRY_COUNT,
        after_request=govc_scrape_order.after_task_request,
    )
    echo_log("待办数据抓取完成...")

    # Read the task rows back so that details are scraped for the latest data.
    query = select(
        GovcTask.id,
        GovcTask.pvi_guid,
        GovcTask.c_guid,
    ).order_by(
        desc(GovcTask.id)
    )
    if task_id:
        if isinstance(task_id, list):
            query = query.where(GovcTask.id.in_(task_id))
            echo_log(f"开始抓取待办列表:{task_id} 的详细数据...")
        else:
            query = query.where(GovcTask.id == task_id)
            echo_log(f"开始抓取待办:{task_id} 的详细数据...")
    else:
        echo_log(f"开始抓取前 {fetch_size} 条待办的详细数据...")
        query = query.limit(fetch_size)
    task_df = await GovcTask.query_as_df(query)

    # One entry per kind of detail data:
    # (request factory, completion callback, queue of pending requests).
    # NOTE(review): the original referenced
    # `govc_scrape_result_info.after_result_info_request`, a module that is
    # never imported (NameError at runtime). The return-visit callback name
    # below follows the get_*/after_* naming of the sibling modules -- confirm
    # that it exists in `govc_scrape_return_visit`.
    detail_scrapers = (
        (
            govc_scrape_dept_feedback.get_feedback_request,
            govc_scrape_dept_feedback.after_feedback_request,
            asyncio.Queue(),
        ),
        (
            govc_scrape_return_visit.get_return_visit_request,
            govc_scrape_return_visit.after_return_visit_request,
            asyncio.Queue(),
        ),
        (
            govc_scrape_finish_info.get_finish_info_request,
            govc_scrape_finish_info.after_finish_info_request,
            asyncio.Queue(),
        ),
    )

    # Fill the queues with request objects, tagging each with its task id.
    echo_log("正在准备请求队列...")
    id_key = GovcTask.id.key
    pvi_guid_key = GovcTask.pvi_guid.key
    c_guid_key = GovcTask.c_guid.key
    for _, row in task_df.iterrows():
        row_task_id = row.get(id_key)
        pvi_guid = row.get(pvi_guid_key)
        c_guid = row.get(c_guid_key)
        for build_request, _after_request, queue in detail_scrapers:
            detail_request = await build_request(pvi_guid, c_guid)
            detail_request.task_id = row_task_id
            await queue.put(detail_request)

    echo_log("抓取待办详细数据...")
    tasks = [
        requests.async_concurrency(
            queue,
            con_count=dock.CONCURRENCY_COUNT,
            retry=dock.MAX_RETRY_COUNT,
            after_request=after_request,
        )
        for _build_request, after_request, queue in detail_scrapers
    ]
    await asyncio.gather(*tasks)


if __name__ == "__main__":
    from paste.core import aio_pool

    _runner = aio_pool.get_aio_runner()
    _runner(fetch_govc_task(10))