import asyncio
import json

import pandas as pd
from tornado.httpclient import HTTPResponse, HTTPRequest

import dock
from dock.govc import govc_api, govc_security
from models.govc_task import GovcTask
import models
from paste.core.logging import echo_log
from paste.util import udict
from paste.web import requests


async def get_task_request(fetch_size: int = 100):
    """
    Build the API request for the municipal 12345 task list.

    Assembles the form payload expected by the task-list grid endpoint
    (the original author's note describes it as a POST form submission that
    returns one page of tasks) and hands it to ``govc_api.new_api_request``.

    :param fetch_size: number of rows to request (used as the page size)
    :return: whatever ``govc_api.new_api_request`` returns; the ``__main__``
        section of this file puts it on a request queue
    """
    api_url = "/rest/bmfw/business/taskinfo/query/sztaskquerylistaction/getDataGridData?moduleGuid=94835dc2-a76f-489b-ae76-ea8ed699a113"
    headers = {
        'Referer': f'{govc_api.ApiUrl}/bmfw/business/taskinfo/query/taskquerylist?moduleGuid=94835dc2-a76f-489b-ae76-ea8ed699a113',
    }

    # Design note from the original author: a shared request object and a
    # per-request object should eventually be merged; the field that mainly
    # needs merging is expected to be ``cmdParams``.
    # Only the login id is used here; the second value from get_cookies()
    # is not needed by this function.
    epoint_user_loginid, _ = await govc_security.get_cookies()

    # Page controls, serialized below into the ``commonDto`` form field.
    # Key order is kept exactly as in the original payload.
    controls = [
        {
            "id": "cserial",
            "bind": "cnsTinfo.serialnum",
            "type": "textbox",
            "action": "",
            "value": "",
            "text": ""
        },
        {
            "id": "rqsttitle",
            "bind": "cnsTinfo.rqsttitle",
            "type": "textbox",
            "action": "",
            "value": "",
            "text": ""
        },
        {
            "id": "mini-12",
            "bind": "cnsTinfo.handleouname",
            "type": "textbox",
            "action": "",
            "value": "",
            "text": ""
        },
        {
            "id": "rqsttype",
            "bind": "cnsTinfo.rqsttype",
            "type": "combobox",
            "action": "getRqsttypeModel",
            "textField": "text",
            "valueField": "id",
            "pinyinField": "tag",
            "columns": [],
            "value": "",
            "text": ""
        },
        {
            "id": "search_tstatus",
            "bind": "cnsTinfo.tstatus",
            "type": "combobox",
            "action": "getTstatusModel",
            "textField": "text",
            "valueField": "id",
            "pinyinField": "tag",
            "columns": [],
            "value": "",
            "text": ""
        },
        {
            "id": "rqschannel",
            "bind": "cnsTinfo.rqschannel",
            "type": "combobox",
            "action": "getRqschannelModel",
            "textField": "text",
            "valueField": "id",
            "pinyinField": "tag",
            "columns": [],
            "value": "",
            "text": ""
        },
        {
            "id": "rqstcontent",
            "bind": "cnsTinfo.rqstcontent",
            "type": "textbox",
            "action": "",
            "value": "",
            "text": ""
        },
        {
            "id": "startDate",
            "bind": "startDate",
            "type": "datepicker",
            "action": "",
            "format": "yyyy/MM/dd HH:mm:ss",
            "value": "",
            "text": ""
        },
        {
            "id": "endDate",
            "bind": "endDate",
            "type": "datepicker",
            "action": "",
            "format": "yyyy/MM/dd HH:mm:ss",
            "value": "",
            "text": ""
        },
        {
            "id": "dataexport",
            "type": "dataexport",
            "action": "getExportBigDataModel",
            "mapClass": "com.epoint.basic.faces.export.DataExport",
            "exportAction": "zwztexportaction.export"
        },
        {
            "id": "datagrid",
            "type": "datagrid",
            "action": "getDataGridData",
            "idField": "rowguid",
            "pageIndex": 0,
            "sortField": "",
            "sortOrder": "desc",
            "columns": [
                {"fieldName": "serialnum"},
                {"fieldName": "rqsttitle"},
                {"fieldName": "rqstcontent"},
                {"fieldName": "createdate", "format": "yyyy-MM-dd HH:mm:ss"},
                {"fieldName": "handleouname"},
                {"fieldName": "finishtime_bf", "format": "yyyy-MM-dd HH:mm:ss"},
                {"fieldName": "backtime_bf", "format": "yyyy-MM-dd HH:mm:ss"},
                {"fieldName": "tstatus", "code": "任务单状态"}
            ],
            "pageSize": fetch_size,
            "url": "getDataGridData",
            "data": [],
            "isSecondRequest": True
        },
        {
            "id": "_common_hidden_viewdata",
            "type": "hidden",
            "value": json.dumps({'epoint_user_loginid': epoint_user_loginid}, separators=(',', ':'))
        }
    ]

    request_body = {
        "commonDto": json.dumps(controls, separators=(',', ':')),
        "cmdParams": json.dumps({
            'pageUrl': api_url
        }, separators=(',', ':')),
        'pageIndex': 0,
        'pageSize': fetch_size,
        'sortField': '',
        'sortOrder': 'desc',
        'isSecondRequest': 'true'
    }

    # Build the API request.
    return await govc_api.new_api_request(api_url, request_body, headers=headers)


def _int_or_none(value):
    """Return ``value`` unchanged when it is an ``int``, otherwise ``None``."""
    return value if isinstance(value, int) else None


async def after_task_request(response: HTTPResponse, retry_queue: asyncio.Queue[HTTPRequest]):
    """
    Handle the response of a task-list request.

    Decodes the JSON body, reads the row list at ``controls.0.data``, renames
    the columns to the database field names, normalizes empty values to
    ``None`` and saves the rows through ``GovcTask.save_batch``.

    :param response: the HTTP response object
    :param retry_queue: queue of requests waiting to be retried
    """
    response_body = response.body.decode()
    response_data = json.loads(response_body)
    list_data: list[dict] = udict.get_by_path(response_data, 'controls.0.data')

    if list_data:
        mapped_df = pd.DataFrame(list_data)

        # GovcTask.FieldMapping maps table field -> source field; invert it so
        # source column names are renamed to match the database table.
        forward_mapping = {dict_f: table_f for table_f, dict_f in GovcTask.FieldMapping.items()}
        mapped_df = mapped_df.rename(columns=forward_mapping)

        # Set timestamp fields that are not integers to None.
        for column in (
            GovcTask.finish_time.key,
            GovcTask.sign_time.key,
            GovcTask.sign_time_bf.key,
        ):
            mapped_df[column] = mapped_df[column].apply(_int_or_none)

        # Replace all "empty" values with None so they are stored as NULL in
        # the database.
        mapped_df.replace(models.EmptyInDF + models.EmptyDatetimeInDF, None, inplace=True)

        # Original note here: "filter data status". The visible code only
        # saves the batch and reports the created/updated counts.
        _created, _updated = await GovcTask.save_batch(mapped_df)
        echo_log(f"成功创建企业待办:{_created}条,更新:{_updated}条.")
    else:
        echo_log('未获取到企业待办数据')

    # asyncio.Queue defines neither __bool__ nor __len__, so a bare
    # ``if retry_queue:`` is true even for an empty queue. Check emptiness
    # explicitly so the message is only logged when requests are pending.
    if retry_queue is not None and not retry_queue.empty():
        echo_log(f"企业待办重试队列中有:{retry_queue.qsize()} 个请求在等待.")


if __name__ == "__main__":
    from paste.core import aio_pool

    async def scrape():
        """Queue a single task-list request and run it through the concurrent requester."""
        task_request = await get_task_request()

        request_queue = asyncio.Queue()
        await request_queue.put(task_request)

        await requests.async_concurrency(
            request_queue,
            retry=dock.MAX_RETRY_COUNT,
            after_request=after_task_request
        )

    _runner = aio_pool.get_aio_runner()
    _runner(scrape())