初始化项目
This commit is contained in:
@@ -0,0 +1,215 @@
|
||||
"""
|
||||
公共对接模块。
|
||||
"""
|
||||
import asyncio
|
||||
import random
|
||||
import re
|
||||
from http.cookies import SimpleCookie
|
||||
from typing import Optional, Dict, Any
|
||||
|
||||
from tornado.httpclient import HTTPRequest, HTTPResponse
|
||||
|
||||
from paste.web import requests
|
||||
|
||||
# Pool of desktop (PC) browser User-Agent strings.
# get_random_user_agent() draws one entry from this list at random.
USER_AGENTS = [
    # Chrome on Windows 10
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',

    # Firefox on Windows 10
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',

    # Edge on Windows 11 (Chromium-based); note the string also carries a Chrome/ token
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.2210.91',

    # Safari on macOS
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',

    # Chrome on macOS
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',

    # Firefox on macOS
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 14.2; rv:121.0) Gecko/20100101 Firefox/121.0',

    # Chrome on Linux
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',

    # Opera on Windows (Chromium-based); note the string also carries a Chrome/ token
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 OPR/106.0.0.0',
]
# The bare string below is the original attribute note; in English it reads:
# "List of common PC browser User-Agents."
"""
常用 PC 端浏览器 User-Agent 列表。
"""
|
||||
|
||||
# Default timeout duration.
# NOTE(review): presumably seconds (new_http_request documents its timeout in
# seconds); this constant is not referenced in the visible part of the file.
DEFAULT_TIMEOUT = 120.0
"""
默认超时时长。
"""

# Maximum number of concurrent requests.
# NOTE(review): not referenced in the visible part of the file; scrape_cookies
# passes con_count=1 explicitly instead of using this value.
CONCURRENCY_COUNT = 5
"""
最大并发次数。
"""

# Maximum number of retries; scrape_cookies passes this as the ``retry``
# argument of requests.async_concurrency.
MAX_RETRY_COUNT = 5
"""
最大重试次数。
"""
|
||||
|
||||
|
||||
def get_random_user_agent() -> tuple[str, str, str]:
    """
    Pick a random entry from ``USER_AGENTS`` and describe it.

    The browser version is extracted from the product token that identifies
    the actual browser. Edge and Opera are Chromium-based and therefore also
    advertise a ``Chrome/`` token, so their own tokens (``Edg/``, ``OPR/``)
    are tested first; otherwise they would be reported with the Chromium
    version and their dedicated rules could never run.

    Returns:
        tuple: (user_agent: str, browser_version: str, os_name: str).
        ``browser_version`` and ``os_name`` are ``"Unknown"`` when nothing
        recognisable is found in the User-Agent string.
    """
    ua: str = random.choice(USER_AGENTS)

    # (tokens that must all be present, regex capturing the version).
    # Ordered from most specific to least specific; the first rule whose
    # tokens are present decides the outcome.
    version_rules = (
        (("Edg/",), r"Edg/(\d+\.\d+\.\d+\.\d+)"),
        (("OPR/",), r"OPR/(\d+\.\d+\.\d+\.\d+)"),
        (("Chrome/",), r"Chrome/(\d+\.\d+\.\d+\.\d+)"),
        (("Firefox/",), r"Firefox/(\d+\.\d+)"),
        (("Version/", "Safari/"), r"Version/(\d+\.\d+)"),  # Safari
    )

    browser_version: str = "Unknown"
    for tokens, pattern in version_rules:
        if all(token in ua for token in tokens):
            match = re.search(pattern, ua)
            if match:
                browser_version = match.group(1)
            # Only the first applicable rule is consulted, even if its
            # regex fails to match.
            break

    # Operating system: first known name contained in the UA string.
    os_name: str = next(
        (name for name in ("Mac", "Windows", "Linux") if name in ua),
        "Unknown",
    )

    return ua, browser_version, os_name
|
||||
|
||||
|
||||
def get_cookies(response: HTTPResponse) -> Dict[str, str]:
    """
    Read the cookies carried by a response object.

    Every ``Set-Cookie`` header of the response is loaded into a single
    ``SimpleCookie`` jar; the result maps each cookie name to its value.

    :param response: the HTTP response object
    :return: the extracted cookies as ``{name: value}``
    """
    set_cookie_headers = response.headers.get_list('Set-Cookie')

    jar = SimpleCookie()
    for header_value in set_cookie_headers:
        jar.load(header_value)

    return {name: morsel.value for name, morsel in jar.items()}
|
||||
|
||||
|
||||
def get_cookie_value(cookies_string: Optional[str], cookie_name: str) -> Optional[str]:
    """
    Look up a cookie value by name in a ``Cookie``-header style string.

    Args:
        cookies_string: text of the form ``"key1=value1; key2=value2; ..."``.
            ``None`` or an empty string is treated as "no cookies".
        cookie_name: name of the cookie to look for (exact, case-sensitive
            match).

    Returns:
        The value of the first matching cookie, or ``None`` when the name is
        not present. Segments without an ``=`` are ignored.
    """
    # A missing header is commonly passed straight in as None; treat it like
    # an empty cookie string instead of failing on ``None.split``.
    if not cookies_string:
        return None

    for segment in cookies_string.split(';'):
        # Split on the first '=' only, so values may themselves contain '='.
        key, separator, value = segment.strip().partition('=')
        if separator and key == cookie_name:
            return value

    return None
|
||||
|
||||
|
||||
def new_http_request(
        url: str,
        body: Optional[Dict[str, Any]] = None,
        method: str = 'POST',
        timeout: Optional[float] = None,
        follow_redirects: bool = True,
        use_form: bool = False,
        extra_headers: Optional[Dict[str, str]] = None,
        **kwargs
) -> HTTPRequest:
    """
    Create an HTTPRequest object.

    This is a thin wrapper: all arguments are forwarded unchanged, in
    declaration order, to ``requests.build_http_request``. The contract
    described here is implemented by that function, not in this wrapper.

    GET and POST are supported:
    - GET: parameters are sent in the URL query string
    - POST: parameters are sent as a JSON body or as a form (see ``use_form``)

    :param url: full URL of the request
    :param body: request data (dict); query parameters for GET, JSON or form
        data for POST
    :param method: HTTP method, only 'GET' or 'POST' is supported
    :param timeout: request timeout in seconds
    :param follow_redirects: whether redirects are followed
    :param use_form: if True, POST uses application/x-www-form-urlencoded;
        otherwise JSON
    :param extra_headers: optional additional request headers, e.g. Cookie or
        Authorization
    :param kwargs: further arguments accepted by tornado.httpclient.HTTPRequest
    :return: a tornado.httpclient.HTTPRequest object
    :raises ValueError: when ``method`` is not valid (raised by the delegate)
    """
    forwarded = (
        url, body, method, timeout, follow_redirects, use_form, extra_headers,
    )
    return requests.build_http_request(*forwarded, **kwargs)
|
||||
|
||||
|
||||
async def scrape_cookies(url: str, timeout: Optional[float] = 10,
                         extra_headers: Optional[Dict[str, str]] = None,
                         **kwargs) -> Dict[str, str]:
    """
    Send a GET request to ``url`` to obtain the cookies issued by the server
    (e.g. JSESSIONID).

    The response body is ignored; only the ``Set-Cookie`` response headers are
    read, via ``get_cookies``.

    :param url: URL to fetch the cookies from
    :param timeout: request timeout
    :param extra_headers: additional request headers
    :param kwargs: further arguments forwarded to ``new_http_request``
    :return: the parsed cookies as ``{'name': 'value', ...}``
    :raises RuntimeError: when the response callback was never invoked, i.e.
        no response was delivered
    """
    # Stays None until the callback below has seen a response; this is what
    # distinguishes "no response at all" from "response without cookies" ({}).
    cookies: Optional[Dict[str, str]] = None

    def after_request(response: HTTPResponse, retry_queue: asyncio.Queue[HTTPRequest]):
        # Callback handed to requests.async_concurrency; retry_queue belongs
        # to the callback signature but is not needed here.
        nonlocal cookies
        cookies = get_cookies(response)

    request = new_http_request(
        url=url,
        method='GET',
        timeout=timeout,
        follow_redirects=True,
        extra_headers=extra_headers,
        **kwargs
    )

    # A single request, run through the shared concurrency helper with a
    # concurrency of one so it gets the helper's retry handling.
    request_queue: asyncio.Queue[HTTPRequest] = asyncio.Queue()
    await request_queue.put(request)
    await requests.async_concurrency(
        request_queue, con_count=1, retry=MAX_RETRY_COUNT,
        after_request=after_request
    )

    if cookies is None:
        # No usable response was received.
        raise RuntimeError("未能读取到 Cookies.")

    return cookies
|
||||
Reference in New Issue
Block a user