Squashed 'paste-framework/' content from commit 34e8684
git-subtree-dir: paste-framework git-subtree-split: 34e8684c4bc3cebbe177509f42ab4ef5b5425a7a
This commit is contained in:
@@ -0,0 +1,214 @@
|
||||
"""
|
||||
基本公共函数。
|
||||
"""
|
||||
import base64
|
||||
import os
|
||||
import re
|
||||
from typing import Union
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import requests
|
||||
|
||||
from paste.db import basemodel
|
||||
|
||||
|
||||
def fetch_image(img_url: str) -> tuple[requests.Response, str]:
    """
    Fetch an external image.

    The request is issued with ``stream=True``, so the body is not read here;
    the caller consumes (and should close) the returned response.

    :param img_url: URL of the image
    :return: ``(response, content_type)``; the content type falls back to
        ``image/jpeg`` when the server sends no ``Content-Type`` header
    :raises ValueError: the URL has no scheme or no network location
    :raises requests.exceptions.RequestException: the request failed or the
        server answered with an error status
    """
    # Reject anything that does not at least look like "<scheme>://<host>".
    url_parts = urlparse(img_url)
    if not (url_parts.scheme and url_parts.netloc):
        raise ValueError("Invalid URL")

    # Present a browser-like User-Agent on the request.
    browser_user_agent = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.124 Safari/537.36'
    )

    response = requests.get(
        img_url,
        headers={'User-Agent': browser_user_agent},
        stream=True,
        timeout=10,
    )
    response.raise_for_status()

    return response, response.headers.get('Content-Type', 'image/jpeg')
|
||||
|
||||
|
||||
def save_image_to_dir(image_data: bytes, image_type: str, output_dir: str) -> str:
    """
    Write image bytes to a new file under ``output_dir`` and return its path.

    The file name is a freshly generated id plus ``image_type`` as extension.
    ``output_dir`` is resolved against the current working directory and is
    created if it does not exist yet.

    :param image_data: raw image bytes
    :param image_type: file extension without the dot (e.g. 'jpg', 'png')
    :param output_dir: target directory (e.g. 'static/upload/article/images')
    :return: ``output_dir`` joined with the new file name, using forward
        slashes and always starting with ``/``
    """
    file_name = f"{basemodel.BaseModel.newId()}.{image_type}"

    # Absolute location on disk, anchored at the current working directory.
    target = os.path.abspath(os.path.join(os.curdir, output_dir, file_name))
    target_dir, _ = os.path.split(target)
    os.makedirs(target_dir, exist_ok=True)

    with open(target, 'wb') as out:
        out.write(image_data)

    # Normalise Windows separators and force a leading slash.
    web_path = os.path.join(output_dir, file_name).replace('\\', '/')
    return web_path if web_path.startswith('/') else '/' + web_path
|
||||
|
||||
|
||||
def download_and_save_image(url: str, output_dir: str) -> Union[str, None]:
    """
    Download an image from an external URL and store it under ``output_dir``.

    The file extension is derived from the response's ``Content-Type``
    subtype (case-insensitively, with ``svg+xml`` mapped to ``svg`` and
    ``x-ms-bmp`` to ``bmp``). Any subtype outside the allow-list is stored
    with the ``jpg`` extension.

    This is a best-effort helper: every failure (invalid URL, network error,
    disk error) is logged as a warning and reported as ``None`` instead of
    being raised.

    :param url: full URL of the external image
    :param output_dir: target directory
    :return: path of the stored file as returned by ``save_image_to_dir``,
        or ``None`` when anything went wrong
    """
    import logging  # local import: keeps this block self-contained

    allowed_extensions = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'svg', 'bmp'}
    # MIME subtypes whose name differs from the file extension we store.
    subtype_aliases = {'svg+xml': 'svg', 'x-ms-bmp': 'bmp'}

    res_img = None
    try:
        res_img, res_content_type = fetch_image(url)

        # "image/PNG; charset=binary" -> "image/png" -> "png"
        mime = res_content_type.split(';')[0].strip().lower()
        subtype = mime.split('/')[1].strip() if '/' in mime else 'jpg'
        image_type = subtype_aliases.get(subtype, subtype)

        # Never let the remote server choose an arbitrary extension.
        if image_type not in allowed_extensions:
            image_type = 'jpg'

        # The response is streamed; pull the whole body into memory.
        image_data = b''.join(res_img.iter_content(1024))

        return save_image_to_dir(image_data, image_type, output_dir)
    except Exception:
        # Deliberate best-effort: report the failure but do not propagate it.
        logging.getLogger(__name__).warning(
            "Failed to download image from %s", url, exc_info=True
        )
        return None
    finally:
        # A streamed response holds its connection until closed; release it
        # even when reading or saving failed part-way.
        if res_img is not None:
            res_img.close()
|
||||
|
||||
|
||||
def decode_base64_image(header: str, data: str, output_dir: str) -> str:
    """
    Decode a base64-encoded image and store it under ``output_dir``.

    The file extension is taken from the MIME subtype found in ``header``
    (the text before the first ``;``, after the ``/``), compared
    case-insensitively, with ``svg+xml`` mapped to ``svg`` and ``x-ms-bmp``
    to ``bmp``. A header without a ``/`` or with a subtype outside the
    allow-list results in the ``jpg`` extension.

    :param header: data-URI header, expected to look like
        ``data:image/png;base64``
    :param data: base64-encoded image payload
    :param output_dir: target directory
    :return: path of the stored file as returned by ``save_image_to_dir``
    :raises binascii.Error: ``data`` is not valid base64 (e.g. bad padding)
    """
    allowed_extensions = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'svg', 'bmp'}
    # MIME subtypes whose name differs from the file extension we store.
    subtype_aliases = {'svg+xml': 'svg', 'x-ms-bmp': 'bmp'}

    # "data:image/PNG;base64" -> "data:image/png" -> "png"
    mime = header.split(';')[0].strip().lower()
    subtype = mime.split('/')[1].strip() if '/' in mime else ''
    image_type = subtype_aliases.get(subtype, subtype)

    # Never let the document choose an arbitrary extension.
    if image_type not in allowed_extensions:
        image_type = 'jpg'

    image_data = base64.b64decode(data)
    return save_image_to_dir(image_data, image_type, output_dir)
|
||||
|
||||
|
||||
# Host names whose image URLs are rewritten to site-relative paths.
_LOCAL_IMAGE_DOMAINS = frozenset({
    'haiten.cn', 'www.haiten.cn', 'usasu.cn', 'www.usasu.cn', 'pathx.cn', 'www.pathx.cn',
    '127.0.0.1', '100.64.0.18', 'localhost',
})

# Matches the src attribute of an <img> tag:
# - src may be the first attribute or follow others (whitespace must precede it)
# - the value may be single- or double-quoted, and the quotes must pair up
# - the tag may span several lines
_IMG_SRC_PATTERN = re.compile(
    r'<img[^>]*?\s+src\s*=\s*(["\'])([^"\']+?)\1[^>]*?>?',
    re.IGNORECASE | re.DOTALL
)


def extract_image_paths(html_content: str) -> list[dict]:
    """
    Extract the ``src`` of every ``<img>`` tag found in an HTML string.

    Each image is classified so the caller can tell site-local resources from
    external or inline ones.

    :param str html_content: HTML content
    :return: one dict per image, in document order
    :rtype: list[dict]

    Structure of each entry::

        {
            'original': 'https://external.com/img.jpg',  # src exactly as written
            'src': '/static/upload/article/images/abc.jpg',  # normalised local path, or None
            'type': 'external',  # 'local' | 'domain' | 'external' | 'base64'
            'url': 'https://external.com/img.jpg'  # only set for 'external'
        }

    Classification::

        - base64:   src starts with ``data:image``; src and url are None
        - domain:   http(s) URL whose host is one of the site's own domains;
                    src is path + query + fragment, always starting with ``/``
        - external: any other http(s) URL; src is None, url is the original URL
        - local:    anything else; src is the original value with a leading
                    ``/`` added when missing

    The host is taken from the parsed URL's ``hostname`` (lower-cased, without
    port or ``user:password@`` prefix), so a URL such as
    ``http://haiten.cn:x@evil.com/a.png`` is classified as external. The
    ``http://`` / ``https://`` scheme is matched case-insensitively.
    """
    images = []

    for match in _IMG_SRC_PATTERN.finditer(html_content):
        original_src = match.group(2)  # group 1 is the quote, group 2 the value
        image_info = {
            'original': original_src,
            'src': None,
            'type': None,
            'url': None
        }

        if original_src.startswith('data:image'):
            # Inline base64 data
            image_info['type'] = 'base64'

        elif original_src.lower().startswith(('http://', 'https://')):
            parsed_url = urlparse(original_src)
            # hostname ignores userinfo and port and is already lower-cased;
            # it is None when the URL carries no host at all.
            domain = parsed_url.hostname

            if domain in _LOCAL_IMAGE_DOMAINS:
                # One of our own domains: reduce to a site-relative path
                new_src = parsed_url.path
                if parsed_url.query:
                    new_src += f"?{parsed_url.query}"
                if parsed_url.fragment:
                    new_src += f"#{parsed_url.fragment}"
                if not new_src.startswith('/'):
                    new_src = '/' + new_src
                image_info['src'] = new_src
                image_info['type'] = 'domain'
            else:
                # Foreign domain
                image_info['type'] = 'external'
                image_info['url'] = original_src

        else:
            # Site-relative path; make sure it starts with /
            if not original_src.startswith('/'):
                original_src = '/' + original_src
            image_info['src'] = original_src
            image_info['type'] = 'local'

        images.append(image_info)

    return images
|
||||
Reference in New Issue
Block a user