首次提交

2026-06-02 16:26:10 +08:00
commit 291e6fcaae
79 changed files with 11283 additions and 0 deletions
@@ -0,0 +1,214 @@
+"""
+基本公共函数。
+"""
+import base64
+import os
+import re
+from typing import Union
+from urllib.parse import urlparse
+
+import requests
+
+from paste.db import basemodel
+
+
+def fetch_image(img_url: str) -> tuple[requests.Response, str]:
+    """
+    获取外部图像。
+
+    :param img_url: 图像 URL
+    :return: (响应对象，内容类型)
+    :raises ValueError: URL 格式无效
+    :raises requests.exceptions.RequestException: 请求失败
+    """
+    # 验证 URL 格式
+    parsed_url = urlparse(img_url)
+    if not all([parsed_url.scheme, parsed_url.netloc]):
+        raise ValueError("Invalid URL")
+
+    # 设置请求头，模拟浏览器请求
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
+                      'AppleWebKit/537.36 (KHTML, like Gecko) '
+                      'Chrome/91.0.4472.124 Safari/537.36'
+    }
+
+    # 获取外部图像
+    response = requests.get(img_url, headers=headers, stream=True, timeout=10)
+    response.raise_for_status()
+
+    # 获取内容类型，如果没有则默认为 image/jpeg
+    content_type = response.headers.get('Content-Type', 'image/jpeg')
+
+    return response, content_type
+
+
+def save_image_to_dir(image_data: bytes, image_type: str, output_dir: str) -> str:
+    """
+    将图像数据保存到指定目录，返回相对路径。
+
+    :param image_data: 图像二进制数据
+    :param image_type: 图像扩展名（如 'jpg', 'png'）
+    :param output_dir: 输出目录（相对于项目根目录，如 'static/upload/article/images'）
+    :return: 保存后的相对路径（以 / 开头）
+    """
+    # 生成唯一文件名
+    filename = f"{basemodel.BaseModel.newId()}.{image_type}"
+    full_path = os.path.abspath(os.path.join(os.curdir, output_dir, filename))
+
+    # 确保目录存在
+    os.makedirs(os.path.dirname(full_path), exist_ok=True)
+
+    # 保存图像
+    with open(full_path, 'wb') as f:
+        f.write(image_data)
+
+    # 返回相对路径（以 / 开头）
+    rel_path = os.path.join(output_dir, filename).replace('\\', '/')
+    if not rel_path.startswith('/'):
+        rel_path = '/' + rel_path
+    return rel_path
+
+
+def download_and_save_image(url: str, output_dir: str) -> Union[str, None]:
+    """
+    从外部 URL 下载图像并保存到指定目录。
+
+    :param url: 外部图像的完整 URL
+    :param output_dir: 输出目录
+    :return: 保存成功时返回相对路径，失败时返回 None
+    """
+    try:
+        res_img, res_content_type = fetch_image(url)
+
+        # 提取扩展名
+        image_type = res_content_type.split('/')[1].split(';')[0].strip() if '/' in res_content_type else 'jpg'
+
+        # 验证扩展名安全性
+        allowed_extensions = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'svg', 'bmp'}
+        if image_type not in allowed_extensions:
+            image_type = 'jpg'
+
+        # 收集图像数据
+        image_data = b''.join(res_img.iter_content(1024))
+
+        # 保存到本地
+        new_src = save_image_to_dir(image_data, image_type, output_dir)
+        return new_src
+    except Exception:
+        return None
+
+
+def decode_base64_image(header: str, data: str, output_dir: str) -> str:
+    """
+    解码 base64 格式的图像数据并保存到指定目录。
+
+    :param header: base64 数据头
+    :param data: base64 编码的图像数据
+    :param output_dir: 输出目录
+    :return: 保存后的相对路径
+    """
+    # 从 header 中获取图像类型
+    image_type = header.split(';')[0].split('/')[1]
+
+    # 验证扩展名安全性
+    allowed_extensions = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'svg', 'bmp'}
+    if image_type not in allowed_extensions:
+        image_type = 'jpg'
+
+    # 解码并保存
+    image_data = base64.b64decode(data)
+    return save_image_to_dir(image_data, image_type, output_dir)
+
+
+def extract_image_paths(html_content: str) -> list[dict]:
+    """
+    从 HTML 内容中提取所有图像的 src 信息。
+
+    该方法用于识别文章中引用的所有图像资源，返回详细的图像信息列表。
+
+    :param str html_content: HTML 内容
+    :return: 图像信息列表，每个元素包含 src 值和类型
+    :rtype: list[dict]
+
+    返回结构::
+
+        [
+            {
+                'original': 'https://external.com/img.jpg',  # 原始 src 值
+                'src': '/static/upload/article/images/abc.jpg',  # 标准化后的本地路径（external/base64 为 None）
+                'type': 'external',  # local: 本地路径，domain: 本地域名，external: 外部域名，base64: base64 数据
+                'url': 'https://external.com/img.jpg'  # 完整 URL（仅 external 类型有值）
+            }
+        ]
+
+    注意::
+
+        - local/domain 类型：src 为标准化本地路径
+        - external 类型：src 为 None，url 为原始外部 URL
+        - base64 类型：src 为 None，url 为 None
+    """
+    # 允许的本地域名列表
+    allowed_domains = {
+        'haiten.cn', 'www.haiten.cn', 'usasu.cn', 'www.usasu.cn', 'pathx.cn', 'www.pathx.cn',
+        '127.0.0.1', '100.64.0.18', 'localhost'
+    }
+
+    # 改进的正则表达式：
+    # - 允许 src 是第一个属性
+    # - 支持单引号和双引号
+    # - 确保引号成对匹配
+    # - 支持跨行匹配
+    img_pattern = re.compile(
+        r'<img[^>]*?\s+src\s*=\s*(["\'])([^"\']+?)\1[^>]*?>?',
+        re.IGNORECASE | re.DOTALL
+    )
+
+    images = []
+
+    for match in img_pattern.finditer(html_content):
+        original_src = match.group(2)  # 捕获组 2 是 src 的值
+        image_info = {
+            'original': original_src,
+            'src': None,
+            'type': None,
+            'url': None
+        }
+
+        # 判断图像类型
+        if original_src.startswith('data:image'):
+            # base64 数据
+            image_info['type'] = 'base64'
+
+        elif original_src.startswith(('http://', 'https://')):
+            parsed_url = urlparse(original_src)
+            domain = parsed_url.netloc.split(':')[0]
+
+            if domain in allowed_domains:
+                # 本地域名 - 转换为相对路径
+                new_src = parsed_url.path
+                if parsed_url.query:
+                    new_src += f"?{parsed_url.query}"
+                if parsed_url.fragment:
+                    new_src += f"#{parsed_url.fragment}"
+                # 确保路径以 / 开头
+                if not new_src.startswith('/'):
+                    new_src = '/' + new_src
+                image_info['src'] = new_src
+                image_info['type'] = 'domain'
+            else:
+                # 外部域名
+                image_info['type'] = 'external'
+                image_info['url'] = original_src
+
+        else:
+            # 本地相对路径
+            # 确保路径以 / 开头
+            if not original_src.startswith('/'):
+                original_src = '/' + original_src
+            image_info['src'] = original_src
+            image_info['type'] = 'local'
+
+        images.append(image_info)
+
+    return images