"""HTML 文件解析器,使用 bs4 + html2text 将 HTML 转换为 Markdown""" import os from typing import List, Optional from bs4 import BeautifulSoup from charset_normalizer import detect import html2text from exceptions import ParseError from parsers.base import BaseParser class HtmlParser(BaseParser): """HTML 文件解析器,去除脚本和样式,转换为 Markdown 格式""" def supported_extensions(self) -> List[str]: return [".html", ".htm"] def parse(self, file_path: str) -> str: """ 解析 HTML 文件,自动检测编码,去除 script/style 标签,转换为 Markdown。 参考 MaxKB HTMLSplitHandle.get_content() 核心逻辑: 读取文件字节 → 检测编码(优先 meta charset,回退 charset_normalizer) → 解码 → html2text 转 Markdown Args: file_path: 文件路径 Returns: Markdown 格式的文本内容 Raises: ParseError: 文件无法读取或编码检测失败时抛出 """ file_name = os.path.basename(file_path) try: with open(file_path, "rb") as f: buffer = f.read() except Exception as e: raise ParseError(file_name, f"文件读取失败: {e}") if len(buffer) == 0: return "" encoding = self._get_encoding(buffer, file_name) try: content = buffer.decode(encoding) except Exception as e: raise ParseError(file_name, f"编码解码失败 ({encoding}): {e}") converter = html2text.HTML2Text() converter.body_width = 0 # Don't wrap lines converter.ignore_images = False converter.ignore_links = False return converter.handle(content) @staticmethod def _get_encoding(buffer: bytes, file_name: str) -> str: """ 检测 HTML 文件编码。 优先从 HTML meta charset 标签获取编码,回退到 charset_normalizer 自动检测。 Args: buffer: 文件字节内容 file_name: 文件名(用于错误信息) Returns: 检测到的编码名称 Raises: ParseError: 编码无法检测时抛出 """ # First try: extract charset from meta tags try: soup = BeautifulSoup(buffer, "html.parser") meta_list = soup.find_all("meta") for meta in meta_list: if meta.attrs and "charset" in meta.attrs: return meta.attrs["charset"] except Exception: pass # Fallback: charset_normalizer result = detect(buffer) encoding = result.get("encoding") if result else None if encoding is None: raise ParseError(file_name, "无法检测文件编码") return encoding