Files
bigwo/parsers/html_parser.py
2026-03-02 17:38:28 +08:00

97 lines
2.8 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""HTML 文件解析器,使用 bs4 + html2text 将 HTML 转换为 Markdown"""
import codecs
import os
from typing import List, Optional

from bs4 import BeautifulSoup
from charset_normalizer import detect
import html2text

from exceptions import ParseError
from parsers.base import BaseParser
class HtmlParser(BaseParser):
    """Parser for HTML files that converts their content to Markdown.

    The raw bytes are decoded with the encoding declared in the document's
    ``<meta>`` tags when that declaration is present and names a codec Python
    knows; otherwise the encoding guessed by ``charset_normalizer`` is used.
    The decoded text is then rendered to Markdown by ``html2text``.
    """

    def supported_extensions(self) -> List[str]:
        """Return the file extensions this parser handles."""
        return [".html", ".htm"]

    def parse(self, file_path: str) -> str:
        """Read an HTML file and return its content as Markdown.

        Pipeline (modelled on MaxKB's ``HTMLSplitHandle.get_content()``):
        read the file bytes, determine the encoding (declared ``<meta>``
        charset first, ``charset_normalizer`` guess second), decode, and hand
        the text to ``html2text``.

        NOTE(review): no script/style stripping is done explicitly here; it is
        left entirely to ``html2text`` -- confirm that matches expectations.

        Args:
            file_path: Path of the HTML file to parse.

        Returns:
            The Markdown text. An empty file yields an empty string.

        Raises:
            ParseError: If the file cannot be read, no encoding can be
                determined, or the bytes cannot be decoded with it.
        """
        file_name = os.path.basename(file_path)

        # Deliberately broad: any read failure is reported as a ParseError.
        try:
            with open(file_path, "rb") as f:
                buffer = f.read()
        except Exception as e:
            raise ParseError(file_name, f"文件读取失败: {e}") from e

        if not buffer:
            return ""

        encoding = self._get_encoding(buffer, file_name)
        try:
            content = buffer.decode(encoding)
        except Exception as e:
            raise ParseError(file_name, f"编码解码失败 ({encoding}): {e}") from e

        converter = html2text.HTML2Text()
        converter.body_width = 0  # Don't wrap lines
        converter.ignore_images = False  # Keep images in the output
        converter.ignore_links = False  # Keep hyperlinks in the output
        return converter.handle(content)

    @staticmethod
    def _get_encoding(buffer: bytes, file_name: str) -> str:
        """Determine the text encoding of an HTML document.

        The first ``<meta>`` declaration that names an encoding known to
        Python wins. Empty or unknown declarations are skipped, so a bad
        ``<meta>`` tag falls back to detection instead of failing the parse.
        If no usable declaration exists, ``charset_normalizer`` is asked to
        guess from the bytes.

        Args:
            buffer: Raw bytes of the file.
            file_name: File name, used only in error messages.

        Returns:
            The name of the encoding to decode ``buffer`` with.

        Raises:
            ParseError: If neither a usable declaration nor a detected
                encoding is available.
        """
        # First try: an encoding declared by the document itself.
        try:
            soup = BeautifulSoup(buffer, "html.parser")
            for meta in soup.find_all("meta"):
                declared = HtmlParser._declared_charset(meta.attrs or {})
                if declared and HtmlParser._is_known_encoding(declared):
                    return declared
        except Exception:
            # Best effort only: if the markup cannot be inspected we still
            # have the byte-level detection below.
            pass

        # Fallback: charset_normalizer
        result = detect(buffer)
        encoding = result.get("encoding") if result else None
        if encoding is None:
            raise ParseError(file_name, "无法检测文件编码")
        return encoding

    @staticmethod
    def _declared_charset(attrs: dict) -> Optional[str]:
        """Return the charset named by one ``<meta>`` tag's attributes, if any.

        Handles both ``<meta charset="...">`` and
        ``<meta http-equiv="Content-Type" content="...; charset=...">``.
        """
        charset = attrs.get("charset")
        if charset:
            return str(charset).strip() or None

        http_equiv = str(attrs.get("http-equiv") or "").strip().lower()
        if http_equiv != "content-type":
            return None

        # The content value looks like "text/html; charset=gbk".
        for part in str(attrs.get("content") or "").split(";"):
            key, sep, value = part.partition("=")
            if sep and key.strip().lower() == "charset":
                return value.strip().strip("\"'").strip() or None
        return None

    @staticmethod
    def _is_known_encoding(name: str) -> bool:
        """Return True when Python has a codec registered under ``name``."""
        try:
            codecs.lookup(name)
        except (LookupError, ValueError):
            return False
        return True