"""HTML 文件解析器,使用 bs4 + html2text 将 HTML 转换为 Markdown"""

import os
from typing import List, Optional

from bs4 import BeautifulSoup
from charset_normalizer import detect
import html2text

from exceptions import ParseError
from parsers.base import BaseParser


class HtmlParser(BaseParser):
    """HTML file parser: strips script/style noise and converts HTML to Markdown."""

    def supported_extensions(self) -> List[str]:
        """Return the file extensions this parser handles."""
        return [".html", ".htm"]

    def parse(self, file_path: str) -> str:
        """
        Parse an HTML file into Markdown.

        Mirrors the core logic of MaxKB's HTMLSplitHandle.get_content():
        read raw bytes -> detect encoding (prefer the meta charset tag,
        fall back to charset_normalizer) -> decode -> convert with html2text.

        Args:
            file_path: Path of the HTML file to parse.

        Returns:
            The file content as Markdown text ("" for an empty file).

        Raises:
            ParseError: If the file cannot be read, its encoding cannot be
                detected, or the bytes cannot be decoded.
        """
        file_name = os.path.basename(file_path)

        try:
            with open(file_path, "rb") as f:
                buffer = f.read()
        except Exception as e:
            raise ParseError(file_name, f"文件读取失败: {e}")

        if not buffer:
            return ""

        encoding = self._get_encoding(buffer, file_name)

        try:
            content = buffer.decode(encoding)
        except Exception as e:
            raise ParseError(file_name, f"编码解码失败 ({encoding}): {e}")

        converter = html2text.HTML2Text()
        converter.body_width = 0  # Don't hard-wrap output lines
        converter.ignore_images = False
        converter.ignore_links = False

        return converter.handle(content)

    @staticmethod
    def _get_encoding(buffer: bytes, file_name: str) -> str:
        """
        Detect the encoding of an HTML byte buffer.

        Checks the HTML5 <meta charset="..."> form first, then the legacy
        <meta http-equiv="Content-Type" content="...; charset=..."> form,
        and finally falls back to charset_normalizer auto-detection.

        Args:
            buffer: Raw file bytes.
            file_name: File name (used only in error messages).

        Returns:
            The detected encoding name.

        Raises:
            ParseError: If no encoding could be detected.
        """
        # First try: extract charset from meta tags.
        try:
            soup = BeautifulSoup(buffer, "html.parser")
            for meta in soup.find_all("meta"):
                if not meta.attrs:
                    continue
                # HTML5 form: <meta charset="utf-8">
                charset = meta.attrs.get("charset")
                if isinstance(charset, str) and charset.strip():
                    return charset.strip()
                # Legacy form:
                # <meta http-equiv="Content-Type" content="text/html; charset=gbk">
                content_attr = meta.attrs.get("content", "")
                if isinstance(content_attr, str):
                    lowered = content_attr.lower()
                    if "charset=" in lowered:
                        idx = lowered.index("charset=") + len("charset=")
                        candidate = content_attr[idx:].split(";")[0].strip().strip("\"'")
                        if candidate:
                            return candidate
        except Exception:
            # Malformed HTML must not abort parsing; fall through to auto-detect.
            pass

        # Fallback: charset_normalizer statistical detection.
        result = detect(buffer)
        encoding = result.get("encoding") if result else None

        if encoding is None:
            raise ParseError(file_name, "无法检测文件编码")

        return encoding
|