54 lines
1.5 KiB
Python
54 lines
1.5 KiB
Python
|
|
"""TXT/MD 文件解析器,使用 charset-normalizer 自动检测编码"""
|
|||
|
|
|
|||
|
|
import os
|
|||
|
|
from typing import List
|
|||
|
|
|
|||
|
|
from charset_normalizer import detect
|
|||
|
|
|
|||
|
|
from exceptions import ParseError
|
|||
|
|
from parsers.base import BaseParser
|
|||
|
|
|
|||
|
|
|
|||
|
|
class TextParser(BaseParser):
    """Parser for plain-text and Markdown files."""

    def supported_extensions(self) -> List[str]:
        """Return the file extensions handled by this parser."""
        return [".txt", ".md"]

    def parse(self, file_path: str) -> str:
        """
        Parse a text file, auto-detecting its encoding, and return its text.

        Modelled on the core logic of MaxKB's TextSplitHandle.get_content():
        read the raw bytes -> detect the encoding with charset_normalizer ->
        decode and return the text.

        Args:
            file_path: Path of the file to parse.

        Returns:
            The decoded text content of the file. An empty file yields an
            empty string without running encoding detection.

        Raises:
            ParseError: If the file cannot be read, the encoding cannot be
                detected, or decoding with the detected encoding fails. For
                read and decode failures the underlying exception is attached
                as ``__cause__``.
        """
        # Only the base name is handed to ParseError, not the full path.
        file_name = os.path.basename(file_path)

        try:
            with open(file_path, "rb") as f:
                buffer = f.read()
        except Exception as e:
            # Chain explicitly so the original I/O error stays visible as the
            # direct cause of the ParseError.
            raise ParseError(file_name, f"文件读取失败: {e}") from e

        # Nothing to detect in an empty file; return early.
        if not buffer:
            return ""

        encoding = detect(buffer).get("encoding")
        if encoding is None:
            raise ParseError(file_name, "无法检测文件编码")

        try:
            return buffer.decode(encoding)
        except Exception as e:
            # Covers both undecodable bytes and a codec name Python does not
            # recognise; either way the root cause is preserved via chaining.
            raise ParseError(file_name, f"编码解码失败 ({encoding}): {e}") from e
|