Files
bigwo/parsers/text_parser.py

54 lines
1.5 KiB
Python
Raw Normal View History

"""TXT/MD 文件解析器,使用 charset-normalizer 自动检测编码"""
import os
from typing import List
from charset_normalizer import detect
from exceptions import ParseError
from parsers.base import BaseParser
class TextParser(BaseParser):
    """Parser for plain-text (.txt) and Markdown (.md) files.

    The file is read as raw bytes, its encoding is detected with
    charset-normalizer's ``detect`` and the bytes are decoded with the
    detected codec.
    """

    def supported_extensions(self) -> List[str]:
        """Return the file extensions handled by this parser."""
        return [".txt", ".md"]

    def parse(self, file_path: str) -> str:
        """Read a text file, detect its encoding and return the decoded text.

        Modelled on the core logic of MaxKB's
        ``TextSplitHandle.get_content()``: read the raw bytes, detect the
        encoding with charset-normalizer, then decode and return the text.

        Args:
            file_path: Path of the file to parse.

        Returns:
            The decoded text content; an empty string for an empty file.

        Raises:
            ParseError: If the file cannot be read, no encoding can be
                detected, or decoding with the detected encoding fails.
        """
        # Only the base name is reported in errors, not the full path.
        file_name = os.path.basename(file_path)

        # Read in binary mode so that no implicit decoding happens before
        # the encoding has been detected.
        try:
            with open(file_path, "rb") as f:
                buffer = f.read()
        except Exception as e:
            # Deliberately broad: every read failure is surfaced as a
            # ParseError, chained to the original error for debugging.
            raise ParseError(file_name, f"文件读取失败: {e}") from e

        # An empty file has nothing to detect; short-circuit to "".
        if not buffer:
            return ""

        encoding = detect(buffer).get("encoding")
        if encoding is None:
            raise ParseError(file_name, "无法检测文件编码")

        try:
            return buffer.decode(encoding)
        except Exception as e:
            # Covers both undecodable bytes and a codec name that Python
            # does not know; the original error is kept as the cause.
            raise ParseError(file_name, f"编码解码失败 ({encoding}): {e}") from e