Initial commit: AI 知识库文档智能分块工具

This commit is contained in:
AI Knowledge Splitter
2026-03-02 17:38:28 +08:00
commit 92e7fc5bda
160 changed files with 9577 additions and 0 deletions

53
parsers/text_parser.py Normal file
View File

@@ -0,0 +1,53 @@
"""TXT/MD 文件解析器,使用 charset-normalizer 自动检测编码"""
import os
from typing import List
from charset_normalizer import detect
from exceptions import ParseError
from parsers.base import BaseParser
class TextParser(BaseParser):
    """Parser for plain-text (.txt) and Markdown (.md) files."""

    def supported_extensions(self) -> List[str]:
        """Return the file extensions handled by this parser."""
        return [".txt", ".md"]

    def parse(self, file_path: str) -> str:
        """Read a text file, detect its encoding, and return the decoded text.

        The file is read as raw bytes, ``charset_normalizer.detect`` is asked
        for the encoding, and the bytes are decoded with that encoding. The
        original author notes this mirrors the core logic of MaxKB's
        ``TextSplitHandle.get_content()``.

        Args:
            file_path: Path of the file to parse.

        Returns:
            The decoded text content; an empty string when the file is empty.

        Raises:
            ParseError: If the file cannot be read, no encoding can be
                detected, or decoding with the detected encoding fails. When
                it wraps a lower-level error, that error is attached as
                ``__cause__``.
        """
        # Only the base name is passed to ParseError, not the full path.
        file_name = os.path.basename(file_path)

        try:
            with open(file_path, "rb") as f:
                buffer = f.read()
        except Exception as e:
            # Kept broad so every read failure surfaces as ParseError;
            # `from e` records the underlying error as the explicit cause.
            raise ParseError(file_name, f"文件读取失败: {e}") from e

        # An empty file has nothing to detect; skip detection entirely.
        if not buffer:
            return ""

        encoding = detect(buffer).get("encoding")
        if encoding is None:
            raise ParseError(file_name, "无法检测文件编码")

        try:
            return buffer.decode(encoding)
        except Exception as e:
            # Covers both undecodable bytes and an encoding name that the
            # codec registry does not know.
            raise ParseError(file_name, f"编码解码失败 ({encoding}): {e}") from e