Initial commit: AI 知识库文档智能分块工具

This commit is contained in:
AI Knowledge Splitter
2026-03-02 17:38:28 +08:00
commit 92e7fc5bda
160 changed files with 9577 additions and 0 deletions

83
tests/test_text_parser.py Normal file
View File

@@ -0,0 +1,83 @@
"""TextParser 单元测试"""
import os
import tempfile
import pytest
from exceptions import ParseError
from parsers.text_parser import TextParser
@pytest.fixture
def parser():
    """Provide a TextParser instance to the test that requests this fixture."""
    text_parser = TextParser()
    return text_parser
class TestSupportedExtensions:
    """Checks on the file extensions that TextParser reports as supported."""

    def test_supports_txt(self, parser):
        """The ``.txt`` extension is reported as supported."""
        extensions = parser.supported_extensions()
        assert ".txt" in extensions

    def test_supports_md(self, parser):
        """The ``.md`` extension is reported as supported."""
        extensions = parser.supported_extensions()
        assert ".md" in extensions

    def test_only_two_extensions(self, parser):
        """Exactly two extensions are reported in total."""
        extension_count = len(parser.supported_extensions())
        assert extension_count == 2
class TestParse:
    """Checks on TextParser.parse for several encodings and for missing files."""

    def test_parse_utf8_txt(self, parser, tmp_path):
        """A UTF-8 ``.txt`` file is returned unchanged."""
        source_file = tmp_path / "test.txt"
        source_file.write_text("Hello, world!", encoding="utf-8")
        parsed = parser.parse(str(source_file))
        assert parsed == "Hello, world!"

    def test_parse_utf8_md(self, parser, tmp_path):
        """A UTF-8 ``.md`` file is returned unchanged, markup included."""
        content = "# Title\n\nSome **bold** text."
        source_file = tmp_path / "readme.md"
        source_file.write_bytes(content.encode("utf-8"))
        parsed = parser.parse(str(source_file))
        assert parsed == content

    def test_parse_gbk_encoded_file(self, parser, tmp_path):
        """A GBK-encoded file is decoded back to the original text."""
        # The sample is deliberately long so that charset_normalizer can
        # reliably detect GBK.
        content = "你好,世界!这是一段中文文本。我们正在测试文件编码的自动检测功能,需要足够长的文本才能让检测器准确识别编码格式。"
        source_file = tmp_path / "chinese.txt"
        source_file.write_bytes(content.encode("gbk"))
        parsed = parser.parse(str(source_file))
        assert parsed == content

    def test_parse_utf8_bom(self, parser, tmp_path):
        """A UTF-8 file that starts with a byte-order mark still yields its text."""
        content = "UTF-8 with BOM"
        source_file = tmp_path / "bom.txt"
        source_file.write_bytes(b"\xef\xbb\xbf" + content.encode("utf-8"))
        parsed = parser.parse(str(source_file))
        # Containment rather than equality: this assertion does not pin down
        # whether the BOM character survives in the parsed result.
        assert "UTF-8 with BOM" in parsed

    def test_parse_empty_file(self, parser, tmp_path):
        """A zero-byte file parses to the empty string."""
        source_file = tmp_path / "empty.txt"
        source_file.write_bytes(b"")
        parsed = parser.parse(str(source_file))
        assert parsed == ""

    def test_parse_multiline(self, parser, tmp_path):
        """Line breaks, including the trailing one, are preserved."""
        content = "Line 1\nLine 2\nLine 3\n"
        source_file = tmp_path / "multi.md"
        source_file.write_bytes(content.encode("utf-8"))
        parsed = parser.parse(str(source_file))
        assert parsed == content

    def test_parse_nonexistent_file_raises(self, parser):
        """Parsing a missing path raises ParseError carrying a name and a reason."""
        missing_path = "/nonexistent/path/file.txt"
        with pytest.raises(ParseError) as exc_info:
            parser.parse(missing_path)
        # Inspect the captured exception only after the ``with`` block exits.
        error = exc_info.value
        assert "file.txt" in error.file_name
        assert error.reason != ""

    def test_parse_error_contains_filename(self, parser):
        """ParseError.file_name is the bare file name, not the full path."""
        missing_path = "/no/such/myfile.txt"
        with pytest.raises(ParseError) as exc_info:
            parser.parse(missing_path)
        error = exc_info.value
        assert error.file_name == "myfile.txt"

    def test_parse_latin1_encoded_file(self, parser, tmp_path):
        """A Latin-1 file parses and keeps the ASCII fragments of its words."""
        content = "café résumé naïve"
        source_file = tmp_path / "latin.txt"
        source_file.write_bytes(content.encode("latin-1"))
        parsed = parser.parse(str(source_file))
        # Only ASCII fragments are checked, so the test does not depend on
        # how the accented characters end up being decoded.
        assert "caf" in parsed
        assert "sum" in parsed