Initial commit: AI 知识库文档智能分块工具
This commit is contained in:
83
tests/test_text_parser.py
Normal file
83
tests/test_text_parser.py
Normal file
@@ -0,0 +1,83 @@
|
||||
"""TextParser 单元测试"""
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
import pytest
|
||||
|
||||
from exceptions import ParseError
|
||||
from parsers.text_parser import TextParser
|
||||
|
||||
|
||||
@pytest.fixture
def parser():
    """Build the TextParser instance handed to tests that request ``parser``."""
    text_parser = TextParser()
    return text_parser
|
||||
|
||||
|
||||
class TestSupportedExtensions:
    """Checks on the collection returned by ``TextParser.supported_extensions()``."""

    def test_supports_txt(self, parser):
        """``.txt`` is reported as supported."""
        extensions = parser.supported_extensions()
        assert ".txt" in extensions

    def test_supports_md(self, parser):
        """``.md`` is reported as supported."""
        extensions = parser.supported_extensions()
        assert ".md" in extensions

    def test_only_two_extensions(self, parser):
        """Exactly two extensions are reported."""
        extension_count = len(parser.supported_extensions())
        assert extension_count == 2
|
||||
|
||||
|
||||
class TestParse:
    """Exercises ``TextParser.parse`` with several encodings and with missing files."""

    def test_parse_utf8_txt(self, parser, tmp_path):
        """A UTF-8 ``.txt`` file comes back unchanged."""
        source = tmp_path / "test.txt"
        source.write_text("Hello, world!", encoding="utf-8")

        parsed = parser.parse(str(source))

        assert parsed == "Hello, world!"

    def test_parse_utf8_md(self, parser, tmp_path):
        """A UTF-8 Markdown file comes back unchanged."""
        content = "# Title\n\nSome **bold** text."
        source = tmp_path / "readme.md"
        source.write_bytes(content.encode("utf-8"))

        parsed = parser.parse(str(source))

        assert parsed == content

    def test_parse_gbk_encoded_file(self, parser, tmp_path):
        """A GBK-encoded file is decoded back to the original text."""
        # Use longer text so charset_normalizer can reliably detect GBK
        content = "你好,世界!这是一段中文文本。我们正在测试文件编码的自动检测功能,需要足够长的文本才能让检测器准确识别编码格式。"
        source = tmp_path / "chinese.txt"
        source.write_bytes(content.encode("gbk"))

        parsed = parser.parse(str(source))

        assert parsed == content

    def test_parse_utf8_bom(self, parser, tmp_path):
        """The text of a UTF-8 file that starts with a BOM appears in the result."""
        content = "UTF-8 with BOM"
        bom = b"\xef\xbb\xbf"
        source = tmp_path / "bom.txt"
        source.write_bytes(bom + content.encode("utf-8"))

        parsed = parser.parse(str(source))

        # Containment only: the original test does not pin whether the BOM
        # character itself is stripped from the output.
        assert "UTF-8 with BOM" in parsed

    def test_parse_empty_file(self, parser, tmp_path):
        """A zero-byte file parses to the empty string."""
        source = tmp_path / "empty.txt"
        source.write_bytes(b"")

        parsed = parser.parse(str(source))

        assert parsed == ""

    def test_parse_multiline(self, parser, tmp_path):
        """Newlines, including the trailing one, are preserved."""
        content = "Line 1\nLine 2\nLine 3\n"
        source = tmp_path / "multi.md"
        source.write_bytes(content.encode("utf-8"))

        parsed = parser.parse(str(source))

        assert parsed == content

    def test_parse_nonexistent_file_raises(self, parser):
        """Parsing a missing path raises ParseError with a file name and a reason."""
        missing_path = "/nonexistent/path/file.txt"

        with pytest.raises(ParseError) as caught:
            parser.parse(missing_path)

        error = caught.value
        assert "file.txt" in error.file_name
        assert error.reason != ""

    def test_parse_error_contains_filename(self, parser):
        """ParseError.file_name is the base name of the path, not the full path."""
        missing_path = "/no/such/myfile.txt"

        with pytest.raises(ParseError) as caught:
            parser.parse(missing_path)

        assert caught.value.file_name == "myfile.txt"

    def test_parse_latin1_encoded_file(self, parser, tmp_path):
        """The ASCII fragments of a Latin-1 encoded file survive decoding."""
        content = "café résumé naïve"
        source = tmp_path / "latin.txt"
        source.write_bytes(content.encode("latin-1"))

        parsed = parser.parse(str(source))

        # Only the ASCII substrings are checked; the accented characters are
        # not asserted on by this test.
        assert "caf" in parsed
        assert "sum" in parsed
|
||||
Reference in New Issue
Block a user