84 lines
2.8 KiB
Python
84 lines
2.8 KiB
Python
|
|
"""TextParser 单元测试"""
|
||
|
|
|
||
|
|
import os
|
||
|
|
import tempfile
|
||
|
|
|
||
|
|
import pytest
|
||
|
|
|
||
|
|
from exceptions import ParseError
|
||
|
|
from parsers.text_parser import TextParser
|
||
|
|
|
||
|
|
|
||
|
|
@pytest.fixture
def parser():
    """Return a newly constructed ``TextParser`` for tests that request it."""
    text_parser = TextParser()
    return text_parser
|
||
|
|
|
||
|
|
|
||
|
|
class TestSupportedExtensions:
    """Checks on the value returned by ``parser.supported_extensions()``."""

    def test_supports_txt(self, parser):
        """``.txt`` is among the reported extensions."""
        extensions = parser.supported_extensions()
        assert ".txt" in extensions

    def test_supports_md(self, parser):
        """``.md`` is among the reported extensions."""
        extensions = parser.supported_extensions()
        assert ".md" in extensions

    def test_only_two_extensions(self, parser):
        """Exactly two extensions are reported."""
        extension_count = len(parser.supported_extensions())
        assert extension_count == 2
|
||
|
|
|
||
|
|
|
||
|
|
class TestParse:
    """Checks on ``parser.parse`` for several encodings, edge cases and errors.

    Each file-based test writes its input under pytest's ``tmp_path`` and
    passes the path to ``parse`` as a string.
    """

    def test_parse_utf8_txt(self, parser, tmp_path):
        """A UTF-8 ``.txt`` file is returned unchanged."""
        source = tmp_path / "test.txt"
        source.write_text("Hello, world!", encoding="utf-8")
        parsed = parser.parse(str(source))
        assert parsed == "Hello, world!"

    def test_parse_utf8_md(self, parser, tmp_path):
        """A UTF-8 ``.md`` file is returned unchanged, markup included."""
        content = "# Title\n\nSome **bold** text."
        source = tmp_path / "readme.md"
        source.write_bytes(content.encode("utf-8"))
        parsed = parser.parse(str(source))
        assert parsed == content

    def test_parse_gbk_encoded_file(self, parser, tmp_path):
        """Text written as GBK bytes is parsed back to the original string."""
        # Use longer text so charset_normalizer can reliably detect GBK
        content = "你好,世界!这是一段中文文本。我们正在测试文件编码的自动检测功能,需要足够长的文本才能让检测器准确识别编码格式。"
        source = tmp_path / "chinese.txt"
        source.write_bytes(content.encode("gbk"))
        parsed = parser.parse(str(source))
        assert parsed == content

    def test_parse_utf8_bom(self, parser, tmp_path):
        """A file starting with the UTF-8 BOM still yields its text.

        Only containment is asserted, not equality with the text.
        """
        content = "UTF-8 with BOM"
        payload = b"\xef\xbb\xbf" + content.encode("utf-8")
        source = tmp_path / "bom.txt"
        source.write_bytes(payload)
        parsed = parser.parse(str(source))
        assert "UTF-8 with BOM" in parsed

    def test_parse_empty_file(self, parser, tmp_path):
        """A zero-byte file parses to the empty string."""
        source = tmp_path / "empty.txt"
        source.write_bytes(b"")
        parsed = parser.parse(str(source))
        assert parsed == ""

    def test_parse_multiline(self, parser, tmp_path):
        """Newlines, including the trailing one, are preserved."""
        content = "Line 1\nLine 2\nLine 3\n"
        source = tmp_path / "multi.md"
        source.write_bytes(content.encode("utf-8"))
        parsed = parser.parse(str(source))
        assert parsed == content

    def test_parse_nonexistent_file_raises(self, parser):
        """Parsing a missing path raises ``ParseError`` with name and reason."""
        with pytest.raises(ParseError) as exc_info:
            parser.parse("/nonexistent/path/file.txt")
        # exc_info.value is only populated once the ``with`` block has exited.
        error = exc_info.value
        assert "file.txt" in error.file_name
        assert error.reason != ""

    def test_parse_error_contains_filename(self, parser):
        """``ParseError.file_name`` is the bare file name, not the full path."""
        with pytest.raises(ParseError) as exc_info:
            parser.parse("/no/such/myfile.txt")
        error = exc_info.value
        assert error.file_name == "myfile.txt"

    def test_parse_latin1_encoded_file(self, parser, tmp_path):
        """A Latin-1 encoded file parses with its ASCII fragments intact.

        Only the unaccented substrings are asserted; the accented characters
        are not checked.
        """
        content = "café résumé naïve"
        source = tmp_path / "latin.txt"
        source.write_bytes(content.encode("latin-1"))
        parsed = parser.parse(str(source))
        assert "caf" in parsed
        assert "sum" in parsed
|