"""TextParser 单元测试""" import os import tempfile import pytest from exceptions import ParseError from parsers.text_parser import TextParser @pytest.fixture def parser(): return TextParser() class TestSupportedExtensions: def test_supports_txt(self, parser): assert ".txt" in parser.supported_extensions() def test_supports_md(self, parser): assert ".md" in parser.supported_extensions() def test_only_two_extensions(self, parser): assert len(parser.supported_extensions()) == 2 class TestParse: def test_parse_utf8_txt(self, parser, tmp_path): f = tmp_path / "test.txt" f.write_text("Hello, world!", encoding="utf-8") assert parser.parse(str(f)) == "Hello, world!" def test_parse_utf8_md(self, parser, tmp_path): f = tmp_path / "readme.md" content = "# Title\n\nSome **bold** text." f.write_bytes(content.encode("utf-8")) assert parser.parse(str(f)) == content def test_parse_gbk_encoded_file(self, parser, tmp_path): f = tmp_path / "chinese.txt" # Use longer text so charset_normalizer can reliably detect GBK content = "你好,世界!这是一段中文文本。我们正在测试文件编码的自动检测功能,需要足够长的文本才能让检测器准确识别编码格式。" f.write_bytes(content.encode("gbk")) result = parser.parse(str(f)) assert result == content def test_parse_utf8_bom(self, parser, tmp_path): f = tmp_path / "bom.txt" content = "UTF-8 with BOM" f.write_bytes(b"\xef\xbb\xbf" + content.encode("utf-8")) result = parser.parse(str(f)) assert "UTF-8 with BOM" in result def test_parse_empty_file(self, parser, tmp_path): f = tmp_path / "empty.txt" f.write_bytes(b"") assert parser.parse(str(f)) == "" def test_parse_multiline(self, parser, tmp_path): f = tmp_path / "multi.md" content = "Line 1\nLine 2\nLine 3\n" f.write_bytes(content.encode("utf-8")) assert parser.parse(str(f)) == content def test_parse_nonexistent_file_raises(self, parser): with pytest.raises(ParseError) as exc_info: parser.parse("/nonexistent/path/file.txt") assert "file.txt" in exc_info.value.file_name assert exc_info.value.reason != "" def test_parse_error_contains_filename(self, parser): with pytest.raises(ParseError) as exc_info: parser.parse("/no/such/myfile.txt") assert exc_info.value.file_name == "myfile.txt" def test_parse_latin1_encoded_file(self, parser, tmp_path): f = tmp_path / "latin.txt" content = "café résumé naïve" f.write_bytes(content.encode("latin-1")) result = parser.parse(str(f)) assert "caf" in result assert "sum" in result