Initial commit: AI 知识库文档智能分块工具

2026-03-02 17:38:28 +08:00
commit 92e7fc5bda
160 changed files with 9577 additions and 0 deletions
--- a/tests/test_csv_parser.py
+++ b/tests/test_csv_parser.py
@@ -0,0 +1,105 @@
+"""CsvParser 单元测试"""
+
+import pytest
+
+from exceptions import ParseError
+from parsers.csv_parser import CsvParser
+
+
+@pytest.fixture  # default (function) scope: built anew for each test that requests it
+def parser():  # shared fixture supplying the CsvParser under test
+    return CsvParser()  # constructed with no arguments
+
+
+class TestSupportedExtensions:  # supported_extensions() must report ".csv" and nothing else
+    def test_supports_csv(self, parser):  # ".csv" (leading dot, lower case) is advertised
+        assert ".csv" in parser.supported_extensions()
+
+    def test_only_one_extension(self, parser):  # the returned collection has exactly one entry
+        assert len(parser.supported_extensions()) == 1
+
+
+class TestParse:  # parse() takes a file path (str) and returns the table as one string
+    def test_basic_csv(self, parser, tmp_path):  # header row, separator row, one row per record
+        f = tmp_path / "basic.csv"
+        f.write_text("name,age,city\nAlice,30,Beijing\nBob,25,Shanghai\n", encoding="utf-8")
+        result = parser.parse(str(f))
+        assert "| name | age | city |" in result  # cells are wrapped as "| a | b |"
+        assert "| --- | --- | --- |" in result  # one "---" per header column
+        assert "| Alice | 30 | Beijing |" in result
+        assert "| Bob | 25 | Shanghai |" in result
+
+    def test_empty_file(self, parser, tmp_path):  # zero-byte file yields an empty string
+        f = tmp_path / "empty.csv"
+        f.write_bytes(b"")  # create the file with no content at all
+        assert parser.parse(str(f)) == ""
+
+    def test_header_only(self, parser, tmp_path):  # no data rows: only header + separator
+        f = tmp_path / "header.csv"
+        f.write_text("col1,col2,col3\n", encoding="utf-8")
+        result = parser.parse(str(f))
+        assert "| col1 | col2 | col3 |" in result
+        assert "| --- | --- | --- |" in result
+        lines = result.strip().split("\n")  # strip() keeps a trailing newline from adding an entry
+        assert len(lines) == 2  # header line and separator line, nothing else
+
+    def test_pipe_char_escaped(self, parser, tmp_path):  # "|" in a cell is emitted as &#124;
+        f = tmp_path / "pipe.csv"
+        f.write_text('header\n"a|b"\n', encoding="utf-8")  # quoted field holding a literal pipe
+        result = parser.parse(str(f))
+        assert "&#124;" in result  # NOTE(review): already implied by the next assertion
+        assert "a&#124;b" in result
+
+    def test_newline_in_cell(self, parser, tmp_path):  # embedded line break is emitted as <br>
+        f = tmp_path / "newline.csv"
+        f.write_text('header\n"line1\nline2"\n', encoding="utf-8")  # quoted field spans two lines
+        result = parser.parse(str(f))
+        assert "<br>" in result
+        assert "line1<br>line2" in result
+
+    def test_gbk_encoded_csv(self, parser, tmp_path):  # file bytes are GBK, not UTF-8
+        f = tmp_path / "gbk.csv"
+        content = "姓名,年龄,城市\n张三,28,北京\n李四,32,上海\n"
+        f.write_bytes(content.encode("gbk"))  # raw bytes, so the on-disk encoding is GBK
+        result = parser.parse(str(f))  # no encoding hint is passed to parse()
+        assert "张三" in result  # NOTE(review): how the parser picks the encoding is not shown here
+        assert "北京" in result
+
+    def test_nonexistent_file_raises(self, parser):  # missing path surfaces as ParseError
+        with pytest.raises(ParseError) as exc_info:
+            parser.parse("/nonexistent/path/data.csv")
+        assert "data.csv" in exc_info.value.file_name  # checked after the with-block has exited
+        assert exc_info.value.reason != ""  # some reason is reported; its wording is not pinned
+
+    def test_short_row_padded(self, parser, tmp_path):
+        """Rows shorter than header should be padded with empty cells."""
+        f = tmp_path / "short.csv"
+        f.write_text("a,b,c\n1\n", encoding="utf-8")  # data row has 1 field, header has 3
+        result = parser.parse(str(f))
+        assert "| 1 |  |  |" in result  # two empty cells fill the missing columns
+
+    def test_result_ends_with_newline(self, parser, tmp_path):  # output is newline-terminated
+        f = tmp_path / "trail.csv"
+        f.write_text("h1,h2\nv1,v2\n", encoding="utf-8")
+        result = parser.parse(str(f))
+        assert result.endswith("\n")
+
+
+class TestEscapeCell:  # _escape_cell is called on the class itself, so no parser fixture is used
+    def test_no_special_chars(self):  # plain text passes through unchanged
+        assert CsvParser._escape_cell("hello") == "hello"
+
+    def test_pipe_escaped(self):  # "|" becomes the character reference &#124;
+        assert CsvParser._escape_cell("a|b") == "a&#124;b"
+
+    def test_newline_escaped(self):  # LF becomes <br>
+        assert CsvParser._escape_cell("a\nb") == "a<br>b"
+
+    def test_crlf_escaped(self):  # CRLF becomes a single <br>, not two
+        assert CsvParser._escape_cell("a\r\nb") == "a<br>b"
+
+    def test_cr_escaped(self):  # a lone CR also becomes <br>
+        assert CsvParser._escape_cell("a\rb") == "a<br>b"
+
+    def test_combined_escapes(self):  # both substitutions apply within one cell
+        assert CsvParser._escape_cell("a|b\nc") == "a&#124;b<br>c"