Initial commit: AI 知识库文档智能分块工具

2026-03-02 17:38:28 +08:00
commit 92e7fc5bda
160 changed files with 9577 additions and 0 deletions
--- a/tests/test_xls_parser.py
+++ b/tests/test_xls_parser.py
@@ -0,0 +1,178 @@
+"""XlsParser 单元测试"""
+
+import pytest
+import xlwt
+
+from exceptions import ParseError
+from parsers.xls_parser import XlsParser
+
+
+@pytest.fixture
+def parser():  # no scope is given, so pytest builds a new XlsParser for each test
+    return XlsParser()
+
+
+def _create_xls(path, sheets=None):
+    """
+    创建测试用 XLS 文件。
+
+    Args:
+        path: 输出文件路径
+        sheets: dict，key 为 sheet 名称，value 为二维列表（行×列的数据）
+                如果为 None，创建空工作簿
+    """
+    wb = xlwt.Workbook()
+
+    if sheets:
+        for sheet_name, rows in sheets.items():
+            ws = wb.add_sheet(sheet_name)
+            for row_idx, row in enumerate(rows):
+                for col_idx, value in enumerate(row):
+                    ws.write(row_idx, col_idx, value)
+    else:
+        # xlwt 需要至少一个 sheet
+        wb.add_sheet("Sheet1")
+
+    wb.save(str(path))
+
+
+class TestSupportedExtensions:  # checks on what XlsParser.supported_extensions() reports
+    def test_supports_xls(self, parser):  # ".xls" must be among the reported values
+        assert ".xls" in parser.supported_extensions()
+
+    def test_only_one_extension(self, parser):  # exactly one entry is reported
+        assert len(parser.supported_extensions()) == 1
+
+
+class TestParse:
+    def test_simple_table(self, parser, tmp_path):
+        """基本表格转换为 Markdown"""
+        xls_path = tmp_path / "simple.xls"
+        _create_xls(xls_path, {
+            "Sheet1": [
+                ["Name", "Age"],
+                ["Alice", 30],
+                ["Bob", 25],
+            ]
+        })
+        result = parser.parse(str(xls_path))
+        assert "## Sheet1" in result
+        assert "| Name | Age |" in result
+        assert "| --- | --- |" in result
+        assert "Alice" in result
+        assert "Bob" in result
+
+    def test_multiple_sheets(self, parser, tmp_path):
+        """多个工作表各自生成标题和表格"""
+        xls_path = tmp_path / "multi.xls"
+        _create_xls(xls_path, {
+            "Users": [["Name"], ["Alice"]],
+            "Orders": [["ID"], ["001"]],
+        })
+        result = parser.parse(str(xls_path))
+        assert "## Users" in result
+        assert "## Orders" in result
+        assert "| Name |" in result
+        assert "| ID |" in result
+
+    def test_empty_sheet_skipped(self, parser, tmp_path):
+        """空工作表应被跳过"""
+        xls_path = tmp_path / "empty_sheet.xls"
+        wb = xlwt.Workbook()
+        wb.add_sheet("Empty")  # no data written
+        ws = wb.add_sheet("Data")
+        ws.write(0, 0, "Col1")
+        ws.write(1, 0, "Val1")
+        wb.save(str(xls_path))
+
+        result = parser.parse(str(xls_path))
+        assert "## Empty" not in result
+        assert "## Data" in result
+
+    def test_pipe_escaped(self, parser, tmp_path):
+        """单元格中的 | 应被转义为 &#124;"""
+        xls_path = tmp_path / "pipe.xls"
+        _create_xls(xls_path, {
+            "Sheet1": [["Header"], ["value|with|pipes"]],
+        })
+        result = parser.parse(str(xls_path))
+        assert "&#124;" in result
+        assert "value&#124;with&#124;pipes" in result
+
+    def test_newline_escaped(self, parser, tmp_path):
+        """单元格中的换行符应被转义为 <br>"""
+        xls_path = tmp_path / "newline.xls"
+        _create_xls(xls_path, {
+            "Sheet1": [["Header"], ["line1\nline2"]],
+        })
+        result = parser.parse(str(xls_path))
+        assert "line1<br>line2" in result
+
+    def test_backtick_escaped(self, parser, tmp_path):
+        """单元格中的反引号应被转义为 &#96;"""
+        xls_path = tmp_path / "backtick.xls"
+        _create_xls(xls_path, {
+            "Sheet1": [["Header"], ["code `snippet`"]],
+        })
+        result = parser.parse(str(xls_path))
+        assert "&#96;" in result
+
+    def test_empty_cell_becomes_empty(self, parser, tmp_path):
+        """空单元格应显示为空字符串"""
+        xls_path = tmp_path / "empty_cell.xls"
+        wb = xlwt.Workbook()
+        ws = wb.add_sheet("Sheet1")
+        ws.write(0, 0, "A")
+        ws.write(0, 1, "B")
+        ws.write(1, 0, "val")
+        # cell (1,1) is not written — will be empty
+        wb.save(str(xls_path))
+
+        result = parser.parse(str(xls_path))
+        assert "| val |  |" in result
+
+    def test_sheet_name_as_heading(self, parser, tmp_path):
+        """工作表名称应作为 ## 标题"""
+        xls_path = tmp_path / "named.xls"
+        _create_xls(xls_path, {
+            "Sales Report": [["Month", "Revenue"], ["Jan", "1000"]],
+        })
+        result = parser.parse(str(xls_path))
+        assert "## Sales Report" in result
+
+    def test_nonexistent_file_raises(self, parser):
+        with pytest.raises(ParseError) as exc_info:
+            parser.parse("/nonexistent/path/file.xls")
+        assert "file.xls" in exc_info.value.file_name
+        assert exc_info.value.reason != ""
+
+    def test_corrupted_file_raises(self, parser, tmp_path):
+        xls_path = tmp_path / "corrupted.xls"
+        xls_path.write_bytes(b"this is not an xls file")
+        with pytest.raises(ParseError) as exc_info:
+            parser.parse(str(xls_path))
+        assert "corrupted.xls" in exc_info.value.file_name
+
+    def test_parse_error_contains_filename(self, parser):
+        with pytest.raises(ParseError) as exc_info:
+            parser.parse("/no/such/report.xls")
+        assert exc_info.value.file_name == "report.xls"
+
+    def test_numeric_values(self, parser, tmp_path):
+        """数值类型应正确转换为字符串"""
+        xls_path = tmp_path / "numeric.xls"
+        _create_xls(xls_path, {
+            "Sheet1": [["Int", "Float"], [42, 3.14]],
+        })
+        result = parser.parse(str(xls_path))
+        assert "42" in result
+        assert "3.14" in result
+
+    def test_crlf_escaped(self, parser, tmp_path):
+        """\\r\\n 应被转义为 <br>"""
+        xls_path = tmp_path / "crlf.xls"
+        _create_xls(xls_path, {
+            "Sheet1": [["Header"], ["line1\r\nline2"]],
+        })
+        result = parser.parse(str(xls_path))
+        assert "line1<br>line2" in result