Initial commit: AI 知识库文档智能分块工具
This commit is contained in:
178
tests/test_xls_parser.py
Normal file
178
tests/test_xls_parser.py
Normal file
@@ -0,0 +1,178 @@
|
||||
"""XlsParser 单元测试"""
|
||||
|
||||
import pytest
|
||||
import xlwt
|
||||
|
||||
from exceptions import ParseError
|
||||
from parsers.xls_parser import XlsParser
|
||||
|
||||
|
||||
@pytest.fixture
def parser():
    """Return a newly constructed XlsParser for use by the tests."""
    xls_parser = XlsParser()
    return xls_parser
|
||||
|
||||
|
||||
def _create_xls(path, sheets=None):
    """
    Write an XLS workbook to disk for use as a test input.

    Args:
        path: Output file path.
        sheets: Dict whose keys are sheet names and whose values are 2-D
            lists (rows of cell values). When falsy (None or empty), the
            workbook is saved with a single blank sheet named "Sheet1".
    """
    workbook = xlwt.Workbook()

    if not sheets:
        # xlwt needs at least one sheet
        workbook.add_sheet("Sheet1")
    else:
        for title, table in sheets.items():
            sheet = workbook.add_sheet(title)
            for r, cells in enumerate(table):
                for c, cell in enumerate(cells):
                    sheet.write(r, c, cell)

    workbook.save(str(path))
|
||||
|
||||
|
||||
class TestSupportedExtensions:
    """Checks on the extension list reported by the parser."""

    def test_supports_xls(self, parser):
        """The reported extensions include ".xls"."""
        extensions = parser.supported_extensions()
        assert ".xls" in extensions

    def test_only_one_extension(self, parser):
        """Exactly one extension is reported."""
        extensions = parser.supported_extensions()
        assert len(extensions) == 1
|
||||
|
||||
|
||||
class TestParse:
    """Tests for XlsParser.parse: Markdown output, escaping and error reporting."""

    def test_simple_table(self, parser, tmp_path):
        """A basic table is converted to a Markdown table."""
        xls_path = tmp_path / "simple.xls"
        _create_xls(xls_path, {
            "Sheet1": [
                ["Name", "Age"],
                ["Alice", 30],
                ["Bob", 25],
            ]
        })
        result = parser.parse(str(xls_path))
        assert "## Sheet1" in result
        assert "| Name | Age |" in result
        assert "| --- | --- |" in result
        assert "Alice" in result
        assert "Bob" in result

    def test_multiple_sheets(self, parser, tmp_path):
        """Each worksheet produces its own heading and table."""
        xls_path = tmp_path / "multi.xls"
        _create_xls(xls_path, {
            "Users": [["Name"], ["Alice"]],
            "Orders": [["ID"], ["001"]],
        })
        result = parser.parse(str(xls_path))
        assert "## Users" in result
        assert "## Orders" in result
        assert "| Name |" in result
        assert "| ID |" in result

    def test_empty_sheet_skipped(self, parser, tmp_path):
        """A worksheet with no data should be skipped."""
        xls_path = tmp_path / "empty_sheet.xls"
        wb = xlwt.Workbook()
        wb.add_sheet("Empty")  # no data written
        ws = wb.add_sheet("Data")
        ws.write(0, 0, "Col1")
        ws.write(1, 0, "Val1")
        wb.save(str(xls_path))

        result = parser.parse(str(xls_path))
        assert "## Empty" not in result
        assert "## Data" in result

    def test_pipe_escaped(self, parser, tmp_path):
        """A | inside a cell should be escaped, not emitted verbatim."""
        xls_path = tmp_path / "pipe.xls"
        _create_xls(xls_path, {
            "Sheet1": [["Header"], ["value|with|pipes"]],
        })
        result = parser.parse(str(xls_path))
        # A bare `"|" in result` check is always true for a Markdown table,
        # so instead verify that the raw cell text does not survive unescaped.
        # NOTE(review): the exact escape sequence the parser emits is not
        # visible here (presumably an HTML entity) -- confirm and pin it.
        assert "value|with|pipes" not in result
        assert "value" in result
        assert "pipes" in result

    def test_newline_escaped(self, parser, tmp_path):
        """A newline inside a cell should be escaped as <br>."""
        xls_path = tmp_path / "newline.xls"
        _create_xls(xls_path, {
            "Sheet1": [["Header"], ["line1\nline2"]],
        })
        result = parser.parse(str(xls_path))
        assert "line1<br>line2" in result

    def test_backtick_escaped(self, parser, tmp_path):
        """A backtick inside a cell should be escaped, not emitted verbatim."""
        xls_path = tmp_path / "backtick.xls"
        _create_xls(xls_path, {
            "Sheet1": [["Header"], ["code `snippet`"]],
        })
        result = parser.parse(str(xls_path))
        # Checking only that a backtick appears would pass without any
        # escaping; verify the raw cell text is not reproduced as-is.
        # NOTE(review): the exact escape sequence the parser emits is not
        # visible here (presumably an HTML entity) -- confirm and pin it.
        assert "code `snippet`" not in result
        assert "snippet" in result

    def test_empty_cell_becomes_empty(self, parser, tmp_path):
        """An unwritten cell should be rendered as an empty string."""
        xls_path = tmp_path / "empty_cell.xls"
        wb = xlwt.Workbook()
        ws = wb.add_sheet("Sheet1")
        ws.write(0, 0, "A")
        ws.write(0, 1, "B")
        ws.write(1, 0, "val")
        # cell (1,1) is not written — will be empty
        wb.save(str(xls_path))

        result = parser.parse(str(xls_path))
        assert "| val | |" in result

    def test_sheet_name_as_heading(self, parser, tmp_path):
        """The worksheet name should be used as a ## heading."""
        xls_path = tmp_path / "named.xls"
        _create_xls(xls_path, {
            "Sales Report": [["Month", "Revenue"], ["Jan", "1000"]],
        })
        result = parser.parse(str(xls_path))
        assert "## Sales Report" in result

    def test_nonexistent_file_raises(self, parser):
        """Parsing a missing file raises ParseError with a file name and reason."""
        with pytest.raises(ParseError) as exc_info:
            parser.parse("/nonexistent/path/file.xls")
        # Inspect the exception after the block; statements placed after the
        # raising call inside the block would never execute.
        assert "file.xls" in exc_info.value.file_name
        assert exc_info.value.reason != ""

    def test_corrupted_file_raises(self, parser, tmp_path):
        """Parsing a file that is not valid XLS raises ParseError."""
        xls_path = tmp_path / "corrupted.xls"
        xls_path.write_bytes(b"this is not an xls file")
        with pytest.raises(ParseError) as exc_info:
            parser.parse(str(xls_path))
        assert "corrupted.xls" in exc_info.value.file_name

    def test_parse_error_contains_filename(self, parser):
        """ParseError.file_name holds the base name, not the full path."""
        with pytest.raises(ParseError) as exc_info:
            parser.parse("/no/such/report.xls")
        assert exc_info.value.file_name == "report.xls"

    def test_numeric_values(self, parser, tmp_path):
        """Numeric cell values should be converted to strings correctly."""
        xls_path = tmp_path / "numeric.xls"
        _create_xls(xls_path, {
            "Sheet1": [["Int", "Float"], [42, 3.14]],
        })
        result = parser.parse(str(xls_path))
        assert "42" in result
        assert "3.14" in result

    def test_crlf_escaped(self, parser, tmp_path):
        """A \\r\\n sequence inside a cell should be escaped as <br>."""
        xls_path = tmp_path / "crlf.xls"
        _create_xls(xls_path, {
            "Sheet1": [["Header"], ["line1\r\nline2"]],
        })
        result = parser.parse(str(xls_path))
        assert "line1<br>line2" in result
|
||||
Reference in New Issue
Block a user