Files
bigwo/parsers/xls_parser.py

85 lines
2.4 KiB
Python
Raw Permalink Normal View History

"""XLS 解析器,使用 xlrd 提取 Excel 数据并转换为 Markdown 表格格式"""
import os
from typing import List
import xlrd
from exceptions import ParseError
from parsers.base import BaseParser
class XlsParser(BaseParser):
    """Parser for legacy ``.xls`` workbooks.

    Opens the workbook with xlrd, walks every sheet, skips empty sheets and
    renders each remaining sheet as a Markdown table under a ``##`` heading.
    """

    def supported_extensions(self) -> List[str]:
        """Return the file extensions handled by this parser."""
        return [".xls"]

    def parse(self, file_path: str) -> str:
        """Convert the workbook at *file_path* to Markdown.

        Args:
            file_path: Path of the ``.xls`` file to read.

        Returns:
            One Markdown section per non-empty sheet, separated by a blank
            line; an empty string when no sheet produced any output.

        Raises:
            ParseError: If the workbook cannot be opened or converted. The
                underlying exception is attached as ``__cause__``.
        """
        file_name = os.path.basename(file_path)
        try:
            workbook = xlrd.open_workbook(filename=file_path)
        except Exception as e:
            # Chain explicitly so the root cause survives in tracebacks.
            raise ParseError(file_name, f"XLS 文件打开失败: {e}") from e
        try:
            return self._workbook_to_md(workbook)
        except ParseError:
            # Already in the domain exception type; do not re-wrap.
            raise
        except Exception as e:
            raise ParseError(file_name, f"XLS 文件解析失败: {e}") from e

    def _workbook_to_md(self, workbook) -> str:
        """Render every non-empty sheet of *workbook* and join the results."""
        rendered = (self._sheet_to_md(sheet) for sheet in workbook.sheets())
        # Empty sheets render as "" and are dropped here.
        return "\n\n".join(md for md in rendered if md)

    def _sheet_to_md(self, sheet) -> str:
        """Render a single sheet as a Markdown table.

        The first row is used as the table header; every following row
        becomes a data row. Returns ``""`` for a sheet with no rows or no
        columns.
        """
        if sheet.nrows == 0 or sheet.ncols == 0:
            return ""

        rows = [
            [self._escape_cell(value) for value in sheet.row_values(row_idx)]
            for row_idx in range(sheet.nrows)
        ]
        headers = rows[0]

        lines = [
            f"## {sheet.name}",
            "",
            self._format_row(headers),
            # Separator row: one "---" per header column.
            self._format_row(["---"] * len(headers)),
        ]
        lines.extend(self._format_row(cells) for cells in rows[1:])
        return "\n".join(lines)

    @staticmethod
    def _format_row(cells: List[str]) -> str:
        """Join already-escaped cell strings into one Markdown table row."""
        return "| " + " | ".join(cells) + " |"

    @staticmethod
    def _escape_cell(value) -> str:
        """Convert a cell value to text that is safe inside a Markdown table.

        ``None`` and empty values become ``""``. Line breaks (``\\r\\n``,
        ``\\r`` and ``\\n``) become ``<br>``, and ``|`` / backtick characters
        are replaced by HTML entities so they cannot break the row.
        Whole-number floats are rendered without the trailing ``.0``.
        """
        if value is None:
            return ""

        # xlrd reports numeric cells as floats, so a cell holding 42 would
        # otherwise be rendered as "42.0". Values at or beyond 1e16 are left
        # alone because str() already switches to exponent notation there.
        if isinstance(value, float) and value.is_integer() and abs(value) < 1e16:
            value = int(value)

        cell_str = str(value)
        # Empty cells come back from xlrd as empty strings.
        if not cell_str:
            return ""

        # "\r\n" must be handled first so it yields a single <br>; a lone
        # "\r" is also a line ending in Markdown and would split the row.
        for line_break in ("\r\n", "\r", "\n"):
            cell_str = cell_str.replace(line_break, "<br>")

        return cell_str.replace("|", "&#124;").replace("`", "&#96;")