"""XLS 解析器,使用 xlrd 提取 Excel 数据并转换为 Markdown 表格格式""" import os from typing import List import xlrd from exceptions import ParseError from parsers.base import BaseParser class XlsParser(BaseParser): """XLS 解析器,遍历所有 sheet,跳过空 sheet,转 Markdown 表格""" def supported_extensions(self) -> List[str]: return [".xls"] def parse(self, file_path: str) -> str: file_name = os.path.basename(file_path) try: workbook = xlrd.open_workbook(filename=file_path) except Exception as e: raise ParseError(file_name, f"XLS 文件打开失败: {e}") try: return self._workbook_to_md(workbook) except ParseError: raise except Exception as e: raise ParseError(file_name, f"XLS 文件解析失败: {e}") def _workbook_to_md(self, workbook) -> str: """将整个工作簿转换为 Markdown""" sheet_parts = [] for sheet in workbook.sheets(): md = self._sheet_to_md(sheet) if md: sheet_parts.append(md) return "\n\n".join(sheet_parts) def _sheet_to_md(self, sheet) -> str: """将单个工作表转换为 Markdown 表格""" # 跳过空 sheet if sheet.nrows == 0 or sheet.ncols == 0: return "" # 第一行作为表头 headers = [self._escape_cell(cell) for cell in sheet.row_values(0)] lines = [f"## {sheet.name}", ""] # 表头行 lines.append("| " + " | ".join(headers) + " |") # 分隔行 lines.append("| " + " | ".join("---" for _ in headers) + " |") # 数据行 for row_idx in range(1, sheet.nrows): cells = [self._escape_cell(cell) for cell in sheet.row_values(row_idx)] lines.append("| " + " | ".join(cells) + " |") return "\n".join(lines) @staticmethod def _escape_cell(value) -> str: """转义单元格内容,避免破坏 Markdown 表格结构""" if value is None: return "" cell_str = str(value) # xlrd 返回的空单元格可能是空字符串 if not cell_str: return "" cell_str = cell_str.replace("\r\n", "
") cell_str = cell_str.replace("\n", "
") cell_str = cell_str.replace("|", "|") cell_str = cell_str.replace("`", "`") return cell_str