85 lines
2.4 KiB
Python
85 lines
2.4 KiB
Python
|
|
"""XLS 解析器,使用 xlrd 提取 Excel 数据并转换为 Markdown 表格格式"""
|
|||
|
|
|
|||
|
|
import os
|
|||
|
|
from typing import List
|
|||
|
|
|
|||
|
|
import xlrd
|
|||
|
|
|
|||
|
|
from exceptions import ParseError
|
|||
|
|
from parsers.base import BaseParser
|
|||
|
|
|
|||
|
|
|
|||
|
|
class XlsParser(BaseParser):
    """Parser for legacy ``.xls`` workbooks.

    Every sheet of the workbook is visited; sheets without rows or columns
    are skipped, and each remaining sheet is rendered as a Markdown table
    under a ``## <sheet name>`` heading.
    """

    def supported_extensions(self) -> List[str]:
        """Return the file extensions handled by this parser."""
        return [".xls"]

    def parse(self, file_path: str) -> str:
        """Convert the workbook at ``file_path`` to Markdown.

        Args:
            file_path: Path of the ``.xls`` file to read.

        Returns:
            The Markdown of all non-empty sheets, separated by blank lines.
            The result is an empty string when no sheet has content.

        Raises:
            ParseError: If the workbook cannot be opened or converted. The
                underlying exception is attached as ``__cause__``.
        """
        file_name = os.path.basename(file_path)

        try:
            workbook = xlrd.open_workbook(filename=file_path)
        except Exception as e:
            raise ParseError(file_name, f"XLS 文件打开失败: {e}") from e

        try:
            return self._workbook_to_md(workbook)
        except ParseError:
            # Already in the parser's own error type; propagate untouched.
            raise
        except Exception as e:
            raise ParseError(file_name, f"XLS 文件解析失败: {e}") from e

    def _workbook_to_md(self, workbook) -> str:
        """Render every sheet of ``workbook`` and join the non-empty results."""
        rendered = (self._sheet_to_md(sheet) for sheet in workbook.sheets())
        # _sheet_to_md returns "" for empty sheets; drop those.
        return "\n\n".join(md for md in rendered if md)

    def _sheet_to_md(self, sheet) -> str:
        """Render a single sheet as a Markdown table.

        The first row is used as the table header and all following rows
        become data rows.

        Returns:
            The Markdown text, or an empty string when the sheet has no rows
            or no columns.
        """
        # Skip empty sheets.
        if sheet.nrows == 0 or sheet.ncols == 0:
            return ""

        headers = [self._escape_cell(cell) for cell in sheet.row_values(0)]

        lines = [
            f"## {sheet.name}",
            "",
            self._format_row(headers),
            # Delimiter row: one "---" per header column.
            self._format_row(["---"] * len(headers)),
        ]

        for row_idx in range(1, sheet.nrows):
            cells = [self._escape_cell(cell) for cell in sheet.row_values(row_idx)]
            lines.append(self._format_row(cells))

        return "\n".join(lines)

    @staticmethod
    def _format_row(cells: List[str]) -> str:
        """Join already-escaped cell strings into one Markdown table row."""
        return "| " + " | ".join(cells) + " |"

    @staticmethod
    def _escape_cell(value) -> str:
        """Convert a cell value to text that is safe inside a Markdown table.

        Line breaks become ``<br>`` so the cell stays on one physical line.
        Pipes and backticks are emitted as numeric character references so
        they render as the original characters without being interpreted as
        Markdown syntax.

        Args:
            value: The raw cell value; ``None`` is treated as an empty cell.

        Returns:
            The escaped cell text (possibly empty).
        """
        if value is None:
            return ""

        cell_str = str(value)

        # Handle "\r\n" first so it yields a single <br>, then the lone forms.
        cell_str = cell_str.replace("\r\n", "<br>")
        cell_str = cell_str.replace("\n", "<br>")
        cell_str = cell_str.replace("\r", "<br>")

        # A literal "|" would be read as a column delimiter and split the row.
        cell_str = cell_str.replace("|", "&#124;")
        # A literal "`" could open a code span, inside which character
        # references (including the pipe one above) are not decoded.
        cell_str = cell_str.replace("`", "&#96;")

        return cell_str
|