120 lines
4.3 KiB
Python
120 lines
4.3 KiB
Python
|
|
"""Parser for legacy Word (.doc) files.

Conversion backends, in priority order:
1. Windows + pywin32 -> convert through the Word COM interface
2. LibreOffice -> cross-platform fallback
"""
|
|||
|
|
|
|||
|
|
import os
|
|||
|
|
import subprocess
|
|||
|
|
import sys
|
|||
|
|
import tempfile
|
|||
|
|
from typing import List
|
|||
|
|
|
|||
|
|
from exceptions import ParseError
|
|||
|
|
from parsers.base import BaseParser
|
|||
|
|
from parsers.doc_parser import DocParser
|
|||
|
|
|
|||
|
|
|
|||
|
|
class LegacyDocParser(BaseParser):
    """Parser for legacy ``.doc`` files.

    A ``.doc`` file is first converted to ``.docx`` inside a temporary
    directory and the result is handed to ``DocParser``. Two conversion
    backends are tried, in this order:

    1. Microsoft Word through the pywin32 COM interface (Windows only).
    2. LibreOffice in headless mode (any platform).
    """

    # Word's WdSaveFormat value wdFormatDocumentDefault, i.e. "save as .docx".
    _WD_FORMAT_DOCX = 16
    # Seconds a single LibreOffice conversion may take before it is aborted.
    _LIBREOFFICE_TIMEOUT = 120
    # Launcher names to look for on PATH; many installs only ship ``soffice``.
    _LIBREOFFICE_BINARIES = ("libreoffice", "soffice")

    def __init__(self):
        """Create the parser together with the .docx parser it delegates to."""
        self._docx_parser = DocParser()

    def supported_extensions(self) -> List[str]:
        """Return the file extensions handled by this parser."""
        return [".doc"]

    def parse(self, file_path: str) -> str:
        """Convert the ``.doc`` file at *file_path* to ``.docx`` and parse it.

        Args:
            file_path: Path of the ``.doc`` file.

        Returns:
            Whatever ``DocParser.parse`` returns for the converted document.

        Raises:
            ParseError: If Word was available but failed to convert the file,
                or if the LibreOffice fallback is missing or fails.
        """
        file_name = os.path.basename(file_path)

        # On Windows, prefer the Word COM interface.
        if sys.platform == "win32":
            try:
                return self._parse_via_com(file_path, file_name)
            except ParseError:
                # Word ran but could not convert the file: report that as is.
                raise
            except Exception:
                # The COM backend itself is unavailable (pywin32 or Word is
                # not installed); deliberately fall through to LibreOffice.
                pass

        return self._parse_via_libreoffice(file_path, file_name)

    def _parse_via_com(self, file_path: str, file_name: str) -> str:
        """Convert .doc to .docx by driving Microsoft Word through COM.

        Args:
            file_path: Path of the ``.doc`` file.
            file_name: Base name of the file, used in error reports and to
                name the converted document.

        Returns:
            The result of parsing the converted ``.docx`` file.

        Raises:
            RuntimeError: If pywin32 is not installed or the Word application
                cannot be started. ``parse`` treats this as "backend
                unavailable" and falls back to LibreOffice.
            ParseError: If Word started but the conversion failed, or no
                ``.docx`` file was produced.
        """
        try:
            import win32com.client
            import pythoncom
        except ImportError as e:
            raise RuntimeError("pywin32 未安装") from e

        # Word is given absolute paths for both the input and the output.
        abs_path = os.path.abspath(file_path)

        with tempfile.TemporaryDirectory() as tmp_dir:
            docx_path = os.path.join(
                tmp_dir, os.path.splitext(file_name)[0] + ".docx"
            )

            pythoncom.CoInitialize()
            word = None
            doc = None
            try:
                # Starting Word is kept apart from the conversion itself: a
                # failure here means the backend is unavailable and must NOT
                # surface as ParseError, otherwise parse() would never reach
                # the LibreOffice fallback.
                try:
                    word = win32com.client.Dispatch("Word.Application")
                except Exception as e:
                    raise RuntimeError(f"Microsoft Word 不可用: {e}") from e

                try:
                    word.Visible = False
                    word.DisplayAlerts = False
                    doc = word.Documents.Open(abs_path, ReadOnly=True)
                    doc.SaveAs2(
                        os.path.abspath(docx_path),
                        FileFormat=self._WD_FORMAT_DOCX,
                    )
                    doc.Close(False)
                    doc = None
                except Exception as e:
                    raise ParseError(file_name, f"Word COM 转换失败: {e}") from e
            finally:
                # Best-effort cleanup: errors while closing must not mask the
                # exception (if any) that is already propagating.
                if doc is not None:
                    try:
                        doc.Close(False)
                    except Exception:
                        pass
                if word is not None:
                    try:
                        word.Quit()
                    except Exception:
                        pass
                pythoncom.CoUninitialize()

            if not os.path.exists(docx_path):
                raise ParseError(file_name, "Word COM 转换后未找到 .docx 文件")

            # Parse while the temporary directory still exists.
            return self._docx_parser.parse(docx_path)

    def _missing_converter_error(self, file_name: str) -> ParseError:
        """Build the error raised when no converter can be found."""
        return ParseError(
            file_name,
            "无法处理 .doc 文件。Windows 需要安装 Microsoft Word,"
            "其他系统需要安装 LibreOffice: https://www.libreoffice.org/download/",
        )

    def _parse_via_libreoffice(self, file_path: str, file_name: str) -> str:
        """Convert .doc to .docx with headless LibreOffice (fallback backend).

        Args:
            file_path: Path of the ``.doc`` file.
            file_name: Base name of the file, used in error reports and to
                locate the converted document.

        Returns:
            The result of parsing the converted ``.docx`` file.

        Raises:
            ParseError: If no LibreOffice launcher is found, the conversion
                times out or exits with an error, or no ``.docx`` file was
                produced.
        """
        import shutil

        # Use the first launcher name that resolves on PATH.
        executable = next(
            (
                found
                for found in map(shutil.which, self._LIBREOFFICE_BINARIES)
                if found
            ),
            None,
        )
        if executable is None:
            raise self._missing_converter_error(file_name)

        with tempfile.TemporaryDirectory() as tmp_dir:
            try:
                subprocess.run(
                    [
                        executable, "--headless", "--convert-to", "docx",
                        "--outdir", tmp_dir, os.path.abspath(file_path),
                    ],
                    capture_output=True,
                    timeout=self._LIBREOFFICE_TIMEOUT,
                    check=True,
                )
            except FileNotFoundError as e:
                # The launcher vanished between lookup and execution.
                raise self._missing_converter_error(file_name) from e
            except subprocess.TimeoutExpired as e:
                raise ParseError(
                    file_name,
                    f"LibreOffice 转换超时({self._LIBREOFFICE_TIMEOUT}秒)",
                ) from e
            except subprocess.CalledProcessError as e:
                stderr = (
                    e.stderr.decode("utf-8", errors="replace")
                    if e.stderr
                    else "未知错误"
                )
                raise ParseError(
                    file_name, f"LibreOffice 转换失败: {stderr}"
                ) from e

            # The converted file is expected under the input's stem + ".docx".
            base_name = os.path.splitext(file_name)[0] + ".docx"
            converted_path = os.path.join(tmp_dir, base_name)

            # A successful exit does not prove output exists, so check.
            if not os.path.exists(converted_path):
                raise ParseError(file_name, "LibreOffice 转换后未找到 .docx 文件")

            # Parse while the temporary directory still exists.
            return self._docx_parser.parse(converted_path)
|