120 lines
4.3 KiB
Python
120 lines
4.3 KiB
Python
"""旧版 Word (.doc) 解析器

Conversion back ends, in order of preference:

1. Windows + pywin32 → convert through the Word COM interface
2. LibreOffice → cross-platform fallback
"""
|
||
|
||
import os
|
||
import subprocess
|
||
import sys
|
||
import tempfile
|
||
from typing import List
|
||
|
||
from exceptions import ParseError
|
||
from parsers.base import BaseParser
|
||
from parsers.doc_parser import DocParser
|
||
|
||
|
||
class LegacyDocParser(BaseParser):
    """Parser for legacy binary Word (.doc) files.

    The input is first converted to .docx in a temporary directory and the
    converted file is then handed to ``DocParser``. Conversion back ends, in
    order of preference:

    1. Microsoft Word driven through the pywin32 COM interface (Windows only).
    2. LibreOffice in headless mode (any platform).
    """

    # Value 16 is wdFormatDocumentDefault (.docx) for Document.SaveAs2.
    _WD_FORMAT_DOCX = 16

    # Upper bound, in seconds, for a single LibreOffice conversion.
    _LIBREOFFICE_TIMEOUT = 120

    # Executable names tried in order. "libreoffice" is kept first so existing
    # setups behave as before; "soffice" is tried only when the first name
    # cannot be found.
    _LIBREOFFICE_BINARIES = ("libreoffice", "soffice")

    def __init__(self):
        # The converted .docx is parsed by the regular docx parser.
        self._docx_parser = DocParser()

    def supported_extensions(self) -> List[str]:
        """Return the file extensions handled by this parser."""
        return [".doc"]

    def parse(self, file_path: str) -> str:
        """Convert ``file_path`` to .docx and return the parsed result.

        On Windows the Word COM back end is tried first. If it is unavailable
        (pywin32 or Word missing, or any other non-``ParseError`` failure),
        LibreOffice is used instead.

        Raises:
            ParseError: if the conversion itself fails, or if no converter
                is available.
        """
        file_name = os.path.basename(file_path)

        if sys.platform == "win32":
            try:
                return self._parse_via_com(file_path, file_name)
            except ParseError:
                # Word was reachable but the conversion failed: report it.
                raise
            except Exception:
                # Deliberate best-effort: the COM back end is unavailable,
                # so fall through to LibreOffice.
                pass

        return self._parse_via_libreoffice(file_path, file_name)

    def _parse_via_com(self, file_path: str, file_name: str) -> str:
        """Convert .doc to .docx with Microsoft Word through pywin32 COM.

        Raises:
            RuntimeError: if pywin32 is not installed or the Word application
                cannot be started. ``parse`` treats this as "back end
                unavailable" and falls back to LibreOffice.
            ParseError: if Word started but the conversion failed, or the
                converted file is missing afterwards.
        """
        try:
            import win32com.client
            import pythoncom
        except ImportError as exc:
            raise RuntimeError("pywin32 未安装") from exc

        abs_path = os.path.abspath(file_path)
        stem = os.path.splitext(file_name)[0]

        with tempfile.TemporaryDirectory() as tmp_dir:
            docx_path = os.path.join(tmp_dir, stem + ".docx")

            pythoncom.CoInitialize()
            try:
                # Starting Word is kept outside the ParseError wrapper below.
                # If Word is not installed this must NOT become a ParseError,
                # otherwise parse() would re-raise it and never reach the
                # LibreOffice fallback.
                try:
                    word = win32com.client.Dispatch("Word.Application")
                except Exception as exc:
                    raise RuntimeError(f"Word COM 不可用: {exc}") from exc

                doc = None
                try:
                    word.Visible = False
                    word.DisplayAlerts = False
                    doc = word.Documents.Open(abs_path, ReadOnly=True)
                    doc.SaveAs2(
                        os.path.abspath(docx_path),
                        FileFormat=self._WD_FORMAT_DOCX,
                    )
                    doc.Close(False)
                    doc = None
                except Exception as exc:
                    raise ParseError(
                        file_name, f"Word COM 转换失败: {exc}"
                    ) from exc
                finally:
                    # Best-effort cleanup: errors while closing must not mask
                    # the original failure (or a successful conversion).
                    if doc is not None:
                        try:
                            doc.Close(False)
                        except Exception:
                            pass
                    try:
                        word.Quit()
                    except Exception:
                        pass
            finally:
                pythoncom.CoUninitialize()

            if not os.path.exists(docx_path):
                raise ParseError(file_name, "Word COM 转换后未找到 .docx 文件")

            # Parse while the temporary directory still exists.
            return self._docx_parser.parse(docx_path)

    def _parse_via_libreoffice(self, file_path: str, file_name: str) -> str:
        """Convert .doc to .docx with headless LibreOffice, then parse it.

        Raises:
            ParseError: if LibreOffice is not installed, times out, exits
                with an error, or does not produce the expected file.
        """
        with tempfile.TemporaryDirectory() as tmp_dir:
            self._run_libreoffice(os.path.abspath(file_path), tmp_dir, file_name)

            # The expected output is the input's base name with a .docx suffix.
            converted_name = os.path.splitext(file_name)[0] + ".docx"
            converted_path = os.path.join(tmp_dir, converted_name)

            if not os.path.exists(converted_path):
                raise ParseError(file_name, "LibreOffice 转换后未找到 .docx 文件")

            # Parse while the temporary directory still exists.
            return self._docx_parser.parse(converted_path)

    def _run_libreoffice(self, source_path: str, out_dir: str, file_name: str) -> None:
        """Run the first available LibreOffice executable to write a .docx into ``out_dir``."""
        for binary in self._LIBREOFFICE_BINARIES:
            try:
                subprocess.run(
                    [
                        binary, "--headless", "--convert-to", "docx",
                        "--outdir", out_dir, source_path,
                    ],
                    capture_output=True,
                    timeout=self._LIBREOFFICE_TIMEOUT,
                    check=True,
                )
            except FileNotFoundError:
                # This executable name is not on PATH; try the next one.
                continue
            except subprocess.TimeoutExpired as exc:
                raise ParseError(
                    file_name,
                    f"LibreOffice 转换超时({self._LIBREOFFICE_TIMEOUT}秒)",
                ) from exc
            except subprocess.CalledProcessError as exc:
                stderr = (
                    exc.stderr.decode("utf-8", errors="replace")
                    if exc.stderr
                    else "未知错误"
                )
                raise ParseError(
                    file_name, f"LibreOffice 转换失败: {stderr}"
                ) from exc
            return

        # None of the candidate executables could be found.
        raise ParseError(
            file_name,
            "无法处理 .doc 文件。Windows 需要安装 Microsoft Word,"
            "其他系统需要安装 LibreOffice: https://www.libreoffice.org/download/",
        )
|