Files
bigwo/parsers/legacy_doc_parser.py
2026-03-02 17:38:28 +08:00

120 lines
4.3 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""旧版 Word (.doc) 解析器
优先级:
1. Windows + pywin32 → 通过 Word COM 接口转换
2. LibreOffice → 跨平台 fallback
"""
import os
import subprocess
import sys
import tempfile
from typing import List
from exceptions import ParseError
from parsers.base import BaseParser
from parsers.doc_parser import DocParser
class LegacyDocParser(BaseParser):
    """Parser for legacy Word (.doc) files.

    A .doc file is first converted to .docx inside a temporary directory and
    the result is handed to ``DocParser``. Two converters are available:

    1. Microsoft Word driven through COM (Windows with pywin32 only).
    2. LibreOffice in headless mode (any platform), used as the fallback.
    """

    # Executable names tried in order. Linux packages usually install
    # ``libreoffice``; the stock Windows and macOS installs ship ``soffice``.
    _LIBREOFFICE_EXECUTABLES = ("libreoffice", "soffice")

    # Upper bound, in seconds, for a single LibreOffice conversion run.
    _LIBREOFFICE_TIMEOUT_SECONDS = 120

    def __init__(self):
        # The converted .docx is always parsed by the regular .docx parser.
        self._docx_parser = DocParser()

    def supported_extensions(self) -> List[str]:
        """Return the file extensions handled by this parser."""
        return [".doc"]

    def parse(self, file_path: str) -> str:
        """Convert ``file_path`` to .docx and return the parsed result.

        On Windows the Word COM route is tried first. A ``ParseError`` from
        that route (Word was started but could not convert the document) is
        propagated unchanged. Any other exception means the COM route is not
        usable on this machine (pywin32 missing, Word not installed, ...) and
        the LibreOffice route is used instead.

        Raises:
            ParseError: if the selected converter fails or none is available.
        """
        file_name = os.path.basename(file_path)

        if sys.platform == "win32":
            try:
                return self._parse_via_com(file_path, file_name)
            except ParseError:
                raise
            except Exception:
                # Deliberate best-effort: the COM backend is unavailable, so
                # fall through to LibreOffice rather than failing here.
                pass

        return self._parse_via_libreoffice(file_path, file_name)

    def _parse_via_com(self, file_path: str, file_name: str) -> str:
        """Convert .doc to .docx with Microsoft Word via pywin32 COM, then parse.

        Raises:
            RuntimeError: if pywin32 cannot be imported or the Word COM server
                cannot be started; ``parse`` treats this as "backend
                unavailable" and falls back to LibreOffice.
            ParseError: if Word was started but opening/saving the document
                failed, or if no .docx file was produced.
        """
        try:
            import win32com.client
            import pythoncom
        except ImportError as exc:
            raise RuntimeError("pywin32 未安装") from exc

        abs_path = os.path.abspath(file_path)

        with tempfile.TemporaryDirectory() as tmp_dir:
            docx_path = os.path.join(tmp_dir, os.path.splitext(file_name)[0] + ".docx")

            pythoncom.CoInitialize()
            word = None
            doc = None
            try:
                try:
                    word = win32com.client.Dispatch("Word.Application")
                except Exception as exc:
                    # Raised as a non-ParseError on purpose so that parse()
                    # falls back to LibreOffice when Word cannot be started.
                    raise RuntimeError("无法启动 Microsoft Word COM 服务") from exc

                try:
                    word.Visible = False
                    word.DisplayAlerts = False
                    doc = word.Documents.Open(abs_path, ReadOnly=True)
                    # FileFormat 16 = wdFormatDocumentDefault (.docx)
                    doc.SaveAs2(os.path.abspath(docx_path), FileFormat=16)
                    doc.Close(False)
                    doc = None
                except Exception as exc:
                    raise ParseError(file_name, f"Word COM 转换失败: {exc}") from exc
            finally:
                # Best-effort cleanup: close a still-open document, quit Word,
                # then release COM. Cleanup errors must not mask the original
                # exception, so they are ignored.
                if doc is not None:
                    try:
                        doc.Close(False)
                    except Exception:
                        pass
                if word is not None:
                    try:
                        word.Quit()
                    except Exception:
                        pass
                pythoncom.CoUninitialize()

            if not os.path.exists(docx_path):
                raise ParseError(file_name, "Word COM 转换后未找到 .docx 文件")

            # Parse while the temporary directory still exists.
            return self._docx_parser.parse(docx_path)

    def _parse_via_libreoffice(self, file_path: str, file_name: str) -> str:
        """Convert .doc to .docx with headless LibreOffice, then parse.

        Each name in ``_LIBREOFFICE_EXECUTABLES`` is tried in order; the first
        one that can be launched performs the conversion.

        Raises:
            ParseError: if no LibreOffice executable can be launched, the
                conversion times out, the process exits with a non-zero
                status, or no .docx file was produced.
        """
        source_path = os.path.abspath(file_path)

        with tempfile.TemporaryDirectory() as tmp_dir:
            for executable in self._LIBREOFFICE_EXECUTABLES:
                command = [
                    executable, "--headless", "--convert-to", "docx",
                    "--outdir", tmp_dir, source_path,
                ]
                try:
                    subprocess.run(
                        command,
                        capture_output=True,
                        timeout=self._LIBREOFFICE_TIMEOUT_SECONDS,
                        check=True,
                    )
                except FileNotFoundError:
                    # This executable name is not installed; try the next one.
                    continue
                except subprocess.TimeoutExpired as exc:
                    raise ParseError(
                        file_name,
                        f"LibreOffice 转换超时{self._LIBREOFFICE_TIMEOUT_SECONDS}秒",
                    ) from exc
                except subprocess.CalledProcessError as exc:
                    stderr = (
                        exc.stderr.decode("utf-8", errors="replace")
                        if exc.stderr
                        else "未知错误"
                    )
                    raise ParseError(file_name, f"LibreOffice 转换失败: {stderr}") from exc
                break
            else:
                # None of the candidate executables could be launched.
                raise ParseError(
                    file_name,
                    "无法处理 .doc 文件。Windows 需要安装 Microsoft Word"
                    "其他系统需要安装 LibreOffice: https://www.libreoffice.org/download/",
                )

            base_name = os.path.splitext(file_name)[0] + ".docx"
            converted_path = os.path.join(tmp_dir, base_name)

            if not os.path.exists(converted_path):
                raise ParseError(file_name, "LibreOffice 转换后未找到 .docx 文件")

            # Parse while the temporary directory still exists.
            return self._docx_parser.parse(converted_path)