"""旧版 Word (.doc) 解析器 优先级: 1. Windows + pywin32 → 通过 Word COM 接口转换 2. LibreOffice → 跨平台 fallback """ import os import subprocess import sys import tempfile from typing import List from exceptions import ParseError from parsers.base import BaseParser from parsers.doc_parser import DocParser class LegacyDocParser(BaseParser): """旧版 .doc 文件解析器,自动选择最佳转换方式""" def __init__(self): self._docx_parser = DocParser() def supported_extensions(self) -> List[str]: return [".doc"] def parse(self, file_path: str) -> str: file_name = os.path.basename(file_path) # Windows 优先尝试 Word COM 接口 if sys.platform == "win32": try: return self._parse_via_com(file_path, file_name) except ParseError: raise except Exception: # COM 失败(Word 未安装等),fallback 到 LibreOffice pass return self._parse_via_libreoffice(file_path, file_name) def _parse_via_com(self, file_path: str, file_name: str) -> str: """通过 pywin32 COM 接口调用 Microsoft Word 转换 .doc → .docx""" try: import win32com.client import pythoncom except ImportError: raise RuntimeError("pywin32 未安装") abs_path = os.path.abspath(file_path) with tempfile.TemporaryDirectory() as tmp_dir: docx_path = os.path.join(tmp_dir, os.path.splitext(file_name)[0] + ".docx") pythoncom.CoInitialize() word = None doc = None try: word = win32com.client.Dispatch("Word.Application") word.Visible = False word.DisplayAlerts = False doc = word.Documents.Open(abs_path, ReadOnly=True) # SaveAs2 格式 16 = wdFormatDocumentDefault (.docx) doc.SaveAs2(os.path.abspath(docx_path), FileFormat=16) doc.Close(False) doc = None except Exception as e: raise ParseError(file_name, f"Word COM 转换失败: {e}") finally: if doc: try: doc.Close(False) except Exception: pass if word: try: word.Quit() except Exception: pass pythoncom.CoUninitialize() if not os.path.exists(docx_path): raise ParseError(file_name, "Word COM 转换后未找到 .docx 文件") return self._docx_parser.parse(docx_path) def _parse_via_libreoffice(self, file_path: str, file_name: str) -> str: """通过 LibreOffice 转换 .doc → .docx(跨平台 fallback)""" with tempfile.TemporaryDirectory() as tmp_dir: try: subprocess.run( [ "libreoffice", "--headless", "--convert-to", "docx", "--outdir", tmp_dir, os.path.abspath(file_path), ], capture_output=True, timeout=120, check=True, ) except FileNotFoundError: raise ParseError( file_name, "无法处理 .doc 文件。Windows 需要安装 Microsoft Word," "其他系统需要安装 LibreOffice: https://www.libreoffice.org/download/", ) except subprocess.TimeoutExpired: raise ParseError(file_name, "LibreOffice 转换超时(120秒)") except subprocess.CalledProcessError as e: stderr = e.stderr.decode("utf-8", errors="replace") if e.stderr else "未知错误" raise ParseError(file_name, f"LibreOffice 转换失败: {stderr}") base_name = os.path.splitext(file_name)[0] + ".docx" converted_path = os.path.join(tmp_dir, base_name) if not os.path.exists(converted_path): raise ParseError(file_name, "LibreOffice 转换后未找到 .docx 文件") return self._docx_parser.parse(converted_path)