Initial commit: AI 知识库文档智能分块工具
This commit is contained in:
119
parsers/legacy_doc_parser.py
Normal file
119
parsers/legacy_doc_parser.py
Normal file
@@ -0,0 +1,119 @@
|
||||
"""旧版 Word (.doc) 解析器
|
||||
|
||||
优先级:
|
||||
1. Windows + pywin32 → 通过 Word COM 接口转换
|
||||
2. LibreOffice → 跨平台 fallback
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
from typing import List
|
||||
|
||||
from exceptions import ParseError
|
||||
from parsers.base import BaseParser
|
||||
from parsers.doc_parser import DocParser
|
||||
|
||||
|
||||
class LegacyDocParser(BaseParser):
    """Parser for legacy Word (.doc) files.

    The input is first converted to .docx inside a temporary directory and the
    converted file is then handed to the wrapped ``DocParser``.

    Conversion backends, in priority order:

    1. Microsoft Word through the pywin32 COM interface (Windows only).
    2. LibreOffice in headless mode (fallback on every platform).
    """

    # Executable names tried in order. Linux packages usually ship
    # ``libreoffice``; Windows and macOS installs typically only expose
    # ``soffice``.
    _LIBREOFFICE_COMMANDS = ("libreoffice", "soffice")

    def __init__(self):
        self._docx_parser = DocParser()

    def supported_extensions(self) -> List[str]:
        """Return the file extensions handled by this parser."""
        return [".doc"]

    def parse(self, file_path: str) -> str:
        """Convert a .doc file to .docx and return the parsed result.

        Args:
            file_path: Path to the .doc file.

        Returns:
            Whatever the wrapped ``DocParser.parse`` returns for the converted
            file.

        Raises:
            ParseError: If Word fails to convert the document, or if the
                LibreOffice fallback is unavailable, times out or fails.
        """
        file_name = os.path.basename(file_path)

        # On Windows, prefer the Word COM interface.
        if sys.platform == "win32":
            try:
                return self._parse_via_com(file_path, file_name)
            except ParseError:
                # Word was reachable but the conversion itself failed.
                raise
            except Exception:
                # COM backend unavailable (pywin32 or Word not installed,
                # COM initialisation failed, ...): deliberately best-effort,
                # fall through to LibreOffice.
                pass

        return self._parse_via_libreoffice(file_path, file_name)

    def _parse_via_com(self, file_path: str, file_name: str) -> str:
        """Convert .doc to .docx with Microsoft Word via pywin32 COM and parse it.

        Args:
            file_path: Path to the .doc file.
            file_name: Base name of the file, used in error messages and to
                name the converted file.

        Returns:
            The result of parsing the converted .docx file.

        Raises:
            RuntimeError: If pywin32 is not installed or Word cannot be
                started. ``parse`` treats this as "backend unavailable" and
                falls back to LibreOffice.
            ParseError: If Word started but failed to convert the document,
                or the converted file is missing afterwards.
        """
        try:
            import win32com.client
            import pythoncom
        except ImportError as e:
            raise RuntimeError("pywin32 未安装") from e

        abs_path = os.path.abspath(file_path)

        with tempfile.TemporaryDirectory() as tmp_dir:
            docx_path = os.path.join(tmp_dir, os.path.splitext(file_name)[0] + ".docx")

            pythoncom.CoInitialize()
            word = None
            doc = None
            try:
                # Starting Word is kept separate from the conversion: a failure
                # here means the backend is unavailable, which must NOT become
                # a ParseError, otherwise parse() would never reach LibreOffice.
                try:
                    word = win32com.client.Dispatch("Word.Application")
                except Exception as e:
                    raise RuntimeError(f"Microsoft Word 不可用: {e}") from e

                try:
                    word.Visible = False
                    word.DisplayAlerts = False
                    doc = word.Documents.Open(abs_path, ReadOnly=True)
                    # SaveAs2 FileFormat 16 = wdFormatDocumentDefault (.docx)
                    doc.SaveAs2(os.path.abspath(docx_path), FileFormat=16)
                    doc.Close(False)
                    doc = None
                except Exception as e:
                    raise ParseError(file_name, f"Word COM 转换失败: {e}") from e
            finally:
                # Best-effort cleanup: errors while closing must not mask the
                # original exception.
                if doc is not None:
                    try:
                        doc.Close(False)
                    except Exception:
                        pass
                if word is not None:
                    try:
                        word.Quit()
                    except Exception:
                        pass
                pythoncom.CoUninitialize()

            if not os.path.exists(docx_path):
                raise ParseError(file_name, "Word COM 转换后未找到 .docx 文件")

            # Parse while the temporary directory still exists.
            return self._docx_parser.parse(docx_path)

    def _parse_via_libreoffice(self, file_path: str, file_name: str) -> str:
        """Convert .doc to .docx with headless LibreOffice and parse it.

        Args:
            file_path: Path to the .doc file.
            file_name: Base name of the file, used in error messages and to
                locate the converted file.

        Returns:
            The result of parsing the converted .docx file.

        Raises:
            ParseError: If no LibreOffice executable is found, the conversion
                times out (120 seconds) or exits with an error, or the
                converted file is missing afterwards.
        """
        source_path = os.path.abspath(file_path)

        with tempfile.TemporaryDirectory() as tmp_dir:
            for command in self._LIBREOFFICE_COMMANDS:
                try:
                    subprocess.run(
                        [
                            command, "--headless", "--convert-to", "docx",
                            "--outdir", tmp_dir, source_path,
                        ],
                        capture_output=True,
                        timeout=120,
                        check=True,
                    )
                except FileNotFoundError:
                    # This executable name is not on PATH; try the next one.
                    continue
                except subprocess.TimeoutExpired as e:
                    raise ParseError(file_name, "LibreOffice 转换超时(120秒)") from e
                except subprocess.CalledProcessError as e:
                    stderr = e.stderr.decode("utf-8", errors="replace") if e.stderr else "未知错误"
                    raise ParseError(file_name, f"LibreOffice 转换失败: {stderr}") from e
                break
            else:
                # None of the candidate executables could be launched.
                raise ParseError(
                    file_name,
                    "无法处理 .doc 文件。Windows 需要安装 Microsoft Word,"
                    "其他系统需要安装 LibreOffice: https://www.libreoffice.org/download/",
                )

            base_name = os.path.splitext(file_name)[0] + ".docx"
            converted_path = os.path.join(tmp_dir, base_name)

            # LibreOffice can exit with status 0 without producing any output,
            # so the result is checked explicitly.
            if not os.path.exists(converted_path):
                raise ParseError(file_name, "LibreOffice 转换后未找到 .docx 文件")

            return self._docx_parser.parse(converted_path)
|
||||
Reference in New Issue
Block a user