Initial commit: AI 知识库文档智能分块工具

This commit is contained in:
AI Knowledge Splitter
2026-03-02 17:38:28 +08:00
commit 92e7fc5bda
160 changed files with 9577 additions and 0 deletions

View File

@@ -0,0 +1,119 @@
"""旧版 Word (.doc) 解析器
优先级:
1. Windows + pywin32 → 通过 Word COM 接口转换
2. LibreOffice → 跨平台 fallback
"""
import os
import subprocess
import sys
import tempfile
from typing import List
from exceptions import ParseError
from parsers.base import BaseParser
from parsers.doc_parser import DocParser
class LegacyDocParser(BaseParser):
    """Parser for legacy Word ``.doc`` files.

    The input is first converted to ``.docx`` inside a temporary directory and
    the converted file is then handed to :class:`DocParser`.  Two converters
    are tried, in this order:

    1. On Windows, Microsoft Word driven through the pywin32 COM interface.
    2. LibreOffice in headless mode (any platform), used as the fallback.
    """

    # Executable names tried for LibreOffice, in order.  Some installations
    # only expose the ``soffice`` launcher on PATH, so both names are probed.
    _LIBREOFFICE_BINARIES = ("libreoffice", "soffice")

    def __init__(self):
        # Delegate that parses the .docx produced by the conversion step.
        self._docx_parser = DocParser()

    def supported_extensions(self) -> List[str]:
        """Return the file extensions handled by this parser."""
        return [".doc"]

    def parse(self, file_path: str) -> str:
        """Convert a ``.doc`` file to ``.docx`` and return the parsed result.

        Args:
            file_path: Path of the ``.doc`` file to parse.

        Returns:
            Whatever ``DocParser.parse`` returns for the converted file.

        Raises:
            ParseError: If the conversion or the subsequent parsing fails, or
                if no usable converter is available.
        """
        file_name = os.path.basename(file_path)

        # On Windows, prefer the Word COM interface.
        if sys.platform == "win32":
            try:
                return self._parse_via_com(file_path, file_name)
            except ParseError:
                # Word was reachable but the document itself could not be
                # converted or parsed: report that failure to the caller.
                raise
            except Exception:
                # COM is unavailable (pywin32 missing, Word not installed,
                # ...).  Deliberate best-effort: fall through to LibreOffice.
                pass

        return self._parse_via_libreoffice(file_path, file_name)

    def _parse_via_com(self, file_path: str, file_name: str) -> str:
        """Convert ``.doc`` to ``.docx`` with Microsoft Word via pywin32 COM.

        Args:
            file_path: Path of the ``.doc`` file to convert.
            file_name: Base name of the file, used in error reports and to
                name the converted file.

        Returns:
            The result of parsing the converted ``.docx`` file.

        Raises:
            RuntimeError: If pywin32 cannot be imported.
            ParseError: If Word was started but the conversion failed, or the
                converted file is missing afterwards.
            Exception: Any error raised while starting Word is propagated
                unchanged so that ``parse`` can fall back to LibreOffice.
        """
        try:
            import win32com.client
            import pythoncom
        except ImportError as exc:
            raise RuntimeError("pywin32 未安装") from exc

        abs_path = os.path.abspath(file_path)
        with tempfile.TemporaryDirectory() as tmp_dir:
            docx_path = os.path.join(tmp_dir, os.path.splitext(file_name)[0] + ".docx")
            pythoncom.CoInitialize()
            word = None
            doc = None
            try:
                # Starting Word is intentionally NOT wrapped in ParseError:
                # when Word is not installed this raises a COM error, which
                # the caller treats as "COM unavailable" and falls back.
                word = win32com.client.Dispatch("Word.Application")
                try:
                    word.Visible = False
                    word.DisplayAlerts = False
                    doc = word.Documents.Open(abs_path, ReadOnly=True)
                    # SaveAs2 format 16 = wdFormatDocumentDefault (.docx)
                    doc.SaveAs2(os.path.abspath(docx_path), FileFormat=16)
                    doc.Close(False)
                    # Mark as closed so the cleanup below does not close twice.
                    doc = None
                except Exception as e:
                    raise ParseError(file_name, f"Word COM 转换失败: {e}") from e
            finally:
                # Best-effort cleanup: errors while closing must not mask the
                # original failure (or a successful conversion).
                if doc is not None:
                    try:
                        doc.Close(False)
                    except Exception:
                        pass
                if word is not None:
                    try:
                        word.Quit()
                    except Exception:
                        pass
                pythoncom.CoUninitialize()

            if not os.path.exists(docx_path):
                raise ParseError(file_name, "Word COM 转换后未找到 .docx 文件")
            # Parse while the temporary directory still exists.
            return self._docx_parser.parse(docx_path)

    def _parse_via_libreoffice(self, file_path: str, file_name: str) -> str:
        """Convert ``.doc`` to ``.docx`` with headless LibreOffice.

        Args:
            file_path: Path of the ``.doc`` file to convert.
            file_name: Base name of the file, used in error reports and to
                locate the converted file.

        Returns:
            The result of parsing the converted ``.docx`` file.

        Raises:
            ParseError: If no LibreOffice executable is found, the conversion
                times out or exits with an error, or the converted file is
                missing afterwards.
        """
        source_path = os.path.abspath(file_path)
        with tempfile.TemporaryDirectory() as tmp_dir:
            for binary in self._LIBREOFFICE_BINARIES:
                try:
                    subprocess.run(
                        [
                            binary, "--headless", "--convert-to", "docx",
                            "--outdir", tmp_dir, source_path,
                        ],
                        capture_output=True,
                        timeout=120,
                        check=True,
                    )
                except FileNotFoundError:
                    # This executable name is not on PATH; try the next one.
                    continue
                except subprocess.TimeoutExpired as e:
                    raise ParseError(file_name, "LibreOffice 转换超时120秒") from e
                except subprocess.CalledProcessError as e:
                    stderr = e.stderr.decode("utf-8", errors="replace") if e.stderr else "未知错误"
                    raise ParseError(file_name, f"LibreOffice 转换失败: {stderr}") from e
                break
            else:
                # None of the candidate executables could be launched.
                raise ParseError(
                    file_name,
                    "无法处理 .doc 文件。Windows 需要安装 Microsoft Word"
                    "其他系统需要安装 LibreOffice: https://www.libreoffice.org/download/",
                )

            # LibreOffice names the output after the input's base name.
            base_name = os.path.splitext(file_name)[0] + ".docx"
            converted_path = os.path.join(tmp_dir, base_name)
            if not os.path.exists(converted_path):
                raise ParseError(file_name, "LibreOffice 转换后未找到 .docx 文件")
            # Parse while the temporary directory still exists.
            return self._docx_parser.parse(converted_path)