"""PDF file parser.

Uses fitz (PyMuPDF) to extract text and infers heading levels from font size.
"""

import os
|
|||
|
|
from collections import Counter
|
|||
|
|
from typing import List
|
|||
|
|
|
|||
|
|
import fitz
|
|||
|
|
|
|||
|
|
from exceptions import ParseError
|
|||
|
|
from parsers.base import BaseParser
|
|||
|
|
|
|||
|
|
|
|||
|
|
class PdfParser(BaseParser):
    """PDF parser that emits Markdown, inferring headings from font size.

    Text is read line by line. The most frequent span font size in the
    document is taken as the body size, and a line whose first span is
    larger than that is emitted as a Markdown heading.
    """

    # Body font size assumed when the document contains no sized text spans.
    _DEFAULT_BODY_FONT_SIZE = 12
    # A line whose size exceeds the body size by more than this becomes "##".
    _H2_MIN_SIZE_DIFF = 2
    # A line whose size exceeds the body size by more than this becomes "###".
    _H3_MIN_SIZE_DIFF = 0.5

    def supported_extensions(self) -> List[str]:
        """Return the file extensions handled by this parser."""
        return [".pdf"]

    def parse(self, file_path: str) -> str:
        """Parse a PDF file into Markdown text.

        Follows the core logic of MaxKB's
        PdfSplitHandle.handle_pdf_content(): extract the text blocks of
        every page (dict format), take the most common font size as the
        body size, map the size difference to a heading level
        (> 2 -> "##", > 0.5 -> "###") and join everything as Markdown.

        Args:
            file_path: Path of the PDF file to parse.

        Returns:
            The document content as Markdown text.

        Raises:
            ParseError: If the file cannot be opened or parsing fails.
        """
        file_name = os.path.basename(file_path)

        try:
            doc = fitz.open(file_path)
        except Exception as e:
            # Chain the cause so the underlying failure stays in the traceback.
            raise ParseError(file_name, f"PDF 文件打开失败: {e}") from e

        try:
            return self._extract_content(doc)
        except ParseError:
            raise
        except Exception as e:
            raise ParseError(file_name, f"PDF 解析失败: {e}") from e
        finally:
            # Always release the document, whether parsing succeeded or not.
            doc.close()

    @staticmethod
    def _extract_content(doc: fitz.Document) -> str:
        """Extract the text of a PDF document as Markdown.

        The document is walked once: every non-empty text line is recorded
        together with the font size of its first span, while the sizes of
        all spans are counted. The most common size is then used as the
        body size when the recorded lines are rendered.

        Args:
            doc: PyMuPDF document object.

        Returns:
            The document content as Markdown text, with null characters
            removed.
        """
        size_counts = Counter()
        collected = []  # (line text, font size of the line's first span)

        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            for block in page.get_text("dict")["blocks"]:
                if block["type"] != 0:  # skip non-text blocks
                    continue
                for line in block["lines"]:
                    spans = line["spans"]
                    if not spans:
                        continue
                    size_counts.update(
                        span["size"] for span in spans if span["size"] > 0
                    )
                    text = "".join(span["text"] for span in spans)
                    collected.append((text, spans[0]["size"]))

        # Body font size is the mode of all positive span sizes.
        if size_counts:
            body_font_size = size_counts.most_common(1)[0][0]
        else:
            body_font_size = PdfParser._DEFAULT_BODY_FONT_SIZE

        parts = []
        for raw_text, font_size in collected:
            # Strip null characters up front so they cannot hide an
            # otherwise empty line from the blank check below.
            text = raw_text.replace("\0", "")
            size_diff = font_size - body_font_size

            if not text.strip():
                # A blank line must never become an empty Markdown heading.
                parts.append(f"{text}\n")
            elif size_diff > PdfParser._H2_MIN_SIZE_DIFF:
                # Clearly larger than the body text.
                parts.append(f"## {text}\n\n")
            elif size_diff > PdfParser._H3_MIN_SIZE_DIFF:
                # Slightly larger than the body text.
                parts.append(f"### {text}\n\n")
            else:
                parts.append(f"{text}\n")

        return "".join(parts)
|