Files
urbanLifeline/difyPlugin/pdf/tools/pdf_to_markdown.py
2026-03-06 14:50:43 +08:00

336 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import base64
import re
from collections import OrderedDict
from collections.abc import Generator
from typing import Any
import fitz
from dify_plugin import Tool
from dify_plugin.entities.tool import ToolInvokeMessage
class PdfToMarkdownTool(Tool):
    """Dify tool that turns a PDF into one Markdown document without an LLM.

    What the conversion does:

    - Detects a table of contents and groups the output by chapter.
    - Emits page text and tables as Markdown.
    - Inlines raster images as base64 data URIs.
    - Rasterizes vector drawings to PNG and inlines them as base64.
    - Returns the result as a single .md file via create_blob_message.
    """

    # Regex fragments whose presence in a page's text marks that page as a
    # printed table of contents.
    _TOC_PATTERNS = [
        r"目录",
        r"目 录",
        r"\u3000录",
        r"Table of Contents",
        r"Contents",
        r"目次",
    ]
# ── entry point ──────────────────────────────────────────
def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
file = tool_parameters.get("file")
if not file:
yield self.create_text_message("Error: file is required")
return
include_images = self._to_bool(tool_parameters.get("include_images"), True)
image_dpi = self._to_int(tool_parameters.get("image_dpi"), 150)
image_dpi = max(72, min(image_dpi, 300))
max_image_bytes = 2 * 1024 * 1024 # skip images > 2 MB raw
doc = fitz.open(stream=file.blob, filetype="pdf")
try:
num_pages = len(doc)
# 1) Build chapter map (metadata TOC → printed TOC → none)
chapters, content_offset = self._build_chapter_map(doc, num_pages)
# 2) Convert every page
page_mds: list[str] = []
for idx in range(num_pages):
md = self._page_to_markdown(
doc, doc[idx], idx,
include_images, image_dpi, max_image_bytes,
)
page_mds.append(md)
# 3) Assemble
if chapters:
final_md = self._assemble_by_chapters(
chapters, page_mds, content_offset, num_pages,
)
else:
final_md = "\n\n---\n\n".join(m for m in page_mds if m.strip())
# 4) Output: text (for variable aggregation) + blob (.md file)
yield self.create_text_message(final_md)
md_bytes = final_md.encode("utf-8")
yield self.create_blob_message(
blob=md_bytes,
meta={"mime_type": "text/markdown"},
)
finally:
doc.close()
# ── chapter detection ────────────────────────────────────
def _build_chapter_map(
self, doc: fitz.Document, num_pages: int,
) -> tuple[dict, int]:
"""Return (chapters_dict, content_offset).
Try embedded PDF TOC metadata first (reliable page mapping).
Fall back to scanning printed TOC pages.
"""
toc = doc.get_toc()
if toc:
chapters = self._chapters_from_metadata(toc, num_pages)
if chapters:
return chapters, 0
toc_start, toc_end = self._find_toc_pages(doc, num_pages)
if toc_start is not None and toc_end is not None:
toc_text = "\n".join(
doc[i].get_text() or "" for i in range(toc_start, toc_end + 1)
)
chapters = self._parse_toc_lines(toc_text)
if chapters:
offset = self._guess_offset(chapters, toc_end)
return chapters, offset
return {}, 0
def _chapters_from_metadata(
self, toc: list, num_pages: int,
) -> dict[str, dict[str, int]]:
top = [(t, max(0, p - 1)) for lvl, t, p in toc if lvl <= 2 and p >= 1]
if not top:
return {}
chapters: dict[str, dict[str, int]] = OrderedDict()
for i, (title, start) in enumerate(top):
end = top[i + 1][1] - 1 if i + 1 < len(top) else num_pages - 1
chapters[title] = {"start": start, "end": max(start, end)}
return chapters
def _find_toc_pages(self, doc, num_pages):
toc_start = toc_end = None
for pn in range(min(num_pages, 30)):
text = doc[pn].get_text() or ""
if any(re.search(p, text, re.IGNORECASE) for p in self._TOC_PATTERNS):
if toc_start is None:
toc_start = pn
toc_end = pn
elif toc_start is not None:
break
return toc_start, toc_end
def _parse_toc_lines(self, text: str) -> dict[str, dict[str, int]]:
m = re.search(
r"^(List\s+of\s+Figures|List\s+of\s+Tables|图目录|表目录)",
text, re.IGNORECASE | re.MULTILINE,
)
if m:
text = text[: m.start()]
pat = re.compile(
r"^\s*(?P<title>.+?)\s*(?:\.{2,}|\s)\s*(?P<page>\d{1,5})\s*$"
)
entries: list[tuple[str, int]] = []
for raw in text.splitlines():
line = raw.strip()
if not line or len(line) < 3 or re.fullmatch(r"\d+", line):
continue
m2 = pat.match(line)
if not m2:
continue
title = re.sub(r"\s+", " ", m2.group("title")).strip("-_: ")
page = self._to_int(m2.group("page"), None)
if not title or page is None or len(title) <= 1:
continue
if title.lower() in {"page", "pages", "目录", "contents"}:
continue
entries.append((title, page))
if not entries:
return {}
dedup: OrderedDict[str, int] = OrderedDict()
for t, p in entries:
dedup.setdefault(t, p)
titles = list(dedup.keys())
pages = [dedup[t] for t in titles]
catalog: dict[str, dict[str, int]] = OrderedDict()
for i, t in enumerate(titles):
s = pages[i]
e = max(s, pages[i + 1] - 1) if i + 1 < len(pages) else s
catalog[t] = {"start": s, "end": e}
return catalog
@staticmethod
def _guess_offset(chapters: dict, toc_end: int) -> int:
first_page = None
for info in chapters.values():
s = info["start"]
if first_page is None or s < first_page:
first_page = s
if first_page is None:
return 0
return (toc_end + 1) - first_page
# ── per-page conversion ──────────────────────────────────
def _page_to_markdown(
self,
doc: fitz.Document,
page: fitz.Page,
page_idx: int,
include_images: bool,
image_dpi: int,
max_image_bytes: int,
) -> str:
parts: list[str] = []
# ── text ──
text = (page.get_text("text", sort=True) or "").strip()
if text:
parts.append(text)
# ── tables → Markdown ──
try:
for tab in (page.find_tables().tables or [])[:5]:
cells = tab.extract() or []
if len(cells) >= 2:
md = self._cells_to_md_table(cells)
if md:
parts.append(md)
except Exception:
pass
if not include_images:
return "\n\n".join(parts)
# ── embedded raster images ──
try:
for img_idx, img_info in enumerate(page.get_images(full=True)):
xref = img_info[0]
try:
data = doc.extract_image(xref)
if not data or not data.get("image"):
continue
raw = data["image"]
if len(raw) > max_image_bytes:
continue
# skip tiny icons (< 20x20)
w = data.get("width", 0)
h = data.get("height", 0)
if w < 20 and h < 20:
continue
ext = data.get("ext", "png")
mime = "image/jpeg" if ext in ("jpg", "jpeg") else f"image/{ext}"
b64 = base64.b64encode(raw).decode("ascii")
parts.append(
f"![img-p{page_idx}-{img_idx}](data:{mime};base64,{b64})"
)
except Exception:
pass
except Exception:
pass
# ── vector drawings → render as PNG ──
try:
drawings = page.get_drawings()
if len(drawings) >= 3:
valid_rects: list[fitz.Rect] = []
for d in drawings:
r = d.get("rect")
if r:
try:
rect = fitz.Rect(r)
if rect.is_valid and not rect.is_empty:
valid_rects.append(rect)
except Exception:
pass
if valid_rects:
bbox = valid_rects[0]
for r in valid_rects[1:]:
bbox |= r
bbox &= page.rect
if bbox.width > 30 and bbox.height > 30:
scale = image_dpi / 72
mat = fitz.Matrix(scale, scale)
pix = page.get_pixmap(matrix=mat, clip=bbox)
png = pix.tobytes("png")
if len(png) <= max_image_bytes:
b64 = base64.b64encode(png).decode("ascii")
parts.append(
f"![drawing-p{page_idx}](data:image/png;base64,{b64})"
)
except Exception:
pass
return "\n\n".join(parts)
# ── assembly ─────────────────────────────────────────────
def _assemble_by_chapters(
self,
chapters: dict[str, dict[str, int]],
page_mds: list[str],
offset: int,
num_pages: int,
) -> str:
parts: list[str] = []
for name, info in chapters.items():
s = info["start"] + offset
e = info["end"] + offset
s = max(0, min(s, num_pages - 1))
e = max(s, min(e, num_pages - 1))
ch: list[str] = [f"# {name}\n"]
for idx in range(s, e + 1):
if idx < len(page_mds) and page_mds[idx].strip():
ch.append(page_mds[idx])
parts.append("\n\n".join(ch))
return "\n\n---\n\n".join(parts)
# ── helpers ──────────────────────────────────────────────
@staticmethod
def _cells_to_md_table(cells: list) -> str:
if not cells:
return ""
header = cells[0]
ncols = len(header)
if ncols == 0:
return ""
clean = lambda c: str(c or "").replace("|", "\\|").replace("\n", " ").strip()
lines = [
"| " + " | ".join(clean(c) for c in header) + " |",
"| " + " | ".join("---" for _ in range(ncols)) + " |",
]
for row in cells[1:]:
padded = list(row) + [""] * max(0, ncols - len(row))
lines.append("| " + " | ".join(clean(c) for c in padded[:ncols]) + " |")
return "\n".join(lines)
@staticmethod
def _to_int(value: Any, default: int | None) -> int | None:
try:
if value is None or value == "":
return default
return int(value)
except Exception:
return default
@staticmethod
def _to_bool(value: Any, default: bool) -> bool:
if value is None:
return default
if isinstance(value, bool):
return value
s = str(value).strip().lower()
if s in {"1", "true", "yes", "on"}:
return True
if s in {"0", "false", "no", "off"}:
return False
return default