This commit is contained in:
2026-03-06 14:50:43 +08:00
parent 843146cdd7
commit 91ff28bdcf
18 changed files with 1316 additions and 100 deletions

View File

@@ -0,0 +1,335 @@
import base64
import re
from collections import OrderedDict
from collections.abc import Generator
from typing import Any
import fitz
from dify_plugin import Tool
from dify_plugin.entities.tool import ToolInvokeMessage
class PdfToMarkdownTool(Tool):
"""Convert PDF to a single Markdown file. No LLM needed.
- Auto-detect TOC and organize content by chapters.
- Extract text and tables as Markdown.
- Embed raster images as base64.
- Render vector drawings as base64 PNG.
- Output one .md file via create_blob_message.
"""
# Regex fragments that identify a printed table-of-contents page.
# _find_toc_pages applies each one with re.search and re.IGNORECASE, so they
# are unanchored: "Contents" also matches inside "Table of Contents" and in
# any body text that happens to contain the word.
# "\u3000" is the ideographic (full-width) space, e.g. the gap in "目　录".
_TOC_PATTERNS = [
r"目录", r"目 录", r"\u3000录",
r"Table of Contents", r"Contents", r"目次",
]
# ── entry point ──────────────────────────────────────────
def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
    """Convert the supplied PDF to Markdown and emit it as text and as a blob.

    Parameters read from ``tool_parameters``:
        file: object exposing the PDF bytes as ``.blob`` (required).
        include_images: truthy/falsy flag, defaults to ``True``.
        image_dpi: render resolution, defaults to 150, clamped to 72..300.

    Yields:
        A text message with an error when ``file`` is missing; otherwise a
        text message holding the full Markdown followed by a blob message
        with the same content encoded as UTF-8 (``text/markdown``).
    """
    pdf_file = tool_parameters.get("file")
    if not pdf_file:
        yield self.create_text_message("Error: file is required")
        return

    embed_images = self._to_bool(tool_parameters.get("include_images"), True)
    requested_dpi = self._to_int(tool_parameters.get("image_dpi"), 150)
    render_dpi = max(72, min(requested_dpi, 300))
    # Images whose encoded size exceeds 2 MB are skipped by the page converter.
    size_cap = 2 * 1024 * 1024

    pdf = fitz.open(stream=pdf_file.blob, filetype="pdf")
    try:
        total = len(pdf)

        # Chapter map comes from the embedded outline, else a printed TOC,
        # else it is empty and pages are simply concatenated.
        chapter_map, page_shift = self._build_chapter_map(pdf, total)

        rendered: list[str] = [
            self._page_to_markdown(
                pdf, pdf[i], i,
                embed_images, render_dpi, size_cap,
            )
            for i in range(total)
        ]

        if chapter_map:
            markdown = self._assemble_by_chapters(
                chapter_map, rendered, page_shift, total,
            )
        else:
            non_blank = [chunk for chunk in rendered if chunk.strip()]
            markdown = "\n\n---\n\n".join(non_blank)

        # Text output feeds variable aggregation; the blob is the .md file.
        yield self.create_text_message(markdown)
        yield self.create_blob_message(
            blob=markdown.encode("utf-8"),
            meta={"mime_type": "text/markdown"},
        )
    finally:
        pdf.close()
# ── chapter detection ────────────────────────────────────
def _build_chapter_map(
    self, doc: fitz.Document, num_pages: int,
) -> tuple[dict, int]:
    """Return ``(chapters, content_offset)`` for ``doc``.

    ``chapters`` maps a chapter title to ``{"start": int, "end": int}``.
    ``content_offset`` is the value that must be added to those numbers to
    obtain 0-based page indices.

    Strategy:
      1. Use the embedded PDF outline (``doc.get_toc()``); its ranges are
         already 0-based, so the offset is 0.
      2. Otherwise scan for printed TOC pages, parse their lines and guess
         the offset between printed page numbers and page indices.
      3. Otherwise return ``({}, 0)``.
    """
    toc = doc.get_toc()
    if toc:
        chapters = self._chapters_from_metadata(toc, num_pages)
        if chapters:
            return chapters, 0

    toc_start, toc_end = self._find_toc_pages(doc, num_pages)
    if toc_start is None or toc_end is None:
        return {}, 0

    toc_text = "\n".join(
        doc[i].get_text() or "" for i in range(toc_start, toc_end + 1)
    )
    chapters = self._parse_toc_lines(toc_text)
    if not chapters:
        return {}, 0

    offset = self._guess_offset(chapters, toc_end)

    # The TOC parser cannot know the document length, so it gives the final
    # entry a one-page range. Stretch the chapter that starts last to the
    # end of the document; otherwise every page after its first page would
    # be dropped during assembly.
    final = None
    for span in chapters.values():
        if final is None or span["start"] >= final["start"]:
            final = span
    if final is not None:
        final["end"] = max(final["end"], num_pages - 1 - offset)

    return chapters, offset
def _chapters_from_metadata(
self, toc: list, num_pages: int,
) -> dict[str, dict[str, int]]:
top = [(t, max(0, p - 1)) for lvl, t, p in toc if lvl <= 2 and p >= 1]
if not top:
return {}
chapters: dict[str, dict[str, int]] = OrderedDict()
for i, (title, start) in enumerate(top):
end = top[i + 1][1] - 1 if i + 1 < len(top) else num_pages - 1
chapters[title] = {"start": start, "end": max(start, end)}
return chapters
def _find_toc_pages(self, doc, num_pages):
toc_start = toc_end = None
for pn in range(min(num_pages, 30)):
text = doc[pn].get_text() or ""
if any(re.search(p, text, re.IGNORECASE) for p in self._TOC_PATTERNS):
if toc_start is None:
toc_start = pn
toc_end = pn
elif toc_start is not None:
break
return toc_start, toc_end
def _parse_toc_lines(self, text: str) -> dict[str, dict[str, int]]:
m = re.search(
r"^(List\s+of\s+Figures|List\s+of\s+Tables|图目录|表目录)",
text, re.IGNORECASE | re.MULTILINE,
)
if m:
text = text[: m.start()]
pat = re.compile(
r"^\s*(?P<title>.+?)\s*(?:\.{2,}|\s)\s*(?P<page>\d{1,5})\s*$"
)
entries: list[tuple[str, int]] = []
for raw in text.splitlines():
line = raw.strip()
if not line or len(line) < 3 or re.fullmatch(r"\d+", line):
continue
m2 = pat.match(line)
if not m2:
continue
title = re.sub(r"\s+", " ", m2.group("title")).strip("-_: ")
page = self._to_int(m2.group("page"), None)
if not title or page is None or len(title) <= 1:
continue
if title.lower() in {"page", "pages", "目录", "contents"}:
continue
entries.append((title, page))
if not entries:
return {}
dedup: OrderedDict[str, int] = OrderedDict()
for t, p in entries:
dedup.setdefault(t, p)
titles = list(dedup.keys())
pages = [dedup[t] for t in titles]
catalog: dict[str, dict[str, int]] = OrderedDict()
for i, t in enumerate(titles):
s = pages[i]
e = max(s, pages[i + 1] - 1) if i + 1 < len(pages) else s
catalog[t] = {"start": s, "end": e}
return catalog
@staticmethod
def _guess_offset(chapters: dict, toc_end: int) -> int:
first_page = None
for info in chapters.values():
s = info["start"]
if first_page is None or s < first_page:
first_page = s
if first_page is None:
return 0
return (toc_end + 1) - first_page
# ── per-page conversion ──────────────────────────────────
def _page_to_markdown(
    self,
    doc: fitz.Document,
    page: fitz.Page,
    page_idx: int,
    include_images: bool,
    image_dpi: int,
    max_image_bytes: int,
) -> str:
    """Render one page as Markdown.

    The result is built from, in order: the page's sorted plain text, up to
    five detected tables, embedded raster images and one rendering of the
    page's vector drawings. The last two are produced only when
    ``include_images`` is true. Sections are joined with blank lines.

    Args:
        doc: document owning ``page`` (used to extract image streams).
        page: page to convert.
        page_idx: page index, used only in image alt texts.
        include_images: whether to embed images and drawings.
        image_dpi: resolution used when rasterising vector drawings.
        max_image_bytes: images/renderings larger than this are skipped.
    """
    sections: list[str] = []

    body = (page.get_text("text", sort=True) or "").strip()
    if body:
        sections.append(body)

    sections.extend(self._tables_as_markdown(page))

    if include_images:
        sections.extend(
            self._raster_images_as_markdown(doc, page, page_idx, max_image_bytes)
        )
        sections.extend(
            self._drawings_as_markdown(page, page_idx, image_dpi, max_image_bytes)
        )

    return "\n\n".join(sections)

def _tables_as_markdown(self, page: fitz.Page) -> list[str]:
    """Return Markdown for at most the first five tables found on ``page``."""
    rendered: list[str] = []
    try:
        for table in (page.find_tables().tables or [])[:5]:
            rows = table.extract() or []
            if len(rows) < 2:
                # A table needs a header plus at least one data row.
                continue
            table_md = self._cells_to_md_table(rows)
            if table_md:
                rendered.append(table_md)
    except Exception:
        # Best effort: keep whatever was converted before the failure.
        pass
    return rendered

def _raster_images_as_markdown(
    self,
    doc: fitz.Document,
    page: fitz.Page,
    page_idx: int,
    max_image_bytes: int,
) -> list[str]:
    """Return base64 data-URI image tags for the raster images on ``page``."""
    tags: list[str] = []
    try:
        for img_idx, img_info in enumerate(page.get_images(full=True)):
            xref = img_info[0]
            try:
                extracted = doc.extract_image(xref)
                if not extracted or not extracted.get("image"):
                    continue
                payload = extracted["image"]
                if len(payload) > max_image_bytes:
                    continue
                # Skip tiny icons (both sides under 20 px).
                width = extracted.get("width", 0)
                height = extracted.get("height", 0)
                if width < 20 and height < 20:
                    continue
                ext = extracted.get("ext", "png")
                if ext in ("jpg", "jpeg"):
                    mime = "image/jpeg"
                else:
                    mime = f"image/{ext}"
                encoded = base64.b64encode(payload).decode("ascii")
                tags.append(
                    f"![img-p{page_idx}-{img_idx}](data:{mime};base64,{encoded})"
                )
            except Exception:
                # One unreadable image must not affect the others.
                pass
    except Exception:
        pass
    return tags

def _drawings_as_markdown(
    self,
    page: fitz.Page,
    page_idx: int,
    image_dpi: int,
    max_image_bytes: int,
) -> list[str]:
    """Return a one-element list with the page's vector art as a PNG tag.

    The list is empty when the page has fewer than three drawings, when
    their combined bounding box (clipped to the page) is not larger than
    30x30, when the PNG exceeds ``max_image_bytes``, or on any error.
    """
    try:
        drawings = page.get_drawings()
        if len(drawings) < 3:
            return []

        rects: list[fitz.Rect] = []
        for drawing in drawings:
            raw_rect = drawing.get("rect")
            if not raw_rect:
                continue
            try:
                candidate = fitz.Rect(raw_rect)
                if candidate.is_valid and not candidate.is_empty:
                    rects.append(candidate)
            except Exception:
                pass
        if not rects:
            return []

        # Union of all drawing rectangles, limited to the visible page.
        bbox = rects[0]
        for extra in rects[1:]:
            bbox |= extra
        bbox &= page.rect
        if not (bbox.width > 30 and bbox.height > 30):
            return []

        scale = image_dpi / 72
        pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale), clip=bbox)
        png = pix.tobytes("png")
        if len(png) > max_image_bytes:
            return []
        encoded = base64.b64encode(png).decode("ascii")
        return [f"![drawing-p{page_idx}](data:image/png;base64,{encoded})"]
    except Exception:
        return []
# ── assembly ─────────────────────────────────────────────
def _assemble_by_chapters(
self,
chapters: dict[str, dict[str, int]],
page_mds: list[str],
offset: int,
num_pages: int,
) -> str:
parts: list[str] = []
for name, info in chapters.items():
s = info["start"] + offset
e = info["end"] + offset
s = max(0, min(s, num_pages - 1))
e = max(s, min(e, num_pages - 1))
ch: list[str] = [f"# {name}\n"]
for idx in range(s, e + 1):
if idx < len(page_mds) and page_mds[idx].strip():
ch.append(page_mds[idx])
parts.append("\n\n".join(ch))
return "\n\n---\n\n".join(parts)
# ── helpers ──────────────────────────────────────────────
@staticmethod
def _cells_to_md_table(cells: list) -> str:
if not cells:
return ""
header = cells[0]
ncols = len(header)
if ncols == 0:
return ""
clean = lambda c: str(c or "").replace("|", "\\|").replace("\n", " ").strip()
lines = [
"| " + " | ".join(clean(c) for c in header) + " |",
"| " + " | ".join("---" for _ in range(ncols)) + " |",
]
for row in cells[1:]:
padded = list(row) + [""] * max(0, ncols - len(row))
lines.append("| " + " | ".join(clean(c) for c in padded[:ncols]) + " |")
return "\n".join(lines)
@staticmethod
def _to_int(value: Any, default: int | None) -> int | None:
try:
if value is None or value == "":
return default
return int(value)
except Exception:
return default
@staticmethod
def _to_bool(value: Any, default: bool) -> bool:
if value is None:
return default
if isinstance(value, bool):
return value
s = str(value).strip().lower()
if s in {"1", "true", "yes", "on"}:
return True
if s in {"0", "false", "no", "off"}:
return False
return default