336 lines
12 KiB
Python
336 lines
12 KiB
Python
|
|
import base64
import logging
import re
from collections import OrderedDict
from collections.abc import Generator
from typing import Any

import fitz
from dify_plugin import Tool
from dify_plugin.entities.tool import ToolInvokeMessage
|
|||
|
|
|
|||
|
|
|
|||
|
|
class PdfToMarkdownTool(Tool):
    """Convert a PDF into a single Markdown document. No LLM needed.

    - Auto-detect a table of contents and organize content by chapters.
    - Extract text and tables as Markdown.
    - Embed raster images as base64 data URIs.
    - Render vector drawings as a base64 PNG data URI.
    - Output the Markdown both as a text message and as a blob message.
    """

    # Regex fragments that identify a printed table-of-contents page.
    # They are searched case-insensitively anywhere in the page text.
    _TOC_PATTERNS = [
        r"目录", r"目 录", r"目\u3000录",
        r"Table of Contents", r"Contents", r"目次",
    ]

    # Start of a "list of figures/tables" section; TOC text after it is ignored.
    _TOC_TAIL_RE = re.compile(
        r"^(List\s+of\s+Figures|List\s+of\s+Tables|图目录|表目录)",
        re.IGNORECASE | re.MULTILINE,
    )
    # One TOC entry: a title, then dot leaders or whitespace, then a page number.
    _TOC_LINE_RE = re.compile(
        r"^\s*(?P<title>.+?)\s*(?:\.{2,}|\s)\s*(?P<page>\d{1,5})\s*$"
    )
    # Parsed "titles" that are really column headers or the TOC heading itself.
    _TOC_SKIP_TITLES = frozenset({"page", "pages", "目录", "contents"})

    # Only the first pages of a document are scanned for a printed TOC.
    _MAX_TOC_SCAN_PAGES = 30
    # At most this many tables are converted per page.
    _MAX_TABLES_PER_PAGE = 5

    _logger = logging.getLogger(__name__)

    # ── entry point ──────────────────────────────────────────

    def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
        """Convert the uploaded PDF and yield the resulting Markdown.

        Reads ``file`` (required), ``include_images`` (default ``True``) and
        ``image_dpi`` (default 150, clamped to 72..300) from
        ``tool_parameters``.

        Yields:
            An error text message when ``file`` is missing; otherwise a text
            message with the Markdown followed by a blob message carrying the
            same Markdown encoded as UTF-8.
        """
        file = tool_parameters.get("file")
        if not file:
            yield self.create_text_message("Error: file is required")
            return

        include_images = self._to_bool(tool_parameters.get("include_images"), True)
        requested_dpi = self._to_int(tool_parameters.get("image_dpi"), 150)
        image_dpi = max(72, min(requested_dpi, 300))
        max_image_bytes = 2 * 1024 * 1024  # skip images > 2 MB raw

        doc = fitz.open(stream=file.blob, filetype="pdf")
        try:
            num_pages = len(doc)

            # 1) Build chapter map (metadata TOC → printed TOC → none)
            chapters, content_offset = self._build_chapter_map(doc, num_pages)

            # 2) Convert every page
            page_mds: list[str] = [
                self._page_to_markdown(
                    doc, doc[idx], idx,
                    include_images, image_dpi, max_image_bytes,
                )
                for idx in range(num_pages)
            ]

            # 3) Assemble
            if chapters:
                final_md = self._assemble_by_chapters(
                    chapters, page_mds, content_offset, num_pages,
                )
            else:
                final_md = "\n\n---\n\n".join(m for m in page_mds if m.strip())

            # 4) Output: text (for variable aggregation) + blob (.md file)
            yield self.create_text_message(final_md)
            yield self.create_blob_message(
                blob=final_md.encode("utf-8"),
                meta={"mime_type": "text/markdown"},
            )
        finally:
            doc.close()

    # ── chapter detection ────────────────────────────────────

    def _build_chapter_map(
        self, doc: fitz.Document, num_pages: int,
    ) -> tuple[dict, int]:
        """Return ``(chapters, content_offset)``.

        ``chapters`` maps a title to ``{"start": int, "end": int}``. The
        embedded PDF outline is tried first; its page numbers are already
        zero-based page indices, so the offset is 0. Otherwise the first
        pages are scanned for a printed TOC, whose page numbers must be
        shifted by ``content_offset`` to become page indices. Returns
        ``({}, 0)`` when neither source yields chapters.
        """
        toc = doc.get_toc()
        if toc:
            chapters = self._chapters_from_metadata(toc, num_pages)
            if chapters:
                return chapters, 0

        toc_start, toc_end = self._find_toc_pages(doc, num_pages)
        if toc_start is None or toc_end is None:
            return {}, 0

        toc_text = "\n".join(
            doc[i].get_text() or "" for i in range(toc_start, toc_end + 1)
        )
        chapters = self._parse_toc_lines(toc_text)
        if not chapters:
            return {}, 0

        offset = self._guess_offset(chapters, toc_end)
        # The TOC parser cannot know where the final chapter ends and gives it
        # a single page. Extend it to the last page of the document, as the
        # metadata path does, so trailing pages are not dropped.
        last = chapters[next(reversed(chapters))]
        last["end"] = max(last["end"], num_pages - 1 - offset)
        return chapters, offset

    def _chapters_from_metadata(
        self, toc: list, num_pages: int,
    ) -> dict[str, dict[str, int]]:
        """Build the chapter map from outline entries ``[level, title, page, ...]``.

        Only entries of level 1 or 2 with a page number >= 1 are used. Each
        chapter ends on the page before the next entry starts (never before
        its own start); the last one ends on the final page. Repeated titles
        get a `` (2)``, `` (3)``, ... suffix so that no entry overwrites
        another.
        """
        top = [
            (title, max(0, page - 1))
            for lvl, title, page, *_ in toc
            if lvl <= 2 and page >= 1
        ]
        if not top:
            return {}

        chapters: dict[str, dict[str, int]] = OrderedDict()
        for i, (title, start) in enumerate(top):
            end = top[i + 1][1] - 1 if i + 1 < len(top) else num_pages - 1
            key = title
            suffix = 2
            while key in chapters:
                key = f"{title} ({suffix})"
                suffix += 1
            chapters[key] = {"start": start, "end": max(start, end)}
        return chapters

    def _find_toc_pages(
        self, doc: fitz.Document, num_pages: int,
    ) -> tuple[int | None, int | None]:
        """Locate the first run of consecutive pages that look like a printed TOC.

        Scans at most ``_MAX_TOC_SCAN_PAGES`` pages and returns
        ``(first_index, last_index)``, or ``(None, None)`` when no page
        matches any of ``_TOC_PATTERNS``.
        """
        toc_start = toc_end = None
        for pn in range(min(num_pages, self._MAX_TOC_SCAN_PAGES)):
            text = doc[pn].get_text() or ""
            looks_like_toc = any(
                re.search(p, text, re.IGNORECASE) for p in self._TOC_PATTERNS
            )
            if looks_like_toc:
                if toc_start is None:
                    toc_start = pn
                toc_end = pn
            elif toc_start is not None:
                # The run of TOC pages has ended.
                break
        return toc_start, toc_end

    def _parse_toc_lines(self, text: str) -> dict[str, dict[str, int]]:
        """Parse printed TOC text into ``{title: {"start": page, "end": page}}``.

        Page numbers are kept exactly as printed. Text from a "List of
        Figures/Tables" heading onwards is ignored. When a title repeats,
        only its first occurrence is kept. Each entry ends on the page before
        the next entry's page (never before its own start); the final entry
        is given a single page.
        """
        tail = self._TOC_TAIL_RE.search(text)
        if tail:
            text = text[: tail.start()]

        first_page_of: OrderedDict[str, int] = OrderedDict()
        for raw in text.splitlines():
            line = raw.strip()
            # Skip blanks, very short fragments and bare page numbers.
            if len(line) < 3 or line.isdecimal():
                continue
            match = self._TOC_LINE_RE.match(line)
            if not match:
                continue
            title = re.sub(r"\s+", " ", match.group("title")).strip("-_:: ")
            page = self._to_int(match.group("page"), None)
            if page is None or len(title) <= 1:
                continue
            if title.lower() in self._TOC_SKIP_TITLES:
                continue
            first_page_of.setdefault(title, page)

        if not first_page_of:
            return {}

        titles = list(first_page_of)
        pages = list(first_page_of.values())
        catalog: dict[str, dict[str, int]] = OrderedDict()
        for i, title in enumerate(titles):
            start = pages[i]
            end = max(start, pages[i + 1] - 1) if i + 1 < len(pages) else start
            catalog[title] = {"start": start, "end": end}
        return catalog

    @staticmethod
    def _guess_offset(chapters: dict, toc_end: int) -> int:
        """Return the shift from printed page numbers to page indices.

        The offset maps the smallest chapter start page onto the page that
        directly follows the last TOC page. Returns 0 for an empty map.
        """
        starts = [info["start"] for info in chapters.values()]
        if not starts:
            return 0
        return (toc_end + 1) - min(starts)

    # ── per-page conversion ──────────────────────────────────

    def _page_to_markdown(
        self,
        doc: fitz.Document,
        page: fitz.Page,
        page_idx: int,
        include_images: bool,
        image_dpi: int,
        max_image_bytes: int,
    ) -> str:
        """Render one page as Markdown.

        Parts are emitted in this order: page text, tables, and — only when
        ``include_images`` is true — embedded raster images followed by one
        rendering of the page's vector drawings. Table and image extraction
        are best-effort: a failure is logged at debug level and that part is
        skipped.
        """
        parts: list[str] = []

        # ── text ──
        text = (page.get_text("text", sort=True) or "").strip()
        if text:
            parts.append(text)

        # ── tables → Markdown ──
        parts.extend(self._tables_to_md(page, page_idx))

        if not include_images:
            return "\n\n".join(parts)

        # ── embedded raster images ──
        parts.extend(self._raster_images_to_md(doc, page, page_idx, max_image_bytes))

        # ── vector drawings → render as PNG ──
        drawing = self._drawings_to_md(page, page_idx, image_dpi, max_image_bytes)
        if drawing:
            parts.append(drawing)

        return "\n\n".join(parts)

    def _tables_to_md(self, page: fitz.Page, page_idx: int) -> list[str]:
        """Return Markdown for up to ``_MAX_TABLES_PER_PAGE`` tables on the page."""
        out: list[str] = []
        try:
            tables = (page.find_tables().tables or [])[: self._MAX_TABLES_PER_PAGE]
            for tab in tables:
                cells = tab.extract() or []
                if len(cells) < 2:
                    # A header without any data row is not worth a table.
                    continue
                md = self._cells_to_md_table(cells)
                if md:
                    out.append(md)
        except Exception:
            self._logger.debug(
                "table extraction failed on page %s", page_idx + 1, exc_info=True,
            )
        return out

    def _raster_images_to_md(
        self,
        doc: fitz.Document,
        page: fitz.Page,
        page_idx: int,
        max_image_bytes: int,
    ) -> list[str]:
        """Return one Markdown data-URI image per usable raster image on the page."""
        out: list[str] = []
        try:
            images = page.get_images(full=True)
        except Exception:
            self._logger.debug(
                "image listing failed on page %s", page_idx + 1, exc_info=True,
            )
            return out

        for img_idx, img_info in enumerate(images):
            xref = img_info[0]
            try:
                data = doc.extract_image(xref)
                if not data or not data.get("image"):
                    continue
                raw = data["image"]
                if len(raw) > max_image_bytes:
                    continue
                # skip tiny icons (< 20x20)
                if data.get("width", 0) < 20 and data.get("height", 0) < 20:
                    continue
                ext = data.get("ext", "png")
                mime = "image/jpeg" if ext in ("jpg", "jpeg") else f"image/{ext}"
                b64 = base64.b64encode(raw).decode("ascii")
                out.append(
                    f"![image-p{page_idx + 1}-{img_idx + 1}](data:{mime};base64,{b64})"
                )
            except Exception:
                self._logger.debug(
                    "image xref %s skipped on page %s", xref, page_idx + 1,
                    exc_info=True,
                )
        return out

    def _drawings_to_md(
        self,
        page: fitz.Page,
        page_idx: int,
        image_dpi: int,
        max_image_bytes: int,
    ) -> str:
        """Render the area covered by the page's vector drawings as one PNG image.

        Returns an empty string when the page has fewer than three drawings,
        the combined area is not larger than 30x30, the PNG exceeds
        ``max_image_bytes``, or rendering fails.
        """
        try:
            drawings = page.get_drawings()
            if len(drawings) < 3:
                return ""

            valid_rects: list[fitz.Rect] = []
            for drawing in drawings:
                r = drawing.get("rect")
                if not r:
                    continue
                try:
                    rect = fitz.Rect(r)
                except Exception:
                    continue
                if rect.is_valid and not rect.is_empty:
                    valid_rects.append(rect)
            if not valid_rects:
                return ""

            # Union of all drawing rectangles, clipped to the page.
            bbox = fitz.Rect(valid_rects[0])
            for rect in valid_rects[1:]:
                bbox = bbox | rect
            bbox = bbox & page.rect
            if not (bbox.width > 30 and bbox.height > 30):
                return ""

            scale = image_dpi / 72  # matrix scale factor for the requested DPI
            pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale), clip=bbox)
            png = pix.tobytes("png")
            if len(png) > max_image_bytes:
                return ""
            b64 = base64.b64encode(png).decode("ascii")
            return f"![drawing-p{page_idx + 1}](data:image/png;base64,{b64})"
        except Exception:
            self._logger.debug(
                "drawing render failed on page %s", page_idx + 1, exc_info=True,
            )
            return ""

    # ── assembly ─────────────────────────────────────────────

    def _assemble_by_chapters(
        self,
        chapters: dict[str, dict[str, int]],
        page_mds: list[str],
        offset: int,
        num_pages: int,
    ) -> str:
        """Join page Markdown into chapters separated by horizontal rules.

        Each chapter's ``start``/``end`` is shifted by ``offset`` and clamped
        to valid page indices. A chapter is emitted as a ``# title`` heading
        followed by its non-blank pages.
        """
        sections: list[str] = []
        last_index = num_pages - 1
        for name, info in chapters.items():
            start = max(0, min(info["start"] + offset, last_index))
            end = max(start, min(info["end"] + offset, last_index))
            body = [f"# {name}\n"]
            body.extend(
                page_mds[idx]
                for idx in range(start, end + 1)
                if idx < len(page_mds) and page_mds[idx].strip()
            )
            sections.append("\n\n".join(body))
        return "\n\n---\n\n".join(sections)

    # ── helpers ──────────────────────────────────────────────

    @staticmethod
    def _cells_to_md_table(cells: list) -> str:
        """Format rows of cells as a Markdown pipe table.

        The first row is the header and fixes the column count; shorter rows
        are padded with empty cells and longer rows are truncated. Returns an
        empty string when there are no rows or the header has no columns.
        """
        if not cells:
            return ""
        header = cells[0]
        ncols = len(header)
        if ncols == 0:
            return ""

        def clean(cell: Any) -> str:
            # Escape pipes and flatten newlines so a cell stays on one row.
            return str(cell or "").replace("|", "\\|").replace("\n", " ").strip()

        def render(row: list) -> str:
            return "| " + " | ".join(clean(c) for c in row) + " |"

        lines = [render(header), render(["---"] * ncols)]
        for row in cells[1:]:
            padded = list(row or []) + [""] * max(0, ncols - len(row or []))
            lines.append(render(padded[:ncols]))
        return "\n".join(lines)

    @staticmethod
    def _to_int(value: Any, default: int | None) -> int | None:
        """Return ``int(value)``, or ``default`` when value is ``None``, ``""`` or not convertible."""
        if value is None or value == "":
            return default
        try:
            return int(value)
        except (TypeError, ValueError, OverflowError):
            return default

    @staticmethod
    def _to_bool(value: Any, default: bool) -> bool:
        """Interpret ``value`` as a boolean.

        Booleans pass through. Otherwise the stripped, lower-cased string
        form is matched against 1/true/yes/on and 0/false/no/off. ``None``
        and unrecognized values give ``default``.
        """
        if value is None:
            return default
        if isinstance(value, bool):
            return value
        token = str(value).strip().lower()
        if token in {"1", "true", "yes", "on"}:
            return True
        if token in {"0", "false", "no", "off"}:
            return False
        return default