This commit is contained in:
2026-03-15 13:00:30 +08:00
parent 91ff28bdcf
commit 136ddc270c
15 changed files with 1459 additions and 1276 deletions

View File

@@ -1,6 +1,5 @@
import base64
import json
import re
from collections import OrderedDict
from collections.abc import Generator
from typing import Any
@@ -10,306 +9,219 @@ from dify_plugin.entities.tool import ToolInvokeMessage
class PdfToMarkdownTool(Tool):
"""Convert PDF to a single Markdown file. No LLM needed.
- Auto-detect TOC and organize content by chapters.
- Extract text and tables as Markdown.
- Embed raster images as base64.
- Render vector drawings as base64 PNG.
- Output one .md file via create_blob_message.
"""
# Regex fragments searched (case-insensitively) in page text by
# _find_toc_pages to recognise a printed table-of-contents page.
_TOC_PATTERNS = [
r"目录", r"目 录", r"\u3000录",
r"Table of Contents", r"Contents", r"目次",
]
# ── entry point ──────────────────────────────────────────
"""Convert PDF to Markdown using an external catalog array."""
# NOTE(review): this span appears to interleave the pre- and post-commit bodies
# of _invoke (diff rendering without +/- markers) — confirm which lines are
# live before changing any logic here.
def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
# Tool entry point: reads "file" and "catalog" from tool_parameters, converts
# the PDF pages to Markdown and yields the result as a text message and a blob.
file = tool_parameters.get("file")
catalog_text = (tool_parameters.get("catalog") or "").strip()
# Missing inputs are reported as a text message and end the generator.
if not file:
yield self.create_text_message("Error: file is required")
return
if not catalog_text:
yield self.create_text_message("Error: catalog is required")
return
include_images = self._to_bool(tool_parameters.get("include_images"), True)
image_dpi = self._to_int(tool_parameters.get("image_dpi"), 150)
# Keep the requested DPI within 72..300.
image_dpi = max(72, min(image_dpi, 300))
max_image_bytes = 2 * 1024 * 1024 # skip images > 2 MB raw
catalog = self._parse_catalog(catalog_text)
if not catalog:
yield self.create_text_message("Error: catalog must be a JSON array with title and page indexes")
return
# Open the PDF from the bytes held in file.blob.
doc = fitz.open(stream=file.blob, filetype="pdf")
try:
num_pages = len(doc)
hf_texts = self._detect_headers_footers(doc, num_pages)
page_mds = [self._page_to_markdown(doc[index], hf_texts) for index in range(num_pages)]
final_md = self._assemble_by_catalog(catalog, page_mds, num_pages)
# 1) Build chapter map (metadata TOC → printed TOC → none)
chapters, content_offset = self._build_chapter_map(doc, num_pages)
# 2) Convert every page
page_mds: list[str] = []
for idx in range(num_pages):
md = self._page_to_markdown(
doc, doc[idx], idx,
include_images, image_dpi, max_image_bytes,
)
page_mds.append(md)
# 3) Assemble
if chapters:
final_md = self._assemble_by_chapters(
chapters, page_mds, content_offset, num_pages,
)
else:
final_md = "\n\n---\n\n".join(m for m in page_mds if m.strip())
# 4) Output: text (for variable aggregation) + blob (.md file)
yield self.create_text_message(final_md)
md_bytes = final_md.encode("utf-8")
yield self.create_blob_message(
blob=md_bytes,
blob=final_md.encode("utf-8"),
meta={"mime_type": "text/markdown"},
)
finally:
# Release the document whether or not conversion succeeded.
doc.close()
# ── chapter detection ────────────────────────────────────
def _build_chapter_map(
    self, doc: fitz.Document, num_pages: int,
) -> tuple[dict, int]:
    """Work out the chapter layout of the document.

    The embedded outline (``doc.get_toc()``) is consulted first; chapters
    derived from it are returned with an offset of 0.  If that yields
    nothing, printed table-of-contents pages are located, their text is
    parsed, and an offset is guessed from the last TOC page.

    Returns:
        ``(chapters, content_offset)``; ``({}, 0)`` when no chapter
        information could be derived.
    """
    outline = doc.get_toc()
    from_outline = self._chapters_from_metadata(outline, num_pages) if outline else None
    if from_outline:
        return from_outline, 0

    first_toc, last_toc = self._find_toc_pages(doc, num_pages)
    if first_toc is None or last_toc is None:
        return {}, 0

    # Concatenate the text of every printed TOC page before parsing it.
    toc_page_texts = [
        doc[page_no].get_text() or "" for page_no in range(first_toc, last_toc + 1)
    ]
    printed = self._parse_toc_lines("\n".join(toc_page_texts))
    if not printed:
        return {}, 0
    return printed, self._guess_offset(printed, last_toc)
def _chapters_from_metadata(
    self, toc: list, num_pages: int,
) -> dict[str, dict[str, int]]:
    """Turn an embedded PDF outline into an ordered chapter -> page-range map.

    Args:
        toc: Outline entries whose first three items are
            ``level, title, page`` (page is 1-based).  Entries may carry
            extra trailing items; they are ignored.
        num_pages: Number of pages in the document; the last chapter runs
            to ``num_pages - 1``.

    Returns:
        An ordered mapping ``title -> {"start": i, "end": j}`` with 0-based,
        inclusive page indexes.  Only entries of level 1 or 2 with a page
        number >= 1 are used.  A title that occurs more than once gets a
        `` (2)``, `` (3)``, ... suffix so that no chapter range is lost.
        Returns ``{}`` when no usable entry exists.
    """
    top = [
        (entry[1], entry[2] - 1)
        for entry in toc
        if len(entry) >= 3 and entry[0] <= 2 and entry[2] >= 1
    ]
    if not top:
        return {}

    chapters: dict[str, dict[str, int]] = OrderedDict()
    last_page = num_pages - 1
    for pos, (title, start) in enumerate(top):
        # A chapter ends on the page before the next chapter starts.
        end = top[pos + 1][1] - 1 if pos + 1 < len(top) else last_page
        # Repeated titles would otherwise overwrite the earlier chapter.
        key, serial = title, 1
        while key in chapters:
            serial += 1
            key = f"{title} ({serial})"
        # Never let a range run backwards (same-page or out-of-order entries).
        chapters[key] = {"start": start, "end": max(start, end)}
    return chapters
def _find_toc_pages(self, doc, num_pages):
    """Locate the first run of consecutive pages that look like a printed TOC.

    At most the first 30 pages are scanned.  A page counts as a TOC page
    when any pattern in ``self._TOC_PATTERNS`` matches its text
    (case-insensitively).  Scanning stops at the first non-matching page
    after a match.

    Returns:
        ``(first_index, last_index)`` of the run, or ``(None, None)`` when
        no page matched.
    """
    first_hit = last_hit = None
    scan_limit = min(num_pages, 30)
    for page_no in range(scan_limit):
        page_text = doc[page_no].get_text() or ""
        looks_like_toc = False
        for pattern in self._TOC_PATTERNS:
            if re.search(pattern, page_text, re.IGNORECASE):
                looks_like_toc = True
                break
        if looks_like_toc:
            if first_hit is None:
                first_hit = page_no
            last_hit = page_no
            continue
        if first_hit is not None:
            # The run of TOC pages has ended.
            break
    return first_hit, last_hit
def _parse_toc_lines(self, text: str) -> dict[str, dict[str, int]]:
    """Parse the text of printed TOC pages into ``title -> page range``.

    Everything from a line starting with "List of Figures", "List of
    Tables", "图目录" or "表目录" onwards is discarded.  Each remaining line
    shaped like ``<title> <dots or space> <page number>`` becomes one entry;
    for repeated titles the first page number wins.  An entry ends on the
    page before the next entry starts (never before its own start); the last
    entry ends on its own start page.

    Returns:
        Ordered mapping ``title -> {"start": p, "end": q}`` using the page
        numbers as printed, or ``{}`` when no line could be parsed.
    """
    appendix_lists = re.search(
        r"^(List\s+of\s+Figures|List\s+of\s+Tables|图目录|表目录)",
        text, re.IGNORECASE | re.MULTILINE,
    )
    if appendix_lists:
        text = text[: appendix_lists.start()]

    entry_re = re.compile(
        r"^\s*(?P<title>.+?)\s*(?:\.{2,}|\s)\s*(?P<page>\d{1,5})\s*$"
    )
    ignored_titles = {"page", "pages", "目录", "contents"}

    first_page_by_title: OrderedDict[str, int] = OrderedDict()
    for raw_line in text.splitlines():
        candidate = raw_line.strip()
        # Too short to be an entry, or a bare number (e.g. a page footer).
        if len(candidate) < 3 or re.fullmatch(r"\d+", candidate):
            continue
        hit = entry_re.match(candidate)
        if hit is None:
            continue
        title = re.sub(r"\s+", " ", hit.group("title")).strip("-_: ")
        page = self._to_int(hit.group("page"), None)
        if page is None or len(title) <= 1:
            continue
        if title.lower() in ignored_titles:
            continue
        first_page_by_title.setdefault(title, page)

    if not first_page_by_title:
        return {}

    ordered = list(first_page_by_title.items())
    catalog: dict[str, dict[str, int]] = OrderedDict()
    for pos, (title, start) in enumerate(ordered):
        if pos + 1 < len(ordered):
            end = max(start, ordered[pos + 1][1] - 1)
        else:
            end = start
        catalog[title] = {"start": start, "end": end}
    return catalog
@staticmethod
def _guess_offset(chapters: dict, toc_end: int) -> int:
    """Estimate the shift between printed page numbers and page indexes.

    The smallest ``"start"`` among ``chapters`` is taken to fall on the page
    right after the last TOC page (``toc_end + 1``); the difference between
    the two is returned.  Returns 0 when ``chapters`` is empty.
    """
    starts = [info["start"] for info in chapters.values()]
    if not starts:
        return 0
    return (toc_end + 1) - min(starts)
# ── per-page conversion ──────────────────────────────────
# NOTE(review): from here on, lines of the removed
# _page_to_markdown(doc, page, page_idx, ...) body and of the added
# _parse_catalog body appear interleaved (diff view without +/- markers) —
# confirm which lines are live before editing.
def _page_to_markdown(
self,
doc: fitz.Document,
page: fitz.Page,
page_idx: int,
include_images: bool,
image_dpi: int,
max_image_bytes: int,
) -> str:
parts: list[str] = []
# ── text ──
text = (page.get_text("text", sort=True) or "").strip()
if text:
parts.append(text)
# ── tables → Markdown ──
# _parse_catalog: decode the catalog JSON text into a list of dicts with the
# keys "title", "page_start_index" and "page_end_index"; anything that is not
# a JSON array yields [].
def _parse_catalog(self, catalog_text: str) -> list[dict[str, Any]]:
try:
for tab in (page.find_tables().tables or [])[:5]:
cells = tab.extract() or []
if len(cells) >= 2:
md = self._cells_to_md_table(cells)
if md:
parts.append(md)
raw = json.loads(catalog_text)
except Exception:
pass
return []
if not include_images:
return "\n\n".join(parts)
if not isinstance(raw, list):
return []
# ── embedded raster images ──
result: list[dict[str, Any]] = []
for item in raw:
# Non-dict array members are ignored.
if not isinstance(item, dict):
continue
title = str(item.get("title") or "").strip() or "Untitled"
# Preferred keys: page_start_index / page_end_index, used as given.
start_index = self._to_int(item.get("page_start_index"), None)
end_index = self._to_int(item.get("page_end_index"), start_index)
if start_index is None:
# Fallback keys: "start" / "end", each shifted down by one
# (presumably 1-based page numbers — confirm with callers).
start = self._to_int(item.get("start"), None)
end = self._to_int(item.get("end"), start)
if start is None:
continue
start_index = max(0, start - 1)
end_index = max(start_index, (end if end is not None else start) - 1)
if end_index is None:
end_index = start_index
# Stored start is never negative; stored end is never below start_index.
result.append(
{
"title": title,
"page_start_index": max(0, start_index),
"page_end_index": max(start_index, end_index),
}
)
return result
def _detect_headers_footers(self, doc: fitz.Document, num_pages: int) -> set[str]:
    """Collect texts that repeat in the top/bottom margin across pages.

    The first ``min(num_pages, 30)`` pages are sampled.  On each page, a
    block is a candidate when its seventh item is 0, its stripped text has
    at least two characters, and it lies entirely within the top 8% or the
    bottom 8% of the page height.  Each distinct text is counted once per
    page.

    Returns:
        The texts seen on at least ``max(3, 0.35 * sampled_pages)`` pages.
    """
    margin_ratio = 0.08
    pages_to_sample = min(num_pages, 30)
    occurrences: dict[str, int] = {}

    for page_no in range(pages_to_sample):
        page = doc[page_no]
        height = page.rect.height
        header_bottom = height * margin_ratio
        footer_top = height * (1 - margin_ratio)
        try:
            raw_blocks = page.get_text("blocks", sort=True) or []
        except Exception:
            # A page whose blocks cannot be read is simply not sampled.
            continue

        margin_texts: set[str] = set()
        for entry in raw_blocks:
            if len(entry) < 7 or entry[6] != 0:
                continue
            snippet = (entry[4] or "").strip()
            if len(snippet) < 2 or snippet in margin_texts:
                continue
            # entry[1] / entry[3] are the block's top and bottom y values.
            if entry[3] <= header_bottom or entry[1] >= footer_top:
                margin_texts.add(snippet)

        for snippet in margin_texts:
            occurrences[snippet] = occurrences.get(snippet, 0) + 1

    min_pages = max(3, pages_to_sample * 0.35)
    return {snippet for snippet, seen_on in occurrences.items() if seen_on >= min_pages}
# NOTE(review): the added _page_to_markdown(page, hf_texts) body below appears
# interleaved with leftover lines of the removed image/drawing extraction code
# (diff view without +/- markers) — confirm which lines are live before editing.
def _page_to_markdown(self, page: fitz.Page, hf_texts: set[str]) -> str:
parts: list[str] = []
page_height = page.rect.height
# Margin bands: the top 6% and the bottom 6% of the page height.
top_margin = page_height * 0.06
bottom_margin = page_height * 0.94
table_rects: list[fitz.Rect] = []
table_mds: list[str] = []
try:
for img_idx, img_info in enumerate(page.get_images(full=True)):
xref = img_info[0]
# find_tables is looked up dynamically and only used when callable.
find_tables = getattr(page, "find_tables", None)
tables = []
if callable(find_tables):
table_finder = find_tables()
tables = getattr(table_finder, "tables", []) or []
# Only the first five detected tables are considered.
for table in tables[:5]:
try:
data = doc.extract_image(xref)
if not data or not data.get("image"):
continue
raw = data["image"]
if len(raw) > max_image_bytes:
continue
# skip tiny icons (< 20x20)
w = data.get("width", 0)
h = data.get("height", 0)
if w < 20 and h < 20:
continue
ext = data.get("ext", "png")
mime = "image/jpeg" if ext in ("jpg", "jpeg") else f"image/{ext}"
b64 = base64.b64encode(raw).decode("ascii")
parts.append(
f"![img-p{page_idx}-{img_idx}](data:{mime};base64,{b64})"
)
table_rects.append(fitz.Rect(table.bbox))
except Exception:
pass
cells = table.extract() or []
if len(cells) < 2:
continue
# Tables of at most three rows that contain a known header/footer text
# are dropped.
if hf_texts and len(cells) <= 3:
flat = " ".join(str(cell or "") for row in cells for cell in row)
if any(hf in flat for hf in hf_texts):
continue
md_table = self._cells_to_md_table(cells)
if md_table:
table_mds.append(md_table)
except Exception:
pass
# ── vector drawings → render as PNG ──
try:
drawings = page.get_drawings()
if len(drawings) >= 3:
valid_rects: list[fitz.Rect] = []
for d in drawings:
r = d.get("rect")
if r:
try:
rect = fitz.Rect(r)
if rect.is_valid and not rect.is_empty:
valid_rects.append(rect)
except Exception:
pass
if valid_rects:
bbox = valid_rects[0]
for r in valid_rects[1:]:
bbox |= r
bbox &= page.rect
if bbox.width > 30 and bbox.height > 30:
scale = image_dpi / 72
mat = fitz.Matrix(scale, scale)
pix = page.get_pixmap(matrix=mat, clip=bbox)
png = pix.tobytes("png")
if len(png) <= max_image_bytes:
b64 = base64.b64encode(png).decode("ascii")
parts.append(
f"![drawing-p{page_idx}](data:image/png;base64,{b64})"
)
blocks = page.get_text("blocks", sort=True) or []
except Exception:
pass
blocks = []
for block in blocks:
# Blocks with fewer than 7 items or a non-zero seventh item are skipped
# (presumably 0 marks a text block in PyMuPDF's "blocks" tuples — confirm).
if len(block) < 7 or block[6] != 0:
continue
x0, y0, x1, y1 = block[:4]
text = (block[4] or "").strip()
if not text:
continue
block_rect = fitz.Rect(x0, y0, x1, y1)
# Text overlapping a detected table region is skipped here; the table's
# Markdown rendering is appended after the text blocks.
if any(self._rects_overlap(block_rect, table_rect) for table_rect in table_rects):
continue
# Margin-band blocks containing a known header/footer text are dropped.
if hf_texts and (y1 <= top_margin or y0 >= bottom_margin):
if any(hf in text for hf in hf_texts):
continue
# Blocks consisting solely of a 1-4 digit number are dropped.
if re.fullmatch(r"\s*\d{1,4}\s*", text):
continue
parts.append(text)
parts.extend(table_mds)
return "\n\n".join(parts)
# ── assembly ─────────────────────────────────────────────
def _assemble_by_chapters(
    self,
    chapters: dict[str, dict[str, int]],
    page_mds: list[str],
    offset: int,
    num_pages: int,
) -> str:
    """Join per-page Markdown into chapters from a ``title -> range`` map.

    Each chapter's ``start``/``end`` is shifted by ``offset`` and clamped to
    ``0 .. num_pages - 1``.  Every chapter emits a ``# title`` heading
    followed by its non-blank pages; chapters are separated by ``---``.
    """
    parts: list[str] = []
    for name, info in chapters.items():
        s = info["start"] + offset
        e = info["end"] + offset
        s = max(0, min(s, num_pages - 1))
        e = max(s, min(e, num_pages - 1))
        ch: list[str] = [f"# {name}\n"]
        for idx in range(s, e + 1):
            if idx < len(page_mds) and page_mds[idx].strip():
                ch.append(page_mds[idx])
        parts.append("\n\n".join(ch))
    return "\n\n---\n\n".join(parts)

def _assemble_by_catalog(self, catalog: list[dict[str, Any]], page_mds: list[str], num_pages: int) -> str:
    """Join per-page Markdown into chapters from a parsed catalog list.

    Each catalog item supplies ``title``, ``page_start_index`` and
    ``page_end_index`` (clamped to ``0 .. num_pages - 1``).  A page is
    emitted at most once, under the first item that covers it; items that
    contribute no page are omitted.  When no item contributes anything, all
    non-blank pages are joined with ``---`` instead.
    """
    parts: list[str] = []
    used_pages: set[int] = set()

    for item in catalog:
        start = max(0, min(int(item["page_start_index"]), num_pages - 1))
        end = max(start, min(int(item["page_end_index"]), num_pages - 1))
        chapter_parts = [f"# {item['title']}\n"]
        for idx in range(start, end + 1):
            if idx < len(page_mds) and page_mds[idx].strip() and idx not in used_pages:
                chapter_parts.append(page_mds[idx])
                used_pages.add(idx)
        # A heading without any page content is not worth emitting.
        if len(chapter_parts) > 1:
            parts.append("\n\n".join(chapter_parts))

    if parts:
        return "\n\n---\n\n".join(parts)
    return "\n\n---\n\n".join(m for m in page_mds if m.strip())

# ── helpers ──────────────────────────────────────────────
@staticmethod
def _rects_overlap(block_rect: fitz.Rect, table_rect: fitz.Rect) -> bool:
    """Tell whether a text block lies substantially inside a table region.

    Returns True when the intersection of the two rectangles covers at
    least 30% of ``block_rect``'s area; False when they do not intersect or
    when ``block_rect`` has no positive area.
    """
    shared = block_rect & table_rect
    if shared.is_empty:
        return False
    block_area = block_rect.width * block_rect.height
    if block_area <= 0:
        return False
    covered_fraction = (shared.width * shared.height) / block_area
    return covered_fraction >= 0.3
@staticmethod
def _cells_to_md_table(cells: list) -> str:
    """Render extracted table rows as a pipe-delimited Markdown table.

    Args:
        cells: List of rows; the first row is used as the header and fixes
            the column count.  Shorter rows are padded with empty cells and
            longer rows are truncated.

    Returns:
        The Markdown table, or ``""`` when there are no rows or the header
        row has no columns.
    """
    if not cells:
        return ""
    header = cells[0] or []
    ncols = len(header)
    if ncols == 0:
        return ""

    def clean(value: Any) -> str:
        # Only None becomes empty; falsy values such as 0 keep their text.
        text = "" if value is None else str(value)
        # Escape pipes and flatten newlines so a cell cannot break the row.
        return text.replace("|", "\\|").replace("\n", " ").strip()

    def render(row: list) -> str:
        return "| " + " | ".join(clean(cell) for cell in row) + " |"

    lines = [
        render(header),
        "| " + " | ".join("---" for _ in range(ncols)) + " |",
    ]
    for row in cells[1:]:
        row = list(row or [])
        padded = row + [""] * max(0, ncols - len(row))
        lines.append(render(padded[:ncols]))
    return "\n".join(lines)
@staticmethod
@@ -320,16 +232,3 @@ class PdfToMarkdownTool(Tool):
return int(value)
except Exception:
return default
@staticmethod
def _to_bool(value: Any, default: bool) -> bool:
    """Interpret a loosely typed flag as a boolean.

    ``None`` yields ``default``; real booleans are returned unchanged.  Any
    other value is stringified, stripped and lower-cased: "1", "true",
    "yes", "on" give True; "0", "false", "no", "off" give False; everything
    else gives ``default``.
    """
    if value is None:
        return default
    if isinstance(value, bool):
        return value
    recognised = {
        "1": True, "true": True, "yes": True, "on": True,
        "0": False, "false": False, "no": False, "off": False,
    }
    return recognised.get(str(value).strip().lower(), default)