更新
This commit is contained in:
335
difyPlugin/pdf/tools/pdf_to_markdown.py
Normal file
335
difyPlugin/pdf/tools/pdf_to_markdown.py
Normal file
@@ -0,0 +1,335 @@
|
||||
import base64
|
||||
import re
|
||||
from collections import OrderedDict
|
||||
from collections.abc import Generator
|
||||
from typing import Any
|
||||
|
||||
import fitz
|
||||
from dify_plugin import Tool
|
||||
from dify_plugin.entities.tool import ToolInvokeMessage
|
||||
|
||||
|
||||
class PdfToMarkdownTool(Tool):
    """Convert a PDF into a single Markdown document. No LLM needed.

    - Detect a table of contents (embedded outline first, printed TOC page
      second) and group page content under chapter headings.
    - Extract page text, and tables as Markdown tables.
    - Embed raster images as base64 data URIs.
    - Render vector drawings to PNG and embed them as base64 data URIs.
    - Emit the result as a text message and as one Markdown blob via
      ``create_blob_message``.
    """

    # Headings that identify a printed table-of-contents page.
    _TOC_PATTERNS = [
        r"目录", r"目 录", r"目\u3000录",
        r"Table of Contents", r"Contents", r"目次",
    ]

    # Only the first pages of a document are scanned for a printed TOC.
    _TOC_SCAN_PAGES = 30

    # Upper bound on the number of tables converted per page.
    _MAX_TABLES_PER_PAGE = 5

    # ── entry point ──────────────────────────────────────────

    def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
        """Convert the uploaded PDF and yield the Markdown as text and blob.

        Parameters read from ``tool_parameters``:

        - ``file``: the uploaded PDF; its ``blob`` attribute is passed to
          ``fitz.open`` (required).
        - ``include_images``: truthy/falsy value, defaults to ``True``.
        - ``image_dpi``: render resolution for vector drawings, defaults to
          150 and is clamped to the range 72..300.

        Yields an error text message (and nothing else) when the file is
        missing, cannot be opened, or is password-protected.
        """
        file = tool_parameters.get("file")
        if not file:
            yield self.create_text_message("Error: file is required")
            return

        include_images = self._to_bool(tool_parameters.get("include_images"), True)
        image_dpi = self._to_int(tool_parameters.get("image_dpi"), 150)
        image_dpi = max(72, min(image_dpi, 300))
        max_image_bytes = 2 * 1024 * 1024  # skip images > 2 MB raw

        # Boundary handler: report an unreadable upload as a tool message
        # instead of letting the raw library exception escape.
        try:
            doc = fitz.open(stream=file.blob, filetype="pdf")
        except Exception as exc:
            yield self.create_text_message(f"Error: cannot open PDF: {exc}")
            return

        try:
            if doc.needs_pass:
                yield self.create_text_message("Error: PDF is password-protected")
                return

            num_pages = len(doc)

            # 1) Build chapter map (metadata TOC → printed TOC → none)
            chapters, content_offset = self._build_chapter_map(doc, num_pages)

            # 2) Convert every page
            page_mds: list[str] = [
                self._page_to_markdown(
                    doc, doc[idx], idx,
                    include_images, image_dpi, max_image_bytes,
                )
                for idx in range(num_pages)
            ]

            # 3) Assemble
            if chapters:
                final_md = self._assemble_by_chapters(
                    chapters, page_mds, content_offset, num_pages,
                )
            else:
                final_md = "\n\n---\n\n".join(m for m in page_mds if m.strip())

            # 4) Output: text (for variable aggregation) + blob (.md file)
            yield self.create_text_message(final_md)
            yield self.create_blob_message(
                blob=final_md.encode("utf-8"),
                meta={"mime_type": "text/markdown"},
            )
        finally:
            doc.close()

    # ── chapter detection ────────────────────────────────────

    def _build_chapter_map(
        self, doc: fitz.Document, num_pages: int,
    ) -> tuple[dict, int]:
        """Return ``(chapters, content_offset)``.

        ``chapters`` maps a chapter title to ``{"start", "end"}`` page
        numbers (plus ``"level"`` for outline-based chapters);
        ``content_offset`` is added to those numbers to obtain 0-based page
        indexes.

        The embedded PDF outline is tried first: its page numbers map
        directly to page indexes, so the offset is 0. Otherwise the printed
        TOC pages are parsed and the offset is guessed. ``({}, 0)`` means no
        chapter structure was found.
        """
        toc = doc.get_toc()
        if toc:
            chapters = self._chapters_from_metadata(toc, num_pages)
            if chapters:
                return chapters, 0

        toc_start, toc_end = self._find_toc_pages(doc, num_pages)
        if toc_start is None or toc_end is None:
            return {}, 0

        toc_text = "\n".join(
            doc[i].get_text() or "" for i in range(toc_start, toc_end + 1)
        )
        chapters = self._parse_toc_lines(toc_text)
        if not chapters:
            return {}, 0

        offset = self._guess_offset(chapters, toc_end)
        # A printed TOC carries no end page for its final entry, so the
        # parser leaves it one page long. Stretch it to the last page of the
        # document (as the outline path does) so trailing pages are kept.
        last = next(reversed(chapters.values()))
        last["end"] = max(last["start"], (num_pages - 1) - offset)
        return chapters, offset

    def _chapters_from_metadata(
        self, toc: list, num_pages: int,
    ) -> dict[str, dict[str, int]]:
        """Build the chapter map from the embedded outline.

        ``toc`` is the list returned by ``Document.get_toc()``; each entry
        starts with ``level, title, page`` (1-based page). Only levels 1 and
        2 with a resolvable page (``>= 1``) are used. A chapter ends on the
        page before the next entry starts; the last one ends on the final
        page. Repeated titles get a `` (n)`` suffix so that no entry
        overwrites an earlier one.
        """
        top = [
            (lvl, " ".join(str(title).split()) or "Untitled", page - 1)
            for lvl, title, page, *_ in toc
            if lvl <= 2 and page >= 1
        ]
        if not top:
            return {}

        chapters: dict[str, dict[str, int]] = OrderedDict()
        for i, (lvl, title, start) in enumerate(top):
            end = top[i + 1][2] - 1 if i + 1 < len(top) else num_pages - 1

            key, n = title, 2
            while key in chapters:
                key = f"{title} ({n})"
                n += 1

            chapters[key] = {
                "start": start,
                "end": max(start, end),
                "level": max(1, lvl),
            }
        return chapters

    def _find_toc_pages(self, doc, num_pages):
        """Locate the printed table of contents.

        Scans at most the first ``_TOC_SCAN_PAGES`` pages and returns the
        0-based ``(first, last)`` index of the first run of consecutive
        pages whose text matches one of ``_TOC_PATTERNS``
        (case-insensitive), or ``(None, None)`` when no page matches.
        """
        toc_re = re.compile("|".join(self._TOC_PATTERNS), re.IGNORECASE)
        first = last = None
        for pn in range(min(num_pages, self._TOC_SCAN_PAGES)):
            if toc_re.search(doc[pn].get_text() or ""):
                if first is None:
                    first = pn
                last = pn
            elif first is not None:
                # The run of TOC pages has ended.
                break
        return first, last

    def _parse_toc_lines(self, text: str) -> dict[str, dict[str, int]]:
        """Parse printed TOC text into ``{title: {"start", "end"}}``.

        A TOC line is a title followed by leader dots or whitespace and a
        1-5 digit page number. Text from a "List of Figures/Tables" heading
        onward is ignored. When a title occurs more than once the first page
        number wins. Page numbers are returned as printed; each entry ends
        on the page before the next entry, and the final entry is given
        ``end == start`` because the TOC does not say where it ends.
        """
        cut = re.search(
            r"^(List\s+of\s+Figures|List\s+of\s+Tables|图目录|表目录)",
            text, re.IGNORECASE | re.MULTILINE,
        )
        if cut:
            text = text[: cut.start()]

        entry_re = re.compile(
            r"^\s*(?P<title>.+?)\s*(?:[.\u00b7\u2026]{2,}|\s)\s*(?P<page>\d{1,5})\s*$"
        )
        # Characters trimmed from both ends of a title: separators, ASCII and
        # full-width colon, and leader characters left over from spaced dots.
        trim_chars = "-_:\uff1a.\u00b7\u2026 "
        noise_titles = {"page", "pages", "目录", "contents"}

        first_page_of: dict[str, int] = {}
        for raw in text.splitlines():
            line = raw.strip()
            if len(line) < 3 or re.fullmatch(r"\d+", line):
                continue
            hit = entry_re.match(line)
            if not hit:
                continue
            title = re.sub(r"\s+", " ", hit.group("title")).strip(trim_chars)
            if len(title) <= 1 or title.lower() in noise_titles:
                continue
            # The pattern guarantees 1-5 digits, so int() cannot fail here.
            first_page_of.setdefault(title, int(hit.group("page")))

        if not first_page_of:
            return {}

        titles = list(first_page_of)
        pages = [first_page_of[t] for t in titles]
        catalog: dict[str, dict[str, int]] = OrderedDict()
        for i, title in enumerate(titles):
            start = pages[i]
            end = max(start, pages[i + 1] - 1) if i + 1 < len(pages) else start
            catalog[title] = {"start": start, "end": end}
        return catalog

    @staticmethod
    def _guess_offset(chapters: dict, toc_end: int) -> int:
        """Guess the shift from printed page numbers to page indexes.

        Heuristic: the chapter with the smallest printed page number is
        taken to begin on the page right after the last TOC page
        (``toc_end`` is a 0-based index). Returns 0 for an empty map.
        """
        if not chapters:
            return 0
        first_page = min(info["start"] for info in chapters.values())
        return (toc_end + 1) - first_page

    # ── per-page conversion ──────────────────────────────────

    def _page_to_markdown(
        self,
        doc: fitz.Document,
        page: fitz.Page,
        page_idx: int,
        include_images: bool,
        image_dpi: int,
        max_image_bytes: int,
    ) -> str:
        """Convert one page to Markdown.

        The parts, joined by blank lines, are: the page text, up to
        ``_MAX_TABLES_PER_PAGE`` tables, and - only when ``include_images``
        is true - the embedded raster images and a rendering of the page's
        vector drawings. Tables and images are best-effort: a failure is
        logged and that element is left out.
        """
        parts: list[str] = []

        text = (page.get_text("text", sort=True) or "").strip()
        if text:
            parts.append(text)

        parts.extend(self._tables_to_markdown(page, page_idx))

        if include_images:
            parts.extend(
                self._raster_images_to_markdown(doc, page, page_idx, max_image_bytes)
            )
            drawing = self._drawings_to_markdown(
                page, page_idx, image_dpi, max_image_bytes,
            )
            if drawing:
                parts.append(drawing)

        return "\n\n".join(parts)

    def _tables_to_markdown(self, page: fitz.Page, page_idx: int) -> list[str]:
        """Return the page's tables (at least two rows each) as Markdown."""
        tables_md: list[str] = []
        try:
            for tab in (page.find_tables().tables or [])[: self._MAX_TABLES_PER_PAGE]:
                cells = tab.extract() or []
                if len(cells) < 2:
                    continue
                md = self._cells_to_md_table(cells)
                if md:
                    tables_md.append(md)
        except Exception:
            # Best-effort: keep whatever was converted before the failure.
            self._log_skipped("table extraction", page_idx)
        return tables_md

    def _raster_images_to_markdown(
        self,
        doc: fitz.Document,
        page: fitz.Page,
        page_idx: int,
        max_image_bytes: int,
    ) -> list[str]:
        """Return the page's embedded images as Markdown data-URI images.

        Images larger than ``max_image_bytes`` and icons smaller than 20 px
        in both dimensions are skipped.
        """
        images_md: list[str] = []
        try:
            image_infos = page.get_images(full=True)
        except Exception:
            self._log_skipped("image listing", page_idx)
            return images_md

        for img_idx, img_info in enumerate(image_infos):
            xref = img_info[0]
            try:
                data = doc.extract_image(xref)
                if not data or not data.get("image"):
                    continue
                raw = data["image"]
                if len(raw) > max_image_bytes:
                    continue
                # skip tiny icons (< 20x20)
                if data.get("width", 0) < 20 and data.get("height", 0) < 20:
                    continue
                ext = data.get("ext", "png")
                mime = "image/jpeg" if ext in ("jpg", "jpeg") else f"image/{ext}"
                b64 = base64.b64encode(raw).decode("ascii")
                images_md.append(
                    f"![image-p{page_idx + 1}-{img_idx + 1}]"
                    f"(data:{mime};base64,{b64})"
                )
            except Exception:
                # One broken image must not cost the rest of the page.
                self._log_skipped(f"image xref {xref}", page_idx)
        return images_md

    def _drawings_to_markdown(
        self,
        page: fitz.Page,
        page_idx: int,
        image_dpi: int,
        max_image_bytes: int,
    ) -> str:
        """Render the page's vector drawings as one PNG data-URI image.

        Pages with fewer than three drawing paths are ignored. The rendered
        area is the union of all valid drawing rectangles clipped to the
        page; it must exceed 30x30 points and the PNG must not exceed
        ``max_image_bytes``. Returns ``""`` when nothing is rendered.
        """
        try:
            drawings = page.get_drawings()
            if len(drawings) < 3:
                return ""

            rects: list[fitz.Rect] = []
            for drawing in drawings:
                raw_rect = drawing.get("rect")
                if not raw_rect:
                    continue
                try:
                    rect = fitz.Rect(raw_rect)
                except Exception:
                    # Malformed rectangle: ignore this path only.
                    continue
                if rect.is_valid and not rect.is_empty:
                    rects.append(rect)
            if not rects:
                return ""

            bbox = fitz.Rect(rects[0])
            for rect in rects[1:]:
                bbox |= rect
            bbox &= page.rect
            if not (bbox.width > 30 and bbox.height > 30):
                return ""

            scale = image_dpi / 72  # PDF user space is 72 units per inch
            pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale), clip=bbox)
            png = pix.tobytes("png")
            if len(png) > max_image_bytes:
                return ""
            b64 = base64.b64encode(png).decode("ascii")
            return f"![drawing-p{page_idx + 1}](data:image/png;base64,{b64})"
        except Exception:
            self._log_skipped("vector drawing rendering", page_idx)
            return ""

    # ── assembly ─────────────────────────────────────────────

    def _assemble_by_chapters(
        self,
        chapters: dict[str, dict[str, int]],
        page_mds: list[str],
        offset: int,
        num_pages: int,
    ) -> str:
        """Join page Markdown into chapters separated by ``---`` rules.

        Each chapter's ``start``/``end`` plus ``offset`` is clamped to the
        valid page range. The heading depth comes from the optional
        ``level`` entry (default 1, capped at 6). A page is emitted only
        once: when chapters share a page, it goes to the first of them.

        NOTE: pages before the first chapter are not part of any chapter
        and are therefore not included in the output.
        """
        last_idx = num_pages - 1
        emitted: set[int] = set()
        sections: list[str] = []
        for name, info in chapters.items():
            start = max(0, min(info["start"] + offset, last_idx))
            end = max(start, min(info["end"] + offset, last_idx))
            hashes = "#" * max(1, min(info.get("level", 1), 6))
            body: list[str] = [f"{hashes} {name}\n"]
            for idx in range(start, end + 1):
                if idx in emitted or idx >= len(page_mds):
                    continue
                emitted.add(idx)
                if page_mds[idx].strip():
                    body.append(page_mds[idx])
            sections.append("\n\n".join(body))
        return "\n\n---\n\n".join(sections)

    # ── helpers ──────────────────────────────────────────────

    @staticmethod
    def _log_skipped(what: str, page_idx: int) -> None:
        """Log, with the active traceback, that an optional element was skipped."""
        import logging  # function-scope: the module itself does not import logging

        logging.getLogger(__name__).warning(
            "pdf_to_markdown: skipped %s on page %d", what, page_idx + 1,
            exc_info=True,
        )

    @staticmethod
    def _cells_to_md_table(cells: list) -> str:
        """Render rows of cells as a Markdown table; the first row is the header.

        Rows are padded or truncated to the header width. ``None`` cells
        become empty, ``|`` is escaped and newlines become spaces. Returns
        ``""`` when there are no rows or the header has no columns.
        """
        if not cells:
            return ""
        header = cells[0]
        ncols = len(header)
        if ncols == 0:
            return ""

        def clean(cell: Any) -> str:
            # Only None means "no value"; other falsy values such as 0 are kept.
            value = "" if cell is None else str(cell)
            return value.replace("|", "\\|").replace("\n", " ").strip()

        def render(row: list) -> str:
            return "| " + " | ".join(clean(c) for c in row) + " |"

        lines = [render(header), render(["---"] * ncols)]
        for row in cells[1:]:
            padded = list(row) + [""] * max(0, ncols - len(row))
            lines.append(render(padded[:ncols]))
        return "\n".join(lines)

    @staticmethod
    def _to_int(value: Any, default: int | None) -> int | None:
        """Return ``int(value)``, or ``default`` for ``None``, ``""`` or bad input."""
        if value is None or value == "":
            return default
        try:
            return int(value)
        except (TypeError, ValueError, OverflowError):
            return default

    @staticmethod
    def _to_bool(value: Any, default: bool) -> bool:
        """Interpret ``value`` as a boolean.

        ``bool`` values pass through. Otherwise the stripped, lower-cased
        string form is matched against ``1/true/yes/on`` and
        ``0/false/no/off``. ``None`` and unrecognised values give ``default``.
        """
        if value is None:
            return default
        if isinstance(value, bool):
            return value
        token = str(value).strip().lower()
        if token in {"1", "true", "yes", "on"}:
            return True
        if token in {"0", "false", "no", "off"}:
            return False
        return default
|
||||
Reference in New Issue
Block a user