import base64
import logging
import re
from collections import OrderedDict
from collections.abc import Generator
from typing import Any

import fitz
from dify_plugin import Tool
from dify_plugin.entities.tool import ToolInvokeMessage

logger = logging.getLogger(__name__)


class PdfToMarkdownTool(Tool):
    """Convert PDF to a single Markdown file. No LLM needed.

    - Auto-detect TOC and organize content by chapters.
    - Extract text and tables as Markdown.
    - Embed raster images as base64.
    - Render vector drawings as base64 PNG.
    - Output one .md file via create_blob_message.
    """

    # Headings that mark a printed table-of-contents page (matched
    # case-insensitively against the page text).
    _TOC_PATTERNS = [
        r"目录",
        r"目 录",
        r"目\u3000录",
        r"Table of Contents",
        r"Contents",
        r"目次",
    ]

    # Only the first pages of the document are scanned for a printed TOC.
    _TOC_SCAN_PAGES = 30

    # Upper bound on the number of tables converted per page.
    _MAX_TABLES_PER_PAGE = 5

    # A heading that starts a figure/table list; everything from there on is
    # not part of the chapter TOC and is cut off before parsing.
    _TOC_TAIL_RE = re.compile(
        r"^(List\s+of\s+Figures|List\s+of\s+Tables|图目录|表目录)",
        re.IGNORECASE | re.MULTILINE,
    )

    # One TOC entry: "<title> <dot leader or whitespace> <page number>".
    _TOC_ENTRY_RE = re.compile(
        r"^\s*(?P<title>.+?)\s*(?:\.{2,}|\s)\s*(?P<page>\d{1,5})\s*$"
    )

    # ── entry point ──────────────────────────────────────────

    def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
        """Convert the uploaded PDF and yield the Markdown as text and as a blob.

        Reads ``file`` (required), ``include_images`` (default ``True``) and
        ``image_dpi`` (default 150, clamped to 72..300) from
        ``tool_parameters``. Yields a single error text message and stops when
        ``file`` is missing.
        """
        file = tool_parameters.get("file")
        if not file:
            yield self.create_text_message("Error: file is required")
            return

        include_images = self._to_bool(tool_parameters.get("include_images"), True)
        image_dpi = self._to_int(tool_parameters.get("image_dpi"), 150)
        image_dpi = max(72, min(image_dpi, 300))
        max_image_bytes = 2 * 1024 * 1024  # skip images > 2 MB raw

        doc = fitz.open(stream=file.blob, filetype="pdf")
        try:
            num_pages = len(doc)

            # 1) Build chapter map (metadata TOC → printed TOC → none)
            chapters, content_offset = self._build_chapter_map(doc, num_pages)

            # 2) Convert every page
            page_mds: list[str] = [
                self._page_to_markdown(
                    doc, doc[idx], idx,
                    include_images, image_dpi, max_image_bytes,
                )
                for idx in range(num_pages)
            ]

            # 3) Assemble
            if chapters:
                final_md = self._assemble_by_chapters(
                    chapters, page_mds, content_offset, num_pages,
                )
            else:
                final_md = "\n\n---\n\n".join(m for m in page_mds if m.strip())

            # 4) Output: text (for variable aggregation) + blob (.md file)
            yield self.create_text_message(final_md)
            md_bytes = final_md.encode("utf-8")
            yield self.create_blob_message(
                blob=md_bytes,
                meta={"mime_type": "text/markdown"},
            )
        finally:
            doc.close()

    # ── chapter detection ────────────────────────────────────

    def _build_chapter_map(
        self,
        doc: fitz.Document,
        num_pages: int,
    ) -> tuple[dict, int]:
        """Return (chapters_dict, content_offset).

        Try embedded PDF TOC metadata first (reliable page mapping).
        Fall back to scanning printed TOC pages. Returns ``({}, 0)`` when
        neither source yields any chapter.
        """
        toc = doc.get_toc()
        if toc:
            chapters = self._chapters_from_metadata(toc, num_pages)
            if chapters:
                # Metadata entries are already converted to 0-based indices.
                return chapters, 0

        toc_start, toc_end = self._find_toc_pages(doc, num_pages)
        if toc_start is not None and toc_end is not None:
            toc_text = "\n".join(
                doc[i].get_text() or "" for i in range(toc_start, toc_end + 1)
            )
            chapters = self._parse_toc_lines(toc_text)
            if chapters:
                offset = self._guess_offset(chapters, toc_end)
                return chapters, offset

        return {}, 0

    def _chapters_from_metadata(
        self,
        toc: list,
        num_pages: int,
    ) -> dict[str, dict[str, int]]:
        """Build ``{title: {"start", "end"}}`` from ``doc.get_toc()`` entries.

        Only entries of level 1 and 2 with a page number >= 1 are used. Pages
        are converted to 0-based indices; each chapter ends on the page before
        the next entry starts (never before its own start), and the last one
        ends on the last page of the document.
        """
        top = [(t, max(0, p - 1)) for lvl, t, p in toc if lvl <= 2 and p >= 1]
        if not top:
            return {}

        chapters: dict[str, dict[str, int]] = OrderedDict()
        for i, (title, start) in enumerate(top):
            end = top[i + 1][1] - 1 if i + 1 < len(top) else num_pages - 1
            chapters[title] = {"start": start, "end": max(start, end)}
        return chapters

    def _find_toc_pages(
        self,
        doc: fitz.Document,
        num_pages: int,
    ) -> tuple[int | None, int | None]:
        """Locate the first run of consecutive pages that look like a TOC.

        Scans at most the first ``_TOC_SCAN_PAGES`` pages and returns
        ``(first_index, last_index)`` of that run, or ``(None, None)`` when no
        page matches any of ``_TOC_PATTERNS``.
        """
        toc_start = toc_end = None
        for pn in range(min(num_pages, self._TOC_SCAN_PAGES)):
            text = doc[pn].get_text() or ""
            if any(re.search(p, text, re.IGNORECASE) for p in self._TOC_PATTERNS):
                if toc_start is None:
                    toc_start = pn
                toc_end = pn
            elif toc_start is not None:
                # The run of TOC pages has ended.
                break
        return toc_start, toc_end

    def _parse_toc_lines(self, text: str) -> dict[str, dict[str, int]]:
        """Parse printed TOC text into ``{title: {"start", "end"}}``.

        Page numbers are the printed ones, not page indices. Lines that do not
        look like ``<title> ... <page>`` are ignored, and only the first
        occurrence of a repeated title is kept. Each entry ends on the page
        before the next entry starts; the last entry covers only its start
        page. Returns an empty dict when nothing could be parsed.
        """
        tail = self._TOC_TAIL_RE.search(text)
        if tail:
            text = text[: tail.start()]

        entries: list[tuple[str, int]] = []
        for raw in text.splitlines():
            line = raw.strip()
            if not line or len(line) < 3 or re.fullmatch(r"\d+", line):
                continue
            match = self._TOC_ENTRY_RE.match(line)
            if not match:
                continue
            # Collapse inner whitespace, then trim separator characters
            # (including both the ASCII and the full-width colon).
            title = re.sub(r"\s+", " ", match.group("title")).strip("-_:\uff1a ")
            page = self._to_int(match.group("page"), None)
            if not title or page is None or len(title) <= 1:
                continue
            if title.lower() in {"page", "pages", "目录", "contents"}:
                continue
            entries.append((title, page))

        if not entries:
            return {}

        # Keep the first page seen for every title, in order of appearance.
        dedup: OrderedDict[str, int] = OrderedDict()
        for title, page in entries:
            dedup.setdefault(title, page)

        titles = list(dedup)
        pages = [dedup[t] for t in titles]
        catalog: dict[str, dict[str, int]] = OrderedDict()
        for i, title in enumerate(titles):
            start = pages[i]
            end = max(start, pages[i + 1] - 1) if i + 1 < len(pages) else start
            catalog[title] = {"start": start, "end": end}
        return catalog

    @staticmethod
    def _guess_offset(chapters: dict, toc_end: int) -> int:
        """Return the value to add to a printed page number to get a page index.

        Assumes the smallest chapter start page is located on the page right
        after the last TOC page (index ``toc_end + 1``). Returns 0 for an
        empty chapter map.
        """
        if not chapters:
            return 0
        first_page = min(info["start"] for info in chapters.values())
        return (toc_end + 1) - first_page

    # ── per-page conversion ──────────────────────────────────

    def _page_to_markdown(
        self,
        doc: fitz.Document,
        page: fitz.Page,
        page_idx: int,
        include_images: bool,
        image_dpi: int,
        max_image_bytes: int,
    ) -> str:
        """Convert one page to Markdown.

        The result is the page text, followed by its tables and, when
        ``include_images`` is true, its embedded raster images and one
        rendering of its vector drawings, joined by blank lines.
        """
        parts: list[str] = []

        # ── text ──
        text = (page.get_text("text", sort=True) or "").strip()
        if text:
            parts.append(text)

        # ── tables → Markdown ──
        parts.extend(self._tables_to_md(page, page_idx))

        if include_images:
            # ── embedded raster images ──
            parts.extend(
                self._raster_images_to_md(doc, page, page_idx, max_image_bytes)
            )
            # ── vector drawings → render as PNG ──
            drawing_md = self._drawings_to_md(
                page, page_idx, image_dpi, max_image_bytes,
            )
            if drawing_md:
                parts.append(drawing_md)

        return "\n\n".join(parts)

    def _tables_to_md(self, page: fitz.Page, page_idx: int) -> list[str]:
        """Return Markdown for up to ``_MAX_TABLES_PER_PAGE`` tables on the page.

        Best effort: if table detection or extraction fails, the tables
        converted so far are returned and the failure is logged.
        """
        tables_md: list[str] = []
        try:
            tables = (page.find_tables().tables or [])[: self._MAX_TABLES_PER_PAGE]
            for tab in tables:
                cells = tab.extract() or []
                # A table needs a header row plus at least one data row.
                if len(cells) >= 2:
                    md = self._cells_to_md_table(cells)
                    if md:
                        tables_md.append(md)
        except Exception:
            logger.debug(
                "Table extraction failed on page %d", page_idx, exc_info=True
            )
        return tables_md

    def _raster_images_to_md(
        self,
        doc: fitz.Document,
        page: fitz.Page,
        page_idx: int,
        max_image_bytes: int,
    ) -> list[str]:
        """Return base64 data-URI image tags for the page's embedded images.

        Images larger than ``max_image_bytes`` and images smaller than 20 px
        in both dimensions are skipped. Best effort: an image that cannot be
        extracted is skipped and the failure is logged.
        """
        images_md: list[str] = []
        try:
            image_infos = page.get_images(full=True)
        except Exception:
            logger.debug(
                "Listing images failed on page %d", page_idx, exc_info=True
            )
            return images_md

        for img_idx, img_info in enumerate(image_infos):
            try:
                xref = img_info[0]
                data = doc.extract_image(xref)
                if not data or not data.get("image"):
                    continue
                raw = data["image"]
                if len(raw) > max_image_bytes:
                    continue
                # skip tiny icons (< 20x20)
                w = data.get("width", 0)
                h = data.get("height", 0)
                if w < 20 and h < 20:
                    continue
                ext = data.get("ext", "png")
                mime = "image/jpeg" if ext in ("jpg", "jpeg") else f"image/{ext}"
                b64 = base64.b64encode(raw).decode("ascii")
                images_md.append(
                    f"![img-p{page_idx}-{img_idx}](data:{mime};base64,{b64})"
                )
            except Exception:
                logger.debug(
                    "Extracting image %d failed on page %d",
                    img_idx, page_idx, exc_info=True,
                )
        return images_md

    def _drawings_to_md(
        self,
        page: fitz.Page,
        page_idx: int,
        image_dpi: int,
        max_image_bytes: int,
    ) -> str:
        """Render the page's vector drawings as one PNG data-URI image tag.

        The union of all valid drawing rectangles, clipped to the page, is
        rendered at ``image_dpi``. Returns ``""`` when the page has fewer than
        three drawings, the area is not larger than 30x30, the PNG exceeds
        ``max_image_bytes``, or rendering fails (the failure is logged).
        """
        try:
            drawings = page.get_drawings()
            if len(drawings) < 3:
                return ""

            valid_rects: list[fitz.Rect] = []
            for drawing in drawings:
                r = drawing.get("rect")
                if not r:
                    continue
                try:
                    rect = fitz.Rect(r)
                except Exception:
                    # Malformed rectangle: ignore this drawing only.
                    continue
                if rect.is_valid and not rect.is_empty:
                    valid_rects.append(rect)

            if not valid_rects:
                return ""

            bbox = valid_rects[0]
            for rect in valid_rects[1:]:
                bbox |= rect
            bbox &= page.rect
            if bbox.width > 30 and bbox.height > 30:
                scale = image_dpi / 72  # PDF user space is 72 units per inch
                pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale), clip=bbox)
                png = pix.tobytes("png")
                if len(png) <= max_image_bytes:
                    b64 = base64.b64encode(png).decode("ascii")
                    return f"![drawing-p{page_idx}](data:image/png;base64,{b64})"
        except Exception:
            logger.debug(
                "Rendering drawings failed on page %d", page_idx, exc_info=True
            )
        return ""

    # ── assembly ─────────────────────────────────────────────

    def _assemble_by_chapters(
        self,
        chapters: dict[str, dict[str, int]],
        page_mds: list[str],
        offset: int,
        num_pages: int,
    ) -> str:
        """Join the page Markdown into one document grouped by chapter.

        Each chapter becomes a ``# <title>`` heading followed by the non-empty
        pages in its ``start..end`` range, shifted by ``offset`` and clamped
        to the document. Chapters are separated by horizontal rules.
        """
        last_idx = num_pages - 1
        parts: list[str] = []
        for name, info in chapters.items():
            start = max(0, min(info["start"] + offset, last_idx))
            end = max(start, min(info["end"] + offset, last_idx))
            chapter: list[str] = [f"# {name}\n"]
            chapter.extend(
                page_mds[idx]
                for idx in range(start, end + 1)
                if idx < len(page_mds) and page_mds[idx].strip()
            )
            parts.append("\n\n".join(chapter))
        return "\n\n---\n\n".join(parts)

    # ── helpers ──────────────────────────────────────────────

    @staticmethod
    def _cells_to_md_table(cells: list) -> str:
        """Render a list of rows as a Markdown table; the first row is the header.

        Empty/``None`` cells become empty strings, ``|`` is escaped and
        newlines are replaced by spaces. Rows are padded or truncated to the
        header width. Returns ``""`` when there are no rows or the header is
        empty.
        """
        if not cells:
            return ""
        header = cells[0] or []
        ncols = len(header)
        if ncols == 0:
            return ""

        def clean(cell: Any) -> str:
            return str(cell or "").replace("|", "\\|").replace("\n", " ").strip()

        lines = [
            "| " + " | ".join(clean(c) for c in header) + " |",
            "| " + " | ".join("---" for _ in range(ncols)) + " |",
        ]
        for row in cells[1:]:
            row = list(row or [])
            padded = row + [""] * max(0, ncols - len(row))
            lines.append("| " + " | ".join(clean(c) for c in padded[:ncols]) + " |")
        return "\n".join(lines)

    @staticmethod
    def _to_int(value: Any, default: int | None) -> int | None:
        """Return ``int(value)``, or ``default`` when it is None, "" or not convertible."""
        try:
            if value is None or value == "":
                return default
            return int(value)
        except (TypeError, ValueError, OverflowError):
            return default

    @staticmethod
    def _to_bool(value: Any, default: bool) -> bool:
        """Interpret ``value`` as a boolean.

        Booleans are returned unchanged; the strings 1/true/yes/on and
        0/false/no/off (case-insensitive, surrounding whitespace ignored) map
        to ``True`` / ``False``. ``None`` and anything else give ``default``.
        """
        if value is None:
            return default
        if isinstance(value, bool):
            return value
        s = str(value).strip().lower()
        if s in {"1", "true", "yes", "on"}:
            return True
        if s in {"0", "false", "no", "off"}:
            return False
        return default