更新

2026-03-15 13:00:30 +08:00
parent 91ff28bdcf
commit 136ddc270c
15 changed files with 1459 additions and 1276 deletions
--- a/difyPlugin/pdf/tools/pdf_to_markdown.py
+++ b/difyPlugin/pdf/tools/pdf_to_markdown.py
@@ -1,6 +1,5 @@
-import base64
+import json
 import re
-from collections import OrderedDict
 from collections.abc import Generator
 from typing import Any

@@ -10,306 +9,219 @@ from dify_plugin.entities.tool import ToolInvokeMessage


 class PdfToMarkdownTool(Tool):
-    """Convert PDF to a single Markdown file. No LLM needed.
-
-    - Auto-detect TOC and organize content by chapters.
-    - Extract text and tables as Markdown.
-    - Embed raster images as base64.
-    - Render vector drawings as base64 PNG.
-    - Output one .md file via create_blob_message.
-    """
-
-    _TOC_PATTERNS = [
-        r"目录", r"目　录", r"目\u3000录",
-        r"Table of Contents", r"Contents", r"目次",
-    ]
-
-    # ── entry point ──────────────────────────────────────────
+    """Convert PDF to Markdown using an external catalog array."""

    def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
        file = tool_parameters.get("file")
+        catalog_text = (tool_parameters.get("catalog") or "").strip()
        if not file:
            yield self.create_text_message("Error: file is required")
            return
+        if not catalog_text:
+            yield self.create_text_message("Error: catalog is required")
+            return

-        include_images = self._to_bool(tool_parameters.get("include_images"), True)
-        image_dpi = self._to_int(tool_parameters.get("image_dpi"), 150)
-        image_dpi = max(72, min(image_dpi, 300))
-        max_image_bytes = 2 * 1024 * 1024  # skip images > 2 MB raw
+        catalog = self._parse_catalog(catalog_text)
+        if not catalog:
+            yield self.create_text_message("Error: catalog must be a JSON array with title and page indexes")
+            return

        doc = fitz.open(stream=file.blob, filetype="pdf")
        try:
            num_pages = len(doc)
+            hf_texts = self._detect_headers_footers(doc, num_pages)
+            page_mds = [self._page_to_markdown(doc[index], hf_texts) for index in range(num_pages)]
+            final_md = self._assemble_by_catalog(catalog, page_mds, num_pages)

-            # 1) Build chapter map (metadata TOC → printed TOC → none)
-            chapters, content_offset = self._build_chapter_map(doc, num_pages)
-
-            # 2) Convert every page
-            page_mds: list[str] = []
-            for idx in range(num_pages):
-                md = self._page_to_markdown(
-                    doc, doc[idx], idx,
-                    include_images, image_dpi, max_image_bytes,
-                )
-                page_mds.append(md)
-
-            # 3) Assemble
-            if chapters:
-                final_md = self._assemble_by_chapters(
-                    chapters, page_mds, content_offset, num_pages,
-                )
-            else:
-                final_md = "\n\n---\n\n".join(m for m in page_mds if m.strip())
-
-            # 4) Output: text (for variable aggregation) + blob (.md file)
            yield self.create_text_message(final_md)
-            md_bytes = final_md.encode("utf-8")
            yield self.create_blob_message(
-                blob=md_bytes,
+                blob=final_md.encode("utf-8"),
                meta={"mime_type": "text/markdown"},
            )
        finally:
            doc.close()

-    # ── chapter detection ────────────────────────────────────
-
-    def _build_chapter_map(
-        self, doc: fitz.Document, num_pages: int,
-    ) -> tuple[dict, int]:
-        """Return (chapters_dict, content_offset).
-
-        Try embedded PDF TOC metadata first (reliable page mapping).
-        Fall back to scanning printed TOC pages.
-        """
-        toc = doc.get_toc()
-        if toc:
-            chapters = self._chapters_from_metadata(toc, num_pages)
-            if chapters:
-                return chapters, 0
-
-        toc_start, toc_end = self._find_toc_pages(doc, num_pages)
-        if toc_start is not None and toc_end is not None:
-            toc_text = "\n".join(
-                doc[i].get_text() or "" for i in range(toc_start, toc_end + 1)
-            )
-            chapters = self._parse_toc_lines(toc_text)
-            if chapters:
-                offset = self._guess_offset(chapters, toc_end)
-                return chapters, offset
-
-        return {}, 0
-
-    def _chapters_from_metadata(
-        self, toc: list, num_pages: int,
-    ) -> dict[str, dict[str, int]]:
-        top = [(t, max(0, p - 1)) for lvl, t, p in toc if lvl <= 2 and p >= 1]
-        if not top:
-            return {}
-        chapters: dict[str, dict[str, int]] = OrderedDict()
-        for i, (title, start) in enumerate(top):
-            end = top[i + 1][1] - 1 if i + 1 < len(top) else num_pages - 1
-            chapters[title] = {"start": start, "end": max(start, end)}
-        return chapters
-
-    def _find_toc_pages(self, doc, num_pages):
-        toc_start = toc_end = None
-        for pn in range(min(num_pages, 30)):
-            text = doc[pn].get_text() or ""
-            if any(re.search(p, text, re.IGNORECASE) for p in self._TOC_PATTERNS):
-                if toc_start is None:
-                    toc_start = pn
-                toc_end = pn
-            elif toc_start is not None:
-                break
-        return toc_start, toc_end
-
-    def _parse_toc_lines(self, text: str) -> dict[str, dict[str, int]]:
-        m = re.search(
-            r"^(List\s+of\s+Figures|List\s+of\s+Tables|图目录|表目录)",
-            text, re.IGNORECASE | re.MULTILINE,
-        )
-        if m:
-            text = text[: m.start()]
-
-        pat = re.compile(
-            r"^\s*(?P<title>.+?)\s*(?:\.{2,}|\s)\s*(?P<page>\d{1,5})\s*$"
-        )
-        entries: list[tuple[str, int]] = []
-        for raw in text.splitlines():
-            line = raw.strip()
-            if not line or len(line) < 3 or re.fullmatch(r"\d+", line):
-                continue
-            m2 = pat.match(line)
-            if not m2:
-                continue
-            title = re.sub(r"\s+", " ", m2.group("title")).strip("-_:： ")
-            page = self._to_int(m2.group("page"), None)
-            if not title or page is None or len(title) <= 1:
-                continue
-            if title.lower() in {"page", "pages", "目录", "contents"}:
-                continue
-            entries.append((title, page))
-
-        if not entries:
-            return {}
-
-        dedup: OrderedDict[str, int] = OrderedDict()
-        for t, p in entries:
-            dedup.setdefault(t, p)
-
-        titles = list(dedup.keys())
-        pages = [dedup[t] for t in titles]
-        catalog: dict[str, dict[str, int]] = OrderedDict()
-        for i, t in enumerate(titles):
-            s = pages[i]
-            e = max(s, pages[i + 1] - 1) if i + 1 < len(pages) else s
-            catalog[t] = {"start": s, "end": e}
-        return catalog
-
-    @staticmethod
-    def _guess_offset(chapters: dict, toc_end: int) -> int:
-        first_page = None
-        for info in chapters.values():
-            s = info["start"]
-            if first_page is None or s < first_page:
-                first_page = s
-        if first_page is None:
-            return 0
-        return (toc_end + 1) - first_page
-
-    # ── per-page conversion ──────────────────────────────────
-
-    def _page_to_markdown(
-        self,
-        doc: fitz.Document,
-        page: fitz.Page,
-        page_idx: int,
-        include_images: bool,
-        image_dpi: int,
-        max_image_bytes: int,
-    ) -> str:
-        parts: list[str] = []
-
-        # ── text ──
-        text = (page.get_text("text", sort=True) or "").strip()
-        if text:
-            parts.append(text)
-
-        # ── tables → Markdown ──
+    def _parse_catalog(self, catalog_text: str) -> list[dict[str, Any]]:
        try:
-            for tab in (page.find_tables().tables or [])[:5]:
-                cells = tab.extract() or []
-                if len(cells) >= 2:
-                    md = self._cells_to_md_table(cells)
-                    if md:
-                        parts.append(md)
+            raw = json.loads(catalog_text)
        except Exception:
-            pass
+            return []

-        if not include_images:
-            return "\n\n".join(parts)
+        if not isinstance(raw, list):
+            return []

-        # ── embedded raster images ──
+        result: list[dict[str, Any]] = []
+        for item in raw:
+            if not isinstance(item, dict):
+                continue
+
+            title = str(item.get("title") or "").strip() or "Untitled"
+            start_index = self._to_int(item.get("page_start_index"), None)
+            end_index = self._to_int(item.get("page_end_index"), start_index)
+
+            if start_index is None:
+                start = self._to_int(item.get("start"), None)
+                end = self._to_int(item.get("end"), start)
+                if start is None:
+                    continue
+                start_index = max(0, start - 1)
+                end_index = max(start_index, (end if end is not None else start) - 1)
+
+            if end_index is None:
+                end_index = start_index
+
+            result.append(
+                {
+                    "title": title,
+                    "page_start_index": max(0, start_index),
+                    "page_end_index": max(start_index, end_index),
+                }
+            )
+        return result
+
+    def _detect_headers_footers(self, doc: fitz.Document, num_pages: int) -> set[str]:
+        margin_ratio = 0.08
+        sample_count = min(num_pages, 30)
+        text_counts: dict[str, int] = {}
+
+        for idx in range(sample_count):
+            page = doc[idx]
+            page_height = page.rect.height
+            top_limit = page_height * margin_ratio
+            bottom_limit = page_height * (1 - margin_ratio)
+            try:
+                blocks = page.get_text("blocks", sort=True) or []
+            except Exception:
+                continue
+
+            seen: set[str] = set()
+            for block in blocks:
+                if len(block) < 7 or block[6] != 0:
+                    continue
+                y0, y1 = block[1], block[3]
+                text = (block[4] or "").strip()
+                if not text or len(text) < 2 or text in seen:
+                    continue
+                if y1 <= top_limit or y0 >= bottom_limit:
+                    seen.add(text)
+                    text_counts[text] = text_counts.get(text, 0) + 1
+
+        threshold = max(3, sample_count * 0.35)
+        return {text for text, count in text_counts.items() if count >= threshold}
+
+    def _page_to_markdown(self, page: fitz.Page, hf_texts: set[str]) -> str:
+        parts: list[str] = []
+        page_height = page.rect.height
+        top_margin = page_height * 0.06
+        bottom_margin = page_height * 0.94
+
+        table_rects: list[fitz.Rect] = []
+        table_mds: list[str] = []
        try:
-            for img_idx, img_info in enumerate(page.get_images(full=True)):
-                xref = img_info[0]
+            find_tables = getattr(page, "find_tables", None)
+            tables = []
+            if callable(find_tables):
+                table_finder = find_tables()
+                tables = getattr(table_finder, "tables", []) or []
+
+            for table in tables[:5]:
                try:
-                    data = doc.extract_image(xref)
-                    if not data or not data.get("image"):
-                        continue
-                    raw = data["image"]
-                    if len(raw) > max_image_bytes:
-                        continue
-                    # skip tiny icons (< 20x20)
-                    w = data.get("width", 0)
-                    h = data.get("height", 0)
-                    if w < 20 and h < 20:
-                        continue
-                    ext = data.get("ext", "png")
-                    mime = "image/jpeg" if ext in ("jpg", "jpeg") else f"image/{ext}"
-                    b64 = base64.b64encode(raw).decode("ascii")
-                    parts.append(
-                        f"![img-p{page_idx}-{img_idx}](data:{mime};base64,{b64})"
-                    )
+                    table_rects.append(fitz.Rect(table.bbox))
                except Exception:
                    pass
+
+                cells = table.extract() or []
+                if len(cells) < 2:
+                    continue
+                if hf_texts and len(cells) <= 3:
+                    flat = " ".join(str(cell or "") for row in cells for cell in row)
+                    if any(hf in flat for hf in hf_texts):
+                        continue
+
+                md_table = self._cells_to_md_table(cells)
+                if md_table:
+                    table_mds.append(md_table)
        except Exception:
            pass

-        # ── vector drawings → render as PNG ──
        try:
-            drawings = page.get_drawings()
-            if len(drawings) >= 3:
-                valid_rects: list[fitz.Rect] = []
-                for d in drawings:
-                    r = d.get("rect")
-                    if r:
-                        try:
-                            rect = fitz.Rect(r)
-                            if rect.is_valid and not rect.is_empty:
-                                valid_rects.append(rect)
-                        except Exception:
-                            pass
-                if valid_rects:
-                    bbox = valid_rects[0]
-                    for r in valid_rects[1:]:
-                        bbox |= r
-                    bbox &= page.rect
-                    if bbox.width > 30 and bbox.height > 30:
-                        scale = image_dpi / 72
-                        mat = fitz.Matrix(scale, scale)
-                        pix = page.get_pixmap(matrix=mat, clip=bbox)
-                        png = pix.tobytes("png")
-                        if len(png) <= max_image_bytes:
-                            b64 = base64.b64encode(png).decode("ascii")
-                            parts.append(
-                                f"![drawing-p{page_idx}](data:image/png;base64,{b64})"
-                            )
+            blocks = page.get_text("blocks", sort=True) or []
        except Exception:
-            pass
+            blocks = []

+        for block in blocks:
+            if len(block) < 7 or block[6] != 0:
+                continue
+            x0, y0, x1, y1 = block[:4]
+            text = (block[4] or "").strip()
+            if not text:
+                continue
+
+            block_rect = fitz.Rect(x0, y0, x1, y1)
+            if any(self._rects_overlap(block_rect, table_rect) for table_rect in table_rects):
+                continue
+            if hf_texts and (y1 <= top_margin or y0 >= bottom_margin):
+                if any(hf in text for hf in hf_texts):
+                    continue
+            if re.fullmatch(r"\s*\d{1,4}\s*", text):
+                continue
+
+            parts.append(text)
+
+        parts.extend(table_mds)
        return "\n\n".join(parts)

-    # ── assembly ─────────────────────────────────────────────
-
-    def _assemble_by_chapters(
-        self,
-        chapters: dict[str, dict[str, int]],
-        page_mds: list[str],
-        offset: int,
-        num_pages: int,
-    ) -> str:
+    def _assemble_by_catalog(self, catalog: list[dict[str, Any]], page_mds: list[str], num_pages: int) -> str:
        parts: list[str] = []
-        for name, info in chapters.items():
-            s = info["start"] + offset
-            e = info["end"] + offset
-            s = max(0, min(s, num_pages - 1))
-            e = max(s, min(e, num_pages - 1))
-            ch: list[str] = [f"# {name}\n"]
-            for idx in range(s, e + 1):
-                if idx < len(page_mds) and page_mds[idx].strip():
-                    ch.append(page_mds[idx])
-            parts.append("\n\n".join(ch))
-        return "\n\n---\n\n".join(parts)
+        used_pages: set[int] = set()

-    # ── helpers ──────────────────────────────────────────────
+        for item in catalog:
+            start = max(0, min(int(item["page_start_index"]), num_pages - 1))
+            end = max(start, min(int(item["page_end_index"]), num_pages - 1))
+
+            chapter_parts = [f"# {item['title']}\n"]
+            for idx in range(start, end + 1):
+                if idx < len(page_mds) and page_mds[idx].strip() and idx not in used_pages:
+                    chapter_parts.append(page_mds[idx])
+                    used_pages.add(idx)
+
+            if len(chapter_parts) > 1:
+                parts.append("\n\n".join(chapter_parts))
+
+        if parts:
+            return "\n\n---\n\n".join(parts)
+        return "\n\n---\n\n".join(m for m in page_mds if m.strip())
+
+    @staticmethod
+    def _rects_overlap(block_rect: fitz.Rect, table_rect: fitz.Rect) -> bool:
+        inter = block_rect & table_rect
+        if inter.is_empty:
+            return False
+        block_area = block_rect.width * block_rect.height
+        if block_area <= 0:
+            return False
+        return (inter.width * inter.height) / block_area >= 0.3

    @staticmethod
    def _cells_to_md_table(cells: list) -> str:
        if not cells:
            return ""
+
        header = cells[0]
        ncols = len(header)
        if ncols == 0:
            return ""
-        clean = lambda c: str(c or "").replace("|", "\\|").replace("\n", " ").strip()
+
+        def clean(value: Any) -> str:
+            return str(value or "").replace("|", "\\|").replace("\n", " ").strip()
+
        lines = [
-            "| " + " | ".join(clean(c) for c in header) + " |",
+            "| " + " | ".join(clean(cell) for cell in header) + " |",
            "| " + " | ".join("---" for _ in range(ncols)) + " |",
        ]
        for row in cells[1:]:
            padded = list(row) + [""] * max(0, ncols - len(row))
-            lines.append("| " + " | ".join(clean(c) for c in padded[:ncols]) + " |")
+            lines.append("| " + " | ".join(clean(cell) for cell in padded[:ncols]) + " |")
        return "\n".join(lines)

    @staticmethod
@@ -320,16 +232,3 @@ class PdfToMarkdownTool(Tool):
            return int(value)
        except Exception:
            return default
-
-    @staticmethod
-    def _to_bool(value: Any, default: bool) -> bool:
-        if value is None:
-            return default
-        if isinstance(value, bool):
-            return value
-        s = str(value).strip().lower()
-        if s in {"1", "true", "yes", "on"}:
-            return True
-        if s in {"0", "false", "no", "off"}:
-            return False
-        return default