# urbanLifeline/difyPlugin/pdf/tools/pdf_to_markdown.py

import base64
import logging
import re
from collections import OrderedDict
from collections.abc import Generator
from typing import Any

import fitz
from dify_plugin import Tool
from dify_plugin.entities.tool import ToolInvokeMessage


class PdfToMarkdownTool(Tool):
    """Convert PDF to a single Markdown file. No LLM needed.

    - Auto-detect TOC and organize content by chapters.
    - Extract text and tables as Markdown.
    - Embed raster images as base64.
    - Render vector drawings as base64 PNG.
    - Output one .md file via create_blob_message.
    """

    _TOC_PATTERNS = [
        r"目录", r"目　录", r"目\u3000录",
        r"Table of Contents", r"Contents", r"目次",
    ]

    # ── entry point ──────────────────────────────────────────

    def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
        """Convert the supplied PDF to Markdown and emit it as text plus a blob.

        Args:
            tool_parameters: Tool inputs. ``file`` is required and must expose
                the raw PDF bytes as ``.blob``. ``include_images`` (default
                true) toggles image embedding. ``image_dpi`` (default 150,
                clamped to 72-300) is the resolution used when rendering
                vector drawings.

        Yields:
            A text message holding the Markdown, followed by a blob message
            with the same content encoded as UTF-8 (``text/markdown``).
            When the file is missing, cannot be opened, or needs a password,
            a single ``Error: ...`` text message is yielded instead.
        """
        file = tool_parameters.get("file")
        if not file:
            yield self.create_text_message("Error: file is required")
            return

        include_images = self._to_bool(tool_parameters.get("include_images"), True)
        image_dpi = self._to_int(tool_parameters.get("image_dpi"), 150)
        image_dpi = max(72, min(image_dpi, 300))
        max_image_bytes = 2 * 1024 * 1024  # skip images > 2 MB raw

        # Report an unreadable file the same way as a missing one instead of
        # letting the exception escape the generator.
        try:
            doc = fitz.open(stream=file.blob, filetype="pdf")
        except Exception as exc:
            yield self.create_text_message(f"Error: failed to open PDF: {exc}")
            return

        try:
            # Pages of a password-protected document cannot be read, so stop
            # here with a clear message rather than failing on first access.
            if doc.needs_pass:
                yield self.create_text_message(
                    "Error: PDF is password-protected"
                )
                return

            num_pages = len(doc)

            # 1) Build chapter map (metadata TOC → printed TOC → none)
            chapters, content_offset = self._build_chapter_map(doc, num_pages)

            # 2) Convert every page
            page_mds: list[str] = [
                self._page_to_markdown(
                    doc, doc[idx], idx,
                    include_images, image_dpi, max_image_bytes,
                )
                for idx in range(num_pages)
            ]

            # 3) Assemble
            if chapters:
                final_md = self._assemble_by_chapters(
                    chapters, page_mds, content_offset, num_pages,
                )
            else:
                final_md = "\n\n---\n\n".join(m for m in page_mds if m.strip())

            # 4) Output: text (for variable aggregation) + blob (.md file)
            yield self.create_text_message(final_md)
            yield self.create_blob_message(
                blob=final_md.encode("utf-8"),
                meta={"mime_type": "text/markdown"},
            )
        finally:
            doc.close()

    # ── chapter detection ────────────────────────────────────

    def _build_chapter_map(
        self, doc: fitz.Document, num_pages: int,
    ) -> tuple[dict, int]:
        """Return (chapters_dict, content_offset).

        Try embedded PDF TOC metadata first (reliable page mapping).
        Fall back to scanning printed TOC pages.
        """
        toc = doc.get_toc()
        if toc:
            chapters = self._chapters_from_metadata(toc, num_pages)
            if chapters:
                return chapters, 0

        toc_start, toc_end = self._find_toc_pages(doc, num_pages)
        if toc_start is not None and toc_end is not None:
            toc_text = "\n".join(
                doc[i].get_text() or "" for i in range(toc_start, toc_end + 1)
            )
            chapters = self._parse_toc_lines(toc_text)
            if chapters:
                offset = self._guess_offset(chapters, toc_end)
                return chapters, offset

        return {}, 0

    def _chapters_from_metadata(
        self, toc: list, num_pages: int,
    ) -> dict[str, dict[str, int]]:
        top = [(t, max(0, p - 1)) for lvl, t, p in toc if lvl <= 2 and p >= 1]
        if not top:
            return {}
        chapters: dict[str, dict[str, int]] = OrderedDict()
        for i, (title, start) in enumerate(top):
            end = top[i + 1][1] - 1 if i + 1 < len(top) else num_pages - 1
            chapters[title] = {"start": start, "end": max(start, end)}
        return chapters

    def _find_toc_pages(self, doc, num_pages):
        toc_start = toc_end = None
        for pn in range(min(num_pages, 30)):
            text = doc[pn].get_text() or ""
            if any(re.search(p, text, re.IGNORECASE) for p in self._TOC_PATTERNS):
                if toc_start is None:
                    toc_start = pn
                toc_end = pn
            elif toc_start is not None:
                break
        return toc_start, toc_end

    def _parse_toc_lines(self, text: str) -> dict[str, dict[str, int]]:
        m = re.search(
            r"^(List\s+of\s+Figures|List\s+of\s+Tables|图目录|表目录)",
            text, re.IGNORECASE | re.MULTILINE,
        )
        if m:
            text = text[: m.start()]

        pat = re.compile(
            r"^\s*(?P<title>.+?)\s*(?:\.{2,}|\s)\s*(?P<page>\d{1,5})\s*$"
        )
        entries: list[tuple[str, int]] = []
        for raw in text.splitlines():
            line = raw.strip()
            if not line or len(line) < 3 or re.fullmatch(r"\d+", line):
                continue
            m2 = pat.match(line)
            if not m2:
                continue
            title = re.sub(r"\s+", " ", m2.group("title")).strip("-_:： ")
            page = self._to_int(m2.group("page"), None)
            if not title or page is None or len(title) <= 1:
                continue
            if title.lower() in {"page", "pages", "目录", "contents"}:
                continue
            entries.append((title, page))

        if not entries:
            return {}

        dedup: OrderedDict[str, int] = OrderedDict()
        for t, p in entries:
            dedup.setdefault(t, p)

        titles = list(dedup.keys())
        pages = [dedup[t] for t in titles]
        catalog: dict[str, dict[str, int]] = OrderedDict()
        for i, t in enumerate(titles):
            s = pages[i]
            e = max(s, pages[i + 1] - 1) if i + 1 < len(pages) else s
            catalog[t] = {"start": s, "end": e}
        return catalog

    @staticmethod
    def _guess_offset(chapters: dict, toc_end: int) -> int:
        first_page = None
        for info in chapters.values():
            s = info["start"]
            if first_page is None or s < first_page:
                first_page = s
        if first_page is None:
            return 0
        return (toc_end + 1) - first_page

    # ── per-page conversion ──────────────────────────────────

    def _page_to_markdown(
        self,
        doc: fitz.Document,
        page: fitz.Page,
        page_idx: int,
        include_images: bool,
        image_dpi: int,
        max_image_bytes: int,
    ) -> str:
        parts: list[str] = []

        # ── text ──
        text = (page.get_text("text", sort=True) or "").strip()
        if text:
            parts.append(text)

        # ── tables → Markdown ──
        try:
            for tab in (page.find_tables().tables or [])[:5]:
                cells = tab.extract() or []
                if len(cells) >= 2:
                    md = self._cells_to_md_table(cells)
                    if md:
                        parts.append(md)
        except Exception:
            pass

        if not include_images:
            return "\n\n".join(parts)

        # ── embedded raster images ──
        try:
            for img_idx, img_info in enumerate(page.get_images(full=True)):
                xref = img_info[0]
                try:
                    data = doc.extract_image(xref)
                    if not data or not data.get("image"):
                        continue
                    raw = data["image"]
                    if len(raw) > max_image_bytes:
                        continue
                    # skip tiny icons (< 20x20)
                    w = data.get("width", 0)
                    h = data.get("height", 0)
                    if w < 20 and h < 20:
                        continue
                    ext = data.get("ext", "png")
                    mime = "image/jpeg" if ext in ("jpg", "jpeg") else f"image/{ext}"
                    b64 = base64.b64encode(raw).decode("ascii")
                    parts.append(
                        f"![img-p{page_idx}-{img_idx}](data:{mime};base64,{b64})"
                    )
                except Exception:
                    pass
        except Exception:
            pass

        # ── vector drawings → render as PNG ──
        try:
            drawings = page.get_drawings()
            if len(drawings) >= 3:
                valid_rects: list[fitz.Rect] = []
                for d in drawings:
                    r = d.get("rect")
                    if r:
                        try:
                            rect = fitz.Rect(r)
                            if rect.is_valid and not rect.is_empty:
                                valid_rects.append(rect)
                        except Exception:
                            pass
                if valid_rects:
                    bbox = valid_rects[0]
                    for r in valid_rects[1:]:
                        bbox |= r
                    bbox &= page.rect
                    if bbox.width > 30 and bbox.height > 30:
                        scale = image_dpi / 72
                        mat = fitz.Matrix(scale, scale)
                        pix = page.get_pixmap(matrix=mat, clip=bbox)
                        png = pix.tobytes("png")
                        if len(png) <= max_image_bytes:
                            b64 = base64.b64encode(png).decode("ascii")
                            parts.append(
                                f"![drawing-p{page_idx}](data:image/png;base64,{b64})"
                            )
        except Exception:
            pass

        return "\n\n".join(parts)

    # ── assembly ─────────────────────────────────────────────

    def _assemble_by_chapters(
        self,
        chapters: dict[str, dict[str, int]],
        page_mds: list[str],
        offset: int,
        num_pages: int,
    ) -> str:
        parts: list[str] = []
        for name, info in chapters.items():
            s = info["start"] + offset
            e = info["end"] + offset
            s = max(0, min(s, num_pages - 1))
            e = max(s, min(e, num_pages - 1))
            ch: list[str] = [f"# {name}\n"]
            for idx in range(s, e + 1):
                if idx < len(page_mds) and page_mds[idx].strip():
                    ch.append(page_mds[idx])
            parts.append("\n\n".join(ch))
        return "\n\n---\n\n".join(parts)

    # ── helpers ──────────────────────────────────────────────

    @staticmethod
    def _cells_to_md_table(cells: list) -> str:
        if not cells:
            return ""
        header = cells[0]
        ncols = len(header)
        if ncols == 0:
            return ""
        clean = lambda c: str(c or "").replace("|", "\\|").replace("\n", " ").strip()
        lines = [
            "| " + " | ".join(clean(c) for c in header) + " |",
            "| " + " | ".join("---" for _ in range(ncols)) + " |",
        ]
        for row in cells[1:]:
            padded = list(row) + [""] * max(0, ncols - len(row))
            lines.append("| " + " | ".join(clean(c) for c in padded[:ncols]) + " |")
        return "\n".join(lines)

    @staticmethod
    def _to_int(value: Any, default: int | None) -> int | None:
        try:
            if value is None or value == "":
                return default
            return int(value)
        except Exception:
            return default

    @staticmethod
    def _to_bool(value: Any, default: bool) -> bool:
        if value is None:
            return default
        if isinstance(value, bool):
            return value
        s = str(value).strip().lower()
        if s in {"1", "true", "yes", "on"}:
            return True
        if s in {"0", "false", "no", "off"}:
            return False
        return default