urbanLifeline/difyPlugin/pdf/tools/pdf_to_markdown.py

import json
import re
from collections.abc import Generator
from typing import Any

import fitz
from dify_plugin import Tool
from dify_plugin.entities.tool import ToolInvokeMessage


class PdfToMarkdownTool(Tool):
    """Convert PDF to Markdown using an external catalog array."""

    def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
        """Convert the supplied PDF into Markdown grouped by catalog chapters.

        Reads two tool parameters: ``file`` (an object whose ``blob`` attribute
        holds the PDF bytes) and ``catalog`` (a JSON array, normally passed as
        text; an already-decoded list/dict is accepted as well).

        Yields:
            On success, two messages: the Markdown as a text message and the
            same Markdown as a UTF-8 blob tagged ``text/markdown``. On a missing
            parameter, an unusable catalog or an unreadable PDF, a single
            ``"Error: ..."`` text message.
        """
        file = tool_parameters.get("file")
        catalog_param = tool_parameters.get("catalog")
        if isinstance(catalog_param, (list, dict)):
            # An already-decoded catalog is serialised back to text so that it
            # goes through the same validation path as a JSON string.
            try:
                catalog_text = json.dumps(catalog_param, ensure_ascii=False)
            except (TypeError, ValueError):
                catalog_text = ""
        else:
            catalog_text = str(catalog_param or "").strip()

        if not file:
            yield self.create_text_message("Error: file is required")
            return
        if not catalog_text:
            yield self.create_text_message("Error: catalog is required")
            return

        catalog = self._parse_catalog(catalog_text)
        if not catalog:
            yield self.create_text_message("Error: catalog must be a JSON array with title and page indexes")
            return

        try:
            doc = fitz.open(stream=file.blob, filetype="pdf")
        except Exception as exc:
            # Tool boundary: report an unreadable upload the same way as the
            # other input errors instead of letting the exception escape.
            yield self.create_text_message(f"Error: failed to open PDF: {exc}")
            return

        try:
            num_pages = len(doc)
            hf_texts = self._detect_headers_footers(doc, num_pages)
            page_mds = [self._page_to_markdown(doc[index], hf_texts) for index in range(num_pages)]
            final_md = self._assemble_by_catalog(catalog, page_mds, num_pages)

            yield self.create_text_message(final_md)
            yield self.create_blob_message(
                blob=final_md.encode("utf-8"),
                meta={"mime_type": "text/markdown"},
            )
        finally:
            # Always release the document, even if the consumer stops early.
            doc.close()

    def _parse_catalog(self, catalog_text: str) -> list[dict[str, Any]]:
        try:
            raw = json.loads(catalog_text)
        except Exception:
            return []

        if not isinstance(raw, list):
            return []

        result: list[dict[str, Any]] = []
        for item in raw:
            if not isinstance(item, dict):
                continue

            title = str(item.get("title") or "").strip() or "Untitled"
            start_index = self._to_int(item.get("page_start_index"), None)
            end_index = self._to_int(item.get("page_end_index"), start_index)

            if start_index is None:
                start = self._to_int(item.get("start"), None)
                end = self._to_int(item.get("end"), start)
                if start is None:
                    continue
                start_index = max(0, start - 1)
                end_index = max(start_index, (end if end is not None else start) - 1)

            if end_index is None:
                end_index = start_index

            result.append(
                {
                    "title": title,
                    "page_start_index": max(0, start_index),
                    "page_end_index": max(start_index, end_index),
                }
            )
        return result

    def _detect_headers_footers(self, doc: fitz.Document, num_pages: int) -> set[str]:
        """Collect strings that recur in the top/bottom margins of the first pages.

        At most the first 30 pages are scanned. On each page, every text block
        lying entirely inside the top or bottom 8% of the page height
        contributes its stripped text (two characters or longer) once. A string
        is reported when it was seen on at least ``max(3, 35% of scanned pages)``
        pages.

        Args:
            doc: The opened document; indexed by page number.
            num_pages: Number of pages in ``doc``.

        Returns:
            The set of margin strings that met the repetition threshold.
        """
        band = 0.08
        pages_to_scan = min(num_pages, 30)
        occurrences: dict[str, int] = {}

        for page_no in range(pages_to_scan):
            page = doc[page_no]
            height = page.rect.height
            header_bottom = height * band
            footer_top = height * (1 - band)
            try:
                raw_blocks = page.get_text("blocks", sort=True) or []
            except Exception:
                # Pages whose text cannot be read simply do not vote.
                continue

            # Unique margin strings of this page; each counts once per page.
            margin_texts: set[str] = set()
            for entry in raw_blocks:
                is_text_block = len(entry) >= 7 and entry[6] == 0
                if not is_text_block:
                    continue
                stripped = (entry[4] or "").strip()
                if len(stripped) < 2:
                    continue
                top, bottom = entry[1], entry[3]
                if bottom <= header_bottom or top >= footer_top:
                    margin_texts.add(stripped)

            for stripped in margin_texts:
                occurrences[stripped] = occurrences.get(stripped, 0) + 1

        minimum_pages = max(3, pages_to_scan * 0.35)
        return {candidate for candidate, hits in occurrences.items() if hits >= minimum_pages}

    def _page_to_markdown(self, page: fitz.Page, hf_texts: set[str]) -> str:
        """Render one page as Markdown: its text blocks followed by its tables.

        Up to five detected tables are converted to Markdown tables and placed
        after the page's text. A text block is dropped when it

        * overlaps a handled table (see ``_rects_overlap``),
        * lies in the top/bottom 6% of the page and contains one of
          ``hf_texts``, or
        * consists solely of a 1-4 digit number (presumably a page number).

        Small tables (three rows or fewer) containing one of ``hf_texts`` are
        treated as header/footer decoration: they are not rendered and their
        text is suppressed. Tables that are skipped for any other reason (fewer
        than two rows, empty header, extraction failure) keep their text blocks,
        so their content is not lost.

        Args:
            page: The page to convert.
            hf_texts: Strings identified as repeating headers/footers.

        Returns:
            The surviving pieces joined by blank lines; an empty string when
            nothing survives.
        """
        parts: list[str] = []
        page_height = page.rect.height
        top_margin = page_height * 0.06
        bottom_margin = page_height * 0.94

        table_rects: list[fitz.Rect] = []
        table_mds: list[str] = []

        # Table detection is best-effort: older PyMuPDF builds may lack
        # ``find_tables`` and detection itself may fail on unusual pages.
        tables: list = []
        find_tables = getattr(page, "find_tables", None)
        if callable(find_tables):
            try:
                tables = list(getattr(find_tables(), "tables", []) or [])[:5]
            except Exception:
                tables = []

        for table in tables:
            try:
                cells = table.extract() or []
                if len(cells) < 2:
                    # Not rendered as a table, so leave its text blocks alone.
                    continue

                is_header_footer = False
                if hf_texts and len(cells) <= 3:
                    flat = " ".join(str(cell or "") for row in cells for cell in row)
                    is_header_footer = any(hf in flat for hf in hf_texts)

                md_table = "" if is_header_footer else self._cells_to_md_table(cells)
            except Exception:
                # A single unreadable table must not discard the other tables.
                continue

            if not is_header_footer and not md_table:
                continue

            # Only now claim the table's area: either the table is emitted as
            # Markdown or it is header/footer decoration that should vanish.
            try:
                table_rects.append(fitz.Rect(table.bbox))
            except Exception:
                pass
            if md_table:
                table_mds.append(md_table)

        try:
            blocks = page.get_text("blocks", sort=True) or []
        except Exception:
            blocks = []

        for block in blocks:
            # Only entries whose seventh field is 0 are processed (text blocks
            # in PyMuPDF's block tuple layout).
            if len(block) < 7 or block[6] != 0:
                continue
            x0, y0, x1, y1 = block[:4]
            text = (block[4] or "").strip()
            if not text:
                continue

            block_rect = fitz.Rect(x0, y0, x1, y1)
            if any(self._rects_overlap(block_rect, table_rect) for table_rect in table_rects):
                continue
            if hf_texts and (y1 <= top_margin or y0 >= bottom_margin):
                if any(hf in text for hf in hf_texts):
                    continue
            if re.fullmatch(r"\s*\d{1,4}\s*", text):
                continue

            parts.append(text)

        parts.extend(table_mds)
        return "\n\n".join(parts)

    def _assemble_by_catalog(self, catalog: list[dict[str, Any]], page_mds: list[str], num_pages: int) -> str:
        parts: list[str] = []
        used_pages: set[int] = set()

        for item in catalog:
            start = max(0, min(int(item["page_start_index"]), num_pages - 1))
            end = max(start, min(int(item["page_end_index"]), num_pages - 1))

            chapter_parts = [f"# {item['title']}\n"]
            for idx in range(start, end + 1):
                if idx < len(page_mds) and page_mds[idx].strip() and idx not in used_pages:
                    chapter_parts.append(page_mds[idx])
                    used_pages.add(idx)

            if len(chapter_parts) > 1:
                parts.append("\n\n".join(chapter_parts))

        if parts:
            return "\n\n---\n\n".join(parts)
        return "\n\n---\n\n".join(m for m in page_mds if m.strip())

    @staticmethod
    def _rects_overlap(block_rect: fitz.Rect, table_rect: fitz.Rect) -> bool:
        """Tell whether at least 30% of ``block_rect``'s area lies inside ``table_rect``.

        The ratio is measured against the block's own area, so a small block
        sitting inside a large table counts as overlapping.

        Returns:
            ``True`` when the intersection covers 30% or more of the block;
            ``False`` when the rectangles do not intersect or the block has no
            positive area.
        """
        overlap = block_rect & table_rect
        if not overlap.is_empty:
            own_area = block_rect.width * block_rect.height
            if own_area > 0:
                covered = overlap.width * overlap.height
                return covered / own_area >= 0.3
        return False

    @staticmethod
    def _cells_to_md_table(cells: list) -> str:
        if not cells:
            return ""

        header = cells[0]
        ncols = len(header)
        if ncols == 0:
            return ""

        def clean(value: Any) -> str:
            return str(value or "").replace("|", "\\|").replace("\n", " ").strip()

        lines = [
            "| " + " | ".join(clean(cell) for cell in header) + " |",
            "| " + " | ".join("---" for _ in range(ncols)) + " |",
        ]
        for row in cells[1:]:
            padded = list(row) + [""] * max(0, ncols - len(row))
            lines.append("| " + " | ".join(clean(cell) for cell in padded[:ncols]) + " |")
        return "\n".join(lines)

    @staticmethod
    def _to_int(value: Any, default: int | None) -> int | None:
        try:
            if value is None or value == "":
                return default
            return int(value)
        except Exception:
            return default
更新 2026-03-15 13:00:30 +08:00			`import json`
更新 2026-03-06 14:50:43 +08:00			`import re`
			`from collections.abc import Generator`
			`from typing import Any`

			`import fitz`
			`from dify_plugin import Tool`
			`from dify_plugin.entities.tool import ToolInvokeMessage`


			`class PdfToMarkdownTool(Tool):`
更新 2026-03-15 13:00:30 +08:00			`"""Convert PDF to Markdown using an external catalog array."""`
更新 2026-03-06 14:50:43 +08:00
			`def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:`
			`file = tool_parameters.get("file")`
更新 2026-03-15 13:00:30 +08:00			`catalog_text = (tool_parameters.get("catalog") or "").strip()`
更新 2026-03-06 14:50:43 +08:00			`if not file:`
			`yield self.create_text_message("Error: file is required")`
			`return`
更新 2026-03-15 13:00:30 +08:00			`if not catalog_text:`
			`yield self.create_text_message("Error: catalog is required")`
			`return`
更新 2026-03-06 14:50:43 +08:00
更新 2026-03-15 13:00:30 +08:00			`catalog = self._parse_catalog(catalog_text)`
			`if not catalog:`
			`yield self.create_text_message("Error: catalog must be a JSON array with title and page indexes")`
			`return`
更新 2026-03-06 14:50:43 +08:00
			`doc = fitz.open(stream=file.blob, filetype="pdf")`
			`try:`
			`num_pages = len(doc)`
更新 2026-03-15 13:00:30 +08:00			`hf_texts = self._detect_headers_footers(doc, num_pages)`
			`page_mds = [self._page_to_markdown(doc[index], hf_texts) for index in range(num_pages)]`
			`final_md = self._assemble_by_catalog(catalog, page_mds, num_pages)`
更新 2026-03-06 14:50:43 +08:00
			`yield self.create_text_message(final_md)`
			`yield self.create_blob_message(`
更新 2026-03-15 13:00:30 +08:00			`blob=final_md.encode("utf-8"),`
更新 2026-03-06 14:50:43 +08:00			`meta={"mime_type": "text/markdown"},`
			`)`
			`finally:`
			`doc.close()`

更新 2026-03-15 13:00:30 +08:00			`def _parse_catalog(self, catalog_text: str) -> list[dict[str, Any]]:`
			`try:`
			`raw = json.loads(catalog_text)`
			`except Exception:`
			`return []`
更新 2026-03-06 14:50:43 +08:00
更新 2026-03-15 13:00:30 +08:00			`if not isinstance(raw, list):`
			`return []`
更新 2026-03-06 14:50:43 +08:00
更新 2026-03-15 13:00:30 +08:00			`result: list[dict[str, Any]] = []`
			`for item in raw:`
			`if not isinstance(item, dict):`
			`continue`
更新 2026-03-06 14:50:43 +08:00
更新 2026-03-15 13:00:30 +08:00			`title = str(item.get("title") or "").strip() or "Untitled"`
			`start_index = self._to_int(item.get("page_start_index"), None)`
			`end_index = self._to_int(item.get("page_end_index"), start_index)`

			`if start_index is None:`
			`start = self._to_int(item.get("start"), None)`
			`end = self._to_int(item.get("end"), start)`
			`if start is None:`
			`continue`
			`start_index = max(0, start - 1)`
			`end_index = max(start_index, (end if end is not None else start) - 1)`

			`if end_index is None:`
			`end_index = start_index`

			`result.append(`
			`{`
			`"title": title,`
			`"page_start_index": max(0, start_index),`
			`"page_end_index": max(start_index, end_index),`
			`}`
			`)`
			`return result`

			`def _detect_headers_footers(self, doc: fitz.Document, num_pages: int) -> set[str]:`
			`margin_ratio = 0.08`
			`sample_count = min(num_pages, 30)`
			`text_counts: dict[str, int] = {}`

			`for idx in range(sample_count):`
			`page = doc[idx]`
			`page_height = page.rect.height`
			`top_limit = page_height * margin_ratio`
			`bottom_limit = page_height * (1 - margin_ratio)`
			`try:`
			`blocks = page.get_text("blocks", sort=True) or []`
			`except Exception:`
			`continue`
更新 2026-03-06 14:50:43 +08:00
更新 2026-03-15 13:00:30 +08:00			`seen: set[str] = set()`
			`for block in blocks:`
			`if len(block) < 7 or block[6] != 0:`
			`continue`
			`y0, y1 = block[1], block[3]`
			`text = (block[4] or "").strip()`
			`if not text or len(text) < 2 or text in seen:`
			`continue`
			`if y1 <= top_limit or y0 >= bottom_limit:`
			`seen.add(text)`
			`text_counts[text] = text_counts.get(text, 0) + 1`

			`threshold = max(3, sample_count * 0.35)`
			`return {text for text, count in text_counts.items() if count >= threshold}`

			`def _page_to_markdown(self, page: fitz.Page, hf_texts: set[str]) -> str:`
更新 2026-03-06 14:50:43 +08:00			`parts: list[str] = []`
更新 2026-03-15 13:00:30 +08:00			`page_height = page.rect.height`
			`top_margin = page_height * 0.06`
			`bottom_margin = page_height * 0.94`
更新 2026-03-06 14:50:43 +08:00
更新 2026-03-15 13:00:30 +08:00			`table_rects: list[fitz.Rect] = []`
			`table_mds: list[str] = []`
更新 2026-03-06 14:50:43 +08:00			`try:`
更新 2026-03-15 13:00:30 +08:00			`find_tables = getattr(page, "find_tables", None)`
			`tables = []`
			`if callable(find_tables):`
			`table_finder = find_tables()`
			`tables = getattr(table_finder, "tables", []) or []`
更新 2026-03-06 14:50:43 +08:00
更新 2026-03-15 13:00:30 +08:00			`for table in tables[:5]:`
更新 2026-03-06 14:50:43 +08:00			`try:`
更新 2026-03-15 13:00:30 +08:00			`table_rects.append(fitz.Rect(table.bbox))`
更新 2026-03-06 14:50:43 +08:00			`except Exception:`
			`pass`
更新 2026-03-15 13:00:30 +08:00
			`cells = table.extract() or []`
			`if len(cells) < 2:`
			`continue`
			`if hf_texts and len(cells) <= 3:`
			`flat = " ".join(str(cell or "") for row in cells for cell in row)`
			`if any(hf in flat for hf in hf_texts):`
			`continue`

			`md_table = self._cells_to_md_table(cells)`
			`if md_table:`
			`table_mds.append(md_table)`
更新 2026-03-06 14:50:43 +08:00			`except Exception:`
			`pass`

			`try:`
更新 2026-03-15 13:00:30 +08:00			`blocks = page.get_text("blocks", sort=True) or []`
更新 2026-03-06 14:50:43 +08:00			`except Exception:`
更新 2026-03-15 13:00:30 +08:00			`blocks = []`
更新 2026-03-06 14:50:43 +08:00
更新 2026-03-15 13:00:30 +08:00			`for block in blocks:`
			`if len(block) < 7 or block[6] != 0:`
			`continue`
			`x0, y0, x1, y1 = block[:4]`
			`text = (block[4] or "").strip()`
			`if not text:`
			`continue`
更新 2026-03-06 14:50:43 +08:00
更新 2026-03-15 13:00:30 +08:00			`block_rect = fitz.Rect(x0, y0, x1, y1)`
			`if any(self._rects_overlap(block_rect, table_rect) for table_rect in table_rects):`
			`continue`
			`if hf_texts and (y1 <= top_margin or y0 >= bottom_margin):`
			`if any(hf in text for hf in hf_texts):`
			`continue`
			`if re.fullmatch(r"\s*\d{1,4}\s*", text):`
			`continue`
更新 2026-03-06 14:50:43 +08:00
更新 2026-03-15 13:00:30 +08:00			`parts.append(text)`

			`parts.extend(table_mds)`
			`return "\n\n".join(parts)`

			`def _assemble_by_catalog(self, catalog: list[dict[str, Any]], page_mds: list[str], num_pages: int) -> str:`
更新 2026-03-06 14:50:43 +08:00			`parts: list[str] = []`
更新 2026-03-15 13:00:30 +08:00			`used_pages: set[int] = set()`

			`for item in catalog:`
			`start = max(0, min(int(item["page_start_index"]), num_pages - 1))`
			`end = max(start, min(int(item["page_end_index"]), num_pages - 1))`

			`chapter_parts = [f"# {item['title']}\n"]`
			`for idx in range(start, end + 1):`
			`if idx < len(page_mds) and page_mds[idx].strip() and idx not in used_pages:`
			`chapter_parts.append(page_mds[idx])`
			`used_pages.add(idx)`

			`if len(chapter_parts) > 1:`
			`parts.append("\n\n".join(chapter_parts))`

			`if parts:`
			`return "\n\n---\n\n".join(parts)`
			`return "\n\n---\n\n".join(m for m in page_mds if m.strip())`

			`@staticmethod`
			`def _rects_overlap(block_rect: fitz.Rect, table_rect: fitz.Rect) -> bool:`
			`inter = block_rect & table_rect`
			`if inter.is_empty:`
			`return False`
			`block_area = block_rect.width * block_rect.height`
			`if block_area <= 0:`
			`return False`
			`return (inter.width * inter.height) / block_area >= 0.3`
更新 2026-03-06 14:50:43 +08:00
			`@staticmethod`
			`def _cells_to_md_table(cells: list) -> str:`
			`if not cells:`
			`return ""`
更新 2026-03-15 13:00:30 +08:00
更新 2026-03-06 14:50:43 +08:00			`header = cells[0]`
			`ncols = len(header)`
			`if ncols == 0:`
			`return ""`
更新 2026-03-15 13:00:30 +08:00
			`def clean(value: Any) -> str:`
			`return str(value or "").replace("\|", "\\\|").replace("\n", " ").strip()`

更新 2026-03-06 14:50:43 +08:00			`lines = [`
更新 2026-03-15 13:00:30 +08:00			`"\| " + " \| ".join(clean(cell) for cell in header) + " \|",`
更新 2026-03-06 14:50:43 +08:00			`"\| " + " \| ".join("---" for _ in range(ncols)) + " \|",`
			`]`
			`for row in cells[1:]:`
			`padded = list(row) + [""] * max(0, ncols - len(row))`
更新 2026-03-15 13:00:30 +08:00			`lines.append("\| " + " \| ".join(clean(cell) for cell in padded[:ncols]) + " \|")`
更新 2026-03-06 14:50:43 +08:00			`return "\n".join(lines)`

			`@staticmethod`
			`def _to_int(value: Any, default: int \| None) -> int \| None:`
			`try:`
			`if value is None or value == "":`
			`return default`
			`return int(value)`
			`except Exception:`
			`return default`