import json
import logging
import re
from collections import Counter
from collections.abc import Generator
from typing import Any

import fitz
from dify_plugin import Tool
from dify_plugin.entities.tool import ToolInvokeMessage

logger = logging.getLogger(__name__)


class PdfToMarkdownTool(Tool):
    """Convert PDF to Markdown using an external catalog array.

    The catalog is a JSON array of objects. Each object carries a ``title``
    plus either ``page_start_index`` / ``page_end_index`` (used directly as
    zero-based page indexes) or ``start`` / ``end`` (treated as one-based page
    numbers and shifted down by one). Every catalog entry becomes a
    ``# title`` chapter made of the Markdown of the pages it covers.
    """

    # Header/footer detection: share of the page height, measured from the
    # top and from the bottom, in which repeated text is counted.
    _HF_DETECT_MARGIN_RATIO = 0.08
    # Only the first N pages are sampled when looking for repeated text.
    _HF_SAMPLE_PAGES = 30
    # A text must repeat on at least this many sampled pages ...
    _HF_MIN_REPEATS = 3
    # ... and on at least this share of the sampled pages.
    _HF_MIN_PAGE_SHARE = 0.35
    # Header/footer removal band while rendering a page.
    # NOTE(review): this band (6%) is narrower than the detection band (8%),
    # so text detected between 6% and 8% is never stripped from the text
    # flow - confirm that this asymmetry is intentional.
    _HF_STRIP_TOP_RATIO = 0.06
    _HF_STRIP_BOTTOM_RATIO = 0.94
    # Upper bound on the number of tables rendered per page.
    _MAX_TABLES_PER_PAGE = 5
    # A margin block consisting only of 1-4 digits is treated as a page number.
    _PAGE_NUMBER_RE = re.compile(r"\s*\d{1,4}\s*")

    def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
        """Run the conversion and yield the Markdown as text and as a blob.

        Args:
            tool_parameters: Must contain ``file`` (an object exposing the PDF
                bytes as ``.blob``) and ``catalog`` (JSON array text).

        Yields:
            A single ``Error: ...`` text message when the input is unusable;
            otherwise a text message with the Markdown followed by a blob
            message carrying the same content encoded as UTF-8.
        """
        file = tool_parameters.get("file")
        catalog_text = (tool_parameters.get("catalog") or "").strip()
        if not file:
            yield self.create_text_message("Error: file is required")
            return
        if not catalog_text:
            yield self.create_text_message("Error: catalog is required")
            return

        catalog = self._parse_catalog(catalog_text)
        if not catalog:
            yield self.create_text_message(
                "Error: catalog must be a JSON array with title and page indexes"
            )
            return

        # A corrupt, empty or non-PDF upload makes PyMuPDF raise; report it the
        # same way as the other input problems instead of crashing the tool.
        try:
            doc = fitz.open(stream=file.blob, filetype="pdf")
        except (RuntimeError, ValueError, TypeError) as exc:
            yield self.create_text_message(f"Error: failed to open PDF: {exc}")
            return

        try:
            num_pages = len(doc)
            hf_texts = self._detect_headers_footers(doc, num_pages)
            page_mds = [
                self._page_to_markdown(doc[index], hf_texts)
                for index in range(num_pages)
            ]
            final_md = self._assemble_by_catalog(catalog, page_mds, num_pages)
            yield self.create_text_message(final_md)
            yield self.create_blob_message(
                blob=final_md.encode("utf-8"),
                meta={"mime_type": "text/markdown"},
            )
        finally:
            doc.close()

    def _parse_catalog(self, catalog_text: str) -> list[dict[str, Any]]:
        """Parse and normalize the catalog JSON.

        Args:
            catalog_text: JSON text expected to decode to a list of objects.

        Returns:
            A list of ``{"title", "page_start_index", "page_end_index"}``
            dicts with ``0 <= page_start_index <= page_end_index``. Returns an
            empty list when the text is not valid JSON or is not a JSON array.
            Entries that are not objects, or that have neither a usable
            ``page_start_index`` nor a usable ``start``, are skipped.
        """
        try:
            raw = json.loads(catalog_text)
        except (ValueError, TypeError, RecursionError):
            return []
        if not isinstance(raw, list):
            return []

        result: list[dict[str, Any]] = []
        for item in raw:
            if not isinstance(item, dict):
                continue
            title = str(item.get("title") or "").strip() or "Untitled"

            start_index = self._to_int(item.get("page_start_index"), None)
            if start_index is not None:
                end_index = self._to_int(item.get("page_end_index"), start_index)
            else:
                # Fall back to "start"/"end", which are shifted down by one to
                # obtain zero-based indexes.
                start = self._to_int(item.get("start"), None)
                if start is None:
                    continue
                end = self._to_int(item.get("end"), start)
                start_index = start - 1
                end_index = end - 1

            # Clamp the start first so the end is compared against the clamped
            # value; otherwise a negative start could yield end < start.
            start_index = max(0, start_index)
            end_index = max(start_index, end_index)
            result.append(
                {
                    "title": title,
                    "page_start_index": start_index,
                    "page_end_index": end_index,
                }
            )
        return result

    def _detect_headers_footers(self, doc: fitz.Document, num_pages: int) -> set[str]:
        """Find text blocks that repeat in the top/bottom margin of many pages.

        Only the first ``min(num_pages, 30)`` pages are sampled. A text is
        counted at most once per page, must be at least two characters long,
        and must lie entirely within the top or bottom 8% of the page height.

        Args:
            doc: The opened PDF document.
            num_pages: Number of pages in ``doc``.

        Returns:
            The texts seen on at least ``max(3, 35% of sampled pages)`` pages.
        """
        sample_count = min(num_pages, self._HF_SAMPLE_PAGES)
        text_counts: Counter[str] = Counter()

        for idx in range(sample_count):
            page = doc[idx]
            page_height = page.rect.height
            top_limit = page_height * self._HF_DETECT_MARGIN_RATIO
            bottom_limit = page_height * (1 - self._HF_DETECT_MARGIN_RATIO)
            try:
                blocks = page.get_text("blocks", sort=True) or []
            except Exception:
                # Best effort: an unreadable page just does not contribute.
                logger.debug("Could not read text blocks of page %s", idx, exc_info=True)
                continue

            seen: set[str] = set()
            for block in blocks:
                # Per the PyMuPDF block tuple layout, item 6 is the block
                # type and 0 means text.
                if len(block) < 7 or block[6] != 0:
                    continue
                y0, y1 = block[1], block[3]
                text = (block[4] or "").strip()
                if not text or len(text) < 2 or text in seen:
                    continue
                if y1 <= top_limit or y0 >= bottom_limit:
                    seen.add(text)
                    text_counts[text] += 1

        threshold = max(self._HF_MIN_REPEATS, sample_count * self._HF_MIN_PAGE_SHARE)
        return {text for text, count in text_counts.items() if count >= threshold}

    def _extract_tables(
        self, page: fitz.Page, hf_texts: set[str]
    ) -> tuple[list[fitz.Rect], list[str]]:
        """Render the page's tables and collect the areas they replace.

        Returns:
            ``(rects, markdown_tables)``. ``rects`` holds the bounding boxes
            of tables that were rendered or identified as header/footer
            tables; text blocks overlapping them are dropped by the caller.
            Tables that were discarded for any other reason contribute no
            rectangle, so their text stays in the regular text flow.
        """
        find_tables = getattr(page, "find_tables", None)
        if not callable(find_tables):
            return [], []
        try:
            tables = getattr(find_tables(), "tables", None) or []
        except Exception:
            # Table detection is best effort; fall back to plain text.
            logger.debug("Table detection failed", exc_info=True)
            return [], []

        rects: list[fitz.Rect] = []
        table_mds: list[str] = []
        for table in tables[: self._MAX_TABLES_PER_PAGE]:
            md_table = ""
            is_header_footer = False
            try:
                cells = table.extract() or []
                if len(cells) >= 2:
                    # Small tables containing a known header/footer text are
                    # page furniture, not content.
                    if hf_texts and len(cells) <= 3:
                        flat = " ".join(str(cell or "") for row in cells for cell in row)
                        is_header_footer = any(hf in flat for hf in hf_texts)
                    if not is_header_footer:
                        md_table = self._cells_to_md_table(cells)
            except Exception:
                # One broken table must not discard the remaining ones.
                logger.debug("Skipping table that could not be extracted", exc_info=True)
                continue

            if md_table:
                table_mds.append(md_table)
            elif not is_header_footer:
                # Nothing is rendered for this table, so its area must not
                # suppress the underlying text blocks.
                continue

            try:
                rects.append(fitz.Rect(table.bbox))
            except Exception:
                logger.debug("Table has no usable bounding box", exc_info=True)
        return rects, table_mds

    def _page_to_markdown(self, page: fitz.Page, hf_texts: set[str]) -> str:
        """Convert one page to Markdown text.

        Text blocks are emitted in the order returned by
        ``page.get_text("blocks", sort=True)``; rendered tables are appended
        after the text of the page. Blocks overlapping a table area are
        dropped, as are margin blocks that contain a detected header/footer
        text or consist only of a 1-4 digit number.

        Args:
            page: The page to convert.
            hf_texts: Header/footer texts from ``_detect_headers_footers``.

        Returns:
            The page content with parts separated by blank lines.
        """
        page_height = page.rect.height
        top_margin = page_height * self._HF_STRIP_TOP_RATIO
        bottom_margin = page_height * self._HF_STRIP_BOTTOM_RATIO

        table_rects, table_mds = self._extract_tables(page, hf_texts)

        try:
            blocks = page.get_text("blocks", sort=True) or []
        except Exception:
            logger.debug("Could not read text blocks of page", exc_info=True)
            blocks = []

        parts: list[str] = []
        for block in blocks:
            # Per the PyMuPDF block tuple layout, item 6 is the block type
            # and 0 means text.
            if len(block) < 7 or block[6] != 0:
                continue
            x0, y0, x1, y1 = block[:4]
            text = (block[4] or "").strip()
            if not text:
                continue

            block_rect = fitz.Rect(x0, y0, x1, y1)
            if any(self._rects_overlap(block_rect, table_rect) for table_rect in table_rects):
                continue

            # NOTE(review): the page-number filter below only runs when
            # hf_texts is non-empty; confirm whether bare page numbers should
            # also be stripped when no repeated header/footer was detected.
            if hf_texts and (y1 <= top_margin or y0 >= bottom_margin):
                if any(hf in text for hf in hf_texts):
                    continue
                if self._PAGE_NUMBER_RE.fullmatch(text):
                    continue
            parts.append(text)

        parts.extend(table_mds)
        return "\n\n".join(parts)

    def _assemble_by_catalog(
        self, catalog: list[dict[str, Any]], page_mds: list[str], num_pages: int
    ) -> str:
        """Group the per-page Markdown into chapters defined by the catalog.

        Page ranges are clamped to the document. Each page is used by at most
        one chapter (the first catalog entry that covers it), and chapters
        without any non-blank page are omitted.

        Args:
            catalog: Normalized entries as produced by ``_parse_catalog``.
            page_mds: Markdown of every page, indexed by page.
            num_pages: Number of pages in the document.

        Returns:
            The chapters joined by ``---`` rules. If no chapter received any
            content, all non-blank pages joined by ``---`` rules instead.
        """
        separator = "\n\n---\n\n"
        last_page = num_pages - 1
        chapters: list[str] = []
        used_pages: set[int] = set()

        for item in catalog:
            start = max(0, min(int(item["page_start_index"]), last_page))
            end = max(start, min(int(item["page_end_index"]), last_page))
            chapter_parts = [f"# {item['title']}\n"]
            for idx in range(start, end + 1):
                if idx < len(page_mds) and page_mds[idx].strip() and idx not in used_pages:
                    chapter_parts.append(page_mds[idx])
                    used_pages.add(idx)
            # Index 0 is the heading; keep the chapter only if it has content.
            if len(chapter_parts) > 1:
                chapters.append("\n\n".join(chapter_parts))

        if chapters:
            return separator.join(chapters)
        return separator.join(md for md in page_mds if md.strip())

    @staticmethod
    def _rects_overlap(block_rect: fitz.Rect, table_rect: fitz.Rect) -> bool:
        """Return True if at least 30% of ``block_rect``'s area lies in ``table_rect``."""
        inter = block_rect & table_rect
        if inter.is_empty:
            return False
        block_area = block_rect.width * block_rect.height
        if block_area <= 0:
            return False
        return (inter.width * inter.height) / block_area >= 0.3

    @staticmethod
    def _cells_to_md_table(cells: list) -> str:
        """Render table rows as a Markdown pipe table.

        The first row is used as the header and fixes the column count;
        shorter rows are padded with empty cells and longer rows are
        truncated to that count.

        Args:
            cells: Rows of cell values; ``None`` cells render as empty.

        Returns:
            The Markdown table, or ``""`` when there are no rows or the first
            row has no columns.
        """
        if not cells:
            return ""
        header = cells[0]
        ncols = len(header)
        if ncols == 0:
            return ""

        def clean(value: Any) -> str:
            # Only None means "no content"; falsy values such as 0 are kept.
            text = "" if value is None else str(value)
            # Pipes would break the table syntax and newlines would break the row.
            return text.replace("|", "\\|").replace("\n", " ").strip()

        def render(row: list) -> str:
            return "| " + " | ".join(clean(cell) for cell in row) + " |"

        lines = [render(header), render(["---"] * ncols)]
        for row in cells[1:]:
            padded = list(row) + [""] * max(0, ncols - len(row))
            lines.append(render(padded[:ncols]))
        return "\n".join(lines)

    @staticmethod
    def _to_int(value: Any, default: int | None) -> int | None:
        """Convert ``value`` with ``int()``.

        Returns ``default`` when ``value`` is ``None``, an empty string, or
        cannot be converted.
        """
        if value is None or value == "":
            return default
        try:
            return int(value)
        except (TypeError, ValueError, OverflowError):
            return default