Files
urbanLifeline/difyPlugin/pdf/tools/pdf_to_markdown.py

235 lines
8.1 KiB
Python
Raw Normal View History

2026-03-15 13:00:30 +08:00
import json
2026-03-06 14:50:43 +08:00
import re
from collections.abc import Generator
from typing import Any
import fitz
from dify_plugin import Tool
from dify_plugin.entities.tool import ToolInvokeMessage
class PdfToMarkdownTool(Tool):
    """Convert a PDF to Markdown, grouping pages into chapters from an external catalog array."""

    # A text block made up solely of a 1-4 digit number; such blocks are dropped
    # from the page output (they are treated as page numbers).
    _PAGE_NUMBER_RE = re.compile(r"\s*\d{1,4}\s*")

    def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
        """Run the tool.

        Reads ``file`` (an object exposing the PDF bytes as ``.blob``) and
        ``catalog`` (a JSON array string) from ``tool_parameters``.

        Yields a single text message starting with ``"Error:"`` when an input is
        missing or the catalog is unusable. Otherwise yields the assembled
        Markdown as a text message followed by the same content as a UTF-8
        ``text/markdown`` blob message.
        """
        file = tool_parameters.get("file")
        catalog_text = (tool_parameters.get("catalog") or "").strip()

        if not file:
            yield self.create_text_message("Error: file is required")
            return
        if not catalog_text:
            yield self.create_text_message("Error: catalog is required")
            return

        catalog = self._parse_catalog(catalog_text)
        if not catalog:
            yield self.create_text_message("Error: catalog must be a JSON array with title and page indexes")
            return

        doc = fitz.open(stream=file.blob, filetype="pdf")
        try:
            num_pages = len(doc)
            hf_texts = self._detect_headers_footers(doc, num_pages)
            page_mds = [self._page_to_markdown(doc[index], hf_texts) for index in range(num_pages)]
            final_md = self._assemble_by_catalog(catalog, page_mds, num_pages)

            yield self.create_text_message(final_md)
            yield self.create_blob_message(
                blob=final_md.encode("utf-8"),
                meta={"mime_type": "text/markdown"},
            )
        finally:
            # Always release the document, even if the consumer stops iterating early.
            doc.close()

    def _parse_catalog(self, catalog_text: str) -> list[dict[str, Any]]:
        """Parse the catalog JSON into normalized chapter entries.

        Each usable entry is a JSON object carrying either
        ``page_start_index`` / ``page_end_index`` (used as-is, i.e. 0-based) or
        ``start`` / ``end`` (decremented by one, i.e. treated as 1-based page
        numbers). A missing end defaults to the start. A missing or blank title
        becomes ``"Untitled"``. Non-object items and items without a usable
        start are skipped.

        Returns:
            A list of ``{"title", "page_start_index", "page_end_index"}`` dicts
            with ``0 <= page_start_index <= page_end_index``; an empty list when
            the text is not valid JSON or not a JSON array.
        """
        try:
            raw = json.loads(catalog_text)
        except (TypeError, ValueError, RecursionError):
            # json.JSONDecodeError is a ValueError subclass.
            return []

        if not isinstance(raw, list):
            return []

        result: list[dict[str, Any]] = []
        for item in raw:
            if not isinstance(item, dict):
                continue

            title = str(item.get("title") or "").strip() or "Untitled"

            start_index = self._to_int(item.get("page_start_index"), None)
            if start_index is not None:
                end_index = self._to_int(item.get("page_end_index"), start_index)
            else:
                # Fall back to the 1-based "start"/"end" form.
                start = self._to_int(item.get("start"), None)
                if start is None:
                    continue
                end = self._to_int(item.get("end"), start)
                start_index = start - 1
                end_index = end - 1

            # Clamp the start first so the end is compared against the clamped
            # value; otherwise a negative start could yield end < start.
            start_index = max(0, start_index)
            result.append(
                {
                    "title": title,
                    "page_start_index": start_index,
                    "page_end_index": max(start_index, end_index),
                }
            )
        return result

    def _detect_headers_footers(self, doc: fitz.Document, num_pages: int) -> set[str]:
        """Collect text that repeats in the top/bottom margins of the sampled pages.

        Looks at up to the first 30 pages. On each page, text blocks lying
        entirely within the top or bottom 8% of the page height are counted at
        most once per page. A text is returned when it occurs on at least
        ``max(3, 35% of the sampled pages)`` pages.
        """
        margin_ratio = 0.08
        sample_count = min(num_pages, 30)
        text_counts: dict[str, int] = {}

        for idx in range(sample_count):
            page = doc[idx]
            page_height = page.rect.height
            top_limit = page_height * margin_ratio
            bottom_limit = page_height * (1 - margin_ratio)
            try:
                blocks = page.get_text("blocks", sort=True) or []
            except Exception:
                # Best effort: a page whose text cannot be extracted is simply not sampled.
                continue

            seen: set[str] = set()
            for block in blocks:
                # Block tuples are (x0, y0, x1, y1, text, block_no, block_type);
                # only block_type 0 (text) is considered.
                if len(block) < 7 or block[6] != 0:
                    continue
                y0, y1 = block[1], block[3]
                text = (block[4] or "").strip()
                if not text or len(text) < 2 or text in seen:
                    continue
                if y1 <= top_limit or y0 >= bottom_limit:
                    seen.add(text)
                    text_counts[text] = text_counts.get(text, 0) + 1

        threshold = max(3, sample_count * 0.35)
        return {text for text, count in text_counts.items() if count >= threshold}

    def _extract_tables(self, page: fitz.Page, hf_texts: set[str]) -> tuple[list[fitz.Rect], list[str]]:
        """Extract up to five tables from ``page``.

        Returns:
            ``(masked_rects, table_mds)``: the rectangles whose overlapping text
            blocks must be left out of the page text, and the Markdown of the
            rendered tables in detection order.

        A rectangle is masked only when its table was rendered, or when the
        table was deliberately suppressed because it is small (at most 3 rows)
        and contains a known header/footer text. Tables that are skipped for any
        other reason (fewer than 2 rows, empty rendering, extraction failure)
        are not masked, so their content still reaches the output as plain text.
        """
        masked_rects: list[fitz.Rect] = []
        table_mds: list[str] = []

        # If the page object exposes no callable find_tables, no tables are extracted.
        find_tables = getattr(page, "find_tables", None)
        if not callable(find_tables):
            return masked_rects, table_mds
        try:
            tables = getattr(find_tables(), "tables", []) or []
        except Exception:
            # Table detection is best effort; fall back to plain text only.
            return masked_rects, table_mds

        for table in tables[:5]:
            md_table = ""
            try:
                cells = table.extract() or []
                if len(cells) < 2:
                    continue
                suppressed = False
                if hf_texts and len(cells) <= 3:
                    flat = " ".join(str(cell or "") for row in cells for cell in row)
                    suppressed = any(hf in flat for hf in hf_texts)
                if not suppressed:
                    md_table = self._cells_to_md_table(cells)
                    if not md_table:
                        continue
            except Exception:
                # One malformed table must not discard the remaining tables.
                continue

            if md_table:
                table_mds.append(md_table)
            try:
                masked_rects.append(fitz.Rect(table.bbox))
            except Exception:
                # Without a usable bbox the table is still emitted; its text is just not masked.
                pass

        return masked_rects, table_mds

    def _page_to_markdown(self, page: fitz.Page, hf_texts: set[str]) -> str:
        """Render one page as Markdown.

        Text blocks are emitted in the order returned by the sorted block
        extraction, followed by the page's tables. Skipped blocks:

        * non-text blocks;
        * blocks whose area is at least 30% covered by a masked table rectangle;
        * blocks within the top/bottom 6% of the page that contain a known
          header/footer text;
        * blocks consisting solely of a 1-4 digit number.

        Returns:
            The page content with parts separated by blank lines (may be empty).
        """
        page_height = page.rect.height
        top_margin = page_height * 0.06
        bottom_margin = page_height * 0.94

        table_rects, table_mds = self._extract_tables(page, hf_texts)

        try:
            blocks = page.get_text("blocks", sort=True) or []
        except Exception:
            # Best effort: a page without extractable text still yields its tables.
            blocks = []

        parts: list[str] = []
        for block in blocks:
            # Block tuples are (x0, y0, x1, y1, text, block_no, block_type);
            # only block_type 0 (text) is kept.
            if len(block) < 7 or block[6] != 0:
                continue
            x0, y0, x1, y1 = block[:4]
            text = (block[4] or "").strip()
            if not text:
                continue

            block_rect = fitz.Rect(x0, y0, x1, y1)
            if any(self._rects_overlap(block_rect, table_rect) for table_rect in table_rects):
                continue
            if hf_texts and (y1 <= top_margin or y0 >= bottom_margin):
                if any(hf in text for hf in hf_texts):
                    continue
            if self._PAGE_NUMBER_RE.fullmatch(text):
                continue

            parts.append(text)

        parts.extend(table_mds)
        return "\n\n".join(parts)

    def _assemble_by_catalog(self, catalog: list[dict[str, Any]], page_mds: list[str], num_pages: int) -> str:
        """Group the per-page Markdown into chapters described by ``catalog``.

        Each entry's page range is clamped to ``[0, num_pages - 1]``. A page is
        emitted at most once: the first entry that covers it wins. Entries that
        contribute no non-empty page are omitted. Chapters are separated by a
        horizontal rule.

        Returns:
            The chapter-structured Markdown, or, when no entry produced any
            content, all non-empty pages joined by horizontal rules.
        """
        parts: list[str] = []
        used_pages: set[int] = set()

        for item in catalog:
            start = max(0, min(int(item["page_start_index"]), num_pages - 1))
            end = max(start, min(int(item["page_end_index"]), num_pages - 1))

            chapter_parts = [f"# {item['title']}\n"]
            for idx in range(start, end + 1):
                if idx < len(page_mds) and page_mds[idx].strip() and idx not in used_pages:
                    chapter_parts.append(page_mds[idx])
                    used_pages.add(idx)

            # Only the heading present means the chapter has no content.
            if len(chapter_parts) > 1:
                parts.append("\n\n".join(chapter_parts))

        if parts:
            return "\n\n---\n\n".join(parts)
        return "\n\n---\n\n".join(m for m in page_mds if m.strip())

    @staticmethod
    def _rects_overlap(block_rect: fitz.Rect, table_rect: fitz.Rect) -> bool:
        """Return True when the intersection with ``table_rect`` covers at least 30% of ``block_rect``'s area."""
        inter = block_rect & table_rect
        if inter.is_empty:
            return False
        block_area = block_rect.width * block_rect.height
        if block_area <= 0:
            return False
        return (inter.width * inter.height) / block_area >= 0.3

    @staticmethod
    def _cells_to_md_table(cells: list) -> str:
        """Render a list of rows as a Markdown table, using the first row as the header.

        Rows shorter than the header are padded with empty cells and longer rows
        are truncated. ``None`` cells render as empty; pipes are escaped and
        newlines are replaced by spaces.

        Returns:
            The Markdown table, or ``""`` when there are no rows or the header
            row is empty.
        """
        if not cells:
            return ""

        header = cells[0]
        ncols = len(header)
        if ncols == 0:
            return ""

        def clean(value: Any) -> str:
            # Only None means "no value"; falsy values such as 0 must be kept.
            text = "" if value is None else str(value)
            return text.replace("|", "\\|").replace("\n", " ").strip()

        lines = [
            "| " + " | ".join(clean(cell) for cell in header) + " |",
            "| " + " | ".join("---" for _ in range(ncols)) + " |",
        ]
        for row in cells[1:]:
            padded = list(row) + [""] * max(0, ncols - len(row))
            lines.append("| " + " | ".join(clean(cell) for cell in padded[:ncols]) + " |")
        return "\n".join(lines)

    @staticmethod
    def _to_int(value: Any, default: int | None) -> int | None:
        """Return ``int(value)``, or ``default`` when ``value`` is ``None``, ``""`` or not convertible."""
        try:
            if value is None or value == "":
                return default
            return int(value)
        except (TypeError, ValueError, OverflowError):
            # OverflowError covers int(float("inf")), which JSON "Infinity" can produce.
            return default