import json
import logging
import re
from collections.abc import Generator
from typing import Any

import fitz
from dify_plugin import Tool
from dify_plugin.entities.tool import ToolInvokeMessage


class PdfToMarkdownTool(Tool):
    """Convert a PDF to Markdown, grouping pages into chapters from an external catalog.

    The catalog is a JSON array of objects. Each object may carry:

    * ``title`` - chapter heading (defaults to ``"Untitled"``).
    * ``page_start_index`` / ``page_end_index`` - 0-based inclusive page range, or
    * ``start`` / ``end`` - 1-based inclusive page range, used only when
      ``page_start_index`` is absent or not an integer.
    """

    _logger = logging.getLogger(__name__)

    # A margin block consisting solely of a 1-4 digit number is treated as a page number.
    _PAGE_NUMBER_RE = re.compile(r"\s*\d{1,4}\s*")

    # Only the first few tables detected on a page are converted.
    _MAX_TABLES_PER_PAGE = 5

    def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
        """Run the conversion and yield the Markdown as a text and a blob message.

        Args:
            tool_parameters: Must contain ``file`` (an object exposing ``.blob``)
                and ``catalog`` (a JSON array string, or an already-decoded list).

        Yields:
            A single error text message when validation fails; otherwise the
            Markdown as a text message followed by a ``text/markdown`` blob.
        """
        file = tool_parameters.get("file")
        catalog_param = tool_parameters.get("catalog")
        # Only strings can be stripped; an already-decoded list is passed through
        # so that it reaches validation instead of raising AttributeError.
        if isinstance(catalog_param, str):
            catalog_param = catalog_param.strip()

        if not file:
            yield self.create_text_message("Error: file is required")
            return
        if not catalog_param:
            yield self.create_text_message("Error: catalog is required")
            return

        catalog = self._parse_catalog(catalog_param)
        if not catalog:
            yield self.create_text_message("Error: catalog must be a JSON array with title and page indexes")
            return

        doc = fitz.open(stream=file.blob, filetype="pdf")
        try:
            num_pages = len(doc)
            hf_texts = self._detect_headers_footers(doc, num_pages)
            page_mds = [self._page_to_markdown(doc[index], hf_texts) for index in range(num_pages)]
            final_md = self._assemble_by_catalog(catalog, page_mds, num_pages)

            yield self.create_text_message(final_md)
            yield self.create_blob_message(
                blob=final_md.encode("utf-8"),
                meta={"mime_type": "text/markdown"},
            )
        finally:
            doc.close()

    def _parse_catalog(self, catalog_text: str | list[Any]) -> list[dict[str, Any]]:
        """Normalise the catalog into entries with 0-based inclusive page ranges.

        Args:
            catalog_text: JSON text encoding an array, or an already-decoded list.

        Returns:
            A list of ``{"title", "page_start_index", "page_end_index"}`` dicts
            with ``0 <= page_start_index <= page_end_index``. Entries that are
            not objects or have no usable start page are dropped. An empty list
            is returned when the input is not a JSON array.
        """
        if isinstance(catalog_text, (str, bytes, bytearray)):
            try:
                raw = json.loads(catalog_text)
            except (TypeError, ValueError, RecursionError):
                return []
        else:
            raw = catalog_text

        if not isinstance(raw, list):
            return []

        result: list[dict[str, Any]] = []
        for item in raw:
            if not isinstance(item, dict):
                continue

            title = str(item.get("title") or "").strip() or "Untitled"
            start_index = self._to_int(item.get("page_start_index"), None)
            end_index = self._to_int(item.get("page_end_index"), start_index)

            if start_index is None:
                # Fall back to the 1-based "start"/"end" keys and convert to 0-based.
                start = self._to_int(item.get("start"), None)
                end = self._to_int(item.get("end"), start)
                if start is None:
                    continue
                start_index = max(0, start - 1)
                end_index = max(start_index, (end if end is not None else start) - 1)

            if end_index is None:
                end_index = start_index

            # Clamp the start first so the end can never be stored below it
            # (a negative start previously allowed a negative end through).
            start_index = max(0, start_index)
            result.append(
                {
                    "title": title,
                    "page_start_index": start_index,
                    "page_end_index": max(start_index, end_index),
                }
            )
        return result

    def _detect_headers_footers(self, doc: fitz.Document, num_pages: int) -> set[str]:
        """Find text blocks that repeat in the top/bottom margin across pages.

        Samples up to the first 30 pages. A text counts at most once per page
        and only when its block lies entirely within the top or bottom 8% of
        the page height.

        Args:
            doc: The opened document.
            num_pages: Number of pages in ``doc``.

        Returns:
            Texts seen on at least ``max(3, 35% of sampled pages)`` pages.
        """
        margin_ratio = 0.08
        sample_count = min(num_pages, 30)
        text_counts: dict[str, int] = {}

        for idx in range(sample_count):
            page = doc[idx]
            page_height = page.rect.height
            top_limit = page_height * margin_ratio
            bottom_limit = page_height * (1 - margin_ratio)
            try:
                blocks = page.get_text("blocks", sort=True) or []
            except Exception:
                # Best effort: an unreadable page just does not contribute samples.
                self._logger.warning("Could not read text blocks of page %d", idx, exc_info=True)
                continue

            seen: set[str] = set()
            for block in blocks:
                # Block tuples are (x0, y0, x1, y1, text, block_no, block_type);
                # only block_type 0 is processed here.
                if len(block) < 7 or block[6] != 0:
                    continue
                y0, y1 = block[1], block[3]
                text = (block[4] or "").strip()
                if not text or len(text) < 2 or text in seen:
                    continue
                if y1 <= top_limit or y0 >= bottom_limit:
                    seen.add(text)
                    text_counts[text] = text_counts.get(text, 0) + 1

        threshold = max(3, sample_count * 0.35)
        return {text for text, count in text_counts.items() if count >= threshold}

    def _extract_tables(self, page: fitz.Page, hf_texts: set[str]) -> tuple[list[fitz.Rect], list[str]]:
        """Detect tables on ``page`` and render them as Markdown.

        Args:
            page: The page to scan.
            hf_texts: Known header/footer texts; small tables containing one of
                them are suppressed rather than rendered.

        Returns:
            ``(rects, tables_md)``. ``rects`` are the regions whose plain-text
            blocks must be skipped (rendered tables and suppressed
            header/footer tables); ``tables_md`` are the rendered tables.
        """
        find_tables = getattr(page, "find_tables", None)
        if not callable(find_tables):
            # The page object offers no table finder; treat the page as plain text.
            return [], []

        try:
            tables = getattr(find_tables(), "tables", []) or []
        except Exception:
            self._logger.warning("Table detection failed; page is treated as plain text", exc_info=True)
            return [], []

        rects: list[fitz.Rect] = []
        tables_md: list[str] = []
        for table in tables[: self._MAX_TABLES_PER_PAGE]:
            # Each table is handled on its own so one failure does not discard
            # the others, and a rect is registered only once the table's fate
            # is known. Otherwise the overlapping text would be dropped with
            # nothing emitted in its place.
            try:
                rect = fitz.Rect(table.bbox)
                cells = table.extract() or []
                if len(cells) < 2:
                    continue
                if hf_texts and len(cells) <= 3:
                    flat = " ".join(str(cell or "") for row in cells for cell in row)
                    if any(hf in flat for hf in hf_texts):
                        # Header/footer rendered as a table: hide its text too.
                        rects.append(rect)
                        continue
                md_table = self._cells_to_md_table(cells)
            except Exception:
                self._logger.warning("Skipping a table that could not be extracted", exc_info=True)
                continue

            if md_table:
                rects.append(rect)
                tables_md.append(md_table)
        return rects, tables_md

    def _page_to_markdown(self, page: fitz.Page, hf_texts: set[str]) -> str:
        """Render one page as Markdown: text blocks first, then its tables.

        Text blocks are skipped when they overlap a table region, or when they
        lie entirely within the top/bottom 6% of the page and either contain a
        known header/footer text or consist only of a 1-4 digit number.

        Args:
            page: The page to render.
            hf_texts: Header/footer texts from ``_detect_headers_footers``.

        Returns:
            The page's parts joined by blank lines; may be empty.
        """
        page_height = page.rect.height
        top_margin = page_height * 0.06
        bottom_margin = page_height * 0.94

        table_rects, table_mds = self._extract_tables(page, hf_texts)

        try:
            blocks = page.get_text("blocks", sort=True) or []
        except Exception:
            self._logger.warning("Could not read text blocks; page text is omitted", exc_info=True)
            blocks = []

        parts: list[str] = []
        for block in blocks:
            # Block tuples are (x0, y0, x1, y1, text, block_no, block_type).
            if len(block) < 7 or block[6] != 0:
                continue
            x0, y0, x1, y1 = block[:4]
            text = (block[4] or "").strip()
            if not text:
                continue

            block_rect = fitz.Rect(x0, y0, x1, y1)
            if any(self._rects_overlap(block_rect, table_rect) for table_rect in table_rects):
                continue

            if y1 <= top_margin or y0 >= bottom_margin:
                if any(hf in text for hf in hf_texts):
                    continue
                # Page numbers differ per page, so they never appear in
                # hf_texts; strip them even when no repeated header was found.
                if self._PAGE_NUMBER_RE.fullmatch(text):
                    continue

            parts.append(text)

        parts.extend(table_mds)
        return "\n\n".join(parts)

    def _assemble_by_catalog(self, catalog: list[dict[str, Any]], page_mds: list[str], num_pages: int) -> str:
        """Group per-page Markdown into chapters as described by ``catalog``.

        Each page is emitted at most once, under the first catalog entry that
        covers it. Entries that contribute no page are omitted.

        Args:
            catalog: Entries as produced by ``_parse_catalog``.
            page_mds: Markdown of every page, indexed by 0-based page number.
            num_pages: Page count used to clamp the catalog ranges.

        Returns:
            Chapters separated by ``---`` rules. When no entry yields any
            content, all non-empty pages are returned instead, separated the
            same way.
        """
        parts: list[str] = []
        used_pages: set[int] = set()

        for item in catalog:
            start = max(0, min(int(item["page_start_index"]), num_pages - 1))
            end = max(start, min(int(item["page_end_index"]), num_pages - 1))

            chapter_parts = [f"# {item['title']}\n"]
            for idx in range(start, end + 1):
                if idx < len(page_mds) and page_mds[idx].strip() and idx not in used_pages:
                    chapter_parts.append(page_mds[idx])
                    used_pages.add(idx)

            # Only the heading present means no page content for this entry.
            if len(chapter_parts) > 1:
                parts.append("\n\n".join(chapter_parts))

        if parts:
            return "\n\n---\n\n".join(parts)
        return "\n\n---\n\n".join(m for m in page_mds if m.strip())

    @staticmethod
    def _rects_overlap(block_rect: fitz.Rect, table_rect: fitz.Rect) -> bool:
        """Return True when at least 30% of ``block_rect``'s area lies inside ``table_rect``."""
        inter = block_rect & table_rect
        if inter.is_empty:
            return False
        block_area = block_rect.width * block_rect.height
        if block_area <= 0:
            return False
        return (inter.width * inter.height) / block_area >= 0.3

    @staticmethod
    def _cells_to_md_table(cells: list) -> str:
        """Render rows of cells as a Markdown table, using the first row as header.

        Rows shorter than the header are padded with empty cells and longer
        rows are truncated. Pipes are escaped and newlines become spaces.

        Args:
            cells: Rows of cell values; falsy cells render as empty.

        Returns:
            The Markdown table, or ``""`` when there are no rows or the header
            row has no columns.
        """
        if not cells:
            return ""

        header = cells[0]
        ncols = len(header)
        if ncols == 0:
            return ""

        def clean(value: Any) -> str:
            return str(value or "").replace("|", "\\|").replace("\n", " ").strip()

        lines = [
            "| " + " | ".join(clean(cell) for cell in header) + " |",
            "| " + " | ".join("---" for _ in range(ncols)) + " |",
        ]
        for row in cells[1:]:
            padded = list(row) + [""] * max(0, ncols - len(row))
            lines.append("| " + " | ".join(clean(cell) for cell in padded[:ncols]) + " |")
        return "\n".join(lines)

    @staticmethod
    def _to_int(value: Any, default: int | None) -> int | None:
        """Convert ``value`` with ``int()``, returning ``default`` when it is None, "" or not convertible."""
        if value is None or value == "":
            return default
        try:
            return int(value)
        except (TypeError, ValueError, OverflowError):
            # OverflowError covers int(float("inf")), which json.loads can produce.
            return default
|