This commit is contained in:
2026-03-06 14:50:43 +08:00
parent 843146cdd7
commit 91ff28bdcf
18 changed files with 1316 additions and 100 deletions

2
.gitignore vendored
View File

@@ -5,3 +5,5 @@
.trae
**/*.difypkg
urbanLifeServ/*
*/.data

27
.vscode/launch.json vendored
View File

@@ -1,27 +0,0 @@
{
"version": "0.2.0",
"configurations": [
{
"name": "Python: FastAPI Server",
"type": "python",
"request": "launch",
"program": "${workspaceFolder}/difyPlugin/main.py",
"console": "integratedTerminal",
"justMyCode": true,
"env": {
"PYTHONUNBUFFERED": "1"
},
"cwd": "${workspaceFolder}/difyPlugin",
"args": []
},
{
"name": "Python: Debug Plugin",
"type": "python",
"request": "launch",
"program": "${workspaceFolder}/difyPlugin/app/plugins/pdf/__init__.py",
"console": "integratedTerminal",
"justMyCode": true,
"cwd": "${workspaceFolder}/difyPlugin"
}
]
}

Submodule ai-management-platform updated: 199d8180a6...96f7c3aa4c

View File

@@ -19,6 +19,9 @@ resource:
permission:
tool:
enabled: true
model:
enabled: true
llm: true
plugins:
tools:
- provider/pdf.yaml

View File

@@ -56,8 +56,12 @@ identity:
# en_US: "Access Token"
tools:
- tools/pdf.yaml
- tools/pdf_column_range.yaml
- tools/pdf_single_page.yaml
- tools/pdf_summary.yaml
- tools/pdf_toc.yaml
- tools/pdf_extract_range.yaml
- tools/pdf_to_markdown.yaml
extra:
python:
source: provider/pdf.py

View File

@@ -1,2 +1,2 @@
dify_plugin>=0.4.0,<0.7.0
PyPDF2>=3.0.1
pymupdf>=1.27.1

View File

@@ -1,61 +0,0 @@
import re
from collections.abc import Generator
from io import BytesIO
from typing import Any
import PyPDF2
from dify_plugin import Tool
from dify_plugin.entities.tool import ToolInvokeMessage
class PdfTool(Tool):
    """Locate table-of-contents pages in a PDF (legacy PyPDF2 implementation).

    Scans every page for a TOC heading and yields one JSON message:
      - "start"/"end": 0-based indices of the first/last TOC page, or None
      - "pages": extracted text of each TOC page (empty when no TOC found)
    """

    def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
        file = tool_parameters.get("file")
        if not file:
            yield self.create_text_message("Error: file is required")
            return
        # file.blob returns bytes
        pdf_bytes = file.blob
        reader = PyPDF2.PdfReader(BytesIO(pdf_bytes))
        num_pages = len(reader.pages)
        toc_start = None
        toc_end = None
        # Headings that mark a table-of-contents page (Chinese and English).
        toc_patterns = [
            r'目录',
            r'Table of Contents',
            r'Contents',
            r'目次'
        ]
        # A contiguous run of matching pages is treated as the TOC; the first
        # non-matching page after the run ends the scan.
        for page_num in range(num_pages):
            page = reader.pages[page_num]
            text = page.extract_text() or ""
            if any(re.search(pattern, text, re.IGNORECASE) for pattern in toc_patterns):
                if toc_start is None:
                    toc_start = page_num
                toc_end = page_num
            elif toc_start is not None and toc_end is not None:
                break
        if toc_start is None:
            # No TOC detected: return an explicit empty result.
            yield self.create_json_message({
                "start": None,
                "end": None,
                "pages": []
            })
            return
        # Re-extract text for just the TOC pages (inclusive range).
        toc_pages = []
        for page_num in range(toc_start, toc_end + 1):
            page = reader.pages[page_num]
            toc_pages.append(page.extract_text() or "")
        yield self.create_json_message({
            "start": toc_start,
            "end": toc_end,
            "pages": toc_pages
        })

View File

@@ -0,0 +1,107 @@
import json
import re
from collections.abc import Generator
from io import BytesIO
from typing import Any
import fitz # PyMuPDF 核心库
from dify_plugin import Tool
from dify_plugin.entities.tool import ToolInvokeMessage
class PdfTool(Tool):
    """Locate table-of-contents pages in a PDF using PyMuPDF.

    Scans pages for TOC headings (Chinese/English) and emits the result twice,
    as a JSON message and as a JSON-encoded text message:
      - "start"/"end": 0-based indices of the first/last TOC page, or None
      - "pages": extracted text of each TOC page
      - "pages_text": the same texts joined with newlines ("" when no TOC)
    """

    def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
        file = tool_parameters.get("file")
        if not file:
            yield self.create_text_message("Error: file is required")
            return
        # Open the PDF from the uploaded bytes (PyMuPDF replaces PyPDF2 here).
        pdf_bytes = file.blob
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        try:
            num_pages = len(doc)
            toc_start = None
            toc_end = None
            # Headings that mark a TOC page (includes full-width-space variants).
            toc_patterns = [
                r'目录',
                r'目 录',
                r'\u3000录',
                r'Table of Contents',
                r'Contents',
                r'目次'
            ]
            # A contiguous run of matching pages is the TOC; the first
            # non-matching page after the run ends the scan.
            for page_num in range(num_pages):
                page = doc[page_num]
                text = page.get_text() or ""  # PyMuPDF text extraction
                if any(re.search(pattern, text, re.IGNORECASE) for pattern in toc_patterns):
                    if toc_start is None:
                        toc_start = page_num
                    toc_end = page_num
                elif toc_start is not None and toc_end is not None:
                    break
            # Extract text of the detected TOC pages (inclusive range).
            toc_pages = []
            if toc_start is not None and toc_end is not None:
                for page_num in range(toc_start, toc_end + 1):
                    page = doc[page_num]
                    toc_pages.append(page.get_text() or "")
        finally:
            # Fix: close the document even if page access/extraction raises;
            # the original only closed on the success path.
            doc.close()
        result = {
            "start": toc_start,
            "end": toc_end,
            "pages": toc_pages,
            "pages_text": "\n".join(toc_pages) if toc_pages else "",
        }
        yield self.create_text_message(json.dumps(result, ensure_ascii=False))
        yield self.create_json_message(result)
if __name__ == "__main__":
    # Manual smoke test against a local PDF (PyMuPDF version).
    pdf_path = r"F:\Project\urbanLifeline\docs\AI训练资料\菱重S12R发动机说明书.pdf"
    doc = fitz.open(pdf_path)  # local file opened directly
    try:
        num_pages = len(doc)
        toc_start = None
        toc_end = None
        toc_patterns = [
            r'目录',
            r'目 录',
            r'\u3000录',
            r'Table of Contents',
            r'Contents',
            r'目次'
        ]
        # Walk pages to find the contiguous TOC run.
        for page_num in range(num_pages):
            page = doc[page_num]
            text = page.get_text() or ""
            if any(re.search(pattern, text, re.IGNORECASE) for pattern in toc_patterns):
                if toc_start is None:
                    toc_start = page_num
                toc_end = page_num
            elif toc_start is not None and toc_end is not None:
                break
        # Extract TOC page text; fall back to a hard-coded range when detection fails.
        toc_pages = []
        toc_start = toc_start if toc_start is not None else 18
        toc_end = toc_end if toc_end is not None else toc_start + 9
        # Fix: inclusive end (+ 1) to match the tool's behavior — the original
        # `range(toc_start, toc_end)` silently dropped the last TOC page.
        for page_num in range(toc_start, toc_end + 1):
            page = doc[page_num]
            toc_pages.append(page.get_text() or "")
        print(toc_start, toc_end, toc_pages)
    finally:
        doc.close()  # release the document even if extraction raises

View File

@@ -33,4 +33,4 @@ parameters:
- "pdf"
extra:
python:
source: tools/pdf.py
source: tools/pdf_column_range.py

View File

@@ -0,0 +1,48 @@
import json
from collections.abc import Generator
from typing import Any
import fitz # PyMuPDF
from dify_plugin import Tool
from dify_plugin.entities.tool import ToolInvokeMessage
class PdfExtractRangeTool(Tool):
    """Extract plain text from an inclusive, 0-based page range of a PDF.

    Yields the result twice — as a JSON-encoded text message and as a JSON
    message — with keys: "start", "end" (clamped indices actually used),
    "total_pages", and "text" (pages joined by a page-break separator).
    """

    def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
        file = tool_parameters.get("file")
        if not file:
            yield self.create_text_message("Error: file is required")
            return
        start_page = int(tool_parameters.get("start_page", 0))
        end_page = int(tool_parameters.get("end_page", 0))
        # Open the PDF from the uploaded bytes.
        pdf_bytes = file.blob
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        try:
            num_pages = len(doc)
            # Clamp both bounds into [0, num_pages - 1]; end never precedes start.
            start_page = max(0, min(start_page, num_pages - 1))
            end_page = max(start_page, min(end_page, num_pages - 1))
            # Extract each page's text; sort=True yields natural reading order.
            page_texts = []
            for page_idx in range(start_page, end_page + 1):
                page = doc[page_idx]
                text = page.get_text("text", sort=True) or ""
                page_texts.append(text)
        finally:
            # Fix: close the document even if extraction raises; the original
            # only closed on the success path.
            doc.close()
        # Join all pages with an explicit page-break marker.
        full_text = "\n\n--- 分页 ---\n\n".join(page_texts)
        result = {
            "start": start_page,
            "end": end_page,
            "total_pages": end_page - start_page + 1,
            "text": full_text,
        }
        yield self.create_text_message(json.dumps(result, ensure_ascii=False))
        yield self.create_json_message(result)

View File

@@ -0,0 +1,68 @@
identity:
name: "pdf_extract_range"
author: "yslg"
label:
en_US: "Extract Page Range Text"
zh_Hans: "提取页面范围文本"
pt_BR: "Extrair Texto do Intervalo de Páginas"
ja_JP: "ページ範囲テキスト抽出"
description:
human:
en_US: "Extract plain text from a specified page range of a PDF file"
zh_Hans: "从PDF文件的指定页码范围提取纯文本"
pt_BR: "Extrair texto simples de um intervalo de páginas especificado de um arquivo PDF"
ja_JP: "PDFファイルの指定ページ範囲からプレーンテキストを抽出"
llm: "Extract plain text from PDF pages in the given start-end range. Returns concatenated text of all pages in range."
parameters:
- name: file
type: file
required: true
label:
en_US: PDF File
zh_Hans: PDF 文件
pt_BR: Arquivo PDF
ja_JP: PDFファイル
human_description:
en_US: "PDF file to extract text from"
zh_Hans: "要提取文本的 PDF 文件"
pt_BR: "Arquivo PDF para extrair texto"
ja_JP: "テキストを抽出するPDFファイル"
llm_description: "PDF file to extract page range text from"
form: llm
fileTypes:
- "pdf"
- name: start_page
type: number
required: true
label:
en_US: Start Page
zh_Hans: 起始页码
pt_BR: Página Inicial
ja_JP: 開始ページ
human_description:
en_US: "Start page index (0-based)"
zh_Hans: "起始页码从0开始"
pt_BR: "Índice da página inicial (base 0)"
ja_JP: "開始ページ番号0始まり"
llm_description: "Start page index (0-based)"
form: llm
default: 0
- name: end_page
type: number
required: true
label:
en_US: End Page
zh_Hans: 结束页码
pt_BR: Página Final
ja_JP: 終了ページ
human_description:
en_US: "End page index (0-based, inclusive)"
zh_Hans: "结束页码从0开始包含该页"
pt_BR: "Índice da página final (base 0, inclusivo)"
ja_JP: "終了ページ番号0始まり、含む"
llm_description: "End page index (0-based, inclusive)"
form: llm
default: 0
extra:
python:
source: tools/pdf_extract_range.py

View File

@@ -1,8 +1,9 @@
import json
from collections.abc import Generator
from io import BytesIO
from typing import Any
import PyPDF2
import fitz # PyMuPDF 核心库
from dify_plugin import Tool
from dify_plugin.entities.tool import ToolInvokeMessage
@@ -16,21 +17,29 @@ class PdfSinglePageTool(Tool):
yield self.create_text_message("Error: file is required")
return
# 从字节流加载 PDF替换 PyPDF2 的 PdfReader
pdf_bytes = file.blob
reader = PyPDF2.PdfReader(BytesIO(pdf_bytes))
num_pages = len(reader.pages)
doc = fitz.open(stream=pdf_bytes, filetype="pdf") # 字节流方式打开
num_pages = len(doc)
# 页码边界处理(逻辑与原代码一致)
page_index = int(page)
if page_index < 0:
page_index = 0
if page_index >= num_pages:
page_index = num_pages - 1
selected_page = reader.pages[page_index]
text = selected_page.extract_text() or ""
# 提取指定页面文本PyMuPDF 方式)
selected_page = doc[page_index]
text = selected_page.get_text() or "" # get_text() 提取文本,比 PyPDF2 更精准
yield self.create_json_message({
# 关闭文档释放资源
doc.close()
result = {
"start": page_index,
"end": page_index,
"pages": [text]
})
}
yield self.create_text_message(json.dumps(result, ensure_ascii=False))
yield self.create_json_message(result)

View File

@@ -0,0 +1,209 @@
import json
import re
from collections.abc import Generator
from typing import Any
import fitz
from dify_plugin import Tool
from dify_plugin.entities.model.llm import LLMModelConfig
from dify_plugin.entities.model.message import SystemPromptMessage, UserPromptMessage
from dify_plugin.entities.tool import ToolInvokeMessage
class PdfSummaryTool(Tool):
    """Fast PDF page summary tool.

    Default behavior is optimized for throughput in large workflows:
    - Extract plain text and lightweight table data only.
    - Skip expensive image base64 and drawing path extraction.
    - Skip LLM by default unless `use_llm=true` is explicitly passed.
    """

    def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
        """Extract a page range, emit structured page data, then a text summary.

        Yields a JSON message with per-page data, followed by a text message
        containing either the deterministic local Markdown summary or (when
        `use_llm=true` and a model is configured) its LLM refinement.
        """
        file = tool_parameters.get("file")
        if not file:
            yield self.create_text_message("Error: file is required")
            return
        start_page = self._to_int(tool_parameters.get("pdf_start_page"), 0)
        end_page = self._to_int(tool_parameters.get("pdf_end_page"), 0)
        model_config = tool_parameters.get("model")
        use_llm = self._to_bool(tool_parameters.get("use_llm"), False)
        # Per-page text cap, clamped to [800, 20000] characters.
        max_chars_per_page = self._to_int(tool_parameters.get("max_chars_per_page"), 6000)
        max_chars_per_page = max(800, min(max_chars_per_page, 20000))
        llm_prompt = tool_parameters.get(
            "llm_prompt",
            "请基于输入的PDF页面文本做简洁准确摘要输出中文要点。不要输出思考过程。",
        )
        pdf_bytes = file.blob
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        try:
            num_pages = len(doc)
            # Clamp the requested range into [0, num_pages - 1], end >= start.
            start_page = max(0, min(start_page, num_pages - 1))
            end_page = max(start_page, min(end_page, num_pages - 1))
            pages_data: list[dict[str, Any]] = []
            for page_idx in range(start_page, end_page + 1):
                page = doc[page_idx]
                page_data = self._extract_page_fast(page, page_idx, max_chars_per_page)
                pages_data.append(page_data)
            result = {
                "total_pages_extracted": len(pages_data),
                "page_range": {"start": start_page, "end": end_page},
                "pages": pages_data,
            }
            yield self.create_json_message(result)
            # Fast local summary first (deterministic, no model latency)
            local_text = self._build_local_summary(pages_data)
            # Optional LLM refinement, explicitly enabled only
            if use_llm and model_config:
                refined = self._summarize_with_llm(local_text, llm_prompt, model_config)
                final_text = refined if refined else local_text
            else:
                final_text = local_text
            if final_text:
                yield self.create_text_message(final_text)
        finally:
            doc.close()

    def _extract_page_fast(self, page: fitz.Page, page_idx: int, max_chars_per_page: int) -> dict[str, Any]:
        """Extract truncated text plus up to 3 lightweight tables from one page.

        Image/drawing/block fields are emitted empty by design (speed).
        Table detection failures are swallowed — tables are best-effort.
        """
        text = (page.get_text("text") or "").strip()
        if len(text) > max_chars_per_page:
            text = text[:max_chars_per_page] + "\n...[truncated]"
        tables: list[dict[str, Any]] = []
        try:
            tabs = page.find_tables()
            # Cap at 3 tables per page and 10 rows per table to bound payload size.
            for tab_idx, tab in enumerate(tabs.tables[:3]):
                cells = tab.extract() or []
                tables.append(
                    {
                        "index": tab_idx,
                        "rows": tab.row_count,
                        "cols": tab.col_count,
                        "cells": cells[:10],
                    }
                )
        except Exception:
            pass
        return {
            "page_number": page_idx,
            "text": text,
            "tables": tables,
            "images": [],
            "drawings_summary": [],
            "text_blocks": [],
            "width": float(page.rect.width),
            "height": float(page.rect.height),
        }

    def _build_local_summary(self, pages_data: list[dict[str, Any]]) -> str:
        """Output actual page content as Markdown (text + tables).

        No LLM needed downstream — the text is already usable Markdown.
        Pages are separated by an explicit page-break marker.
        """
        parts: list[str] = []
        for page in pages_data:
            text = (page.get("text") or "").strip()
            tables = page.get("tables") or []
            page_parts: list[str] = []
            if text:
                page_parts.append(text)
            for tab in tables:
                cells = tab.get("cells") or []
                # Need at least header + one data row to form a table.
                if len(cells) >= 2:
                    md = self._cells_to_md_table(cells)
                    if md:
                        page_parts.append(md)
            if page_parts:
                parts.append("\n\n".join(page_parts))
        return "\n\n--- 分页 ---\n\n".join(parts)

    @staticmethod
    def _cells_to_md_table(cells: list) -> str:
        """Render extracted table cells as a Markdown table (row 0 = header).

        Short rows are right-padded and long rows truncated to the header width;
        pipes and newlines inside cells are escaped/flattened.
        """
        if not cells:
            return ""
        header = cells[0]
        ncols = len(header)
        if ncols == 0:
            return ""
        clean = lambda c: str(c or "").replace("|", "\\|").replace("\n", " ").strip()
        lines = [
            "| " + " | ".join(clean(c) for c in header) + " |",
            "| " + " | ".join("---" for _ in range(ncols)) + " |",
        ]
        for row in cells[1:]:
            padded = list(row) + [""] * max(0, ncols - len(row))
            lines.append("| " + " | ".join(clean(c) for c in padded[:ncols]) + " |")
        return "\n".join(lines)

    def _summarize_with_llm(self, local_text: str, llm_prompt: str, model_config: dict[str, Any]) -> str:
        """Refine the local summary via a non-streaming LLM call.

        Returns the cleaned visible answer ("" when the model yields nothing
        usable, in which case the caller falls back to the local summary).
        """
        response = self.session.model.llm.invoke(
            model_config=LLMModelConfig(**model_config),
            prompt_messages=[
                SystemPromptMessage(content=llm_prompt),
                UserPromptMessage(content=local_text),
            ],
            stream=False,
        )
        llm_text = ""
        if hasattr(response, "message") and response.message:
            content = response.message.content
            if isinstance(content, str):
                llm_text = content
            elif isinstance(content, list):
                # Multi-part content: join each part's data (or its repr).
                llm_text = "".join(
                    item.data if hasattr(item, "data") else str(item)
                    for item in content
                )
        return self._extract_visible_answer(llm_text)

    @staticmethod
    def _extract_visible_answer(text: str) -> str:
        """Strip model scaffolding from raw LLM output.

        Prefers the content inside <|begin_of_box|>..<|end_of_box|> when
        present; otherwise removes <think> blocks, then drops any remaining
        <|...|> special tokens.
        """
        if not text:
            return ""
        box_match = re.search(r"<\|begin_of_box\|>([\s\S]*?)<\|end_of_box\|>", text)
        if box_match:
            text = box_match.group(1)
        else:
            text = re.sub(r"<think>[\s\S]*?</think>", "", text, flags=re.IGNORECASE)
        text = re.sub(r"<\|[^>]+\|>", "", text)
        return text.strip()

    @staticmethod
    def _to_int(value: Any, default: int) -> int:
        """Coerce value to int, returning default for None/""/unparseable input."""
        try:
            if value is None or value == "":
                return default
            return int(value)
        except Exception:
            return default

    @staticmethod
    def _to_bool(value: Any, default: bool) -> bool:
        """Coerce common truthy/falsy strings to bool; default when ambiguous."""
        if value is None:
            return default
        if isinstance(value, bool):
            return value
        s = str(value).strip().lower()
        if s in {"1", "true", "yes", "on"}:
            return True
        if s in {"0", "false", "no", "off"}:
            return False
        return default

View File

@@ -0,0 +1,99 @@
identity:
name: "pdf_summary"
author: "yslg"
label:
en_US: "PDF Page Summary"
zh_Hans: "PDF页面概述"
pt_BR: "Resumo de Página PDF"
ja_JP: "PDFページ概要"
description:
human:
en_US: "Extract core elements (text, image, table, path) from PDF pages with coordinates, then summarize via LLM"
zh_Hans: "提取PDF页面核心元素文本、图片、表格、路径及坐标并通过LLM进行概述"
pt_BR: "Extrair elementos principais (texto, imagem, tabela, caminho) de páginas PDF com coordenadas e resumir via LLM"
ja_JP: "PDFページからコア要素テキスト、画像、テーブル、パスを座標付きで抽出し、LLMで要約"
llm: "Extract core elements (text, image, table, drawing path) with coordinates from specified PDF page range, then use LLM to summarize the content"
parameters:
- name: file
type: file
required: true
label:
en_US: PDF File
zh_Hans: PDF 文件
pt_BR: Arquivo PDF
ja_JP: PDFファイル
human_description:
en_US: "PDF file to process"
zh_Hans: "要处理的 PDF 文件"
pt_BR: "Arquivo PDF para processar"
ja_JP: "処理するPDFファイル"
llm_description: "PDF file to extract elements from and summarize"
form: llm
fileTypes:
- "pdf"
- name: pdf_start_page
type: number
required: true
label:
en_US: Start Page
zh_Hans: 起始页码
pt_BR: Página Inicial
ja_JP: 開始ページ
human_description:
en_US: "Start page index (0-based)"
zh_Hans: "起始页码从0开始"
pt_BR: "Índice da página inicial (base 0)"
ja_JP: "開始ページ番号0始まり"
llm_description: "Start page index (0-based) for element extraction"
form: llm
default: 0
- name: pdf_end_page
type: number
required: true
label:
en_US: End Page
zh_Hans: 结束页码
pt_BR: Página Final
ja_JP: 終了ページ
human_description:
en_US: "End page index (0-based, inclusive)"
zh_Hans: "结束页码从0开始包含该页"
pt_BR: "Índice da página final (base 0, inclusivo)"
ja_JP: "終了ページ番号0始まり、含む"
llm_description: "End page index (0-based, inclusive) for element extraction"
form: llm
default: 0
- name: model
type: model-selector
scope: llm
required: true
label:
en_US: LLM Model
zh_Hans: LLM 模型
pt_BR: Modelo LLM
ja_JP: LLMモデル
human_description:
en_US: "LLM model used for summarizing extracted content"
zh_Hans: "用于概述提取内容的 LLM 模型"
pt_BR: "Modelo LLM usado para resumir o conteúdo extraído"
ja_JP: "抽出内容の要約に使用するLLMモデル"
form: form
- name: llm_prompt
type: string
required: false
label:
en_US: LLM Prompt
zh_Hans: LLM 提示词
pt_BR: Prompt do LLM
ja_JP: LLMプロンプト
human_description:
en_US: "System prompt for LLM summarization"
zh_Hans: "LLM 概述的系统提示词"
pt_BR: "Prompt do sistema para resumo LLM"
ja_JP: "LLM要約用のシステムプロンプト"
llm_description: "System prompt guiding LLM on how to summarize the extracted PDF content"
form: form
default: "你是一个专业的文档分析助手。请根据以下从PDF页面中提取的结构化内容包含文本、图片信息、表格和矢量图形对每页内容进行准确、简洁的概述。"
extra:
python:
source: tools/pdf_summary.py

View File

@@ -0,0 +1,335 @@
import base64
import re
from collections import OrderedDict
from collections.abc import Generator
from typing import Any
import fitz
from dify_plugin import Tool
from dify_plugin.entities.tool import ToolInvokeMessage
class PdfToMarkdownTool(Tool):
    """Convert PDF to a single Markdown file. No LLM needed.

    - Auto-detect TOC and organize content by chapters.
    - Extract text and tables as Markdown.
    - Embed raster images as base64.
    - Render vector drawings as base64 PNG.
    - Output one .md file via create_blob_message.
    """

    # Headings that identify a printed table-of-contents page.
    _TOC_PATTERNS = [
        r"目录", r"目 录", r"\u3000录",
        r"Table of Contents", r"Contents", r"目次",
    ]

    # ── entry point ──────────────────────────────────────────
    def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
        """Convert the uploaded PDF and yield the Markdown as text + .md blob."""
        file = tool_parameters.get("file")
        if not file:
            yield self.create_text_message("Error: file is required")
            return
        include_images = self._to_bool(tool_parameters.get("include_images"), True)
        # Rendering DPI for vector drawings, clamped to [72, 300].
        image_dpi = self._to_int(tool_parameters.get("image_dpi"), 150)
        image_dpi = max(72, min(image_dpi, 300))
        max_image_bytes = 2 * 1024 * 1024  # skip images > 2 MB raw
        doc = fitz.open(stream=file.blob, filetype="pdf")
        try:
            num_pages = len(doc)
            # 1) Build chapter map (metadata TOC → printed TOC → none)
            chapters, content_offset = self._build_chapter_map(doc, num_pages)
            # 2) Convert every page
            page_mds: list[str] = []
            for idx in range(num_pages):
                md = self._page_to_markdown(
                    doc, doc[idx], idx,
                    include_images, image_dpi, max_image_bytes,
                )
                page_mds.append(md)
            # 3) Assemble
            if chapters:
                final_md = self._assemble_by_chapters(
                    chapters, page_mds, content_offset, num_pages,
                )
            else:
                # No chapter structure: just join non-empty pages with rules.
                final_md = "\n\n---\n\n".join(m for m in page_mds if m.strip())
            # 4) Output: text (for variable aggregation) + blob (.md file)
            yield self.create_text_message(final_md)
            md_bytes = final_md.encode("utf-8")
            yield self.create_blob_message(
                blob=md_bytes,
                meta={"mime_type": "text/markdown"},
            )
        finally:
            doc.close()

    # ── chapter detection ────────────────────────────────────
    def _build_chapter_map(
        self, doc: fitz.Document, num_pages: int,
    ) -> tuple[dict, int]:
        """Return (chapters_dict, content_offset).

        Try embedded PDF TOC metadata first (reliable page mapping).
        Fall back to scanning printed TOC pages. content_offset maps printed
        page numbers to physical page indices (0 for metadata TOC).
        """
        toc = doc.get_toc()
        if toc:
            chapters = self._chapters_from_metadata(toc, num_pages)
            if chapters:
                return chapters, 0
        toc_start, toc_end = self._find_toc_pages(doc, num_pages)
        if toc_start is not None and toc_end is not None:
            toc_text = "\n".join(
                doc[i].get_text() or "" for i in range(toc_start, toc_end + 1)
            )
            chapters = self._parse_toc_lines(toc_text)
            if chapters:
                offset = self._guess_offset(chapters, toc_end)
                return chapters, offset
        return {}, 0

    def _chapters_from_metadata(
        self, toc: list, num_pages: int,
    ) -> dict[str, dict[str, int]]:
        """Build a chapter→{start,end} map from the embedded TOC metadata.

        Keeps levels 1-2 only; each entry's end is the page before the next
        entry's start (last entry runs to the final page). Metadata pages are
        1-based, hence the `p - 1` conversion.
        """
        top = [(t, max(0, p - 1)) for lvl, t, p in toc if lvl <= 2 and p >= 1]
        if not top:
            return {}
        chapters: dict[str, dict[str, int]] = OrderedDict()
        for i, (title, start) in enumerate(top):
            end = top[i + 1][1] - 1 if i + 1 < len(top) else num_pages - 1
            chapters[title] = {"start": start, "end": max(start, end)}
        return chapters

    def _find_toc_pages(self, doc, num_pages):
        """Find the contiguous run of printed-TOC pages in the first 30 pages.

        Returns (start, end) 0-based indices, or (None, None) when absent.
        """
        toc_start = toc_end = None
        for pn in range(min(num_pages, 30)):
            text = doc[pn].get_text() or ""
            if any(re.search(p, text, re.IGNORECASE) for p in self._TOC_PATTERNS):
                if toc_start is None:
                    toc_start = pn
                toc_end = pn
            elif toc_start is not None:
                break
        return toc_start, toc_end

    def _parse_toc_lines(self, text: str) -> dict[str, dict[str, int]]:
        """Parse printed-TOC text into chapter→{start,end} (printed page nos).

        Truncates at figure/table list headings, matches "Title .... 123"
        lines, drops noise entries, deduplicates by first appearance, and
        derives each end page from the next chapter's start.
        """
        # Cut off figure/table lists that would pollute the chapter map.
        m = re.search(
            r"^(List\s+of\s+Figures|List\s+of\s+Tables|图目录|表目录)",
            text, re.IGNORECASE | re.MULTILINE,
        )
        if m:
            text = text[: m.start()]
        pat = re.compile(
            r"^\s*(?P<title>.+?)\s*(?:\.{2,}|\s)\s*(?P<page>\d{1,5})\s*$"
        )
        entries: list[tuple[str, int]] = []
        for raw in text.splitlines():
            line = raw.strip()
            # Skip blanks, very short lines, and bare page numbers.
            if not line or len(line) < 3 or re.fullmatch(r"\d+", line):
                continue
            m2 = pat.match(line)
            if not m2:
                continue
            title = re.sub(r"\s+", " ", m2.group("title")).strip("-_: ")
            page = self._to_int(m2.group("page"), None)
            if not title or page is None or len(title) <= 1:
                continue
            if title.lower() in {"page", "pages", "目录", "contents"}:
                continue
            entries.append((title, page))
        if not entries:
            return {}
        # Deduplicate, keeping the earliest page for each title.
        dedup: OrderedDict[str, int] = OrderedDict()
        for t, p in entries:
            dedup.setdefault(t, p)
        titles = list(dedup.keys())
        pages = [dedup[t] for t in titles]
        catalog: dict[str, dict[str, int]] = OrderedDict()
        for i, t in enumerate(titles):
            s = pages[i]
            e = max(s, pages[i + 1] - 1) if i + 1 < len(pages) else s
            catalog[t] = {"start": s, "end": e}
        return catalog

    @staticmethod
    def _guess_offset(chapters: dict, toc_end: int) -> int:
        """Estimate printed-page→physical-index offset.

        Assumes the first chapter starts on the page right after the TOC;
        NOTE(review): this heuristic breaks when front matter follows the TOC.
        """
        first_page = None
        for info in chapters.values():
            s = info["start"]
            if first_page is None or s < first_page:
                first_page = s
        if first_page is None:
            return 0
        return (toc_end + 1) - first_page

    # ── per-page conversion ──────────────────────────────────
    def _page_to_markdown(
        self,
        doc: fitz.Document,
        page: fitz.Page,
        page_idx: int,
        include_images: bool,
        image_dpi: int,
        max_image_bytes: int,
    ) -> str:
        """Convert one page to Markdown: text, tables, raster images, drawings.

        All image/table extraction is best-effort; failures are swallowed so a
        bad page never aborts the whole conversion.
        """
        parts: list[str] = []
        # ── text ──
        text = (page.get_text("text", sort=True) or "").strip()
        if text:
            parts.append(text)
        # ── tables → Markdown ──
        try:
            # Cap at 5 tables per page; need header + at least one data row.
            for tab in (page.find_tables().tables or [])[:5]:
                cells = tab.extract() or []
                if len(cells) >= 2:
                    md = self._cells_to_md_table(cells)
                    if md:
                        parts.append(md)
        except Exception:
            pass
        if not include_images:
            return "\n\n".join(parts)
        # ── embedded raster images ──
        try:
            for img_idx, img_info in enumerate(page.get_images(full=True)):
                xref = img_info[0]
                try:
                    data = doc.extract_image(xref)
                    if not data or not data.get("image"):
                        continue
                    raw = data["image"]
                    if len(raw) > max_image_bytes:
                        continue
                    # skip tiny icons (< 20x20)
                    w = data.get("width", 0)
                    h = data.get("height", 0)
                    if w < 20 and h < 20:
                        continue
                    ext = data.get("ext", "png")
                    mime = "image/jpeg" if ext in ("jpg", "jpeg") else f"image/{ext}"
                    b64 = base64.b64encode(raw).decode("ascii")
                    parts.append(
                        f"![img-p{page_idx}-{img_idx}](data:{mime};base64,{b64})"
                    )
                except Exception:
                    pass
        except Exception:
            pass
        # ── vector drawings → render as PNG ──
        try:
            drawings = page.get_drawings()
            # Require at least 3 drawing ops so stray rules/underlines are ignored.
            if len(drawings) >= 3:
                valid_rects: list[fitz.Rect] = []
                for d in drawings:
                    r = d.get("rect")
                    if r:
                        try:
                            rect = fitz.Rect(r)
                            if rect.is_valid and not rect.is_empty:
                                valid_rects.append(rect)
                        except Exception:
                            pass
                if valid_rects:
                    # Union of all drawing rects, clipped to the page.
                    bbox = valid_rects[0]
                    for r in valid_rects[1:]:
                        bbox |= r
                    bbox &= page.rect
                    if bbox.width > 30 and bbox.height > 30:
                        scale = image_dpi / 72
                        mat = fitz.Matrix(scale, scale)
                        pix = page.get_pixmap(matrix=mat, clip=bbox)
                        png = pix.tobytes("png")
                        if len(png) <= max_image_bytes:
                            b64 = base64.b64encode(png).decode("ascii")
                            parts.append(
                                f"![drawing-p{page_idx}](data:image/png;base64,{b64})"
                            )
        except Exception:
            pass
        return "\n\n".join(parts)

    # ── assembly ─────────────────────────────────────────────
    def _assemble_by_chapters(
        self,
        chapters: dict[str, dict[str, int]],
        page_mds: list[str],
        offset: int,
        num_pages: int,
    ) -> str:
        """Group per-page Markdown under `# chapter` headings.

        Applies the printed→physical offset, clamps ranges to valid pages,
        and separates chapters with horizontal rules.
        """
        parts: list[str] = []
        for name, info in chapters.items():
            s = info["start"] + offset
            e = info["end"] + offset
            s = max(0, min(s, num_pages - 1))
            e = max(s, min(e, num_pages - 1))
            ch: list[str] = [f"# {name}\n"]
            for idx in range(s, e + 1):
                if idx < len(page_mds) and page_mds[idx].strip():
                    ch.append(page_mds[idx])
            parts.append("\n\n".join(ch))
        return "\n\n---\n\n".join(parts)

    # ── helpers ──────────────────────────────────────────────
    @staticmethod
    def _cells_to_md_table(cells: list) -> str:
        """Render extracted table cells as a Markdown table (row 0 = header)."""
        if not cells:
            return ""
        header = cells[0]
        ncols = len(header)
        if ncols == 0:
            return ""
        clean = lambda c: str(c or "").replace("|", "\\|").replace("\n", " ").strip()
        lines = [
            "| " + " | ".join(clean(c) for c in header) + " |",
            "| " + " | ".join("---" for _ in range(ncols)) + " |",
        ]
        for row in cells[1:]:
            # Pad short rows / truncate long rows to the header width.
            padded = list(row) + [""] * max(0, ncols - len(row))
            lines.append("| " + " | ".join(clean(c) for c in padded[:ncols]) + " |")
        return "\n".join(lines)

    @staticmethod
    def _to_int(value: Any, default: int | None) -> int | None:
        """Coerce value to int, returning default for None/""/unparseable input."""
        try:
            if value is None or value == "":
                return default
            return int(value)
        except Exception:
            return default

    @staticmethod
    def _to_bool(value: Any, default: bool) -> bool:
        """Coerce common truthy/falsy strings to bool; default when ambiguous."""
        if value is None:
            return default
        if isinstance(value, bool):
            return value
        s = str(value).strip().lower()
        if s in {"1", "true", "yes", "on"}:
            return True
        if s in {"0", "false", "no", "off"}:
            return False
        return default

View File

@@ -0,0 +1,68 @@
identity:
name: "pdf_to_markdown"
author: "yslg"
label:
en_US: "PDF to Markdown"
zh_Hans: "PDF转Markdown"
pt_BR: "PDF para Markdown"
ja_JP: "PDFからMarkdown"
description:
human:
en_US: "Convert PDF to a single Markdown file with embedded base64 images. No LLM needed."
zh_Hans: "将PDF转换为单个Markdown文件图片以base64嵌入无需大模型"
pt_BR: "Converter PDF em um arquivo Markdown com imagens base64 incorporadas. Sem LLM."
ja_JP: "PDFをbase64画像埋め込みの単一Markdownファイルに変換。LLM不要。"
llm: "Convert a PDF file into a single Markdown (.md) file. Extracts text, tables, images (base64), and vector drawings. Auto-detects TOC and organizes by chapters. No LLM needed."
parameters:
- name: file
type: file
required: true
label:
en_US: PDF File
zh_Hans: PDF 文件
pt_BR: Arquivo PDF
ja_JP: PDFファイル
human_description:
en_US: "PDF file to convert"
zh_Hans: "要转换的 PDF 文件"
pt_BR: "Arquivo PDF para converter"
ja_JP: "変換するPDFファイル"
llm_description: "PDF file to convert to Markdown"
form: llm
fileTypes:
- "pdf"
- name: include_images
type: boolean
required: false
label:
en_US: Include Images
zh_Hans: 包含图片
pt_BR: Incluir Imagens
ja_JP: 画像を含める
human_description:
en_US: "Whether to embed images as base64 in the Markdown output (default: true)"
zh_Hans: "是否将图片以base64嵌入Markdown输出默认true"
pt_BR: "Se deve incorporar imagens como base64 na saída Markdown (padrão: verdadeiro)"
ja_JP: "Markdown出力にbase64として画像を埋め込むかどうかデフォルトはい"
llm_description: "Set to true to embed images as base64, false to skip images"
form: form
default: true
- name: image_dpi
type: number
required: false
label:
en_US: Image DPI
zh_Hans: 图片DPI
pt_BR: DPI da Imagem
ja_JP: 画像DPI
human_description:
en_US: "DPI for rendering vector drawings (72-300, default: 150)"
zh_Hans: "矢量图渲染DPI72-300默认150"
pt_BR: "DPI para renderizar desenhos vetoriais (72-300, padrão: 150)"
ja_JP: "ベクター描画のレンダリングDPI72-300、デフォルト150"
llm_description: "Resolution for rendering vector drawings as images. Range 72-300, default 150."
form: form
default: 150
extra:
python:
source: tools/pdf_to_markdown.py

View File

@@ -0,0 +1,273 @@
import json
import re
from collections import OrderedDict
from collections.abc import Generator
from typing import Any
from dify_plugin import Tool
from dify_plugin.entities.model.llm import LLMModelConfig
from dify_plugin.entities.model.message import SystemPromptMessage, UserPromptMessage
from dify_plugin.entities.tool import ToolInvokeMessage
# System prompt for the optional LLM TOC-parsing fallback: instructs the model
# to emit a bare JSON object mapping chapter titles to printed page ranges.
_SYSTEM_PROMPT = """You parse PDF table-of-contents text.
Return only valid JSON object, no markdown fences, no explanation.
Output schema:
{
"Chapter Name": {"start": 1, "end": 5},
"Another": {"start": 6, "end": 20}
}
Rules:
- start/end are integer printed page numbers from TOC.
- If end is unknown, use same value as start.
- Keep chapter names exactly as in TOC text.
"""
class PdfTocTool(Tool):
def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
toc_start = self._to_int(tool_parameters.get("toc_start"), None)
toc_end = self._to_int(tool_parameters.get("toc_end"), None)
toc_pages = (tool_parameters.get("toc_pages") or "").strip()
model_config = tool_parameters.get("model")
if toc_start is None or toc_end is None:
yield self.create_text_message("Error: toc_start and toc_end are required")
return
if not toc_pages:
yield self.create_text_message("Error: toc_pages text is empty")
return
cleaned = self._strip_index_lists(toc_pages)
# 1) deterministic parser first
catalog = self._parse_toc_lines(cleaned)
# 2) optional LLM fallback/enhance only when deterministic parser gives no result
llm_raw_output = ""
llm_error = None
if not catalog and model_config:
llm_catalog, llm_raw_output, llm_error = self._parse_with_llm(
toc_start=toc_start,
toc_end=toc_end,
toc_pages=cleaned,
model_config=model_config,
)
if llm_catalog:
catalog = self._normalize_catalog(llm_catalog)
result: dict[str, Any] = {
"toc_start": toc_start,
"toc_end": toc_end,
"catalog": catalog,
"meta": {
"catalog_size": len(catalog),
"parser": "rule" if catalog else "none",
},
}
if llm_raw_output:
result["meta"]["llm_used"] = True
if llm_error:
result["meta"]["llm_error"] = llm_error
# always return valid json text payload for downstream json.loads
yield self.create_text_message(json.dumps(result, ensure_ascii=False))
yield self.create_json_message(result)
def _parse_with_llm(
self,
toc_start: int,
toc_end: int,
toc_pages: str,
model_config: dict[str, Any],
) -> tuple[dict[str, Any] | None, str, str | None]:
user_content = (
f"TOC page index range: {toc_start}..{toc_end}\n\n"
f"TOC raw text:\n{toc_pages}"
)
response = self.session.model.llm.invoke(
model_config=LLMModelConfig(**model_config),
prompt_messages=[
SystemPromptMessage(content=_SYSTEM_PROMPT),
UserPromptMessage(content=user_content),
],
stream=False,
)
llm_text = ""
if hasattr(response, "message") and response.message:
content = response.message.content
if isinstance(content, str):
llm_text = content
elif isinstance(content, list):
llm_text = "".join(
item.data if hasattr(item, "data") else str(item) for item in content
)
parsed = self._extract_json_object(llm_text)
if parsed is None:
return None, llm_text, "Failed to parse LLM output as JSON"
if not isinstance(parsed, dict):
return None, llm_text, "LLM output JSON is not an object"
return parsed, llm_text, None
@staticmethod
def _strip_index_lists(text: str) -> str:
# Stop before common appendix lists that pollute TOC parsing.
pattern = re.compile(
r"^(List\s+of\s+Figures|List\s+of\s+Tables|图目录|表目录)",
re.IGNORECASE | re.MULTILINE,
)
m = pattern.search(text)
return text[: m.start()].rstrip() if m else text
def _parse_toc_lines(self, text: str) -> dict[str, dict[str, int]]:
"""Parse lines like:
1.2 Engine Overview ........ 35
Appendix A 120
"""
line_pattern = re.compile(
r"^\s*(?P<title>.+?)\s*(?:\.{2,}|\s)\s*(?P<page>\d{1,5})\s*$"
)
entries: list[tuple[str, int]] = []
for raw in text.splitlines():
line = raw.strip()
if not line or len(line) < 3:
continue
if re.fullmatch(r"\d+", line):
continue
m = line_pattern.match(line)
if not m:
continue
title = re.sub(r"\s+", " ", m.group("title")).strip("-_: ")
page = self._to_int(m.group("page"), None)
if not title or page is None:
continue
# Skip obvious noise.
if len(title) <= 1 or title.lower() in {"page", "pages", "目录", "contents"}:
continue
entries.append((title, page))
if not entries:
return {}
# Deduplicate keeping earliest appearance.
dedup: OrderedDict[str, int] = OrderedDict()
for title, page in entries:
if title not in dedup:
dedup[title] = page
titles = list(dedup.keys())
pages = [dedup[t] for t in titles]
catalog: dict[str, dict[str, int]] = {}
for i, title in enumerate(titles):
start = pages[i]
if i + 1 < len(pages):
next_start = pages[i + 1]
end = max(start, next_start - 1)
else:
end = start
catalog[title] = {"start": int(start), "end": int(end)}
return catalog
def _normalize_catalog(self, raw: dict[str, Any]) -> dict[str, dict[str, int]]:
catalog: dict[str, dict[str, int]] = {}
source = raw.get("catalog") if isinstance(raw.get("catalog"), dict) else raw
if not isinstance(source, dict):
return catalog
for name, value in source.items():
if not isinstance(name, str) or not isinstance(value, dict):
continue
start = self._to_int(value.get("start"), None)
end = self._to_int(value.get("end"), start)
if start is None:
continue
if end is None:
end = start
catalog[name] = {"start": int(start), "end": int(max(start, end))}
return catalog
@staticmethod
def _extract_json_object(text: str) -> Any:
    """Best-effort extraction of a JSON value from raw LLM output.

    Candidates are tried in order: fenced ```json code blocks, the first
    balanced {...} region, then the whole text.  The first candidate that
    parses (possibly after minor repair) is returned, else None.
    """
    if not text:
        return None
    candidates: list[str] = []
    for block in re.findall(
        r"```(?:json)?\s*([\s\S]*?)\s*```", text, flags=re.IGNORECASE
    ):
        stripped = block.strip()
        if stripped:
            candidates.append(stripped)
    first_object = PdfTocTool._extract_first_brace_object(text)
    if first_object:
        candidates.append(first_object)
    candidates.append(text.strip())
    for candidate in candidates:
        result = PdfTocTool._json_try_parse(candidate)
        if result is not None:
            return result
    return None
@staticmethod
def _extract_first_brace_object(text: str) -> str | None:
start = text.find("{")
if start < 0:
return None
depth = 0
in_str = False
escape = False
for i in range(start, len(text)):
ch = text[i]
if in_str:
if escape:
escape = False
elif ch == "\\":
escape = True
elif ch == '"':
in_str = False
continue
if ch == '"':
in_str = True
elif ch == "{":
depth += 1
elif ch == "}":
depth -= 1
if depth == 0:
return text[start : i + 1]
return None
@staticmethod
def _json_try_parse(text: str) -> Any:
try:
return json.loads(text)
except Exception:
pass
# Minimal repair: remove trailing commas before } or ]
repaired = re.sub(r",\s*([}\]])", r"\1", text)
try:
return json.loads(repaired)
except Exception:
return None
@staticmethod
def _to_int(value: Any, default: int | None) -> int | None:
try:
if value is None or value == "":
return default
return int(value)
except Exception:
return default

View File

@@ -0,0 +1,79 @@
identity:
name: "pdf_toc"
author: "yslg"
label:
en_US: "PDF TOC Parser"
zh_Hans: "PDF目录解析"
pt_BR: "Analisador de Sumário PDF"
ja_JP: "PDF目次解析"
description:
human:
en_US: "Parse PDF table-of-contents text (from pdf_column_range) into structured JSON catalog via LLM"
    zh_Hans: "通过LLM将PDF目录文本(来自目录页提取工具的输出)解析为结构化JSON目录"
pt_BR: "Analisar texto do sumário PDF em catálogo JSON estruturado via LLM"
ja_JP: "LLMを使用してPDF目次テキストを構造化JSONカタログに解析"
llm: "Parse PDF table-of-contents text into structured JSON with chapter names and page ranges. Input is the output of pdf_column_range tool (start/end/pages)."
parameters:
- name: toc_start
type: number
required: true
label:
en_US: TOC Start Page
zh_Hans: 目录起始页
pt_BR: Página Inicial do Sumário
ja_JP: 目次開始ページ
human_description:
en_US: "Start page index of TOC (from pdf_column_range output)"
      zh_Hans: "目录起始页码(来自目录页提取工具输出的 start)"
pt_BR: "Índice da página inicial do sumário"
ja_JP: "目次の開始ページ番号"
llm_description: "Start page index of TOC section, from pdf_column_range output field 'start'"
form: llm
- name: toc_end
type: number
required: true
label:
en_US: TOC End Page
zh_Hans: 目录结束页
pt_BR: Página Final do Sumário
ja_JP: 目次終了ページ
human_description:
en_US: "End page index of TOC (from pdf_column_range output)"
      zh_Hans: "目录结束页码(来自目录页提取工具输出的 end)"
pt_BR: "Índice da página final do sumário"
ja_JP: "目次の終了ページ番号"
llm_description: "End page index of TOC section, from pdf_column_range output field 'end'"
form: llm
- name: toc_pages
type: string
required: true
label:
en_US: TOC Page Text
zh_Hans: 目录页文本
pt_BR: Texto das Páginas do Sumário
ja_JP: 目次ページテキスト
human_description:
en_US: "Raw text content of TOC pages (from pdf_column_range output 'pages' array, joined)"
zh_Hans: "目录页原始文本内容(来自目录页提取工具输出的 pages 数组)"
pt_BR: "Conteúdo de texto bruto das páginas do sumário"
ja_JP: "目次ページの生テキスト内容"
llm_description: "Raw text content extracted from TOC pages, from pdf_column_range output field 'pages'"
form: llm
- name: model
type: model-selector
scope: llm
required: true
label:
en_US: LLM Model
zh_Hans: LLM 模型
pt_BR: Modelo LLM
ja_JP: LLMモデル
human_description:
en_US: "LLM model for parsing TOC into structured JSON"
zh_Hans: "用于解析目录的 LLM 模型"
pt_BR: "Modelo LLM para análise do sumário"
ja_JP: "目次解析用のLLMモデル"
form: form
extra:
python:
source: tools/pdf_toc.py