diff --git a/.gitignore b/.gitignore
index 4cbe3caa..195f9bd6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,4 +4,6 @@
.tmp
.trae
-**/*.difypkg
\ No newline at end of file
+**/*.difypkg
+urbanLifeServ/*
+*/.data
\ No newline at end of file
diff --git a/.vscode/launch.json b/.vscode/launch.json
index 203429db..e69de29b 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -1,27 +0,0 @@
-{
- "version": "0.2.0",
- "configurations": [
- {
- "name": "Python: FastAPI Server",
- "type": "python",
- "request": "launch",
- "program": "${workspaceFolder}/difyPlugin/main.py",
- "console": "integratedTerminal",
- "justMyCode": true,
- "env": {
- "PYTHONUNBUFFERED": "1"
- },
- "cwd": "${workspaceFolder}/difyPlugin",
- "args": []
- },
- {
- "name": "Python: Debug Plugin",
- "type": "python",
- "request": "launch",
- "program": "${workspaceFolder}/difyPlugin/app/plugins/pdf/__init__.py",
- "console": "integratedTerminal",
- "justMyCode": true,
- "cwd": "${workspaceFolder}/difyPlugin"
- }
- ]
-}
\ No newline at end of file
diff --git a/ai-management-platform b/ai-management-platform
index 199d8180..96f7c3aa 160000
--- a/ai-management-platform
+++ b/ai-management-platform
@@ -1 +1 @@
-Subproject commit 199d8180a698c62d79c5c853302733050fe9c0fa
+Subproject commit 96f7c3aa4c9ac8b00e0b98b5a4998b5f910d5337
diff --git a/difyPlugin/pdf/manifest.yaml b/difyPlugin/pdf/manifest.yaml
index fb7631bf..27f075f3 100644
--- a/difyPlugin/pdf/manifest.yaml
+++ b/difyPlugin/pdf/manifest.yaml
@@ -19,6 +19,9 @@ resource:
permission:
tool:
enabled: true
+ model:
+ enabled: true
+ llm: true
plugins:
tools:
- provider/pdf.yaml
diff --git a/difyPlugin/pdf/provider/pdf.yaml b/difyPlugin/pdf/provider/pdf.yaml
index 83a55577..c7473239 100644
--- a/difyPlugin/pdf/provider/pdf.yaml
+++ b/difyPlugin/pdf/provider/pdf.yaml
@@ -56,8 +56,12 @@ identity:
# en_US: "Access Token"
tools:
- - tools/pdf.yaml
+ - tools/pdf_column_range.yaml
- tools/pdf_single_page.yaml
+ - tools/pdf_summary.yaml
+ - tools/pdf_toc.yaml
+ - tools/pdf_extract_range.yaml
+ - tools/pdf_to_markdown.yaml
extra:
python:
source: provider/pdf.py
diff --git a/difyPlugin/pdf/requirements.txt b/difyPlugin/pdf/requirements.txt
index e9cf72f9..80735ec2 100644
--- a/difyPlugin/pdf/requirements.txt
+++ b/difyPlugin/pdf/requirements.txt
@@ -1,2 +1,2 @@
dify_plugin>=0.4.0,<0.7.0
-PyPDF2>=3.0.1
+pymupdf>=1.27.1
\ No newline at end of file
diff --git a/difyPlugin/pdf/tools/pdf.py b/difyPlugin/pdf/tools/pdf.py
deleted file mode 100644
index fc226c04..00000000
--- a/difyPlugin/pdf/tools/pdf.py
+++ /dev/null
@@ -1,61 +0,0 @@
-import re
-from collections.abc import Generator
-from io import BytesIO
-from typing import Any
-
-import PyPDF2
-from dify_plugin import Tool
-from dify_plugin.entities.tool import ToolInvokeMessage
-
-
-class PdfTool(Tool):
- def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
- file = tool_parameters.get("file")
- if not file:
- yield self.create_text_message("Error: file is required")
- return
-
- # file.blob returns bytes
- pdf_bytes = file.blob
- reader = PyPDF2.PdfReader(BytesIO(pdf_bytes))
- num_pages = len(reader.pages)
-
- toc_start = None
- toc_end = None
-
- toc_patterns = [
- r'目录',
- r'Table of Contents',
- r'Contents',
- r'目次'
- ]
-
- for page_num in range(num_pages):
- page = reader.pages[page_num]
- text = page.extract_text() or ""
-
- if any(re.search(pattern, text, re.IGNORECASE) for pattern in toc_patterns):
- if toc_start is None:
- toc_start = page_num
- toc_end = page_num
- elif toc_start is not None and toc_end is not None:
- break
-
- if toc_start is None:
- yield self.create_json_message({
- "start": None,
- "end": None,
- "pages": []
- })
- return
-
- toc_pages = []
- for page_num in range(toc_start, toc_end + 1):
- page = reader.pages[page_num]
- toc_pages.append(page.extract_text() or "")
-
- yield self.create_json_message({
- "start": toc_start,
- "end": toc_end,
- "pages": toc_pages
- })
\ No newline at end of file
diff --git a/difyPlugin/pdf/tools/pdf_column_range.py b/difyPlugin/pdf/tools/pdf_column_range.py
new file mode 100644
index 00000000..5d5f5db8
--- /dev/null
+++ b/difyPlugin/pdf/tools/pdf_column_range.py
@@ -0,0 +1,107 @@
+import json
+import re
+from collections.abc import Generator
+from io import BytesIO
+from typing import Any
+
+import fitz # PyMuPDF 核心库
+from dify_plugin import Tool
+from dify_plugin.entities.tool import ToolInvokeMessage
+
+
+class PdfTool(Tool):
+ def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
+ file = tool_parameters.get("file")
+ if not file:
+ yield self.create_text_message("Error: file is required")
+ return
+
+ # 从字节流加载 PDF(替换 PyPDF2)
+ pdf_bytes = file.blob
+ doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+ num_pages = len(doc)
+
+ toc_start = None
+ toc_end = None
+
+ # 目录匹配正则(与原代码一致)
+ toc_patterns = [
+ r'目录',
+ r'目 录',
+ r'目\u3000录',
+ r'Table of Contents',
+ r'Contents',
+ r'目次'
+ ]
+
+ # 遍历页面识别目录页(逻辑不变,仅替换文本提取方式)
+ for page_num in range(num_pages):
+ page = doc[page_num]
+ text = page.get_text() or "" # PyMuPDF 提取文本
+
+ if any(re.search(pattern, text, re.IGNORECASE) for pattern in toc_patterns):
+ if toc_start is None:
+ toc_start = page_num
+ toc_end = page_num
+ elif toc_start is not None and toc_end is not None:
+ break
+
+ # 提取目录页文本
+ toc_pages = []
+ if toc_start is not None and toc_end is not None:
+ for page_num in range(toc_start, toc_end + 1):
+ page = doc[page_num]
+ toc_pages.append(page.get_text() or "")
+
+ # 关闭文档
+ doc.close()
+
+ result = {
+ "start": toc_start,
+ "end": toc_end,
+ "pages": toc_pages,
+ "pages_text": "\n".join(toc_pages) if toc_pages else "",
+ }
+ yield self.create_text_message(json.dumps(result, ensure_ascii=False))
+ yield self.create_json_message(result)
+
+
+if __name__ == "__main__":
+ # 测试代码(改用 PyMuPDF)
+ pdf_path = r"F:\Project\urbanLifeline\docs\AI训练资料\菱重S12R发动机说明书.pdf"
+ doc = fitz.open(pdf_path) # 本地文件直接打开
+ num_pages = len(doc)
+
+ toc_start = None
+ toc_end = None
+
+ toc_patterns = [
+ r'目录',
+ r'目 录',
+ r'目\u3000录',
+ r'Table of Contents',
+ r'Contents',
+ r'目次'
+ ]
+
+ # 遍历页面找目录
+ for page_num in range(num_pages):
+ page = doc[page_num]
+ text = page.get_text() or ""
+ if any(re.search(pattern, text, re.IGNORECASE) for pattern in toc_patterns):
+ if toc_start is None:
+ toc_start = page_num
+ toc_end = page_num
+ elif toc_start is not None and toc_end is not None:
+ break
+
+ # 提取目录页文本
+ toc_pages = []
+ toc_start = toc_start if toc_start is not None else 18
+ toc_end = toc_end if toc_end is not None else toc_start + 9
+ for page_num in range(toc_start, toc_end):
+ page = doc[page_num]
+ toc_pages.append(page.get_text() or "")
+
+ print(toc_start, toc_end, toc_pages)
+ doc.close() # 关闭文档
\ No newline at end of file
diff --git a/difyPlugin/pdf/tools/pdf.yaml b/difyPlugin/pdf/tools/pdf_column_range.yaml
similarity index 96%
rename from difyPlugin/pdf/tools/pdf.yaml
rename to difyPlugin/pdf/tools/pdf_column_range.yaml
index fe18f6ab..8f758dd7 100644
--- a/difyPlugin/pdf/tools/pdf.yaml
+++ b/difyPlugin/pdf/tools/pdf_column_range.yaml
@@ -33,4 +33,4 @@ parameters:
- "pdf"
extra:
python:
- source: tools/pdf.py
\ No newline at end of file
+ source: tools/pdf_column_range.py
\ No newline at end of file
diff --git a/difyPlugin/pdf/tools/pdf_extract_range.py b/difyPlugin/pdf/tools/pdf_extract_range.py
new file mode 100644
index 00000000..fbaa3927
--- /dev/null
+++ b/difyPlugin/pdf/tools/pdf_extract_range.py
@@ -0,0 +1,48 @@
+import json
+from collections.abc import Generator
+from typing import Any
+
+import fitz # PyMuPDF
+from dify_plugin import Tool
+from dify_plugin.entities.tool import ToolInvokeMessage
+
+
+class PdfExtractRangeTool(Tool):
+ def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
+ file = tool_parameters.get("file")
+ if not file:
+ yield self.create_text_message("Error: file is required")
+ return
+
+ start_page = int(tool_parameters.get("start_page", 0))
+ end_page = int(tool_parameters.get("end_page", 0))
+
+ # 打开 PDF
+ pdf_bytes = file.blob
+ doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+ num_pages = len(doc)
+
+ # 边界处理
+ start_page = max(0, min(start_page, num_pages - 1))
+ end_page = max(start_page, min(end_page, num_pages - 1))
+
+ # 逐页提取文本
+ page_texts = []
+ for page_idx in range(start_page, end_page + 1):
+ page = doc[page_idx]
+ text = page.get_text("text", sort=True) or ""
+ page_texts.append(text)
+
+ doc.close()
+
+ # 拼接所有页面文本
+ full_text = "\n\n--- 分页 ---\n\n".join(page_texts)
+
+ result = {
+ "start": start_page,
+ "end": end_page,
+ "total_pages": end_page - start_page + 1,
+ "text": full_text,
+ }
+ yield self.create_text_message(json.dumps(result, ensure_ascii=False))
+ yield self.create_json_message(result)
diff --git a/difyPlugin/pdf/tools/pdf_extract_range.yaml b/difyPlugin/pdf/tools/pdf_extract_range.yaml
new file mode 100644
index 00000000..0bc10b6f
--- /dev/null
+++ b/difyPlugin/pdf/tools/pdf_extract_range.yaml
@@ -0,0 +1,68 @@
+identity:
+ name: "pdf_extract_range"
+ author: "yslg"
+ label:
+ en_US: "Extract Page Range Text"
+ zh_Hans: "提取页面范围文本"
+ pt_BR: "Extrair Texto do Intervalo de Páginas"
+ ja_JP: "ページ範囲テキスト抽出"
+description:
+ human:
+ en_US: "Extract plain text from a specified page range of a PDF file"
+ zh_Hans: "从PDF文件的指定页码范围提取纯文本"
+ pt_BR: "Extrair texto simples de um intervalo de páginas especificado de um arquivo PDF"
+ ja_JP: "PDFファイルの指定ページ範囲からプレーンテキストを抽出"
+ llm: "Extract plain text from PDF pages in the given start-end range. Returns concatenated text of all pages in range."
+parameters:
+ - name: file
+ type: file
+ required: true
+ label:
+ en_US: PDF File
+ zh_Hans: PDF 文件
+ pt_BR: Arquivo PDF
+ ja_JP: PDFファイル
+ human_description:
+ en_US: "PDF file to extract text from"
+ zh_Hans: "要提取文本的 PDF 文件"
+ pt_BR: "Arquivo PDF para extrair texto"
+ ja_JP: "テキストを抽出するPDFファイル"
+ llm_description: "PDF file to extract page range text from"
+ form: llm
+ fileTypes:
+ - "pdf"
+ - name: start_page
+ type: number
+ required: true
+ label:
+ en_US: Start Page
+ zh_Hans: 起始页码
+ pt_BR: Página Inicial
+ ja_JP: 開始ページ
+ human_description:
+ en_US: "Start page index (0-based)"
+ zh_Hans: "起始页码(从0开始)"
+ pt_BR: "Índice da página inicial (base 0)"
+ ja_JP: "開始ページ番号(0始まり)"
+ llm_description: "Start page index (0-based)"
+ form: llm
+ default: 0
+ - name: end_page
+ type: number
+ required: true
+ label:
+ en_US: End Page
+ zh_Hans: 结束页码
+ pt_BR: Página Final
+ ja_JP: 終了ページ
+ human_description:
+ en_US: "End page index (0-based, inclusive)"
+ zh_Hans: "结束页码(从0开始,包含该页)"
+ pt_BR: "Índice da página final (base 0, inclusivo)"
+ ja_JP: "終了ページ番号(0始まり、含む)"
+ llm_description: "End page index (0-based, inclusive)"
+ form: llm
+ default: 0
+extra:
+ python:
+ source: tools/pdf_extract_range.py
diff --git a/difyPlugin/pdf/tools/pdf_single_page.py b/difyPlugin/pdf/tools/pdf_single_page.py
index 5ed41ecf..0fa67660 100644
--- a/difyPlugin/pdf/tools/pdf_single_page.py
+++ b/difyPlugin/pdf/tools/pdf_single_page.py
@@ -1,8 +1,9 @@
+import json
from collections.abc import Generator
from io import BytesIO
from typing import Any
-import PyPDF2
+import fitz # PyMuPDF 核心库
from dify_plugin import Tool
from dify_plugin.entities.tool import ToolInvokeMessage
@@ -16,21 +17,29 @@ class PdfSinglePageTool(Tool):
yield self.create_text_message("Error: file is required")
return
+ # 从字节流加载 PDF(替换 PyPDF2 的 PdfReader)
pdf_bytes = file.blob
- reader = PyPDF2.PdfReader(BytesIO(pdf_bytes))
- num_pages = len(reader.pages)
+ doc = fitz.open(stream=pdf_bytes, filetype="pdf") # 字节流方式打开
+ num_pages = len(doc)
+ # 页码边界处理(逻辑与原代码一致)
page_index = int(page)
if page_index < 0:
page_index = 0
if page_index >= num_pages:
page_index = num_pages - 1
- selected_page = reader.pages[page_index]
- text = selected_page.extract_text() or ""
+ # 提取指定页面文本(PyMuPDF 方式)
+ selected_page = doc[page_index]
+ text = selected_page.get_text() or "" # get_text() 提取文本,比 PyPDF2 更精准
- yield self.create_json_message({
+ # 关闭文档释放资源
+ doc.close()
+
+ result = {
"start": page_index,
"end": page_index,
"pages": [text]
- })
+ }
+ yield self.create_text_message(json.dumps(result, ensure_ascii=False))
+ yield self.create_json_message(result)
\ No newline at end of file
diff --git a/difyPlugin/pdf/tools/pdf_summary.py b/difyPlugin/pdf/tools/pdf_summary.py
new file mode 100644
index 00000000..684914c7
--- /dev/null
+++ b/difyPlugin/pdf/tools/pdf_summary.py
@@ -0,0 +1,209 @@
+import json
+import re
+from collections.abc import Generator
+from typing import Any
+
+import fitz
+from dify_plugin import Tool
+from dify_plugin.entities.model.llm import LLMModelConfig
+from dify_plugin.entities.model.message import SystemPromptMessage, UserPromptMessage
+from dify_plugin.entities.tool import ToolInvokeMessage
+
+
+class PdfSummaryTool(Tool):
+ """Fast PDF page summary tool.
+
+ Default behavior is optimized for throughput in large workflows:
+ - Extract plain text and lightweight table data only.
+ - Skip expensive image base64 and drawing path extraction.
+ - Skip LLM by default unless `use_llm=true` is explicitly passed.
+ """
+
+ def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
+ file = tool_parameters.get("file")
+ if not file:
+ yield self.create_text_message("Error: file is required")
+ return
+
+ start_page = self._to_int(tool_parameters.get("pdf_start_page"), 0)
+ end_page = self._to_int(tool_parameters.get("pdf_end_page"), 0)
+ model_config = tool_parameters.get("model")
+ use_llm = self._to_bool(tool_parameters.get("use_llm"), False)
+
+ max_chars_per_page = self._to_int(tool_parameters.get("max_chars_per_page"), 6000)
+ max_chars_per_page = max(800, min(max_chars_per_page, 20000))
+
+ llm_prompt = tool_parameters.get(
+ "llm_prompt",
+ "请基于输入的PDF页面文本做简洁准确摘要,输出中文要点。不要输出思考过程。",
+ )
+
+ pdf_bytes = file.blob
+ doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+ try:
+ num_pages = len(doc)
+ start_page = max(0, min(start_page, num_pages - 1))
+ end_page = max(start_page, min(end_page, num_pages - 1))
+
+ pages_data: list[dict[str, Any]] = []
+ for page_idx in range(start_page, end_page + 1):
+ page = doc[page_idx]
+ page_data = self._extract_page_fast(page, page_idx, max_chars_per_page)
+ pages_data.append(page_data)
+
+ result = {
+ "total_pages_extracted": len(pages_data),
+ "page_range": {"start": start_page, "end": end_page},
+ "pages": pages_data,
+ }
+ yield self.create_json_message(result)
+
+ # Fast local summary first (deterministic, no model latency)
+ local_text = self._build_local_summary(pages_data)
+
+ # Optional LLM refinement, explicitly enabled only
+ if use_llm and model_config:
+ refined = self._summarize_with_llm(local_text, llm_prompt, model_config)
+ final_text = refined if refined else local_text
+ else:
+ final_text = local_text
+
+ if final_text:
+ yield self.create_text_message(final_text)
+ finally:
+ doc.close()
+
+ def _extract_page_fast(self, page: fitz.Page, page_idx: int, max_chars_per_page: int) -> dict[str, Any]:
+ text = (page.get_text("text") or "").strip()
+ if len(text) > max_chars_per_page:
+ text = text[:max_chars_per_page] + "\n...[truncated]"
+
+ tables: list[dict[str, Any]] = []
+ try:
+ tabs = page.find_tables()
+ for tab_idx, tab in enumerate(tabs.tables[:3]):
+ cells = tab.extract() or []
+ tables.append(
+ {
+ "index": tab_idx,
+ "rows": tab.row_count,
+ "cols": tab.col_count,
+ "cells": cells[:10],
+ }
+ )
+ except Exception:
+ pass
+
+ return {
+ "page_number": page_idx,
+ "text": text,
+ "tables": tables,
+ "images": [],
+ "drawings_summary": [],
+ "text_blocks": [],
+ "width": float(page.rect.width),
+ "height": float(page.rect.height),
+ }
+
+ def _build_local_summary(self, pages_data: list[dict[str, Any]]) -> str:
+ """Output actual page content as Markdown (text + tables).
+
+ No LLM needed downstream — the text is already usable Markdown.
+ """
+ parts: list[str] = []
+ for page in pages_data:
+ text = (page.get("text") or "").strip()
+ tables = page.get("tables") or []
+
+ page_parts: list[str] = []
+ if text:
+ page_parts.append(text)
+
+ for tab in tables:
+ cells = tab.get("cells") or []
+ if len(cells) >= 2:
+ md = self._cells_to_md_table(cells)
+ if md:
+ page_parts.append(md)
+
+ if page_parts:
+ parts.append("\n\n".join(page_parts))
+
+ return "\n\n--- 分页 ---\n\n".join(parts)
+
+ @staticmethod
+ def _cells_to_md_table(cells: list) -> str:
+ if not cells:
+ return ""
+ header = cells[0]
+ ncols = len(header)
+ if ncols == 0:
+ return ""
+ clean = lambda c: str(c or "").replace("|", "\\|").replace("\n", " ").strip()
+ lines = [
+ "| " + " | ".join(clean(c) for c in header) + " |",
+ "| " + " | ".join("---" for _ in range(ncols)) + " |",
+ ]
+ for row in cells[1:]:
+ padded = list(row) + [""] * max(0, ncols - len(row))
+ lines.append("| " + " | ".join(clean(c) for c in padded[:ncols]) + " |")
+ return "\n".join(lines)
+
+ def _summarize_with_llm(self, local_text: str, llm_prompt: str, model_config: dict[str, Any]) -> str:
+ response = self.session.model.llm.invoke(
+ model_config=LLMModelConfig(**model_config),
+ prompt_messages=[
+ SystemPromptMessage(content=llm_prompt),
+ UserPromptMessage(content=local_text),
+ ],
+ stream=False,
+ )
+
+ llm_text = ""
+ if hasattr(response, "message") and response.message:
+ content = response.message.content
+ if isinstance(content, str):
+ llm_text = content
+ elif isinstance(content, list):
+ llm_text = "".join(
+ item.data if hasattr(item, "data") else str(item)
+ for item in content
+ )
+
+ return self._extract_visible_answer(llm_text)
+
+ @staticmethod
+ def _extract_visible_answer(text: str) -> str:
+ if not text:
+ return ""
+
+ box_match = re.search(r"<\|begin_of_box\|>([\s\S]*?)<\|end_of_box\|>", text)
+ if box_match:
+ text = box_match.group(1)
+ else:
+            text = re.sub(r"<think>[\s\S]*?</think>", "", text, flags=re.IGNORECASE)
+
+ text = re.sub(r"<\|[^>]+\|>", "", text)
+ return text.strip()
+
+ @staticmethod
+ def _to_int(value: Any, default: int) -> int:
+ try:
+ if value is None or value == "":
+ return default
+ return int(value)
+ except Exception:
+ return default
+
+ @staticmethod
+ def _to_bool(value: Any, default: bool) -> bool:
+ if value is None:
+ return default
+ if isinstance(value, bool):
+ return value
+ s = str(value).strip().lower()
+ if s in {"1", "true", "yes", "on"}:
+ return True
+ if s in {"0", "false", "no", "off"}:
+ return False
+ return default
diff --git a/difyPlugin/pdf/tools/pdf_summary.yaml b/difyPlugin/pdf/tools/pdf_summary.yaml
new file mode 100644
index 00000000..059c920d
--- /dev/null
+++ b/difyPlugin/pdf/tools/pdf_summary.yaml
@@ -0,0 +1,99 @@
+identity:
+ name: "pdf_summary"
+ author: "yslg"
+ label:
+ en_US: "PDF Page Summary"
+ zh_Hans: "PDF页面概述"
+ pt_BR: "Resumo de Página PDF"
+ ja_JP: "PDFページ概要"
+description:
+ human:
+ en_US: "Extract core elements (text, image, table, path) from PDF pages with coordinates, then summarize via LLM"
+ zh_Hans: "提取PDF页面核心元素(文本、图片、表格、路径)及坐标,并通过LLM进行概述"
+ pt_BR: "Extrair elementos principais (texto, imagem, tabela, caminho) de páginas PDF com coordenadas e resumir via LLM"
+ ja_JP: "PDFページからコア要素(テキスト、画像、テーブル、パス)を座標付きで抽出し、LLMで要約"
+ llm: "Extract core elements (text, image, table, drawing path) with coordinates from specified PDF page range, then use LLM to summarize the content"
+parameters:
+ - name: file
+ type: file
+ required: true
+ label:
+ en_US: PDF File
+ zh_Hans: PDF 文件
+ pt_BR: Arquivo PDF
+ ja_JP: PDFファイル
+ human_description:
+ en_US: "PDF file to process"
+ zh_Hans: "要处理的 PDF 文件"
+ pt_BR: "Arquivo PDF para processar"
+ ja_JP: "処理するPDFファイル"
+ llm_description: "PDF file to extract elements from and summarize"
+ form: llm
+ fileTypes:
+ - "pdf"
+ - name: pdf_start_page
+ type: number
+ required: true
+ label:
+ en_US: Start Page
+ zh_Hans: 起始页码
+ pt_BR: Página Inicial
+ ja_JP: 開始ページ
+ human_description:
+ en_US: "Start page index (0-based)"
+ zh_Hans: "起始页码(从0开始)"
+ pt_BR: "Índice da página inicial (base 0)"
+ ja_JP: "開始ページ番号(0始まり)"
+ llm_description: "Start page index (0-based) for element extraction"
+ form: llm
+ default: 0
+ - name: pdf_end_page
+ type: number
+ required: true
+ label:
+ en_US: End Page
+ zh_Hans: 结束页码
+ pt_BR: Página Final
+ ja_JP: 終了ページ
+ human_description:
+ en_US: "End page index (0-based, inclusive)"
+ zh_Hans: "结束页码(从0开始,包含该页)"
+ pt_BR: "Índice da página final (base 0, inclusivo)"
+ ja_JP: "終了ページ番号(0始まり、含む)"
+ llm_description: "End page index (0-based, inclusive) for element extraction"
+ form: llm
+ default: 0
+ - name: model
+ type: model-selector
+ scope: llm
+ required: true
+ label:
+ en_US: LLM Model
+ zh_Hans: LLM 模型
+ pt_BR: Modelo LLM
+ ja_JP: LLMモデル
+ human_description:
+ en_US: "LLM model used for summarizing extracted content"
+ zh_Hans: "用于概述提取内容的 LLM 模型"
+ pt_BR: "Modelo LLM usado para resumir o conteúdo extraído"
+ ja_JP: "抽出内容の要約に使用するLLMモデル"
+ form: form
+ - name: llm_prompt
+ type: string
+ required: false
+ label:
+ en_US: LLM Prompt
+ zh_Hans: LLM 提示词
+ pt_BR: Prompt do LLM
+ ja_JP: LLMプロンプト
+ human_description:
+ en_US: "System prompt for LLM summarization"
+ zh_Hans: "LLM 概述的系统提示词"
+ pt_BR: "Prompt do sistema para resumo LLM"
+ ja_JP: "LLM要約用のシステムプロンプト"
+ llm_description: "System prompt guiding LLM on how to summarize the extracted PDF content"
+ form: form
+ default: "你是一个专业的文档分析助手。请根据以下从PDF页面中提取的结构化内容(包含文本、图片信息、表格和矢量图形),对每页内容进行准确、简洁的概述。"
+extra:
+ python:
+ source: tools/pdf_summary.py
diff --git a/difyPlugin/pdf/tools/pdf_to_markdown.py b/difyPlugin/pdf/tools/pdf_to_markdown.py
new file mode 100644
index 00000000..c00ab31e
--- /dev/null
+++ b/difyPlugin/pdf/tools/pdf_to_markdown.py
@@ -0,0 +1,335 @@
+import base64
+import re
+from collections import OrderedDict
+from collections.abc import Generator
+from typing import Any
+
+import fitz
+from dify_plugin import Tool
+from dify_plugin.entities.tool import ToolInvokeMessage
+
+
+class PdfToMarkdownTool(Tool):
+ """Convert PDF to a single Markdown file. No LLM needed.
+
+ - Auto-detect TOC and organize content by chapters.
+ - Extract text and tables as Markdown.
+ - Embed raster images as base64.
+ - Render vector drawings as base64 PNG.
+ - Output one .md file via create_blob_message.
+ """
+
+ _TOC_PATTERNS = [
+ r"目录", r"目 录", r"目\u3000录",
+ r"Table of Contents", r"Contents", r"目次",
+ ]
+
+ # ── entry point ──────────────────────────────────────────
+
+ def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
+ file = tool_parameters.get("file")
+ if not file:
+ yield self.create_text_message("Error: file is required")
+ return
+
+ include_images = self._to_bool(tool_parameters.get("include_images"), True)
+ image_dpi = self._to_int(tool_parameters.get("image_dpi"), 150)
+ image_dpi = max(72, min(image_dpi, 300))
+ max_image_bytes = 2 * 1024 * 1024 # skip images > 2 MB raw
+
+ doc = fitz.open(stream=file.blob, filetype="pdf")
+ try:
+ num_pages = len(doc)
+
+ # 1) Build chapter map (metadata TOC → printed TOC → none)
+ chapters, content_offset = self._build_chapter_map(doc, num_pages)
+
+ # 2) Convert every page
+ page_mds: list[str] = []
+ for idx in range(num_pages):
+ md = self._page_to_markdown(
+ doc, doc[idx], idx,
+ include_images, image_dpi, max_image_bytes,
+ )
+ page_mds.append(md)
+
+ # 3) Assemble
+ if chapters:
+ final_md = self._assemble_by_chapters(
+ chapters, page_mds, content_offset, num_pages,
+ )
+ else:
+ final_md = "\n\n---\n\n".join(m for m in page_mds if m.strip())
+
+ # 4) Output: text (for variable aggregation) + blob (.md file)
+ yield self.create_text_message(final_md)
+ md_bytes = final_md.encode("utf-8")
+ yield self.create_blob_message(
+ blob=md_bytes,
+ meta={"mime_type": "text/markdown"},
+ )
+ finally:
+ doc.close()
+
+ # ── chapter detection ────────────────────────────────────
+
+ def _build_chapter_map(
+ self, doc: fitz.Document, num_pages: int,
+ ) -> tuple[dict, int]:
+ """Return (chapters_dict, content_offset).
+
+ Try embedded PDF TOC metadata first (reliable page mapping).
+ Fall back to scanning printed TOC pages.
+ """
+ toc = doc.get_toc()
+ if toc:
+ chapters = self._chapters_from_metadata(toc, num_pages)
+ if chapters:
+ return chapters, 0
+
+ toc_start, toc_end = self._find_toc_pages(doc, num_pages)
+ if toc_start is not None and toc_end is not None:
+ toc_text = "\n".join(
+ doc[i].get_text() or "" for i in range(toc_start, toc_end + 1)
+ )
+ chapters = self._parse_toc_lines(toc_text)
+ if chapters:
+ offset = self._guess_offset(chapters, toc_end)
+ return chapters, offset
+
+ return {}, 0
+
+ def _chapters_from_metadata(
+ self, toc: list, num_pages: int,
+ ) -> dict[str, dict[str, int]]:
+ top = [(t, max(0, p - 1)) for lvl, t, p in toc if lvl <= 2 and p >= 1]
+ if not top:
+ return {}
+ chapters: dict[str, dict[str, int]] = OrderedDict()
+ for i, (title, start) in enumerate(top):
+ end = top[i + 1][1] - 1 if i + 1 < len(top) else num_pages - 1
+ chapters[title] = {"start": start, "end": max(start, end)}
+ return chapters
+
+ def _find_toc_pages(self, doc, num_pages):
+ toc_start = toc_end = None
+ for pn in range(min(num_pages, 30)):
+ text = doc[pn].get_text() or ""
+ if any(re.search(p, text, re.IGNORECASE) for p in self._TOC_PATTERNS):
+ if toc_start is None:
+ toc_start = pn
+ toc_end = pn
+ elif toc_start is not None:
+ break
+ return toc_start, toc_end
+
+ def _parse_toc_lines(self, text: str) -> dict[str, dict[str, int]]:
+ m = re.search(
+ r"^(List\s+of\s+Figures|List\s+of\s+Tables|图目录|表目录)",
+ text, re.IGNORECASE | re.MULTILINE,
+ )
+ if m:
+ text = text[: m.start()]
+
+ pat = re.compile(
+            r"^\s*(?P<title>.+?)\s*(?:\.{2,}|\s)\s*(?P<page>\d{1,5})\s*$"
+ )
+ entries: list[tuple[str, int]] = []
+ for raw in text.splitlines():
+ line = raw.strip()
+ if not line or len(line) < 3 or re.fullmatch(r"\d+", line):
+ continue
+ m2 = pat.match(line)
+ if not m2:
+ continue
+ title = re.sub(r"\s+", " ", m2.group("title")).strip("-_:: ")
+ page = self._to_int(m2.group("page"), None)
+ if not title or page is None or len(title) <= 1:
+ continue
+ if title.lower() in {"page", "pages", "目录", "contents"}:
+ continue
+ entries.append((title, page))
+
+ if not entries:
+ return {}
+
+ dedup: OrderedDict[str, int] = OrderedDict()
+ for t, p in entries:
+ dedup.setdefault(t, p)
+
+ titles = list(dedup.keys())
+ pages = [dedup[t] for t in titles]
+ catalog: dict[str, dict[str, int]] = OrderedDict()
+ for i, t in enumerate(titles):
+ s = pages[i]
+ e = max(s, pages[i + 1] - 1) if i + 1 < len(pages) else s
+ catalog[t] = {"start": s, "end": e}
+ return catalog
+
+ @staticmethod
+ def _guess_offset(chapters: dict, toc_end: int) -> int:
+ first_page = None
+ for info in chapters.values():
+ s = info["start"]
+ if first_page is None or s < first_page:
+ first_page = s
+ if first_page is None:
+ return 0
+ return (toc_end + 1) - first_page
+
+ # ── per-page conversion ──────────────────────────────────
+
+ def _page_to_markdown(
+ self,
+ doc: fitz.Document,
+ page: fitz.Page,
+ page_idx: int,
+ include_images: bool,
+ image_dpi: int,
+ max_image_bytes: int,
+ ) -> str:
+ parts: list[str] = []
+
+ # ── text ──
+ text = (page.get_text("text", sort=True) or "").strip()
+ if text:
+ parts.append(text)
+
+ # ── tables → Markdown ──
+ try:
+ for tab in (page.find_tables().tables or [])[:5]:
+ cells = tab.extract() or []
+ if len(cells) >= 2:
+ md = self._cells_to_md_table(cells)
+ if md:
+ parts.append(md)
+ except Exception:
+ pass
+
+ if not include_images:
+ return "\n\n".join(parts)
+
+ # ── embedded raster images ──
+ try:
+ for img_idx, img_info in enumerate(page.get_images(full=True)):
+ xref = img_info[0]
+ try:
+ data = doc.extract_image(xref)
+ if not data or not data.get("image"):
+ continue
+ raw = data["image"]
+ if len(raw) > max_image_bytes:
+ continue
+ # skip tiny icons (< 20x20)
+ w = data.get("width", 0)
+ h = data.get("height", 0)
+ if w < 20 and h < 20:
+ continue
+ ext = data.get("ext", "png")
+ mime = "image/jpeg" if ext in ("jpg", "jpeg") else f"image/{ext}"
+ b64 = base64.b64encode(raw).decode("ascii")
+ parts.append(
+                        f'<img src="data:{mime};base64,{b64}" alt="page {page_idx + 1} image {img_idx + 1}" />'
+ )
+ except Exception:
+ pass
+ except Exception:
+ pass
+
+ # ── vector drawings → render as PNG ──
+ try:
+ drawings = page.get_drawings()
+ if len(drawings) >= 3:
+ valid_rects: list[fitz.Rect] = []
+ for d in drawings:
+ r = d.get("rect")
+ if r:
+ try:
+ rect = fitz.Rect(r)
+ if rect.is_valid and not rect.is_empty:
+ valid_rects.append(rect)
+ except Exception:
+ pass
+ if valid_rects:
+ bbox = valid_rects[0]
+ for r in valid_rects[1:]:
+ bbox |= r
+ bbox &= page.rect
+ if bbox.width > 30 and bbox.height > 30:
+ scale = image_dpi / 72
+ mat = fitz.Matrix(scale, scale)
+ pix = page.get_pixmap(matrix=mat, clip=bbox)
+ png = pix.tobytes("png")
+ if len(png) <= max_image_bytes:
+ b64 = base64.b64encode(png).decode("ascii")
+ parts.append(
+                            f'<img src="data:image/png;base64,{b64}" alt="page {page_idx + 1} drawings" />'
+ )
+ except Exception:
+ pass
+
+ return "\n\n".join(parts)
+
+ # ── assembly ─────────────────────────────────────────────
+
+ def _assemble_by_chapters(
+ self,
+ chapters: dict[str, dict[str, int]],
+ page_mds: list[str],
+ offset: int,
+ num_pages: int,
+ ) -> str:
+ parts: list[str] = []
+ for name, info in chapters.items():
+ s = info["start"] + offset
+ e = info["end"] + offset
+ s = max(0, min(s, num_pages - 1))
+ e = max(s, min(e, num_pages - 1))
+ ch: list[str] = [f"# {name}\n"]
+ for idx in range(s, e + 1):
+ if idx < len(page_mds) and page_mds[idx].strip():
+ ch.append(page_mds[idx])
+ parts.append("\n\n".join(ch))
+ return "\n\n---\n\n".join(parts)
+
+ # ── helpers ──────────────────────────────────────────────
+
+ @staticmethod
+ def _cells_to_md_table(cells: list) -> str:
+ if not cells:
+ return ""
+ header = cells[0]
+ ncols = len(header)
+ if ncols == 0:
+ return ""
+ clean = lambda c: str(c or "").replace("|", "\\|").replace("\n", " ").strip()
+ lines = [
+ "| " + " | ".join(clean(c) for c in header) + " |",
+ "| " + " | ".join("---" for _ in range(ncols)) + " |",
+ ]
+ for row in cells[1:]:
+ padded = list(row) + [""] * max(0, ncols - len(row))
+ lines.append("| " + " | ".join(clean(c) for c in padded[:ncols]) + " |")
+ return "\n".join(lines)
+
+ @staticmethod
+ def _to_int(value: Any, default: int | None) -> int | None:
+ try:
+ if value is None or value == "":
+ return default
+ return int(value)
+ except Exception:
+ return default
+
+ @staticmethod
+ def _to_bool(value: Any, default: bool) -> bool:
+ if value is None:
+ return default
+ if isinstance(value, bool):
+ return value
+ s = str(value).strip().lower()
+ if s in {"1", "true", "yes", "on"}:
+ return True
+ if s in {"0", "false", "no", "off"}:
+ return False
+ return default
diff --git a/difyPlugin/pdf/tools/pdf_to_markdown.yaml b/difyPlugin/pdf/tools/pdf_to_markdown.yaml
new file mode 100644
index 00000000..87505722
--- /dev/null
+++ b/difyPlugin/pdf/tools/pdf_to_markdown.yaml
@@ -0,0 +1,68 @@
+identity:
+ name: "pdf_to_markdown"
+ author: "yslg"
+ label:
+ en_US: "PDF to Markdown"
+ zh_Hans: "PDF转Markdown"
+ pt_BR: "PDF para Markdown"
+ ja_JP: "PDFからMarkdown"
+description:
+ human:
+ en_US: "Convert PDF to a single Markdown file with embedded base64 images. No LLM needed."
+ zh_Hans: "将PDF转换为单个Markdown文件,图片以base64嵌入,无需大模型"
+ pt_BR: "Converter PDF em um arquivo Markdown com imagens base64 incorporadas. Sem LLM."
+ ja_JP: "PDFをbase64画像埋め込みの単一Markdownファイルに変換。LLM不要。"
+ llm: "Convert a PDF file into a single Markdown (.md) file. Extracts text, tables, images (base64), and vector drawings. Auto-detects TOC and organizes by chapters. No LLM needed."
+parameters:
+ - name: file
+ type: file
+ required: true
+ label:
+ en_US: PDF File
+ zh_Hans: PDF 文件
+ pt_BR: Arquivo PDF
+ ja_JP: PDFファイル
+ human_description:
+ en_US: "PDF file to convert"
+ zh_Hans: "要转换的 PDF 文件"
+ pt_BR: "Arquivo PDF para converter"
+ ja_JP: "変換するPDFファイル"
+ llm_description: "PDF file to convert to Markdown"
+ form: llm
+ fileTypes:
+ - "pdf"
+ - name: include_images
+ type: boolean
+ required: false
+ label:
+ en_US: Include Images
+ zh_Hans: 包含图片
+ pt_BR: Incluir Imagens
+ ja_JP: 画像を含める
+ human_description:
+ en_US: "Whether to embed images as base64 in the Markdown output (default: true)"
+ zh_Hans: "是否将图片以base64嵌入Markdown输出(默认:是)"
+ pt_BR: "Se deve incorporar imagens como base64 na saída Markdown (padrão: verdadeiro)"
+ ja_JP: "Markdown出力にbase64として画像を埋め込むかどうか(デフォルト:はい)"
+ llm_description: "Set to true to embed images as base64, false to skip images"
+ form: form
+ default: true
+ - name: image_dpi
+ type: number
+ required: false
+ label:
+ en_US: Image DPI
+ zh_Hans: 图片DPI
+ pt_BR: DPI da Imagem
+ ja_JP: 画像DPI
+ human_description:
+ en_US: "DPI for rendering vector drawings (72-300, default: 150)"
+ zh_Hans: "矢量图渲染DPI(72-300,默认150)"
+ pt_BR: "DPI para renderizar desenhos vetoriais (72-300, padrão: 150)"
+ ja_JP: "ベクター描画のレンダリングDPI(72-300、デフォルト:150)"
+ llm_description: "Resolution for rendering vector drawings as images. Range 72-300, default 150."
+ form: form
+ default: 150
+extra:
+ python:
+ source: tools/pdf_to_markdown.py
diff --git a/difyPlugin/pdf/tools/pdf_toc.py b/difyPlugin/pdf/tools/pdf_toc.py
new file mode 100644
index 00000000..a96b86b1
--- /dev/null
+++ b/difyPlugin/pdf/tools/pdf_toc.py
@@ -0,0 +1,273 @@
+import json
+import re
+from collections import OrderedDict
+from collections.abc import Generator
+from typing import Any
+
+from dify_plugin import Tool
+from dify_plugin.entities.model.llm import LLMModelConfig
+from dify_plugin.entities.model.message import SystemPromptMessage, UserPromptMessage
+from dify_plugin.entities.tool import ToolInvokeMessage
+
+_SYSTEM_PROMPT = """You parse PDF table-of-contents text.
+Return only valid JSON object, no markdown fences, no explanation.
+Output schema:
+{
+ "Chapter Name": {"start": 1, "end": 5},
+ "Another": {"start": 6, "end": 20}
+}
+Rules:
+- start/end are integer printed page numbers from TOC.
+- If end is unknown, use same value as start.
+- Keep chapter names exactly as in TOC text.
+"""
+
+
+class PdfTocTool(Tool):
+ def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
+ toc_start = self._to_int(tool_parameters.get("toc_start"), None)
+ toc_end = self._to_int(tool_parameters.get("toc_end"), None)
+ toc_pages = (tool_parameters.get("toc_pages") or "").strip()
+ model_config = tool_parameters.get("model")
+
+ if toc_start is None or toc_end is None:
+ yield self.create_text_message("Error: toc_start and toc_end are required")
+ return
+
+ if not toc_pages:
+ yield self.create_text_message("Error: toc_pages text is empty")
+ return
+
+ cleaned = self._strip_index_lists(toc_pages)
+
+ # 1) deterministic parser first
+ catalog = self._parse_toc_lines(cleaned)
+
+ # 2) optional LLM fallback/enhance only when deterministic parser gives no result
+ llm_raw_output = ""
+ llm_error = None
+ if not catalog and model_config:
+ llm_catalog, llm_raw_output, llm_error = self._parse_with_llm(
+ toc_start=toc_start,
+ toc_end=toc_end,
+ toc_pages=cleaned,
+ model_config=model_config,
+ )
+ if llm_catalog:
+ catalog = self._normalize_catalog(llm_catalog)
+
+ result: dict[str, Any] = {
+ "toc_start": toc_start,
+ "toc_end": toc_end,
+ "catalog": catalog,
+ "meta": {
+ "catalog_size": len(catalog),
+ "parser": "rule" if catalog else "none",
+ },
+ }
+
+ if llm_raw_output:
+ result["meta"]["llm_used"] = True
+ if llm_error:
+ result["meta"]["llm_error"] = llm_error
+
+ # always return valid json text payload for downstream json.loads
+ yield self.create_text_message(json.dumps(result, ensure_ascii=False))
+ yield self.create_json_message(result)
+
+ def _parse_with_llm(
+ self,
+ toc_start: int,
+ toc_end: int,
+ toc_pages: str,
+ model_config: dict[str, Any],
+ ) -> tuple[dict[str, Any] | None, str, str | None]:
+ user_content = (
+ f"TOC page index range: {toc_start}..{toc_end}\n\n"
+ f"TOC raw text:\n{toc_pages}"
+ )
+ response = self.session.model.llm.invoke(
+ model_config=LLMModelConfig(**model_config),
+ prompt_messages=[
+ SystemPromptMessage(content=_SYSTEM_PROMPT),
+ UserPromptMessage(content=user_content),
+ ],
+ stream=False,
+ )
+
+ llm_text = ""
+ if hasattr(response, "message") and response.message:
+ content = response.message.content
+ if isinstance(content, str):
+ llm_text = content
+ elif isinstance(content, list):
+ llm_text = "".join(
+ item.data if hasattr(item, "data") else str(item) for item in content
+ )
+
+ parsed = self._extract_json_object(llm_text)
+ if parsed is None:
+ return None, llm_text, "Failed to parse LLM output as JSON"
+ if not isinstance(parsed, dict):
+ return None, llm_text, "LLM output JSON is not an object"
+
+ return parsed, llm_text, None
+
+ @staticmethod
+ def _strip_index_lists(text: str) -> str:
+ # Stop before common appendix lists that pollute TOC parsing.
+ pattern = re.compile(
+ r"^(List\s+of\s+Figures|List\s+of\s+Tables|图目录|表目录)",
+ re.IGNORECASE | re.MULTILINE,
+ )
+ m = pattern.search(text)
+ return text[: m.start()].rstrip() if m else text
+
+ def _parse_toc_lines(self, text: str) -> dict[str, dict[str, int]]:
+ """Parse lines like:
+ 1.2 Engine Overview ........ 35
+ Appendix A 120
+ """
+ line_pattern = re.compile(
+ r"^\s*(?P.+?)\s*(?:\.{2,}|\s)\s*(?P\d{1,5})\s*$"
+ )
+
+ entries: list[tuple[str, int]] = []
+ for raw in text.splitlines():
+ line = raw.strip()
+ if not line or len(line) < 3:
+ continue
+ if re.fullmatch(r"\d+", line):
+ continue
+
+ m = line_pattern.match(line)
+ if not m:
+ continue
+
+ title = re.sub(r"\s+", " ", m.group("title")).strip("-_:: ")
+ page = self._to_int(m.group("page"), None)
+ if not title or page is None:
+ continue
+
+ # Skip obvious noise.
+ if len(title) <= 1 or title.lower() in {"page", "pages", "目录", "contents"}:
+ continue
+
+ entries.append((title, page))
+
+ if not entries:
+ return {}
+
+ # Deduplicate keeping earliest appearance.
+ dedup: OrderedDict[str, int] = OrderedDict()
+ for title, page in entries:
+ if title not in dedup:
+ dedup[title] = page
+
+ titles = list(dedup.keys())
+ pages = [dedup[t] for t in titles]
+
+ catalog: dict[str, dict[str, int]] = {}
+ for i, title in enumerate(titles):
+ start = pages[i]
+ if i + 1 < len(pages):
+ next_start = pages[i + 1]
+ end = max(start, next_start - 1)
+ else:
+ end = start
+ catalog[title] = {"start": int(start), "end": int(end)}
+
+ return catalog
+
+ def _normalize_catalog(self, raw: dict[str, Any]) -> dict[str, dict[str, int]]:
+ catalog: dict[str, dict[str, int]] = {}
+ source = raw.get("catalog") if isinstance(raw.get("catalog"), dict) else raw
+ if not isinstance(source, dict):
+ return catalog
+
+ for name, value in source.items():
+ if not isinstance(name, str) or not isinstance(value, dict):
+ continue
+ start = self._to_int(value.get("start"), None)
+ end = self._to_int(value.get("end"), start)
+ if start is None:
+ continue
+ if end is None:
+ end = start
+ catalog[name] = {"start": int(start), "end": int(max(start, end))}
+ return catalog
+
+ @staticmethod
+ def _extract_json_object(text: str) -> Any:
+ if not text:
+ return None
+
+ candidates: list[str] = []
+
+ code_blocks = re.findall(r"```(?:json)?\s*([\s\S]*?)\s*```", text, flags=re.IGNORECASE)
+ candidates.extend([c.strip() for c in code_blocks if c.strip()])
+
+ brace_candidate = PdfTocTool._extract_first_brace_object(text)
+ if brace_candidate:
+ candidates.append(brace_candidate)
+
+ candidates.append(text.strip())
+
+ for cand in candidates:
+ parsed = PdfTocTool._json_try_parse(cand)
+ if parsed is not None:
+ return parsed
+ return None
+
+ @staticmethod
+ def _extract_first_brace_object(text: str) -> str | None:
+ start = text.find("{")
+ if start < 0:
+ return None
+
+ depth = 0
+ in_str = False
+ escape = False
+ for i in range(start, len(text)):
+ ch = text[i]
+ if in_str:
+ if escape:
+ escape = False
+ elif ch == "\\":
+ escape = True
+ elif ch == '"':
+ in_str = False
+ continue
+
+ if ch == '"':
+ in_str = True
+ elif ch == "{":
+ depth += 1
+ elif ch == "}":
+ depth -= 1
+ if depth == 0:
+ return text[start : i + 1]
+ return None
+
+ @staticmethod
+ def _json_try_parse(text: str) -> Any:
+ try:
+ return json.loads(text)
+ except Exception:
+ pass
+
+ # Minimal repair: remove trailing commas before } or ]
+ repaired = re.sub(r",\s*([}\]])", r"\1", text)
+ try:
+ return json.loads(repaired)
+ except Exception:
+ return None
+
+ @staticmethod
+ def _to_int(value: Any, default: int | None) -> int | None:
+ try:
+ if value is None or value == "":
+ return default
+ return int(value)
+ except Exception:
+ return default
diff --git a/difyPlugin/pdf/tools/pdf_toc.yaml b/difyPlugin/pdf/tools/pdf_toc.yaml
new file mode 100644
index 00000000..d938c681
--- /dev/null
+++ b/difyPlugin/pdf/tools/pdf_toc.yaml
@@ -0,0 +1,79 @@
+identity:
+ name: "pdf_toc"
+ author: "yslg"
+ label:
+ en_US: "PDF TOC Parser"
+ zh_Hans: "PDF目录解析"
+ pt_BR: "Analisador de Sumário PDF"
+ ja_JP: "PDF目次解析"
+description:
+ human:
+ en_US: "Parse PDF table-of-contents text (from pdf_column_range) into structured JSON catalog via LLM"
+ zh_Hans: "通过LLM将PDF目录文本(来自目录页提取工具的输出)解析为结构化JSON目录"
+ pt_BR: "Analisar texto do sumário PDF em catálogo JSON estruturado via LLM"
+ ja_JP: "LLMを使用してPDF目次テキストを構造化JSONカタログに解析"
+ llm: "Parse PDF table-of-contents text into structured JSON with chapter names and page ranges. Input is the output of pdf_column_range tool (start/end/pages)."
+parameters:
+ - name: toc_start
+ type: number
+ required: true
+ label:
+ en_US: TOC Start Page
+ zh_Hans: 目录起始页
+ pt_BR: Página Inicial do Sumário
+ ja_JP: 目次開始ページ
+ human_description:
+ en_US: "Start page index of TOC (from pdf_column_range output)"
+ zh_Hans: "目录起始页码(来自目录页提取工具输出的 start)"
+ pt_BR: "Índice da página inicial do sumário"
+ ja_JP: "目次の開始ページ番号"
+ llm_description: "Start page index of TOC section, from pdf_column_range output field 'start'"
+ form: llm
+ - name: toc_end
+ type: number
+ required: true
+ label:
+ en_US: TOC End Page
+ zh_Hans: 目录结束页
+ pt_BR: Página Final do Sumário
+ ja_JP: 目次終了ページ
+ human_description:
+ en_US: "End page index of TOC (from pdf_column_range output)"
+ zh_Hans: "目录结束页码(来自目录页提取工具输出的 end)"
+ pt_BR: "Índice da página final do sumário"
+ ja_JP: "目次の終了ページ番号"
+ llm_description: "End page index of TOC section, from pdf_column_range output field 'end'"
+ form: llm
+ - name: toc_pages
+ type: string
+ required: true
+ label:
+ en_US: TOC Page Text
+ zh_Hans: 目录页文本
+ pt_BR: Texto das Páginas do Sumário
+ ja_JP: 目次ページテキスト
+ human_description:
+ en_US: "Raw text content of TOC pages (from pdf_column_range output 'pages' array, joined)"
+ zh_Hans: "目录页原始文本内容(来自目录页提取工具输出的 pages 数组)"
+ pt_BR: "Conteúdo de texto bruto das páginas do sumário"
+ ja_JP: "目次ページの生テキスト内容"
+ llm_description: "Raw text content extracted from TOC pages, from pdf_column_range output field 'pages'"
+ form: llm
+ - name: model
+ type: model-selector
+ scope: llm
+ required: true
+ label:
+ en_US: LLM Model
+ zh_Hans: LLM 模型
+ pt_BR: Modelo LLM
+ ja_JP: LLMモデル
+ human_description:
+ en_US: "LLM model for parsing TOC into structured JSON"
+ zh_Hans: "用于解析目录的 LLM 模型"
+ pt_BR: "Modelo LLM para análise do sumário"
+ ja_JP: "目次解析用のLLMモデル"
+ form: form
+extra:
+ python:
+ source: tools/pdf_toc.py