更新

2026-03-15 13:00:30 +08:00
parent 91ff28bdcf
commit 136ddc270c
15 changed files with 1459 additions and 1276 deletions
--- a/difyPlugin/pdf/tools/pdf_toc.yaml
+++ b/difyPlugin/pdf/tools/pdf_toc.yaml
@@ -2,63 +2,35 @@ identity:
  name: "pdf_toc"
  author: "yslg"
  label:
-    en_US: "PDF TOC Parser"
-    zh_Hans: "PDF目录解析"
-    pt_BR: "Analisador de Sumário PDF"
-    ja_JP: "PDF目次解析"
+    en_US: "PDF TOC"
+    zh_Hans: "PDF 目录提取"
+    pt_BR: "Sumário do PDF"
+    ja_JP: "PDF目次"
 description:
  human:
-    en_US: "Parse PDF table-of-contents text (from pdf_column_range) into structured JSON catalog via LLM"
-    zh_Hans: "通过LLM将PDF目录文本（来自目录页提取工具的输出）解析为结构化JSON目录"
-    pt_BR: "Analisar texto do sumário PDF em catálogo JSON estruturado via LLM"
-    ja_JP: "LLMを使用してPDF目次テキストを構造化JSONカタログに解析"
-  llm: "Parse PDF table-of-contents text into structured JSON with chapter names and page ranges. Input is the output of pdf_column_range tool (start/end/pages)."
+    en_US: "Extract the catalog array from a PDF file using metadata or LLM."
+    zh_Hans: "从PDF文件中提取目录数组，优先使用元数据，回退使用LLM解析。"
+    pt_BR: "Extrair o array de catálogo de um arquivo PDF usando metadados ou LLM."
+    ja_JP: "メタデータまたはLLMを使用してPDFファイルからカタログ配列を抽出する。"
+  llm: "Extract a catalog array from a PDF file. Returns JSON text like [{title,start,end,page_start_index,page_end_index}]."
 parameters:
-  - name: toc_start
-    type: number
+  - name: file
+    type: file
    required: true
    label:
-      en_US: TOC Start Page
-      zh_Hans: 目录起始页
-      pt_BR: Página Inicial do Sumário
-      ja_JP: 目次開始ページ
+      en_US: PDF File
+      zh_Hans: PDF 文件
+      pt_BR: Arquivo PDF
+      ja_JP: PDFファイル
    human_description:
-      en_US: "Start page index of TOC (from pdf_column_range output)"
-      zh_Hans: "目录起始页码（来自目录页提取工具输出的 start）"
-      pt_BR: "Índice da página inicial do sumário"
-      ja_JP: "目次の開始ページ番号"
-    llm_description: "Start page index of TOC section, from pdf_column_range output field 'start'"
-    form: llm
-  - name: toc_end
-    type: number
-    required: true
-    label:
-      en_US: TOC End Page
-      zh_Hans: 目录结束页
-      pt_BR: Página Final do Sumário
-      ja_JP: 目次終了ページ
-    human_description:
-      en_US: "End page index of TOC (from pdf_column_range output)"
-      zh_Hans: "目录结束页码（来自目录页提取工具输出的 end）"
-      pt_BR: "Índice da página final do sumário"
-      ja_JP: "目次の終了ページ番号"
-    llm_description: "End page index of TOC section, from pdf_column_range output field 'end'"
-    form: llm
-  - name: toc_pages
-    type: string
-    required: true
-    label:
-      en_US: TOC Page Text
-      zh_Hans: 目录页文本
-      pt_BR: Texto das Páginas do Sumário
-      ja_JP: 目次ページテキスト
-    human_description:
-      en_US: "Raw text content of TOC pages (from pdf_column_range output 'pages' array, joined)"
-      zh_Hans: "目录页原始文本内容（来自目录页提取工具输出的 pages 数组）"
-      pt_BR: "Conteúdo de texto bruto das páginas do sumário"
-      ja_JP: "目次ページの生テキスト内容"
-    llm_description: "Raw text content extracted from TOC pages, from pdf_column_range output field 'pages'"
+      en_US: "PDF file to inspect"
+      zh_Hans: "要解析的PDF文件"
+      pt_BR: "Arquivo PDF a ser analisado"
+      ja_JP: "解析するPDFファイル"
+    llm_description: "PDF file to extract catalog from"
    form: llm
+    fileTypes:
+      - "pdf"
  - name: model
    type: model-selector
    scope: llm
@@ -69,10 +41,10 @@ parameters:
      pt_BR: Modelo LLM
      ja_JP: LLMモデル
    human_description:
-      en_US: "LLM model for parsing TOC into structured JSON"
-      zh_Hans: "用于解析目录的 LLM 模型"
-      pt_BR: "Modelo LLM para análise do sumário"
-      ja_JP: "目次解析用のLLMモデル"
+      en_US: "LLM model used for parsing TOC when metadata is unavailable"
+      zh_Hans: "当元数据不可用时，用于解析目录的LLM模型"
+      pt_BR: "Modelo LLM usado para analisar o sumário quando os metadados não estão disponíveis"
+      ja_JP: "メタデータが利用できない場合のTOC解析用LLMモデル"
    form: form
 extra:
  python: