插件

2026-03-02 17:12:17 +08:00
parent b30af4aff8
commit 843146cdd7
2489 changed files with 7434 additions and 61841 deletions
--- a/difyPlugin/pdf/tools/pdf.py
+++ b/difyPlugin/pdf/tools/pdf.py
@@ -0,0 +1,61 @@
+import re
+from collections.abc import Generator
+from io import BytesIO
+from typing import Any
+
+import PyPDF2
+from dify_plugin import Tool
+from dify_plugin.entities.tool import ToolInvokeMessage
+
+
+class PdfTool(Tool):
+    def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
+        file = tool_parameters.get("file")
+        if not file:
+            yield self.create_text_message("Error: file is required")
+            return
+
+        # file.blob returns bytes
+        pdf_bytes = file.blob
+        reader = PyPDF2.PdfReader(BytesIO(pdf_bytes))
+        num_pages = len(reader.pages)
+
+        toc_start = None
+        toc_end = None
+
+        toc_patterns = [
+            r'目录',
+            r'Table of Contents',
+            r'Contents',
+            r'目次'
+        ]
+
+        for page_num in range(num_pages):
+            page = reader.pages[page_num]
+            text = page.extract_text() or ""
+
+            if any(re.search(pattern, text, re.IGNORECASE) for pattern in toc_patterns):
+                if toc_start is None:
+                    toc_start = page_num
+                toc_end = page_num
+            elif toc_start is not None and toc_end is not None:
+                break
+
+        if toc_start is None:
+            yield self.create_json_message({
+                "start": None,
+                "end": None,
+                "pages": []
+            })
+            return
+
+        toc_pages = []
+        for page_num in range(toc_start, toc_end + 1):
+            page = reader.pages[page_num]
+            toc_pages.append(page.extract_text() or "")
+
+        yield self.create_json_message({
+            "start": toc_start,
+            "end": toc_end,
+            "pages": toc_pages
+        })
--- a/difyPlugin/pdf/tools/pdf.yaml
+++ b/difyPlugin/pdf/tools/pdf.yaml
@@ -0,0 +1,36 @@
+identity:
+  name: "pdf"
+  author: "yslg"
+  label:
+    en_US: "Extract TOC Pages and Content"
+    zh_Hans: "提取目录页和内容"
+    pt_BR: "Extrair páginas de sumário e conteúdo"
+    ja_JP: "目次ページと内容を抽出"
+description:
+  human:
+    en_US: "Extract table-of-contents page range and all page text in that range"
+    zh_Hans: "提取目录页范围以及该范围内所有页文本"
+    pt_BR: "Extrair intervalo de páginas de sumário e todo o texto nesse intervalo"
+    ja_JP: "目次ページ範囲とその範囲内の全ページテキストを抽出"
+  llm: "Extract table-of-contents page range and all page text in that range"
+parameters:
+  - name: file
+    type: file
+    required: true
+    label:
+      en_US: PDF File
+      zh_Hans: PDF 文件
+      pt_BR: Arquivo PDF
+      ja_JP: PDFファイル
+    human_description:
+      en_US: "PDF file to process"
+      zh_Hans: "要处理的 PDF 文件"
+      pt_BR: "Arquivo PDF para processar"
+      ja_JP: "処理するPDFファイル"
+    llm_description: "PDF file to process, output contains start/end/pages"
+    form: llm
+    fileTypes:
+      - "pdf"
+extra:
+  python:
+    source: tools/pdf.py
--- a/difyPlugin/pdf/tools/pdf_single_page.py
+++ b/difyPlugin/pdf/tools/pdf_single_page.py
@@ -0,0 +1,36 @@
+from collections.abc import Generator
+from io import BytesIO
+from typing import Any
+
+import PyPDF2
+from dify_plugin import Tool
+from dify_plugin.entities.tool import ToolInvokeMessage
+
+
+class PdfSinglePageTool(Tool):
+    def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
+        file = tool_parameters.get("file")
+        page = tool_parameters.get("page", 0)
+
+        if not file:
+            yield self.create_text_message("Error: file is required")
+            return
+
+        pdf_bytes = file.blob
+        reader = PyPDF2.PdfReader(BytesIO(pdf_bytes))
+        num_pages = len(reader.pages)
+
+        page_index = int(page)
+        if page_index < 0:
+            page_index = 0
+        if page_index >= num_pages:
+            page_index = num_pages - 1
+
+        selected_page = reader.pages[page_index]
+        text = selected_page.extract_text() or ""
+
+        yield self.create_json_message({
+            "start": page_index,
+            "end": page_index,
+            "pages": [text]
+        })
--- a/difyPlugin/pdf/tools/pdf_single_page.yaml
+++ b/difyPlugin/pdf/tools/pdf_single_page.yaml
@@ -0,0 +1,52 @@
+identity:
+  name: "pdf_single_page"
+  author: "yslg"
+  label:
+    en_US: "Extract Single-Page Text"
+    zh_Hans: "提取单页文字"
+    pt_BR: "Extrair texto de página única"
+    ja_JP: "単一ページのテキストを抽出"
+description:
+  human:
+    en_US: "Extract text from one specified page"
+    zh_Hans: "提取指定单页文字"
+    pt_BR: "Extrair texto de uma página especificada"
+    ja_JP: "指定した1ページのテキストを抽出"
+  llm: "Extract text from one specified page"
+parameters:
+  - name: file
+    type: file
+    required: true
+    label:
+      en_US: PDF File
+      zh_Hans: PDF 文件
+      pt_BR: Arquivo PDF
+      ja_JP: PDFファイル
+    human_description:
+      en_US: "PDF file to process"
+      zh_Hans: "要处理的 PDF 文件"
+      pt_BR: "Arquivo PDF para processar"
+      ja_JP: "処理するPDFファイル"
+    llm_description: "PDF file to process"
+    form: llm
+    fileTypes:
+      - "pdf"
+  - name: page
+    type: number
+    required: true
+    label:
+      en_US: Page Index
+      zh_Hans: 页码
+      pt_BR: Índice da Página
+      ja_JP: ページ番号
+    human_description:
+      en_US: "Zero-based index of the single page to extract (0 is the first page)"
+      zh_Hans: "要提取的单页页码"
+      pt_BR: "Índice da página única para extrair"
+      ja_JP: "抽出対象のページ番号"
+    llm_description: "Zero-based index of the single page to extract (0 is the first page); out-of-range values are clamped to the first or last page"
+    form: llm
+    default: 0
+extra:
+  python:
+    source: tools/pdf_single_page.py