This commit is contained in:
2026-03-02 17:12:17 +08:00
parent b30af4aff8
commit 843146cdd7
2489 changed files with 7434 additions and 61841 deletions

View File

@@ -0,0 +1,61 @@
import re
from collections.abc import Generator
from io import BytesIO
from typing import Any
import PyPDF2
from dify_plugin import Tool
from dify_plugin.entities.tool import ToolInvokeMessage
class PdfTool(Tool):
    """Locate the table-of-contents (TOC) pages of a PDF and return their text.

    A page counts as a TOC page when its extracted text contains one of the
    known headings (Chinese, English or Japanese), matched case-insensitively.
    """

    def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
        """Yield the TOC page range and the text of every page inside it.

        Args:
            tool_parameters: Tool arguments. ``file`` must be an object that
                exposes the PDF content as bytes through ``.blob``.

        Yields:
            A text message ``"Error: file is required"`` when no file is
            supplied. Otherwise a single JSON message with the keys
            ``start`` and ``end`` (zero-based page indexes, both ``None``
            when no TOC page is found) and ``pages`` (the extracted text of
            each page in that range, empty when no TOC page is found).
        """
        file = tool_parameters.get("file")
        if not file:
            yield self.create_text_message("Error: file is required")
            return

        # file.blob returns bytes
        reader = PyPDF2.PdfReader(BytesIO(file.blob))

        # One compiled alternation, built once, instead of searching four
        # separate patterns on every page.
        toc_heading = re.compile(
            r"目录|Table of Contents|Contents|目次",
            re.IGNORECASE,
        )

        toc_start = None
        toc_end = None
        toc_pages = []
        for page_num, page in enumerate(reader.pages):
            text = page.extract_text() or ""
            if toc_heading.search(text):
                if toc_start is None:
                    toc_start = page_num
                toc_end = page_num
                # Keep the text now so TOC pages are not extracted a second
                # time after the scan; matched pages are always contiguous
                # because the scan stops at the first miss after a hit.
                toc_pages.append(text)
            elif toc_start is not None:
                # First page without a heading after the TOC began: done.
                break

        # With no match this is {"start": None, "end": None, "pages": []}.
        yield self.create_json_message({
            "start": toc_start,
            "end": toc_end,
            "pages": toc_pages,
        })

View File

@@ -0,0 +1,36 @@
# Manifest for the "pdf" tool: identity, descriptions, input parameters and
# the Python source file that implements it.
identity:
name: "pdf"
author: "yslg"
# Localized display labels, one entry per locale.
label:
en_US: "Extract TOC Pages and Content"
zh_Hans: "提取目录页和内容"
pt_BR: "Extrair páginas de sumário e conteúdo"
ja_JP: "目次ページと内容を抽出"
description:
# `human` holds one description per locale; `llm` is a single English string
# (presumably the text given to the model -- confirm against the plugin spec).
human:
en_US: "Extract table-of-contents page range and all page text in that range"
zh_Hans: "提取目录页范围以及该范围内所有页文本"
pt_BR: "Extrair intervalo de páginas de sumário e todo o texto nesse intervalo"
ja_JP: "目次ページ範囲とその範囲内の全ページテキストを抽出"
llm: "Extract table-of-contents page range and all page text in that range"
parameters:
# The only parameter: a required PDF file.
- name: file
type: file
required: true
label:
en_US: PDF File
zh_Hans: PDF 文件
pt_BR: Arquivo PDF
ja_JP: PDFファイル
human_description:
en_US: "PDF file to process"
zh_Hans: "要处理的 PDF 文件"
pt_BR: "Arquivo PDF para processar"
ja_JP: "処理するPDFファイル"
# This description also names the keys of the tool's output (start/end/pages).
llm_description: "PDF file to process, output contains start/end/pages"
form: llm
fileTypes:
- "pdf"
extra:
python:
# Path of the Python file associated with this tool.
source: tools/pdf.py

View File

@@ -0,0 +1,36 @@
from collections.abc import Generator
from io import BytesIO
from typing import Any
import PyPDF2
from dify_plugin import Tool
from dify_plugin.entities.tool import ToolInvokeMessage
class PdfSinglePageTool(Tool):
    """Extract the text of one page of a PDF, addressed by zero-based index."""

    def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
        """Yield the text of the requested page.

        Args:
            tool_parameters: Tool arguments. ``file`` must be an object that
                exposes the PDF content as bytes through ``.blob``. ``page``
                is the zero-based page index; it defaults to 0 when missing
                or ``None`` and is clamped into the document's page range.

        Yields:
            A text message starting with ``"Error:"`` when the file is
            missing, ``page`` cannot be converted to an integer, or the PDF
            contains no pages. Otherwise a single JSON message with
            ``start`` and ``end`` both set to the page index actually used
            and ``pages`` holding that page's text as a one-element list.
        """
        file = tool_parameters.get("file")
        if not file:
            yield self.create_text_message("Error: file is required")
            return

        # An explicit None is treated like an omitted value rather than
        # being passed to int(), which would raise TypeError.
        raw_page = tool_parameters.get("page")
        if raw_page is None:
            raw_page = 0
        try:
            page_index = int(raw_page)
        except (TypeError, ValueError, OverflowError):
            yield self.create_text_message("Error: page must be an integer")
            return

        reader = PyPDF2.PdfReader(BytesIO(file.blob))
        num_pages = len(reader.pages)
        if num_pages == 0:
            # Clamping against an empty document would give index -1 and an
            # IndexError on lookup, so report the condition instead.
            yield self.create_text_message("Error: PDF has no pages")
            return

        # Clamp out-of-range requests to the first / last page.
        page_index = max(0, min(page_index, num_pages - 1))
        text = reader.pages[page_index].extract_text() or ""

        yield self.create_json_message({
            "start": page_index,
            "end": page_index,
            "pages": [text],
        })

View File

@@ -0,0 +1,52 @@
# Manifest for the "pdf_single_page" tool: identity, descriptions, input
# parameters and the Python source file that implements it.
identity:
name: "pdf_single_page"
author: "yslg"
# Localized display labels, one entry per locale.
label:
en_US: "Extract Single-Page Text"
zh_Hans: "提取单页文字"
pt_BR: "Extrair texto de página única"
ja_JP: "単一ページのテキストを抽出"
description:
# `human` holds one description per locale; `llm` is a single English string
# (presumably the text given to the model -- confirm against the plugin spec).
human:
en_US: "Extract text from one specified page"
zh_Hans: "提取指定单页文字"
pt_BR: "Extrair texto de uma página especificada"
ja_JP: "指定した1ページのテキストを抽出"
llm: "Extract text from one specified page"
parameters:
# First parameter: the required PDF file.
- name: file
type: file
required: true
label:
en_US: PDF File
zh_Hans: PDF 文件
pt_BR: Arquivo PDF
ja_JP: PDFファイル
human_description:
en_US: "PDF file to process"
zh_Hans: "要处理的 PDF 文件"
pt_BR: "Arquivo PDF para processar"
ja_JP: "処理するPDFファイル"
llm_description: "PDF file to process"
form: llm
fileTypes:
- "pdf"
# Second parameter: which page to extract, as a number with default 0.
# NOTE(review): en_US/pt_BR call this an "index" while zh_Hans/ja_JP say
# "page number"; the default of 0 suggests a zero-based index -- confirm and
# consider stating that explicitly in the descriptions.
- name: page
type: number
required: true
label:
en_US: Page Index
zh_Hans: 页码
pt_BR: Índice da Página
ja_JP: ページ番号
human_description:
en_US: "Single page index to extract"
zh_Hans: "要提取的单页页码"
pt_BR: "Índice da página única para extrair"
ja_JP: "抽出対象のページ番号"
llm_description: "Single page index to extract"
form: llm
default: 0
extra:
python:
# Path of the Python file associated with this tool.
source: tools/pdf_single_page.py