插件
This commit is contained in:
61
difyPlugin/pdf/tools/pdf.py
Normal file
61
difyPlugin/pdf/tools/pdf.py
Normal file
@@ -0,0 +1,61 @@
|
||||
import re
|
||||
from collections.abc import Generator
|
||||
from io import BytesIO
|
||||
from typing import Any
|
||||
|
||||
import PyPDF2
|
||||
from dify_plugin import Tool
|
||||
from dify_plugin.entities.tool import ToolInvokeMessage
|
||||
|
||||
|
||||
class PdfTool(Tool):
    """Dify tool that finds the table-of-contents pages of a PDF.

    A page counts as a TOC page when its extracted text contains one of the
    heading keywords in ``_TOC_PATTERN``. The reported range is the first
    run of consecutive matching pages.
    """

    # One case-insensitive alternation over all heading keywords, compiled
    # once instead of running four separate searches on every page.
    _TOC_PATTERN = re.compile(r'目录|Table of Contents|Contents|目次', re.IGNORECASE)

    def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
        """Yield the TOC page range and the text of each page in that range.

        Args:
            tool_parameters: Tool arguments; ``"file"`` must hold the PDF
                file object, whose ``blob`` attribute is read as the raw
                PDF content.

        Yields:
            A text message ``"Error: file is required"`` when no file is
            given; otherwise a single JSON message with keys ``start`` and
            ``end`` (zero-based page indices, ``None`` when no TOC page is
            found) and ``pages`` (extracted text of each page in the range).
        """
        file = tool_parameters.get("file")
        if not file:
            yield self.create_text_message("Error: file is required")
            return

        # file.blob returns bytes
        reader = PyPDF2.PdfReader(BytesIO(file.blob))

        toc_start = None
        toc_pages = []
        for page_num, page in enumerate(reader.pages):
            # extract_text() may return None; normalise to an empty string.
            text = page.extract_text() or ""
            if self._TOC_PATTERN.search(text):
                if toc_start is None:
                    toc_start = page_num
                # Keep the text now so TOC pages are not extracted a second
                # time when building the result.
                toc_pages.append(text)
            elif toc_start is not None:
                # First non-matching page after the TOC run ends the range.
                break

        if toc_start is None:
            yield self.create_json_message({
                "start": None,
                "end": None,
                "pages": []
            })
            return

        yield self.create_json_message({
            "start": toc_start,
            # The matched pages are consecutive, so the end index follows
            # from the start index and the number of collected pages.
            "end": toc_start + len(toc_pages) - 1,
            "pages": toc_pages
        })
|
||||
36
difyPlugin/pdf/tools/pdf.yaml
Normal file
36
difyPlugin/pdf/tools/pdf.yaml
Normal file
@@ -0,0 +1,36 @@
|
||||
identity:
|
||||
name: "pdf"
|
||||
author: "yslg"
|
||||
label:
|
||||
en_US: "Extract TOC Pages and Content"
|
||||
zh_Hans: "提取目录页和内容"
|
||||
pt_BR: "Extrair páginas de sumário e conteúdo"
|
||||
ja_JP: "目次ページと内容を抽出"
|
||||
description:
|
||||
human:
|
||||
en_US: "Extract table-of-contents page range and all page text in that range"
|
||||
zh_Hans: "提取目录页范围以及该范围内所有页文本"
|
||||
pt_BR: "Extrair intervalo de páginas de sumário e todo o texto nesse intervalo"
|
||||
ja_JP: "目次ページ範囲とその範囲内の全ページテキストを抽出"
|
||||
llm: "Extract table-of-contents page range and all page text in that range"
|
||||
parameters:
|
||||
- name: file
|
||||
type: file
|
||||
required: true
|
||||
label:
|
||||
en_US: PDF File
|
||||
zh_Hans: PDF 文件
|
||||
pt_BR: Arquivo PDF
|
||||
ja_JP: PDFファイル
|
||||
human_description:
|
||||
en_US: "PDF file to process"
|
||||
zh_Hans: "要处理的 PDF 文件"
|
||||
pt_BR: "Arquivo PDF para processar"
|
||||
ja_JP: "処理するPDFファイル"
|
||||
llm_description: "PDF file to process, output contains start/end/pages"
|
||||
form: llm
|
||||
fileTypes:
|
||||
- "pdf"
|
||||
extra:
|
||||
python:
|
||||
source: tools/pdf.py
|
||||
36
difyPlugin/pdf/tools/pdf_single_page.py
Normal file
36
difyPlugin/pdf/tools/pdf_single_page.py
Normal file
@@ -0,0 +1,36 @@
|
||||
from collections.abc import Generator
|
||||
from io import BytesIO
|
||||
from typing import Any
|
||||
|
||||
import PyPDF2
|
||||
from dify_plugin import Tool
|
||||
from dify_plugin.entities.tool import ToolInvokeMessage
|
||||
|
||||
|
||||
class PdfSinglePageTool(Tool):
    """Dify tool that extracts the text of one page of a PDF."""

    def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
        """Yield the extracted text of a single page.

        Args:
            tool_parameters: Tool arguments. ``"file"`` must hold the PDF
                file object, whose ``blob`` attribute is read as the raw
                PDF content. ``"page"`` is the zero-based page index
                (defaults to 0); values outside the document are clamped
                to the first or last page.

        Yields:
            A text error message when the file is missing, the page value
            cannot be converted to an integer, or the PDF has no pages;
            otherwise a single JSON message with keys ``start`` and ``end``
            (both the selected page index) and ``pages`` (a one-element
            list holding the page text).
        """
        file = tool_parameters.get("file")
        if not file:
            yield self.create_text_message("Error: file is required")
            return

        # An explicit None or a non-numeric value would make int() raise;
        # report it in the same style as the missing-file error.
        try:
            requested = int(tool_parameters.get("page", 0))
        except (TypeError, ValueError, OverflowError):
            yield self.create_text_message("Error: page must be an integer")
            return

        reader = PyPDF2.PdfReader(BytesIO(file.blob))
        num_pages = len(reader.pages)
        if num_pages == 0:
            # Clamping against an empty document would give index -1 and
            # an IndexError on lookup.
            yield self.create_text_message("Error: PDF has no pages")
            return

        # Clamp the requested index into [0, num_pages - 1].
        page_index = min(max(requested, 0), num_pages - 1)

        # extract_text() may return None; normalise to an empty string.
        text = reader.pages[page_index].extract_text() or ""

        yield self.create_json_message({
            "start": page_index,
            "end": page_index,
            "pages": [text]
        })
|
||||
52
difyPlugin/pdf/tools/pdf_single_page.yaml
Normal file
52
difyPlugin/pdf/tools/pdf_single_page.yaml
Normal file
@@ -0,0 +1,52 @@
|
||||
identity:
|
||||
name: "pdf_single_page"
|
||||
author: "yslg"
|
||||
label:
|
||||
en_US: "Extract Single-Page Text"
|
||||
zh_Hans: "提取单页文字"
|
||||
pt_BR: "Extrair texto de página única"
|
||||
ja_JP: "単一ページのテキストを抽出"
|
||||
description:
|
||||
human:
|
||||
en_US: "Extract text from one specified page"
|
||||
zh_Hans: "提取指定单页文字"
|
||||
pt_BR: "Extrair texto de uma página especificada"
|
||||
ja_JP: "指定した1ページのテキストを抽出"
|
||||
llm: "Extract text from one specified page"
|
||||
parameters:
|
||||
- name: file
|
||||
type: file
|
||||
required: true
|
||||
label:
|
||||
en_US: PDF File
|
||||
zh_Hans: PDF 文件
|
||||
pt_BR: Arquivo PDF
|
||||
ja_JP: PDFファイル
|
||||
human_description:
|
||||
en_US: "PDF file to process"
|
||||
zh_Hans: "要处理的 PDF 文件"
|
||||
pt_BR: "Arquivo PDF para processar"
|
||||
ja_JP: "処理するPDFファイル"
|
||||
llm_description: "PDF file to process"
|
||||
form: llm
|
||||
fileTypes:
|
||||
- "pdf"
|
||||
- name: page
|
||||
type: number
|
||||
required: true
|
||||
label:
|
||||
en_US: Page Index
|
||||
zh_Hans: 页码
|
||||
pt_BR: Índice da Página
|
||||
ja_JP: ページ番号
|
||||
human_description:
|
||||
en_US: "Single page index to extract"
|
||||
zh_Hans: "要提取的单页页码"
|
||||
pt_BR: "Índice da página única para extrair"
|
||||
ja_JP: "抽出対象のページ番号"
|
||||
llm_description: "Zero-based index of the single page to extract (0 is the first page); out-of-range values are clamped to the nearest valid page"
|
||||
form: llm
|
||||
default: 0
|
||||
extra:
|
||||
python:
|
||||
source: tools/pdf_single_page.py
|
||||
Reference in New Issue
Block a user