61 lines
1.7 KiB
Python
61 lines
1.7 KiB
Python
import re
|
|
from collections.abc import Generator
|
|
from io import BytesIO
|
|
from typing import Any
|
|
|
|
import PyPDF2
|
|
from dify_plugin import Tool
|
|
from dify_plugin.entities.tool import ToolInvokeMessage
|
|
|
|
|
|
class PdfTool(Tool):
    """Dify tool that locates the table-of-contents pages of a PDF file."""

    def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
        """Find the first contiguous run of pages that look like a table of contents.

        A page "looks like" a table of contents when its extracted text
        contains one of the TOC keywords (case-insensitive). Scanning starts at
        the first page and stops at the first non-matching page that follows a
        matching one, so only the first run of matching pages is reported.

        Args:
            tool_parameters: Invocation parameters. The ``"file"`` entry must be
                an object exposing the PDF content through a ``blob`` attribute.

        Yields:
            A text message ``"Error: file is required"`` when no file is given.
            Otherwise a single JSON message with the keys:

            * ``"start"`` / ``"end"``: 0-based indices of the first and last
              matching page, or ``None`` for both when nothing matched.
            * ``"pages"``: extracted text of every page in that range (an empty
              string for pages without extractable text), or ``[]`` when
              nothing matched.

        Note:
            Exceptions raised by PyPDF2 while opening or reading the document
            are not caught here and propagate to the caller.
        """
        file = tool_parameters.get("file")
        if not file:
            yield self.create_text_message("Error: file is required")
            return

        # file.blob returns bytes (per the original author's note); wrap them
        # in a file-like object for PyPDF2.
        reader = PyPDF2.PdfReader(BytesIO(file.blob))

        # One compiled alternation instead of a separate search per keyword.
        # "Table of Contents" is already covered by "Contents" but is kept so
        # the keyword list stays self-explanatory.
        toc_pattern = re.compile(
            "|".join((r'目录', r'Table of Contents', r'Contents', r'目次')),
            re.IGNORECASE,
        )

        toc_start = None
        toc_pages = []
        for page_num, page in enumerate(reader.pages):
            text = page.extract_text() or ""
            if toc_pattern.search(text):
                if toc_start is None:
                    toc_start = page_num
                # Keep the text from this single pass so matching pages do not
                # have to be extracted a second time afterwards.
                toc_pages.append(text)
            elif toc_start is not None:
                # First non-matching page after the run began: the run is over.
                break

        if toc_start is None:
            yield self.create_json_message({
                "start": None,
                "end": None,
                "pages": [],
            })
            return

        yield self.create_json_message({
            "start": toc_start,
            # The run is contiguous, so the last index follows from its length.
            "end": toc_start + len(toc_pages) - 1,
            "pages": toc_pages,
        })