Files
urbanLifeline/difyPlugin/pdf/tools/pdf.py

61 lines
1.7 KiB
Python
Raw Normal View History

2026-03-02 17:12:17 +08:00
import re
from collections.abc import Generator
from io import BytesIO
from typing import Any
import PyPDF2
from dify_plugin import Tool
from dify_plugin.entities.tool import ToolInvokeMessage
class PdfTool(Tool):
    """Dify tool that locates the table-of-contents pages of a PDF.

    The tool reads the uploaded file, walks its pages in order and reports
    the first run of consecutive pages whose extracted text contains a
    table-of-contents keyword.
    """

    def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
        """Find the table-of-contents page span and yield it as one message.

        Args:
            tool_parameters: Invocation parameters. The ``"file"`` entry is
                read and its ``blob`` attribute is parsed as a PDF
                (presumably raw ``bytes`` -- confirm against the Dify file
                type).

        Yields:
            A text message ``"Error: file is required"`` when no file was
            supplied; otherwise a single JSON message with the keys
            ``"start"`` and ``"end"`` (0-based page indices, inclusive, or
            ``None`` when no page matched) and ``"pages"`` (the extracted
            text of every page in that span, in order).

        Note:
            Nothing here catches parser errors, so any exception raised by
            PyPDF2 while opening or reading the document propagates to the
            caller.
        """
        file = tool_parameters.get("file")
        if not file:
            yield self.create_text_message("Error: file is required")
            return

        reader = PyPDF2.PdfReader(BytesIO(file.blob))

        # A page counts as a TOC page when any keyword occurs anywhere in its
        # text (case-insensitive). One alternation is equivalent to testing
        # each keyword separately and needs only one search per page.
        toc_keywords = (
            r'目录',
            r'Table of Contents',
            r'Contents',
            r'目次',
        )
        toc_heading = re.compile("|".join(toc_keywords), re.IGNORECASE)

        toc_start = None
        # Keep the text extracted while scanning so that TOC pages are not
        # run through the (expensive) text extractor a second time.
        toc_pages = []
        for page_num, page in enumerate(reader.pages):
            text = page.extract_text() or ""
            if toc_heading.search(text):
                if toc_start is None:
                    toc_start = page_num
                toc_pages.append(text)
            elif toc_start is not None:
                # First non-matching page after the run began: the span is
                # complete. NOTE: a continuation page that lacks a keyword
                # therefore ends the span early.
                break

        if toc_start is None:
            yield self.create_json_message({
                "start": None,
                "end": None,
                "pages": []
            })
            return

        # The collected pages are consecutive, so the inclusive end index
        # follows directly from the start index and the page count.
        yield self.create_json_message({
            "start": toc_start,
            "end": toc_start + len(toc_pages) - 1,
            "pages": toc_pages
        })