import re
from collections.abc import Generator, Iterable
from io import BytesIO
from typing import Any, Optional

import PyPDF2
from dify_plugin import Tool
from dify_plugin.entities.tool import ToolInvokeMessage

# Headings that mark a page as belonging to a table of contents.
_TOC_PATTERNS = (
    r'目录',
    r'Table of Contents',
    r'Contents',
    r'目次',
)

# One case-insensitive alternation, compiled once instead of once per page.
_TOC_REGEX = re.compile("|".join(_TOC_PATTERNS), re.IGNORECASE)


def _locate_toc(
    page_texts: Iterable[str],
) -> tuple[Optional[int], Optional[int], list[str]]:
    """Find the first consecutive run of pages whose text matches a TOC heading.

    Args:
        page_texts: Page texts in document order. The iterable is consumed
            lazily and only up to the first non-matching page that follows
            the run, so later pages are never extracted.

    Returns:
        A ``(start, end, pages)`` tuple where ``start`` and ``end`` are the
        zero-based, inclusive indices of the run and ``pages`` holds the text
        of each page in it. When no page matches, ``(None, None, [])``.
    """
    start: Optional[int] = None
    toc_pages: list[str] = []

    for index, text in enumerate(page_texts):
        if _TOC_REGEX.search(text):
            if start is None:
                start = index
            # Keep the text seen during the scan so the page does not have
            # to be extracted a second time.
            toc_pages.append(text)
        elif start is not None:
            # First non-matching page after the run began: the run is over.
            break

    if start is None:
        return None, None, []
    return start, start + len(toc_pages) - 1, toc_pages


class PdfTool(Tool):
    """Tool that extracts the table-of-contents pages from a PDF file."""

    def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
        """Locate the table of contents in the supplied PDF and emit it as JSON.

        Args:
            tool_parameters: Tool arguments; the ``"file"`` entry must be an
                object whose ``blob`` attribute holds the PDF bytes.

        Yields:
            A text message with an error when ``"file"`` is missing or falsy;
            otherwise one JSON message with the keys ``"start"`` and ``"end"``
            (zero-based page indices, or ``None`` when nothing matched) and
            ``"pages"`` (the extracted text of each matching page).

        Note:
            Exceptions raised by PyPDF2 while reading the document are not
            caught here and propagate to the caller.
        """
        file = tool_parameters.get("file")
        if not file:
            yield self.create_text_message("Error: file is required")
            return

        # file.blob returns bytes
        reader = PyPDF2.PdfReader(BytesIO(file.blob))

        # Generator expression: a page's text is extracted only when the scan
        # actually reaches that page.
        page_texts = (page.extract_text() or "" for page in reader.pages)
        toc_start, toc_end, toc_pages = _locate_toc(page_texts)

        yield self.create_json_message({
            "start": toc_start,
            "end": toc_end,
            "pages": toc_pages,
        })