import json
from collections.abc import Generator
from typing import Any

import fitz  # PyMuPDF
from dify_plugin import Tool
from dify_plugin.entities.tool import ToolInvokeMessage


class PdfExtractRangeTool(Tool):
    """Dify tool that extracts plain text from a contiguous range of PDF pages."""

    @staticmethod
    def _parse_page_index(value: Any) -> int:
        """Convert a raw page parameter to an int, treating ``None`` as 0.

        Raises:
            TypeError, ValueError: if ``value`` cannot be converted by ``int``.
        """
        return 0 if value is None else int(value)

    def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
        """Extract text from pages ``start_page``..``end_page`` (inclusive).

        Parameters read from ``tool_parameters``:
            file: required; its ``.blob`` attribute is passed to PyMuPDF as the
                PDF byte stream (presumably a Dify file object - confirm
                against the tool's YAML definition).
            start_page: index of the first page, used directly as a PyMuPDF
                page index (0-based). Defaults to 0.
            end_page: index of the last page, inclusive. Defaults to 0, which
                after clamping means only ``start_page`` is extracted.

        Both indices are clamped into ``[0, page_count - 1]`` and ``end_page``
        is raised to ``start_page`` if it is smaller.

        Yields:
            A text message containing the JSON-encoded result, followed by a
            JSON message with the same result dict (keys ``start``, ``end``,
            ``total_pages`` - the number of pages extracted - and ``text``).
            On invalid input a single text message starting with ``Error:``
            is yielded instead.
        """
        file = tool_parameters.get("file")
        if not file:
            yield self.create_text_message("Error: file is required")
            return

        try:
            start_page = self._parse_page_index(tool_parameters.get("start_page"))
            end_page = self._parse_page_index(tool_parameters.get("end_page"))
        except (TypeError, ValueError):
            yield self.create_text_message(
                "Error: start_page and end_page must be integers"
            )
            return

        # Open the PDF from the in-memory bytes.
        doc = fitz.open(stream=file.blob, filetype="pdf")
        try:
            num_pages = len(doc)
            if num_pages == 0:
                # Clamping below would otherwise select index 0 of an empty
                # document and fail with IndexError.
                yield self.create_text_message("Error: PDF has no pages")
                return

            # Clamp the requested range into the document's bounds.
            start_page = max(0, min(start_page, num_pages - 1))
            end_page = max(start_page, min(end_page, num_pages - 1))

            # Extract text page by page; fall back to "" for falsy results.
            page_texts = [
                doc[page_idx].get_text("text", sort=True) or ""
                for page_idx in range(start_page, end_page + 1)
            ]
        finally:
            # Always release the document, even if extraction raised.
            doc.close()

        # Join all page texts with a visible page-break marker.
        full_text = "\n\n--- 分页 ---\n\n".join(page_texts)

        result = {
            "start": start_page,
            "end": end_page,
            "total_pages": end_page - start_page + 1,
            "text": full_text,
        }

        yield self.create_text_message(json.dumps(result, ensure_ascii=False))
        yield self.create_json_message(result)