Files
urbanLifeline/difyPlugin/pdf/tools/pdf_extract_range.py
2026-03-06 14:50:43 +08:00

49 lines
1.5 KiB
Python

import json
from collections.abc import Generator
from typing import Any
import fitz # PyMuPDF
from dify_plugin import Tool
from dify_plugin.entities.tool import ToolInvokeMessage
class PdfExtractRangeTool(Tool):
    """Dify tool that extracts plain text from a contiguous page range of a PDF."""

    def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
        """Extract text from pages ``start_page``..``end_page`` (inclusive) of a PDF.

        Args:
            tool_parameters: Mapping read for three keys:
                ``file`` -- object whose ``blob`` attribute holds the PDF bytes
                (presumably a Dify file object; confirm against the tool's YAML);
                ``start_page`` / ``end_page`` -- 0-based page indices, both
                defaulting to 0 when missing, ``None`` or empty.

        Yields:
            On success, a text message containing the JSON-serialized result
            followed by a JSON message with the same result. The result has
            the keys ``start`` and ``end`` (the clamped page indices),
            ``total_pages`` (number of pages extracted, not the document's
            page count) and ``text`` (page texts joined by a separator line).
            On invalid input, a single text message starting with ``Error:``.
        """
        file = tool_parameters.get("file")
        if not file:
            yield self.create_text_message("Error: file is required")
            return

        try:
            # `or 0` covers a key that is present but None or "", which
            # int() would otherwise reject.
            start_page = int(tool_parameters.get("start_page") or 0)
            end_page = int(tool_parameters.get("end_page") or 0)
        except (TypeError, ValueError):
            yield self.create_text_message(
                "Error: start_page and end_page must be integers"
            )
            return

        page_texts: list[str] = []
        # The context manager closes the document even if extraction raises.
        with fitz.open(stream=file.blob, filetype="pdf") as doc:
            num_pages = len(doc)
            if num_pages > 0:
                # Clamp both indices into [0, num_pages - 1] and make sure the
                # range is never reversed.
                start_page = max(0, min(start_page, num_pages - 1))
                end_page = max(start_page, min(end_page, num_pages - 1))

                # Extract the text page by page.
                page_texts = [
                    doc[page_idx].get_text("text", sort=True) or ""
                    for page_idx in range(start_page, end_page + 1)
                ]

        if num_pages == 0:
            # Without this guard the clamping above would yield page 0 and
            # indexing an empty document would raise IndexError.
            yield self.create_text_message("Error: PDF has no pages")
            return

        # Join the text of all extracted pages with a visible page separator.
        full_text = "\n\n--- 分页 ---\n\n".join(page_texts)
        result = {
            "start": start_page,
            "end": end_page,
            "total_pages": end_page - start_page + 1,
            "text": full_text,
        }
        yield self.create_text_message(json.dumps(result, ensure_ascii=False))
        yield self.create_json_message(result)