49 lines
1.5 KiB
Python
49 lines
1.5 KiB
Python
|
|
import json
|
||
|
|
from collections.abc import Generator
|
||
|
|
from typing import Any
|
||
|
|
|
||
|
|
import fitz # PyMuPDF
|
||
|
|
from dify_plugin import Tool
|
||
|
|
from dify_plugin.entities.tool import ToolInvokeMessage
|
||
|
|
|
||
|
|
|
||
|
|
class PdfExtractRangeTool(Tool):
    """Tool that extracts plain text from a contiguous range of PDF pages."""

    def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
        """Extract the text of pages ``start_page``..``end_page`` (inclusive).

        Args:
            tool_parameters: Mapping read for three keys:
                ``file`` -- object whose ``blob`` attribute holds the PDF bytes;
                ``start_page`` -- zero-based index of the first page (default 0);
                ``end_page`` -- zero-based index of the last page (default 0).
                Out-of-range indexes are clamped to the document, and
                ``end_page`` is raised to ``start_page`` if it is smaller.

        Yields:
            On success, a text message containing the JSON-encoded result
            followed by a JSON message with the same result. The result has
            the keys ``start`` and ``end`` (the clamped indexes actually used),
            ``total_pages`` (the number of pages extracted, not the size of
            the document) and ``text`` (page texts joined by a separator).
            On invalid input, a single text message starting with ``Error:``.
        """
        file = tool_parameters.get("file")
        if not file:
            yield self.create_text_message("Error: file is required")
            return

        # ``or 0`` covers a key that is present but set to None/empty, which
        # ``dict.get(key, 0)`` alone would pass straight into ``int``.
        try:
            start_page = int(tool_parameters.get("start_page") or 0)
            end_page = int(tool_parameters.get("end_page") or 0)
        except (TypeError, ValueError):
            yield self.create_text_message(
                "Error: start_page and end_page must be integers"
            )
            return

        # Open the PDF from the in-memory bytes.
        doc = fitz.open(stream=file.blob, filetype="pdf")
        try:
            num_pages = len(doc)
            if num_pages > 0:
                # Clamp both bounds into [0, num_pages - 1] and keep the
                # range non-empty (end is never before start).
                start_page = max(0, min(start_page, num_pages - 1))
                end_page = max(start_page, min(end_page, num_pages - 1))

                # Extract the text page by page.
                page_texts = [
                    doc[page_idx].get_text("text", sort=True) or ""
                    for page_idx in range(start_page, end_page + 1)
                ]
        finally:
            # Release the document even when extraction raises.
            doc.close()

        if num_pages == 0:
            # Without this guard the clamping above would select page 0 of an
            # empty document and indexing it would raise.
            yield self.create_text_message("Error: PDF has no pages")
            return

        # Join all page texts with a visible page-break separator.
        full_text = "\n\n--- 分页 ---\n\n".join(page_texts)

        result = {
            "start": start_page,
            "end": end_page,
            "total_pages": end_page - start_page + 1,
            "text": full_text,
        }
        yield self.create_text_message(json.dumps(result, ensure_ascii=False))
        yield self.create_json_message(result)
|