Files
urbanLifeline/difyPlugin/pdf/tools/pdf_single_page.py

45 lines
1.4 KiB
Python
Raw Normal View History

2026-03-06 14:50:43 +08:00
import json
2026-03-02 17:12:17 +08:00
from collections.abc import Generator
from io import BytesIO
from typing import Any
2026-03-06 14:50:43 +08:00
import fitz # PyMuPDF 核心库
2026-03-02 17:12:17 +08:00
from dify_plugin import Tool
from dify_plugin.entities.tool import ToolInvokeMessage
class PdfSinglePageTool(Tool):
    """Dify tool that extracts the text of a single page from a PDF file."""

    def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
        """Extract the text of one PDF page and emit it as text and JSON messages.

        Args:
            tool_parameters: Tool inputs. ``file`` is the uploaded PDF; its
                ``blob`` attribute is read as the raw PDF bytes. ``page`` is
                the zero-based page index (default 0). Out-of-range indices
                are clamped to the first/last page.

        Yields:
            On success, a text message holding the JSON-encoded result followed
            by a JSON message holding the same result. The result has the keys
            ``start`` and ``end`` (both the resolved page index) and ``pages``
            (a one-element list with the page text). On invalid input, a
            single text message describing the error.
        """
        file = tool_parameters.get("file")
        if not file:
            yield self.create_text_message("Error: file is required")
            return

        raw_page = tool_parameters.get("page", 0)
        try:
            # An explicit None/blank value is treated like a missing key
            # (first page) instead of crashing inside int().
            requested = 0 if raw_page is None or raw_page == "" else int(raw_page)
        except (TypeError, ValueError, OverflowError):
            yield self.create_text_message("Error: page must be an integer")
            return

        page_index = 0
        text = ""
        # Open the PDF from the in-memory byte stream. The context manager
        # guarantees the document is closed even if page extraction raises.
        with fitz.open(stream=file.blob, filetype="pdf") as doc:
            num_pages = len(doc)
            if num_pages > 0:
                # Clamp the requested index into [0, num_pages - 1].
                page_index = min(max(requested, 0), num_pages - 1)
                text = doc[page_index].get_text() or ""

        if num_pages == 0:
            # Without this guard the clamp would produce index -1 on an empty
            # document and the page lookup would fail.
            yield self.create_text_message("Error: PDF has no pages")
            return

        result = {
            "start": page_index,
            "end": page_index,
            "pages": [text],
        }
        yield self.create_text_message(json.dumps(result, ensure_ascii=False))
        yield self.create_json_message(result)