更新
This commit is contained in:
@@ -1,8 +1,9 @@
|
||||
import json
|
||||
from collections.abc import Generator
|
||||
from io import BytesIO
|
||||
from typing import Any
|
||||
|
||||
import PyPDF2
|
||||
import fitz # PyMuPDF 核心库
|
||||
from dify_plugin import Tool
|
||||
from dify_plugin.entities.tool import ToolInvokeMessage
|
||||
|
||||
@@ -16,21 +17,29 @@ class PdfSinglePageTool(Tool):
|
||||
yield self.create_text_message("Error: file is required")
|
||||
return
|
||||
|
||||
# 从字节流加载 PDF(替换 PyPDF2 的 PdfReader)
|
||||
pdf_bytes = file.blob
|
||||
reader = PyPDF2.PdfReader(BytesIO(pdf_bytes))
|
||||
num_pages = len(reader.pages)
|
||||
doc = fitz.open(stream=pdf_bytes, filetype="pdf") # 字节流方式打开
|
||||
num_pages = len(doc)
|
||||
|
||||
# 页码边界处理(逻辑与原代码一致)
|
||||
page_index = int(page)
|
||||
if page_index < 0:
|
||||
page_index = 0
|
||||
if page_index >= num_pages:
|
||||
page_index = num_pages - 1
|
||||
|
||||
selected_page = reader.pages[page_index]
|
||||
text = selected_page.extract_text() or ""
|
||||
# 提取指定页面文本(PyMuPDF 方式)
|
||||
selected_page = doc[page_index]
|
||||
text = selected_page.get_text() or "" # get_text() 提取文本,比 PyPDF2 更精准
|
||||
|
||||
yield self.create_json_message({
|
||||
# 关闭文档释放资源
|
||||
doc.close()
|
||||
|
||||
result = {
|
||||
"start": page_index,
|
||||
"end": page_index,
|
||||
"pages": [text]
|
||||
})
|
||||
}
|
||||
yield self.create_text_message(json.dumps(result, ensure_ascii=False))
|
||||
yield self.create_json_message(result)
|
||||
Reference in New Issue
Block a user