import json
import re
from collections import OrderedDict
from collections.abc import Generator
from typing import Any
import fitz
from dify_plugin import Tool
from dify_plugin.entities.model.llm import LLMModelConfig
from dify_plugin.entities.model.message import SystemPromptMessage, UserPromptMessage
from dify_plugin.entities.tool import ToolInvokeMessage
# System prompt for the TOC-extraction LLM call (Chinese, matching the target
# documents). It instructs the model to return a bare JSON array of
# {"title": str, "page": int} objects — no markdown fences, no prose — and an
# empty array when no TOC can be identified. The runtime string below is part
# of program behavior and is intentionally left untranslated.
_TOC_SYSTEM_PROMPT = """你是专业的PDF目录解析助手。请从以下PDF文本中提取文档的目录/章节结构。
要求:
1. 识别所有一级和二级标题及其对应的页码
2. 只返回纯JSON数组,不要markdown代码块,不要任何解释
3. 格式: [{"title": "章节标题", "page": 页码数字}]
4. 页码必须是文档中标注的实际页码数字
5. 如果无法识别目录,返回空数组 []"""
class PdfTocTool(Tool):
# Regex patterns used to recognize a printed table-of-contents heading.
# Covers Chinese headings ("目录" plain, with optional ASCII whitespace, or
# with an ideographic space U+3000, plus "目次") and English headings.
_TOC_PATTERNS = [
    r"目录",
    r"目\s*录",
    r"目\u3000录",
    r"Table of Contents",
    r"Contents",
    r"目次",
]
def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
    """Yield the PDF's table of contents as a JSON text message.

    Extraction strategies, tried in order of reliability:
      1. the PDF's embedded outline metadata,
      2. LLM parsing (only when a model is configured),
      3. regex parsing of detected printed TOC pages.
    Yields "[]" when nothing can be extracted; yields an error message and
    stops when no file was supplied.
    """
    pdf_file = tool_parameters.get("file")
    if not pdf_file:
        yield self.create_text_message("Error: file is required")
        return
    llm_config = tool_parameters.get("model")
    document = fitz.open(stream=pdf_file.blob, filetype="pdf")
    try:
        page_count = len(document)
        # Strategy 1: embedded outline metadata.
        catalog = self._catalog_from_metadata(document.get_toc(), page_count)
        # Strategy 2: LLM parsing, when metadata produced nothing.
        if not catalog and llm_config:
            catalog = self._extract_toc_with_llm(document, page_count, llm_config)
        # Strategy 3: regex parsing of the printed TOC pages.
        if not catalog:
            start, end = self._find_toc_pages(document, page_count)
            if start is not None and end is not None:
                pages_text = "\n".join(
                    document[i].get_text() or "" for i in range(start, end + 1)
                )
                printed = self._parse_toc_lines(pages_text)
                catalog = self._attach_page_indexes(printed, end, page_count)
        # Fall back to an empty list so the output is always valid JSON.
        yield self.create_text_message(json.dumps(catalog or [], ensure_ascii=False))
    finally:
        document.close()
def _extract_toc_with_llm(
    self, doc: fitz.Document, num_pages: int, model_config: dict[str, Any]
) -> list[dict[str, int | str]]:
    """Extract the catalog via the configured LLM.

    Prefers text from located printed-TOC pages; when none are found, feeds
    the first 15 pages (each preceded by a numbered page marker) so the model
    can infer the chapter structure. Returns [] on any failure — this method
    is deliberately best-effort, since the caller falls back to regex parsing.
    """
    # Try to locate the printed TOC pages first.
    toc_start, toc_end = self._find_toc_pages(doc, num_pages)
    if toc_start is not None and toc_end is not None:
        # TOC pages found: send only their text to the model.
        toc_text = "\n".join(
            doc[index].get_text() or "" for index in range(toc_start, toc_end + 1)
        )
        content_offset = toc_end
    else:
        # No TOC pages: sample the first 15 pages and let the LLM infer the
        # structure. Each page is preceded by a numbered marker so the model
        # can report real page numbers.
        # BUG FIX: the original used "--- 第{}页 ---".join(...), which never
        # filled the {} placeholder — every marker literally read "第{}页".
        sample = min(num_pages, 15)
        parts: list[str] = []
        for i in range(sample):
            parts.append(f"\n\n--- 第{i + 1}页 ---\n")
            parts.append(doc[i].get_text() or "")
        toc_text = "".join(parts).strip()
        if not toc_text:
            return []
        content_offset = 0
    # Truncate overly long text to keep the prompt within a sane size.
    if len(toc_text) > 15000:
        toc_text = toc_text[:15000] + "\n...[截断]"
    try:
        response = self.session.model.llm.invoke(
            model_config=LLMModelConfig(**model_config),
            prompt_messages=[
                SystemPromptMessage(content=_TOC_SYSTEM_PROMPT),
                UserPromptMessage(content=toc_text),
            ],
            stream=False,
        )
        llm_text = self._get_response_text(response)
        if not llm_text:
            return []
        raw_catalog = self._parse_llm_json(llm_text)
        if not raw_catalog:
            return []
        # Convert the LLM's simple {"title", "page"} items into full entries.
        return self._build_catalog_from_llm(raw_catalog, content_offset, num_pages)
    except Exception:
        # Best-effort: swallow model/parse failures so the caller can fall
        # back to regex-based extraction.
        return []
def _build_catalog_from_llm(
    self, raw: list[dict], content_offset: int, num_pages: int
) -> list[dict[str, int | str]]:
    """Turn LLM {"title", "page"} items into full catalog entries.

    Items with an empty title or an unparsable page number are dropped.
    When printed TOC pages were located (content_offset > 0), printed page
    numbers are shifted so the first entry lands on the page right after the
    TOC; otherwise the printed numbers are used as-is. Each entry carries the
    printed start/end pages plus clamped 0-based page indexes, and the last
    entry always extends to the final page of the document.
    """
    cleaned: list[tuple[str, int]] = []
    for entry in raw:
        name = str(entry.get("title") or "").strip()
        printed = self._to_int(entry.get("page"), None)
        if name and printed is not None:
            cleaned.append((name, printed))
    if not cleaned:
        return []
    # Offset = difference between the first entry's printed page and the
    # first content page (the one following the TOC); zero when no TOC
    # pages were located.
    shift = (content_offset + 1) - cleaned[0][1] if content_offset > 0 else 0
    last = len(cleaned) - 1
    catalog: list[dict[str, int | str]] = []
    for idx, (name, printed) in enumerate(cleaned):
        following = cleaned[idx + 1][1] if idx < last else printed
        # Clamp the 0-based start index into [0, num_pages - 1].
        start_index = min(printed + shift - 1, num_pages - 1)
        if start_index < 0:
            start_index = 0
        # End index stops just before the next chapter, never before start.
        end_index = min(following + shift - 2, num_pages - 1)
        if end_index < start_index:
            end_index = start_index
        if idx == last:
            end_index = num_pages - 1
        printed_end = printed if idx == last else max(printed, following - 1)
        catalog.append({
            "title": name,
            "start": printed,
            "end": printed_end,
            "page_start_index": start_index,
            "page_end_index": end_index,
        })
    return catalog
@staticmethod
def _get_response_text(response: Any) -> str:
if not hasattr(response, "message") or not response.message:
return ""
content = response.message.content
if isinstance(content, str):
text = content
elif isinstance(content, list):
text = "".join(
item.data if hasattr(item, "data") else str(item) for item in content
)
else:
text = str(content)
# 清理思考标签
text = re.sub(r"