更新
This commit is contained in:
@@ -4,264 +4,303 @@ from collections import OrderedDict
|
||||
from collections.abc import Generator
|
||||
from typing import Any
|
||||
|
||||
import fitz
|
||||
from dify_plugin import Tool
|
||||
from dify_plugin.entities.model.llm import LLMModelConfig
|
||||
from dify_plugin.entities.model.message import SystemPromptMessage, UserPromptMessage
|
||||
from dify_plugin.entities.tool import ToolInvokeMessage
|
||||
|
||||
_SYSTEM_PROMPT = """You parse PDF table-of-contents text.
|
||||
Return only valid JSON object, no markdown fences, no explanation.
|
||||
Output schema:
|
||||
{
|
||||
"Chapter Name": {"start": 1, "end": 5},
|
||||
"Another": {"start": 6, "end": 20}
|
||||
}
|
||||
Rules:
|
||||
- start/end are integer printed page numbers from TOC.
|
||||
- If end is unknown, use same value as start.
|
||||
- Keep chapter names exactly as in TOC text.
|
||||
"""
|
||||
_TOC_SYSTEM_PROMPT = """你是专业的PDF目录解析助手。请从以下PDF文本中提取文档的目录/章节结构。
|
||||
|
||||
要求:
|
||||
1. 识别所有一级和二级标题及其对应的页码
|
||||
2. 只返回纯JSON数组,不要markdown代码块,不要任何解释
|
||||
3. 格式: [{"title": "章节标题", "page": 页码数字}]
|
||||
4. 页码必须是文档中标注的实际页码数字
|
||||
5. 如果无法识别目录,返回空数组 []"""
|
||||
|
||||
|
||||
class PdfTocTool(Tool):
|
||||
_TOC_PATTERNS = [
|
||||
r"目录",
|
||||
r"目\s*录",
|
||||
r"目\u3000录",
|
||||
r"Table of Contents",
|
||||
r"Contents",
|
||||
r"目次",
|
||||
]
|
||||
|
||||
def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
|
||||
toc_start = self._to_int(tool_parameters.get("toc_start"), None)
|
||||
toc_end = self._to_int(tool_parameters.get("toc_end"), None)
|
||||
toc_pages = (tool_parameters.get("toc_pages") or "").strip()
|
||||
file = tool_parameters.get("file")
|
||||
if not file:
|
||||
yield self.create_text_message("Error: file is required")
|
||||
return
|
||||
|
||||
model_config = tool_parameters.get("model")
|
||||
|
||||
if toc_start is None or toc_end is None:
|
||||
yield self.create_text_message("Error: toc_start and toc_end are required")
|
||||
return
|
||||
doc = fitz.open(stream=file.blob, filetype="pdf")
|
||||
try:
|
||||
num_pages = len(doc)
|
||||
|
||||
if not toc_pages:
|
||||
yield self.create_text_message("Error: toc_pages text is empty")
|
||||
return
|
||||
# 1) 优先从PDF元数据提取目录
|
||||
catalog = self._catalog_from_metadata(doc.get_toc(), num_pages)
|
||||
|
||||
cleaned = self._strip_index_lists(toc_pages)
|
||||
# 2) 元数据无目录时,使用LLM解析
|
||||
if not catalog and model_config:
|
||||
catalog = self._extract_toc_with_llm(doc, num_pages, model_config)
|
||||
|
||||
# 1) deterministic parser first
|
||||
catalog = self._parse_toc_lines(cleaned)
|
||||
# 3) 无LLM配置时回退到正则解析
|
||||
if not catalog:
|
||||
toc_start, toc_end = self._find_toc_pages(doc, num_pages)
|
||||
if toc_start is not None and toc_end is not None:
|
||||
toc_text = "\n".join(
|
||||
doc[index].get_text() or "" for index in range(toc_start, toc_end + 1)
|
||||
)
|
||||
printed_catalog = self._parse_toc_lines(toc_text)
|
||||
catalog = self._attach_page_indexes(printed_catalog, toc_end, num_pages)
|
||||
|
||||
# 2) optional LLM fallback/enhance only when deterministic parser gives no result
|
||||
llm_raw_output = ""
|
||||
llm_error = None
|
||||
if not catalog and model_config:
|
||||
llm_catalog, llm_raw_output, llm_error = self._parse_with_llm(
|
||||
toc_start=toc_start,
|
||||
toc_end=toc_end,
|
||||
toc_pages=cleaned,
|
||||
model_config=model_config,
|
||||
if not catalog:
|
||||
catalog = []
|
||||
|
||||
yield self.create_text_message(json.dumps(catalog, ensure_ascii=False))
|
||||
finally:
|
||||
doc.close()
|
||||
|
||||
def _extract_toc_with_llm(
|
||||
self, doc: fitz.Document, num_pages: int, model_config: dict[str, Any]
|
||||
) -> list[dict[str, int | str]]:
|
||||
# 先尝试定位目录页
|
||||
toc_start, toc_end = self._find_toc_pages(doc, num_pages)
|
||||
|
||||
if toc_start is not None and toc_end is not None:
|
||||
# 有目录页,提取目录页文本
|
||||
toc_text = "\n".join(
|
||||
doc[index].get_text() or "" for index in range(toc_start, toc_end + 1)
|
||||
)
|
||||
if llm_catalog:
|
||||
catalog = self._normalize_catalog(llm_catalog)
|
||||
content_offset = toc_end
|
||||
else:
|
||||
# 无目录页,提取前15页文本让LLM识别章节结构
|
||||
sample = min(num_pages, 15)
|
||||
toc_text = "\n\n--- 第{}页 ---\n".join(
|
||||
[""] + [doc[i].get_text() or "" for i in range(sample)]
|
||||
)
|
||||
toc_text = toc_text.strip()
|
||||
if not toc_text:
|
||||
return []
|
||||
content_offset = 0
|
||||
|
||||
result: dict[str, Any] = {
|
||||
"toc_start": toc_start,
|
||||
"toc_end": toc_end,
|
||||
"catalog": catalog,
|
||||
"meta": {
|
||||
"catalog_size": len(catalog),
|
||||
"parser": "rule" if catalog else "none",
|
||||
},
|
||||
}
|
||||
# 截断过长文本
|
||||
if len(toc_text) > 15000:
|
||||
toc_text = toc_text[:15000] + "\n...[截断]"
|
||||
|
||||
if llm_raw_output:
|
||||
result["meta"]["llm_used"] = True
|
||||
if llm_error:
|
||||
result["meta"]["llm_error"] = llm_error
|
||||
try:
|
||||
response = self.session.model.llm.invoke(
|
||||
model_config=LLMModelConfig(**model_config),
|
||||
prompt_messages=[
|
||||
SystemPromptMessage(content=_TOC_SYSTEM_PROMPT),
|
||||
UserPromptMessage(content=toc_text),
|
||||
],
|
||||
stream=False,
|
||||
)
|
||||
|
||||
# always return valid json text payload for downstream json.loads
|
||||
yield self.create_text_message(json.dumps(result, ensure_ascii=False))
|
||||
yield self.create_json_message(result)
|
||||
llm_text = self._get_response_text(response)
|
||||
if not llm_text:
|
||||
return []
|
||||
|
||||
def _parse_with_llm(
|
||||
self,
|
||||
toc_start: int,
|
||||
toc_end: int,
|
||||
toc_pages: str,
|
||||
model_config: dict[str, Any],
|
||||
) -> tuple[dict[str, Any] | None, str, str | None]:
|
||||
user_content = (
|
||||
f"TOC page index range: {toc_start}..{toc_end}\n\n"
|
||||
f"TOC raw text:\n{toc_pages}"
|
||||
)
|
||||
response = self.session.model.llm.invoke(
|
||||
model_config=LLMModelConfig(**model_config),
|
||||
prompt_messages=[
|
||||
SystemPromptMessage(content=_SYSTEM_PROMPT),
|
||||
UserPromptMessage(content=user_content),
|
||||
],
|
||||
stream=False,
|
||||
)
|
||||
raw_catalog = self._parse_llm_json(llm_text)
|
||||
if not raw_catalog:
|
||||
return []
|
||||
|
||||
llm_text = ""
|
||||
if hasattr(response, "message") and response.message:
|
||||
content = response.message.content
|
||||
if isinstance(content, str):
|
||||
llm_text = content
|
||||
elif isinstance(content, list):
|
||||
llm_text = "".join(
|
||||
item.data if hasattr(item, "data") else str(item) for item in content
|
||||
)
|
||||
# 转换LLM返回的简单格式为完整catalog
|
||||
return self._build_catalog_from_llm(raw_catalog, content_offset, num_pages)
|
||||
except Exception:
|
||||
return []
|
||||
|
||||
parsed = self._extract_json_object(llm_text)
|
||||
if parsed is None:
|
||||
return None, llm_text, "Failed to parse LLM output as JSON"
|
||||
if not isinstance(parsed, dict):
|
||||
return None, llm_text, "LLM output JSON is not an object"
|
||||
def _build_catalog_from_llm(
|
||||
self, raw: list[dict], content_offset: int, num_pages: int
|
||||
) -> list[dict[str, int | str]]:
|
||||
entries: list[tuple[str, int]] = []
|
||||
for item in raw:
|
||||
title = str(item.get("title") or "").strip()
|
||||
page = self._to_int(item.get("page"), None)
|
||||
if not title or page is None:
|
||||
continue
|
||||
entries.append((title, page))
|
||||
|
||||
return parsed, llm_text, None
|
||||
if not entries:
|
||||
return []
|
||||
|
||||
# 计算偏移量:第一个条目的页码与实际内容起始页的差值
|
||||
first_printed_page = entries[0][1]
|
||||
offset = (content_offset + 1) - first_printed_page if content_offset > 0 else 0
|
||||
|
||||
result: list[dict[str, int | str]] = []
|
||||
for i, (title, page) in enumerate(entries):
|
||||
next_page = entries[i + 1][1] if i + 1 < len(entries) else page
|
||||
page_start_index = max(0, min(page + offset - 1, num_pages - 1))
|
||||
page_end_index = max(page_start_index, min(next_page + offset - 2, num_pages - 1))
|
||||
if i == len(entries) - 1:
|
||||
page_end_index = num_pages - 1
|
||||
|
||||
result.append({
|
||||
"title": title,
|
||||
"start": page,
|
||||
"end": max(page, next_page - 1) if i + 1 < len(entries) else page,
|
||||
"page_start_index": page_start_index,
|
||||
"page_end_index": page_end_index,
|
||||
})
|
||||
|
||||
return result
|
||||
|
||||
@staticmethod
|
||||
def _strip_index_lists(text: str) -> str:
|
||||
# Stop before common appendix lists that pollute TOC parsing.
|
||||
pattern = re.compile(
|
||||
def _get_response_text(response: Any) -> str:
|
||||
if not hasattr(response, "message") or not response.message:
|
||||
return ""
|
||||
content = response.message.content
|
||||
if isinstance(content, str):
|
||||
text = content
|
||||
elif isinstance(content, list):
|
||||
text = "".join(
|
||||
item.data if hasattr(item, "data") else str(item) for item in content
|
||||
)
|
||||
else:
|
||||
text = str(content)
|
||||
|
||||
# 清理思考标签
|
||||
text = re.sub(r"<think>[\s\S]*?</think>", "", text, flags=re.IGNORECASE)
|
||||
text = re.sub(r"<\|[^>]+\|>", "", text)
|
||||
return text.strip()
|
||||
|
||||
@staticmethod
|
||||
def _parse_llm_json(text: str) -> list[dict]:
|
||||
# 尝试提取JSON代码块
|
||||
code_match = re.search(r"```(?:json)?\s*([\s\S]*?)```", text)
|
||||
if code_match:
|
||||
text = code_match.group(1).strip()
|
||||
|
||||
# 尝试找到JSON数组
|
||||
bracket_match = re.search(r"\[[\s\S]*\]", text)
|
||||
if bracket_match:
|
||||
text = bracket_match.group(0)
|
||||
|
||||
try:
|
||||
result = json.loads(text)
|
||||
if isinstance(result, list):
|
||||
return result
|
||||
except Exception:
|
||||
pass
|
||||
return []
|
||||
|
||||
def _catalog_from_metadata(self, toc: list, num_pages: int) -> list[dict[str, int | str]]:
|
||||
top = [(title, max(0, page - 1)) for level, title, page in toc if level <= 2 and page >= 1]
|
||||
if not top:
|
||||
return []
|
||||
|
||||
result: list[dict[str, int | str]] = []
|
||||
for index, (title, start_index) in enumerate(top):
|
||||
end_index = top[index + 1][1] - 1 if index + 1 < len(top) else num_pages - 1
|
||||
result.append({
|
||||
"title": title,
|
||||
"start": start_index + 1,
|
||||
"end": max(start_index, end_index) + 1,
|
||||
"page_start_index": start_index,
|
||||
"page_end_index": max(start_index, end_index),
|
||||
})
|
||||
return result
|
||||
|
||||
def _find_toc_pages(self, doc: fitz.Document, num_pages: int) -> tuple[int | None, int | None]:
|
||||
toc_start = None
|
||||
toc_end = None
|
||||
for page_number in range(min(num_pages, 30)):
|
||||
text = doc[page_number].get_text() or ""
|
||||
if any(re.search(pattern, text, re.IGNORECASE) for pattern in self._TOC_PATTERNS):
|
||||
if toc_start is None:
|
||||
toc_start = page_number
|
||||
toc_end = page_number
|
||||
elif toc_start is not None:
|
||||
break
|
||||
return toc_start, toc_end
|
||||
|
||||
def _parse_toc_lines(self, text: str) -> list[dict[str, int | str]]:
|
||||
marker = re.search(
|
||||
r"^(List\s+of\s+Figures|List\s+of\s+Tables|图目录|表目录)",
|
||||
text,
|
||||
re.IGNORECASE | re.MULTILINE,
|
||||
)
|
||||
m = pattern.search(text)
|
||||
return text[: m.start()].rstrip() if m else text
|
||||
|
||||
def _parse_toc_lines(self, text: str) -> dict[str, dict[str, int]]:
|
||||
"""Parse lines like:
|
||||
1.2 Engine Overview ........ 35
|
||||
Appendix A 120
|
||||
"""
|
||||
line_pattern = re.compile(
|
||||
r"^\s*(?P<title>.+?)\s*(?:\.{2,}|\s)\s*(?P<page>\d{1,5})\s*$"
|
||||
)
|
||||
if marker:
|
||||
text = text[: marker.start()]
|
||||
|
||||
pattern = re.compile(r"^\s*(?P<title>.+?)\s*(?:\.{2,}|\s)\s*(?P<page>\d{1,5})\s*$")
|
||||
entries: list[tuple[str, int]] = []
|
||||
for raw in text.splitlines():
|
||||
line = raw.strip()
|
||||
if not line or len(line) < 3:
|
||||
continue
|
||||
if re.fullmatch(r"\d+", line):
|
||||
if not line or len(line) < 3 or re.fullmatch(r"\d+", line):
|
||||
continue
|
||||
|
||||
m = line_pattern.match(line)
|
||||
if not m:
|
||||
match = pattern.match(line)
|
||||
if not match:
|
||||
continue
|
||||
|
||||
title = re.sub(r"\s+", " ", m.group("title")).strip("-_:: ")
|
||||
page = self._to_int(m.group("page"), None)
|
||||
if not title or page is None:
|
||||
title = re.sub(r"\s+", " ", match.group("title")).strip("-_::")
|
||||
page = self._to_int(match.group("page"), None)
|
||||
if not title or page is None or len(title) <= 1:
|
||||
continue
|
||||
|
||||
# Skip obvious noise.
|
||||
if len(title) <= 1 or title.lower() in {"page", "pages", "目录", "contents"}:
|
||||
if title.lower() in {"page", "pages", "目录", "contents"}:
|
||||
continue
|
||||
|
||||
entries.append((title, page))
|
||||
|
||||
if not entries:
|
||||
return {}
|
||||
return []
|
||||
|
||||
# Deduplicate keeping earliest appearance.
|
||||
dedup: OrderedDict[str, int] = OrderedDict()
|
||||
for title, page in entries:
|
||||
if title not in dedup:
|
||||
dedup[title] = page
|
||||
dedup.setdefault(title, page)
|
||||
|
||||
titles = list(dedup.keys())
|
||||
pages = [dedup[t] for t in titles]
|
||||
pages = [dedup[title] for title in titles]
|
||||
result: list[dict[str, int | str]] = []
|
||||
for index, title in enumerate(titles):
|
||||
start = pages[index]
|
||||
end = max(start, pages[index + 1] - 1) if index + 1 < len(pages) else start
|
||||
result.append({"title": title, "start": start, "end": end})
|
||||
return result
|
||||
|
||||
catalog: dict[str, dict[str, int]] = {}
|
||||
for i, title in enumerate(titles):
|
||||
start = pages[i]
|
||||
if i + 1 < len(pages):
|
||||
next_start = pages[i + 1]
|
||||
end = max(start, next_start - 1)
|
||||
else:
|
||||
end = start
|
||||
catalog[title] = {"start": int(start), "end": int(end)}
|
||||
def _attach_page_indexes(
|
||||
self, catalog: list[dict[str, int | str]], toc_end: int, num_pages: int
|
||||
) -> list[dict[str, int | str]]:
|
||||
if not catalog:
|
||||
return []
|
||||
|
||||
return catalog
|
||||
first_page = None
|
||||
for item in catalog:
|
||||
start = self._to_int(item.get("start"), None)
|
||||
if start is not None and (first_page is None or start < first_page):
|
||||
first_page = start
|
||||
|
||||
def _normalize_catalog(self, raw: dict[str, Any]) -> dict[str, dict[str, int]]:
|
||||
catalog: dict[str, dict[str, int]] = {}
|
||||
source = raw.get("catalog") if isinstance(raw.get("catalog"), dict) else raw
|
||||
if not isinstance(source, dict):
|
||||
return catalog
|
||||
if first_page is None:
|
||||
return []
|
||||
|
||||
for name, value in source.items():
|
||||
if not isinstance(name, str) or not isinstance(value, dict):
|
||||
continue
|
||||
start = self._to_int(value.get("start"), None)
|
||||
end = self._to_int(value.get("end"), start)
|
||||
offset = (toc_end + 1) - first_page
|
||||
result: list[dict[str, int | str]] = []
|
||||
for item in catalog:
|
||||
start = self._to_int(item.get("start"), None)
|
||||
end = self._to_int(item.get("end"), start)
|
||||
if start is None:
|
||||
continue
|
||||
if end is None:
|
||||
end = start
|
||||
catalog[name] = {"start": int(start), "end": int(max(start, end))}
|
||||
return catalog
|
||||
|
||||
@staticmethod
|
||||
def _extract_json_object(text: str) -> Any:
|
||||
if not text:
|
||||
return None
|
||||
|
||||
candidates: list[str] = []
|
||||
|
||||
code_blocks = re.findall(r"```(?:json)?\s*([\s\S]*?)\s*```", text, flags=re.IGNORECASE)
|
||||
candidates.extend([c.strip() for c in code_blocks if c.strip()])
|
||||
|
||||
brace_candidate = PdfTocTool._extract_first_brace_object(text)
|
||||
if brace_candidate:
|
||||
candidates.append(brace_candidate)
|
||||
|
||||
candidates.append(text.strip())
|
||||
|
||||
for cand in candidates:
|
||||
parsed = PdfTocTool._json_try_parse(cand)
|
||||
if parsed is not None:
|
||||
return parsed
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _extract_first_brace_object(text: str) -> str | None:
|
||||
start = text.find("{")
|
||||
if start < 0:
|
||||
return None
|
||||
|
||||
depth = 0
|
||||
in_str = False
|
||||
escape = False
|
||||
for i in range(start, len(text)):
|
||||
ch = text[i]
|
||||
if in_str:
|
||||
if escape:
|
||||
escape = False
|
||||
elif ch == "\\":
|
||||
escape = True
|
||||
elif ch == '"':
|
||||
in_str = False
|
||||
continue
|
||||
|
||||
if ch == '"':
|
||||
in_str = True
|
||||
elif ch == "{":
|
||||
depth += 1
|
||||
elif ch == "}":
|
||||
depth -= 1
|
||||
if depth == 0:
|
||||
return text[start : i + 1]
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _json_try_parse(text: str) -> Any:
|
||||
try:
|
||||
return json.loads(text)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Minimal repair: remove trailing commas before } or ]
|
||||
repaired = re.sub(r",\s*([}\]])", r"\1", text)
|
||||
try:
|
||||
return json.loads(repaired)
|
||||
except Exception:
|
||||
return None
|
||||
page_start_index = max(0, min(start + offset, num_pages - 1))
|
||||
page_end_index = max(page_start_index, min(end + offset, num_pages - 1))
|
||||
result.append({
|
||||
"title": str(item.get("title") or "Untitled"),
|
||||
"start": start,
|
||||
"end": max(start, end),
|
||||
"page_start_index": page_start_index,
|
||||
"page_end_index": page_end_index,
|
||||
})
|
||||
return result
|
||||
|
||||
@staticmethod
|
||||
def _to_int(value: Any, default: int | None) -> int | None:
|
||||
|
||||
Reference in New Issue
Block a user