This commit is contained in:
2026-03-15 13:00:30 +08:00
parent 91ff28bdcf
commit 136ddc270c
15 changed files with 1459 additions and 1276 deletions

View File

@@ -4,264 +4,303 @@ from collections import OrderedDict
from collections.abc import Generator
from typing import Any
import fitz
from dify_plugin import Tool
from dify_plugin.entities.model.llm import LLMModelConfig
from dify_plugin.entities.model.message import SystemPromptMessage, UserPromptMessage
from dify_plugin.entities.tool import ToolInvokeMessage
# Prompt for the English JSON-object TOC parser: maps chapter names to
# printed start/end page numbers. (Prompt text is runtime data — unchanged.)
_SYSTEM_PROMPT = """You parse PDF table-of-contents text.
Return only valid JSON object, no markdown fences, no explanation.
Output schema:
{
"Chapter Name": {"start": 1, "end": 5},
"Another": {"start": 6, "end": 20}
}
Rules:
- start/end are integer printed page numbers from TOC.
- If end is unknown, use same value as start.
- Keep chapter names exactly as in TOC text.
"""
# Prompt (Chinese) for the JSON-array TOC parser: asks for level-1/2
# headings with printed page numbers, pure JSON array, [] when no TOC.
_TOC_SYSTEM_PROMPT = """你是专业的PDF目录解析助手。请从以下PDF文本中提取文档的目录/章节结构。
要求:
1. 识别所有一级和二级标题及其对应的页码
2. 只返回纯JSON数组不要markdown代码块不要任何解释
3. 格式: [{"title": "章节标题", "page": 页码数字}]
4. 页码必须是文档中标注的实际页码数字
5. 如果无法识别目录,返回空数组 []"""
class PdfTocTool(Tool):
    """Dify tool that extracts a PDF's table of contents and emits it as JSON."""

    # Regex fragments that mark a page as (part of) the table of contents.
    # NOTE(review): r"\s*录" matches ANY text containing 录 (optionally
    # preceded by whitespace), not just a split "目 录" — confirm this
    # breadth is intended; it may flag non-TOC pages.
    _TOC_PATTERNS = [
        r"目录",
        r"\s*录",
        r"\u3000录",
        r"Table of Contents",
        r"Contents",
        r"目次",
    ]
def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
    """Tool entry point: extract the PDF's table of contents and yield JSON.

    NOTE(review): this block contains TWO interleaved revisions of the
    method (a diff/merge artifact): ``catalog`` is computed from metadata
    and/or the LLM and then unconditionally overwritten by the rule-based
    parser, and the ``self._parse_with_llm(...)`` call is never closed
    (syntax error). Code kept token-for-token as found; reconcile against
    version control before editing further.
    """
    toc_start = self._to_int(tool_parameters.get("toc_start"), None)
    toc_end = self._to_int(tool_parameters.get("toc_end"), None)
    toc_pages = (tool_parameters.get("toc_pages") or "").strip()
    file = tool_parameters.get("file")
    if not file:
        yield self.create_text_message("Error: file is required")
        return
    model_config = tool_parameters.get("model")
    if toc_start is None or toc_end is None:
        yield self.create_text_message("Error: toc_start and toc_end are required")
        return
    doc = fitz.open(stream=file.blob, filetype="pdf")
    try:
        num_pages = len(doc)
        if not toc_pages:
            yield self.create_text_message("Error: toc_pages text is empty")
            return
        # 1) Prefer the TOC embedded in the PDF metadata.
        catalog = self._catalog_from_metadata(doc.get_toc(), num_pages)
        cleaned = self._strip_index_lists(toc_pages)
        # 2) Fall back to LLM parsing when the metadata has no TOC.
        if not catalog and model_config:
            catalog = self._extract_toc_with_llm(doc, num_pages, model_config)
        # 1) deterministic parser first
        # NOTE(review): this overwrites the metadata/LLM result above.
        catalog = self._parse_toc_lines(cleaned)
        # 3) Fall back to regex parsing when no LLM is configured.
        if not catalog:
            toc_start, toc_end = self._find_toc_pages(doc, num_pages)
            if toc_start is not None and toc_end is not None:
                toc_text = "\n".join(
                    doc[index].get_text() or "" for index in range(toc_start, toc_end + 1)
                )
                printed_catalog = self._parse_toc_lines(toc_text)
                catalog = self._attach_page_indexes(printed_catalog, toc_end, num_pages)
        # 2) optional LLM fallback/enhance only when deterministic parser gives no result
        llm_raw_output = ""
        llm_error = None
        if not catalog and model_config:
            llm_catalog, llm_raw_output, llm_error = self._parse_with_llm(
                toc_start=toc_start,
                toc_end=toc_end,
                toc_pages=cleaned,
                model_config=model_config,
                # NOTE(review): call above is unterminated in the source.
        if not catalog:
            catalog = []
        yield self.create_text_message(json.dumps(catalog, ensure_ascii=False))
    finally:
        doc.close()
def _extract_toc_with_llm(
    self, doc: fitz.Document, num_pages: int, model_config: dict[str, Any]
) -> list[dict[str, int | str]]:
    """Extract a catalog by sending TOC (or sampled) page text to the LLM.

    NOTE(review): interleaved with fragments of another revision — the
    ``result``/``meta`` dict, the ``yield`` statements and the
    ``llm_catalog`` branch belong to ``_invoke``/``_parse_with_llm``.
    Code kept token-for-token as found; reconcile against version control.
    """
    # First try to locate the dedicated TOC pages.
    toc_start, toc_end = self._find_toc_pages(doc, num_pages)
    if toc_start is not None and toc_end is not None:
        # TOC pages found: extract their text.
        toc_text = "\n".join(
            doc[index].get_text() or "" for index in range(toc_start, toc_end + 1)
        )
        if llm_catalog:
            catalog = self._normalize_catalog(llm_catalog)
        content_offset = toc_end
    else:
        # No TOC pages: sample the first 15 pages and let the LLM infer structure.
        sample = min(num_pages, 15)
        toc_text = "\n\n--- 第{}页 ---\n".join(
            [""] + [doc[i].get_text() or "" for i in range(sample)]
        )
        toc_text = toc_text.strip()
        if not toc_text:
            return []
        content_offset = 0
    result: dict[str, Any] = {
        "toc_start": toc_start,
        "toc_end": toc_end,
        "catalog": catalog,
        "meta": {
            "catalog_size": len(catalog),
            "parser": "rule" if catalog else "none",
        },
    }
    # Truncate overly long text before sending it to the model.
    if len(toc_text) > 15000:
        toc_text = toc_text[:15000] + "\n...[截断]"
    if llm_raw_output:
        result["meta"]["llm_used"] = True
    if llm_error:
        result["meta"]["llm_error"] = llm_error
    try:
        response = self.session.model.llm.invoke(
            model_config=LLMModelConfig(**model_config),
            prompt_messages=[
                SystemPromptMessage(content=_TOC_SYSTEM_PROMPT),
                UserPromptMessage(content=toc_text),
            ],
            stream=False,
        )
        # always return valid json text payload for downstream json.loads
        yield self.create_text_message(json.dumps(result, ensure_ascii=False))
        yield self.create_json_message(result)
        llm_text = self._get_response_text(response)
        if not llm_text:
            return []
def _parse_with_llm(
    self,
    toc_start: int,
    toc_end: int,
    toc_pages: str,
    model_config: dict[str, Any],
) -> tuple[dict[str, Any] | None, str, str | None]:
    """Ask the LLM to parse raw TOC text; returns (parsed, raw_text, error).

    NOTE(review): body is interleaved with fragments of
    ``_extract_toc_with_llm`` (the ``raw_catalog`` branch, a stray
    ``except`` and a ``return self._build_catalog_from_llm(...)`` that use
    names not defined here). Code kept token-for-token as found;
    reconcile against version control.
    """
    user_content = (
        f"TOC page index range: {toc_start}..{toc_end}\n\n"
        f"TOC raw text:\n{toc_pages}"
    )
    response = self.session.model.llm.invoke(
        model_config=LLMModelConfig(**model_config),
        prompt_messages=[
            SystemPromptMessage(content=_SYSTEM_PROMPT),
            UserPromptMessage(content=user_content),
        ],
        stream=False,
    )
    raw_catalog = self._parse_llm_json(llm_text)
    if not raw_catalog:
        return []
    llm_text = ""
    if hasattr(response, "message") and response.message:
        content = response.message.content
        if isinstance(content, str):
            llm_text = content
        elif isinstance(content, list):
            llm_text = "".join(
                item.data if hasattr(item, "data") else str(item) for item in content
            )
    # Convert the LLM's simple format into a full catalog.
    return self._build_catalog_from_llm(raw_catalog, content_offset, num_pages)
    except Exception:
        return []
    parsed = self._extract_json_object(llm_text)
    if parsed is None:
        return None, llm_text, "Failed to parse LLM output as JSON"
    if not isinstance(parsed, dict):
        return None, llm_text, "LLM output JSON is not an object"
def _build_catalog_from_llm(
self, raw: list[dict], content_offset: int, num_pages: int
) -> list[dict[str, int | str]]:
entries: list[tuple[str, int]] = []
for item in raw:
title = str(item.get("title") or "").strip()
page = self._to_int(item.get("page"), None)
if not title or page is None:
continue
entries.append((title, page))
return parsed, llm_text, None
if not entries:
return []
# 计算偏移量:第一个条目的页码与实际内容起始页的差值
first_printed_page = entries[0][1]
offset = (content_offset + 1) - first_printed_page if content_offset > 0 else 0
result: list[dict[str, int | str]] = []
for i, (title, page) in enumerate(entries):
next_page = entries[i + 1][1] if i + 1 < len(entries) else page
page_start_index = max(0, min(page + offset - 1, num_pages - 1))
page_end_index = max(page_start_index, min(next_page + offset - 2, num_pages - 1))
if i == len(entries) - 1:
page_end_index = num_pages - 1
result.append({
"title": title,
"start": page,
"end": max(page, next_page - 1) if i + 1 < len(entries) else page,
"page_start_index": page_start_index,
"page_end_index": page_end_index,
})
return result
@staticmethod
def _strip_index_lists(text: str) -> str:
    # Stop before common appendix lists that pollute TOC parsing.
    # NOTE(review): truncated fragment — the re.compile(...) argument and
    # the rest of this method's body are missing from this rendering.
    pattern = re.compile(
def _get_response_text(response: Any) -> str:
if not hasattr(response, "message") or not response.message:
return ""
content = response.message.content
if isinstance(content, str):
text = content
elif isinstance(content, list):
text = "".join(
item.data if hasattr(item, "data") else str(item) for item in content
)
else:
text = str(content)
# 清理思考标签
text = re.sub(r"<think>[\s\S]*?</think>", "", text, flags=re.IGNORECASE)
text = re.sub(r"<\|[^>]+\|>", "", text)
return text.strip()
@staticmethod
def _parse_llm_json(text: str) -> list[dict]:
# 尝试提取JSON代码块
code_match = re.search(r"```(?:json)?\s*([\s\S]*?)```", text)
if code_match:
text = code_match.group(1).strip()
# 尝试找到JSON数组
bracket_match = re.search(r"\[[\s\S]*\]", text)
if bracket_match:
text = bracket_match.group(0)
try:
result = json.loads(text)
if isinstance(result, list):
return result
except Exception:
pass
return []
def _catalog_from_metadata(self, toc: list, num_pages: int) -> list[dict[str, int | str]]:
top = [(title, max(0, page - 1)) for level, title, page in toc if level <= 2 and page >= 1]
if not top:
return []
result: list[dict[str, int | str]] = []
for index, (title, start_index) in enumerate(top):
end_index = top[index + 1][1] - 1 if index + 1 < len(top) else num_pages - 1
result.append({
"title": title,
"start": start_index + 1,
"end": max(start_index, end_index) + 1,
"page_start_index": start_index,
"page_end_index": max(start_index, end_index),
})
return result
def _find_toc_pages(self, doc: fitz.Document, num_pages: int) -> tuple[int | None, int | None]:
toc_start = None
toc_end = None
for page_number in range(min(num_pages, 30)):
text = doc[page_number].get_text() or ""
if any(re.search(pattern, text, re.IGNORECASE) for pattern in self._TOC_PATTERNS):
if toc_start is None:
toc_start = page_number
toc_end = page_number
elif toc_start is not None:
break
return toc_start, toc_end
def _parse_toc_lines(self, text: str) -> list[dict[str, int | str]]:
    """Rule-based TOC-line parser.

    NOTE(review): TWO full revisions of this method are interleaved here
    (diff artifact) — one returning a list of dicts, one (second ``def``
    below) returning a dict keyed by title. Duplicate regex patterns,
    dedup loops and result builders must be reconciled against version
    control; code kept token-for-token as found.
    """
    marker = re.search(
        r"^(List\s+of\s+Figures|List\s+of\s+Tables|图目录|表目录)",
        text,
        re.IGNORECASE | re.MULTILINE,
    )
    m = pattern.search(text)
    return text[: m.start()].rstrip() if m else text

def _parse_toc_lines(self, text: str) -> dict[str, dict[str, int]]:
    """Parse lines like:
    1.2 Engine Overview ........ 35
    Appendix A 120
    """
    line_pattern = re.compile(
        r"^\s*(?P<title>.+?)\s*(?:\.{2,}|\s)\s*(?P<page>\d{1,5})\s*$"
    )
    if marker:
        text = text[: marker.start()]
    pattern = re.compile(r"^\s*(?P<title>.+?)\s*(?:\.{2,}|\s)\s*(?P<page>\d{1,5})\s*$")
    entries: list[tuple[str, int]] = []
    for raw in text.splitlines():
        line = raw.strip()
        if not line or len(line) < 3:
            continue
        if re.fullmatch(r"\d+", line):
            if not line or len(line) < 3 or re.fullmatch(r"\d+", line):
                continue
        m = line_pattern.match(line)
        if not m:
            match = pattern.match(line)
            if not match:
                continue
        title = re.sub(r"\s+", " ", m.group("title")).strip("-_: ")
        page = self._to_int(m.group("page"), None)
        if not title or page is None:
            title = re.sub(r"\s+", " ", match.group("title")).strip("-_:")
            page = self._to_int(match.group("page"), None)
            if not title or page is None or len(title) <= 1:
                continue
        # Skip obvious noise.
        if len(title) <= 1 or title.lower() in {"page", "pages", "目录", "contents"}:
            if title.lower() in {"page", "pages", "目录", "contents"}:
                continue
        entries.append((title, page))
        if not entries:
            return {}
        return []
    # Deduplicate keeping earliest appearance.
    dedup: OrderedDict[str, int] = OrderedDict()
    for title, page in entries:
        if title not in dedup:
            dedup[title] = page
        dedup.setdefault(title, page)
    titles = list(dedup.keys())
    pages = [dedup[t] for t in titles]
    pages = [dedup[title] for title in titles]
    result: list[dict[str, int | str]] = []
    for index, title in enumerate(titles):
        start = pages[index]
        end = max(start, pages[index + 1] - 1) if index + 1 < len(pages) else start
        result.append({"title": title, "start": start, "end": end})
    return result
    catalog: dict[str, dict[str, int]] = {}
    for i, title in enumerate(titles):
        start = pages[i]
        if i + 1 < len(pages):
            next_start = pages[i + 1]
            end = max(start, next_start - 1)
        else:
            end = start
        catalog[title] = {"start": int(start), "end": int(end)}
def _attach_page_indexes(
    self, catalog: list[dict[str, int | str]], toc_end: int, num_pages: int
) -> list[dict[str, int | str]]:
    """Map printed page numbers onto 0-based page indexes.

    NOTE(review): interleaved with a second method, ``_normalize_catalog``
    (its ``def`` line and body fragments appear mid-function), and this
    method's loop tail continues AFTER the JSON-helper staticmethods below
    (diff artifact). Code kept token-for-token as found; reconcile against
    version control.
    """
    if not catalog:
        return []
    return catalog
    first_page = None
    for item in catalog:
        start = self._to_int(item.get("start"), None)
        if start is not None and (first_page is None or start < first_page):
            first_page = start

def _normalize_catalog(self, raw: dict[str, Any]) -> dict[str, dict[str, int]]:
    """Coerce an LLM catalog object into ``{title: {"start", "end"}}`` ints."""
    catalog: dict[str, dict[str, int]] = {}
    source = raw.get("catalog") if isinstance(raw.get("catalog"), dict) else raw
    if not isinstance(source, dict):
        return catalog
    if first_page is None:
        return []
    for name, value in source.items():
        if not isinstance(name, str) or not isinstance(value, dict):
            continue
        start = self._to_int(value.get("start"), None)
        end = self._to_int(value.get("end"), start)
    offset = (toc_end + 1) - first_page
    result: list[dict[str, int | str]] = []
    for item in catalog:
        start = self._to_int(item.get("start"), None)
        end = self._to_int(item.get("end"), start)
        if start is None:
            continue
        if end is None:
            end = start
        catalog[name] = {"start": int(start), "end": int(max(start, end))}
    return catalog
@staticmethod
def _extract_json_object(text: str) -> Any:
if not text:
return None
candidates: list[str] = []
code_blocks = re.findall(r"```(?:json)?\s*([\s\S]*?)\s*```", text, flags=re.IGNORECASE)
candidates.extend([c.strip() for c in code_blocks if c.strip()])
brace_candidate = PdfTocTool._extract_first_brace_object(text)
if brace_candidate:
candidates.append(brace_candidate)
candidates.append(text.strip())
for cand in candidates:
parsed = PdfTocTool._json_try_parse(cand)
if parsed is not None:
return parsed
return None
@staticmethod
def _extract_first_brace_object(text: str) -> str | None:
start = text.find("{")
if start < 0:
return None
depth = 0
in_str = False
escape = False
for i in range(start, len(text)):
ch = text[i]
if in_str:
if escape:
escape = False
elif ch == "\\":
escape = True
elif ch == '"':
in_str = False
continue
if ch == '"':
in_str = True
elif ch == "{":
depth += 1
elif ch == "}":
depth -= 1
if depth == 0:
return text[start : i + 1]
return None
@staticmethod
def _json_try_parse(text: str) -> Any:
try:
return json.loads(text)
except Exception:
pass
# Minimal repair: remove trailing commas before } or ]
repaired = re.sub(r",\s*([}\]])", r"\1", text)
try:
return json.loads(repaired)
except Exception:
return None
# NOTE(review): orphaned tail of ``_attach_page_indexes`` — its loop body
# continues here, after the JSON-helper staticmethods (diff artifact).
        page_start_index = max(0, min(start + offset, num_pages - 1))
        page_end_index = max(page_start_index, min(end + offset, num_pages - 1))
        result.append({
            "title": str(item.get("title") or "Untitled"),
            "start": start,
            "end": max(start, end),
            "page_start_index": page_start_index,
            "page_end_index": page_end_index,
        })
    return result

@staticmethod
def _to_int(value: Any, default: int | None) -> int | None:
    # NOTE(review): signature only — the method body lies beyond this view.