import json import re from collections import OrderedDict from collections.abc import Generator from typing import Any from dify_plugin import Tool from dify_plugin.entities.model.llm import LLMModelConfig from dify_plugin.entities.model.message import SystemPromptMessage, UserPromptMessage from dify_plugin.entities.tool import ToolInvokeMessage _SYSTEM_PROMPT = """You parse PDF table-of-contents text. Return only valid JSON object, no markdown fences, no explanation. Output schema: { "Chapter Name": {"start": 1, "end": 5}, "Another": {"start": 6, "end": 20} } Rules: - start/end are integer printed page numbers from TOC. - If end is unknown, use same value as start. - Keep chapter names exactly as in TOC text. """ class PdfTocTool(Tool): def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]: toc_start = self._to_int(tool_parameters.get("toc_start"), None) toc_end = self._to_int(tool_parameters.get("toc_end"), None) toc_pages = (tool_parameters.get("toc_pages") or "").strip() model_config = tool_parameters.get("model") if toc_start is None or toc_end is None: yield self.create_text_message("Error: toc_start and toc_end are required") return if not toc_pages: yield self.create_text_message("Error: toc_pages text is empty") return cleaned = self._strip_index_lists(toc_pages) # 1) deterministic parser first catalog = self._parse_toc_lines(cleaned) # 2) optional LLM fallback/enhance only when deterministic parser gives no result llm_raw_output = "" llm_error = None if not catalog and model_config: llm_catalog, llm_raw_output, llm_error = self._parse_with_llm( toc_start=toc_start, toc_end=toc_end, toc_pages=cleaned, model_config=model_config, ) if llm_catalog: catalog = self._normalize_catalog(llm_catalog) result: dict[str, Any] = { "toc_start": toc_start, "toc_end": toc_end, "catalog": catalog, "meta": { "catalog_size": len(catalog), "parser": "rule" if catalog else "none", }, } if llm_raw_output: result["meta"]["llm_used"] = True if llm_error: result["meta"]["llm_error"] = llm_error # always return valid json text payload for downstream json.loads yield self.create_text_message(json.dumps(result, ensure_ascii=False)) yield self.create_json_message(result) def _parse_with_llm( self, toc_start: int, toc_end: int, toc_pages: str, model_config: dict[str, Any], ) -> tuple[dict[str, Any] | None, str, str | None]: user_content = ( f"TOC page index range: {toc_start}..{toc_end}\n\n" f"TOC raw text:\n{toc_pages}" ) response = self.session.model.llm.invoke( model_config=LLMModelConfig(**model_config), prompt_messages=[ SystemPromptMessage(content=_SYSTEM_PROMPT), UserPromptMessage(content=user_content), ], stream=False, ) llm_text = "" if hasattr(response, "message") and response.message: content = response.message.content if isinstance(content, str): llm_text = content elif isinstance(content, list): llm_text = "".join( item.data if hasattr(item, "data") else str(item) for item in content ) parsed = self._extract_json_object(llm_text) if parsed is None: return None, llm_text, "Failed to parse LLM output as JSON" if not isinstance(parsed, dict): return None, llm_text, "LLM output JSON is not an object" return parsed, llm_text, None @staticmethod def _strip_index_lists(text: str) -> str: # Stop before common appendix lists that pollute TOC parsing. pattern = re.compile( r"^(List\s+of\s+Figures|List\s+of\s+Tables|图目录|表目录)", re.IGNORECASE | re.MULTILINE, ) m = pattern.search(text) return text[: m.start()].rstrip() if m else text def _parse_toc_lines(self, text: str) -> dict[str, dict[str, int]]: """Parse lines like: 1.2 Engine Overview ........ 35 Appendix A 120 """ line_pattern = re.compile( r"^\s*(?P