This commit is contained in:
2026-03-15 13:00:30 +08:00
parent 91ff28bdcf
commit 136ddc270c
15 changed files with 1459 additions and 1276 deletions

Submodule ai-management-platform updated: 96f7c3aa4c...6bbe3e4181

View File

@@ -1,4 +1,4 @@
identity:
identity:
author: "yslg"
name: "pdf"
label:
@@ -13,54 +13,8 @@ identity:
ja_JP: "pdfTools"
icon: "icon.svg"
#########################################################################################
# If you want to support OAuth, you can uncomment the following code.
#########################################################################################
# oauth_schema:
# client_schema:
# - name: "client_id"
# type: "secret-input"
# required: true
# url: https://example.com/oauth/authorize
# placeholder:
# en_US: "Please input your Client ID"
# zh_Hans: "请输入你的 Client ID"
# pt_BR: "Insira seu Client ID"
# help:
# en_US: "Client ID is used to authenticate requests to the example.com API."
# zh_Hans: "Client ID 用于认证请求到 example.com API。"
# pt_BR: "Client ID é usado para autenticar solicitações à API do example.com."
# label:
# zh_Hans: "Client ID"
# en_US: "Client ID"
# - name: "client_secret"
# type: "secret-input"
# required: true
# url: https://example.com/oauth/authorize
# placeholder:
# en_US: "Please input your Client Secret"
# zh_Hans: "请输入你的 Client Secret"
# pt_BR: "Insira seu Client Secret"
# help:
# en_US: "Client Secret is used to authenticate requests to the example.com API."
# zh_Hans: "Client Secret 用于认证请求到 example.com API。"
# pt_BR: "Client Secret é usado para autenticar solicitações à API do example.com."
# label:
# zh_Hans: "Client Secret"
# en_US: "Client Secret"
# credentials_schema:
# - name: "access_token"
# type: "secret-input"
# label:
# zh_Hans: "Access Token"
# en_US: "Access Token"
tools:
- tools/pdf_column_range.yaml
- tools/pdf_single_page.yaml
- tools/pdf_summary.yaml
- tools/pdf_toc.yaml
- tools/pdf_extract_range.yaml
- tools/pdf_to_markdown.yaml
extra:
python:

View File

@@ -1,107 +0,0 @@
import json
import re
from collections.abc import Generator
from io import BytesIO
from typing import Any
import fitz # PyMuPDF 核心库
from dify_plugin import Tool
from dify_plugin.entities.tool import ToolInvokeMessage
class PdfTool(Tool):
    """Locate the table-of-contents (TOC) pages of a PDF and return their text."""

    def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
        """Find the first contiguous run of TOC-looking pages and extract them.

        Parameters (via ``tool_parameters``):
            file: uploaded PDF file object; its ``blob`` attribute holds the raw bytes.

        Yields:
            A text message (JSON string) and a JSON message with keys
            ``start``/``end`` (0-based page indexes, or None when no TOC page
            was detected), ``pages`` (list of page texts) and ``pages_text``
            (all TOC pages joined with newlines).
        """
        file = tool_parameters.get("file")
        if not file:
            yield self.create_text_message("Error: file is required")
            return
        # Load the PDF from the in-memory byte stream (PyMuPDF).
        pdf_bytes = file.blob
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        toc_start = None
        toc_end = None
        toc_pages = []
        try:
            num_pages = len(doc)
            # Headings that typically mark a TOC page (Chinese / English / Japanese).
            toc_patterns = [
                r'目录',
                r'目 录',
                r'\u3000录',
                r'Table of Contents',
                r'Contents',
                r'目次'
            ]
            # Scan pages in order; the first contiguous run of matching pages
            # is treated as the TOC range.
            for page_num in range(num_pages):
                text = doc[page_num].get_text() or ""
                if any(re.search(pattern, text, re.IGNORECASE) for pattern in toc_patterns):
                    if toc_start is None:
                        toc_start = page_num
                    toc_end = page_num
                elif toc_start is not None and toc_end is not None:
                    # First non-matching page after the run started: range ended.
                    break
            # Collect the text of every page in the detected range (inclusive).
            if toc_start is not None and toc_end is not None:
                for page_num in range(toc_start, toc_end + 1):
                    toc_pages.append(doc[page_num].get_text() or "")
        finally:
            # Always release the document, even if text extraction raises
            # (the original leaked the handle on error).
            doc.close()
        result = {
            "start": toc_start,
            "end": toc_end,
            "pages": toc_pages,
            "pages_text": "\n".join(toc_pages) if toc_pages else "",
        }
        yield self.create_text_message(json.dumps(result, ensure_ascii=False))
        yield self.create_json_message(result)
if __name__ == "__main__":
    # Manual smoke test against a local file (PyMuPDF).
    pdf_path = r"F:\Project\urbanLifeline\docs\AI训练资料\菱重S12R发动机说明书.pdf"
    doc = fitz.open(pdf_path)  # open directly from the local path
    num_pages = len(doc)
    toc_start = None
    toc_end = None
    # Same TOC headings as PdfTool._invoke (Chinese / English / Japanese).
    toc_patterns = [
        r'目录',
        r'目 录',
        r'\u3000录',
        r'Table of Contents',
        r'Contents',
        r'目次'
    ]
    # Scan for the first contiguous run of TOC-looking pages.
    for page_num in range(num_pages):
        page = doc[page_num]
        text = page.get_text() or ""
        if any(re.search(pattern, text, re.IGNORECASE) for pattern in toc_patterns):
            if toc_start is None:
                toc_start = page_num
            toc_end = page_num
        elif toc_start is not None and toc_end is not None:
            break
    # Fall back to a fixed guess when no TOC heading was detected.
    toc_start = toc_start if toc_start is not None else 18
    toc_end = toc_end if toc_end is not None else toc_start + 9
    # Extract the TOC pages. Use toc_end + 1 so the range is inclusive,
    # matching PdfTool._invoke — the original range(toc_start, toc_end)
    # silently dropped the last TOC page.
    toc_pages = []
    for page_num in range(toc_start, toc_end + 1):
        page = doc[page_num]
        toc_pages.append(page.get_text() or "")
    print(toc_start, toc_end, toc_pages)
    doc.close()  # release the document

View File

@@ -1,36 +0,0 @@
identity:
name: "pdf"
author: "yslg"
label:
en_US: "Extract TOC Pages and Content"
zh_Hans: "提取目录页和内容"
pt_BR: "Extrair páginas de sumário e conteúdo"
ja_JP: "目次ページと内容を抽出"
description:
human:
en_US: "Extract table-of-contents page range and all page text in that range"
zh_Hans: "提取目录页范围以及该范围内所有页文本"
pt_BR: "Extrair intervalo de páginas de sumário e todo o texto nesse intervalo"
ja_JP: "目次ページ範囲とその範囲内の全ページテキストを抽出"
llm: "Extract table-of-contents page range and all page text in that range"
parameters:
- name: file
type: file
required: true
label:
en_US: PDF File
zh_Hans: PDF 文件
pt_BR: Arquivo PDF
ja_JP: PDFファイル
human_description:
en_US: "PDF file to process"
zh_Hans: "要处理的 PDF 文件"
pt_BR: "Arquivo PDF para processar"
ja_JP: "処理するPDFファイル"
llm_description: "PDF file to process, output contains start/end/pages"
form: llm
fileTypes:
- "pdf"
extra:
python:
source: tools/pdf_column_range.py

View File

@@ -1,48 +0,0 @@
import json
from collections.abc import Generator
from typing import Any
import fitz # PyMuPDF
from dify_plugin import Tool
from dify_plugin.entities.tool import ToolInvokeMessage
class PdfExtractRangeTool(Tool):
    """Extract plain text from an inclusive, 0-based page range of a PDF."""

    def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
        """Extract text from pages ``start_page``..``end_page`` (inclusive).

        Out-of-range bounds are clamped to valid page indexes, and the end
        bound never precedes the start. Yields a text message (JSON string)
        and a JSON message with keys ``start``, ``end``, ``total_pages`` and
        ``text`` (pages joined by a page-break separator).
        """
        file = tool_parameters.get("file")
        if not file:
            yield self.create_text_message("Error: file is required")
            return
        start_page = int(tool_parameters.get("start_page", 0))
        end_page = int(tool_parameters.get("end_page", 0))
        # Open the PDF from the in-memory byte stream.
        pdf_bytes = file.blob
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        try:
            num_pages = len(doc)
            # Clamp both bounds to [0, num_pages - 1]; end >= start.
            start_page = max(0, min(start_page, num_pages - 1))
            end_page = max(start_page, min(end_page, num_pages - 1))
            # Extract text page by page; sort=True keeps reading order.
            page_texts = []
            for page_idx in range(start_page, end_page + 1):
                page = doc[page_idx]
                text = page.get_text("text", sort=True) or ""
                page_texts.append(text)
        finally:
            # Release the document even if extraction raises (the original
            # leaked the handle on error).
            doc.close()
        # Join all pages with the page-break separator.
        full_text = "\n\n--- 分页 ---\n\n".join(page_texts)
        result = {
            "start": start_page,
            "end": end_page,
            "total_pages": end_page - start_page + 1,
            "text": full_text,
        }
        yield self.create_text_message(json.dumps(result, ensure_ascii=False))
        yield self.create_json_message(result)

View File

@@ -1,68 +0,0 @@
identity:
name: "pdf_extract_range"
author: "yslg"
label:
en_US: "Extract Page Range Text"
zh_Hans: "提取页面范围文本"
pt_BR: "Extrair Texto do Intervalo de Páginas"
ja_JP: "ページ範囲テキスト抽出"
description:
human:
en_US: "Extract plain text from a specified page range of a PDF file"
zh_Hans: "从PDF文件的指定页码范围提取纯文本"
pt_BR: "Extrair texto simples de um intervalo de páginas especificado de um arquivo PDF"
ja_JP: "PDFファイルの指定ページ範囲からプレーンテキストを抽出"
llm: "Extract plain text from PDF pages in the given start-end range. Returns concatenated text of all pages in range."
parameters:
- name: file
type: file
required: true
label:
en_US: PDF File
zh_Hans: PDF 文件
pt_BR: Arquivo PDF
ja_JP: PDFファイル
human_description:
en_US: "PDF file to extract text from"
zh_Hans: "要提取文本的 PDF 文件"
pt_BR: "Arquivo PDF para extrair texto"
ja_JP: "テキストを抽出するPDFファイル"
llm_description: "PDF file to extract page range text from"
form: llm
fileTypes:
- "pdf"
- name: start_page
type: number
required: true
label:
en_US: Start Page
zh_Hans: 起始页码
pt_BR: Página Inicial
ja_JP: 開始ページ
human_description:
en_US: "Start page index (0-based)"
zh_Hans: "起始页码从0开始"
pt_BR: "Índice da página inicial (base 0)"
ja_JP: "開始ページ番号0始まり"
llm_description: "Start page index (0-based)"
form: llm
default: 0
- name: end_page
type: number
required: true
label:
en_US: End Page
zh_Hans: 结束页码
pt_BR: Página Final
ja_JP: 終了ページ
human_description:
en_US: "End page index (0-based, inclusive)"
zh_Hans: "结束页码从0开始包含该页"
pt_BR: "Índice da página final (base 0, inclusivo)"
ja_JP: "終了ページ番号0始まり、含む"
llm_description: "End page index (0-based, inclusive)"
form: llm
default: 0
extra:
python:
source: tools/pdf_extract_range.py

View File

@@ -1,45 +0,0 @@
import json
from collections.abc import Generator
from io import BytesIO
from typing import Any
import fitz # PyMuPDF 核心库
from dify_plugin import Tool
from dify_plugin.entities.tool import ToolInvokeMessage
class PdfSinglePageTool(Tool):
    """Extract text from a single page of a PDF."""

    def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
        """Extract text from the page given by the 0-based ``page`` parameter.

        Out-of-range indexes are clamped to [0, num_pages - 1]. Yields a text
        message (JSON string) and a JSON message with ``start``/``end`` (the
        clamped index) and ``pages`` (single-element list of page text).
        """
        file = tool_parameters.get("file")
        page = tool_parameters.get("page", 0)
        if not file:
            yield self.create_text_message("Error: file is required")
            return
        # Load the PDF from the in-memory byte stream (PyMuPDF).
        pdf_bytes = file.blob
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        try:
            num_pages = len(doc)
            # Clamp the requested index into the valid page range
            # (same semantics as the original two-step bounds check).
            page_index = max(0, min(int(page), num_pages - 1))
            # Extract the selected page's text.
            selected_page = doc[page_index]
            text = selected_page.get_text() or ""
        finally:
            # Release the document even if extraction raises (the original
            # leaked the handle on error).
            doc.close()
        result = {
            "start": page_index,
            "end": page_index,
            "pages": [text]
        }
        yield self.create_text_message(json.dumps(result, ensure_ascii=False))
        yield self.create_json_message(result)

View File

@@ -1,52 +0,0 @@
identity:
name: "pdf_single_page"
author: "yslg"
label:
en_US: "Extract Single-Page Text"
zh_Hans: "提取单页文字"
pt_BR: "Extrair texto de página única"
ja_JP: "単一ページのテキストを抽出"
description:
human:
en_US: "Extract text from one specified page"
zh_Hans: "提取指定单页文字"
pt_BR: "Extrair texto de uma página especificada"
ja_JP: "指定した1ページのテキストを抽出"
llm: "Extract text from one specified page"
parameters:
- name: file
type: file
required: true
label:
en_US: PDF File
zh_Hans: PDF 文件
pt_BR: Arquivo PDF
ja_JP: PDFファイル
human_description:
en_US: "PDF file to process"
zh_Hans: "要处理的 PDF 文件"
pt_BR: "Arquivo PDF para processar"
ja_JP: "処理するPDFファイル"
llm_description: "PDF file to process"
form: llm
fileTypes:
- "pdf"
- name: page
type: number
required: true
label:
en_US: Page Index
zh_Hans: 页码
pt_BR: Índice da Página
ja_JP: ページ番号
human_description:
en_US: "Single page index to extract"
zh_Hans: "要提取的单页页码"
pt_BR: "Índice da página única para extrair"
ja_JP: "抽出対象のページ番号"
llm_description: "Single page index to extract"
form: llm
default: 0
extra:
python:
source: tools/pdf_single_page.py

View File

@@ -1,209 +0,0 @@
import json
import re
from collections.abc import Generator
from typing import Any
import fitz
from dify_plugin import Tool
from dify_plugin.entities.model.llm import LLMModelConfig
from dify_plugin.entities.model.message import SystemPromptMessage, UserPromptMessage
from dify_plugin.entities.tool import ToolInvokeMessage
class PdfSummaryTool(Tool):
    """Fast PDF page summary tool.

    Default behavior is optimized for throughput in large workflows:
    - Extract plain text and lightweight table data only.
    - Skip expensive image base64 and drawing path extraction.
    - Skip LLM by default unless `use_llm=true` is explicitly passed.
    """

    def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
        """Extract text/tables from a page range and yield a summary.

        Yields a JSON message with the structured per-page data first, then a
        text message with the locally built (or optionally LLM-refined)
        Markdown summary.
        """
        file = tool_parameters.get("file")
        if not file:
            yield self.create_text_message("Error: file is required")
            return
        start_page = self._to_int(tool_parameters.get("pdf_start_page"), 0)
        end_page = self._to_int(tool_parameters.get("pdf_end_page"), 0)
        model_config = tool_parameters.get("model")
        use_llm = self._to_bool(tool_parameters.get("use_llm"), False)
        # Clamp the per-page character budget to a sane window.
        max_chars_per_page = self._to_int(tool_parameters.get("max_chars_per_page"), 6000)
        max_chars_per_page = max(800, min(max_chars_per_page, 20000))
        llm_prompt = tool_parameters.get(
            "llm_prompt",
            "请基于输入的PDF页面文本做简洁准确摘要输出中文要点。不要输出思考过程。",
        )
        pdf_bytes = file.blob
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        try:
            num_pages = len(doc)
            # Clamp the requested range to valid 0-based indexes; end >= start.
            start_page = max(0, min(start_page, num_pages - 1))
            end_page = max(start_page, min(end_page, num_pages - 1))
            pages_data: list[dict[str, Any]] = []
            for page_idx in range(start_page, end_page + 1):
                page = doc[page_idx]
                page_data = self._extract_page_fast(page, page_idx, max_chars_per_page)
                pages_data.append(page_data)
            result = {
                "total_pages_extracted": len(pages_data),
                "page_range": {"start": start_page, "end": end_page},
                "pages": pages_data,
            }
            yield self.create_json_message(result)
            # Fast local summary first (deterministic, no model latency).
            local_text = self._build_local_summary(pages_data)
            # Optional LLM refinement, explicitly enabled only.
            if use_llm and model_config:
                refined = self._summarize_with_llm(local_text, llm_prompt, model_config)
                final_text = refined if refined else local_text
            else:
                final_text = local_text
            if final_text:
                yield self.create_text_message(final_text)
        finally:
            doc.close()

    def _extract_page_fast(self, page: fitz.Page, page_idx: int, max_chars_per_page: int) -> dict[str, Any]:
        """Extract text plus lightweight table data from one page.

        Image/drawing/block fields are intentionally left empty to keep the
        tool fast in large workflows.
        """
        text = (page.get_text("text") or "").strip()
        if len(text) > max_chars_per_page:
            # Cap very long pages so downstream prompts stay bounded.
            text = text[:max_chars_per_page] + "\n...[truncated]"
        tables: list[dict[str, Any]] = []
        try:
            tabs = page.find_tables()
            # Keep at most 3 tables, and at most 10 rows of cells per table.
            for tab_idx, tab in enumerate(tabs.tables[:3]):
                cells = tab.extract() or []
                tables.append(
                    {
                        "index": tab_idx,
                        "rows": tab.row_count,
                        "cols": tab.col_count,
                        "cells": cells[:10],
                    }
                )
        except Exception:
            # Table detection is best-effort; ignore parser failures.
            pass
        return {
            "page_number": page_idx,
            "text": text,
            "tables": tables,
            "images": [],
            "drawings_summary": [],
            "text_blocks": [],
            "width": float(page.rect.width),
            "height": float(page.rect.height),
        }

    def _build_local_summary(self, pages_data: list[dict[str, Any]]) -> str:
        """Output actual page content as Markdown (text + tables).

        No LLM needed downstream — the text is already usable Markdown.
        """
        parts: list[str] = []
        for page in pages_data:
            text = (page.get("text") or "").strip()
            tables = page.get("tables") or []
            page_parts: list[str] = []
            if text:
                page_parts.append(text)
            for tab in tables:
                cells = tab.get("cells") or []
                # Need at least a header row plus one data row.
                if len(cells) >= 2:
                    md = self._cells_to_md_table(cells)
                    if md:
                        page_parts.append(md)
            if page_parts:
                parts.append("\n\n".join(page_parts))
        return "\n\n--- 分页 ---\n\n".join(parts)

    @staticmethod
    def _cells_to_md_table(cells: list) -> str:
        """Render table cells (list of rows) as a Markdown table string."""
        if not cells:
            return ""
        header = cells[0]
        ncols = len(header)
        if ncols == 0:
            return ""

        # Named function instead of a lambda bound to a name (PEP 8 E731);
        # escapes pipes and flattens newlines so cells stay on one row.
        def clean(value: Any) -> str:
            return str(value or "").replace("|", "\\|").replace("\n", " ").strip()

        lines = [
            "| " + " | ".join(clean(c) for c in header) + " |",
            "| " + " | ".join("---" for _ in range(ncols)) + " |",
        ]
        for row in cells[1:]:
            # Pad short rows so every line has exactly ncols cells.
            padded = list(row) + [""] * max(0, ncols - len(row))
            lines.append("| " + " | ".join(clean(c) for c in padded[:ncols]) + " |")
        return "\n".join(lines)

    def _summarize_with_llm(self, local_text: str, llm_prompt: str, model_config: dict[str, Any]) -> str:
        """Refine the local summary with the configured LLM (non-streaming).

        Returns the cleaned model answer, or "" when nothing usable came back.
        """
        response = self.session.model.llm.invoke(
            model_config=LLMModelConfig(**model_config),
            prompt_messages=[
                SystemPromptMessage(content=llm_prompt),
                UserPromptMessage(content=local_text),
            ],
            stream=False,
        )
        llm_text = ""
        if hasattr(response, "message") and response.message:
            content = response.message.content
            if isinstance(content, str):
                llm_text = content
            elif isinstance(content, list):
                # Content may be a list of parts; join their textual payloads.
                llm_text = "".join(
                    item.data if hasattr(item, "data") else str(item)
                    for item in content
                )
        return self._extract_visible_answer(llm_text)

    @staticmethod
    def _extract_visible_answer(text: str) -> str:
        """Strip model reasoning/control markup, keeping only the visible answer."""
        if not text:
            return ""
        # Prefer the boxed answer when the model emitted one.
        box_match = re.search(r"<\|begin_of_box\|>([\s\S]*?)<\|end_of_box\|>", text)
        if box_match:
            text = box_match.group(1)
        else:
            # Otherwise drop <think> blocks and any <|...|> control tokens.
            text = re.sub(r"<think>[\s\S]*?</think>", "", text, flags=re.IGNORECASE)
            text = re.sub(r"<\|[^>]+\|>", "", text)
        return text.strip()

    @staticmethod
    def _to_int(value: Any, default: int) -> int:
        """Best-effort int conversion; return ``default`` on None/""/error."""
        try:
            if value is None or value == "":
                return default
            return int(value)
        except Exception:
            return default

    @staticmethod
    def _to_bool(value: Any, default: bool) -> bool:
        """Best-effort bool conversion of bools and common truthy/falsy strings."""
        if value is None:
            return default
        if isinstance(value, bool):
            return value
        s = str(value).strip().lower()
        if s in {"1", "true", "yes", "on"}:
            return True
        if s in {"0", "false", "no", "off"}:
            return False
        return default

View File

@@ -1,99 +0,0 @@
identity:
name: "pdf_summary"
author: "yslg"
label:
en_US: "PDF Page Summary"
zh_Hans: "PDF页面概述"
pt_BR: "Resumo de Página PDF"
ja_JP: "PDFページ概要"
description:
human:
en_US: "Extract core elements (text, image, table, path) from PDF pages with coordinates, then summarize via LLM"
zh_Hans: "提取PDF页面核心元素文本、图片、表格、路径及坐标并通过LLM进行概述"
pt_BR: "Extrair elementos principais (texto, imagem, tabela, caminho) de páginas PDF com coordenadas e resumir via LLM"
ja_JP: "PDFページからコア要素テキスト、画像、テーブル、パスを座標付きで抽出し、LLMで要約"
llm: "Extract core elements (text, image, table, drawing path) with coordinates from specified PDF page range, then use LLM to summarize the content"
parameters:
- name: file
type: file
required: true
label:
en_US: PDF File
zh_Hans: PDF 文件
pt_BR: Arquivo PDF
ja_JP: PDFファイル
human_description:
en_US: "PDF file to process"
zh_Hans: "要处理的 PDF 文件"
pt_BR: "Arquivo PDF para processar"
ja_JP: "処理するPDFファイル"
llm_description: "PDF file to extract elements from and summarize"
form: llm
fileTypes:
- "pdf"
- name: pdf_start_page
type: number
required: true
label:
en_US: Start Page
zh_Hans: 起始页码
pt_BR: Página Inicial
ja_JP: 開始ページ
human_description:
en_US: "Start page index (0-based)"
zh_Hans: "起始页码从0开始"
pt_BR: "Índice da página inicial (base 0)"
ja_JP: "開始ページ番号0始まり"
llm_description: "Start page index (0-based) for element extraction"
form: llm
default: 0
- name: pdf_end_page
type: number
required: true
label:
en_US: End Page
zh_Hans: 结束页码
pt_BR: Página Final
ja_JP: 終了ページ
human_description:
en_US: "End page index (0-based, inclusive)"
zh_Hans: "结束页码从0开始包含该页"
pt_BR: "Índice da página final (base 0, inclusivo)"
ja_JP: "終了ページ番号0始まり、含む"
llm_description: "End page index (0-based, inclusive) for element extraction"
form: llm
default: 0
- name: model
type: model-selector
scope: llm
required: true
label:
en_US: LLM Model
zh_Hans: LLM 模型
pt_BR: Modelo LLM
ja_JP: LLMモデル
human_description:
en_US: "LLM model used for summarizing extracted content"
zh_Hans: "用于概述提取内容的 LLM 模型"
pt_BR: "Modelo LLM usado para resumir o conteúdo extraído"
ja_JP: "抽出内容の要約に使用するLLMモデル"
form: form
- name: llm_prompt
type: string
required: false
label:
en_US: LLM Prompt
zh_Hans: LLM 提示词
pt_BR: Prompt do LLM
ja_JP: LLMプロンプト
human_description:
en_US: "System prompt for LLM summarization"
zh_Hans: "LLM 概述的系统提示词"
pt_BR: "Prompt do sistema para resumo LLM"
ja_JP: "LLM要約用のシステムプロンプト"
llm_description: "System prompt guiding LLM on how to summarize the extracted PDF content"
form: form
default: "你是一个专业的文档分析助手。请根据以下从PDF页面中提取的结构化内容包含文本、图片信息、表格和矢量图形对每页内容进行准确、简洁的概述。"
extra:
python:
source: tools/pdf_summary.py

View File

@@ -1,6 +1,5 @@
import base64
import json
import re
from collections import OrderedDict
from collections.abc import Generator
from typing import Any
@@ -10,306 +9,219 @@ from dify_plugin.entities.tool import ToolInvokeMessage
class PdfToMarkdownTool(Tool):
"""Convert PDF to a single Markdown file. No LLM needed.
- Auto-detect TOC and organize content by chapters.
- Extract text and tables as Markdown.
- Embed raster images as base64.
- Render vector drawings as base64 PNG.
- Output one .md file via create_blob_message.
"""
_TOC_PATTERNS = [
r"目录", r"目 录", r"\u3000录",
r"Table of Contents", r"Contents", r"目次",
]
# ── entry point ──────────────────────────────────────────
"""Convert PDF to Markdown using an external catalog array."""
def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
file = tool_parameters.get("file")
catalog_text = (tool_parameters.get("catalog") or "").strip()
if not file:
yield self.create_text_message("Error: file is required")
return
if not catalog_text:
yield self.create_text_message("Error: catalog is required")
return
include_images = self._to_bool(tool_parameters.get("include_images"), True)
image_dpi = self._to_int(tool_parameters.get("image_dpi"), 150)
image_dpi = max(72, min(image_dpi, 300))
max_image_bytes = 2 * 1024 * 1024 # skip images > 2 MB raw
catalog = self._parse_catalog(catalog_text)
if not catalog:
yield self.create_text_message("Error: catalog must be a JSON array with title and page indexes")
return
doc = fitz.open(stream=file.blob, filetype="pdf")
try:
num_pages = len(doc)
hf_texts = self._detect_headers_footers(doc, num_pages)
page_mds = [self._page_to_markdown(doc[index], hf_texts) for index in range(num_pages)]
final_md = self._assemble_by_catalog(catalog, page_mds, num_pages)
# 1) Build chapter map (metadata TOC → printed TOC → none)
chapters, content_offset = self._build_chapter_map(doc, num_pages)
# 2) Convert every page
page_mds: list[str] = []
for idx in range(num_pages):
md = self._page_to_markdown(
doc, doc[idx], idx,
include_images, image_dpi, max_image_bytes,
)
page_mds.append(md)
# 3) Assemble
if chapters:
final_md = self._assemble_by_chapters(
chapters, page_mds, content_offset, num_pages,
)
else:
final_md = "\n\n---\n\n".join(m for m in page_mds if m.strip())
# 4) Output: text (for variable aggregation) + blob (.md file)
yield self.create_text_message(final_md)
md_bytes = final_md.encode("utf-8")
yield self.create_blob_message(
blob=md_bytes,
blob=final_md.encode("utf-8"),
meta={"mime_type": "text/markdown"},
)
finally:
doc.close()
# ── chapter detection ────────────────────────────────────
def _parse_catalog(self, catalog_text: str) -> list[dict[str, Any]]:
try:
raw = json.loads(catalog_text)
except Exception:
return []
def _build_chapter_map(
self, doc: fitz.Document, num_pages: int,
) -> tuple[dict, int]:
"""Return (chapters_dict, content_offset).
if not isinstance(raw, list):
return []
Try embedded PDF TOC metadata first (reliable page mapping).
Fall back to scanning printed TOC pages.
"""
toc = doc.get_toc()
if toc:
chapters = self._chapters_from_metadata(toc, num_pages)
if chapters:
return chapters, 0
result: list[dict[str, Any]] = []
for item in raw:
if not isinstance(item, dict):
continue
toc_start, toc_end = self._find_toc_pages(doc, num_pages)
if toc_start is not None and toc_end is not None:
toc_text = "\n".join(
doc[i].get_text() or "" for i in range(toc_start, toc_end + 1)
title = str(item.get("title") or "").strip() or "Untitled"
start_index = self._to_int(item.get("page_start_index"), None)
end_index = self._to_int(item.get("page_end_index"), start_index)
if start_index is None:
start = self._to_int(item.get("start"), None)
end = self._to_int(item.get("end"), start)
if start is None:
continue
start_index = max(0, start - 1)
end_index = max(start_index, (end if end is not None else start) - 1)
if end_index is None:
end_index = start_index
result.append(
{
"title": title,
"page_start_index": max(0, start_index),
"page_end_index": max(start_index, end_index),
}
)
chapters = self._parse_toc_lines(toc_text)
if chapters:
offset = self._guess_offset(chapters, toc_end)
return chapters, offset
return result
return {}, 0
def _detect_headers_footers(self, doc: fitz.Document, num_pages: int) -> set[str]:
margin_ratio = 0.08
sample_count = min(num_pages, 30)
text_counts: dict[str, int] = {}
def _chapters_from_metadata(
self, toc: list, num_pages: int,
) -> dict[str, dict[str, int]]:
top = [(t, max(0, p - 1)) for lvl, t, p in toc if lvl <= 2 and p >= 1]
if not top:
return {}
chapters: dict[str, dict[str, int]] = OrderedDict()
for i, (title, start) in enumerate(top):
end = top[i + 1][1] - 1 if i + 1 < len(top) else num_pages - 1
chapters[title] = {"start": start, "end": max(start, end)}
return chapters
def _find_toc_pages(self, doc, num_pages):
toc_start = toc_end = None
for pn in range(min(num_pages, 30)):
text = doc[pn].get_text() or ""
if any(re.search(p, text, re.IGNORECASE) for p in self._TOC_PATTERNS):
if toc_start is None:
toc_start = pn
toc_end = pn
elif toc_start is not None:
break
return toc_start, toc_end
def _parse_toc_lines(self, text: str) -> dict[str, dict[str, int]]:
m = re.search(
r"^(List\s+of\s+Figures|List\s+of\s+Tables|图目录|表目录)",
text, re.IGNORECASE | re.MULTILINE,
)
if m:
text = text[: m.start()]
pat = re.compile(
r"^\s*(?P<title>.+?)\s*(?:\.{2,}|\s)\s*(?P<page>\d{1,5})\s*$"
)
entries: list[tuple[str, int]] = []
for raw in text.splitlines():
line = raw.strip()
if not line or len(line) < 3 or re.fullmatch(r"\d+", line):
for idx in range(sample_count):
page = doc[idx]
page_height = page.rect.height
top_limit = page_height * margin_ratio
bottom_limit = page_height * (1 - margin_ratio)
try:
blocks = page.get_text("blocks", sort=True) or []
except Exception:
continue
m2 = pat.match(line)
if not m2:
seen: set[str] = set()
for block in blocks:
if len(block) < 7 or block[6] != 0:
continue
title = re.sub(r"\s+", " ", m2.group("title")).strip("-_: ")
page = self._to_int(m2.group("page"), None)
if not title or page is None or len(title) <= 1:
y0, y1 = block[1], block[3]
text = (block[4] or "").strip()
if not text or len(text) < 2 or text in seen:
continue
if title.lower() in {"page", "pages", "目录", "contents"}:
continue
entries.append((title, page))
if y1 <= top_limit or y0 >= bottom_limit:
seen.add(text)
text_counts[text] = text_counts.get(text, 0) + 1
if not entries:
return {}
threshold = max(3, sample_count * 0.35)
return {text for text, count in text_counts.items() if count >= threshold}
dedup: OrderedDict[str, int] = OrderedDict()
for t, p in entries:
dedup.setdefault(t, p)
titles = list(dedup.keys())
pages = [dedup[t] for t in titles]
catalog: dict[str, dict[str, int]] = OrderedDict()
for i, t in enumerate(titles):
s = pages[i]
e = max(s, pages[i + 1] - 1) if i + 1 < len(pages) else s
catalog[t] = {"start": s, "end": e}
return catalog
@staticmethod
def _guess_offset(chapters: dict, toc_end: int) -> int:
first_page = None
for info in chapters.values():
s = info["start"]
if first_page is None or s < first_page:
first_page = s
if first_page is None:
return 0
return (toc_end + 1) - first_page
# ── per-page conversion ──────────────────────────────────
def _page_to_markdown(
self,
doc: fitz.Document,
page: fitz.Page,
page_idx: int,
include_images: bool,
image_dpi: int,
max_image_bytes: int,
) -> str:
def _page_to_markdown(self, page: fitz.Page, hf_texts: set[str]) -> str:
parts: list[str] = []
page_height = page.rect.height
top_margin = page_height * 0.06
bottom_margin = page_height * 0.94
table_rects: list[fitz.Rect] = []
table_mds: list[str] = []
try:
find_tables = getattr(page, "find_tables", None)
tables = []
if callable(find_tables):
table_finder = find_tables()
tables = getattr(table_finder, "tables", []) or []
for table in tables[:5]:
try:
table_rects.append(fitz.Rect(table.bbox))
except Exception:
pass
cells = table.extract() or []
if len(cells) < 2:
continue
if hf_texts and len(cells) <= 3:
flat = " ".join(str(cell or "") for row in cells for cell in row)
if any(hf in flat for hf in hf_texts):
continue
md_table = self._cells_to_md_table(cells)
if md_table:
table_mds.append(md_table)
except Exception:
pass
try:
blocks = page.get_text("blocks", sort=True) or []
except Exception:
blocks = []
for block in blocks:
if len(block) < 7 or block[6] != 0:
continue
x0, y0, x1, y1 = block[:4]
text = (block[4] or "").strip()
if not text:
continue
block_rect = fitz.Rect(x0, y0, x1, y1)
if any(self._rects_overlap(block_rect, table_rect) for table_rect in table_rects):
continue
if hf_texts and (y1 <= top_margin or y0 >= bottom_margin):
if any(hf in text for hf in hf_texts):
continue
if re.fullmatch(r"\s*\d{1,4}\s*", text):
continue
# ── text ──
text = (page.get_text("text", sort=True) or "").strip()
if text:
parts.append(text)
# ── tables → Markdown ──
try:
for tab in (page.find_tables().tables or [])[:5]:
cells = tab.extract() or []
if len(cells) >= 2:
md = self._cells_to_md_table(cells)
if md:
parts.append(md)
except Exception:
pass
if not include_images:
parts.extend(table_mds)
return "\n\n".join(parts)
# ── embedded raster images ──
try:
for img_idx, img_info in enumerate(page.get_images(full=True)):
xref = img_info[0]
try:
data = doc.extract_image(xref)
if not data or not data.get("image"):
continue
raw = data["image"]
if len(raw) > max_image_bytes:
continue
# skip tiny icons (< 20x20)
w = data.get("width", 0)
h = data.get("height", 0)
if w < 20 and h < 20:
continue
ext = data.get("ext", "png")
mime = "image/jpeg" if ext in ("jpg", "jpeg") else f"image/{ext}"
b64 = base64.b64encode(raw).decode("ascii")
parts.append(
f"![img-p{page_idx}-{img_idx}](data:{mime};base64,{b64})"
)
except Exception:
pass
except Exception:
pass
# ── vector drawings → render as PNG ──
try:
drawings = page.get_drawings()
if len(drawings) >= 3:
valid_rects: list[fitz.Rect] = []
for d in drawings:
r = d.get("rect")
if r:
try:
rect = fitz.Rect(r)
if rect.is_valid and not rect.is_empty:
valid_rects.append(rect)
except Exception:
pass
if valid_rects:
bbox = valid_rects[0]
for r in valid_rects[1:]:
bbox |= r
bbox &= page.rect
if bbox.width > 30 and bbox.height > 30:
scale = image_dpi / 72
mat = fitz.Matrix(scale, scale)
pix = page.get_pixmap(matrix=mat, clip=bbox)
png = pix.tobytes("png")
if len(png) <= max_image_bytes:
b64 = base64.b64encode(png).decode("ascii")
parts.append(
f"![drawing-p{page_idx}](data:image/png;base64,{b64})"
)
except Exception:
pass
return "\n\n".join(parts)
# ── assembly ─────────────────────────────────────────────
def _assemble_by_chapters(
self,
chapters: dict[str, dict[str, int]],
page_mds: list[str],
offset: int,
num_pages: int,
) -> str:
def _assemble_by_catalog(self, catalog: list[dict[str, Any]], page_mds: list[str], num_pages: int) -> str:
parts: list[str] = []
for name, info in chapters.items():
s = info["start"] + offset
e = info["end"] + offset
s = max(0, min(s, num_pages - 1))
e = max(s, min(e, num_pages - 1))
ch: list[str] = [f"# {name}\n"]
for idx in range(s, e + 1):
if idx < len(page_mds) and page_mds[idx].strip():
ch.append(page_mds[idx])
parts.append("\n\n".join(ch))
return "\n\n---\n\n".join(parts)
used_pages: set[int] = set()
# ── helpers ──────────────────────────────────────────────
for item in catalog:
start = max(0, min(int(item["page_start_index"]), num_pages - 1))
end = max(start, min(int(item["page_end_index"]), num_pages - 1))
chapter_parts = [f"# {item['title']}\n"]
for idx in range(start, end + 1):
if idx < len(page_mds) and page_mds[idx].strip() and idx not in used_pages:
chapter_parts.append(page_mds[idx])
used_pages.add(idx)
if len(chapter_parts) > 1:
parts.append("\n\n".join(chapter_parts))
if parts:
return "\n\n---\n\n".join(parts)
return "\n\n---\n\n".join(m for m in page_mds if m.strip())
@staticmethod
def _rects_overlap(block_rect: fitz.Rect, table_rect: fitz.Rect) -> bool:
inter = block_rect & table_rect
if inter.is_empty:
return False
block_area = block_rect.width * block_rect.height
if block_area <= 0:
return False
return (inter.width * inter.height) / block_area >= 0.3
@staticmethod
def _cells_to_md_table(cells: list) -> str:
if not cells:
return ""
header = cells[0]
ncols = len(header)
if ncols == 0:
return ""
clean = lambda c: str(c or "").replace("|", "\\|").replace("\n", " ").strip()
def clean(value: Any) -> str:
return str(value or "").replace("|", "\\|").replace("\n", " ").strip()
lines = [
"| " + " | ".join(clean(c) for c in header) + " |",
"| " + " | ".join(clean(cell) for cell in header) + " |",
"| " + " | ".join("---" for _ in range(ncols)) + " |",
]
for row in cells[1:]:
padded = list(row) + [""] * max(0, ncols - len(row))
lines.append("| " + " | ".join(clean(c) for c in padded[:ncols]) + " |")
lines.append("| " + " | ".join(clean(cell) for cell in padded[:ncols]) + " |")
return "\n".join(lines)
@staticmethod
@@ -320,16 +232,3 @@ class PdfToMarkdownTool(Tool):
return int(value)
except Exception:
return default
@staticmethod
def _to_bool(value: Any, default: bool) -> bool:
if value is None:
return default
if isinstance(value, bool):
return value
s = str(value).strip().lower()
if s in {"1", "true", "yes", "on"}:
return True
if s in {"0", "false", "no", "off"}:
return False
return default

View File

@@ -1,68 +1,51 @@
identity:
identity:
name: "pdf_to_markdown"
author: "yslg"
label:
en_US: "PDF to Markdown"
zh_Hans: "PDFMarkdown"
zh_Hans: "PDF to Markdown"
pt_BR: "PDF para Markdown"
ja_JP: "PDFからMarkdown"
ja_JP: "PDF to Markdown"
description:
human:
en_US: "Convert PDF to a single Markdown file with embedded base64 images. No LLM needed."
zh_Hans: "将PDF转换为单个Markdown文件图片以base64嵌入无需大模型"
pt_BR: "Converter PDF em um arquivo Markdown com imagens base64 incorporadas. Sem LLM."
ja_JP: "PDFをbase64画像埋め込みの単一Markdownファイルに変換。LLM不要。"
llm: "Convert a PDF file into a single Markdown (.md) file. Extracts text, tables, images (base64), and vector drawings. Auto-detects TOC and organizes by chapters. No LLM needed."
en_US: "Convert PDF to Markdown using a catalog array. Images and graphics are ignored."
zh_Hans: "Convert PDF to Markdown using a catalog array. Images and graphics are ignored."
pt_BR: "Convert PDF to Markdown using a catalog array. Images and graphics are ignored."
ja_JP: "Convert PDF to Markdown using a catalog array. Images and graphics are ignored."
llm: "Convert a PDF file into Markdown using a catalog JSON array. Ignore images and graphics."
parameters:
- name: file
type: file
required: true
label:
en_US: PDF File
zh_Hans: PDF 文件
pt_BR: Arquivo PDF
ja_JP: PDFファイル
zh_Hans: PDF File
pt_BR: PDF File
ja_JP: PDF File
human_description:
en_US: "PDF file to convert"
zh_Hans: "要转换的 PDF 文件"
pt_BR: "Arquivo PDF para converter"
ja_JP: "変換するPDFファイル"
zh_Hans: "PDF file to convert"
pt_BR: "PDF file to convert"
ja_JP: "PDF file to convert"
llm_description: "PDF file to convert to Markdown"
form: llm
fileTypes:
- "pdf"
- name: include_images
type: boolean
required: false
- name: catalog
type: string
required: true
label:
en_US: Include Images
zh_Hans: 包含图片
pt_BR: Incluir Imagens
ja_JP: 画像を含める
en_US: Catalog JSON
zh_Hans: Catalog JSON
pt_BR: Catalog JSON
ja_JP: Catalog JSON
human_description:
en_US: "Whether to embed images as base64 in the Markdown output (default: true)"
zh_Hans: "是否将图片以base64嵌入Markdown输出默认"
pt_BR: "Se deve incorporar imagens como base64 na saída Markdown (padrão: verdadeiro)"
ja_JP: "Markdown出力にbase64として画像を埋め込むかどうかデフォルトはい"
llm_description: "Set to true to embed images as base64, false to skip images"
form: form
default: true
- name: image_dpi
type: number
required: false
label:
en_US: Image DPI
zh_Hans: 图片DPI
pt_BR: DPI da Imagem
ja_JP: 画像DPI
human_description:
en_US: "DPI for rendering vector drawings (72-300, default: 150)"
zh_Hans: "矢量图渲染DPI72-300默认150"
pt_BR: "DPI para renderizar desenhos vetoriais (72-300, padrão: 150)"
ja_JP: "ベクター描画のレンダリングDPI72-300、デフォルト150"
llm_description: "Resolution for rendering vector drawings as images. Range 72-300, default 150."
form: form
default: 150
en_US: "Catalog JSON array like [{title,start,end,page_start_index,page_end_index}]"
zh_Hans: "Catalog JSON array like [{title,start,end,page_start_index,page_end_index}]"
pt_BR: "Catalog JSON array like [{title,start,end,page_start_index,page_end_index}]"
ja_JP: "Catalog JSON array like [{title,start,end,page_start_index,page_end_index}]"
llm_description: "Catalog JSON array returned by pdf_toc"
form: llm
extra:
python:
source: tools/pdf_to_markdown.py

View File

@@ -4,264 +4,303 @@ from collections import OrderedDict
from collections.abc import Generator
from typing import Any
import fitz
from dify_plugin import Tool
from dify_plugin.entities.model.llm import LLMModelConfig
from dify_plugin.entities.model.message import SystemPromptMessage, UserPromptMessage
from dify_plugin.entities.tool import ToolInvokeMessage
# Prompt for the legacy TOC-text parser: asks the LLM to map chapter names to
# printed page ranges and return a bare JSON object (no fences, no prose).
_SYSTEM_PROMPT = """You parse PDF table-of-contents text.
Return only valid JSON object, no markdown fences, no explanation.
Output schema:
{
"Chapter Name": {"start": 1, "end": 5},
"Another": {"start": 6, "end": 20}
}
Rules:
- start/end are integer printed page numbers from TOC.
- If end is unknown, use same value as start.
- Keep chapter names exactly as in TOC text.
"""
# Chinese-language prompt for TOC extraction from raw page text: the LLM must
# return a bare JSON array of {"title", "page"} entries ([] when no TOC found).
# NOTE: runtime string sent to the model — intentionally left untranslated.
_TOC_SYSTEM_PROMPT = """你是专业的PDF目录解析助手。请从以下PDF文本中提取文档的目录/章节结构。
要求:
1. 识别所有一级和二级标题及其对应的页码
2. 只返回纯JSON数组不要markdown代码块不要任何解释
3. 格式: [{"title": "章节标题", "page": 页码数字}]
4. 页码必须是文档中标注的实际页码数字
5. 如果无法识别目录,返回空数组 []"""
class PdfTocTool(Tool):
    """Tool that extracts a structured catalog (table of contents) from a PDF."""

    # Regex patterns used to recognize a table-of-contents page by its heading.
    # NOTE(review): the second and third entries appeared truncated in the diff
    # (the leading 目 character was lost, leaving over-broad patterns that match
    # any 录); restored here to match "目 录" with ASCII whitespace or an
    # ideographic space (U+3000) between the characters — confirm against the
    # pre-diff source.
    _TOC_PATTERNS = [
        r"目录",
        r"目\s*录",
        r"目\u3000录",
        r"Table of Contents",
        r"Contents",
        r"目次",
    ]
def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
toc_start = self._to_int(tool_parameters.get("toc_start"), None)
toc_end = self._to_int(tool_parameters.get("toc_end"), None)
toc_pages = (tool_parameters.get("toc_pages") or "").strip()
file = tool_parameters.get("file")
if not file:
yield self.create_text_message("Error: file is required")
return
model_config = tool_parameters.get("model")
if toc_start is None or toc_end is None:
yield self.create_text_message("Error: toc_start and toc_end are required")
return
doc = fitz.open(stream=file.blob, filetype="pdf")
try:
num_pages = len(doc)
if not toc_pages:
yield self.create_text_message("Error: toc_pages text is empty")
return
# 1) 优先从PDF元数据提取目录
catalog = self._catalog_from_metadata(doc.get_toc(), num_pages)
cleaned = self._strip_index_lists(toc_pages)
# 1) deterministic parser first
catalog = self._parse_toc_lines(cleaned)
# 2) optional LLM fallback/enhance only when deterministic parser gives no result
llm_raw_output = ""
llm_error = None
# 2) 元数据无目录时使用LLM解析
if not catalog and model_config:
llm_catalog, llm_raw_output, llm_error = self._parse_with_llm(
toc_start=toc_start,
toc_end=toc_end,
toc_pages=cleaned,
model_config=model_config,
catalog = self._extract_toc_with_llm(doc, num_pages, model_config)
# 3) 无LLM配置时回退到正则解析
if not catalog:
toc_start, toc_end = self._find_toc_pages(doc, num_pages)
if toc_start is not None and toc_end is not None:
toc_text = "\n".join(
doc[index].get_text() or "" for index in range(toc_start, toc_end + 1)
)
if llm_catalog:
catalog = self._normalize_catalog(llm_catalog)
printed_catalog = self._parse_toc_lines(toc_text)
catalog = self._attach_page_indexes(printed_catalog, toc_end, num_pages)
result: dict[str, Any] = {
"toc_start": toc_start,
"toc_end": toc_end,
"catalog": catalog,
"meta": {
"catalog_size": len(catalog),
"parser": "rule" if catalog else "none",
},
}
if not catalog:
catalog = []
if llm_raw_output:
result["meta"]["llm_used"] = True
if llm_error:
result["meta"]["llm_error"] = llm_error
yield self.create_text_message(json.dumps(catalog, ensure_ascii=False))
finally:
doc.close()
# always return valid json text payload for downstream json.loads
yield self.create_text_message(json.dumps(result, ensure_ascii=False))
yield self.create_json_message(result)
def _extract_toc_with_llm(
self, doc: fitz.Document, num_pages: int, model_config: dict[str, Any]
) -> list[dict[str, int | str]]:
# 先尝试定位目录页
toc_start, toc_end = self._find_toc_pages(doc, num_pages)
def _parse_with_llm(
self,
toc_start: int,
toc_end: int,
toc_pages: str,
model_config: dict[str, Any],
) -> tuple[dict[str, Any] | None, str, str | None]:
user_content = (
f"TOC page index range: {toc_start}..{toc_end}\n\n"
f"TOC raw text:\n{toc_pages}"
if toc_start is not None and toc_end is not None:
# 有目录页,提取目录页文本
toc_text = "\n".join(
doc[index].get_text() or "" for index in range(toc_start, toc_end + 1)
)
content_offset = toc_end
else:
# 无目录页提取前15页文本让LLM识别章节结构
sample = min(num_pages, 15)
toc_text = "\n\n--- 第{}页 ---\n".join(
[""] + [doc[i].get_text() or "" for i in range(sample)]
)
toc_text = toc_text.strip()
if not toc_text:
return []
content_offset = 0
# 截断过长文本
if len(toc_text) > 15000:
toc_text = toc_text[:15000] + "\n...[截断]"
try:
response = self.session.model.llm.invoke(
model_config=LLMModelConfig(**model_config),
prompt_messages=[
SystemPromptMessage(content=_SYSTEM_PROMPT),
UserPromptMessage(content=user_content),
SystemPromptMessage(content=_TOC_SYSTEM_PROMPT),
UserPromptMessage(content=toc_text),
],
stream=False,
)
llm_text = ""
if hasattr(response, "message") and response.message:
content = response.message.content
if isinstance(content, str):
llm_text = content
elif isinstance(content, list):
llm_text = "".join(
item.data if hasattr(item, "data") else str(item) for item in content
)
llm_text = self._get_response_text(response)
if not llm_text:
return []
parsed = self._extract_json_object(llm_text)
if parsed is None:
return None, llm_text, "Failed to parse LLM output as JSON"
if not isinstance(parsed, dict):
return None, llm_text, "LLM output JSON is not an object"
raw_catalog = self._parse_llm_json(llm_text)
if not raw_catalog:
return []
return parsed, llm_text, None
# 转换LLM返回的简单格式为完整catalog
return self._build_catalog_from_llm(raw_catalog, content_offset, num_pages)
except Exception:
return []
def _build_catalog_from_llm(
    self, raw: list[dict], content_offset: int, num_pages: int
) -> list[dict[str, int | str]]:
    """Convert LLM-extracted ``[{"title", "page"}]`` entries into a full catalog.

    ``page`` values are printed page numbers read from the TOC text;
    ``content_offset`` is the 0-based index of the last TOC page (0 when the
    TOC location is unknown).  Each result item carries both the printed
    range (``start``/``end``) and clamped 0-based physical page indexes.
    """
    # Keep only entries with a non-empty title and a parseable page number.
    entries: list[tuple[str, int]] = []
    for item in raw:
        title = str(item.get("title") or "").strip()
        page = self._to_int(item.get("page"), None)
        if not title or page is None:
            continue
        entries.append((title, page))
    if not entries:
        return []
    # Offset maps printed page numbers onto physical page indexes by assuming
    # the first chapter starts on the page right after the TOC pages.
    # NOTE(review): assumes entries are in ascending printed-page order — confirm.
    first_printed_page = entries[0][1]
    offset = (content_offset + 1) - first_printed_page if content_offset > 0 else 0
    result: list[dict[str, int | str]] = []
    for i, (title, page) in enumerate(entries):
        # The next chapter's printed start page bounds this chapter's end.
        next_page = entries[i + 1][1] if i + 1 < len(entries) else page
        page_start_index = max(0, min(page + offset - 1, num_pages - 1))
        page_end_index = max(page_start_index, min(next_page + offset - 2, num_pages - 1))
        if i == len(entries) - 1:
            # The last chapter runs to the end of the document.
            page_end_index = num_pages - 1
        result.append({
            "title": title,
            "start": page,
            "end": max(page, next_page - 1) if i + 1 < len(entries) else page,
            "page_start_index": page_start_index,
            "page_end_index": page_end_index,
        })
    return result
@staticmethod
def _strip_index_lists(text: str) -> str:
# Stop before common appendix lists that pollute TOC parsing.
pattern = re.compile(
def _get_response_text(response: Any) -> str:
if not hasattr(response, "message") or not response.message:
return ""
content = response.message.content
if isinstance(content, str):
text = content
elif isinstance(content, list):
text = "".join(
item.data if hasattr(item, "data") else str(item) for item in content
)
else:
text = str(content)
# 清理思考标签
text = re.sub(r"<think>[\s\S]*?</think>", "", text, flags=re.IGNORECASE)
text = re.sub(r"<\|[^>]+\|>", "", text)
return text.strip()
@staticmethod
def _parse_llm_json(text: str) -> list[dict]:
# 尝试提取JSON代码块
code_match = re.search(r"```(?:json)?\s*([\s\S]*?)```", text)
if code_match:
text = code_match.group(1).strip()
# 尝试找到JSON数组
bracket_match = re.search(r"\[[\s\S]*\]", text)
if bracket_match:
text = bracket_match.group(0)
try:
result = json.loads(text)
if isinstance(result, list):
return result
except Exception:
pass
return []
def _catalog_from_metadata(self, toc: list, num_pages: int) -> list[dict[str, int | str]]:
top = [(title, max(0, page - 1)) for level, title, page in toc if level <= 2 and page >= 1]
if not top:
return []
result: list[dict[str, int | str]] = []
for index, (title, start_index) in enumerate(top):
end_index = top[index + 1][1] - 1 if index + 1 < len(top) else num_pages - 1
result.append({
"title": title,
"start": start_index + 1,
"end": max(start_index, end_index) + 1,
"page_start_index": start_index,
"page_end_index": max(start_index, end_index),
})
return result
def _find_toc_pages(self, doc: fitz.Document, num_pages: int) -> tuple[int | None, int | None]:
    """Locate the contiguous run of TOC pages within the first 30 pages.

    Returns ``(first_index, last_index)``, or ``(None, None)`` when no page
    text matches any pattern in ``_TOC_PATTERNS``.
    """
    first: int | None = None
    last: int | None = None
    scan_limit = min(num_pages, 30)  # the TOC is expected near the front
    for index in range(scan_limit):
        page_text = doc[index].get_text() or ""
        matched = any(
            re.search(pattern, page_text, re.IGNORECASE)
            for pattern in self._TOC_PATTERNS
        )
        if matched:
            if first is None:
                first = index
            last = index
        elif first is not None:
            # The contiguous TOC run has ended; stop scanning.
            break
    return first, last
def _parse_toc_lines(self, text: str) -> list[dict[str, int | str]]:
marker = re.search(
r"^(List\s+of\s+Figures|List\s+of\s+Tables|图目录|表目录)",
text,
re.IGNORECASE | re.MULTILINE,
)
m = pattern.search(text)
return text[: m.start()].rstrip() if m else text
def _parse_toc_lines(self, text: str) -> dict[str, dict[str, int]]:
"""Parse lines like:
1.2 Engine Overview ........ 35
Appendix A 120
"""
line_pattern = re.compile(
r"^\s*(?P<title>.+?)\s*(?:\.{2,}|\s)\s*(?P<page>\d{1,5})\s*$"
)
if marker:
text = text[: marker.start()]
pattern = re.compile(r"^\s*(?P<title>.+?)\s*(?:\.{2,}|\s)\s*(?P<page>\d{1,5})\s*$")
entries: list[tuple[str, int]] = []
for raw in text.splitlines():
line = raw.strip()
if not line or len(line) < 3:
continue
if re.fullmatch(r"\d+", line):
if not line or len(line) < 3 or re.fullmatch(r"\d+", line):
continue
m = line_pattern.match(line)
if not m:
match = pattern.match(line)
if not match:
continue
title = re.sub(r"\s+", " ", m.group("title")).strip("-_: ")
page = self._to_int(m.group("page"), None)
if not title or page is None:
title = re.sub(r"\s+", " ", match.group("title")).strip("-_:")
page = self._to_int(match.group("page"), None)
if not title or page is None or len(title) <= 1:
continue
# Skip obvious noise.
if len(title) <= 1 or title.lower() in {"page", "pages", "目录", "contents"}:
if title.lower() in {"page", "pages", "目录", "contents"}:
continue
entries.append((title, page))
if not entries:
return {}
return []
# Deduplicate keeping earliest appearance.
dedup: OrderedDict[str, int] = OrderedDict()
for title, page in entries:
if title not in dedup:
dedup[title] = page
dedup.setdefault(title, page)
titles = list(dedup.keys())
pages = [dedup[t] for t in titles]
pages = [dedup[title] for title in titles]
result: list[dict[str, int | str]] = []
for index, title in enumerate(titles):
start = pages[index]
end = max(start, pages[index + 1] - 1) if index + 1 < len(pages) else start
result.append({"title": title, "start": start, "end": end})
return result
catalog: dict[str, dict[str, int]] = {}
for i, title in enumerate(titles):
start = pages[i]
if i + 1 < len(pages):
next_start = pages[i + 1]
end = max(start, next_start - 1)
else:
end = start
catalog[title] = {"start": int(start), "end": int(end)}
def _attach_page_indexes(
self, catalog: list[dict[str, int | str]], toc_end: int, num_pages: int
) -> list[dict[str, int | str]]:
if not catalog:
return []
return catalog
first_page = None
for item in catalog:
start = self._to_int(item.get("start"), None)
if start is not None and (first_page is None or start < first_page):
first_page = start
def _normalize_catalog(self, raw: dict[str, Any]) -> dict[str, dict[str, int]]:
catalog: dict[str, dict[str, int]] = {}
source = raw.get("catalog") if isinstance(raw.get("catalog"), dict) else raw
if not isinstance(source, dict):
return catalog
if first_page is None:
return []
for name, value in source.items():
if not isinstance(name, str) or not isinstance(value, dict):
continue
start = self._to_int(value.get("start"), None)
end = self._to_int(value.get("end"), start)
offset = (toc_end + 1) - first_page
result: list[dict[str, int | str]] = []
for item in catalog:
start = self._to_int(item.get("start"), None)
end = self._to_int(item.get("end"), start)
if start is None:
continue
if end is None:
end = start
catalog[name] = {"start": int(start), "end": int(max(start, end))}
return catalog
@staticmethod
def _extract_json_object(text: str) -> Any:
    """Try hard to pull a JSON value out of free-form LLM text.

    Candidate order: fenced code blocks, the first balanced ``{...}`` span,
    then the whole text.  Returns the first candidate that parses, else None.
    """
    if not text:
        return None
    candidates: list[str] = []
    for block in re.findall(r"```(?:json)?\s*([\s\S]*?)\s*```", text, flags=re.IGNORECASE):
        stripped = block.strip()
        if stripped:
            candidates.append(stripped)
    braced = PdfTocTool._extract_first_brace_object(text)
    if braced:
        candidates.append(braced)
    candidates.append(text.strip())
    for candidate in candidates:
        value = PdfTocTool._json_try_parse(candidate)
        if value is not None:
            return value
    return None
@staticmethod
def _extract_first_brace_object(text: str) -> str | None:
start = text.find("{")
if start < 0:
return None
depth = 0
in_str = False
escape = False
for i in range(start, len(text)):
ch = text[i]
if in_str:
if escape:
escape = False
elif ch == "\\":
escape = True
elif ch == '"':
in_str = False
continue
if ch == '"':
in_str = True
elif ch == "{":
depth += 1
elif ch == "}":
depth -= 1
if depth == 0:
return text[start : i + 1]
return None
@staticmethod
def _json_try_parse(text: str) -> Any:
try:
return json.loads(text)
except Exception:
pass
# Minimal repair: remove trailing commas before } or ]
repaired = re.sub(r",\s*([}\]])", r"\1", text)
try:
return json.loads(repaired)
except Exception:
return None
page_start_index = max(0, min(start + offset, num_pages - 1))
page_end_index = max(page_start_index, min(end + offset, num_pages - 1))
result.append({
"title": str(item.get("title") or "Untitled"),
"start": start,
"end": max(start, end),
"page_start_index": page_start_index,
"page_end_index": page_end_index,
})
return result
@staticmethod
def _to_int(value: Any, default: int | None) -> int | None:

View File

@@ -2,63 +2,35 @@ identity:
name: "pdf_toc"
author: "yslg"
label:
en_US: "PDF TOC Parser"
zh_Hans: "PDF目录解析"
pt_BR: "Analisador de Sumário PDF"
ja_JP: "PDF目次解析"
en_US: "PDF TOC"
zh_Hans: "PDF 目录提取"
pt_BR: "PDF TOC"
ja_JP: "PDF TOC"
description:
human:
en_US: "Parse PDF table-of-contents text (from pdf_column_range) into structured JSON catalog via LLM"
zh_Hans: "通过LLM将PDF目录文本来自目录页提取工具的输出解析为结构化JSON目录"
pt_BR: "Analisar texto do sumário PDF em catálogo JSON estruturado via LLM"
ja_JP: "LLMを使用してPDF目次テキストを構造化JSONカタログに解析"
llm: "Parse PDF table-of-contents text into structured JSON with chapter names and page ranges. Input is the output of pdf_column_range tool (start/end/pages)."
en_US: "Extract the catalog array from a PDF file using metadata or LLM."
zh_Hans: "从PDF文件中提取目录数组优先使用元数据回退使用LLM解析。"
pt_BR: "Extrair o array de catálogo de um arquivo PDF."
ja_JP: "PDFファイルからカタログ配列を抽出する。"
llm: "Extract a catalog array from a PDF file. Returns JSON text like [{title,start,end,page_start_index,page_end_index}]."
parameters:
- name: toc_start
type: number
- name: file
type: file
required: true
label:
en_US: TOC Start Page
zh_Hans: 目录起始页
pt_BR: Página Inicial do Sumário
ja_JP: 目次開始ページ
en_US: PDF File
zh_Hans: PDF 文件
pt_BR: PDF File
ja_JP: PDF File
human_description:
en_US: "Start page index of TOC (from pdf_column_range output)"
zh_Hans: "目录起始页码(来自目录页提取工具输出的 start"
pt_BR: "Índice da página inicial do sumário"
ja_JP: "目次の開始ページ番号"
llm_description: "Start page index of TOC section, from pdf_column_range output field 'start'"
form: llm
- name: toc_end
type: number
required: true
label:
en_US: TOC End Page
zh_Hans: 目录结束页
pt_BR: Página Final do Sumário
ja_JP: 目次終了ページ
human_description:
en_US: "End page index of TOC (from pdf_column_range output)"
zh_Hans: "目录结束页码(来自目录页提取工具输出的 end"
pt_BR: "Índice da página final do sumário"
ja_JP: "目次の終了ページ番号"
llm_description: "End page index of TOC section, from pdf_column_range output field 'end'"
form: llm
- name: toc_pages
type: string
required: true
label:
en_US: TOC Page Text
zh_Hans: 目录页文本
pt_BR: Texto das Páginas do Sumário
ja_JP: 目次ページテキスト
human_description:
en_US: "Raw text content of TOC pages (from pdf_column_range output 'pages' array, joined)"
zh_Hans: "目录页原始文本内容(来自目录页提取工具输出的 pages 数组)"
pt_BR: "Conteúdo de texto bruto das páginas do sumário"
ja_JP: "目次ページの生テキスト内容"
llm_description: "Raw text content extracted from TOC pages, from pdf_column_range output field 'pages'"
en_US: "PDF file to inspect"
zh_Hans: "要解析的PDF文件"
pt_BR: "PDF file to inspect"
ja_JP: "PDF file to inspect"
llm_description: "PDF file to extract catalog from"
form: llm
fileTypes:
- "pdf"
- name: model
type: model-selector
scope: llm
@@ -69,10 +41,10 @@ parameters:
pt_BR: Modelo LLM
ja_JP: LLMモデル
human_description:
en_US: "LLM model for parsing TOC into structured JSON"
zh_Hans: "用于解析目录的 LLM 模型"
pt_BR: "Modelo LLM para análise do sumário"
ja_JP: "目次解析用LLMモデル"
en_US: "LLM model used for parsing TOC when metadata is unavailable"
zh_Hans: "当元数据不可用时,用于解析目录的LLM模型"
pt_BR: "Modelo LLM para análise de TOC"
ja_JP: "メタデータが利用できない場合のTOC解析用LLMモデル"
form: form
extra:
python:

File diff suppressed because it is too large Load Diff