更新

2026-03-06 14:50:43 +08:00
parent 843146cdd7
commit 91ff28bdcf
18 changed files with 1316 additions and 100 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,5 @@
 .trae
 **/*.difypkg
 urbanLifeServ/*
 */.data
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -1,27 +0,0 @@
 {
  "version": "0.2.0",
  "configurations": [
    {
      "name": "Python: FastAPI Server",
      "type": "python",
      "request": "launch",
      "program": "${workspaceFolder}/difyPlugin/main.py",
      "console": "integratedTerminal",
      "justMyCode": true,
      "env": {
        "PYTHONUNBUFFERED": "1"
      },
      "cwd": "${workspaceFolder}/difyPlugin",
      "args": []
    },
    {
      "name": "Python: Debug Plugin",
      "type": "python",
      "request": "launch",
      "program": "${workspaceFolder}/difyPlugin/app/plugins/pdf/__init__.py",
      "console": "integratedTerminal",
      "justMyCode": true,
      "cwd": "${workspaceFolder}/difyPlugin"
    }
  ]
 }
--- a/2
+++ b/2
--- a/difyPlugin/pdf/manifest.yaml
+++ b/difyPlugin/pdf/manifest.yaml
@@ -19,6 +19,9 @@ resource:
  permission:
    tool:
      enabled: true
    model:
      enabled: true
      llm: true
 plugins:
  tools:
    - provider/pdf.yaml
--- a/difyPlugin/pdf/provider/pdf.yaml
+++ b/difyPlugin/pdf/provider/pdf.yaml
@@ -56,8 +56,12 @@ identity:
 #         en_US: "Access Token"
 tools:
-  - tools/pdf.yaml
+  - tools/pdf_column_range.yaml
  - tools/pdf_single_page.yaml
  - tools/pdf_summary.yaml
  - tools/pdf_toc.yaml
  - tools/pdf_extract_range.yaml
  - tools/pdf_to_markdown.yaml
 extra:
  python:
    source: provider/pdf.py
--- a/difyPlugin/pdf/requirements.txt
+++ b/difyPlugin/pdf/requirements.txt
@@ -1,2 +1,2 @@
 dify_plugin>=0.4.0,<0.7.0
-PyPDF2>=3.0.1
+pymupdf>=1.27.1
--- a/difyPlugin/pdf/tools/pdf.py
+++ b/difyPlugin/pdf/tools/pdf.py
@@ -1,61 +0,0 @@
 import re
 from collections.abc import Generator
 from io import BytesIO
 from typing import Any
 import PyPDF2
 from dify_plugin import Tool
 from dify_plugin.entities.tool import ToolInvokeMessage
 class PdfTool(Tool):
    """Return the text of a PDF's table-of-contents pages (PyPDF2 backend)."""

    def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
        """Scan pages in order for a TOC heading and emit the matching page run.

        Yields a text error when ``file`` is missing. Otherwise yields one JSON
        message with ``start``/``end`` (0-based page indices, ``None`` when no
        heading was found) and ``pages`` (extracted text of each TOC page).
        """
        pdf_file = tool_parameters.get("file")
        if not pdf_file:
            yield self.create_text_message("Error: file is required")
            return

        # ``blob`` holds the raw bytes of the uploaded file.
        reader = PyPDF2.PdfReader(BytesIO(pdf_file.blob))
        page_count = len(reader.pages)
        headings = [
            r'目录',
            r'Table of Contents',
            r'Contents',
            r'目次'
        ]

        def page_text(index: int) -> str:
            return reader.pages[index].extract_text() or ""

        first = None
        last = None
        for index in range(page_count):
            content = page_text(index)
            if any(re.search(heading, content, re.IGNORECASE) for heading in headings):
                if first is None:
                    first = index
                last = index
                continue
            if first is not None:
                # The first non-matching page after a hit ends the TOC run.
                break

        if first is None:
            payload = {"start": None, "end": None, "pages": []}
        else:
            payload = {
                "start": first,
                "end": last,
                "pages": [page_text(i) for i in range(first, last + 1)],
            }
        yield self.create_json_message(payload)
--- a/difyPlugin/pdf/tools/pdf_column_range.py
+++ b/difyPlugin/pdf/tools/pdf_column_range.py
@@ -0,0 +1,107 @@
 import json
 import re
 from collections.abc import Generator
 from io import BytesIO
 from typing import Any
 import fitz  # PyMuPDF 核心库
 from dify_plugin import Tool
 from dify_plugin.entities.tool import ToolInvokeMessage
 class PdfTool(Tool):
    """Locate the printed table-of-contents pages of a PDF and return their text."""

    def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
        """Scan pages in order for a TOC heading and emit the matching page run.

        Parameters read from ``tool_parameters``:
            file: uploaded PDF; its ``blob`` attribute supplies the raw bytes.

        Yields:
            A text error when ``file`` is missing. Otherwise the same result
            twice: as a JSON string in a text message and as a JSON message.
            The result holds ``start``/``end`` (0-based page indices, ``None``
            when no heading was found), ``pages`` (text of each TOC page) and
            ``pages_text`` (those texts joined with newlines).
        """
        file = tool_parameters.get("file")
        if not file:
            yield self.create_text_message("Error: file is required")
            return

        # Headings that mark a page as part of the table of contents.
        toc_patterns = [
            r'目录',
            r'目　录',
            r'目\u3000录',
            r'Table of Contents',
            r'Contents',
            r'目次'
        ]
        # One compiled alternation matches exactly when any single pattern does.
        toc_heading = re.compile("|".join(toc_patterns), re.IGNORECASE)

        toc_start = None
        toc_end = None
        toc_pages: list[str] = []

        # Load the PDF from the in-memory byte stream.
        doc = fitz.open(stream=file.blob, filetype="pdf")
        try:
            for page_num in range(len(doc)):
                text = doc[page_num].get_text() or ""
                if toc_heading.search(text):
                    if toc_start is None:
                        toc_start = page_num
                    toc_end = page_num
                    # Keep the text now so TOC pages are not extracted twice.
                    toc_pages.append(text)
                elif toc_start is not None:
                    # The first non-matching page after a hit ends the TOC run.
                    break
        finally:
            # Release the document even if text extraction fails.
            doc.close()

        result = {
            "start": toc_start,
            "end": toc_end,
            "pages": toc_pages,
            "pages_text": "\n".join(toc_pages),
        }
        yield self.create_text_message(json.dumps(result, ensure_ascii=False))
        yield self.create_json_message(result)
 if __name__ == "__main__":
    # Manual smoke test: print the detected TOC page range of a local PDF.
    # Usage: python pdf_column_range.py [path/to/file.pdf]
    import sys

    default_path = r"F:\Project\urbanLifeline\docs\AI训练资料\菱重S12R发动机说明书.pdf"
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else default_path

    toc_patterns = [
        r'目录',
        r'目　录',
        r'目\u3000录',
        r'Table of Contents',
        r'Contents',
        r'目次'
    ]
    toc_heading = re.compile("|".join(toc_patterns), re.IGNORECASE)

    doc = fitz.open(pdf_path)  # local files are opened directly by path
    try:
        num_pages = len(doc)
        toc_start = None
        toc_end = None
        # Walk the pages until the run of TOC pages ends.
        for page_num in range(num_pages):
            text = doc[page_num].get_text() or ""
            if toc_heading.search(text):
                if toc_start is None:
                    toc_start = page_num
                toc_end = page_num
            elif toc_start is not None:
                break

        if toc_start is None:
            # Nothing detected: fall back to the fixed 9-page window 18..26.
            toc_start = 18
            stop = toc_start + 9
        else:
            # Include the last detected TOC page, as the tool itself does.
            stop = toc_end + 1
        # Never index past the end of a short document.
        stop = min(stop, num_pages)
        toc_end = stop - 1

        toc_pages = [doc[page_num].get_text() or "" for page_num in range(toc_start, stop)]
        print(toc_start, toc_end, toc_pages)
    finally:
        doc.close()
--- a/difyPlugin/pdf/tools/pdf_column_range.yaml
+++ b/difyPlugin/pdf/tools/pdf_column_range.yaml
@@ -33,4 +33,4 @@ parameters:
      - "pdf"
 extra:
  python:
-    source: tools/pdf.py
+    source: tools/pdf_column_range.py
--- a/difyPlugin/pdf/tools/pdf_extract_range.py
+++ b/difyPlugin/pdf/tools/pdf_extract_range.py
@@ -0,0 +1,48 @@
 import json
 from collections.abc import Generator
 from typing import Any
 import fitz  # PyMuPDF
 from dify_plugin import Tool
 from dify_plugin.entities.tool import ToolInvokeMessage
 class PdfExtractRangeTool(Tool):
    """Extract plain text from an inclusive, 0-based page range of a PDF."""

    def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
        """Emit the concatenated text of pages ``start_page``..``end_page``.

        Parameters read from ``tool_parameters``:
            file: uploaded PDF; its ``blob`` attribute supplies the raw bytes.
            start_page: 0-based first page (default 0), clamped to the document.
            end_page: 0-based last page, inclusive (default 0), clamped to
                ``start_page``..last page.

        Yields:
            A text error when the file is missing, a page number is not an
            integer, or the document has no pages. Otherwise the same result
            twice: as a JSON string in a text message and as a JSON message,
            with ``start``, ``end``, ``total_pages`` and ``text``.
        """
        file = tool_parameters.get("file")
        if not file:
            yield self.create_text_message("Error: file is required")
            return

        try:
            # ``or 0`` covers parameters that are present but None or empty.
            start_page = int(tool_parameters.get("start_page") or 0)
            end_page = int(tool_parameters.get("end_page") or 0)
        except (TypeError, ValueError):
            yield self.create_text_message("Error: start_page and end_page must be integers")
            return

        doc = fitz.open(stream=file.blob, filetype="pdf")
        try:
            num_pages = len(doc)
            if num_pages == 0:
                # Clamping against -1 would otherwise index page 0 of nothing.
                yield self.create_text_message("Error: PDF has no pages")
                return

            # Clamp the requested range to the pages that exist.
            start_page = max(0, min(start_page, num_pages - 1))
            end_page = max(start_page, min(end_page, num_pages - 1))

            page_texts = [
                doc[page_idx].get_text("text", sort=True) or ""
                for page_idx in range(start_page, end_page + 1)
            ]
        finally:
            # Release the document even if extraction fails.
            doc.close()

        # Join the pages with a visible page-break marker.
        full_text = "\n\n--- 分页 ---\n\n".join(page_texts)
        result = {
            "start": start_page,
            "end": end_page,
            "total_pages": end_page - start_page + 1,
            "text": full_text,
        }
        yield self.create_text_message(json.dumps(result, ensure_ascii=False))
        yield self.create_json_message(result)
--- a/difyPlugin/pdf/tools/pdf_extract_range.yaml
+++ b/difyPlugin/pdf/tools/pdf_extract_range.yaml
@@ -0,0 +1,68 @@
 # Tool manifest for pdf_extract_range; the implementation is tools/pdf_extract_range.py.
 identity:
  name: "pdf_extract_range"
  author: "yslg"
  label:
    en_US: "Extract Page Range Text"
    zh_Hans: "提取页面范围文本"
    pt_BR: "Extrair Texto do Intervalo de Páginas"
    ja_JP: "ページ範囲テキスト抽出"
 description:
  human:
    en_US: "Extract plain text from a specified page range of a PDF file"
    zh_Hans: "从PDF文件的指定页码范围提取纯文本"
    pt_BR: "Extrair texto simples de um intervalo de páginas especificado de um arquivo PDF"
    ja_JP: "PDFファイルの指定ページ範囲からプレーンテキストを抽出"
  llm: "Extract plain text from PDF pages in the given start-end range. Returns concatenated text of all pages in range."
 parameters:
  # The uploaded PDF; the tool reads its raw bytes.
  - name: file
    type: file
    required: true
    label:
      en_US: PDF File
      zh_Hans: PDF 文件
      pt_BR: Arquivo PDF
      ja_JP: PDFファイル
    human_description:
      en_US: "PDF file to extract text from"
      zh_Hans: "要提取文本的 PDF 文件"
      pt_BR: "Arquivo PDF para extrair texto"
      ja_JP: "テキストを抽出するPDFファイル"
    llm_description: "PDF file to extract page range text from"
    form: llm
    fileTypes:
      - "pdf"
  # First page of the range, 0-based; the tool clamps it to the document.
  - name: start_page
    type: number
    required: true
    label:
      en_US: Start Page
      zh_Hans: 起始页码
      pt_BR: Página Inicial
      ja_JP: 開始ページ
    human_description:
      en_US: "Start page index (0-based)"
      zh_Hans: "起始页码（从0开始）"
      pt_BR: "Índice da página inicial (base 0)"
      ja_JP: "開始ページ番号（0始まり）"
    llm_description: "Start page index (0-based)"
    form: llm
    default: 0
  # Last page of the range, 0-based and inclusive; the tool raises it to
  # start_page when smaller, so the defaults (0, 0) extract only the first page.
  - name: end_page
    type: number
    required: true
    label:
      en_US: End Page
      zh_Hans: 结束页码
      pt_BR: Página Final
      ja_JP: 終了ページ
    human_description:
      en_US: "End page index (0-based, inclusive)"
      zh_Hans: "结束页码（从0开始，包含该页）"
      pt_BR: "Índice da página final (base 0, inclusivo)"
      ja_JP: "終了ページ番号（0始まり、含む）"
    llm_description: "End page index (0-based, inclusive)"
    form: llm
    default: 0
 # Python entry point implementing this tool.
 extra:
  python:
    source: tools/pdf_extract_range.py
--- a/difyPlugin/pdf/tools/pdf_single_page.py
+++ b/difyPlugin/pdf/tools/pdf_single_page.py
@@ -1,8 +1,9 @@
 import json
 from collections.abc import Generator
 from io import BytesIO
 from typing import Any
-import PyPDF2
+import fitz  # PyMuPDF 核心库
 from dify_plugin import Tool
 from dify_plugin.entities.tool import ToolInvokeMessage
@@ -16,21 +17,29 @@ class PdfSinglePageTool(Tool):
            yield self.create_text_message("Error: file is required")
            return
        # 从字节流加载 PDF（替换 PyPDF2 的 PdfReader）
        pdf_bytes = file.blob
-        reader = PyPDF2.PdfReader(BytesIO(pdf_bytes))
+        doc = fitz.open(stream=pdf_bytes, filetype="pdf")  # 字节流方式打开
-        num_pages = len(reader.pages)
+        num_pages = len(doc)
        # 页码边界处理（逻辑与原代码一致）
        page_index = int(page)
        if page_index < 0:
            page_index = 0
        if page_index >= num_pages:
            page_index = num_pages - 1
-        selected_page = reader.pages[page_index]
+        # 提取指定页面文本（PyMuPDF 方式）
-        text = selected_page.extract_text() or ""
+        selected_page = doc[page_index]
        text = selected_page.get_text() or ""  # get_text() 提取文本，比 PyPDF2 更精准
-        yield self.create_json_message({
+        # 关闭文档释放资源
        doc.close()
        result = {
            "start": page_index,
            "end": page_index,
            "pages": [text]
-        })
+        }
        yield self.create_text_message(json.dumps(result, ensure_ascii=False))
        yield self.create_json_message(result)
--- a/difyPlugin/pdf/tools/pdf_summary.py
+++ b/difyPlugin/pdf/tools/pdf_summary.py
@@ -0,0 +1,209 @@
 import json
 import re
 from collections.abc import Generator
 from typing import Any
 import fitz
 from dify_plugin import Tool
 from dify_plugin.entities.model.llm import LLMModelConfig
 from dify_plugin.entities.model.message import SystemPromptMessage, UserPromptMessage
 from dify_plugin.entities.tool import ToolInvokeMessage
 class PdfSummaryTool(Tool):
    """Fast PDF page summary tool.
    Default behavior is optimized for throughput in large workflows:
    - Extract plain text and lightweight table data only.
    - Skip expensive image base64 and drawing path extraction.
    - Skip LLM by default unless `use_llm=true` is explicitly passed.
    """
    def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
        """Extract text/tables from a page range and emit JSON plus a text summary.

        Parameters read from ``tool_parameters``:
            file: uploaded PDF; its ``blob`` attribute supplies the raw bytes.
            pdf_start_page / pdf_end_page: 0-based inclusive range, clamped to
                the document (both default to 0).
            model: model configuration forwarded to the LLM when enabled.
            use_llm: truthy values enable LLM refinement (default off).
            max_chars_per_page: per-page text cap, clamped to 800..20000
                (default 6000).
            llm_prompt: system prompt for the optional LLM refinement.

        Yields:
            A JSON message with the per-page extraction, then a text message
            with the LLM-refined summary or, failing that, the locally built
            Markdown. Only an error text message is yielded when the file is
            missing or the document has no pages.
        """
        file = tool_parameters.get("file")
        if not file:
            yield self.create_text_message("Error: file is required")
            return

        start_page = self._to_int(tool_parameters.get("pdf_start_page"), 0)
        end_page = self._to_int(tool_parameters.get("pdf_end_page"), 0)
        model_config = tool_parameters.get("model")
        use_llm = self._to_bool(tool_parameters.get("use_llm"), False)
        max_chars_per_page = self._to_int(tool_parameters.get("max_chars_per_page"), 6000)
        max_chars_per_page = max(800, min(max_chars_per_page, 20000))
        # ``dict.get`` only applies its default when the key is absent; an
        # explicit None or empty prompt must also fall back to the default.
        llm_prompt = tool_parameters.get("llm_prompt") or (
            "请基于输入的PDF页面文本做简洁准确摘要，输出中文要点。不要输出思考过程。"
        )

        doc = fitz.open(stream=file.blob, filetype="pdf")
        try:
            num_pages = len(doc)
            if num_pages == 0:
                # Clamping against -1 would otherwise index page 0 of nothing.
                yield self.create_text_message("Error: PDF has no pages")
                return

            start_page = max(0, min(start_page, num_pages - 1))
            end_page = max(start_page, min(end_page, num_pages - 1))

            pages_data: list[dict[str, Any]] = [
                self._extract_page_fast(doc[page_idx], page_idx, max_chars_per_page)
                for page_idx in range(start_page, end_page + 1)
            ]

            result = {
                "total_pages_extracted": len(pages_data),
                "page_range": {"start": start_page, "end": end_page},
                "pages": pages_data,
            }
            yield self.create_json_message(result)

            # Fast local summary first (deterministic, no model latency).
            local_text = self._build_local_summary(pages_data)

            # Optional LLM refinement, only when explicitly enabled.
            final_text = local_text
            if use_llm and model_config:
                refined = self._summarize_with_llm(local_text, llm_prompt, model_config)
                if refined:
                    final_text = refined

            if final_text:
                yield self.create_text_message(final_text)
        finally:
            doc.close()
    def _extract_page_fast(self, page: fitz.Page, page_idx: int, max_chars_per_page: int) -> dict[str, Any]:
        """Collect the lightweight content of one page.

        Returns a dict with the stripped page text (cut to
        ``max_chars_per_page`` characters plus a truncation marker), up to
        three table summaries holding at most ten rows each, the page size,
        and empty ``images`` / ``drawings_summary`` / ``text_blocks`` lists.
        Any failure during table detection is ignored.
        """
        page_text = (page.get_text("text") or "").strip()
        if len(page_text) > max_chars_per_page:
            page_text = f"{page_text[:max_chars_per_page]}\n...[truncated]"

        table_summaries: list[dict[str, Any]] = []
        try:
            detected = page.find_tables()
            for position, table in enumerate(detected.tables[:3]):
                rows = table.extract() or []
                summary = {
                    "index": position,
                    "rows": table.row_count,
                    "cols": table.col_count,
                    "cells": rows[:10],
                }
                table_summaries.append(summary)
        except Exception:
            # Best effort: keep whatever tables were collected before the failure.
            pass

        page_rect = page.rect
        return {
            "page_number": page_idx,
            "text": page_text,
            "tables": table_summaries,
            "images": [],
            "drawings_summary": [],
            "text_blocks": [],
            "width": float(page_rect.width),
            "height": float(page_rect.height),
        }
    def _build_local_summary(self, pages_data: list[dict[str, Any]]) -> str:
        """Output actual page content as Markdown (text + tables).
        No LLM needed downstream — the text is already usable Markdown.
        """
        parts: list[str] = []
        for page in pages_data:
            text = (page.get("text") or "").strip()
            tables = page.get("tables") or []
            page_parts: list[str] = []
            if text:
                page_parts.append(text)
            for tab in tables:
                cells = tab.get("cells") or []
                if len(cells) >= 2:
                    md = self._cells_to_md_table(cells)
                    if md:
                        page_parts.append(md)
            if page_parts:
                parts.append("\n\n".join(page_parts))
        return "\n\n--- 分页 ---\n\n".join(parts)
    @staticmethod
    def _cells_to_md_table(cells: list) -> str:
        if not cells:
            return ""
        header = cells[0]
        ncols = len(header)
        if ncols == 0:
            return ""
        clean = lambda c: str(c or "").replace("|", "\\|").replace("\n", " ").strip()
        lines = [
            "| " + " | ".join(clean(c) for c in header) + " |",
            "| " + " | ".join("---" for _ in range(ncols)) + " |",
        ]
        for row in cells[1:]:
            padded = list(row) + [""] * max(0, ncols - len(row))
            lines.append("| " + " | ".join(clean(c) for c in padded[:ncols]) + " |")
        return "\n".join(lines)
    def _summarize_with_llm(self, local_text: str, llm_prompt: str, model_config: dict[str, Any]) -> str:
        """Ask the configured LLM to summarize ``local_text`` and return its visible answer.

        ``llm_prompt`` is sent as the system message and ``local_text`` as the
        user message in a single non-streaming call. String content is used
        as-is; list content is concatenated from each item's ``data``
        attribute (or ``str(item)`` when absent). Reasoning and special-token
        markup is stripped via ``_extract_visible_answer``.
        """
        llm_config = LLMModelConfig(**model_config)
        conversation = [
            SystemPromptMessage(content=llm_prompt),
            UserPromptMessage(content=local_text),
        ]
        response = self.session.model.llm.invoke(
            model_config=llm_config,
            prompt_messages=conversation,
            stream=False,
        )

        message = getattr(response, "message", None)
        payload = message.content if message else None

        if isinstance(payload, str):
            raw_answer = payload
        elif isinstance(payload, list):
            pieces = [part.data if hasattr(part, "data") else str(part) for part in payload]
            raw_answer = "".join(pieces)
        else:
            raw_answer = ""
        return self._extract_visible_answer(raw_answer)
    @staticmethod
    def _extract_visible_answer(text: str) -> str:
        if not text:
            return ""
        box_match = re.search(r"<\|begin_of_box\|>([\s\S]*?)<\|end_of_box\|>", text)
        if box_match:
            text = box_match.group(1)
        else:
            text = re.sub(r"<think>[\s\S]*?</think>", "", text, flags=re.IGNORECASE)
        text = re.sub(r"<\|[^>]+\|>", "", text)
        return text.strip()
    @staticmethod
    def _to_int(value: Any, default: int) -> int:
        try:
            if value is None or value == "":
                return default
            return int(value)
        except Exception:
            return default
    @staticmethod
    def _to_bool(value: Any, default: bool) -> bool:
        if value is None:
            return default
        if isinstance(value, bool):
            return value
        s = str(value).strip().lower()
        if s in {"1", "true", "yes", "on"}:
            return True
        if s in {"0", "false", "no", "off"}:
            return False
        return default
--- a/difyPlugin/pdf/tools/pdf_summary.yaml
+++ b/difyPlugin/pdf/tools/pdf_summary.yaml
@@ -0,0 +1,99 @@
 # Tool manifest for pdf_summary; the implementation is tools/pdf_summary.py.
 # The tool extracts page text and tables only; the LLM step is opt-in via use_llm.
 identity:
  name: "pdf_summary"
  author: "yslg"
  label:
    en_US: "PDF Page Summary"
    zh_Hans: "PDF页面概述"
    pt_BR: "Resumo de Página PDF"
    ja_JP: "PDFページ概要"
 description:
  human:
    en_US: "Extract text and tables from a PDF page range as Markdown, optionally summarized by an LLM"
    zh_Hans: "提取PDF指定页码范围的文本和表格并输出为Markdown，可选择通过LLM进行概述"
    pt_BR: "Extrair texto e tabelas de um intervalo de páginas PDF como Markdown, opcionalmente resumidos por um LLM"
    ja_JP: "PDFの指定ページ範囲からテキストと表を抽出してMarkdownとして返し、必要に応じてLLMで要約"
  llm: "Extract plain text and tables from the specified PDF page range and return them as Markdown. Set use_llm to true to have the selected LLM summarize the extracted content."
 parameters:
  - name: file
    type: file
    required: true
    label:
      en_US: PDF File
      zh_Hans: PDF 文件
      pt_BR: Arquivo PDF
      ja_JP: PDFファイル
    human_description:
      en_US: "PDF file to process"
      zh_Hans: "要处理的 PDF 文件"
      pt_BR: "Arquivo PDF para processar"
      ja_JP: "処理するPDFファイル"
    llm_description: "PDF file to extract text and tables from"
    form: llm
    fileTypes:
      - "pdf"
  - name: pdf_start_page
    type: number
    required: true
    label:
      en_US: Start Page
      zh_Hans: 起始页码
      pt_BR: Página Inicial
      ja_JP: 開始ページ
    human_description:
      en_US: "Start page index (0-based)"
      zh_Hans: "起始页码（从0开始）"
      pt_BR: "Índice da página inicial (base 0)"
      ja_JP: "開始ページ番号（0始まり）"
    llm_description: "Start page index (0-based) for extraction"
    form: llm
    default: 0
  - name: pdf_end_page
    type: number
    required: true
    label:
      en_US: End Page
      zh_Hans: 结束页码
      pt_BR: Página Final
      ja_JP: 終了ページ
    human_description:
      en_US: "End page index (0-based, inclusive)"
      zh_Hans: "结束页码（从0开始，包含该页）"
      pt_BR: "Índice da página final (base 0, inclusivo)"
      ja_JP: "終了ページ番号（0始まり、含む）"
    llm_description: "End page index (0-based, inclusive) for extraction"
    form: llm
    default: 0
  # The tool only calls the LLM when this flag is true AND a model is selected.
  - name: use_llm
    type: boolean
    required: false
    label:
      en_US: Use LLM
      zh_Hans: 使用LLM概述
      pt_BR: Usar LLM
      ja_JP: LLMを使用
    human_description:
      en_US: "Whether to refine the extracted content with the selected LLM (default: false)"
      zh_Hans: "是否使用所选LLM对提取内容进行概述（默认：否）"
      pt_BR: "Se deve refinar o conteúdo extraído com o LLM selecionado (padrão: falso)"
      ja_JP: "抽出内容を選択したLLMで要約するかどうか（デフォルト：いいえ）"
    llm_description: "Set to true to summarize the extracted content with the selected LLM; false returns the extracted Markdown as-is"
    form: form
    default: false
  # Not required: without use_llm the tool never touches the model.
  - name: model
    type: model-selector
    scope: llm
    required: false
    label:
      en_US: LLM Model
      zh_Hans: LLM 模型
      pt_BR: Modelo LLM
      ja_JP: LLMモデル
    human_description:
      en_US: "LLM model used for summarizing extracted content (only used when Use LLM is enabled)"
      zh_Hans: "用于概述提取内容的 LLM 模型（仅在启用LLM概述时使用）"
      pt_BR: "Modelo LLM usado para resumir o conteúdo extraído (usado apenas quando Usar LLM está ativado)"
      ja_JP: "抽出内容の要約に使用するLLMモデル（LLMを使用が有効な場合のみ）"
    form: form
  - name: llm_prompt
    type: string
    required: false
    label:
      en_US: LLM Prompt
      zh_Hans: LLM 提示词
      pt_BR: Prompt do LLM
      ja_JP: LLMプロンプト
    human_description:
      en_US: "System prompt for LLM summarization"
      zh_Hans: "LLM 概述的系统提示词"
      pt_BR: "Prompt do sistema para resumo LLM"
      ja_JP: "LLM要約用のシステムプロンプト"
    llm_description: "System prompt guiding LLM on how to summarize the extracted PDF content"
    form: form
    # Same text as the fallback prompt in tools/pdf_summary.py.
    default: "请基于输入的PDF页面文本做简洁准确摘要，输出中文要点。不要输出思考过程。"
  # The tool clamps this value to the range 800..20000.
  - name: max_chars_per_page
    type: number
    required: false
    label:
      en_US: Max Characters Per Page
      zh_Hans: 每页最大字符数
      pt_BR: Máximo de Caracteres por Página
      ja_JP: ページあたりの最大文字数
    human_description:
      en_US: "Maximum number of text characters kept per page (800-20000, default: 6000)"
      zh_Hans: "每页保留的最大文本字符数（800-20000，默认6000）"
      pt_BR: "Número máximo de caracteres de texto mantidos por página (800-20000, padrão: 6000)"
      ja_JP: "ページごとに保持するテキストの最大文字数（800〜20000、デフォルト6000）"
    llm_description: "Maximum number of text characters kept per page; longer page text is truncated"
    form: form
    default: 6000
 extra:
  python:
    source: tools/pdf_summary.py
--- a/difyPlugin/pdf/tools/pdf_to_markdown.py
+++ b/difyPlugin/pdf/tools/pdf_to_markdown.py
@@ -0,0 +1,335 @@
 import base64
 import re
 from collections import OrderedDict
 from collections.abc import Generator
 from typing import Any
 import fitz
 from dify_plugin import Tool
 from dify_plugin.entities.tool import ToolInvokeMessage
 class PdfToMarkdownTool(Tool):
    """Convert PDF to a single Markdown file. No LLM needed.
    - Auto-detect TOC and organize content by chapters.
    - Extract text and tables as Markdown.
    - Embed raster images as base64.
    - Render vector drawings as base64 PNG.
    - Output one .md file via create_blob_message.
    """
    _TOC_PATTERNS = [
        r"目录", r"目　录", r"目\u3000录",
        r"Table of Contents", r"Contents", r"目次",
    ]
    # ── entry point ──────────────────────────────────────────
    def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
        """Convert the uploaded PDF to Markdown and emit it as text and as a blob.

        Parameters read from ``tool_parameters``:
            file: uploaded PDF; its ``blob`` attribute supplies the raw bytes.
            include_images: embed images and rendered drawings (default on).
            image_dpi: DPI for rendered drawings, clamped to 72..300
                (default 150).

        Yields:
            A text error when ``file`` is missing. Otherwise a text message
            with the full Markdown, then a blob message carrying the same
            Markdown as UTF-8 with MIME type ``text/markdown``.
        """
        pdf_file = tool_parameters.get("file")
        if not pdf_file:
            yield self.create_text_message("Error: file is required")
            return

        embed_images = self._to_bool(tool_parameters.get("include_images"), True)
        requested_dpi = self._to_int(tool_parameters.get("image_dpi"), 150)
        dpi = max(72, min(requested_dpi, 300))
        image_size_cap = 2 * 1024 * 1024  # raw images above 2 MB are skipped

        doc = fitz.open(stream=pdf_file.blob, filetype="pdf")
        try:
            page_count = len(doc)

            # Step 1: chapter map (embedded TOC, then printed TOC, else none).
            chapters, content_offset = self._build_chapter_map(doc, page_count)

            # Step 2: render every page to Markdown.
            rendered_pages: list[str] = [
                self._page_to_markdown(
                    doc, doc[page_no], page_no,
                    embed_images, dpi, image_size_cap,
                )
                for page_no in range(page_count)
            ]

            # Step 3: group by chapter when possible, otherwise page by page.
            if chapters:
                markdown = self._assemble_by_chapters(
                    chapters, rendered_pages, content_offset, page_count,
                )
            else:
                non_empty = [chunk for chunk in rendered_pages if chunk.strip()]
                markdown = "\n\n---\n\n".join(non_empty)

            # Step 4: text for variable aggregation, blob for the .md file.
            yield self.create_text_message(markdown)
            yield self.create_blob_message(
                blob=markdown.encode("utf-8"),
                meta={"mime_type": "text/markdown"},
            )
        finally:
            doc.close()
    # ── chapter detection ────────────────────────────────────
    def _build_chapter_map(
        self, doc: fitz.Document, num_pages: int,
    ) -> tuple[dict, int]:
        """Return ``(chapters, content_offset)`` for the document.

        ``chapters`` maps a title to ``{"start": ..., "end": ...}``. The
        embedded PDF outline is tried first; its ranges are already 0-based
        page indices, so the offset is 0. Otherwise the printed TOC pages are
        scanned; their ranges use printed page numbers and ``content_offset``
        is the amount to add to turn them into page indices. Returns
        ``({}, 0)`` when no chapters can be determined.
        """
        toc = doc.get_toc()
        if toc:
            chapters = self._chapters_from_metadata(toc, num_pages)
            if chapters:
                return chapters, 0

        toc_start, toc_end = self._find_toc_pages(doc, num_pages)
        if toc_start is None or toc_end is None:
            return {}, 0

        toc_text = "\n".join(
            doc[i].get_text() or "" for i in range(toc_start, toc_end + 1)
        )
        chapters = self._parse_toc_lines(toc_text)
        if not chapters:
            return {}, 0

        offset = self._guess_offset(chapters, toc_end)
        # A printed TOC cannot tell where its last entry ends, so the parser
        # leaves it one page long. Stretch it to the final page, as the
        # metadata path does, so the tail of the document is not dropped.
        last_chapter = list(chapters.values())[-1]
        last_chapter["end"] = max(last_chapter["end"], (num_pages - 1) - offset)
        return chapters, offset
    def _chapters_from_metadata(
        self, toc: list, num_pages: int,
    ) -> dict[str, dict[str, int]]:
        top = [(t, max(0, p - 1)) for lvl, t, p in toc if lvl <= 2 and p >= 1]
        if not top:
            return {}
        chapters: dict[str, dict[str, int]] = OrderedDict()
        for i, (title, start) in enumerate(top):
            end = top[i + 1][1] - 1 if i + 1 < len(top) else num_pages - 1
            chapters[title] = {"start": start, "end": max(start, end)}
        return chapters
    def _find_toc_pages(self, doc, num_pages):
        toc_start = toc_end = None
        for pn in range(min(num_pages, 30)):
            text = doc[pn].get_text() or ""
            if any(re.search(p, text, re.IGNORECASE) for p in self._TOC_PATTERNS):
                if toc_start is None:
                    toc_start = pn
                toc_end = pn
            elif toc_start is not None:
                break
        return toc_start, toc_end
    def _parse_toc_lines(self, text: str) -> dict[str, dict[str, int]]:
        m = re.search(
            r"^(List\s+of\s+Figures|List\s+of\s+Tables|图目录|表目录)",
            text, re.IGNORECASE | re.MULTILINE,
        )
        if m:
            text = text[: m.start()]
        pat = re.compile(
            r"^\s*(?P<title>.+?)\s*(?:\.{2,}|\s)\s*(?P<page>\d{1,5})\s*$"
        )
        entries: list[tuple[str, int]] = []
        for raw in text.splitlines():
            line = raw.strip()
            if not line or len(line) < 3 or re.fullmatch(r"\d+", line):
                continue
            m2 = pat.match(line)
            if not m2:
                continue
            title = re.sub(r"\s+", " ", m2.group("title")).strip("-_:： ")
            page = self._to_int(m2.group("page"), None)
            if not title or page is None or len(title) <= 1:
                continue
            if title.lower() in {"page", "pages", "目录", "contents"}:
                continue
            entries.append((title, page))
        if not entries:
            return {}
        dedup: OrderedDict[str, int] = OrderedDict()
        for t, p in entries:
            dedup.setdefault(t, p)
        titles = list(dedup.keys())
        pages = [dedup[t] for t in titles]
        catalog: dict[str, dict[str, int]] = OrderedDict()
        for i, t in enumerate(titles):
            s = pages[i]
            e = max(s, pages[i + 1] - 1) if i + 1 < len(pages) else s
            catalog[t] = {"start": s, "end": e}
        return catalog
    @staticmethod
    def _guess_offset(chapters: dict, toc_end: int) -> int:
        first_page = None
        for info in chapters.values():
            s = info["start"]
            if first_page is None or s < first_page:
                first_page = s
        if first_page is None:
            return 0
        return (toc_end + 1) - first_page
    # ── per-page conversion ──────────────────────────────────
    def _page_to_markdown(
        self,
        doc: fitz.Document,
        page: fitz.Page,
        page_idx: int,
        include_images: bool,
        image_dpi: int,
        max_image_bytes: int,
    ) -> str:
        """Render one page as Markdown: text, tables, then optional images.

        Args:
            doc: the open document, used to extract embedded images by xref.
            page: the page to convert.
            page_idx: page index, used only in the image alt-text labels.
            include_images: when False, return after text and tables.
            image_dpi: resolution for rendering the page's vector drawings.
            max_image_bytes: size cap applied to each raw embedded image and
                to the rendered drawing PNG; larger ones are skipped.

        Returns:
            The collected parts joined by blank lines ("" for an empty page).
            Every extraction stage is best effort: exceptions are swallowed
            and the stage contributes whatever it gathered before failing.
        """
        parts: list[str] = []
        # ── text ──
        text = (page.get_text("text", sort=True) or "").strip()
        if text:
            parts.append(text)
        # ── tables → Markdown ──
        # Only the first 5 detected tables are considered; tables with fewer
        # than 2 rows are ignored.
        try:
            for tab in (page.find_tables().tables or [])[:5]:
                cells = tab.extract() or []
                if len(cells) >= 2:
                    md = self._cells_to_md_table(cells)
                    if md:
                        parts.append(md)
        except Exception:
            pass
        if not include_images:
            return "\n\n".join(parts)
        # ── embedded raster images ──
        # The inner try isolates each image so one bad image does not stop the
        # rest; the outer try covers the image listing itself.
        try:
            for img_idx, img_info in enumerate(page.get_images(full=True)):
                xref = img_info[0]
                try:
                    data = doc.extract_image(xref)
                    if not data or not data.get("image"):
                        continue
                    raw = data["image"]
                    if len(raw) > max_image_bytes:
                        continue
                    # skip tiny icons: dropped only when BOTH width and
                    # height are below 20
                    w = data.get("width", 0)
                    h = data.get("height", 0)
                    if w < 20 and h < 20:
                        continue
                    ext = data.get("ext", "png")
                    mime = "image/jpeg" if ext in ("jpg", "jpeg") else f"image/{ext}"
                    b64 = base64.b64encode(raw).decode("ascii")
                    parts.append(
                        f"![img-p{page_idx}-{img_idx}](data:{mime};base64,{b64})"
                    )
                except Exception:
                    pass
        except Exception:
            pass
        # ── vector drawings → render as PNG ──
        # Pages with fewer than 3 drawing objects are not rendered.
        try:
            drawings = page.get_drawings()
            if len(drawings) >= 3:
                valid_rects: list[fitz.Rect] = []
                for d in drawings:
                    r = d.get("rect")
                    if r:
                        try:
                            rect = fitz.Rect(r)
                            if rect.is_valid and not rect.is_empty:
                                valid_rects.append(rect)
                        except Exception:
                            pass
                if valid_rects:
                    # Union of all drawing rectangles, clipped to the page.
                    bbox = valid_rects[0]
                    for r in valid_rects[1:]:
                        bbox |= r
                    bbox &= page.rect
                    # Skip regions that are not larger than 30 in both
                    # dimensions (page coordinate units).
                    if bbox.width > 30 and bbox.height > 30:
                        # Scale factor relative to the 72-DPI base.
                        scale = image_dpi / 72
                        mat = fitz.Matrix(scale, scale)
                        pix = page.get_pixmap(matrix=mat, clip=bbox)
                        png = pix.tobytes("png")
                        if len(png) <= max_image_bytes:
                            b64 = base64.b64encode(png).decode("ascii")
                            parts.append(
                                f"![drawing-p{page_idx}](data:image/png;base64,{b64})"
                            )
        except Exception:
            pass
        return "\n\n".join(parts)
    # ── assembly ─────────────────────────────────────────────
    def _assemble_by_chapters(
        self,
        chapters: dict[str, dict[str, int]],
        page_mds: list[str],
        offset: int,
        num_pages: int,
    ) -> str:
        parts: list[str] = []
        for name, info in chapters.items():
            s = info["start"] + offset
            e = info["end"] + offset
            s = max(0, min(s, num_pages - 1))
            e = max(s, min(e, num_pages - 1))
            ch: list[str] = [f"# {name}\n"]
            for idx in range(s, e + 1):
                if idx < len(page_mds) and page_mds[idx].strip():
                    ch.append(page_mds[idx])
            parts.append("\n\n".join(ch))
        return "\n\n---\n\n".join(parts)
    # ── helpers ──────────────────────────────────────────────
    @staticmethod
    def _cells_to_md_table(cells: list) -> str:
        if not cells:
            return ""
        header = cells[0]
        ncols = len(header)
        if ncols == 0:
            return ""
        clean = lambda c: str(c or "").replace("|", "\\|").replace("\n", " ").strip()
        lines = [
            "| " + " | ".join(clean(c) for c in header) + " |",
            "| " + " | ".join("---" for _ in range(ncols)) + " |",
        ]
        for row in cells[1:]:
            padded = list(row) + [""] * max(0, ncols - len(row))
            lines.append("| " + " | ".join(clean(c) for c in padded[:ncols]) + " |")
        return "\n".join(lines)
    @staticmethod
    def _to_int(value: Any, default: int | None) -> int | None:
        try:
            if value is None or value == "":
                return default
            return int(value)
        except Exception:
            return default
    @staticmethod
    def _to_bool(value: Any, default: bool) -> bool:
        if value is None:
            return default
        if isinstance(value, bool):
            return value
        s = str(value).strip().lower()
        if s in {"1", "true", "yes", "on"}:
            return True
        if s in {"0", "false", "no", "off"}:
            return False
        return default
--- a/difyPlugin/pdf/tools/pdf_to_markdown.yaml
+++ b/difyPlugin/pdf/tools/pdf_to_markdown.yaml
@@ -0,0 +1,68 @@
 identity:
  name: "pdf_to_markdown"
  author: "yslg"
  label:
    en_US: "PDF to Markdown"
    zh_Hans: "PDF转Markdown"
    pt_BR: "PDF para Markdown"
    ja_JP: "PDFからMarkdown"
 description:
  human:
    en_US: "Convert PDF to a single Markdown file with embedded base64 images. No LLM needed."
    zh_Hans: "将PDF转换为单个Markdown文件，图片以base64嵌入，无需大模型"
    pt_BR: "Converter PDF em um arquivo Markdown com imagens base64 incorporadas. Sem LLM."
    ja_JP: "PDFをbase64画像埋め込みの単一Markdownファイルに変換。LLM不要。"
  llm: "Convert a PDF file into a single Markdown (.md) file. Extracts text, tables, images (base64), and vector drawings. Auto-detects TOC and organizes by chapters. No LLM needed."
 parameters:
  - name: file
    type: file
    required: true
    label:
      en_US: PDF File
      zh_Hans: PDF 文件
      pt_BR: Arquivo PDF
      ja_JP: PDFファイル
    human_description:
      en_US: "PDF file to convert"
      zh_Hans: "要转换的 PDF 文件"
      pt_BR: "Arquivo PDF para converter"
      ja_JP: "変換するPDFファイル"
    llm_description: "PDF file to convert to Markdown"
    form: llm
    fileTypes:
      - "pdf"
  - name: include_images
    type: boolean
    required: false
    label:
      en_US: Include Images
      zh_Hans: 包含图片
      pt_BR: Incluir Imagens
      ja_JP: 画像を含める
    human_description:
      en_US: "Whether to embed images as base64 in the Markdown output (default: true)"
      zh_Hans: "是否将图片以base64嵌入Markdown输出（默认：是）"
      pt_BR: "Se deve incorporar imagens como base64 na saída Markdown (padrão: verdadeiro)"
      ja_JP: "Markdown出力にbase64として画像を埋め込むかどうか（デフォルト：はい）"
    llm_description: "Set to true to embed images as base64, false to skip images"
    form: form
    default: true
  - name: image_dpi
    type: number
    required: false
    label:
      en_US: Image DPI
      zh_Hans: 图片DPI
      pt_BR: DPI da Imagem
      ja_JP: 画像DPI
    human_description:
      en_US: "DPI for rendering vector drawings (72-300, default: 150)"
      zh_Hans: "矢量图渲染DPI（72-300，默认150）"
      pt_BR: "DPI para renderizar desenhos vetoriais (72-300, padrão: 150)"
      ja_JP: "ベクター描画のレンダリングDPI（72-300、デフォルト：150）"
    llm_description: "Resolution for rendering vector drawings as images. Range 72-300, default 150."
    form: form
    default: 150
 extra:
  python:
    source: tools/pdf_to_markdown.py
--- a/difyPlugin/pdf/tools/pdf_toc.py
+++ b/difyPlugin/pdf/tools/pdf_toc.py
@@ -0,0 +1,273 @@
 import json
 import re
 from collections import OrderedDict
 from collections.abc import Generator
 from typing import Any
 from dify_plugin import Tool
 from dify_plugin.entities.model.llm import LLMModelConfig
 from dify_plugin.entities.model.message import SystemPromptMessage, UserPromptMessage
 from dify_plugin.entities.tool import ToolInvokeMessage
 # System prompt sent as the SystemPromptMessage in PdfTocTool._parse_with_llm.
 # It asks the model for a bare JSON object mapping each chapter name to
 # {"start", "end"}; PdfTocTool._normalize_catalog reads those two keys.
 # This is a runtime string: any edit changes what the model is told.
 _SYSTEM_PROMPT = """You parse PDF table-of-contents text.
 Return only valid JSON object, no markdown fences, no explanation.
 Output schema:
 {
  "Chapter Name": {"start": 1, "end": 5},
  "Another": {"start": 6, "end": 20}
 }
 Rules:
 - start/end are integer printed page numbers from TOC.
 - If end is unknown, use same value as start.
 - Keep chapter names exactly as in TOC text.
 """
 class PdfTocTool(Tool):
    def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
        toc_start = self._to_int(tool_parameters.get("toc_start"), None)
        toc_end = self._to_int(tool_parameters.get("toc_end"), None)
        toc_pages = (tool_parameters.get("toc_pages") or "").strip()
        model_config = tool_parameters.get("model")
        if toc_start is None or toc_end is None:
            yield self.create_text_message("Error: toc_start and toc_end are required")
            return
        if not toc_pages:
            yield self.create_text_message("Error: toc_pages text is empty")
            return
        cleaned = self._strip_index_lists(toc_pages)
        # 1) deterministic parser first
        catalog = self._parse_toc_lines(cleaned)
        # 2) optional LLM fallback/enhance only when deterministic parser gives no result
        llm_raw_output = ""
        llm_error = None
        if not catalog and model_config:
            llm_catalog, llm_raw_output, llm_error = self._parse_with_llm(
                toc_start=toc_start,
                toc_end=toc_end,
                toc_pages=cleaned,
                model_config=model_config,
            )
            if llm_catalog:
                catalog = self._normalize_catalog(llm_catalog)
        result: dict[str, Any] = {
            "toc_start": toc_start,
            "toc_end": toc_end,
            "catalog": catalog,
            "meta": {
                "catalog_size": len(catalog),
                "parser": "rule" if catalog else "none",
            },
        }
        if llm_raw_output:
            result["meta"]["llm_used"] = True
        if llm_error:
            result["meta"]["llm_error"] = llm_error
        # always return valid json text payload for downstream json.loads
        yield self.create_text_message(json.dumps(result, ensure_ascii=False))
        yield self.create_json_message(result)
    def _parse_with_llm(
        self,
        toc_start: int,
        toc_end: int,
        toc_pages: str,
        model_config: dict[str, Any],
    ) -> tuple[dict[str, Any] | None, str, str | None]:
        """Ask the configured LLM to turn TOC text into a JSON object.

        Args:
            toc_start: First TOC page index, quoted in the prompt.
            toc_end: Last TOC page index, quoted in the prompt.
            toc_pages: TOC text sent to the model.
            model_config: Keyword arguments for ``LLMModelConfig``.

        Returns:
            ``(parsed, raw_text, error)``. ``parsed`` is the decoded JSON
            object, or ``None`` with an ``error`` message when the reply could
            not be decoded or is not a JSON object. ``raw_text`` is the model's
            reply text.
        """
        user_content = f"TOC page index range: {toc_start}..{toc_end}\n\nTOC raw text:\n{toc_pages}"
        llm_settings = LLMModelConfig(**model_config)
        prompt = [
            SystemPromptMessage(content=_SYSTEM_PROMPT),
            UserPromptMessage(content=user_content),
        ]
        reply = self.session.model.llm.invoke(
            model_config=llm_settings,
            prompt_messages=prompt,
            stream=False,
        )

        # The reply content may be a plain string or a list of content parts.
        raw_text = ""
        message = getattr(reply, "message", None)
        if message:
            body = message.content
            if isinstance(body, list):
                pieces = [
                    part.data if hasattr(part, "data") else str(part) for part in body
                ]
                raw_text = "".join(pieces)
            elif isinstance(body, str):
                raw_text = body

        decoded = self._extract_json_object(raw_text)
        if decoded is None:
            return None, raw_text, "Failed to parse LLM output as JSON"
        if isinstance(decoded, dict):
            return decoded, raw_text, None
        return None, raw_text, "LLM output JSON is not an object"
    @staticmethod
    def _strip_index_lists(text: str) -> str:
        # Stop before common appendix lists that pollute TOC parsing.
        pattern = re.compile(
            r"^(List\s+of\s+Figures|List\s+of\s+Tables|图目录|表目录)",
            re.IGNORECASE | re.MULTILINE,
        )
        m = pattern.search(text)
        return text[: m.start()].rstrip() if m else text
    def _parse_toc_lines(self, text: str) -> dict[str, dict[str, int]]:
        """Parse lines like:
        1.2 Engine Overview ........ 35
        Appendix A  120
        """
        line_pattern = re.compile(
            r"^\s*(?P<title>.+?)\s*(?:\.{2,}|\s)\s*(?P<page>\d{1,5})\s*$"
        )
        entries: list[tuple[str, int]] = []
        for raw in text.splitlines():
            line = raw.strip()
            if not line or len(line) < 3:
                continue
            if re.fullmatch(r"\d+", line):
                continue
            m = line_pattern.match(line)
            if not m:
                continue
            title = re.sub(r"\s+", " ", m.group("title")).strip("-_:： ")
            page = self._to_int(m.group("page"), None)
            if not title or page is None:
                continue
            # Skip obvious noise.
            if len(title) <= 1 or title.lower() in {"page", "pages", "目录", "contents"}:
                continue
            entries.append((title, page))
        if not entries:
            return {}
        # Deduplicate keeping earliest appearance.
        dedup: OrderedDict[str, int] = OrderedDict()
        for title, page in entries:
            if title not in dedup:
                dedup[title] = page
        titles = list(dedup.keys())
        pages = [dedup[t] for t in titles]
        catalog: dict[str, dict[str, int]] = {}
        for i, title in enumerate(titles):
            start = pages[i]
            if i + 1 < len(pages):
                next_start = pages[i + 1]
                end = max(start, next_start - 1)
            else:
                end = start
            catalog[title] = {"start": int(start), "end": int(end)}
        return catalog
    def _normalize_catalog(self, raw: dict[str, Any]) -> dict[str, dict[str, int]]:
        catalog: dict[str, dict[str, int]] = {}
        source = raw.get("catalog") if isinstance(raw.get("catalog"), dict) else raw
        if not isinstance(source, dict):
            return catalog
        for name, value in source.items():
            if not isinstance(name, str) or not isinstance(value, dict):
                continue
            start = self._to_int(value.get("start"), None)
            end = self._to_int(value.get("end"), start)
            if start is None:
                continue
            if end is None:
                end = start
            catalog[name] = {"start": int(start), "end": int(max(start, end))}
        return catalog
    @staticmethod
    def _extract_json_object(text: str) -> Any:
        if not text:
            return None
        candidates: list[str] = []
        code_blocks = re.findall(r"```(?:json)?\s*([\s\S]*?)\s*```", text, flags=re.IGNORECASE)
        candidates.extend([c.strip() for c in code_blocks if c.strip()])
        brace_candidate = PdfTocTool._extract_first_brace_object(text)
        if brace_candidate:
            candidates.append(brace_candidate)
        candidates.append(text.strip())
        for cand in candidates:
            parsed = PdfTocTool._json_try_parse(cand)
            if parsed is not None:
                return parsed
        return None
    @staticmethod
    def _extract_first_brace_object(text: str) -> str | None:
        start = text.find("{")
        if start < 0:
            return None
        depth = 0
        in_str = False
        escape = False
        for i in range(start, len(text)):
            ch = text[i]
            if in_str:
                if escape:
                    escape = False
                elif ch == "\\":
                    escape = True
                elif ch == '"':
                    in_str = False
                continue
            if ch == '"':
                in_str = True
            elif ch == "{":
                depth += 1
            elif ch == "}":
                depth -= 1
                if depth == 0:
                    return text[start : i + 1]
        return None
    @staticmethod
    def _json_try_parse(text: str) -> Any:
        try:
            return json.loads(text)
        except Exception:
            pass
        # Minimal repair: remove trailing commas before } or ]
        repaired = re.sub(r",\s*([}\]])", r"\1", text)
        try:
            return json.loads(repaired)
        except Exception:
            return None
    @staticmethod
    def _to_int(value: Any, default: int | None) -> int | None:
        try:
            if value is None or value == "":
                return default
            return int(value)
        except Exception:
            return default
--- a/difyPlugin/pdf/tools/pdf_toc.yaml
+++ b/difyPlugin/pdf/tools/pdf_toc.yaml
@@ -0,0 +1,79 @@
 identity:
  name: "pdf_toc"
  author: "yslg"
  label:
    en_US: "PDF TOC Parser"
    zh_Hans: "PDF目录解析"
    pt_BR: "Analisador de Sumário PDF"
    ja_JP: "PDF目次解析"
 description:
  human:
    en_US: "Parse PDF table-of-contents text (from pdf_column_range) into a structured JSON catalog using rule-based parsing, with an LLM fallback"
    zh_Hans: "将PDF目录文本（来自目录页提取工具的输出）解析为结构化JSON目录，优先使用规则解析，无结果时回退到LLM"
    pt_BR: "Analisar texto do sumário PDF em catálogo JSON estruturado por regras, com LLM como alternativa"
    ja_JP: "PDF目次テキストをルールベースで構造化JSONカタログに解析（結果がない場合はLLMにフォールバック）"
  llm: "Parse PDF table-of-contents text into structured JSON with chapter names and page ranges. Input is the output of pdf_column_range tool (start/end/pages)."
 parameters:
  - name: toc_start
    type: number
    required: true
    label:
      en_US: TOC Start Page
      zh_Hans: 目录起始页
      pt_BR: Página Inicial do Sumário
      ja_JP: 目次開始ページ
    human_description:
      en_US: "Start page index of TOC (from pdf_column_range output)"
      zh_Hans: "目录起始页码（来自目录页提取工具输出的 start）"
      pt_BR: "Índice da página inicial do sumário"
      ja_JP: "目次の開始ページ番号"
    llm_description: "Start page index of TOC section, from pdf_column_range output field 'start'"
    form: llm
  - name: toc_end
    type: number
    required: true
    label:
      en_US: TOC End Page
      zh_Hans: 目录结束页
      pt_BR: Página Final do Sumário
      ja_JP: 目次終了ページ
    human_description:
      en_US: "End page index of TOC (from pdf_column_range output)"
      zh_Hans: "目录结束页码（来自目录页提取工具输出的 end）"
      pt_BR: "Índice da página final do sumário"
      ja_JP: "目次の終了ページ番号"
    llm_description: "End page index of TOC section, from pdf_column_range output field 'end'"
    form: llm
  - name: toc_pages
    type: string
    required: true
    label:
      en_US: TOC Page Text
      zh_Hans: 目录页文本
      pt_BR: Texto das Páginas do Sumário
      ja_JP: 目次ページテキスト
    human_description:
      en_US: "Raw text content of TOC pages (from pdf_column_range output 'pages' array, joined)"
      zh_Hans: "目录页原始文本内容（来自目录页提取工具输出的 pages 数组）"
      pt_BR: "Conteúdo de texto bruto das páginas do sumário"
      ja_JP: "目次ページの生テキスト内容"
    llm_description: "Raw text content extracted from TOC pages, from pdf_column_range output field 'pages'"
    form: llm
  - name: model
    type: model-selector
    scope: llm
    required: true
    label:
      en_US: LLM Model
      zh_Hans: LLM 模型
      pt_BR: Modelo LLM
      ja_JP: LLMモデル
    human_description:
      en_US: "LLM model for parsing TOC into structured JSON"
      zh_Hans: "用于解析目录的 LLM 模型"
      pt_BR: "Modelo LLM para análise do sumário"
      ja_JP: "目次解析用のLLMモデル"
    form: form
 extra:
  python:
    source: tools/pdf_toc.py
`@@ -1,2 +1,2 @@`
	`dify_plugin>=0.4.0,<0.7.0`	`dify_plugin>=0.4.0,<0.7.0`
	`PyPDF2>=3.0.1`	`pymupdf>=1.27.1`