diff --git a/ai-management-platform b/ai-management-platform index 96f7c3aa..6bbe3e41 160000 --- a/ai-management-platform +++ b/ai-management-platform @@ -1 +1 @@ -Subproject commit 96f7c3aa4c9ac8b00e0b98b5a4998b5f910d5337 +Subproject commit 6bbe3e4181466bc86712e2d0abdba36ed8988082 diff --git a/difyPlugin/pdf/provider/pdf.yaml b/difyPlugin/pdf/provider/pdf.yaml index c7473239..754fd59d 100644 --- a/difyPlugin/pdf/provider/pdf.yaml +++ b/difyPlugin/pdf/provider/pdf.yaml @@ -1,4 +1,4 @@ -identity: +identity: author: "yslg" name: "pdf" label: @@ -13,54 +13,8 @@ identity: ja_JP: "pdfTools" icon: "icon.svg" -######################################################################################### -# If you want to support OAuth, you can uncomment the following code. -######################################################################################### -# oauth_schema: -# client_schema: -# - name: "client_id" -# type: "secret-input" -# required: true -# url: https://example.com/oauth/authorize -# placeholder: -# en_US: "Please input your Client ID" -# zh_Hans: "请输入你的 Client ID" -# pt_BR: "Insira seu Client ID" -# help: -# en_US: "Client ID is used to authenticate requests to the example.com API." -# zh_Hans: "Client ID 用于认证请求到 example.com API。" -# pt_BR: "Client ID é usado para autenticar solicitações à API do example.com." -# label: -# zh_Hans: "Client ID" -# en_US: "Client ID" -# - name: "client_secret" -# type: "secret-input" -# required: true -# url: https://example.com/oauth/authorize -# placeholder: -# en_US: "Please input your Client Secret" -# zh_Hans: "请输入你的 Client Secret" -# pt_BR: "Insira seu Client Secret" -# help: -# en_US: "Client Secret is used to authenticate requests to the example.com API." -# zh_Hans: "Client Secret 用于认证请求到 example.com API。" -# pt_BR: "Client Secret é usado para autenticar solicitações à API do example.com." -# label: -# zh_Hans: "Client Secret" -# en_US: "Client Secret" -# credentials_schema: -# - name: "access_token" -# type: "secret-input" -# label: -# zh_Hans: "Access Token" -# en_US: "Access Token" - tools: - - tools/pdf_column_range.yaml - - tools/pdf_single_page.yaml - - tools/pdf_summary.yaml - tools/pdf_toc.yaml - - tools/pdf_extract_range.yaml - tools/pdf_to_markdown.yaml extra: python: diff --git a/difyPlugin/pdf/tools/pdf_column_range.py b/difyPlugin/pdf/tools/pdf_column_range.py deleted file mode 100644 index 5d5f5db8..00000000 --- a/difyPlugin/pdf/tools/pdf_column_range.py +++ /dev/null @@ -1,107 +0,0 @@ -import json -import re -from collections.abc import Generator -from io import BytesIO -from typing import Any - -import fitz # PyMuPDF 核心库 -from dify_plugin import Tool -from dify_plugin.entities.tool import ToolInvokeMessage - - -class PdfTool(Tool): - def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]: - file = tool_parameters.get("file") - if not file: - yield self.create_text_message("Error: file is required") - return - - # 从字节流加载 PDF(替换 PyPDF2) - pdf_bytes = file.blob - doc = fitz.open(stream=pdf_bytes, filetype="pdf") - num_pages = len(doc) - - toc_start = None - toc_end = None - - # 目录匹配正则(与原代码一致) - toc_patterns = [ - r'目录', - r'目 录', - r'目\u3000录', - r'Table of Contents', - r'Contents', - r'目次' - ] - - # 遍历页面识别目录页(逻辑不变,仅替换文本提取方式) - for page_num in range(num_pages): - page = doc[page_num] - text = page.get_text() or "" # PyMuPDF 提取文本 - - if any(re.search(pattern, text, re.IGNORECASE) for pattern in toc_patterns): - if toc_start is None: - toc_start = page_num - toc_end = page_num - elif toc_start is not None and toc_end is not None: - break - - # 提取目录页文本 - toc_pages = [] - if toc_start is not None and toc_end is not None: - for page_num in range(toc_start, toc_end + 1): - page = doc[page_num] - toc_pages.append(page.get_text() or "") - - # 关闭文档 - doc.close() - - result = { - "start": toc_start, - "end": toc_end, - "pages": toc_pages, - "pages_text": "\n".join(toc_pages) if toc_pages else "", - } - yield self.create_text_message(json.dumps(result, ensure_ascii=False)) - yield self.create_json_message(result) - - -if __name__ == "__main__": - # 测试代码(改用 PyMuPDF) - pdf_path = r"F:\Project\urbanLifeline\docs\AI训练资料\菱重S12R发动机说明书.pdf" - doc = fitz.open(pdf_path) # 本地文件直接打开 - num_pages = len(doc) - - toc_start = None - toc_end = None - - toc_patterns = [ - r'目录', - r'目 录', - r'目\u3000录', - r'Table of Contents', - r'Contents', - r'目次' - ] - - # 遍历页面找目录 - for page_num in range(num_pages): - page = doc[page_num] - text = page.get_text() or "" - if any(re.search(pattern, text, re.IGNORECASE) for pattern in toc_patterns): - if toc_start is None: - toc_start = page_num - toc_end = page_num - elif toc_start is not None and toc_end is not None: - break - - # 提取目录页文本 - toc_pages = [] - toc_start = toc_start if toc_start is not None else 18 - toc_end = toc_end if toc_end is not None else toc_start + 9 - for page_num in range(toc_start, toc_end): - page = doc[page_num] - toc_pages.append(page.get_text() or "") - - print(toc_start, toc_end, toc_pages) - doc.close() # 关闭文档 \ No newline at end of file diff --git a/difyPlugin/pdf/tools/pdf_column_range.yaml b/difyPlugin/pdf/tools/pdf_column_range.yaml deleted file mode 100644 index 8f758dd7..00000000 --- a/difyPlugin/pdf/tools/pdf_column_range.yaml +++ /dev/null @@ -1,36 +0,0 @@ -identity: - name: "pdf" - author: "yslg" - label: - en_US: "Extract TOC Pages and Content" - zh_Hans: "提取目录页和内容" - pt_BR: "Extrair páginas de sumário e conteúdo" - ja_JP: "目次ページと内容を抽出" -description: - human: - en_US: "Extract table-of-contents page range and all page text in that range" - zh_Hans: "提取目录页范围以及该范围内所有页文本" - pt_BR: "Extrair intervalo de páginas de sumário e todo o texto nesse intervalo" - ja_JP: "目次ページ範囲とその範囲内の全ページテキストを抽出" - llm: "Extract table-of-contents page range and all page text in that range" -parameters: - - name: file - type: file - required: true - label: - en_US: PDF File - zh_Hans: PDF 文件 - pt_BR: Arquivo PDF - ja_JP: PDFファイル - human_description: - en_US: "PDF file to process" - zh_Hans: "要处理的 PDF 文件" - pt_BR: "Arquivo PDF para processar" - ja_JP: "処理するPDFファイル" - llm_description: "PDF file to process, output contains start/end/pages" - form: llm - fileTypes: - - "pdf" -extra: - python: - source: tools/pdf_column_range.py \ No newline at end of file diff --git a/difyPlugin/pdf/tools/pdf_extract_range.py b/difyPlugin/pdf/tools/pdf_extract_range.py deleted file mode 100644 index fbaa3927..00000000 --- a/difyPlugin/pdf/tools/pdf_extract_range.py +++ /dev/null @@ -1,48 +0,0 @@ -import json -from collections.abc import Generator -from typing import Any - -import fitz # PyMuPDF -from dify_plugin import Tool -from dify_plugin.entities.tool import ToolInvokeMessage - - -class PdfExtractRangeTool(Tool): - def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]: - file = tool_parameters.get("file") - if not file: - yield self.create_text_message("Error: file is required") - return - - start_page = int(tool_parameters.get("start_page", 0)) - end_page = int(tool_parameters.get("end_page", 0)) - - # 打开 PDF - pdf_bytes = file.blob - doc = fitz.open(stream=pdf_bytes, filetype="pdf") - num_pages = len(doc) - - # 边界处理 - start_page = max(0, min(start_page, num_pages - 1)) - end_page = max(start_page, min(end_page, num_pages - 1)) - - # 逐页提取文本 - page_texts = [] - for page_idx in range(start_page, end_page + 1): - page = doc[page_idx] - text = page.get_text("text", sort=True) or "" - page_texts.append(text) - - doc.close() - - # 拼接所有页面文本 - full_text = "\n\n--- 分页 ---\n\n".join(page_texts) - - result = { - "start": start_page, - "end": end_page, - "total_pages": end_page - start_page + 1, - "text": full_text, - } - yield self.create_text_message(json.dumps(result, ensure_ascii=False)) - yield self.create_json_message(result) diff --git a/difyPlugin/pdf/tools/pdf_extract_range.yaml b/difyPlugin/pdf/tools/pdf_extract_range.yaml deleted file mode 100644 index 0bc10b6f..00000000 --- a/difyPlugin/pdf/tools/pdf_extract_range.yaml +++ /dev/null @@ -1,68 +0,0 @@ -identity: - name: "pdf_extract_range" - author: "yslg" - label: - en_US: "Extract Page Range Text" - zh_Hans: "提取页面范围文本" - pt_BR: "Extrair Texto do Intervalo de Páginas" - ja_JP: "ページ範囲テキスト抽出" -description: - human: - en_US: "Extract plain text from a specified page range of a PDF file" - zh_Hans: "从PDF文件的指定页码范围提取纯文本" - pt_BR: "Extrair texto simples de um intervalo de páginas especificado de um arquivo PDF" - ja_JP: "PDFファイルの指定ページ範囲からプレーンテキストを抽出" - llm: "Extract plain text from PDF pages in the given start-end range. Returns concatenated text of all pages in range." -parameters: - - name: file - type: file - required: true - label: - en_US: PDF File - zh_Hans: PDF 文件 - pt_BR: Arquivo PDF - ja_JP: PDFファイル - human_description: - en_US: "PDF file to extract text from" - zh_Hans: "要提取文本的 PDF 文件" - pt_BR: "Arquivo PDF para extrair texto" - ja_JP: "テキストを抽出するPDFファイル" - llm_description: "PDF file to extract page range text from" - form: llm - fileTypes: - - "pdf" - - name: start_page - type: number - required: true - label: - en_US: Start Page - zh_Hans: 起始页码 - pt_BR: Página Inicial - ja_JP: 開始ページ - human_description: - en_US: "Start page index (0-based)" - zh_Hans: "起始页码(从0开始)" - pt_BR: "Índice da página inicial (base 0)" - ja_JP: "開始ページ番号(0始まり)" - llm_description: "Start page index (0-based)" - form: llm - default: 0 - - name: end_page - type: number - required: true - label: - en_US: End Page - zh_Hans: 结束页码 - pt_BR: Página Final - ja_JP: 終了ページ - human_description: - en_US: "End page index (0-based, inclusive)" - zh_Hans: "结束页码(从0开始,包含该页)" - pt_BR: "Índice da página final (base 0, inclusivo)" - ja_JP: "終了ページ番号(0始まり、含む)" - llm_description: "End page index (0-based, inclusive)" - form: llm - default: 0 -extra: - python: - source: tools/pdf_extract_range.py diff --git a/difyPlugin/pdf/tools/pdf_single_page.py b/difyPlugin/pdf/tools/pdf_single_page.py deleted file mode 100644 index 0fa67660..00000000 --- a/difyPlugin/pdf/tools/pdf_single_page.py +++ /dev/null @@ -1,45 +0,0 @@ -import json -from collections.abc import Generator -from io import BytesIO -from typing import Any - -import fitz # PyMuPDF 核心库 -from dify_plugin import Tool -from dify_plugin.entities.tool import ToolInvokeMessage - - -class PdfSinglePageTool(Tool): - def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]: - file = tool_parameters.get("file") - page = tool_parameters.get("page", 0) - - if not file: - yield self.create_text_message("Error: file is required") - return - - # 从字节流加载 PDF(替换 PyPDF2 的 PdfReader) - pdf_bytes = file.blob - doc = fitz.open(stream=pdf_bytes, filetype="pdf") # 字节流方式打开 - num_pages = len(doc) - - # 页码边界处理(逻辑与原代码一致) - page_index = int(page) - if page_index < 0: - page_index = 0 - if page_index >= num_pages: - page_index = num_pages - 1 - - # 提取指定页面文本(PyMuPDF 方式) - selected_page = doc[page_index] - text = selected_page.get_text() or "" # get_text() 提取文本,比 PyPDF2 更精准 - - # 关闭文档释放资源 - doc.close() - - result = { - "start": page_index, - "end": page_index, - "pages": [text] - } - yield self.create_text_message(json.dumps(result, ensure_ascii=False)) - yield self.create_json_message(result) \ No newline at end of file diff --git a/difyPlugin/pdf/tools/pdf_single_page.yaml b/difyPlugin/pdf/tools/pdf_single_page.yaml deleted file mode 100644 index 5fef64ef..00000000 --- a/difyPlugin/pdf/tools/pdf_single_page.yaml +++ /dev/null @@ -1,52 +0,0 @@ -identity: - name: "pdf_single_page" - author: "yslg" - label: - en_US: "Extract Single-Page Text" - zh_Hans: "提取单页文字" - pt_BR: "Extrair texto de página única" - ja_JP: "単一ページのテキストを抽出" -description: - human: - en_US: "Extract text from one specified page" - zh_Hans: "提取指定单页文字" - pt_BR: "Extrair texto de uma página especificada" - ja_JP: "指定した1ページのテキストを抽出" - llm: "Extract text from one specified page" -parameters: - - name: file - type: file - required: true - label: - en_US: PDF File - zh_Hans: PDF 文件 - pt_BR: Arquivo PDF - ja_JP: PDFファイル - human_description: - en_US: "PDF file to process" - zh_Hans: "要处理的 PDF 文件" - pt_BR: "Arquivo PDF para processar" - ja_JP: "処理するPDFファイル" - llm_description: "PDF file to process" - form: llm - fileTypes: - - "pdf" - - name: page - type: number - required: true - label: - en_US: Page Index - zh_Hans: 页码 - pt_BR: Índice da Página - ja_JP: ページ番号 - human_description: - en_US: "Single page index to extract" - zh_Hans: "要提取的单页页码" - pt_BR: "Índice da página única para extrair" - ja_JP: "抽出対象のページ番号" - llm_description: "Single page index to extract" - form: llm - default: 0 -extra: - python: - source: tools/pdf_single_page.py diff --git a/difyPlugin/pdf/tools/pdf_summary.py b/difyPlugin/pdf/tools/pdf_summary.py deleted file mode 100644 index 684914c7..00000000 --- a/difyPlugin/pdf/tools/pdf_summary.py +++ /dev/null @@ -1,209 +0,0 @@ -import json -import re -from collections.abc import Generator -from typing import Any - -import fitz -from dify_plugin import Tool -from dify_plugin.entities.model.llm import LLMModelConfig -from dify_plugin.entities.model.message import SystemPromptMessage, UserPromptMessage -from dify_plugin.entities.tool import ToolInvokeMessage - - -class PdfSummaryTool(Tool): - """Fast PDF page summary tool. - - Default behavior is optimized for throughput in large workflows: - - Extract plain text and lightweight table data only. - - Skip expensive image base64 and drawing path extraction. - - Skip LLM by default unless `use_llm=true` is explicitly passed. - """ - - def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]: - file = tool_parameters.get("file") - if not file: - yield self.create_text_message("Error: file is required") - return - - start_page = self._to_int(tool_parameters.get("pdf_start_page"), 0) - end_page = self._to_int(tool_parameters.get("pdf_end_page"), 0) - model_config = tool_parameters.get("model") - use_llm = self._to_bool(tool_parameters.get("use_llm"), False) - - max_chars_per_page = self._to_int(tool_parameters.get("max_chars_per_page"), 6000) - max_chars_per_page = max(800, min(max_chars_per_page, 20000)) - - llm_prompt = tool_parameters.get( - "llm_prompt", - "请基于输入的PDF页面文本做简洁准确摘要,输出中文要点。不要输出思考过程。", - ) - - pdf_bytes = file.blob - doc = fitz.open(stream=pdf_bytes, filetype="pdf") - try: - num_pages = len(doc) - start_page = max(0, min(start_page, num_pages - 1)) - end_page = max(start_page, min(end_page, num_pages - 1)) - - pages_data: list[dict[str, Any]] = [] - for page_idx in range(start_page, end_page + 1): - page = doc[page_idx] - page_data = self._extract_page_fast(page, page_idx, max_chars_per_page) - pages_data.append(page_data) - - result = { - "total_pages_extracted": len(pages_data), - "page_range": {"start": start_page, "end": end_page}, - "pages": pages_data, - } - yield self.create_json_message(result) - - # Fast local summary first (deterministic, no model latency) - local_text = self._build_local_summary(pages_data) - - # Optional LLM refinement, explicitly enabled only - if use_llm and model_config: - refined = self._summarize_with_llm(local_text, llm_prompt, model_config) - final_text = refined if refined else local_text - else: - final_text = local_text - - if final_text: - yield self.create_text_message(final_text) - finally: - doc.close() - - def _extract_page_fast(self, page: fitz.Page, page_idx: int, max_chars_per_page: int) -> dict[str, Any]: - text = (page.get_text("text") or "").strip() - if len(text) > max_chars_per_page: - text = text[:max_chars_per_page] + "\n...[truncated]" - - tables: list[dict[str, Any]] = [] - try: - tabs = page.find_tables() - for tab_idx, tab in enumerate(tabs.tables[:3]): - cells = tab.extract() or [] - tables.append( - { - "index": tab_idx, - "rows": tab.row_count, - "cols": tab.col_count, - "cells": cells[:10], - } - ) - except Exception: - pass - - return { - "page_number": page_idx, - "text": text, - "tables": tables, - "images": [], - "drawings_summary": [], - "text_blocks": [], - "width": float(page.rect.width), - "height": float(page.rect.height), - } - - def _build_local_summary(self, pages_data: list[dict[str, Any]]) -> str: - """Output actual page content as Markdown (text + tables). - - No LLM needed downstream — the text is already usable Markdown. - """ - parts: list[str] = [] - for page in pages_data: - text = (page.get("text") or "").strip() - tables = page.get("tables") or [] - - page_parts: list[str] = [] - if text: - page_parts.append(text) - - for tab in tables: - cells = tab.get("cells") or [] - if len(cells) >= 2: - md = self._cells_to_md_table(cells) - if md: - page_parts.append(md) - - if page_parts: - parts.append("\n\n".join(page_parts)) - - return "\n\n--- 分页 ---\n\n".join(parts) - - @staticmethod - def _cells_to_md_table(cells: list) -> str: - if not cells: - return "" - header = cells[0] - ncols = len(header) - if ncols == 0: - return "" - clean = lambda c: str(c or "").replace("|", "\\|").replace("\n", " ").strip() - lines = [ - "| " + " | ".join(clean(c) for c in header) + " |", - "| " + " | ".join("---" for _ in range(ncols)) + " |", - ] - for row in cells[1:]: - padded = list(row) + [""] * max(0, ncols - len(row)) - lines.append("| " + " | ".join(clean(c) for c in padded[:ncols]) + " |") - return "\n".join(lines) - - def _summarize_with_llm(self, local_text: str, llm_prompt: str, model_config: dict[str, Any]) -> str: - response = self.session.model.llm.invoke( - model_config=LLMModelConfig(**model_config), - prompt_messages=[ - SystemPromptMessage(content=llm_prompt), - UserPromptMessage(content=local_text), - ], - stream=False, - ) - - llm_text = "" - if hasattr(response, "message") and response.message: - content = response.message.content - if isinstance(content, str): - llm_text = content - elif isinstance(content, list): - llm_text = "".join( - item.data if hasattr(item, "data") else str(item) - for item in content - ) - - return self._extract_visible_answer(llm_text) - - @staticmethod - def _extract_visible_answer(text: str) -> str: - if not text: - return "" - - box_match = re.search(r"<\|begin_of_box\|>([\s\S]*?)<\|end_of_box\|>", text) - if box_match: - text = box_match.group(1) - else: - text = re.sub(r"[\s\S]*?", "", text, flags=re.IGNORECASE) - - text = re.sub(r"<\|[^>]+\|>", "", text) - return text.strip() - - @staticmethod - def _to_int(value: Any, default: int) -> int: - try: - if value is None or value == "": - return default - return int(value) - except Exception: - return default - - @staticmethod - def _to_bool(value: Any, default: bool) -> bool: - if value is None: - return default - if isinstance(value, bool): - return value - s = str(value).strip().lower() - if s in {"1", "true", "yes", "on"}: - return True - if s in {"0", "false", "no", "off"}: - return False - return default diff --git a/difyPlugin/pdf/tools/pdf_summary.yaml b/difyPlugin/pdf/tools/pdf_summary.yaml deleted file mode 100644 index 059c920d..00000000 --- a/difyPlugin/pdf/tools/pdf_summary.yaml +++ /dev/null @@ -1,99 +0,0 @@ -identity: - name: "pdf_summary" - author: "yslg" - label: - en_US: "PDF Page Summary" - zh_Hans: "PDF页面概述" - pt_BR: "Resumo de Página PDF" - ja_JP: "PDFページ概要" -description: - human: - en_US: "Extract core elements (text, image, table, path) from PDF pages with coordinates, then summarize via LLM" - zh_Hans: "提取PDF页面核心元素(文本、图片、表格、路径)及坐标,并通过LLM进行概述" - pt_BR: "Extrair elementos principais (texto, imagem, tabela, caminho) de páginas PDF com coordenadas e resumir via LLM" - ja_JP: "PDFページからコア要素(テキスト、画像、テーブル、パス)を座標付きで抽出し、LLMで要約" - llm: "Extract core elements (text, image, table, drawing path) with coordinates from specified PDF page range, then use LLM to summarize the content" -parameters: - - name: file - type: file - required: true - label: - en_US: PDF File - zh_Hans: PDF 文件 - pt_BR: Arquivo PDF - ja_JP: PDFファイル - human_description: - en_US: "PDF file to process" - zh_Hans: "要处理的 PDF 文件" - pt_BR: "Arquivo PDF para processar" - ja_JP: "処理するPDFファイル" - llm_description: "PDF file to extract elements from and summarize" - form: llm - fileTypes: - - "pdf" - - name: pdf_start_page - type: number - required: true - label: - en_US: Start Page - zh_Hans: 起始页码 - pt_BR: Página Inicial - ja_JP: 開始ページ - human_description: - en_US: "Start page index (0-based)" - zh_Hans: "起始页码(从0开始)" - pt_BR: "Índice da página inicial (base 0)" - ja_JP: "開始ページ番号(0始まり)" - llm_description: "Start page index (0-based) for element extraction" - form: llm - default: 0 - - name: pdf_end_page - type: number - required: true - label: - en_US: End Page - zh_Hans: 结束页码 - pt_BR: Página Final - ja_JP: 終了ページ - human_description: - en_US: "End page index (0-based, inclusive)" - zh_Hans: "结束页码(从0开始,包含该页)" - pt_BR: "Índice da página final (base 0, inclusivo)" - ja_JP: "終了ページ番号(0始まり、含む)" - llm_description: "End page index (0-based, inclusive) for element extraction" - form: llm - default: 0 - - name: model - type: model-selector - scope: llm - required: true - label: - en_US: LLM Model - zh_Hans: LLM 模型 - pt_BR: Modelo LLM - ja_JP: LLMモデル - human_description: - en_US: "LLM model used for summarizing extracted content" - zh_Hans: "用于概述提取内容的 LLM 模型" - pt_BR: "Modelo LLM usado para resumir o conteúdo extraído" - ja_JP: "抽出内容の要約に使用するLLMモデル" - form: form - - name: llm_prompt - type: string - required: false - label: - en_US: LLM Prompt - zh_Hans: LLM 提示词 - pt_BR: Prompt do LLM - ja_JP: LLMプロンプト - human_description: - en_US: "System prompt for LLM summarization" - zh_Hans: "LLM 概述的系统提示词" - pt_BR: "Prompt do sistema para resumo LLM" - ja_JP: "LLM要約用のシステムプロンプト" - llm_description: "System prompt guiding LLM on how to summarize the extracted PDF content" - form: form - default: "你是一个专业的文档分析助手。请根据以下从PDF页面中提取的结构化内容(包含文本、图片信息、表格和矢量图形),对每页内容进行准确、简洁的概述。" -extra: - python: - source: tools/pdf_summary.py diff --git a/difyPlugin/pdf/tools/pdf_to_markdown.py b/difyPlugin/pdf/tools/pdf_to_markdown.py index c00ab31e..75367173 100644 --- a/difyPlugin/pdf/tools/pdf_to_markdown.py +++ b/difyPlugin/pdf/tools/pdf_to_markdown.py @@ -1,6 +1,5 @@ -import base64 +import json import re -from collections import OrderedDict from collections.abc import Generator from typing import Any @@ -10,306 +9,219 @@ from dify_plugin.entities.tool import ToolInvokeMessage class PdfToMarkdownTool(Tool): - """Convert PDF to a single Markdown file. No LLM needed. - - - Auto-detect TOC and organize content by chapters. - - Extract text and tables as Markdown. - - Embed raster images as base64. - - Render vector drawings as base64 PNG. - - Output one .md file via create_blob_message. - """ - - _TOC_PATTERNS = [ - r"目录", r"目 录", r"目\u3000录", - r"Table of Contents", r"Contents", r"目次", - ] - - # ── entry point ────────────────────────────────────────── + """Convert PDF to Markdown using an external catalog array.""" def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]: file = tool_parameters.get("file") + catalog_text = (tool_parameters.get("catalog") or "").strip() if not file: yield self.create_text_message("Error: file is required") return + if not catalog_text: + yield self.create_text_message("Error: catalog is required") + return - include_images = self._to_bool(tool_parameters.get("include_images"), True) - image_dpi = self._to_int(tool_parameters.get("image_dpi"), 150) - image_dpi = max(72, min(image_dpi, 300)) - max_image_bytes = 2 * 1024 * 1024 # skip images > 2 MB raw + catalog = self._parse_catalog(catalog_text) + if not catalog: + yield self.create_text_message("Error: catalog must be a JSON array with title and page indexes") + return doc = fitz.open(stream=file.blob, filetype="pdf") try: num_pages = len(doc) + hf_texts = self._detect_headers_footers(doc, num_pages) + page_mds = [self._page_to_markdown(doc[index], hf_texts) for index in range(num_pages)] + final_md = self._assemble_by_catalog(catalog, page_mds, num_pages) - # 1) Build chapter map (metadata TOC → printed TOC → none) - chapters, content_offset = self._build_chapter_map(doc, num_pages) - - # 2) Convert every page - page_mds: list[str] = [] - for idx in range(num_pages): - md = self._page_to_markdown( - doc, doc[idx], idx, - include_images, image_dpi, max_image_bytes, - ) - page_mds.append(md) - - # 3) Assemble - if chapters: - final_md = self._assemble_by_chapters( - chapters, page_mds, content_offset, num_pages, - ) - else: - final_md = "\n\n---\n\n".join(m for m in page_mds if m.strip()) - - # 4) Output: text (for variable aggregation) + blob (.md file) yield self.create_text_message(final_md) - md_bytes = final_md.encode("utf-8") yield self.create_blob_message( - blob=md_bytes, + blob=final_md.encode("utf-8"), meta={"mime_type": "text/markdown"}, ) finally: doc.close() - # ── chapter detection ──────────────────────────────────── - - def _build_chapter_map( - self, doc: fitz.Document, num_pages: int, - ) -> tuple[dict, int]: - """Return (chapters_dict, content_offset). - - Try embedded PDF TOC metadata first (reliable page mapping). - Fall back to scanning printed TOC pages. - """ - toc = doc.get_toc() - if toc: - chapters = self._chapters_from_metadata(toc, num_pages) - if chapters: - return chapters, 0 - - toc_start, toc_end = self._find_toc_pages(doc, num_pages) - if toc_start is not None and toc_end is not None: - toc_text = "\n".join( - doc[i].get_text() or "" for i in range(toc_start, toc_end + 1) - ) - chapters = self._parse_toc_lines(toc_text) - if chapters: - offset = self._guess_offset(chapters, toc_end) - return chapters, offset - - return {}, 0 - - def _chapters_from_metadata( - self, toc: list, num_pages: int, - ) -> dict[str, dict[str, int]]: - top = [(t, max(0, p - 1)) for lvl, t, p in toc if lvl <= 2 and p >= 1] - if not top: - return {} - chapters: dict[str, dict[str, int]] = OrderedDict() - for i, (title, start) in enumerate(top): - end = top[i + 1][1] - 1 if i + 1 < len(top) else num_pages - 1 - chapters[title] = {"start": start, "end": max(start, end)} - return chapters - - def _find_toc_pages(self, doc, num_pages): - toc_start = toc_end = None - for pn in range(min(num_pages, 30)): - text = doc[pn].get_text() or "" - if any(re.search(p, text, re.IGNORECASE) for p in self._TOC_PATTERNS): - if toc_start is None: - toc_start = pn - toc_end = pn - elif toc_start is not None: - break - return toc_start, toc_end - - def _parse_toc_lines(self, text: str) -> dict[str, dict[str, int]]: - m = re.search( - r"^(List\s+of\s+Figures|List\s+of\s+Tables|图目录|表目录)", - text, re.IGNORECASE | re.MULTILINE, - ) - if m: - text = text[: m.start()] - - pat = re.compile( - r"^\s*(?P.+?)\s*(?:\.{2,}|\s)\s*(?P<page>\d{1,5})\s*$" - ) - entries: list[tuple[str, int]] = [] - for raw in text.splitlines(): - line = raw.strip() - if not line or len(line) < 3 or re.fullmatch(r"\d+", line): - continue - m2 = pat.match(line) - if not m2: - continue - title = re.sub(r"\s+", " ", m2.group("title")).strip("-_:: ") - page = self._to_int(m2.group("page"), None) - if not title or page is None or len(title) <= 1: - continue - if title.lower() in {"page", "pages", "目录", "contents"}: - continue - entries.append((title, page)) - - if not entries: - return {} - - dedup: OrderedDict[str, int] = OrderedDict() - for t, p in entries: - dedup.setdefault(t, p) - - titles = list(dedup.keys()) - pages = [dedup[t] for t in titles] - catalog: dict[str, dict[str, int]] = OrderedDict() - for i, t in enumerate(titles): - s = pages[i] - e = max(s, pages[i + 1] - 1) if i + 1 < len(pages) else s - catalog[t] = {"start": s, "end": e} - return catalog - - @staticmethod - def _guess_offset(chapters: dict, toc_end: int) -> int: - first_page = None - for info in chapters.values(): - s = info["start"] - if first_page is None or s < first_page: - first_page = s - if first_page is None: - return 0 - return (toc_end + 1) - first_page - - # ── per-page conversion ────────────────────────────────── - - def _page_to_markdown( - self, - doc: fitz.Document, - page: fitz.Page, - page_idx: int, - include_images: bool, - image_dpi: int, - max_image_bytes: int, - ) -> str: - parts: list[str] = [] - - # ── text ── - text = (page.get_text("text", sort=True) or "").strip() - if text: - parts.append(text) - - # ── tables → Markdown ── + def _parse_catalog(self, catalog_text: str) -> list[dict[str, Any]]: try: - for tab in (page.find_tables().tables or [])[:5]: - cells = tab.extract() or [] - if len(cells) >= 2: - md = self._cells_to_md_table(cells) - if md: - parts.append(md) + raw = json.loads(catalog_text) except Exception: - pass + return [] - if not include_images: - return "\n\n".join(parts) + if not isinstance(raw, list): + return [] - # ── embedded raster images ── + result: list[dict[str, Any]] = [] + for item in raw: + if not isinstance(item, dict): + continue + + title = str(item.get("title") or "").strip() or "Untitled" + start_index = self._to_int(item.get("page_start_index"), None) + end_index = self._to_int(item.get("page_end_index"), start_index) + + if start_index is None: + start = self._to_int(item.get("start"), None) + end = self._to_int(item.get("end"), start) + if start is None: + continue + start_index = max(0, start - 1) + end_index = max(start_index, (end if end is not None else start) - 1) + + if end_index is None: + end_index = start_index + + result.append( + { + "title": title, + "page_start_index": max(0, start_index), + "page_end_index": max(start_index, end_index), + } + ) + return result + + def _detect_headers_footers(self, doc: fitz.Document, num_pages: int) -> set[str]: + margin_ratio = 0.08 + sample_count = min(num_pages, 30) + text_counts: dict[str, int] = {} + + for idx in range(sample_count): + page = doc[idx] + page_height = page.rect.height + top_limit = page_height * margin_ratio + bottom_limit = page_height * (1 - margin_ratio) + try: + blocks = page.get_text("blocks", sort=True) or [] + except Exception: + continue + + seen: set[str] = set() + for block in blocks: + if len(block) < 7 or block[6] != 0: + continue + y0, y1 = block[1], block[3] + text = (block[4] or "").strip() + if not text or len(text) < 2 or text in seen: + continue + if y1 <= top_limit or y0 >= bottom_limit: + seen.add(text) + text_counts[text] = text_counts.get(text, 0) + 1 + + threshold = max(3, sample_count * 0.35) + return {text for text, count in text_counts.items() if count >= threshold} + + def _page_to_markdown(self, page: fitz.Page, hf_texts: set[str]) -> str: + parts: list[str] = [] + page_height = page.rect.height + top_margin = page_height * 0.06 + bottom_margin = page_height * 0.94 + + table_rects: list[fitz.Rect] = [] + table_mds: list[str] = [] try: - for img_idx, img_info in enumerate(page.get_images(full=True)): - xref = img_info[0] + find_tables = getattr(page, "find_tables", None) + tables = [] + if callable(find_tables): + table_finder = find_tables() + tables = getattr(table_finder, "tables", []) or [] + + for table in tables[:5]: try: - data = doc.extract_image(xref) - if not data or not data.get("image"): - continue - raw = data["image"] - if len(raw) > max_image_bytes: - continue - # skip tiny icons (< 20x20) - w = data.get("width", 0) - h = data.get("height", 0) - if w < 20 and h < 20: - continue - ext = data.get("ext", "png") - mime = "image/jpeg" if ext in ("jpg", "jpeg") else f"image/{ext}" - b64 = base64.b64encode(raw).decode("ascii") - parts.append( - f"![img-p{page_idx}-{img_idx}](data:{mime};base64,{b64})" - ) + table_rects.append(fitz.Rect(table.bbox)) except Exception: pass + + cells = table.extract() or [] + if len(cells) < 2: + continue + if hf_texts and len(cells) <= 3: + flat = " ".join(str(cell or "") for row in cells for cell in row) + if any(hf in flat for hf in hf_texts): + continue + + md_table = self._cells_to_md_table(cells) + if md_table: + table_mds.append(md_table) except Exception: pass - # ── vector drawings → render as PNG ── try: - drawings = page.get_drawings() - if len(drawings) >= 3: - valid_rects: list[fitz.Rect] = [] - for d in drawings: - r = d.get("rect") - if r: - try: - rect = fitz.Rect(r) - if rect.is_valid and not rect.is_empty: - valid_rects.append(rect) - except Exception: - pass - if valid_rects: - bbox = valid_rects[0] - for r in valid_rects[1:]: - bbox |= r - bbox &= page.rect - if bbox.width > 30 and bbox.height > 30: - scale = image_dpi / 72 - mat = fitz.Matrix(scale, scale) - pix = page.get_pixmap(matrix=mat, clip=bbox) - png = pix.tobytes("png") - if len(png) <= max_image_bytes: - b64 = base64.b64encode(png).decode("ascii") - parts.append( - f"![drawing-p{page_idx}](data:image/png;base64,{b64})" - ) + blocks = page.get_text("blocks", sort=True) or [] except Exception: - pass + blocks = [] + for block in blocks: + if len(block) < 7 or block[6] != 0: + continue + x0, y0, x1, y1 = block[:4] + text = (block[4] or "").strip() + if not text: + continue + + block_rect = fitz.Rect(x0, y0, x1, y1) + if any(self._rects_overlap(block_rect, table_rect) for table_rect in table_rects): + continue + if hf_texts and (y1 <= top_margin or y0 >= bottom_margin): + if any(hf in text for hf in hf_texts): + continue + if re.fullmatch(r"\s*\d{1,4}\s*", text): + continue + + parts.append(text) + + parts.extend(table_mds) return "\n\n".join(parts) - # ── assembly ───────────────────────────────────────────── - - def _assemble_by_chapters( - self, - chapters: dict[str, dict[str, int]], - page_mds: list[str], - offset: int, - num_pages: int, - ) -> str: + def _assemble_by_catalog(self, catalog: list[dict[str, Any]], page_mds: list[str], num_pages: int) -> str: parts: list[str] = [] - for name, info in chapters.items(): - s = info["start"] + offset - e = info["end"] + offset - s = max(0, min(s, num_pages - 1)) - e = max(s, min(e, num_pages - 1)) - ch: list[str] = [f"# {name}\n"] - for idx in range(s, e + 1): - if idx < len(page_mds) and page_mds[idx].strip(): - ch.append(page_mds[idx]) - parts.append("\n\n".join(ch)) - return "\n\n---\n\n".join(parts) + used_pages: set[int] = set() - # ── helpers ────────────────────────────────────────────── + for item in catalog: + start = max(0, min(int(item["page_start_index"]), num_pages - 1)) + end = max(start, min(int(item["page_end_index"]), num_pages - 1)) + + chapter_parts = [f"# {item['title']}\n"] + for idx in range(start, end + 1): + if idx < len(page_mds) and page_mds[idx].strip() and idx not in used_pages: + chapter_parts.append(page_mds[idx]) + used_pages.add(idx) + + if len(chapter_parts) > 1: + parts.append("\n\n".join(chapter_parts)) + + if parts: + return "\n\n---\n\n".join(parts) + return "\n\n---\n\n".join(m for m in page_mds if m.strip()) + + @staticmethod + def _rects_overlap(block_rect: fitz.Rect, table_rect: fitz.Rect) -> bool: + inter = block_rect & table_rect + if inter.is_empty: + return False + block_area = block_rect.width * block_rect.height + if block_area <= 0: + return False + return (inter.width * inter.height) / block_area >= 0.3 @staticmethod def _cells_to_md_table(cells: list) -> str: if not cells: return "" + header = cells[0] ncols = len(header) if ncols == 0: return "" - clean = lambda c: str(c or "").replace("|", "\\|").replace("\n", " ").strip() + + def clean(value: Any) -> str: + return str(value or "").replace("|", "\\|").replace("\n", " ").strip() + lines = [ - "| " + " | ".join(clean(c) for c in header) + " |", + "| " + " | ".join(clean(cell) for cell in header) + " |", "| " + " | ".join("---" for _ in range(ncols)) + " |", ] for row in cells[1:]: padded = list(row) + [""] * max(0, ncols - len(row)) - lines.append("| " + " | ".join(clean(c) for c in padded[:ncols]) + " |") + lines.append("| " + " | ".join(clean(cell) for cell in padded[:ncols]) + " |") return "\n".join(lines) @staticmethod @@ -320,16 +232,3 @@ class PdfToMarkdownTool(Tool): return int(value) except Exception: return default - - @staticmethod - def _to_bool(value: Any, default: bool) -> bool: - if value is None: - return default - if isinstance(value, bool): - return value - s = str(value).strip().lower() - if s in {"1", "true", "yes", "on"}: - return True - if s in {"0", "false", "no", "off"}: - return False - return default diff --git a/difyPlugin/pdf/tools/pdf_to_markdown.yaml b/difyPlugin/pdf/tools/pdf_to_markdown.yaml index 87505722..9a089a2b 100644 --- a/difyPlugin/pdf/tools/pdf_to_markdown.yaml +++ b/difyPlugin/pdf/tools/pdf_to_markdown.yaml @@ -1,68 +1,51 @@ -identity: +identity: name: "pdf_to_markdown" author: "yslg" label: en_US: "PDF to Markdown" - zh_Hans: "PDF转Markdown" + zh_Hans: "PDF to Markdown" pt_BR: "PDF para Markdown" - ja_JP: "PDFからMarkdown" + ja_JP: "PDF to Markdown" description: human: - en_US: "Convert PDF to a single Markdown file with embedded base64 images. No LLM needed." - zh_Hans: "将PDF转换为单个Markdown文件,图片以base64嵌入,无需大模型" - pt_BR: "Converter PDF em um arquivo Markdown com imagens base64 incorporadas. Sem LLM." - ja_JP: "PDFをbase64画像埋め込みの単一Markdownファイルに変換。LLM不要。" - llm: "Convert a PDF file into a single Markdown (.md) file. Extracts text, tables, images (base64), and vector drawings. Auto-detects TOC and organizes by chapters. No LLM needed." + en_US: "Convert PDF to Markdown using a catalog array. Images and graphics are ignored." + zh_Hans: "Convert PDF to Markdown using a catalog array. Images and graphics are ignored." + pt_BR: "Convert PDF to Markdown using a catalog array. Images and graphics are ignored." + ja_JP: "Convert PDF to Markdown using a catalog array. Images and graphics are ignored." + llm: "Convert a PDF file into Markdown using a catalog JSON array. Ignore images and graphics." parameters: - name: file type: file required: true label: en_US: PDF File - zh_Hans: PDF 文件 - pt_BR: Arquivo PDF - ja_JP: PDFファイル + zh_Hans: PDF File + pt_BR: PDF File + ja_JP: PDF File human_description: en_US: "PDF file to convert" - zh_Hans: "要转换的 PDF 文件" - pt_BR: "Arquivo PDF para converter" - ja_JP: "変換するPDFファイル" + zh_Hans: "PDF file to convert" + pt_BR: "PDF file to convert" + ja_JP: "PDF file to convert" llm_description: "PDF file to convert to Markdown" form: llm fileTypes: - "pdf" - - name: include_images - type: boolean - required: false + - name: catalog + type: string + required: true label: - en_US: Include Images - zh_Hans: 包含图片 - pt_BR: Incluir Imagens - ja_JP: 画像を含める + en_US: Catalog JSON + zh_Hans: Catalog JSON + pt_BR: Catalog JSON + ja_JP: Catalog JSON human_description: - en_US: "Whether to embed images as base64 in the Markdown output (default: true)" - zh_Hans: "是否将图片以base64嵌入Markdown输出(默认:是)" - pt_BR: "Se deve incorporar imagens como base64 na saída Markdown (padrão: verdadeiro)" - ja_JP: "Markdown出力にbase64として画像を埋め込むかどうか(デフォルト:はい)" - llm_description: "Set to true to embed images as base64, false to skip images" - form: form - default: true - - name: image_dpi - type: number - required: false - label: - en_US: Image DPI - zh_Hans: 图片DPI - pt_BR: DPI da Imagem - ja_JP: 画像DPI - human_description: - en_US: "DPI for rendering vector drawings (72-300, default: 150)" - zh_Hans: "矢量图渲染DPI(72-300,默认150)" - pt_BR: "DPI para renderizar desenhos vetoriais (72-300, padrão: 150)" - ja_JP: "ベクター描画のレンダリングDPI(72-300、デフォルト:150)" - llm_description: "Resolution for rendering vector drawings as images. Range 72-300, default 150." - form: form - default: 150 + en_US: "Catalog JSON array like [{title,start,end,page_start_index,page_end_index}]" + zh_Hans: "Catalog JSON array like [{title,start,end,page_start_index,page_end_index}]" + pt_BR: "Catalog JSON array like [{title,start,end,page_start_index,page_end_index}]" + ja_JP: "Catalog JSON array like [{title,start,end,page_start_index,page_end_index}]" + llm_description: "Catalog JSON array returned by pdf_toc" + form: llm extra: python: source: tools/pdf_to_markdown.py diff --git a/difyPlugin/pdf/tools/pdf_toc.py b/difyPlugin/pdf/tools/pdf_toc.py index a96b86b1..12c1caf6 100644 --- a/difyPlugin/pdf/tools/pdf_toc.py +++ b/difyPlugin/pdf/tools/pdf_toc.py @@ -4,264 +4,303 @@ from collections import OrderedDict from collections.abc import Generator from typing import Any +import fitz from dify_plugin import Tool from dify_plugin.entities.model.llm import LLMModelConfig from dify_plugin.entities.model.message import SystemPromptMessage, UserPromptMessage from dify_plugin.entities.tool import ToolInvokeMessage -_SYSTEM_PROMPT = """You parse PDF table-of-contents text. -Return only valid JSON object, no markdown fences, no explanation. -Output schema: -{ - "Chapter Name": {"start": 1, "end": 5}, - "Another": {"start": 6, "end": 20} -} -Rules: -- start/end are integer printed page numbers from TOC. -- If end is unknown, use same value as start. -- Keep chapter names exactly as in TOC text. -""" +_TOC_SYSTEM_PROMPT = """你是专业的PDF目录解析助手。请从以下PDF文本中提取文档的目录/章节结构。 + +要求: +1. 识别所有一级和二级标题及其对应的页码 +2. 只返回纯JSON数组,不要markdown代码块,不要任何解释 +3. 格式: [{"title": "章节标题", "page": 页码数字}] +4. 页码必须是文档中标注的实际页码数字 +5. 如果无法识别目录,返回空数组 []""" class PdfTocTool(Tool): + _TOC_PATTERNS = [ + r"目录", + r"目\s*录", + r"目\u3000录", + r"Table of Contents", + r"Contents", + r"目次", + ] + def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]: - toc_start = self._to_int(tool_parameters.get("toc_start"), None) - toc_end = self._to_int(tool_parameters.get("toc_end"), None) - toc_pages = (tool_parameters.get("toc_pages") or "").strip() + file = tool_parameters.get("file") + if not file: + yield self.create_text_message("Error: file is required") + return + model_config = tool_parameters.get("model") - if toc_start is None or toc_end is None: - yield self.create_text_message("Error: toc_start and toc_end are required") - return + doc = fitz.open(stream=file.blob, filetype="pdf") + try: + num_pages = len(doc) - if not toc_pages: - yield self.create_text_message("Error: toc_pages text is empty") - return + # 1) 优先从PDF元数据提取目录 + catalog = self._catalog_from_metadata(doc.get_toc(), num_pages) - cleaned = self._strip_index_lists(toc_pages) + # 2) 元数据无目录时,使用LLM解析 + if not catalog and model_config: + catalog = self._extract_toc_with_llm(doc, num_pages, model_config) - # 1) deterministic parser first - catalog = self._parse_toc_lines(cleaned) + # 3) 无LLM配置时回退到正则解析 + if not catalog: + toc_start, toc_end = self._find_toc_pages(doc, num_pages) + if toc_start is not None and toc_end is not None: + toc_text = "\n".join( + doc[index].get_text() or "" for index in range(toc_start, toc_end + 1) + ) + printed_catalog = self._parse_toc_lines(toc_text) + catalog = self._attach_page_indexes(printed_catalog, toc_end, num_pages) - # 2) optional LLM fallback/enhance only when deterministic parser gives no result - llm_raw_output = "" - llm_error = None - if not catalog and model_config: - llm_catalog, llm_raw_output, llm_error = self._parse_with_llm( - toc_start=toc_start, - toc_end=toc_end, - toc_pages=cleaned, - model_config=model_config, + if not catalog: + catalog = [] + + yield self.create_text_message(json.dumps(catalog, ensure_ascii=False)) + finally: + doc.close() + + def _extract_toc_with_llm( + self, doc: fitz.Document, num_pages: int, model_config: dict[str, Any] + ) -> list[dict[str, int | str]]: + # 先尝试定位目录页 + toc_start, toc_end = self._find_toc_pages(doc, num_pages) + + if toc_start is not None and toc_end is not None: + # 有目录页,提取目录页文本 + toc_text = "\n".join( + doc[index].get_text() or "" for index in range(toc_start, toc_end + 1) ) - if llm_catalog: - catalog = self._normalize_catalog(llm_catalog) + content_offset = toc_end + else: + # 无目录页,提取前15页文本让LLM识别章节结构 + sample = min(num_pages, 15) + toc_text = "\n\n--- 第{}页 ---\n".join( + [""] + [doc[i].get_text() or "" for i in range(sample)] + ) + toc_text = toc_text.strip() + if not toc_text: + return [] + content_offset = 0 - result: dict[str, Any] = { - "toc_start": toc_start, - "toc_end": toc_end, - "catalog": catalog, - "meta": { - "catalog_size": len(catalog), - "parser": "rule" if catalog else "none", - }, - } + # 截断过长文本 + if len(toc_text) > 15000: + toc_text = toc_text[:15000] + "\n...[截断]" - if llm_raw_output: - result["meta"]["llm_used"] = True - if llm_error: - result["meta"]["llm_error"] = llm_error + try: + response = self.session.model.llm.invoke( + model_config=LLMModelConfig(**model_config), + prompt_messages=[ + SystemPromptMessage(content=_TOC_SYSTEM_PROMPT), + UserPromptMessage(content=toc_text), + ], + stream=False, + ) - # always return valid json text payload for downstream json.loads - yield self.create_text_message(json.dumps(result, ensure_ascii=False)) - yield self.create_json_message(result) + llm_text = self._get_response_text(response) + if not llm_text: + return [] - def _parse_with_llm( - self, - toc_start: int, - toc_end: int, - toc_pages: str, - model_config: dict[str, Any], - ) -> tuple[dict[str, Any] | None, str, str | None]: - user_content = ( - f"TOC page index range: {toc_start}..{toc_end}\n\n" - f"TOC raw text:\n{toc_pages}" - ) - response = self.session.model.llm.invoke( - model_config=LLMModelConfig(**model_config), - prompt_messages=[ - SystemPromptMessage(content=_SYSTEM_PROMPT), - UserPromptMessage(content=user_content), - ], - stream=False, - ) + raw_catalog = self._parse_llm_json(llm_text) + if not raw_catalog: + return [] - llm_text = "" - if hasattr(response, "message") and response.message: - content = response.message.content - if isinstance(content, str): - llm_text = content - elif isinstance(content, list): - llm_text = "".join( - item.data if hasattr(item, "data") else str(item) for item in content - ) + # 转换LLM返回的简单格式为完整catalog + return self._build_catalog_from_llm(raw_catalog, content_offset, num_pages) + except Exception: + return [] - parsed = self._extract_json_object(llm_text) - if parsed is None: - return None, llm_text, "Failed to parse LLM output as JSON" - if not isinstance(parsed, dict): - return None, llm_text, "LLM output JSON is not an object" + def _build_catalog_from_llm( + self, raw: list[dict], content_offset: int, num_pages: int + ) -> list[dict[str, int | str]]: + entries: list[tuple[str, int]] = [] + for item in raw: + title = str(item.get("title") or "").strip() + page = self._to_int(item.get("page"), None) + if not title or page is None: + continue + entries.append((title, page)) - return parsed, llm_text, None + if not entries: + return [] + + # 计算偏移量:第一个条目的页码与实际内容起始页的差值 + first_printed_page = entries[0][1] + offset = (content_offset + 1) - first_printed_page if content_offset > 0 else 0 + + result: list[dict[str, int | str]] = [] + for i, (title, page) in enumerate(entries): + next_page = entries[i + 1][1] if i + 1 < len(entries) else page + page_start_index = max(0, min(page + offset - 1, num_pages - 1)) + page_end_index = max(page_start_index, min(next_page + offset - 2, num_pages - 1)) + if i == len(entries) - 1: + page_end_index = num_pages - 1 + + result.append({ + "title": title, + "start": page, + "end": max(page, next_page - 1) if i + 1 < len(entries) else page, + "page_start_index": page_start_index, + "page_end_index": page_end_index, + }) + + return result @staticmethod - def _strip_index_lists(text: str) -> str: - # Stop before common appendix lists that pollute TOC parsing. - pattern = re.compile( + def _get_response_text(response: Any) -> str: + if not hasattr(response, "message") or not response.message: + return "" + content = response.message.content + if isinstance(content, str): + text = content + elif isinstance(content, list): + text = "".join( + item.data if hasattr(item, "data") else str(item) for item in content + ) + else: + text = str(content) + + # 清理思考标签 + text = re.sub(r"<think>[\s\S]*?</think>", "", text, flags=re.IGNORECASE) + text = re.sub(r"<\|[^>]+\|>", "", text) + return text.strip() + + @staticmethod + def _parse_llm_json(text: str) -> list[dict]: + # 尝试提取JSON代码块 + code_match = re.search(r"```(?:json)?\s*([\s\S]*?)```", text) + if code_match: + text = code_match.group(1).strip() + + # 尝试找到JSON数组 + bracket_match = re.search(r"\[[\s\S]*\]", text) + if bracket_match: + text = bracket_match.group(0) + + try: + result = json.loads(text) + if isinstance(result, list): + return result + except Exception: + pass + return [] + + def _catalog_from_metadata(self, toc: list, num_pages: int) -> list[dict[str, int | str]]: + top = [(title, max(0, page - 1)) for level, title, page in toc if level <= 2 and page >= 1] + if not top: + return [] + + result: list[dict[str, int | str]] = [] + for index, (title, start_index) in enumerate(top): + end_index = top[index + 1][1] - 1 if index + 1 < len(top) else num_pages - 1 + result.append({ + "title": title, + "start": start_index + 1, + "end": max(start_index, end_index) + 1, + "page_start_index": start_index, + "page_end_index": max(start_index, end_index), + }) + return result + + def _find_toc_pages(self, doc: fitz.Document, num_pages: int) -> tuple[int | None, int | None]: + toc_start = None + toc_end = None + for page_number in range(min(num_pages, 30)): + text = doc[page_number].get_text() or "" + if any(re.search(pattern, text, re.IGNORECASE) for pattern in self._TOC_PATTERNS): + if toc_start is None: + toc_start = page_number + toc_end = page_number + elif toc_start is not None: + break + return toc_start, toc_end + + def _parse_toc_lines(self, text: str) -> list[dict[str, int | str]]: + marker = re.search( r"^(List\s+of\s+Figures|List\s+of\s+Tables|图目录|表目录)", + text, re.IGNORECASE | re.MULTILINE, ) - m = pattern.search(text) - return text[: m.start()].rstrip() if m else text - - def _parse_toc_lines(self, text: str) -> dict[str, dict[str, int]]: - """Parse lines like: - 1.2 Engine Overview ........ 35 - Appendix A 120 - """ - line_pattern = re.compile( - r"^\s*(?P<title>.+?)\s*(?:\.{2,}|\s)\s*(?P<page>\d{1,5})\s*$" - ) + if marker: + text = text[: marker.start()] + pattern = re.compile(r"^\s*(?P<title>.+?)\s*(?:\.{2,}|\s)\s*(?P<page>\d{1,5})\s*$") entries: list[tuple[str, int]] = [] for raw in text.splitlines(): line = raw.strip() - if not line or len(line) < 3: - continue - if re.fullmatch(r"\d+", line): + if not line or len(line) < 3 or re.fullmatch(r"\d+", line): continue - m = line_pattern.match(line) - if not m: + match = pattern.match(line) + if not match: continue - title = re.sub(r"\s+", " ", m.group("title")).strip("-_:: ") - page = self._to_int(m.group("page"), None) - if not title or page is None: + title = re.sub(r"\s+", " ", match.group("title")).strip("-_::") + page = self._to_int(match.group("page"), None) + if not title or page is None or len(title) <= 1: continue - - # Skip obvious noise. - if len(title) <= 1 or title.lower() in {"page", "pages", "目录", "contents"}: + if title.lower() in {"page", "pages", "目录", "contents"}: continue entries.append((title, page)) if not entries: - return {} + return [] - # Deduplicate keeping earliest appearance. dedup: OrderedDict[str, int] = OrderedDict() for title, page in entries: - if title not in dedup: - dedup[title] = page + dedup.setdefault(title, page) titles = list(dedup.keys()) - pages = [dedup[t] for t in titles] + pages = [dedup[title] for title in titles] + result: list[dict[str, int | str]] = [] + for index, title in enumerate(titles): + start = pages[index] + end = max(start, pages[index + 1] - 1) if index + 1 < len(pages) else start + result.append({"title": title, "start": start, "end": end}) + return result - catalog: dict[str, dict[str, int]] = {} - for i, title in enumerate(titles): - start = pages[i] - if i + 1 < len(pages): - next_start = pages[i + 1] - end = max(start, next_start - 1) - else: - end = start - catalog[title] = {"start": int(start), "end": int(end)} + def _attach_page_indexes( + self, catalog: list[dict[str, int | str]], toc_end: int, num_pages: int + ) -> list[dict[str, int | str]]: + if not catalog: + return [] - return catalog + first_page = None + for item in catalog: + start = self._to_int(item.get("start"), None) + if start is not None and (first_page is None or start < first_page): + first_page = start - def _normalize_catalog(self, raw: dict[str, Any]) -> dict[str, dict[str, int]]: - catalog: dict[str, dict[str, int]] = {} - source = raw.get("catalog") if isinstance(raw.get("catalog"), dict) else raw - if not isinstance(source, dict): - return catalog + if first_page is None: + return [] - for name, value in source.items(): - if not isinstance(name, str) or not isinstance(value, dict): - continue - start = self._to_int(value.get("start"), None) - end = self._to_int(value.get("end"), start) + offset = (toc_end + 1) - first_page + result: list[dict[str, int | str]] = [] + for item in catalog: + start = self._to_int(item.get("start"), None) + end = self._to_int(item.get("end"), start) if start is None: continue if end is None: end = start - catalog[name] = {"start": int(start), "end": int(max(start, end))} - return catalog - @staticmethod - def _extract_json_object(text: str) -> Any: - if not text: - return None - - candidates: list[str] = [] - - code_blocks = re.findall(r"```(?:json)?\s*([\s\S]*?)\s*```", text, flags=re.IGNORECASE) - candidates.extend([c.strip() for c in code_blocks if c.strip()]) - - brace_candidate = PdfTocTool._extract_first_brace_object(text) - if brace_candidate: - candidates.append(brace_candidate) - - candidates.append(text.strip()) - - for cand in candidates: - parsed = PdfTocTool._json_try_parse(cand) - if parsed is not None: - return parsed - return None - - @staticmethod - def _extract_first_brace_object(text: str) -> str | None: - start = text.find("{") - if start < 0: - return None - - depth = 0 - in_str = False - escape = False - for i in range(start, len(text)): - ch = text[i] - if in_str: - if escape: - escape = False - elif ch == "\\": - escape = True - elif ch == '"': - in_str = False - continue - - if ch == '"': - in_str = True - elif ch == "{": - depth += 1 - elif ch == "}": - depth -= 1 - if depth == 0: - return text[start : i + 1] - return None - - @staticmethod - def _json_try_parse(text: str) -> Any: - try: - return json.loads(text) - except Exception: - pass - - # Minimal repair: remove trailing commas before } or ] - repaired = re.sub(r",\s*([}\]])", r"\1", text) - try: - return json.loads(repaired) - except Exception: - return None + page_start_index = max(0, min(start + offset, num_pages - 1)) + page_end_index = max(page_start_index, min(end + offset, num_pages - 1)) + result.append({ + "title": str(item.get("title") or "Untitled"), + "start": start, + "end": max(start, end), + "page_start_index": page_start_index, + "page_end_index": page_end_index, + }) + return result @staticmethod def _to_int(value: Any, default: int | None) -> int | None: diff --git a/difyPlugin/pdf/tools/pdf_toc.yaml b/difyPlugin/pdf/tools/pdf_toc.yaml index d938c681..0916a700 100644 --- a/difyPlugin/pdf/tools/pdf_toc.yaml +++ b/difyPlugin/pdf/tools/pdf_toc.yaml @@ -2,63 +2,35 @@ identity: name: "pdf_toc" author: "yslg" label: - en_US: "PDF TOC Parser" - zh_Hans: "PDF目录解析" - pt_BR: "Analisador de Sumário PDF" - ja_JP: "PDF目次解析" + en_US: "PDF TOC" + zh_Hans: "PDF 目录提取" + pt_BR: "PDF TOC" + ja_JP: "PDF TOC" description: human: - en_US: "Parse PDF table-of-contents text (from pdf_column_range) into structured JSON catalog via LLM" - zh_Hans: "通过LLM将PDF目录文本(来自目录页提取工具的输出)解析为结构化JSON目录" - pt_BR: "Analisar texto do sumário PDF em catálogo JSON estruturado via LLM" - ja_JP: "LLMを使用してPDF目次テキストを構造化JSONカタログに解析" - llm: "Parse PDF table-of-contents text into structured JSON with chapter names and page ranges. Input is the output of pdf_column_range tool (start/end/pages)." + en_US: "Extract the catalog array from a PDF file using metadata or LLM." + zh_Hans: "从PDF文件中提取目录数组,优先使用元数据,回退使用LLM解析。" + pt_BR: "Extrair o array de catálogo de um arquivo PDF." + ja_JP: "PDFファイルからカタログ配列を抽出する。" + llm: "Extract a catalog array from a PDF file. Returns JSON text like [{title,start,end,page_start_index,page_end_index}]." parameters: - - name: toc_start - type: number + - name: file + type: file required: true label: - en_US: TOC Start Page - zh_Hans: 目录起始页 - pt_BR: Página Inicial do Sumário - ja_JP: 目次開始ページ + en_US: PDF File + zh_Hans: PDF 文件 + pt_BR: PDF File + ja_JP: PDF File human_description: - en_US: "Start page index of TOC (from pdf_column_range output)" - zh_Hans: "目录起始页码(来自目录页提取工具输出的 start)" - pt_BR: "Índice da página inicial do sumário" - ja_JP: "目次の開始ページ番号" - llm_description: "Start page index of TOC section, from pdf_column_range output field 'start'" - form: llm - - name: toc_end - type: number - required: true - label: - en_US: TOC End Page - zh_Hans: 目录结束页 - pt_BR: Página Final do Sumário - ja_JP: 目次終了ページ - human_description: - en_US: "End page index of TOC (from pdf_column_range output)" - zh_Hans: "目录结束页码(来自目录页提取工具输出的 end)" - pt_BR: "Índice da página final do sumário" - ja_JP: "目次の終了ページ番号" - llm_description: "End page index of TOC section, from pdf_column_range output field 'end'" - form: llm - - name: toc_pages - type: string - required: true - label: - en_US: TOC Page Text - zh_Hans: 目录页文本 - pt_BR: Texto das Páginas do Sumário - ja_JP: 目次ページテキスト - human_description: - en_US: "Raw text content of TOC pages (from pdf_column_range output 'pages' array, joined)" - zh_Hans: "目录页原始文本内容(来自目录页提取工具输出的 pages 数组)" - pt_BR: "Conteúdo de texto bruto das páginas do sumário" - ja_JP: "目次ページの生テキスト内容" - llm_description: "Raw text content extracted from TOC pages, from pdf_column_range output field 'pages'" + en_US: "PDF file to inspect" + zh_Hans: "要解析的PDF文件" + pt_BR: "PDF file to inspect" + ja_JP: "PDF file to inspect" + llm_description: "PDF file to extract catalog from" form: llm + fileTypes: + - "pdf" - name: model type: model-selector scope: llm @@ -69,10 +41,10 @@ parameters: pt_BR: Modelo LLM ja_JP: LLMモデル human_description: - en_US: "LLM model for parsing TOC into structured JSON" - zh_Hans: "用于解析目录的 LLM 模型" - pt_BR: "Modelo LLM para análise do sumário" - ja_JP: "目次解析用のLLMモデル" + en_US: "LLM model used for parsing TOC when metadata is unavailable" + zh_Hans: "当元数据不可用时,用于解析目录的LLM模型" + pt_BR: "Modelo LLM para análise de TOC" + ja_JP: "メタデータが利用できない場合のTOC解析用LLMモデル" form: form extra: python: diff --git a/difyPlugin/数据清洗-大文件处理.yml.bak b/difyPlugin/数据清洗-大文件处理.yml.bak new file mode 100644 index 00000000..70ca4830 --- /dev/null +++ b/difyPlugin/数据清洗-大文件处理.yml.bak @@ -0,0 +1,1000 @@ +app: + description: 优化版:支持大文件PDF处理,跨页表格/段落智能识别合并 + icon: 🤖 + icon_background: '#FFEAD5' + mode: workflow + name: 数据清洗-大文件处理 + use_icon_as_answer_icon: false +dependencies: +- current_identifier: null + type: marketplace + value: + marketplace_plugin_unique_identifier: samanhappy/word_process:0.0.1@003ecc76645cf2d5160d4e009a29d8eba2946eaaf7134c49971c3b9fedbfab0d + version: null +- current_identifier: null + type: marketplace + value: + marketplace_plugin_unique_identifier: langgenius/siliconflow:0.0.44@9dac23fe837d6da24a2cd9ef959c1c93e4e094b7562ad8a2fd3d4cc86c0e3e89 + version: null +- current_identifier: null + type: marketplace + value: + marketplace_plugin_unique_identifier: bowenliang123/md_exporter:3.6.9@3f027d63e80b44d5d5a9f706871afaef37905b8f8a89a2d152dc530211a8acb1 + version: null +- current_identifier: null + type: package + value: + plugin_unique_identifier: yslg/pdf:0.0.1@5e83b87d38ad55c2a1e929311d21a86cef5f9e04394b977b3ba16eb34de08b36 + version: null +kind: app +version: 0.5.0 +workflow: + conversation_variables: [] + environment_variables: [] + features: + file_upload: + allowed_file_extensions: + - .JPG + - .JPEG + - .PNG + - .GIF + - .WEBP + - .SVG + - .PDF + - .pdf + allowed_file_types: + - image + - document + allowed_file_upload_methods: + - local_file + - remote_url + enabled: false + fileUploadConfig: + audio_file_size_limit: 50 + batch_count_limit: 5 + file_size_limit: 500 + image_file_batch_limit: 10 + image_file_size_limit: 10 + single_chunk_attachment_limit: 10 + video_file_size_limit: 100 + workflow_file_upload_limit: 10 + image: + enabled: false + number_limits: 3 + transfer_methods: + - local_file + - remote_url + number_limits: 3 + opening_statement: '' + retriever_resource: + enabled: true + sensitive_word_avoidance: + enabled: false + speech_to_text: + enabled: false + suggested_questions: [] + suggested_questions_after_answer: + enabled: false + text_to_speech: + enabled: false + language: '' + voice: '' + graph: + edges: + - data: + isInIteration: false + isInLoop: false + sourceType: start + targetType: if-else + id: 1770703294598-source-1770703342256-target + selected: false + source: '1770703294598' + sourceHandle: source + target: '1770703342256' + targetHandle: target + type: custom + zIndex: 0 + - data: + isInIteration: false + isInLoop: false + sourceType: if-else + targetType: llm + id: 1770703342256-true-1770703393190-target + selected: false + source: '1770703342256' + sourceHandle: 'true' + target: '1770703393190' + targetHandle: target + type: custom + zIndex: 0 + - data: + isInIteration: false + isInLoop: false + sourceType: if-else + targetType: llm + id: 1770703342256-93d5294c-5984-4bc0-b30d-cd9e2ffba28d-1770703524412-target + selected: false + source: '1770703342256' + sourceHandle: 93d5294c-5984-4bc0-b30d-cd9e2ffba28d + target: '1770703524412' + targetHandle: target + type: custom + zIndex: 0 + - data: + isInIteration: false + isInLoop: false + sourceType: llm + targetType: variable-aggregator + id: 1770703393190-source-1770703625287-target + selected: false + source: '1770703393190' + sourceHandle: source + target: '1770703625287' + targetHandle: target + type: custom + zIndex: 0 + - data: + isInLoop: false + sourceType: llm + targetType: variable-aggregator + id: 1770703524412-source-1770703625287-target + selected: false + source: '1770703524412' + sourceHandle: source + target: '1770703625287' + targetHandle: target + type: custom + zIndex: 0 + - data: + isInIteration: false + isInLoop: false + sourceType: if-else + targetType: if-else + id: 1770703342256-6556b05e-3266-4aa7-b196-ec41f5dd766b-1772348592076-target + selected: false + source: '1770703342256' + sourceHandle: 6556b05e-3266-4aa7-b196-ec41f5dd766b + target: '1772348592076' + targetHandle: target + type: custom + zIndex: 0 + - data: + isInLoop: false + sourceType: if-else + targetType: document-extractor + id: 1772348592076-false-1770703633813-target + selected: false + source: '1772348592076' + sourceHandle: 'false' + target: '1770703633813' + targetHandle: target + type: custom + zIndex: 0 + - data: + isInIteration: false + isInLoop: false + sourceType: if-else + targetType: tool + id: 1772348592076-0b4fd2d4-a592-4421-acbb-822db3004219-1772349027446-target + selected: false + source: '1772348592076' + sourceHandle: 0b4fd2d4-a592-4421-acbb-822db3004219 + target: '1772349027446' + targetHandle: target + type: custom + zIndex: 0 + - data: + isInIteration: false + isInLoop: false + sourceType: document-extractor + targetType: variable-aggregator + id: 1770703633813-source-1772348969241-target + selected: false + source: '1770703633813' + sourceHandle: source + target: '1772348969241' + targetHandle: target + type: custom + zIndex: 0 + - data: + isInLoop: false + sourceType: tool + targetType: variable-aggregator + id: 1772349027446-source-1772348969241-target + selected: false + source: '1772349027446' + sourceHandle: source + target: '1772348969241' + targetHandle: target + type: custom + zIndex: 0 + - data: + isInIteration: false + isInLoop: false + sourceType: variable-aggregator + targetType: llm + id: 1770703625287-source-1770703671732-target + selected: false + source: '1770703625287' + sourceHandle: source + target: '1770703671732' + targetHandle: target + type: custom + zIndex: 0 + - data: + isInIteration: false + isInLoop: false + sourceType: llm + targetType: tool + id: 1770703671732-source-1770704285657-target + selected: false + source: '1770703671732' + sourceHandle: source + target: '1770704285657' + targetHandle: target + type: custom + zIndex: 0 + - data: + isInIteration: false + isInLoop: false + sourceType: if-else + targetType: tool + id: 1772348592076-true-1772527425324-target + selected: false + source: '1772348592076' + sourceHandle: 'true' + target: '1772527425324' + targetHandle: target + type: custom + zIndex: 0 + - data: + isInLoop: false + sourceType: variable-aggregator + targetType: variable-aggregator + id: 1772348969241-source-1770703625287-target + source: '1772348969241' + sourceHandle: source + target: '1770703625287' + targetHandle: target + type: custom + zIndex: 0 + - data: + isInLoop: false + sourceType: tool + targetType: end + id: 1770704285657-source-1770704288628-target + source: '1770704285657' + sourceHandle: source + target: '1770704288628' + targetHandle: target + type: custom + zIndex: 0 + - data: + isInIteration: false + isInLoop: false + sourceType: tool + targetType: end + id: 1772527425324-source-1772779766541-target + source: '1772527425324' + sourceHandle: source + target: '1772779766541' + targetHandle: target + type: custom + zIndex: 0 + nodes: + - data: + selected: false + title: 用户输入 + type: start + variables: + - allowed_file_extensions: [] + allowed_file_types: + - image + - document + - video + allowed_file_upload_methods: + - local_file + - remote_url + default: '' + hint: '' + label: 文件 + max_length: 48 + options: [] + placeholder: '' + required: true + type: file + variable: file + height: 109 + id: '1770703294598' + position: + x: 0 + y: 55 + positionAbsolute: + x: 0 + y: 55 + selected: false + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + cases: + - case_id: 'true' + conditions: + - comparison_operator: in + id: f88f279e-5736-4b1b-98cf-f8a9621531a0 + value: + - image + varType: file + variable_selector: + - '1770703294598' + - file + - type + id: 'true' + logical_operator: and + - case_id: 93d5294c-5984-4bc0-b30d-cd9e2ffba28d + conditions: + - comparison_operator: in + id: 48e8d32a-59c5-4573-8e8a-355dc73a39fc + value: + - video + varType: file + variable_selector: + - '1770703294598' + - file + - type + id: 93d5294c-5984-4bc0-b30d-cd9e2ffba28d + logical_operator: and + - case_id: 6556b05e-3266-4aa7-b196-ec41f5dd766b + conditions: + - comparison_operator: in + id: 9916110c-edf7-4a4a-b324-2f8d85c73299 + value: + - document + varType: file + variable_selector: + - '1770703294598' + - file + - type + id: 6556b05e-3266-4aa7-b196-ec41f5dd766b + logical_operator: and + selected: false + title: 条件分支 + type: if-else + height: 220 + id: '1770703342256' + position: + x: 342 + y: 0 + positionAbsolute: + x: 342 + y: 0 + selected: false + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + context: + enabled: false + variable_selector: [] + model: + completion_params: + enable_thinking: true + temperature: 0.7 + mode: chat + name: zai-org/GLM-4.6V + provider: langgenius/siliconflow/siliconflow + prompt_template: + - id: 4b1706f6-3216-4fb7-a6dc-978ce43ff491 + role: system + text: 识别图片中所有内容和文字,并进行合理的描述编排 + reasoning_format: separated + selected: false + title: 图片理解 + type: llm + vision: + configs: + detail: high + variable_selector: + - '1770703294598' + - file + enabled: true + height: 88 + id: '1770703393190' + position: + x: 2772 + y: 82 + positionAbsolute: + x: 2772 + y: 82 + selected: false + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + context: + enabled: false + variable_selector: [] + model: + completion_params: {} + mode: chat + name: Pro/moonshotai/Kimi-K2.5 + provider: langgenius/siliconflow/siliconflow + prompt_template: + - id: 497bebc3-5e75-4c2b-940c-ba485dc1e51a + role: system + text: 识别视频中所有内容和文字,并进行合理的描述编排 + reasoning_format: separated + selected: false + title: 视频理解 + type: llm + vision: + configs: + detail: high + variable_selector: + - '1770703294598' + - file + enabled: true + height: 88 + id: '1770703524412' + position: + x: 1770 + y: 177 + positionAbsolute: + x: 1770 + y: 177 + selected: false + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + cases: + - case_id: 'true' + conditions: + - comparison_operator: contains + id: 7a6d2b1e-9704-41f3-aeba-40c6e2484d56 + value: pdf + varType: string + variable_selector: + - '1770703294598' + - file + - extension + id: 'true' + logical_operator: and + - case_id: 0b4fd2d4-a592-4421-acbb-822db3004219 + conditions: + - comparison_operator: contains + id: 67767b34-ad03-48f4-80ef-100eb78e13ab + value: doc + varType: file + variable_selector: + - '1770703294598' + - file + - extension + logical_operator: and + selected: false + title: 条件分支 2 + type: if-else + height: 172 + id: '1772348592076' + position: + x: 704 + y: 424 + positionAbsolute: + x: 704 + y: 424 + selected: false + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + is_array_file: false + selected: false + title: 文档提取器 + type: document-extractor + variable_selector: + - '1770703294598' + - file + height: 104 + id: '1770703633813' + position: + x: 1066 + y: 337 + positionAbsolute: + x: 1066 + y: 337 + selected: false + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + is_team_authorization: true + paramSchemas: + - auto_generate: null + default: null + form: llm + human_description: + en_US: Word file to extract text and images from + ja_JP: Word file to extract text and images from + pt_BR: Word file to extract text and images from + zh_Hans: 要提取文本和图片的Word文件 + label: + en_US: Word Content + ja_JP: Word Content + pt_BR: Word Content + zh_Hans: Word 内容 + llm_description: Word file content to be extracted + max: null + min: null + name: word_content + options: [] + placeholder: null + precision: null + required: true + scope: null + template: null + type: file + params: + word_content: '' + plugin_id: samanhappy/word_process + plugin_unique_identifier: samanhappy/word_process:0.0.1@003ecc76645cf2d5160d4e009a29d8eba2946eaaf7134c49971c3b9fedbfab0d + provider_icon: https://dify.org.xyzh.yslg/console/api/workspaces/current/plugin/icon?tenant_id=fe3bcf55-9a04-4850-8473-7f97e1c09b97&filename=cb0643689e2f8152d38c44a267a459fae99ff208b0bc164e27ccb053fc1844cd.svg + provider_id: samanhappy/word_process/word_process + provider_name: samanhappy/word_process/word_process + provider_type: builtin + selected: false + title: Word提取器 + tool_configurations: {} + tool_description: 一个将Word文件提取为文本和图片的工具 + tool_label: Word提取器 + tool_name: word_extractor + tool_node_version: '2' + tool_parameters: + word_content: + type: variable + value: + - '1770703294598' + - file + type: tool + height: 52 + id: '1772349027446' + position: + x: 1066 + y: 521 + positionAbsolute: + x: 1066 + y: 521 + selected: false + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + output_type: string + selected: false + title: 文档提取聚合 + type: variable-aggregator + variables: + - - '1772349027446' + - text + - - '1770703633813' + - text + height: 134 + id: '1772348969241' + position: + x: 1428 + y: 344 + positionAbsolute: + x: 1428 + y: 344 + selected: false + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + advanced_settings: + group_enabled: false + groups: + - groupId: 058efed3-3c6a-44d6-8f40-704abda8c413 + group_name: Group1 + output_type: string + variables: + - - '1770703393190' + - text + - - '1770703524412' + - text + - - '1772349100004' + - result + output_type: string + selected: false + title: 文件提取聚合 + type: variable-aggregator + variables: + - - '1770703393190' + - text + - - '1770703524412' + - text + - - '1772348969241' + - output + height: 160 + id: '1770703625287' + position: + x: 3134 + y: 291 + positionAbsolute: + x: 3134 + y: 291 + selected: false + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + context: + enabled: false + variable_selector: [] + model: + completion_params: + temperature: 0.3 + mode: chat + name: Qwen/Qwen3-32B + provider: langgenius/siliconflow/siliconflow + prompt_template: + - id: 48ec1856-fdd7-4f4a-9ce5-1aa635822550 + role: system + text: '你是一个专业的文档整理和合并专家。以下内容是从文档中分块提取并格式化的Markdown文本。由于分块处理,各块之间可能存在跨页断裂和重复内容,需要你进行智能合并。 + + + ## 你的任务 + + + ### 1. 合并跨页表格 + + - 找到所有 `<!-- TABLE_CONTINUES -->` 和对应的 `<!-- TABLE_CONTINUED_FROM_PREV + -->` 标记 + + - 将前一块末尾的不完整表格和后一块开头的延续表格合并为一个完整表格 + + - 确保表头只保留一份,数据行完整拼接,表格结构正确 + + + ### 2. 合并跨页段落 + + - 找到所有 `<!-- PARA_CONTINUES -->` 和 `<!-- PARA_CONTINUED_FROM_PREV -->` + 标记 + + - 将被截断的段落拼接为语义完整的段落 + + + ### 3. 合并跨页列表 + + - 找到所有 `<!-- LIST_CONTINUES -->` 和 `<!-- LIST_CONTINUED_FROM_PREV -->` + 标记 + + - 将被截断的列表合并为完整列表,确保编号连续 + + + ### 4. 去除重复内容 + + - 由于分块时存在页面重叠,相邻块之间可能有重复的段落、表格行或列表项 + + - 识别并去除这些重复内容,每段内容只保留一份 + + + ### 5. 清理所有辅助标记 + + - 移除所有 `<!-- ... -->` 形式的辅助标记和块分隔符 + + - 确保最终输出中不包含任何HTML注释或处理标记 + + + ### 6. 格式规范化 + + - 确保标题层级正确且连续 + + - 确保表格格式完整(有表头行和分隔行) + + - 确保列表编号连续 + + - 统一全文格式风格 + + + 直接输出最终的Markdown内容,不要用```markdown```包裹。 + + + 以下是需要整理合并的内容: + + {{#1770703625287.output#}}' + reasoning_format: separated + selected: false + title: 数据清洗与跨页合并 + type: llm + vision: + enabled: false + height: 88 + id: '1770703671732' + position: + x: 3660 + y: 327 + positionAbsolute: + x: 3660 + y: 327 + selected: false + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + is_team_authorization: true + paramSchemas: + - auto_generate: null + default: null + form: llm + human_description: + en_US: Markdown text + ja_JP: Markdown text + pt_BR: Markdown text + zh_Hans: Markdown格式文本 + label: + en_US: Markdown text + ja_JP: Markdown text + pt_BR: Markdown text + zh_Hans: Markdown格式文本 + llm_description: '' + max: null + min: null + name: md_text + options: [] + placeholder: null + precision: null + required: true + scope: null + template: null + type: string + - auto_generate: null + default: null + form: llm + human_description: + en_US: Optional custom output file name, and the filename suffix is not + required. + ja_JP: Optional custom output file name, and the filename suffix is not + required. + pt_BR: Optional custom output file name, and the filename suffix is not + required. + zh_Hans: 可选的自定义输出文件名,后缀名无需指定 + label: + en_US: Output Filename + ja_JP: Output Filename + pt_BR: Output Filename + zh_Hans: 输出文件名 + llm_description: '' + max: null + min: null + name: output_filename + options: [] + placeholder: null + precision: null + required: false + scope: null + template: null + type: string + params: + md_text: '' + output_filename: '' + plugin_id: bowenliang123/md_exporter + plugin_unique_identifier: bowenliang123/md_exporter:3.4.0@a5ce3ac3114f3dd6ab4fe49f0bb931a31af49ff555e479ec45e8aaa5d44157ee + provider_icon: https://dify.org.xyzh.yslg/console/api/workspaces/current/plugin/icon?tenant_id=fe3bcf55-9a04-4850-8473-7f97e1c09b97&filename=f0bad95cda1671b4e49f0e05df6122ef9ec5d554e138f128795d11d3806c00ef.svg + provider_id: bowenliang123/md_exporter/md_exporter + provider_name: bowenliang123/md_exporter/md_exporter + provider_type: builtin + selected: false + title: Markdown ⮕ MD + tool_configurations: {} + tool_description: 将 Markdown 转换为 .md 文件的工具 + tool_label: Markdown ⮕ MD + tool_name: md_to_md + tool_node_version: '2' + tool_parameters: + md_text: + type: mixed + value: '{{#1770703671732.text#}}' + output_filename: + type: mixed + value: '' + type: tool + height: 52 + id: '1770704285657' + position: + x: 4231.079190350343 + y: 573.1529224498603 + positionAbsolute: + x: 4231.079190350343 + y: 573.1529224498603 + selected: false + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + outputs: + - value_selector: + - '1770704285657' + - files + value_type: array[file] + variable: _ + selected: false + title: 输出 + type: end + height: 88 + id: '1770704288628' + position: + x: 5142.505374898874 + y: 614.2288378497078 + positionAbsolute: + x: 5142.505374898874 + y: 614.2288378497078 + selected: false + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + is_team_authorization: true + paramSchemas: + - auto_generate: null + default: null + form: llm + human_description: + en_US: PDF file to convert + ja_JP: 変換するPDFファイル + pt_BR: Arquivo PDF para converter + zh_Hans: 要转换的 PDF 文件 + label: + en_US: PDF File + ja_JP: PDFファイル + pt_BR: Arquivo PDF + zh_Hans: PDF 文件 + llm_description: PDF file to convert to Markdown + max: null + min: null + name: file + options: [] + placeholder: null + precision: null + required: true + scope: null + template: null + type: file + - auto_generate: null + default: true + form: form + human_description: + en_US: Whether to embed images as base64 (default true) + ja_JP: 画像をbase64として埋め込むか + pt_BR: Se deve incorporar imagens como base64 + zh_Hans: 是否将图片以base64嵌入(默认是) + label: + en_US: Include Images + ja_JP: 画像を含める + pt_BR: Incluir Imagens + zh_Hans: 包含图片 + llm_description: Set to true to embed images as base64 + max: null + min: null + name: include_images + options: [] + placeholder: null + precision: null + required: false + scope: null + template: null + type: boolean + - auto_generate: null + default: 150 + form: form + human_description: + en_US: DPI for rendering vector drawings (72-300) + ja_JP: ベクター描画のDPI + pt_BR: DPI para renderizar desenhos vetoriais + zh_Hans: 矢量图渲染DPI(72-300,默认150) + label: + en_US: Image DPI + ja_JP: 画像DPI + pt_BR: DPI da Imagem + zh_Hans: 图片DPI + llm_description: Resolution for rendering vector drawings + max: null + min: null + name: image_dpi + options: [] + placeholder: null + precision: null + required: false + scope: null + template: null + type: number + params: + file: '' + image_dpi: '' + include_images: '' + plugin_id: yslg/pdf + plugin_unique_identifier: yslg/pdf:0.0.1@cc5f6665002ca7c06855ef6703ee9f6e051ddbfb3d00d2aa899f9f280f45dd61 + provider_icon: https://dify.org.xyzh.yslg/console/api/workspaces/current/plugin/icon?tenant_id=fe3bcf55-9a04-4850-8473-7f97e1c09b97&filename=f1441c071a96f87326f5eb2ae2bfc5a570e9260e7d2b74c2ac15df4037231c64.svg + provider_id: yslg/pdf/pdf + provider_name: yslg/pdf/pdf + provider_type: builtin + selected: true + title: PDF转Markdown + tool_configurations: + image_dpi: + type: constant + value: 150 + include_images: + type: constant + value: true + model: + type: constant + value: + completion_params: {} + mode: chat + model: Qwen/Qwen3-32B + model_type: llm + provider: langgenius/siliconflow/siliconflow + tool_description: 将PDF转换为Markdown,图片base64嵌入,无需大模型 + tool_label: PDF转Markdown + tool_name: pdf_to_markdown + tool_node_version: '2' + tool_parameters: + file: + type: variable + value: + - '1770703294598' + - file + type: tool + height: 140 + id: '1772527425324' + position: + x: 1881.4558888576478 + y: 697.8632689662784 + positionAbsolute: + x: 1881.4558888576478 + y: 697.8632689662784 + selected: true + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + outputs: + - value_selector: + - '1772527425324' + - files + value_type: array[file] + variable: files + selected: false + title: 输出 2 + type: end + height: 88 + id: '1772779766541' + position: + x: 2183.4558888576476 + y: 697.8632689662784 + positionAbsolute: + x: 2183.4558888576476 + y: 697.8632689662784 + selected: false + sourcePosition: right + targetPosition: left + type: custom + width: 242 + viewport: + x: -675.5777822239224 + y: 9.568461206490326 + zoom: 0.7578582832552 + rag_pipeline_variables: []