diff --git a/.gitignore b/.gitignore index 4cbe3caa..195f9bd6 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,6 @@ .tmp .trae -**/*.difypkg \ No newline at end of file +**/*.difypkg +urbanLifeServ/* +*/.data \ No newline at end of file diff --git a/.vscode/launch.json b/.vscode/launch.json index 203429db..e69de29b 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -1,27 +0,0 @@ -{ - "version": "0.2.0", - "configurations": [ - { - "name": "Python: FastAPI Server", - "type": "python", - "request": "launch", - "program": "${workspaceFolder}/difyPlugin/main.py", - "console": "integratedTerminal", - "justMyCode": true, - "env": { - "PYTHONUNBUFFERED": "1" - }, - "cwd": "${workspaceFolder}/difyPlugin", - "args": [] - }, - { - "name": "Python: Debug Plugin", - "type": "python", - "request": "launch", - "program": "${workspaceFolder}/difyPlugin/app/plugins/pdf/__init__.py", - "console": "integratedTerminal", - "justMyCode": true, - "cwd": "${workspaceFolder}/difyPlugin" - } - ] -} \ No newline at end of file diff --git a/ai-management-platform b/ai-management-platform index 199d8180..96f7c3aa 160000 --- a/ai-management-platform +++ b/ai-management-platform @@ -1 +1 @@ -Subproject commit 199d8180a698c62d79c5c853302733050fe9c0fa +Subproject commit 96f7c3aa4c9ac8b00e0b98b5a4998b5f910d5337 diff --git a/difyPlugin/pdf/manifest.yaml b/difyPlugin/pdf/manifest.yaml index fb7631bf..27f075f3 100644 --- a/difyPlugin/pdf/manifest.yaml +++ b/difyPlugin/pdf/manifest.yaml @@ -19,6 +19,9 @@ resource: permission: tool: enabled: true + model: + enabled: true + llm: true plugins: tools: - provider/pdf.yaml diff --git a/difyPlugin/pdf/provider/pdf.yaml b/difyPlugin/pdf/provider/pdf.yaml index 83a55577..c7473239 100644 --- a/difyPlugin/pdf/provider/pdf.yaml +++ b/difyPlugin/pdf/provider/pdf.yaml @@ -56,8 +56,12 @@ identity: # en_US: "Access Token" tools: - - tools/pdf.yaml + - tools/pdf_column_range.yaml - tools/pdf_single_page.yaml + - tools/pdf_summary.yaml + - 
tools/pdf_toc.yaml + - tools/pdf_extract_range.yaml + - tools/pdf_to_markdown.yaml extra: python: source: provider/pdf.py diff --git a/difyPlugin/pdf/requirements.txt b/difyPlugin/pdf/requirements.txt index e9cf72f9..80735ec2 100644 --- a/difyPlugin/pdf/requirements.txt +++ b/difyPlugin/pdf/requirements.txt @@ -1,2 +1,2 @@ dify_plugin>=0.4.0,<0.7.0 -PyPDF2>=3.0.1 +pymupdf>=1.27.1 \ No newline at end of file diff --git a/difyPlugin/pdf/tools/pdf.py b/difyPlugin/pdf/tools/pdf.py deleted file mode 100644 index fc226c04..00000000 --- a/difyPlugin/pdf/tools/pdf.py +++ /dev/null @@ -1,61 +0,0 @@ -import re -from collections.abc import Generator -from io import BytesIO -from typing import Any - -import PyPDF2 -from dify_plugin import Tool -from dify_plugin.entities.tool import ToolInvokeMessage - - -class PdfTool(Tool): - def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]: - file = tool_parameters.get("file") - if not file: - yield self.create_text_message("Error: file is required") - return - - # file.blob returns bytes - pdf_bytes = file.blob - reader = PyPDF2.PdfReader(BytesIO(pdf_bytes)) - num_pages = len(reader.pages) - - toc_start = None - toc_end = None - - toc_patterns = [ - r'目录', - r'Table of Contents', - r'Contents', - r'目次' - ] - - for page_num in range(num_pages): - page = reader.pages[page_num] - text = page.extract_text() or "" - - if any(re.search(pattern, text, re.IGNORECASE) for pattern in toc_patterns): - if toc_start is None: - toc_start = page_num - toc_end = page_num - elif toc_start is not None and toc_end is not None: - break - - if toc_start is None: - yield self.create_json_message({ - "start": None, - "end": None, - "pages": [] - }) - return - - toc_pages = [] - for page_num in range(toc_start, toc_end + 1): - page = reader.pages[page_num] - toc_pages.append(page.extract_text() or "") - - yield self.create_json_message({ - "start": toc_start, - "end": toc_end, - "pages": toc_pages - }) \ No newline at end of file 
import json
import re
from collections.abc import Generator
from io import BytesIO
from typing import Any

import fitz  # PyMuPDF
from dify_plugin import Tool
from dify_plugin.entities.tool import ToolInvokeMessage


class PdfTool(Tool):
    """Locate the printed table-of-contents pages of a PDF and return their text.

    A page counts as a TOC page when its extracted text matches any of the
    TOC heading patterns. The result covers the first contiguous run of such
    pages; scanning stops at the first non-matching page after the run starts.
    """

    # One alternation compiled once instead of six searches per page.
    # "目\u3000录" is the heading written with an ideographic (full-width) space.
    _TOC_PATTERN = re.compile(
        "|".join(
            [
                r"目录",
                r"目 录",
                r"目\u3000录",
                r"Table of Contents",
                r"Contents",
                r"目次",
            ]
        ),
        re.IGNORECASE,
    )

    def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
        """Find the TOC page range and emit it as a text and a JSON message.

        Args:
            tool_parameters: Must contain ``file``, an object exposing the PDF
                bytes as ``.blob``.

        Yields:
            An error text message when ``file`` is missing; otherwise a text
            message holding the JSON-encoded result followed by a JSON message
            with the same dict: ``start`` / ``end`` (0-based page indices, or
            ``None`` when no TOC page was found), ``pages`` (text of each TOC
            page) and ``pages_text`` (those texts joined by newlines).
        """
        file = tool_parameters.get("file")
        if not file:
            yield self.create_text_message("Error: file is required")
            return

        toc_start: int | None = None
        toc_end: int | None = None
        toc_pages: list[str] = []

        doc = fitz.open(stream=file.blob, filetype="pdf")
        try:
            for page_num in range(len(doc)):
                text = doc[page_num].get_text() or ""
                if self._TOC_PATTERN.search(text):
                    if toc_start is None:
                        toc_start = page_num
                    toc_end = page_num
                    # Every page between toc_start and toc_end matched (we break
                    # on the first miss), so collecting here yields exactly the
                    # pages a second extraction pass would.
                    toc_pages.append(text)
                elif toc_start is not None:
                    # First non-TOC page after the run: the TOC has ended.
                    break
        finally:
            # Release the document even when text extraction raises.
            doc.close()

        result = {
            "start": toc_start,
            "end": toc_end,
            "pages": toc_pages,
            "pages_text": "\n".join(toc_pages) if toc_pages else "",
        }
        yield self.create_text_message(json.dumps(result, ensure_ascii=False))
        yield self.create_json_message(result)
import json
from collections.abc import Generator
from typing import Any

import fitz  # PyMuPDF
from dify_plugin import Tool
from dify_plugin.entities.tool import ToolInvokeMessage


class PdfExtractRangeTool(Tool):
    """Extract plain text from an inclusive, 0-based page range of a PDF."""

    _PAGE_SEPARATOR = "\n\n--- 分页 ---\n\n"

    def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
        """Extract the text of pages ``start_page``..``end_page`` (inclusive).

        Args:
            tool_parameters: ``file`` (object exposing the PDF bytes as
                ``.blob``), ``start_page`` and ``end_page`` (0-based indices;
                missing, null or non-numeric values fall back to 0).

        Yields:
            An error text message when ``file`` is missing or the PDF has no
            pages; otherwise a text message holding the JSON-encoded result
            followed by a JSON message with the same dict: ``start``, ``end``
            (the clamped range actually used), ``total_pages`` and ``text``
            (page texts joined by a page-break marker).
        """
        file = tool_parameters.get("file")
        if not file:
            yield self.create_text_message("Error: file is required")
            return

        start_page = self._to_int(tool_parameters.get("start_page"), 0)
        end_page = self._to_int(tool_parameters.get("end_page"), 0)

        page_texts: list[str] = []
        doc = fitz.open(stream=file.blob, filetype="pdf")
        try:
            num_pages = len(doc)
            if num_pages > 0:
                # Clamp into [0, num_pages - 1] and force end >= start.
                start_page = max(0, min(start_page, num_pages - 1))
                end_page = max(start_page, min(end_page, num_pages - 1))
                page_texts = [
                    doc[page_idx].get_text("text", sort=True) or ""
                    for page_idx in range(start_page, end_page + 1)
                ]
        finally:
            # Release the document even when extraction raises.
            doc.close()

        if num_pages == 0:
            # Without this guard the clamp yields index 0 and doc[0] raises.
            yield self.create_text_message("Error: PDF has no pages")
            return

        result = {
            "start": start_page,
            "end": end_page,
            "total_pages": end_page - start_page + 1,
            "text": self._PAGE_SEPARATOR.join(page_texts),
        }
        yield self.create_text_message(json.dumps(result, ensure_ascii=False))
        yield self.create_json_message(result)

    @staticmethod
    def _to_int(value: Any, default: int) -> int:
        """Return ``int(value)``, or ``default`` for None, "" or unparsable input."""
        if value is None or value == "":
            return default
        try:
            return int(value)
        except (TypeError, ValueError, OverflowError):
            return default
+parameters: + - name: file + type: file + required: true + label: + en_US: PDF File + zh_Hans: PDF 文件 + pt_BR: Arquivo PDF + ja_JP: PDFファイル + human_description: + en_US: "PDF file to extract text from" + zh_Hans: "要提取文本的 PDF 文件" + pt_BR: "Arquivo PDF para extrair texto" + ja_JP: "テキストを抽出するPDFファイル" + llm_description: "PDF file to extract page range text from" + form: llm + fileTypes: + - "pdf" + - name: start_page + type: number + required: true + label: + en_US: Start Page + zh_Hans: 起始页码 + pt_BR: Página Inicial + ja_JP: 開始ページ + human_description: + en_US: "Start page index (0-based)" + zh_Hans: "起始页码(从0开始)" + pt_BR: "Índice da página inicial (base 0)" + ja_JP: "開始ページ番号(0始まり)" + llm_description: "Start page index (0-based)" + form: llm + default: 0 + - name: end_page + type: number + required: true + label: + en_US: End Page + zh_Hans: 结束页码 + pt_BR: Página Final + ja_JP: 終了ページ + human_description: + en_US: "End page index (0-based, inclusive)" + zh_Hans: "结束页码(从0开始,包含该页)" + pt_BR: "Índice da página final (base 0, inclusivo)" + ja_JP: "終了ページ番号(0始まり、含む)" + llm_description: "End page index (0-based, inclusive)" + form: llm + default: 0 +extra: + python: + source: tools/pdf_extract_range.py diff --git a/difyPlugin/pdf/tools/pdf_single_page.py b/difyPlugin/pdf/tools/pdf_single_page.py index 5ed41ecf..0fa67660 100644 --- a/difyPlugin/pdf/tools/pdf_single_page.py +++ b/difyPlugin/pdf/tools/pdf_single_page.py @@ -1,8 +1,9 @@ +import json from collections.abc import Generator from io import BytesIO from typing import Any -import PyPDF2 +import fitz # PyMuPDF 核心库 from dify_plugin import Tool from dify_plugin.entities.tool import ToolInvokeMessage @@ -16,21 +17,29 @@ class PdfSinglePageTool(Tool): yield self.create_text_message("Error: file is required") return + # 从字节流加载 PDF(替换 PyPDF2 的 PdfReader) pdf_bytes = file.blob - reader = PyPDF2.PdfReader(BytesIO(pdf_bytes)) - num_pages = len(reader.pages) + doc = fitz.open(stream=pdf_bytes, filetype="pdf") # 字节流方式打开 + num_pages = 
import json
import re
from collections.abc import Generator
from typing import Any

import fitz
from dify_plugin import Tool
from dify_plugin.entities.model.llm import LLMModelConfig
from dify_plugin.entities.model.message import SystemPromptMessage, UserPromptMessage
from dify_plugin.entities.tool import ToolInvokeMessage


class PdfSummaryTool(Tool):
    """Fast PDF page summary tool.

    Default behavior is optimized for throughput in large workflows:
    - Extract plain text and lightweight table data only.
    - Skip expensive image base64 and drawing path extraction.
    - Skip LLM by default unless `use_llm=true` is explicitly passed.
    """

    _DEFAULT_LLM_PROMPT = "请基于输入的PDF页面文本做简洁准确摘要,输出中文要点。不要输出思考过程。"
    _PAGE_SEPARATOR = "\n\n--- 分页 ---\n\n"

    def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
        """Extract a page range and emit structured data plus a Markdown summary.

        Args:
            tool_parameters: ``file`` (object exposing the PDF bytes as
                ``.blob``), ``pdf_start_page`` / ``pdf_end_page`` (0-based,
                inclusive, default 0), ``model`` (LLM model config dict),
                ``use_llm`` (default False), ``max_chars_per_page`` (default
                6000, clamped to 800..20000) and ``llm_prompt``.

        Yields:
            An error text message when ``file`` is missing or the PDF has no
            pages; otherwise a JSON message with the per-page data, then a
            text message with the summary if it is non-empty. The summary is
            the LLM output when ``use_llm`` and ``model`` are both set and the
            model returned visible text; the locally built Markdown otherwise.
        """
        file = tool_parameters.get("file")
        if not file:
            yield self.create_text_message("Error: file is required")
            return

        start_page = self._to_int(tool_parameters.get("pdf_start_page"), 0)
        end_page = self._to_int(tool_parameters.get("pdf_end_page"), 0)
        model_config = tool_parameters.get("model")
        use_llm = self._to_bool(tool_parameters.get("use_llm"), False)

        max_chars_per_page = self._to_int(tool_parameters.get("max_chars_per_page"), 6000)
        max_chars_per_page = max(800, min(max_chars_per_page, 20000))

        # `or` (not a .get default) so an explicit null / empty prompt also
        # falls back instead of sending an empty system message.
        llm_prompt = tool_parameters.get("llm_prompt") or self._DEFAULT_LLM_PROMPT

        doc = fitz.open(stream=file.blob, filetype="pdf")
        try:
            num_pages = len(doc)
            if num_pages == 0:
                # Without this guard the clamp yields index 0 and doc[0] raises.
                yield self.create_text_message("Error: PDF has no pages")
                return

            start_page = max(0, min(start_page, num_pages - 1))
            end_page = max(start_page, min(end_page, num_pages - 1))

            pages_data: list[dict[str, Any]] = [
                self._extract_page_fast(doc[page_idx], page_idx, max_chars_per_page)
                for page_idx in range(start_page, end_page + 1)
            ]

            result = {
                "total_pages_extracted": len(pages_data),
                "page_range": {"start": start_page, "end": end_page},
                "pages": pages_data,
            }
            yield self.create_json_message(result)

            # Fast local summary first (deterministic, no model latency).
            local_text = self._build_local_summary(pages_data)

            # Optional LLM refinement, explicitly enabled only.
            final_text = local_text
            if use_llm and model_config:
                refined = self._summarize_with_llm(local_text, llm_prompt, model_config)
                if refined:
                    final_text = refined

            if final_text:
                yield self.create_text_message(final_text)
        finally:
            doc.close()

    def _extract_page_fast(self, page: fitz.Page, page_idx: int, max_chars_per_page: int) -> dict[str, Any]:
        """Return text, up to three tables and the page size for one page.

        Text longer than ``max_chars_per_page`` is cut and suffixed with a
        truncation marker. Each table keeps at most its first 10 rows.
        ``images``, ``drawings_summary`` and ``text_blocks`` are always empty
        lists in this fast path.
        """
        text = (page.get_text("text") or "").strip()
        if len(text) > max_chars_per_page:
            text = text[:max_chars_per_page] + "\n...[truncated]"

        tables: list[dict[str, Any]] = []
        try:
            tabs = page.find_tables()
            for tab_idx, tab in enumerate(tabs.tables[:3]):
                cells = tab.extract() or []
                tables.append(
                    {
                        "index": tab_idx,
                        "rows": tab.row_count,
                        "cols": tab.col_count,
                        "cells": cells[:10],
                    }
                )
        except Exception:
            # Table detection is best-effort: a failure must not lose the
            # page text, so keep whatever tables were collected so far.
            pass

        return {
            "page_number": page_idx,
            "text": text,
            "tables": tables,
            "images": [],
            "drawings_summary": [],
            "text_blocks": [],
            "width": float(page.rect.width),
            "height": float(page.rect.height),
        }

    def _build_local_summary(self, pages_data: list[dict[str, Any]]) -> str:
        """Output actual page content as Markdown (text + tables).

        Pages with neither text nor a table of at least two rows are omitted;
        the remaining pages are joined by a page-break marker.
        """
        parts: list[str] = []
        for page in pages_data:
            page_parts: list[str] = []

            text = (page.get("text") or "").strip()
            if text:
                page_parts.append(text)

            for tab in page.get("tables") or []:
                cells = tab.get("cells") or []
                # A table needs a header row plus at least one data row.
                if len(cells) >= 2:
                    md = self._cells_to_md_table(cells)
                    if md:
                        page_parts.append(md)

            if page_parts:
                parts.append("\n\n".join(page_parts))

        return self._PAGE_SEPARATOR.join(parts)

    @staticmethod
    def _cells_to_md_table(cells: list) -> str:
        """Render rows of cells as a Markdown table; the first row is the header.

        Rows shorter than the header are padded with empty cells and longer
        rows are cut to the header width. Returns "" for no rows or an empty
        header row.
        """
        if not cells:
            return ""
        header = cells[0]
        ncols = len(header)
        if ncols == 0:
            return ""

        def clean(cell: Any) -> str:
            # Escape pipes and flatten newlines so a cell cannot break the row.
            return str(cell or "").replace("|", "\\|").replace("\n", " ").strip()

        lines = [
            "| " + " | ".join(clean(c) for c in header) + " |",
            "| " + " | ".join("---" for _ in range(ncols)) + " |",
        ]
        for row in cells[1:]:
            padded = list(row) + [""] * max(0, ncols - len(row))
            lines.append("| " + " | ".join(clean(c) for c in padded[:ncols]) + " |")
        return "\n".join(lines)

    def _summarize_with_llm(self, local_text: str, llm_prompt: str, model_config: dict[str, Any]) -> str:
        """Send ``local_text`` to the configured LLM and return its visible answer.

        ``llm_prompt`` is the system message and ``local_text`` the user
        message; the call is non-streaming. Returns "" when the response
        carries no message content.
        """
        response = self.session.model.llm.invoke(
            model_config=LLMModelConfig(**model_config),
            prompt_messages=[
                SystemPromptMessage(content=llm_prompt),
                UserPromptMessage(content=local_text),
            ],
            stream=False,
        )

        llm_text = ""
        if hasattr(response, "message") and response.message:
            content = response.message.content
            if isinstance(content, str):
                llm_text = content
            elif isinstance(content, list):
                # Multi-part content: concatenate each part's `data` when it
                # has one, otherwise its string form.
                llm_text = "".join(
                    item.data if hasattr(item, "data") else str(item)
                    for item in content
                )

        return self._extract_visible_answer(llm_text)

    @staticmethod
    def _extract_visible_answer(text: str) -> str:
        """Strip model control markup and return only the user-visible answer.

        If the text contains a ``<|begin_of_box|>…<|end_of_box|>`` span, only
        the first span's content is kept. Otherwise ``<think>…</think>``
        blocks are removed. Any remaining ``<|…|>`` tokens are dropped.
        """
        if not text:
            return ""

        box_match = re.search(r"<\|begin_of_box\|>([\s\S]*?)<\|end_of_box\|>", text)
        if box_match:
            text = box_match.group(1)
        else:
            # NOTE(review): the pattern here had no delimiters left
            # (r"[\s\S]*?" only ever matches the empty string, making the
            # substitution a no-op). Restored as <think>…</think>, assumed
            # from the IGNORECASE flag and the "no thinking process" prompt;
            # confirm against the upstream file.
            text = re.sub(r"<think>[\s\S]*?</think>", "", text, flags=re.IGNORECASE)

        text = re.sub(r"<\|[^>]+\|>", "", text)
        return text.strip()

    @staticmethod
    def _to_int(value: Any, default: int) -> int:
        """Return ``int(value)``, or ``default`` for None, "" or unparsable input."""
        try:
            if value is None or value == "":
                return default
            return int(value)
        except Exception:
            return default

    @staticmethod
    def _to_bool(value: Any, default: bool) -> bool:
        """Interpret common truthy/falsy spellings; anything else gives ``default``."""
        if value is None:
            return default
        if isinstance(value, bool):
            return value
        s = str(value).strip().lower()
        if s in {"1", "true", "yes", "on"}:
            return True
        if s in {"0", "false", "no", "off"}:
            return False
        return default
"PDFページからコア要素(テキスト、画像、テーブル、パス)を座標付きで抽出し、LLMで要約" + llm: "Extract core elements (text, image, table, drawing path) with coordinates from specified PDF page range, then use LLM to summarize the content" +parameters: + - name: file + type: file + required: true + label: + en_US: PDF File + zh_Hans: PDF 文件 + pt_BR: Arquivo PDF + ja_JP: PDFファイル + human_description: + en_US: "PDF file to process" + zh_Hans: "要处理的 PDF 文件" + pt_BR: "Arquivo PDF para processar" + ja_JP: "処理するPDFファイル" + llm_description: "PDF file to extract elements from and summarize" + form: llm + fileTypes: + - "pdf" + - name: pdf_start_page + type: number + required: true + label: + en_US: Start Page + zh_Hans: 起始页码 + pt_BR: Página Inicial + ja_JP: 開始ページ + human_description: + en_US: "Start page index (0-based)" + zh_Hans: "起始页码(从0开始)" + pt_BR: "Índice da página inicial (base 0)" + ja_JP: "開始ページ番号(0始まり)" + llm_description: "Start page index (0-based) for element extraction" + form: llm + default: 0 + - name: pdf_end_page + type: number + required: true + label: + en_US: End Page + zh_Hans: 结束页码 + pt_BR: Página Final + ja_JP: 終了ページ + human_description: + en_US: "End page index (0-based, inclusive)" + zh_Hans: "结束页码(从0开始,包含该页)" + pt_BR: "Índice da página final (base 0, inclusivo)" + ja_JP: "終了ページ番号(0始まり、含む)" + llm_description: "End page index (0-based, inclusive) for element extraction" + form: llm + default: 0 + - name: model + type: model-selector + scope: llm + required: true + label: + en_US: LLM Model + zh_Hans: LLM 模型 + pt_BR: Modelo LLM + ja_JP: LLMモデル + human_description: + en_US: "LLM model used for summarizing extracted content" + zh_Hans: "用于概述提取内容的 LLM 模型" + pt_BR: "Modelo LLM usado para resumir o conteúdo extraído" + ja_JP: "抽出内容の要約に使用するLLMモデル" + form: form + - name: llm_prompt + type: string + required: false + label: + en_US: LLM Prompt + zh_Hans: LLM 提示词 + pt_BR: Prompt do LLM + ja_JP: LLMプロンプト + human_description: + en_US: "System prompt for LLM summarization" + zh_Hans: "LLM 概述的系统提示词" + 
import base64
import re
from collections import OrderedDict
from collections.abc import Generator
from typing import Any

import fitz
from dify_plugin import Tool
from dify_plugin.entities.tool import ToolInvokeMessage


class PdfToMarkdownTool(Tool):
    """Convert PDF to a single Markdown file. No LLM needed.

    - Auto-detect TOC and organize content by chapters.
    - Extract text and tables as Markdown.
    - Embed raster images as base64.
    - Render vector drawings as base64 PNG.
    - Output one .md file via create_blob_message.
    """

    _TOC_PATTERNS = [
        r"目录", r"目 录", r"目\u3000录",
        r"Table of Contents", r"Contents", r"目次",
    ]

    # ── entry point ──────────────────────────────────────────

    def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
        """Convert the whole PDF and emit it as a text and a blob message.

        Args:
            tool_parameters: ``file`` (object exposing the PDF bytes as
                ``.blob``), ``include_images`` (default True) and
                ``image_dpi`` (default 150, clamped to 72..300).

        Yields:
            An error text message when ``file`` is missing; otherwise the
            Markdown as a text message, then the same Markdown UTF-8 encoded
            as a blob with mime type ``text/markdown``.
        """
        file = tool_parameters.get("file")
        if not file:
            yield self.create_text_message("Error: file is required")
            return

        include_images = self._to_bool(tool_parameters.get("include_images"), True)
        image_dpi = self._to_int(tool_parameters.get("image_dpi"), 150)
        image_dpi = max(72, min(image_dpi, 300))
        max_image_bytes = 2 * 1024 * 1024  # skip images > 2 MB raw

        doc = fitz.open(stream=file.blob, filetype="pdf")
        try:
            num_pages = len(doc)

            # 1) Build chapter map (metadata TOC → printed TOC → none)
            chapters, content_offset = self._build_chapter_map(doc, num_pages)

            # 2) Convert every page
            page_mds: list[str] = [
                self._page_to_markdown(
                    doc, doc[idx], idx,
                    include_images, image_dpi, max_image_bytes,
                )
                for idx in range(num_pages)
            ]

            # 3) Assemble
            if chapters:
                final_md = self._assemble_by_chapters(
                    chapters, page_mds, content_offset, num_pages,
                )
            else:
                final_md = "\n\n---\n\n".join(m for m in page_mds if m.strip())

            # 4) Output: text (for variable aggregation) + blob (.md file)
            yield self.create_text_message(final_md)
            yield self.create_blob_message(
                blob=final_md.encode("utf-8"),
                meta={"mime_type": "text/markdown"},
            )
        finally:
            doc.close()

    # ── chapter detection ────────────────────────────────────

    def _build_chapter_map(
        self, doc: fitz.Document, num_pages: int,
    ) -> tuple[dict, int]:
        """Return (chapters_dict, content_offset).

        Try embedded PDF TOC metadata first (reliable page mapping).
        Fall back to scanning printed TOC pages. Chapters from metadata are
        already 0-based indices, so their offset is 0; chapters parsed from a
        printed TOC hold printed page numbers and come with a guessed offset.
        Returns ``({}, 0)`` when neither source yields chapters.
        """
        toc = doc.get_toc()
        if toc:
            chapters = self._chapters_from_metadata(toc, num_pages)
            if chapters:
                return chapters, 0

        toc_start, toc_end = self._find_toc_pages(doc, num_pages)
        if toc_start is not None and toc_end is not None:
            toc_text = "\n".join(
                doc[i].get_text() or "" for i in range(toc_start, toc_end + 1)
            )
            chapters = self._parse_toc_lines(toc_text)
            if chapters:
                return chapters, self._guess_offset(chapters, toc_end)

        return {}, 0

    def _chapters_from_metadata(
        self, toc: list, num_pages: int,
    ) -> dict[str, dict[str, int]]:
        """Build ``{title: {"start", "end"}}`` from ``doc.get_toc()`` entries.

        Only entries of level 1 or 2 with a page number >= 1 are used. Page
        numbers are converted to 0-based indices. Each chapter ends on the
        page before the next entry starts (never before its own start); the
        last chapter runs to the final page.
        """
        top = [(t, max(0, p - 1)) for lvl, t, p in toc if lvl <= 2 and p >= 1]
        if not top:
            return {}
        chapters: dict[str, dict[str, int]] = OrderedDict()
        for i, (title, start) in enumerate(top):
            end = top[i + 1][1] - 1 if i + 1 < len(top) else num_pages - 1
            chapters[title] = {"start": start, "end": max(start, end)}
        return chapters

    def _find_toc_pages(self, doc, num_pages):
        """Return ``(first, last)`` index of the printed TOC, or ``(None, None)``.

        Only the first 30 pages are scanned. The TOC is the first contiguous
        run of pages whose text matches any ``_TOC_PATTERNS`` entry.
        """
        toc_start = toc_end = None
        for pn in range(min(num_pages, 30)):
            text = doc[pn].get_text() or ""
            if any(re.search(p, text, re.IGNORECASE) for p in self._TOC_PATTERNS):
                if toc_start is None:
                    toc_start = pn
                toc_end = pn
            elif toc_start is not None:
                break
        return toc_start, toc_end

    def _parse_toc_lines(self, text: str) -> dict[str, dict[str, int]]:
        """Parse printed TOC text into ``{title: {"start", "end"}}``.

        Page numbers are the printed numbers found in the text, not indices.
        Everything from a "List of Figures / Tables" heading onward is
        ignored. A line is an entry when it ends in a 1-5 digit page number
        preceded by dot leaders or whitespace. The first occurrence of a
        title wins. Each chapter ends on the page before the next entry; the
        last chapter ends on its own start page.
        """
        m = re.search(
            r"^(List\s+of\s+Figures|List\s+of\s+Tables|图目录|表目录)",
            text, re.IGNORECASE | re.MULTILINE,
        )
        if m:
            text = text[: m.start()]

        # The named group must be spelled (?P<title>…): a bare "(?P" is a
        # regex syntax error, and group("title") is read below.
        pat = re.compile(
            r"^\s*(?P<title>.+?)\s*(?:\.{2,}|\s)\s*(?P<page>\d{1,5})\s*$"
        )
        entries: list[tuple[str, int]] = []
        for raw in text.splitlines():
            line = raw.strip()
            # Skip blanks, very short lines and bare page numbers.
            if not line or len(line) < 3 or re.fullmatch(r"\d+", line):
                continue
            m2 = pat.match(line)
            if not m2:
                continue
            # NOTE(review): the strip set repeats ":"; one of the two was
            # possibly a full-width colon before extraction — confirm.
            title = re.sub(r"\s+", " ", m2.group("title")).strip("-_:: ")
            page = self._to_int(m2.group("page"), None)
            if not title or page is None or len(title) <= 1:
                continue
            # Column headings and the TOC heading itself are not chapters.
            if title.lower() in {"page", "pages", "目录", "contents"}:
                continue
            entries.append((title, page))

        if not entries:
            return {}

        dedup: OrderedDict[str, int] = OrderedDict()
        for t, p in entries:
            dedup.setdefault(t, p)

        titles = list(dedup.keys())
        pages = [dedup[t] for t in titles]
        catalog: dict[str, dict[str, int]] = OrderedDict()
        for i, t in enumerate(titles):
            s = pages[i]
            e = max(s, pages[i + 1] - 1) if i + 1 < len(pages) else s
            catalog[t] = {"start": s, "end": e}
        return catalog

    @staticmethod
    def _guess_offset(chapters: dict, toc_end: int) -> int:
        """Return the offset that maps printed page numbers to page indices.

        Assumes the chapter with the smallest printed start page begins on
        the page right after the TOC (index ``toc_end + 1``). Returns 0 when
        there are no chapters.
        """
        if not chapters:
            return 0
        first_page = min(info["start"] for info in chapters.values())
        return (toc_end + 1) - first_page

    # ── per-page conversion ──────────────────────────────────

    def _page_to_markdown(
        self,
        doc: fitz.Document,
        page: fitz.Page,
        page_idx: int,
        include_images: bool,
        image_dpi: int,
        max_image_bytes: int,
    ) -> str:
        """Render one page as Markdown: text, tables, then optional images.

        Parts are joined by blank lines in that order. When
        ``include_images`` is false, raster images and drawings are skipped.
        """
        parts: list[str] = []

        text = (page.get_text("text", sort=True) or "").strip()
        if text:
            parts.append(text)

        parts.extend(self._tables_to_md(page))

        if include_images:
            parts.extend(
                self._raster_images_to_md(doc, page, page_idx, max_image_bytes)
            )
            drawing = self._drawings_to_md(page, page_idx, image_dpi, max_image_bytes)
            if drawing:
                parts.append(drawing)

        return "\n\n".join(parts)

    def _tables_to_md(self, page: fitz.Page) -> list[str]:
        """Return Markdown for up to five tables on the page (best-effort)."""
        out: list[str] = []
        try:
            for tab in (page.find_tables().tables or [])[:5]:
                cells = tab.extract() or []
                # A table needs a header row plus at least one data row.
                if len(cells) >= 2:
                    md = self._cells_to_md_table(cells)
                    if md:
                        out.append(md)
        except Exception:
            # Best-effort: keep the tables converted so far.
            pass
        return out

    def _raster_images_to_md(
        self,
        doc: fitz.Document,
        page: fitz.Page,
        page_idx: int,
        max_image_bytes: int,
    ) -> list[str]:
        """Return base64 data-URI image tags for the page's embedded images.

        Images larger than ``max_image_bytes`` and icons under 20x20 pixels
        are skipped. A failure on one image does not affect the others.
        """
        out: list[str] = []
        try:
            for img_idx, img_info in enumerate(page.get_images(full=True)):
                xref = img_info[0]
                try:
                    data = doc.extract_image(xref)
                    if not data or not data.get("image"):
                        continue
                    raw = data["image"]
                    if len(raw) > max_image_bytes:
                        continue
                    # skip tiny icons (< 20x20)
                    w = data.get("width", 0)
                    h = data.get("height", 0)
                    if w < 20 and h < 20:
                        continue
                    ext = data.get("ext", "png")
                    mime = "image/jpeg" if ext in ("jpg", "jpeg") else f"image/{ext}"
                    b64 = base64.b64encode(raw).decode("ascii")
                    out.append(
                        f"![img-p{page_idx}-{img_idx}](data:{mime};base64,{b64})"
                    )
                except Exception:
                    # Best-effort per image.
                    pass
        except Exception:
            # Best-effort: keep the images converted so far.
            pass
        return out

    def _drawings_to_md(
        self,
        page: fitz.Page,
        page_idx: int,
        image_dpi: int,
        max_image_bytes: int,
    ) -> str | None:
        """Render the page's vector drawings as one PNG image tag, or None.

        Nothing is rendered when the page has fewer than three drawings, no
        drawing has a valid non-empty rectangle, the union of rectangles
        (clipped to the page) is not larger than 30x30 points, the PNG
        exceeds ``max_image_bytes``, or any step raises.
        """
        try:
            drawings = page.get_drawings()
            if len(drawings) < 3:
                return None

            valid_rects: list[fitz.Rect] = []
            for d in drawings:
                r = d.get("rect")
                if not r:
                    continue
                try:
                    rect = fitz.Rect(r)
                    if rect.is_valid and not rect.is_empty:
                        valid_rects.append(rect)
                except Exception:
                    pass
            if not valid_rects:
                return None

            # Union of all drawing rectangles, clipped to the page.
            bbox = valid_rects[0]
            for r in valid_rects[1:]:
                bbox |= r
            bbox &= page.rect
            if not (bbox.width > 30 and bbox.height > 30):
                return None

            scale = image_dpi / 72  # PDF user space is 72 units per inch
            pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale), clip=bbox)
            png = pix.tobytes("png")
            if len(png) > max_image_bytes:
                return None
            b64 = base64.b64encode(png).decode("ascii")
            return f"![drawing-p{page_idx}](data:image/png;base64,{b64})"
        except Exception:
            # Best-effort: a rendering failure must not lose the page.
            return None

    # ── assembly ─────────────────────────────────────────────

    def _assemble_by_chapters(
        self,
        chapters: dict[str, dict[str, int]],
        page_mds: list[str],
        offset: int,
        num_pages: int,
    ) -> str:
        """Join page Markdown under ``# <chapter>`` headings.

        ``offset`` is added to each chapter's start and end, and the result
        is clamped into the valid page index range. Empty pages are skipped.
        Chapters are separated by a horizontal rule.
        """
        parts: list[str] = []
        for name, info in chapters.items():
            s = info["start"] + offset
            e = info["end"] + offset
            s = max(0, min(s, num_pages - 1))
            e = max(s, min(e, num_pages - 1))
            ch: list[str] = [f"# {name}\n"]
            for idx in range(s, e + 1):
                if idx < len(page_mds) and page_mds[idx].strip():
                    ch.append(page_mds[idx])
            parts.append("\n\n".join(ch))
        return "\n\n---\n\n".join(parts)

    # ── helpers ──────────────────────────────────────────────

    @staticmethod
    def _cells_to_md_table(cells: list) -> str:
        """Render rows of cells as a Markdown table; the first row is the header.

        Rows shorter than the header are padded with empty cells and longer
        rows are cut to the header width. Returns "" for no rows or an empty
        header row.
        """
        if not cells:
            return ""
        header = cells[0]
        ncols = len(header)
        if ncols == 0:
            return ""

        def clean(cell: Any) -> str:
            # Escape pipes and flatten newlines so a cell cannot break the row.
            return str(cell or "").replace("|", "\\|").replace("\n", " ").strip()

        lines = [
            "| " + " | ".join(clean(c) for c in header) + " |",
            "| " + " | ".join("---" for _ in range(ncols)) + " |",
        ]
        for row in cells[1:]:
            padded = list(row) + [""] * max(0, ncols - len(row))
            lines.append("| " + " | ".join(clean(c) for c in padded[:ncols]) + " |")
        return "\n".join(lines)

    @staticmethod
    def _to_int(value: Any, default: int | None) -> int | None:
        """Return ``int(value)``, or ``default`` for None, "" or unparsable input."""
        try:
            if value is None or value == "":
                return default
            return int(value)
        except Exception:
            return default

    @staticmethod
    def _to_bool(value: Any, default: bool) -> bool:
        """Interpret common truthy/falsy spellings; anything else gives ``default``."""
        if value is None:
            return default
        if isinstance(value, bool):
            return value
        s = str(value).strip().lower()
        if s in {"1", "true", "yes", "on"}:
            return True
        if s in {"0", "false", "no", "off"}:
            return False
        return default
+parameters: + - name: file + type: file + required: true + label: + en_US: PDF File + zh_Hans: PDF 文件 + pt_BR: Arquivo PDF + ja_JP: PDFファイル + human_description: + en_US: "PDF file to convert" + zh_Hans: "要转换的 PDF 文件" + pt_BR: "Arquivo PDF para converter" + ja_JP: "変換するPDFファイル" + llm_description: "PDF file to convert to Markdown" + form: llm + fileTypes: + - "pdf" + - name: include_images + type: boolean + required: false + label: + en_US: Include Images + zh_Hans: 包含图片 + pt_BR: Incluir Imagens + ja_JP: 画像を含める + human_description: + en_US: "Whether to embed images as base64 in the Markdown output (default: true)" + zh_Hans: "是否将图片以base64嵌入Markdown输出(默认:是)" + pt_BR: "Se deve incorporar imagens como base64 na saída Markdown (padrão: verdadeiro)" + ja_JP: "Markdown出力にbase64として画像を埋め込むかどうか(デフォルト:はい)" + llm_description: "Set to true to embed images as base64, false to skip images" + form: form + default: true + - name: image_dpi + type: number + required: false + label: + en_US: Image DPI + zh_Hans: 图片DPI + pt_BR: DPI da Imagem + ja_JP: 画像DPI + human_description: + en_US: "DPI for rendering vector drawings (72-300, default: 150)" + zh_Hans: "矢量图渲染DPI(72-300,默认150)" + pt_BR: "DPI para renderizar desenhos vetoriais (72-300, padrão: 150)" + ja_JP: "ベクター描画のレンダリングDPI(72-300、デフォルト:150)" + llm_description: "Resolution for rendering vector drawings as images. Range 72-300, default 150." 
+ form: form + default: 150 +extra: + python: + source: tools/pdf_to_markdown.py diff --git a/difyPlugin/pdf/tools/pdf_toc.py b/difyPlugin/pdf/tools/pdf_toc.py new file mode 100644 index 00000000..a96b86b1 --- /dev/null +++ b/difyPlugin/pdf/tools/pdf_toc.py @@ -0,0 +1,273 @@ +import json +import re +from collections import OrderedDict +from collections.abc import Generator +from typing import Any + +from dify_plugin import Tool +from dify_plugin.entities.model.llm import LLMModelConfig +from dify_plugin.entities.model.message import SystemPromptMessage, UserPromptMessage +from dify_plugin.entities.tool import ToolInvokeMessage + +_SYSTEM_PROMPT = """You parse PDF table-of-contents text. +Return only valid JSON object, no markdown fences, no explanation. +Output schema: +{ + "Chapter Name": {"start": 1, "end": 5}, + "Another": {"start": 6, "end": 20} +} +Rules: +- start/end are integer printed page numbers from TOC. +- If end is unknown, use same value as start. +- Keep chapter names exactly as in TOC text. 
class PdfTocTool(Tool):
    """Convert raw table-of-contents text into a structured chapter catalog.

    The catalog maps each chapter title to ``{"start": int, "end": int}``.
    A deterministic line parser runs first; the selected LLM is only consulted
    when that parser produces no entries and a model configuration was given.
    """

    def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
        """Parse the TOC text and emit the result as a text and a JSON message.

        Reads ``toc_start``, ``toc_end``, ``toc_pages`` and ``model`` from
        *tool_parameters*. On invalid input a single ``Error: ...`` text
        message is yielded instead of a result.
        """
        toc_start = self._to_int(tool_parameters.get("toc_start"), None)
        toc_end = self._to_int(tool_parameters.get("toc_end"), None)
        model_config = tool_parameters.get("model")

        # The text may arrive either pre-joined or as a list of per-page
        # strings; accept both instead of failing on ``list.strip``.
        raw_pages = tool_parameters.get("toc_pages")
        if isinstance(raw_pages, (list, tuple)):
            raw_pages = "\n".join(str(page) for page in raw_pages if page is not None)
        toc_pages = str(raw_pages or "").strip()

        if toc_start is None or toc_end is None:
            yield self.create_text_message("Error: toc_start and toc_end are required")
            return

        if not toc_pages:
            yield self.create_text_message("Error: toc_pages text is empty")
            return

        cleaned = self._strip_index_lists(toc_pages)

        # 1) Deterministic parser first.
        catalog = self._parse_toc_lines(cleaned)
        parser = "rule" if catalog else "none"

        # 2) LLM fallback only when the deterministic parser gives no result.
        llm_used = False
        llm_error: str | None = None
        if not catalog and model_config:
            llm_used = True
            llm_catalog, _llm_text, llm_error = self._parse_with_llm(
                toc_start=toc_start,
                toc_end=toc_end,
                toc_pages=cleaned,
                model_config=model_config,
            )
            if llm_catalog:
                catalog = self._normalize_catalog(llm_catalog)
                if catalog:
                    parser = "llm"
                elif llm_error is None:
                    llm_error = "LLM output contained no usable catalog entries"

        result: dict[str, Any] = {
            "toc_start": toc_start,
            "toc_end": toc_end,
            "catalog": catalog,
            "meta": {
                "catalog_size": len(catalog),
                "parser": parser,
            },
        }

        if llm_used:
            result["meta"]["llm_used"] = True
        if llm_error:
            result["meta"]["llm_error"] = llm_error

        # Always return a valid JSON text payload for downstream json.loads.
        yield self.create_text_message(json.dumps(result, ensure_ascii=False))
        yield self.create_json_message(result)

    def _parse_with_llm(
        self,
        toc_start: int,
        toc_end: int,
        toc_pages: str,
        model_config: dict[str, Any],
    ) -> tuple[dict[str, Any] | None, str, str | None]:
        """Ask the LLM to turn the TOC text into a JSON object.

        Returns ``(parsed, raw_text, error)``: *parsed* is the decoded JSON
        object or ``None``; *raw_text* is the model's text output (empty when
        the call itself failed); *error* is ``None`` on success. Invocation
        failures are reported through *error* rather than raised, so a broken
        model configuration cannot abort the whole tool call.
        """
        user_content = (
            f"TOC page index range: {toc_start}..{toc_end}\n\n"
            f"TOC raw text:\n{toc_pages}"
        )

        try:
            response = self.session.model.llm.invoke(
                model_config=LLMModelConfig(**model_config),
                prompt_messages=[
                    SystemPromptMessage(content=_SYSTEM_PROMPT),
                    UserPromptMessage(content=user_content),
                ],
                stream=False,
            )
        except Exception as exc:  # boundary to an external service: report, don't crash
            return None, "", f"LLM invocation failed: {exc}"

        llm_text = ""
        message = getattr(response, "message", None)
        if message:
            content = message.content
            if isinstance(content, str):
                llm_text = content
            elif isinstance(content, list):
                # Items expose their payload as ``data`` when present; coerce
                # to str so a non-text payload cannot break the join.
                llm_text = "".join(str(getattr(item, "data", item)) for item in content)

        parsed = self._extract_json_object(llm_text)
        if parsed is None:
            return None, llm_text, "Failed to parse LLM output as JSON"
        if not isinstance(parsed, dict):
            return None, llm_text, "LLM output JSON is not an object"

        return parsed, llm_text, None

    @staticmethod
    def _strip_index_lists(text: str) -> str:
        """Cut *text* at the first figure/table index heading, if any.

        A heading is a line starting (after optional indentation) with
        "List of Figures", "List of Tables", "图目录" or "表目录". Everything
        from that line on is dropped; without a heading *text* is returned
        unchanged.
        """
        pattern = re.compile(
            r"^[ \t]*(?:List\s+of\s+Figures|List\s+of\s+Tables|图目录|表目录)",
            re.IGNORECASE | re.MULTILINE,
        )
        m = pattern.search(text)
        return text[: m.start()].rstrip() if m else text

    def _parse_toc_lines(self, text: str) -> dict[str, dict[str, int]]:
        """Parse TOC lines of the form ``<title> <leader> <page>``.

        Examples of accepted lines:
            1.2 Engine Overview ........ 35
            Appendix A 120
            第一章 概述…………5

        Each title keeps the page of its first appearance. An entry's ``end``
        is the page before the next entry's ``start`` (never below its own
        ``start``); the last entry ends on its start page.
        """
        # Leader is a run of 2+ dot-like characters, a single ellipsis
        # character, or plain whitespace.
        line_pattern = re.compile(
            r"^\s*(?P<title>.+?)\s*(?:[.·…]{2,}|…|\s)\s*(?P<page>\d{1,5})\s*$"
        )
        # Spaced leaders (". . . .") end up inside the lazily matched title.
        spaced_leader = re.compile(r"(?:\s*[.·…]){2,}\s*$")
        noise_titles = {"page", "pages", "目录", "contents"}

        first_page: OrderedDict[str, int] = OrderedDict()
        for raw in text.splitlines():
            line = raw.strip()
            if len(line) < 3 or re.fullmatch(r"\d+", line):
                continue

            m = line_pattern.match(line)
            if not m:
                continue

            title = re.sub(r"\s+", " ", m.group("title"))
            # Trim leader residue and separator debris (dashes, underscores,
            # ASCII and full-width colons) around the title.
            title = spaced_leader.sub("", title).strip("-_:: ")
            page = self._to_int(m.group("page"), None)
            if not title or page is None:
                continue

            # Skip obvious noise such as column headers.
            if len(title) <= 1 or title.lower() in noise_titles:
                continue

            # Deduplicate, keeping the earliest appearance.
            first_page.setdefault(title, page)

        titles = list(first_page)
        pages = list(first_page.values())

        catalog: dict[str, dict[str, int]] = {}
        for idx, (title, start) in enumerate(zip(titles, pages)):
            if idx + 1 < len(pages):
                end = max(start, pages[idx + 1] - 1)
            else:
                end = start
            catalog[title] = {"start": int(start), "end": int(end)}

        return catalog

    def _normalize_catalog(self, raw: dict[str, Any]) -> dict[str, dict[str, int]]:
        """Coerce LLM output into ``{title: {"start": int, "end": int}}``.

        Accepts either the catalog mapping itself or an object wrapping it
        under a ``"catalog"`` key. Entries without a string name, a dict
        value, or an integer-convertible ``start`` are dropped; a missing or
        smaller ``end`` is raised to ``start``.
        """
        catalog: dict[str, dict[str, int]] = {}
        if not isinstance(raw, dict):
            return catalog

        nested = raw.get("catalog")
        source = nested if isinstance(nested, dict) else raw

        for name, value in source.items():
            if not isinstance(name, str) or not isinstance(value, dict):
                continue
            start = self._to_int(value.get("start"), None)
            if start is None:
                continue
            end = self._to_int(value.get("end"), start)
            if end is None:
                end = start
            catalog[name] = {"start": int(start), "end": int(max(start, end))}
        return catalog

    @staticmethod
    def _extract_json_object(text: str) -> Any:
        """Find and decode JSON embedded in free-form LLM output.

        Candidates are tried in order: fenced code blocks, the first balanced
        ``{...}`` span, then the whole text. The first candidate decoding to a
        dict wins; if none does, the first candidate that decoded to any
        non-null value is returned so the caller can report the wrong type.
        Returns ``None`` when nothing decodes.
        """
        if not text:
            return None

        candidates: list[str] = []

        code_blocks = re.findall(r"```(?:json)?\s*([\s\S]*?)\s*```", text, flags=re.IGNORECASE)
        candidates.extend(block.strip() for block in code_blocks if block.strip())

        brace_candidate = PdfTocTool._extract_first_brace_object(text)
        if brace_candidate:
            candidates.append(brace_candidate)

        candidates.append(text.strip())

        fallback: Any = None
        for cand in candidates:
            parsed = PdfTocTool._json_try_parse(cand)
            if isinstance(parsed, dict):
                return parsed
            if fallback is None and parsed is not None:
                fallback = parsed
        return fallback

    @staticmethod
    def _extract_first_brace_object(text: str) -> str | None:
        """Return the balanced ``{...}`` span starting at the first ``{``.

        Braces inside double-quoted strings (with backslash escapes) are
        ignored. Returns ``None`` if there is no ``{`` or it is never closed.
        """
        start = text.find("{")
        if start < 0:
            return None

        depth = 0
        in_str = False
        escape = False
        for i in range(start, len(text)):
            ch = text[i]
            if in_str:
                if escape:
                    escape = False
                elif ch == "\\":
                    escape = True
                elif ch == '"':
                    in_str = False
                continue

            if ch == '"':
                in_str = True
            elif ch == "{":
                depth += 1
            elif ch == "}":
                depth -= 1
                if depth == 0:
                    return text[start : i + 1]
        return None

    @staticmethod
    def _json_try_parse(text: str) -> Any:
        """Decode *text* as JSON, retrying once after a minimal repair.

        The repair removes commas directly before ``}`` or ``]``. It is a
        best-effort regex applied only after strict parsing failed and does
        not distinguish commas inside string values. Returns ``None`` when
        both attempts fail.
        """
        decode_errors = (TypeError, ValueError, RecursionError)
        try:
            return json.loads(text)
        except decode_errors:
            pass  # fall through to the repaired attempt below

        repaired = re.sub(r",\s*([}\]])", r"\1", text)
        try:
            return json.loads(repaired)
        except decode_errors:
            return None

    @staticmethod
    def _to_int(value: Any, default: int | None) -> int | None:
        """Convert *value* with ``int()``; return *default* if it is ``None``,
        an empty string, or not convertible."""
        if value is None or value == "":
            return default
        try:
            return int(value)
        except (TypeError, ValueError, OverflowError):
            return default
+parameters: + - name: toc_start + type: number + required: true + label: + en_US: TOC Start Page + zh_Hans: 目录起始页 + pt_BR: Página Inicial do Sumário + ja_JP: 目次開始ページ + human_description: + en_US: "Start page index of TOC (from pdf_column_range output)" + zh_Hans: "目录起始页码(来自目录页提取工具输出的 start)" + pt_BR: "Índice da página inicial do sumário" + ja_JP: "目次の開始ページ番号" + llm_description: "Start page index of TOC section, from pdf_column_range output field 'start'" + form: llm + - name: toc_end + type: number + required: true + label: + en_US: TOC End Page + zh_Hans: 目录结束页 + pt_BR: Página Final do Sumário + ja_JP: 目次終了ページ + human_description: + en_US: "End page index of TOC (from pdf_column_range output)" + zh_Hans: "目录结束页码(来自目录页提取工具输出的 end)" + pt_BR: "Índice da página final do sumário" + ja_JP: "目次の終了ページ番号" + llm_description: "End page index of TOC section, from pdf_column_range output field 'end'" + form: llm + - name: toc_pages + type: string + required: true + label: + en_US: TOC Page Text + zh_Hans: 目录页文本 + pt_BR: Texto das Páginas do Sumário + ja_JP: 目次ページテキスト + human_description: + en_US: "Raw text content of TOC pages (from pdf_column_range output 'pages' array, joined)" + zh_Hans: "目录页原始文本内容(来自目录页提取工具输出的 pages 数组)" + pt_BR: "Conteúdo de texto bruto das páginas do sumário" + ja_JP: "目次ページの生テキスト内容" + llm_description: "Raw text content extracted from TOC pages, from pdf_column_range output field 'pages'" + form: llm + - name: model + type: model-selector + scope: llm + required: true + label: + en_US: LLM Model + zh_Hans: LLM 模型 + pt_BR: Modelo LLM + ja_JP: LLMモデル + human_description: + en_US: "LLM model for parsing TOC into structured JSON" + zh_Hans: "用于解析目录的 LLM 模型" + pt_BR: "Modelo LLM para análise do sumário" + ja_JP: "目次解析用のLLMモデル" + form: form +extra: + python: + source: tools/pdf_toc.py