This commit is contained in:
2026-03-15 13:00:30 +08:00
parent 91ff28bdcf
commit 136ddc270c
15 changed files with 1459 additions and 1276 deletions

Submodule ai-management-platform updated: 96f7c3aa4c...6bbe3e4181

View File

@@ -1,4 +1,4 @@
identity:
identity:
author: "yslg"
name: "pdf"
label:
@@ -13,54 +13,8 @@ identity:
ja_JP: "pdfTools"
icon: "icon.svg"
#########################################################################################
# If you want to support OAuth, you can uncomment the following code.
#########################################################################################
# oauth_schema:
# client_schema:
# - name: "client_id"
# type: "secret-input"
# required: true
# url: https://example.com/oauth/authorize
# placeholder:
# en_US: "Please input your Client ID"
# zh_Hans: "请输入你的 Client ID"
# pt_BR: "Insira seu Client ID"
# help:
# en_US: "Client ID is used to authenticate requests to the example.com API."
# zh_Hans: "Client ID 用于认证请求到 example.com API。"
# pt_BR: "Client ID é usado para autenticar solicitações à API do example.com."
# label:
# zh_Hans: "Client ID"
# en_US: "Client ID"
# - name: "client_secret"
# type: "secret-input"
# required: true
# url: https://example.com/oauth/authorize
# placeholder:
# en_US: "Please input your Client Secret"
# zh_Hans: "请输入你的 Client Secret"
# pt_BR: "Insira seu Client Secret"
# help:
# en_US: "Client Secret is used to authenticate requests to the example.com API."
# zh_Hans: "Client Secret 用于认证请求到 example.com API。"
# pt_BR: "Client Secret é usado para autenticar solicitações à API do example.com."
# label:
# zh_Hans: "Client Secret"
# en_US: "Client Secret"
# credentials_schema:
# - name: "access_token"
# type: "secret-input"
# label:
# zh_Hans: "Access Token"
# en_US: "Access Token"
tools:
- tools/pdf_column_range.yaml
- tools/pdf_single_page.yaml
- tools/pdf_summary.yaml
- tools/pdf_toc.yaml
- tools/pdf_extract_range.yaml
- tools/pdf_to_markdown.yaml
extra:
python:

View File

@@ -1,107 +0,0 @@
import json
import re
from collections.abc import Generator
from io import BytesIO
from typing import Any
import fitz # PyMuPDF 核心库
from dify_plugin import Tool
from dify_plugin.entities.tool import ToolInvokeMessage
class PdfTool(Tool):
    """Locate the table-of-contents (TOC) pages of a PDF and return their text."""

    def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
        """Find the first contiguous run of TOC-looking pages and extract them.

        Parameters (via ``tool_parameters``):
            file: uploaded PDF file object; its ``blob`` attribute holds the raw bytes.

        Yields:
            A text message (JSON string) and a JSON message with keys
            ``start``/``end`` (0-based page indexes, or None when no TOC page
            was detected), ``pages`` (list of page texts) and ``pages_text``
            (all TOC pages joined with newlines).
        """
        file = tool_parameters.get("file")
        if not file:
            yield self.create_text_message("Error: file is required")
            return
        # Load the PDF from the in-memory byte stream (PyMuPDF).
        pdf_bytes = file.blob
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        toc_start = None
        toc_end = None
        toc_pages = []
        try:
            num_pages = len(doc)
            # Headings that typically mark a TOC page (Chinese / English / Japanese).
            toc_patterns = [
                r'目录',
                r'目 录',
                r'\u3000录',
                r'Table of Contents',
                r'Contents',
                r'目次'
            ]
            # Scan pages in order; the first contiguous run of matching pages
            # is treated as the TOC range.
            for page_num in range(num_pages):
                text = doc[page_num].get_text() or ""
                if any(re.search(pattern, text, re.IGNORECASE) for pattern in toc_patterns):
                    if toc_start is None:
                        toc_start = page_num
                    toc_end = page_num
                elif toc_start is not None and toc_end is not None:
                    # First non-matching page after the run started: range ended.
                    break
            # Collect the text of every page in the detected range (inclusive).
            if toc_start is not None and toc_end is not None:
                for page_num in range(toc_start, toc_end + 1):
                    toc_pages.append(doc[page_num].get_text() or "")
        finally:
            # Always release the document, even if text extraction raises
            # (the original leaked the handle on error).
            doc.close()
        result = {
            "start": toc_start,
            "end": toc_end,
            "pages": toc_pages,
            "pages_text": "\n".join(toc_pages) if toc_pages else "",
        }
        yield self.create_text_message(json.dumps(result, ensure_ascii=False))
        yield self.create_json_message(result)
if __name__ == "__main__":
    # Manual smoke test against a local file (PyMuPDF).
    pdf_path = r"F:\Project\urbanLifeline\docs\AI训练资料\菱重S12R发动机说明书.pdf"
    doc = fitz.open(pdf_path)  # open directly from the local path
    num_pages = len(doc)
    toc_start = None
    toc_end = None
    # Same TOC headings as PdfTool._invoke (Chinese / English / Japanese).
    toc_patterns = [
        r'目录',
        r'目 录',
        r'\u3000录',
        r'Table of Contents',
        r'Contents',
        r'目次'
    ]
    # Scan for the first contiguous run of TOC-looking pages.
    for page_num in range(num_pages):
        page = doc[page_num]
        text = page.get_text() or ""
        if any(re.search(pattern, text, re.IGNORECASE) for pattern in toc_patterns):
            if toc_start is None:
                toc_start = page_num
            toc_end = page_num
        elif toc_start is not None and toc_end is not None:
            break
    # Fall back to a fixed guess when no TOC heading was detected.
    toc_start = toc_start if toc_start is not None else 18
    toc_end = toc_end if toc_end is not None else toc_start + 9
    # Extract the TOC pages. Use toc_end + 1 so the range is inclusive,
    # matching PdfTool._invoke — the original range(toc_start, toc_end)
    # silently dropped the last TOC page.
    toc_pages = []
    for page_num in range(toc_start, toc_end + 1):
        page = doc[page_num]
        toc_pages.append(page.get_text() or "")
    print(toc_start, toc_end, toc_pages)
    doc.close()  # release the document

View File

@@ -1,36 +0,0 @@
identity:
name: "pdf"
author: "yslg"
label:
en_US: "Extract TOC Pages and Content"
zh_Hans: "提取目录页和内容"
pt_BR: "Extrair páginas de sumário e conteúdo"
ja_JP: "目次ページと内容を抽出"
description:
human:
en_US: "Extract table-of-contents page range and all page text in that range"
zh_Hans: "提取目录页范围以及该范围内所有页文本"
pt_BR: "Extrair intervalo de páginas de sumário e todo o texto nesse intervalo"
ja_JP: "目次ページ範囲とその範囲内の全ページテキストを抽出"
llm: "Extract table-of-contents page range and all page text in that range"
parameters:
- name: file
type: file
required: true
label:
en_US: PDF File
zh_Hans: PDF 文件
pt_BR: Arquivo PDF
ja_JP: PDFファイル
human_description:
en_US: "PDF file to process"
zh_Hans: "要处理的 PDF 文件"
pt_BR: "Arquivo PDF para processar"
ja_JP: "処理するPDFファイル"
llm_description: "PDF file to process, output contains start/end/pages"
form: llm
fileTypes:
- "pdf"
extra:
python:
source: tools/pdf_column_range.py

View File

@@ -1,48 +0,0 @@
import json
from collections.abc import Generator
from typing import Any
import fitz # PyMuPDF
from dify_plugin import Tool
from dify_plugin.entities.tool import ToolInvokeMessage
class PdfExtractRangeTool(Tool):
    """Extract plain text from an inclusive, 0-based page range of a PDF."""

    def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
        """Extract text from pages ``start_page``..``end_page`` (inclusive).

        Out-of-range bounds are clamped to valid page indexes, and the end
        bound never precedes the start. Yields a text message (JSON string)
        and a JSON message with keys ``start``, ``end``, ``total_pages`` and
        ``text`` (pages joined by a page-break separator).
        """
        file = tool_parameters.get("file")
        if not file:
            yield self.create_text_message("Error: file is required")
            return
        start_page = int(tool_parameters.get("start_page", 0))
        end_page = int(tool_parameters.get("end_page", 0))
        # Open the PDF from the in-memory byte stream.
        pdf_bytes = file.blob
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        try:
            num_pages = len(doc)
            # Clamp both bounds to [0, num_pages - 1]; end >= start.
            start_page = max(0, min(start_page, num_pages - 1))
            end_page = max(start_page, min(end_page, num_pages - 1))
            # Extract text page by page; sort=True keeps reading order.
            page_texts = []
            for page_idx in range(start_page, end_page + 1):
                page = doc[page_idx]
                text = page.get_text("text", sort=True) or ""
                page_texts.append(text)
        finally:
            # Release the document even if extraction raises (the original
            # leaked the handle on error).
            doc.close()
        # Join all pages with the page-break separator.
        full_text = "\n\n--- 分页 ---\n\n".join(page_texts)
        result = {
            "start": start_page,
            "end": end_page,
            "total_pages": end_page - start_page + 1,
            "text": full_text,
        }
        yield self.create_text_message(json.dumps(result, ensure_ascii=False))
        yield self.create_json_message(result)

View File

@@ -1,68 +0,0 @@
identity:
name: "pdf_extract_range"
author: "yslg"
label:
en_US: "Extract Page Range Text"
zh_Hans: "提取页面范围文本"
pt_BR: "Extrair Texto do Intervalo de Páginas"
ja_JP: "ページ範囲テキスト抽出"
description:
human:
en_US: "Extract plain text from a specified page range of a PDF file"
zh_Hans: "从PDF文件的指定页码范围提取纯文本"
pt_BR: "Extrair texto simples de um intervalo de páginas especificado de um arquivo PDF"
ja_JP: "PDFファイルの指定ページ範囲からプレーンテキストを抽出"
llm: "Extract plain text from PDF pages in the given start-end range. Returns concatenated text of all pages in range."
parameters:
- name: file
type: file
required: true
label:
en_US: PDF File
zh_Hans: PDF 文件
pt_BR: Arquivo PDF
ja_JP: PDFファイル
human_description:
en_US: "PDF file to extract text from"
zh_Hans: "要提取文本的 PDF 文件"
pt_BR: "Arquivo PDF para extrair texto"
ja_JP: "テキストを抽出するPDFファイル"
llm_description: "PDF file to extract page range text from"
form: llm
fileTypes:
- "pdf"
- name: start_page
type: number
required: true
label:
en_US: Start Page
zh_Hans: 起始页码
pt_BR: Página Inicial
ja_JP: 開始ページ
human_description:
en_US: "Start page index (0-based)"
zh_Hans: "起始页码从0开始"
pt_BR: "Índice da página inicial (base 0)"
ja_JP: "開始ページ番号0始まり"
llm_description: "Start page index (0-based)"
form: llm
default: 0
- name: end_page
type: number
required: true
label:
en_US: End Page
zh_Hans: 结束页码
pt_BR: Página Final
ja_JP: 終了ページ
human_description:
en_US: "End page index (0-based, inclusive)"
zh_Hans: "结束页码从0开始包含该页"
pt_BR: "Índice da página final (base 0, inclusivo)"
ja_JP: "終了ページ番号0始まり、含む"
llm_description: "End page index (0-based, inclusive)"
form: llm
default: 0
extra:
python:
source: tools/pdf_extract_range.py

View File

@@ -1,45 +0,0 @@
import json
from collections.abc import Generator
from io import BytesIO
from typing import Any
import fitz # PyMuPDF 核心库
from dify_plugin import Tool
from dify_plugin.entities.tool import ToolInvokeMessage
class PdfSinglePageTool(Tool):
    """Extract text from a single page of a PDF."""

    def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
        """Extract text from the page given by the 0-based ``page`` parameter.

        Out-of-range indexes are clamped to [0, num_pages - 1]. Yields a text
        message (JSON string) and a JSON message with ``start``/``end`` (the
        clamped index) and ``pages`` (single-element list of page text).
        """
        file = tool_parameters.get("file")
        page = tool_parameters.get("page", 0)
        if not file:
            yield self.create_text_message("Error: file is required")
            return
        # Load the PDF from the in-memory byte stream (PyMuPDF).
        pdf_bytes = file.blob
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        try:
            num_pages = len(doc)
            # Clamp the requested index into the valid page range
            # (same semantics as the original two-step bounds check).
            page_index = max(0, min(int(page), num_pages - 1))
            # Extract the selected page's text.
            selected_page = doc[page_index]
            text = selected_page.get_text() or ""
        finally:
            # Release the document even if extraction raises (the original
            # leaked the handle on error).
            doc.close()
        result = {
            "start": page_index,
            "end": page_index,
            "pages": [text]
        }
        yield self.create_text_message(json.dumps(result, ensure_ascii=False))
        yield self.create_json_message(result)

View File

@@ -1,52 +0,0 @@
identity:
name: "pdf_single_page"
author: "yslg"
label:
en_US: "Extract Single-Page Text"
zh_Hans: "提取单页文字"
pt_BR: "Extrair texto de página única"
ja_JP: "単一ページのテキストを抽出"
description:
human:
en_US: "Extract text from one specified page"
zh_Hans: "提取指定单页文字"
pt_BR: "Extrair texto de uma página especificada"
ja_JP: "指定した1ページのテキストを抽出"
llm: "Extract text from one specified page"
parameters:
- name: file
type: file
required: true
label:
en_US: PDF File
zh_Hans: PDF 文件
pt_BR: Arquivo PDF
ja_JP: PDFファイル
human_description:
en_US: "PDF file to process"
zh_Hans: "要处理的 PDF 文件"
pt_BR: "Arquivo PDF para processar"
ja_JP: "処理するPDFファイル"
llm_description: "PDF file to process"
form: llm
fileTypes:
- "pdf"
- name: page
type: number
required: true
label:
en_US: Page Index
zh_Hans: 页码
pt_BR: Índice da Página
ja_JP: ページ番号
human_description:
en_US: "Single page index to extract"
zh_Hans: "要提取的单页页码"
pt_BR: "Índice da página única para extrair"
ja_JP: "抽出対象のページ番号"
llm_description: "Single page index to extract"
form: llm
default: 0
extra:
python:
source: tools/pdf_single_page.py

View File

@@ -1,209 +0,0 @@
import json
import re
from collections.abc import Generator
from typing import Any
import fitz
from dify_plugin import Tool
from dify_plugin.entities.model.llm import LLMModelConfig
from dify_plugin.entities.model.message import SystemPromptMessage, UserPromptMessage
from dify_plugin.entities.tool import ToolInvokeMessage
class PdfSummaryTool(Tool):
    """Fast PDF page summary tool.

    Default behavior is optimized for throughput in large workflows:
    - Extract plain text and lightweight table data only.
    - Skip expensive image base64 and drawing path extraction.
    - Skip LLM by default unless `use_llm=true` is explicitly passed.
    """

    def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
        """Extract text/tables from a page range and yield a summary.

        Yields a JSON message with the structured per-page data first, then a
        text message with the locally built (or optionally LLM-refined)
        Markdown summary.
        """
        file = tool_parameters.get("file")
        if not file:
            yield self.create_text_message("Error: file is required")
            return
        start_page = self._to_int(tool_parameters.get("pdf_start_page"), 0)
        end_page = self._to_int(tool_parameters.get("pdf_end_page"), 0)
        model_config = tool_parameters.get("model")
        use_llm = self._to_bool(tool_parameters.get("use_llm"), False)
        # Clamp the per-page character budget to a sane window.
        max_chars_per_page = self._to_int(tool_parameters.get("max_chars_per_page"), 6000)
        max_chars_per_page = max(800, min(max_chars_per_page, 20000))
        llm_prompt = tool_parameters.get(
            "llm_prompt",
            "请基于输入的PDF页面文本做简洁准确摘要输出中文要点。不要输出思考过程。",
        )
        pdf_bytes = file.blob
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        try:
            num_pages = len(doc)
            # Clamp the requested range to valid 0-based indexes; end >= start.
            start_page = max(0, min(start_page, num_pages - 1))
            end_page = max(start_page, min(end_page, num_pages - 1))
            pages_data: list[dict[str, Any]] = []
            for page_idx in range(start_page, end_page + 1):
                page = doc[page_idx]
                page_data = self._extract_page_fast(page, page_idx, max_chars_per_page)
                pages_data.append(page_data)
            result = {
                "total_pages_extracted": len(pages_data),
                "page_range": {"start": start_page, "end": end_page},
                "pages": pages_data,
            }
            yield self.create_json_message(result)
            # Fast local summary first (deterministic, no model latency).
            local_text = self._build_local_summary(pages_data)
            # Optional LLM refinement, explicitly enabled only.
            if use_llm and model_config:
                refined = self._summarize_with_llm(local_text, llm_prompt, model_config)
                final_text = refined if refined else local_text
            else:
                final_text = local_text
            if final_text:
                yield self.create_text_message(final_text)
        finally:
            doc.close()

    def _extract_page_fast(self, page: fitz.Page, page_idx: int, max_chars_per_page: int) -> dict[str, Any]:
        """Extract text plus lightweight table data from one page.

        Image/drawing/block fields are intentionally left empty to keep the
        tool fast in large workflows.
        """
        text = (page.get_text("text") or "").strip()
        if len(text) > max_chars_per_page:
            # Cap very long pages so downstream prompts stay bounded.
            text = text[:max_chars_per_page] + "\n...[truncated]"
        tables: list[dict[str, Any]] = []
        try:
            tabs = page.find_tables()
            # Keep at most 3 tables, and at most 10 rows of cells per table.
            for tab_idx, tab in enumerate(tabs.tables[:3]):
                cells = tab.extract() or []
                tables.append(
                    {
                        "index": tab_idx,
                        "rows": tab.row_count,
                        "cols": tab.col_count,
                        "cells": cells[:10],
                    }
                )
        except Exception:
            # Table detection is best-effort; ignore parser failures.
            pass
        return {
            "page_number": page_idx,
            "text": text,
            "tables": tables,
            "images": [],
            "drawings_summary": [],
            "text_blocks": [],
            "width": float(page.rect.width),
            "height": float(page.rect.height),
        }

    def _build_local_summary(self, pages_data: list[dict[str, Any]]) -> str:
        """Output actual page content as Markdown (text + tables).

        No LLM needed downstream — the text is already usable Markdown.
        """
        parts: list[str] = []
        for page in pages_data:
            text = (page.get("text") or "").strip()
            tables = page.get("tables") or []
            page_parts: list[str] = []
            if text:
                page_parts.append(text)
            for tab in tables:
                cells = tab.get("cells") or []
                # Need at least a header row plus one data row.
                if len(cells) >= 2:
                    md = self._cells_to_md_table(cells)
                    if md:
                        page_parts.append(md)
            if page_parts:
                parts.append("\n\n".join(page_parts))
        return "\n\n--- 分页 ---\n\n".join(parts)

    @staticmethod
    def _cells_to_md_table(cells: list) -> str:
        """Render table cells (list of rows) as a Markdown table string."""
        if not cells:
            return ""
        header = cells[0]
        ncols = len(header)
        if ncols == 0:
            return ""

        # Named function instead of a lambda bound to a name (PEP 8 E731);
        # escapes pipes and flattens newlines so cells stay on one row.
        def clean(value: Any) -> str:
            return str(value or "").replace("|", "\\|").replace("\n", " ").strip()

        lines = [
            "| " + " | ".join(clean(c) for c in header) + " |",
            "| " + " | ".join("---" for _ in range(ncols)) + " |",
        ]
        for row in cells[1:]:
            # Pad short rows so every line has exactly ncols cells.
            padded = list(row) + [""] * max(0, ncols - len(row))
            lines.append("| " + " | ".join(clean(c) for c in padded[:ncols]) + " |")
        return "\n".join(lines)

    def _summarize_with_llm(self, local_text: str, llm_prompt: str, model_config: dict[str, Any]) -> str:
        """Refine the local summary with the configured LLM (non-streaming).

        Returns the cleaned model answer, or "" when nothing usable came back.
        """
        response = self.session.model.llm.invoke(
            model_config=LLMModelConfig(**model_config),
            prompt_messages=[
                SystemPromptMessage(content=llm_prompt),
                UserPromptMessage(content=local_text),
            ],
            stream=False,
        )
        llm_text = ""
        if hasattr(response, "message") and response.message:
            content = response.message.content
            if isinstance(content, str):
                llm_text = content
            elif isinstance(content, list):
                # Content may be a list of parts; join their textual payloads.
                llm_text = "".join(
                    item.data if hasattr(item, "data") else str(item)
                    for item in content
                )
        return self._extract_visible_answer(llm_text)

    @staticmethod
    def _extract_visible_answer(text: str) -> str:
        """Strip model reasoning/control markup, keeping only the visible answer."""
        if not text:
            return ""
        # Prefer the boxed answer when the model emitted one.
        box_match = re.search(r"<\|begin_of_box\|>([\s\S]*?)<\|end_of_box\|>", text)
        if box_match:
            text = box_match.group(1)
        else:
            # Otherwise drop <think> blocks and any <|...|> control tokens.
            text = re.sub(r"<think>[\s\S]*?</think>", "", text, flags=re.IGNORECASE)
            text = re.sub(r"<\|[^>]+\|>", "", text)
        return text.strip()

    @staticmethod
    def _to_int(value: Any, default: int) -> int:
        """Best-effort int conversion; return ``default`` on None/""/error."""
        try:
            if value is None or value == "":
                return default
            return int(value)
        except Exception:
            return default

    @staticmethod
    def _to_bool(value: Any, default: bool) -> bool:
        """Best-effort bool conversion of bools and common truthy/falsy strings."""
        if value is None:
            return default
        if isinstance(value, bool):
            return value
        s = str(value).strip().lower()
        if s in {"1", "true", "yes", "on"}:
            return True
        if s in {"0", "false", "no", "off"}:
            return False
        return default

View File

@@ -1,99 +0,0 @@
identity:
name: "pdf_summary"
author: "yslg"
label:
en_US: "PDF Page Summary"
zh_Hans: "PDF页面概述"
pt_BR: "Resumo de Página PDF"
ja_JP: "PDFページ概要"
description:
human:
en_US: "Extract core elements (text, image, table, path) from PDF pages with coordinates, then summarize via LLM"
zh_Hans: "提取PDF页面核心元素文本、图片、表格、路径及坐标并通过LLM进行概述"
pt_BR: "Extrair elementos principais (texto, imagem, tabela, caminho) de páginas PDF com coordenadas e resumir via LLM"
ja_JP: "PDFページからコア要素テキスト、画像、テーブル、パスを座標付きで抽出し、LLMで要約"
llm: "Extract core elements (text, image, table, drawing path) with coordinates from specified PDF page range, then use LLM to summarize the content"
parameters:
- name: file
type: file
required: true
label:
en_US: PDF File
zh_Hans: PDF 文件
pt_BR: Arquivo PDF
ja_JP: PDFファイル
human_description:
en_US: "PDF file to process"
zh_Hans: "要处理的 PDF 文件"
pt_BR: "Arquivo PDF para processar"
ja_JP: "処理するPDFファイル"
llm_description: "PDF file to extract elements from and summarize"
form: llm
fileTypes:
- "pdf"
- name: pdf_start_page
type: number
required: true
label:
en_US: Start Page
zh_Hans: 起始页码
pt_BR: Página Inicial
ja_JP: 開始ページ
human_description:
en_US: "Start page index (0-based)"
zh_Hans: "起始页码从0开始"
pt_BR: "Índice da página inicial (base 0)"
ja_JP: "開始ページ番号0始まり"
llm_description: "Start page index (0-based) for element extraction"
form: llm
default: 0
- name: pdf_end_page
type: number
required: true
label:
en_US: End Page
zh_Hans: 结束页码
pt_BR: Página Final
ja_JP: 終了ページ
human_description:
en_US: "End page index (0-based, inclusive)"
zh_Hans: "结束页码从0开始包含该页"
pt_BR: "Índice da página final (base 0, inclusivo)"
ja_JP: "終了ページ番号0始まり、含む"
llm_description: "End page index (0-based, inclusive) for element extraction"
form: llm
default: 0
- name: model
type: model-selector
scope: llm
required: true
label:
en_US: LLM Model
zh_Hans: LLM 模型
pt_BR: Modelo LLM
ja_JP: LLMモデル
human_description:
en_US: "LLM model used for summarizing extracted content"
zh_Hans: "用于概述提取内容的 LLM 模型"
pt_BR: "Modelo LLM usado para resumir o conteúdo extraído"
ja_JP: "抽出内容の要約に使用するLLMモデル"
form: form
- name: llm_prompt
type: string
required: false
label:
en_US: LLM Prompt
zh_Hans: LLM 提示词
pt_BR: Prompt do LLM
ja_JP: LLMプロンプト
human_description:
en_US: "System prompt for LLM summarization"
zh_Hans: "LLM 概述的系统提示词"
pt_BR: "Prompt do sistema para resumo LLM"
ja_JP: "LLM要約用のシステムプロンプト"
llm_description: "System prompt guiding LLM on how to summarize the extracted PDF content"
form: form
default: "你是一个专业的文档分析助手。请根据以下从PDF页面中提取的结构化内容包含文本、图片信息、表格和矢量图形对每页内容进行准确、简洁的概述。"
extra:
python:
source: tools/pdf_summary.py

View File

@@ -1,6 +1,5 @@
import base64
import json
import re
from collections import OrderedDict
from collections.abc import Generator
from typing import Any
@@ -10,306 +9,219 @@ from dify_plugin.entities.tool import ToolInvokeMessage
class PdfToMarkdownTool(Tool):
"""Convert PDF to a single Markdown file. No LLM needed.
- Auto-detect TOC and organize content by chapters.
- Extract text and tables as Markdown.
- Embed raster images as base64.
- Render vector drawings as base64 PNG.
- Output one .md file via create_blob_message.
"""
_TOC_PATTERNS = [
r"目录", r"目 录", r"\u3000录",
r"Table of Contents", r"Contents", r"目次",
]
# ── entry point ──────────────────────────────────────────
"""Convert PDF to Markdown using an external catalog array."""
def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
file = tool_parameters.get("file")
catalog_text = (tool_parameters.get("catalog") or "").strip()
if not file:
yield self.create_text_message("Error: file is required")
return
if not catalog_text:
yield self.create_text_message("Error: catalog is required")
return
include_images = self._to_bool(tool_parameters.get("include_images"), True)
image_dpi = self._to_int(tool_parameters.get("image_dpi"), 150)
image_dpi = max(72, min(image_dpi, 300))
max_image_bytes = 2 * 1024 * 1024 # skip images > 2 MB raw
catalog = self._parse_catalog(catalog_text)
if not catalog:
yield self.create_text_message("Error: catalog must be a JSON array with title and page indexes")
return
doc = fitz.open(stream=file.blob, filetype="pdf")
try:
num_pages = len(doc)
hf_texts = self._detect_headers_footers(doc, num_pages)
page_mds = [self._page_to_markdown(doc[index], hf_texts) for index in range(num_pages)]
final_md = self._assemble_by_catalog(catalog, page_mds, num_pages)
# 1) Build chapter map (metadata TOC → printed TOC → none)
chapters, content_offset = self._build_chapter_map(doc, num_pages)
# 2) Convert every page
page_mds: list[str] = []
for idx in range(num_pages):
md = self._page_to_markdown(
doc, doc[idx], idx,
include_images, image_dpi, max_image_bytes,
)
page_mds.append(md)
# 3) Assemble
if chapters:
final_md = self._assemble_by_chapters(
chapters, page_mds, content_offset, num_pages,
)
else:
final_md = "\n\n---\n\n".join(m for m in page_mds if m.strip())
# 4) Output: text (for variable aggregation) + blob (.md file)
yield self.create_text_message(final_md)
md_bytes = final_md.encode("utf-8")
yield self.create_blob_message(
blob=md_bytes,
blob=final_md.encode("utf-8"),
meta={"mime_type": "text/markdown"},
)
finally:
doc.close()
# ── chapter detection ────────────────────────────────────
def _parse_catalog(self, catalog_text: str) -> list[dict[str, Any]]:
try:
raw = json.loads(catalog_text)
except Exception:
return []
def _build_chapter_map(
self, doc: fitz.Document, num_pages: int,
) -> tuple[dict, int]:
"""Return (chapters_dict, content_offset).
if not isinstance(raw, list):
return []
Try embedded PDF TOC metadata first (reliable page mapping).
Fall back to scanning printed TOC pages.
"""
toc = doc.get_toc()
if toc:
chapters = self._chapters_from_metadata(toc, num_pages)
if chapters:
return chapters, 0
result: list[dict[str, Any]] = []
for item in raw:
if not isinstance(item, dict):
continue
toc_start, toc_end = self._find_toc_pages(doc, num_pages)
if toc_start is not None and toc_end is not None:
toc_text = "\n".join(
doc[i].get_text() or "" for i in range(toc_start, toc_end + 1)
title = str(item.get("title") or "").strip() or "Untitled"
start_index = self._to_int(item.get("page_start_index"), None)
end_index = self._to_int(item.get("page_end_index"), start_index)
if start_index is None:
start = self._to_int(item.get("start"), None)
end = self._to_int(item.get("end"), start)
if start is None:
continue
start_index = max(0, start - 1)
end_index = max(start_index, (end if end is not None else start) - 1)
if end_index is None:
end_index = start_index
result.append(
{
"title": title,
"page_start_index": max(0, start_index),
"page_end_index": max(start_index, end_index),
}
)
chapters = self._parse_toc_lines(toc_text)
if chapters:
offset = self._guess_offset(chapters, toc_end)
return chapters, offset
return result
return {}, 0
def _detect_headers_footers(self, doc: fitz.Document, num_pages: int) -> set[str]:
margin_ratio = 0.08
sample_count = min(num_pages, 30)
text_counts: dict[str, int] = {}
def _chapters_from_metadata(
self, toc: list, num_pages: int,
) -> dict[str, dict[str, int]]:
top = [(t, max(0, p - 1)) for lvl, t, p in toc if lvl <= 2 and p >= 1]
if not top:
return {}
chapters: dict[str, dict[str, int]] = OrderedDict()
for i, (title, start) in enumerate(top):
end = top[i + 1][1] - 1 if i + 1 < len(top) else num_pages - 1
chapters[title] = {"start": start, "end": max(start, end)}
return chapters
def _find_toc_pages(self, doc, num_pages):
toc_start = toc_end = None
for pn in range(min(num_pages, 30)):
text = doc[pn].get_text() or ""
if any(re.search(p, text, re.IGNORECASE) for p in self._TOC_PATTERNS):
if toc_start is None:
toc_start = pn
toc_end = pn
elif toc_start is not None:
break
return toc_start, toc_end
def _parse_toc_lines(self, text: str) -> dict[str, dict[str, int]]:
m = re.search(
r"^(List\s+of\s+Figures|List\s+of\s+Tables|图目录|表目录)",
text, re.IGNORECASE | re.MULTILINE,
)
if m:
text = text[: m.start()]
pat = re.compile(
r"^\s*(?P<title>.+?)\s*(?:\.{2,}|\s)\s*(?P<page>\d{1,5})\s*$"
)
entries: list[tuple[str, int]] = []
for raw in text.splitlines():
line = raw.strip()
if not line or len(line) < 3 or re.fullmatch(r"\d+", line):
for idx in range(sample_count):
page = doc[idx]
page_height = page.rect.height
top_limit = page_height * margin_ratio
bottom_limit = page_height * (1 - margin_ratio)
try:
blocks = page.get_text("blocks", sort=True) or []
except Exception:
continue
m2 = pat.match(line)
if not m2:
seen: set[str] = set()
for block in blocks:
if len(block) < 7 or block[6] != 0:
continue
title = re.sub(r"\s+", " ", m2.group("title")).strip("-_: ")
page = self._to_int(m2.group("page"), None)
if not title or page is None or len(title) <= 1:
y0, y1 = block[1], block[3]
text = (block[4] or "").strip()
if not text or len(text) < 2 or text in seen:
continue
if title.lower() in {"page", "pages", "目录", "contents"}:
continue
entries.append((title, page))
if y1 <= top_limit or y0 >= bottom_limit:
seen.add(text)
text_counts[text] = text_counts.get(text, 0) + 1
if not entries:
return {}
threshold = max(3, sample_count * 0.35)
return {text for text, count in text_counts.items() if count >= threshold}
dedup: OrderedDict[str, int] = OrderedDict()
for t, p in entries:
dedup.setdefault(t, p)
titles = list(dedup.keys())
pages = [dedup[t] for t in titles]
catalog: dict[str, dict[str, int]] = OrderedDict()
for i, t in enumerate(titles):
s = pages[i]
e = max(s, pages[i + 1] - 1) if i + 1 < len(pages) else s
catalog[t] = {"start": s, "end": e}
return catalog
@staticmethod
def _guess_offset(chapters: dict, toc_end: int) -> int:
first_page = None
for info in chapters.values():
s = info["start"]
if first_page is None or s < first_page:
first_page = s
if first_page is None:
return 0
return (toc_end + 1) - first_page
# ── per-page conversion ──────────────────────────────────
def _page_to_markdown(
self,
doc: fitz.Document,
page: fitz.Page,
page_idx: int,
include_images: bool,
image_dpi: int,
max_image_bytes: int,
) -> str:
def _page_to_markdown(self, page: fitz.Page, hf_texts: set[str]) -> str:
parts: list[str] = []
page_height = page.rect.height
top_margin = page_height * 0.06
bottom_margin = page_height * 0.94
table_rects: list[fitz.Rect] = []
table_mds: list[str] = []
try:
find_tables = getattr(page, "find_tables", None)
tables = []
if callable(find_tables):
table_finder = find_tables()
tables = getattr(table_finder, "tables", []) or []
for table in tables[:5]:
try:
table_rects.append(fitz.Rect(table.bbox))
except Exception:
pass
cells = table.extract() or []
if len(cells) < 2:
continue
if hf_texts and len(cells) <= 3:
flat = " ".join(str(cell or "") for row in cells for cell in row)
if any(hf in flat for hf in hf_texts):
continue
md_table = self._cells_to_md_table(cells)
if md_table:
table_mds.append(md_table)
except Exception:
pass
try:
blocks = page.get_text("blocks", sort=True) or []
except Exception:
blocks = []
for block in blocks:
if len(block) < 7 or block[6] != 0:
continue
x0, y0, x1, y1 = block[:4]
text = (block[4] or "").strip()
if not text:
continue
block_rect = fitz.Rect(x0, y0, x1, y1)
if any(self._rects_overlap(block_rect, table_rect) for table_rect in table_rects):
continue
if hf_texts and (y1 <= top_margin or y0 >= bottom_margin):
if any(hf in text for hf in hf_texts):
continue
if re.fullmatch(r"\s*\d{1,4}\s*", text):
continue
# ── text ──
text = (page.get_text("text", sort=True) or "").strip()
if text:
parts.append(text)
# ── tables → Markdown ──
try:
for tab in (page.find_tables().tables or [])[:5]:
cells = tab.extract() or []
if len(cells) >= 2:
md = self._cells_to_md_table(cells)
if md:
parts.append(md)
except Exception:
pass
if not include_images:
parts.extend(table_mds)
return "\n\n".join(parts)
# ── embedded raster images ──
try:
for img_idx, img_info in enumerate(page.get_images(full=True)):
xref = img_info[0]
try:
data = doc.extract_image(xref)
if not data or not data.get("image"):
continue
raw = data["image"]
if len(raw) > max_image_bytes:
continue
# skip tiny icons (< 20x20)
w = data.get("width", 0)
h = data.get("height", 0)
if w < 20 and h < 20:
continue
ext = data.get("ext", "png")
mime = "image/jpeg" if ext in ("jpg", "jpeg") else f"image/{ext}"
b64 = base64.b64encode(raw).decode("ascii")
parts.append(
f"![img-p{page_idx}-{img_idx}](data:{mime};base64,{b64})"
)
except Exception:
pass
except Exception:
pass
# ── vector drawings → render as PNG ──
try:
drawings = page.get_drawings()
if len(drawings) >= 3:
valid_rects: list[fitz.Rect] = []
for d in drawings:
r = d.get("rect")
if r:
try:
rect = fitz.Rect(r)
if rect.is_valid and not rect.is_empty:
valid_rects.append(rect)
except Exception:
pass
if valid_rects:
bbox = valid_rects[0]
for r in valid_rects[1:]:
bbox |= r
bbox &= page.rect
if bbox.width > 30 and bbox.height > 30:
scale = image_dpi / 72
mat = fitz.Matrix(scale, scale)
pix = page.get_pixmap(matrix=mat, clip=bbox)
png = pix.tobytes("png")
if len(png) <= max_image_bytes:
b64 = base64.b64encode(png).decode("ascii")
parts.append(
f"![drawing-p{page_idx}](data:image/png;base64,{b64})"
)
except Exception:
pass
return "\n\n".join(parts)
# ── assembly ─────────────────────────────────────────────
def _assemble_by_chapters(
self,
chapters: dict[str, dict[str, int]],
page_mds: list[str],
offset: int,
num_pages: int,
) -> str:
def _assemble_by_catalog(self, catalog: list[dict[str, Any]], page_mds: list[str], num_pages: int) -> str:
parts: list[str] = []
for name, info in chapters.items():
s = info["start"] + offset
e = info["end"] + offset
s = max(0, min(s, num_pages - 1))
e = max(s, min(e, num_pages - 1))
ch: list[str] = [f"# {name}\n"]
for idx in range(s, e + 1):
if idx < len(page_mds) and page_mds[idx].strip():
ch.append(page_mds[idx])
parts.append("\n\n".join(ch))
return "\n\n---\n\n".join(parts)
used_pages: set[int] = set()
# ── helpers ──────────────────────────────────────────────
for item in catalog:
start = max(0, min(int(item["page_start_index"]), num_pages - 1))
end = max(start, min(int(item["page_end_index"]), num_pages - 1))
chapter_parts = [f"# {item['title']}\n"]
for idx in range(start, end + 1):
if idx < len(page_mds) and page_mds[idx].strip() and idx not in used_pages:
chapter_parts.append(page_mds[idx])
used_pages.add(idx)
if len(chapter_parts) > 1:
parts.append("\n\n".join(chapter_parts))
if parts:
return "\n\n---\n\n".join(parts)
return "\n\n---\n\n".join(m for m in page_mds if m.strip())
@staticmethod
def _rects_overlap(block_rect: fitz.Rect, table_rect: fitz.Rect) -> bool:
inter = block_rect & table_rect
if inter.is_empty:
return False
block_area = block_rect.width * block_rect.height
if block_area <= 0:
return False
return (inter.width * inter.height) / block_area >= 0.3
@staticmethod
def _cells_to_md_table(cells: list) -> str:
if not cells:
return ""
header = cells[0]
ncols = len(header)
if ncols == 0:
return ""
clean = lambda c: str(c or "").replace("|", "\\|").replace("\n", " ").strip()
def clean(value: Any) -> str:
return str(value or "").replace("|", "\\|").replace("\n", " ").strip()
lines = [
"| " + " | ".join(clean(c) for c in header) + " |",
"| " + " | ".join(clean(cell) for cell in header) + " |",
"| " + " | ".join("---" for _ in range(ncols)) + " |",
]
for row in cells[1:]:
padded = list(row) + [""] * max(0, ncols - len(row))
lines.append("| " + " | ".join(clean(c) for c in padded[:ncols]) + " |")
lines.append("| " + " | ".join(clean(cell) for cell in padded[:ncols]) + " |")
return "\n".join(lines)
@staticmethod
@@ -320,16 +232,3 @@ class PdfToMarkdownTool(Tool):
return int(value)
except Exception:
return default
@staticmethod
def _to_bool(value: Any, default: bool) -> bool:
if value is None:
return default
if isinstance(value, bool):
return value
s = str(value).strip().lower()
if s in {"1", "true", "yes", "on"}:
return True
if s in {"0", "false", "no", "off"}:
return False
return default

View File

@@ -1,68 +1,51 @@
identity:
identity:
name: "pdf_to_markdown"
author: "yslg"
label:
en_US: "PDF to Markdown"
zh_Hans: "PDFMarkdown"
zh_Hans: "PDF to Markdown"
pt_BR: "PDF para Markdown"
ja_JP: "PDFからMarkdown"
ja_JP: "PDF to Markdown"
description:
human:
en_US: "Convert PDF to a single Markdown file with embedded base64 images. No LLM needed."
zh_Hans: "将PDF转换为单个Markdown文件图片以base64嵌入无需大模型"
pt_BR: "Converter PDF em um arquivo Markdown com imagens base64 incorporadas. Sem LLM."
ja_JP: "PDFをbase64画像埋め込みの単一Markdownファイルに変換。LLM不要。"
llm: "Convert a PDF file into a single Markdown (.md) file. Extracts text, tables, images (base64), and vector drawings. Auto-detects TOC and organizes by chapters. No LLM needed."
en_US: "Convert PDF to Markdown using a catalog array. Images and graphics are ignored."
zh_Hans: "Convert PDF to Markdown using a catalog array. Images and graphics are ignored."
pt_BR: "Convert PDF to Markdown using a catalog array. Images and graphics are ignored."
ja_JP: "Convert PDF to Markdown using a catalog array. Images and graphics are ignored."
llm: "Convert a PDF file into Markdown using a catalog JSON array. Ignore images and graphics."
parameters:
- name: file
type: file
required: true
label:
en_US: PDF File
zh_Hans: PDF 文件
pt_BR: Arquivo PDF
ja_JP: PDFファイル
zh_Hans: PDF File
pt_BR: PDF File
ja_JP: PDF File
human_description:
en_US: "PDF file to convert"
zh_Hans: "要转换的 PDF 文件"
pt_BR: "Arquivo PDF para converter"
ja_JP: "変換するPDFファイル"
zh_Hans: "PDF file to convert"
pt_BR: "PDF file to convert"
ja_JP: "PDF file to convert"
llm_description: "PDF file to convert to Markdown"
form: llm
fileTypes:
- "pdf"
- name: include_images
type: boolean
required: false
- name: catalog
type: string
required: true
label:
en_US: Include Images
zh_Hans: 包含图片
pt_BR: Incluir Imagens
ja_JP: 画像を含める
en_US: Catalog JSON
zh_Hans: Catalog JSON
pt_BR: Catalog JSON
ja_JP: Catalog JSON
human_description:
en_US: "Whether to embed images as base64 in the Markdown output (default: true)"
zh_Hans: "是否将图片以base64嵌入Markdown输出默认"
pt_BR: "Se deve incorporar imagens como base64 na saída Markdown (padrão: verdadeiro)"
ja_JP: "Markdown出力にbase64として画像を埋め込むかどうかデフォルトはい"
llm_description: "Set to true to embed images as base64, false to skip images"
form: form
default: true
- name: image_dpi
type: number
required: false
label:
en_US: Image DPI
zh_Hans: 图片DPI
pt_BR: DPI da Imagem
ja_JP: 画像DPI
human_description:
en_US: "DPI for rendering vector drawings (72-300, default: 150)"
zh_Hans: "矢量图渲染DPI72-300默认150"
pt_BR: "DPI para renderizar desenhos vetoriais (72-300, padrão: 150)"
ja_JP: "ベクター描画のレンダリングDPI72-300、デフォルト150"
llm_description: "Resolution for rendering vector drawings as images. Range 72-300, default 150."
form: form
default: 150
en_US: "Catalog JSON array like [{title,start,end,page_start_index,page_end_index}]"
zh_Hans: "Catalog JSON array like [{title,start,end,page_start_index,page_end_index}]"
pt_BR: "Catalog JSON array like [{title,start,end,page_start_index,page_end_index}]"
ja_JP: "Catalog JSON array like [{title,start,end,page_start_index,page_end_index}]"
llm_description: "Catalog JSON array returned by pdf_toc"
form: llm
extra:
python:
source: tools/pdf_to_markdown.py

View File

@@ -4,264 +4,303 @@ from collections import OrderedDict
from collections.abc import Generator
from typing import Any
import fitz
from dify_plugin import Tool
from dify_plugin.entities.model.llm import LLMModelConfig
from dify_plugin.entities.model.message import SystemPromptMessage, UserPromptMessage
from dify_plugin.entities.tool import ToolInvokeMessage
# Prompt for the legacy TOC-text parser: asks the LLM to map chapter names to
# printed page ranges and return a bare JSON object (no fences, no prose).
_SYSTEM_PROMPT = """You parse PDF table-of-contents text.
Return only valid JSON object, no markdown fences, no explanation.
Output schema:
{
"Chapter Name": {"start": 1, "end": 5},
"Another": {"start": 6, "end": 20}
}
Rules:
- start/end are integer printed page numbers from TOC.
- If end is unknown, use same value as start.
- Keep chapter names exactly as in TOC text.
"""
# Chinese-language prompt for TOC extraction from raw page text: the LLM must
# return a bare JSON array of {"title", "page"} entries ([] when no TOC found).
# NOTE: runtime string sent to the model — intentionally left untranslated.
_TOC_SYSTEM_PROMPT = """你是专业的PDF目录解析助手。请从以下PDF文本中提取文档的目录/章节结构。
要求:
1. 识别所有一级和二级标题及其对应的页码
2. 只返回纯JSON数组不要markdown代码块不要任何解释
3. 格式: [{"title": "章节标题", "page": 页码数字}]
4. 页码必须是文档中标注的实际页码数字
5. 如果无法识别目录,返回空数组 []"""
class PdfTocTool(Tool):
    """Tool that extracts a structured catalog (table of contents) from a PDF."""

    # Regex patterns used to recognize a table-of-contents page by its heading.
    # NOTE(review): the second and third entries appeared truncated in the diff
    # (the leading 目 character was lost, leaving over-broad patterns that match
    # any 录); restored here to match "目 录" with ASCII whitespace or an
    # ideographic space (U+3000) between the characters — confirm against the
    # pre-diff source.
    _TOC_PATTERNS = [
        r"目录",
        r"目\s*录",
        r"目\u3000录",
        r"Table of Contents",
        r"Contents",
        r"目次",
    ]
def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
toc_start = self._to_int(tool_parameters.get("toc_start"), None)
toc_end = self._to_int(tool_parameters.get("toc_end"), None)
toc_pages = (tool_parameters.get("toc_pages") or "").strip()
file = tool_parameters.get("file")
if not file:
yield self.create_text_message("Error: file is required")
return
model_config = tool_parameters.get("model")
if toc_start is None or toc_end is None:
yield self.create_text_message("Error: toc_start and toc_end are required")
return
doc = fitz.open(stream=file.blob, filetype="pdf")
try:
num_pages = len(doc)
if not toc_pages:
yield self.create_text_message("Error: toc_pages text is empty")
return
# 1) 优先从PDF元数据提取目录
catalog = self._catalog_from_metadata(doc.get_toc(), num_pages)
cleaned = self._strip_index_lists(toc_pages)
# 1) deterministic parser first
catalog = self._parse_toc_lines(cleaned)
# 2) optional LLM fallback/enhance only when deterministic parser gives no result
llm_raw_output = ""
llm_error = None
# 2) 元数据无目录时使用LLM解析
if not catalog and model_config:
llm_catalog, llm_raw_output, llm_error = self._parse_with_llm(
toc_start=toc_start,
toc_end=toc_end,
toc_pages=cleaned,
model_config=model_config,
catalog = self._extract_toc_with_llm(doc, num_pages, model_config)
# 3) 无LLM配置时回退到正则解析
if not catalog:
toc_start, toc_end = self._find_toc_pages(doc, num_pages)
if toc_start is not None and toc_end is not None:
toc_text = "\n".join(
doc[index].get_text() or "" for index in range(toc_start, toc_end + 1)
)
if llm_catalog:
catalog = self._normalize_catalog(llm_catalog)
printed_catalog = self._parse_toc_lines(toc_text)
catalog = self._attach_page_indexes(printed_catalog, toc_end, num_pages)
result: dict[str, Any] = {
"toc_start": toc_start,
"toc_end": toc_end,
"catalog": catalog,
"meta": {
"catalog_size": len(catalog),
"parser": "rule" if catalog else "none",
},
}
if not catalog:
catalog = []
if llm_raw_output:
result["meta"]["llm_used"] = True
if llm_error:
result["meta"]["llm_error"] = llm_error
yield self.create_text_message(json.dumps(catalog, ensure_ascii=False))
finally:
doc.close()
# always return valid json text payload for downstream json.loads
yield self.create_text_message(json.dumps(result, ensure_ascii=False))
yield self.create_json_message(result)
def _extract_toc_with_llm(
self, doc: fitz.Document, num_pages: int, model_config: dict[str, Any]
) -> list[dict[str, int | str]]:
# 先尝试定位目录页
toc_start, toc_end = self._find_toc_pages(doc, num_pages)
def _parse_with_llm(
self,
toc_start: int,
toc_end: int,
toc_pages: str,
model_config: dict[str, Any],
) -> tuple[dict[str, Any] | None, str, str | None]:
user_content = (
f"TOC page index range: {toc_start}..{toc_end}\n\n"
f"TOC raw text:\n{toc_pages}"
if toc_start is not None and toc_end is not None:
# 有目录页,提取目录页文本
toc_text = "\n".join(
doc[index].get_text() or "" for index in range(toc_start, toc_end + 1)
)
content_offset = toc_end
else:
# 无目录页提取前15页文本让LLM识别章节结构
sample = min(num_pages, 15)
toc_text = "\n\n--- 第{}页 ---\n".join(
[""] + [doc[i].get_text() or "" for i in range(sample)]
)
toc_text = toc_text.strip()
if not toc_text:
return []
content_offset = 0
# 截断过长文本
if len(toc_text) > 15000:
toc_text = toc_text[:15000] + "\n...[截断]"
try:
response = self.session.model.llm.invoke(
model_config=LLMModelConfig(**model_config),
prompt_messages=[
SystemPromptMessage(content=_SYSTEM_PROMPT),
UserPromptMessage(content=user_content),
SystemPromptMessage(content=_TOC_SYSTEM_PROMPT),
UserPromptMessage(content=toc_text),
],
stream=False,
)
llm_text = ""
if hasattr(response, "message") and response.message:
content = response.message.content
if isinstance(content, str):
llm_text = content
elif isinstance(content, list):
llm_text = "".join(
item.data if hasattr(item, "data") else str(item) for item in content
)
llm_text = self._get_response_text(response)
if not llm_text:
return []
parsed = self._extract_json_object(llm_text)
if parsed is None:
return None, llm_text, "Failed to parse LLM output as JSON"
if not isinstance(parsed, dict):
return None, llm_text, "LLM output JSON is not an object"
raw_catalog = self._parse_llm_json(llm_text)
if not raw_catalog:
return []
return parsed, llm_text, None
# 转换LLM返回的简单格式为完整catalog
return self._build_catalog_from_llm(raw_catalog, content_offset, num_pages)
except Exception:
return []
def _build_catalog_from_llm(
    self, raw: list[dict], content_offset: int, num_pages: int
) -> list[dict[str, int | str]]:
    """Convert LLM-extracted ``[{"title", "page"}]`` entries into a full catalog.

    ``page`` values are printed page numbers read from the TOC text;
    ``content_offset`` is the 0-based index of the last TOC page (0 when the
    TOC location is unknown).  Each result item carries both the printed
    range (``start``/``end``) and clamped 0-based physical page indexes.
    """
    # Keep only entries with a non-empty title and a parseable page number.
    entries: list[tuple[str, int]] = []
    for item in raw:
        title = str(item.get("title") or "").strip()
        page = self._to_int(item.get("page"), None)
        if not title or page is None:
            continue
        entries.append((title, page))
    if not entries:
        return []
    # Offset maps printed page numbers onto physical page indexes by assuming
    # the first chapter starts on the page right after the TOC pages.
    # NOTE(review): assumes entries are in ascending printed-page order — confirm.
    first_printed_page = entries[0][1]
    offset = (content_offset + 1) - first_printed_page if content_offset > 0 else 0
    result: list[dict[str, int | str]] = []
    for i, (title, page) in enumerate(entries):
        # The next chapter's printed start page bounds this chapter's end.
        next_page = entries[i + 1][1] if i + 1 < len(entries) else page
        page_start_index = max(0, min(page + offset - 1, num_pages - 1))
        page_end_index = max(page_start_index, min(next_page + offset - 2, num_pages - 1))
        if i == len(entries) - 1:
            # The last chapter runs to the end of the document.
            page_end_index = num_pages - 1
        result.append({
            "title": title,
            "start": page,
            "end": max(page, next_page - 1) if i + 1 < len(entries) else page,
            "page_start_index": page_start_index,
            "page_end_index": page_end_index,
        })
    return result
@staticmethod
def _strip_index_lists(text: str) -> str:
# Stop before common appendix lists that pollute TOC parsing.
pattern = re.compile(
def _get_response_text(response: Any) -> str:
if not hasattr(response, "message") or not response.message:
return ""
content = response.message.content
if isinstance(content, str):
text = content
elif isinstance(content, list):
text = "".join(
item.data if hasattr(item, "data") else str(item) for item in content
)
else:
text = str(content)
# 清理思考标签
text = re.sub(r"<think>[\s\S]*?</think>", "", text, flags=re.IGNORECASE)
text = re.sub(r"<\|[^>]+\|>", "", text)
return text.strip()
@staticmethod
def _parse_llm_json(text: str) -> list[dict]:
# 尝试提取JSON代码块
code_match = re.search(r"```(?:json)?\s*([\s\S]*?)```", text)
if code_match:
text = code_match.group(1).strip()
# 尝试找到JSON数组
bracket_match = re.search(r"\[[\s\S]*\]", text)
if bracket_match:
text = bracket_match.group(0)
try:
result = json.loads(text)
if isinstance(result, list):
return result
except Exception:
pass
return []
def _catalog_from_metadata(self, toc: list, num_pages: int) -> list[dict[str, int | str]]:
top = [(title, max(0, page - 1)) for level, title, page in toc if level <= 2 and page >= 1]
if not top:
return []
result: list[dict[str, int | str]] = []
for index, (title, start_index) in enumerate(top):
end_index = top[index + 1][1] - 1 if index + 1 < len(top) else num_pages - 1
result.append({
"title": title,
"start": start_index + 1,
"end": max(start_index, end_index) + 1,
"page_start_index": start_index,
"page_end_index": max(start_index, end_index),
})
return result
def _find_toc_pages(self, doc: fitz.Document, num_pages: int) -> tuple[int | None, int | None]:
    """Locate the contiguous run of TOC pages within the first 30 pages.

    Returns ``(first_index, last_index)``, or ``(None, None)`` when no page
    text matches any pattern in ``_TOC_PATTERNS``.
    """
    first: int | None = None
    last: int | None = None
    scan_limit = min(num_pages, 30)  # the TOC is expected near the front
    for index in range(scan_limit):
        page_text = doc[index].get_text() or ""
        matched = any(
            re.search(pattern, page_text, re.IGNORECASE)
            for pattern in self._TOC_PATTERNS
        )
        if matched:
            if first is None:
                first = index
            last = index
        elif first is not None:
            # The contiguous TOC run has ended; stop scanning.
            break
    return first, last
def _parse_toc_lines(self, text: str) -> list[dict[str, int | str]]:
marker = re.search(
r"^(List\s+of\s+Figures|List\s+of\s+Tables|图目录|表目录)",
text,
re.IGNORECASE | re.MULTILINE,
)
m = pattern.search(text)
return text[: m.start()].rstrip() if m else text
def _parse_toc_lines(self, text: str) -> dict[str, dict[str, int]]:
"""Parse lines like:
1.2 Engine Overview ........ 35
Appendix A 120
"""
line_pattern = re.compile(
r"^\s*(?P<title>.+?)\s*(?:\.{2,}|\s)\s*(?P<page>\d{1,5})\s*$"
)
if marker:
text = text[: marker.start()]
pattern = re.compile(r"^\s*(?P<title>.+?)\s*(?:\.{2,}|\s)\s*(?P<page>\d{1,5})\s*$")
entries: list[tuple[str, int]] = []
for raw in text.splitlines():
line = raw.strip()
if not line or len(line) < 3:
continue
if re.fullmatch(r"\d+", line):
if not line or len(line) < 3 or re.fullmatch(r"\d+", line):
continue
m = line_pattern.match(line)
if not m:
match = pattern.match(line)
if not match:
continue
title = re.sub(r"\s+", " ", m.group("title")).strip("-_: ")
page = self._to_int(m.group("page"), None)
if not title or page is None:
title = re.sub(r"\s+", " ", match.group("title")).strip("-_:")
page = self._to_int(match.group("page"), None)
if not title or page is None or len(title) <= 1:
continue
# Skip obvious noise.
if len(title) <= 1 or title.lower() in {"page", "pages", "目录", "contents"}:
if title.lower() in {"page", "pages", "目录", "contents"}:
continue
entries.append((title, page))
if not entries:
return {}
return []
# Deduplicate keeping earliest appearance.
dedup: OrderedDict[str, int] = OrderedDict()
for title, page in entries:
if title not in dedup:
dedup[title] = page
dedup.setdefault(title, page)
titles = list(dedup.keys())
pages = [dedup[t] for t in titles]
pages = [dedup[title] for title in titles]
result: list[dict[str, int | str]] = []
for index, title in enumerate(titles):
start = pages[index]
end = max(start, pages[index + 1] - 1) if index + 1 < len(pages) else start
result.append({"title": title, "start": start, "end": end})
return result
catalog: dict[str, dict[str, int]] = {}
for i, title in enumerate(titles):
start = pages[i]
if i + 1 < len(pages):
next_start = pages[i + 1]
end = max(start, next_start - 1)
else:
end = start
catalog[title] = {"start": int(start), "end": int(end)}
def _attach_page_indexes(
self, catalog: list[dict[str, int | str]], toc_end: int, num_pages: int
) -> list[dict[str, int | str]]:
if not catalog:
return []
return catalog
first_page = None
for item in catalog:
start = self._to_int(item.get("start"), None)
if start is not None and (first_page is None or start < first_page):
first_page = start
def _normalize_catalog(self, raw: dict[str, Any]) -> dict[str, dict[str, int]]:
catalog: dict[str, dict[str, int]] = {}
source = raw.get("catalog") if isinstance(raw.get("catalog"), dict) else raw
if not isinstance(source, dict):
return catalog
if first_page is None:
return []
for name, value in source.items():
if not isinstance(name, str) or not isinstance(value, dict):
continue
start = self._to_int(value.get("start"), None)
end = self._to_int(value.get("end"), start)
offset = (toc_end + 1) - first_page
result: list[dict[str, int | str]] = []
for item in catalog:
start = self._to_int(item.get("start"), None)
end = self._to_int(item.get("end"), start)
if start is None:
continue
if end is None:
end = start
catalog[name] = {"start": int(start), "end": int(max(start, end))}
return catalog
@staticmethod
def _extract_json_object(text: str) -> Any:
    """Try hard to pull a JSON value out of free-form LLM text.

    Candidate order: fenced code blocks, the first balanced ``{...}`` span,
    then the whole text.  Returns the first candidate that parses, else None.
    """
    if not text:
        return None
    candidates: list[str] = []
    for block in re.findall(r"```(?:json)?\s*([\s\S]*?)\s*```", text, flags=re.IGNORECASE):
        stripped = block.strip()
        if stripped:
            candidates.append(stripped)
    braced = PdfTocTool._extract_first_brace_object(text)
    if braced:
        candidates.append(braced)
    candidates.append(text.strip())
    for candidate in candidates:
        value = PdfTocTool._json_try_parse(candidate)
        if value is not None:
            return value
    return None
@staticmethod
def _extract_first_brace_object(text: str) -> str | None:
start = text.find("{")
if start < 0:
return None
depth = 0
in_str = False
escape = False
for i in range(start, len(text)):
ch = text[i]
if in_str:
if escape:
escape = False
elif ch == "\\":
escape = True
elif ch == '"':
in_str = False
continue
if ch == '"':
in_str = True
elif ch == "{":
depth += 1
elif ch == "}":
depth -= 1
if depth == 0:
return text[start : i + 1]
return None
@staticmethod
def _json_try_parse(text: str) -> Any:
try:
return json.loads(text)
except Exception:
pass
# Minimal repair: remove trailing commas before } or ]
repaired = re.sub(r",\s*([}\]])", r"\1", text)
try:
return json.loads(repaired)
except Exception:
return None
page_start_index = max(0, min(start + offset, num_pages - 1))
page_end_index = max(page_start_index, min(end + offset, num_pages - 1))
result.append({
"title": str(item.get("title") or "Untitled"),
"start": start,
"end": max(start, end),
"page_start_index": page_start_index,
"page_end_index": page_end_index,
})
return result
@staticmethod
def _to_int(value: Any, default: int | None) -> int | None:

View File

@@ -2,63 +2,35 @@ identity:
name: "pdf_toc"
author: "yslg"
label:
en_US: "PDF TOC Parser"
zh_Hans: "PDF目录解析"
pt_BR: "Analisador de Sumário PDF"
ja_JP: "PDF目次解析"
en_US: "PDF TOC"
zh_Hans: "PDF 目录提取"
pt_BR: "PDF TOC"
ja_JP: "PDF TOC"
description:
human:
en_US: "Parse PDF table-of-contents text (from pdf_column_range) into structured JSON catalog via LLM"
zh_Hans: "通过LLM将PDF目录文本来自目录页提取工具的输出解析为结构化JSON目录"
pt_BR: "Analisar texto do sumário PDF em catálogo JSON estruturado via LLM"
ja_JP: "LLMを使用してPDF目次テキストを構造化JSONカタログに解析"
llm: "Parse PDF table-of-contents text into structured JSON with chapter names and page ranges. Input is the output of pdf_column_range tool (start/end/pages)."
en_US: "Extract the catalog array from a PDF file using metadata or LLM."
zh_Hans: "从PDF文件中提取目录数组优先使用元数据回退使用LLM解析。"
pt_BR: "Extrair o array de catálogo de um arquivo PDF."
ja_JP: "PDFファイルからカタログ配列を抽出する。"
llm: "Extract a catalog array from a PDF file. Returns JSON text like [{title,start,end,page_start_index,page_end_index}]."
parameters:
- name: toc_start
type: number
- name: file
type: file
required: true
label:
en_US: TOC Start Page
zh_Hans: 目录起始页
pt_BR: Página Inicial do Sumário
ja_JP: 目次開始ページ
en_US: PDF File
zh_Hans: PDF 文件
pt_BR: PDF File
ja_JP: PDF File
human_description:
en_US: "Start page index of TOC (from pdf_column_range output)"
zh_Hans: "目录起始页码(来自目录页提取工具输出的 start"
pt_BR: "Índice da página inicial do sumário"
ja_JP: "目次の開始ページ番号"
llm_description: "Start page index of TOC section, from pdf_column_range output field 'start'"
form: llm
- name: toc_end
type: number
required: true
label:
en_US: TOC End Page
zh_Hans: 目录结束页
pt_BR: Página Final do Sumário
ja_JP: 目次終了ページ
human_description:
en_US: "End page index of TOC (from pdf_column_range output)"
zh_Hans: "目录结束页码(来自目录页提取工具输出的 end"
pt_BR: "Índice da página final do sumário"
ja_JP: "目次の終了ページ番号"
llm_description: "End page index of TOC section, from pdf_column_range output field 'end'"
form: llm
- name: toc_pages
type: string
required: true
label:
en_US: TOC Page Text
zh_Hans: 目录页文本
pt_BR: Texto das Páginas do Sumário
ja_JP: 目次ページテキスト
human_description:
en_US: "Raw text content of TOC pages (from pdf_column_range output 'pages' array, joined)"
zh_Hans: "目录页原始文本内容(来自目录页提取工具输出的 pages 数组)"
pt_BR: "Conteúdo de texto bruto das páginas do sumário"
ja_JP: "目次ページの生テキスト内容"
llm_description: "Raw text content extracted from TOC pages, from pdf_column_range output field 'pages'"
en_US: "PDF file to inspect"
zh_Hans: "要解析的PDF文件"
pt_BR: "PDF file to inspect"
ja_JP: "PDF file to inspect"
llm_description: "PDF file to extract catalog from"
form: llm
fileTypes:
- "pdf"
- name: model
type: model-selector
scope: llm
@@ -69,10 +41,10 @@ parameters:
pt_BR: Modelo LLM
ja_JP: LLMモデル
human_description:
en_US: "LLM model for parsing TOC into structured JSON"
zh_Hans: "用于解析目录的 LLM 模型"
pt_BR: "Modelo LLM para análise do sumário"
ja_JP: "目次解析用LLMモデル"
en_US: "LLM model used for parsing TOC when metadata is unavailable"
zh_Hans: "当元数据不可用时,用于解析目录的LLM模型"
pt_BR: "Modelo LLM para análise de TOC"
ja_JP: "メタデータが利用できない場合のTOC解析用LLMモデル"
form: form
extra:
python:

File diff suppressed because it is too large Load Diff