# Files
# urbanLifeline/difyPlugin/数据清洗-大文件处理.yml.bak
#
# 1001 lines
# 28 KiB
# YAML
# Raw Normal View History
#
# 2026-03-15 13:00:30 +08:00
# Application metadata: display name, description, icon and run mode.
app:
  description: '优化版支持大文件PDF处理跨页表格/段落智能识别合并'
  icon: '🤖'
  icon_background: '#FFEAD5'
  mode: 'workflow'
  name: '数据清洗-大文件处理'
  use_icon_as_answer_icon: false
# Plugins this app depends on: three marketplace entries and one package entry.
# NOTE(review): the md_exporter and yslg/pdf identifiers listed here differ from
# the plugin_unique_identifier values on the tool nodes that use those plugins
# (md_exporter 3.4.0@a5ce... and yslg/pdf 0.0.1@cc5f... on the nodes) — confirm
# which build is the intended one before importing.
dependencies:
  - current_identifier: null
    type: 'marketplace'
    value:
      marketplace_plugin_unique_identifier: 'samanhappy/word_process:0.0.1@003ecc76645cf2d5160d4e009a29d8eba2946eaaf7134c49971c3b9fedbfab0d'
      version: null
  - current_identifier: null
    type: 'marketplace'
    value:
      marketplace_plugin_unique_identifier: 'langgenius/siliconflow:0.0.44@9dac23fe837d6da24a2cd9ef959c1c93e4e094b7562ad8a2fd3d4cc86c0e3e89'
      version: null
  - current_identifier: null
    type: 'marketplace'
    value:
      marketplace_plugin_unique_identifier: 'bowenliang123/md_exporter:3.6.9@3f027d63e80b44d5d5a9f706871afaef37905b8f8a89a2d152dc530211a8acb1'
      version: null
  - current_identifier: null
    type: 'package'
    value:
      plugin_unique_identifier: 'yslg/pdf:0.0.1@5e83b87d38ad55c2a1e929311d21a86cef5f9e04394b977b3ba16eb34de08b36'
      version: null
# Document kind and version string of this export (quoted so they stay strings).
kind: 'app'
version: '0.5.0'
# Workflow-level variables and feature toggles.
workflow:
  conversation_variables: []
  environment_variables: []
  features:
    # App-level file upload is switched off here (enabled: false); the limits
    # below are still recorded in the export.
    file_upload:
      allowed_file_extensions: ['.JPG', '.JPEG', '.PNG', '.GIF', '.WEBP', '.SVG', '.PDF', '.pdf']
      allowed_file_types: [image, document]
      allowed_file_upload_methods: [local_file, remote_url]
      enabled: false
      fileUploadConfig:
        audio_file_size_limit: 50
        batch_count_limit: 5
        file_size_limit: 500
        image_file_batch_limit: 10
        image_file_size_limit: 10
        single_chunk_attachment_limit: 10
        video_file_size_limit: 100
        workflow_file_upload_limit: 10
      image:
        enabled: false
        number_limits: 3
        transfer_methods: [local_file, remote_url]
      number_limits: 3
    opening_statement: ''
    retriever_resource: {enabled: true}
    sensitive_word_avoidance: {enabled: false}
    speech_to_text: {enabled: false}
    suggested_questions: []
    suggested_questions_after_answer: {enabled: false}
    text_to_speech:
      enabled: false
      language: ''
      voice: ''
# Directed connections between workflow nodes. Each edge names its source and
# target node ids plus the source handle (an if-else case id, or 'source').
# Every edge now carries the same metadata keys: `isInIteration` and `selected`
# are written as explicit `false` on the edges where the export had omitted them.
graph:
  edges:
    # start -> first if-else
    - data: {isInIteration: false, isInLoop: false, sourceType: start, targetType: if-else}
      id: '1770703294598-source-1770703342256-target'
      selected: false
      source: '1770703294598'
      sourceHandle: source
      target: '1770703342256'
      targetHandle: target
      type: custom
      zIndex: 0
    # first if-else, case 'true' -> LLM 1770703393190
    - data: {isInIteration: false, isInLoop: false, sourceType: if-else, targetType: llm}
      id: '1770703342256-true-1770703393190-target'
      selected: false
      source: '1770703342256'
      sourceHandle: 'true'
      target: '1770703393190'
      targetHandle: target
      type: custom
      zIndex: 0
    # first if-else, case 93d5294c... -> LLM 1770703524412
    - data: {isInIteration: false, isInLoop: false, sourceType: if-else, targetType: llm}
      id: '1770703342256-93d5294c-5984-4bc0-b30d-cd9e2ffba28d-1770703524412-target'
      selected: false
      source: '1770703342256'
      sourceHandle: '93d5294c-5984-4bc0-b30d-cd9e2ffba28d'
      target: '1770703524412'
      targetHandle: target
      type: custom
      zIndex: 0
    # LLM 1770703393190 -> aggregator 1770703625287
    - data: {isInIteration: false, isInLoop: false, sourceType: llm, targetType: variable-aggregator}
      id: '1770703393190-source-1770703625287-target'
      selected: false
      source: '1770703393190'
      sourceHandle: source
      target: '1770703625287'
      targetHandle: target
      type: custom
      zIndex: 0
    # LLM 1770703524412 -> aggregator 1770703625287
    - data: {isInIteration: false, isInLoop: false, sourceType: llm, targetType: variable-aggregator}
      id: '1770703524412-source-1770703625287-target'
      selected: false
      source: '1770703524412'
      sourceHandle: source
      target: '1770703625287'
      targetHandle: target
      type: custom
      zIndex: 0
    # first if-else, case 6556b05e... -> second if-else
    - data: {isInIteration: false, isInLoop: false, sourceType: if-else, targetType: if-else}
      id: '1770703342256-6556b05e-3266-4aa7-b196-ec41f5dd766b-1772348592076-target'
      selected: false
      source: '1770703342256'
      sourceHandle: '6556b05e-3266-4aa7-b196-ec41f5dd766b'
      target: '1772348592076'
      targetHandle: target
      type: custom
      zIndex: 0
    # second if-else, 'false' handle -> document extractor
    - data: {isInIteration: false, isInLoop: false, sourceType: if-else, targetType: document-extractor}
      id: '1772348592076-false-1770703633813-target'
      selected: false
      source: '1772348592076'
      sourceHandle: 'false'
      target: '1770703633813'
      targetHandle: target
      type: custom
      zIndex: 0
    # second if-else, case 0b4fd2d4... -> tool 1772349027446
    - data: {isInIteration: false, isInLoop: false, sourceType: if-else, targetType: tool}
      id: '1772348592076-0b4fd2d4-a592-4421-acbb-822db3004219-1772349027446-target'
      selected: false
      source: '1772348592076'
      sourceHandle: '0b4fd2d4-a592-4421-acbb-822db3004219'
      target: '1772349027446'
      targetHandle: target
      type: custom
      zIndex: 0
    # document extractor -> aggregator 1772348969241
    - data: {isInIteration: false, isInLoop: false, sourceType: document-extractor, targetType: variable-aggregator}
      id: '1770703633813-source-1772348969241-target'
      selected: false
      source: '1770703633813'
      sourceHandle: source
      target: '1772348969241'
      targetHandle: target
      type: custom
      zIndex: 0
    # tool 1772349027446 -> aggregator 1772348969241
    - data: {isInIteration: false, isInLoop: false, sourceType: tool, targetType: variable-aggregator}
      id: '1772349027446-source-1772348969241-target'
      selected: false
      source: '1772349027446'
      sourceHandle: source
      target: '1772348969241'
      targetHandle: target
      type: custom
      zIndex: 0
    # aggregator 1770703625287 -> LLM 1770703671732
    - data: {isInIteration: false, isInLoop: false, sourceType: variable-aggregator, targetType: llm}
      id: '1770703625287-source-1770703671732-target'
      selected: false
      source: '1770703625287'
      sourceHandle: source
      target: '1770703671732'
      targetHandle: target
      type: custom
      zIndex: 0
    # LLM 1770703671732 -> tool 1770704285657
    - data: {isInIteration: false, isInLoop: false, sourceType: llm, targetType: tool}
      id: '1770703671732-source-1770704285657-target'
      selected: false
      source: '1770703671732'
      sourceHandle: source
      target: '1770704285657'
      targetHandle: target
      type: custom
      zIndex: 0
    # second if-else, case 'true' -> tool 1772527425324
    - data: {isInIteration: false, isInLoop: false, sourceType: if-else, targetType: tool}
      id: '1772348592076-true-1772527425324-target'
      selected: false
      source: '1772348592076'
      sourceHandle: 'true'
      target: '1772527425324'
      targetHandle: target
      type: custom
      zIndex: 0
    # aggregator 1772348969241 -> aggregator 1770703625287
    - data: {isInIteration: false, isInLoop: false, sourceType: variable-aggregator, targetType: variable-aggregator}
      id: '1772348969241-source-1770703625287-target'
      selected: false
      source: '1772348969241'
      sourceHandle: source
      target: '1770703625287'
      targetHandle: target
      type: custom
      zIndex: 0
    # tool 1770704285657 -> end 1770704288628
    - data: {isInIteration: false, isInLoop: false, sourceType: tool, targetType: end}
      id: '1770704285657-source-1770704288628-target'
      selected: false
      source: '1770704285657'
      sourceHandle: source
      target: '1770704288628'
      targetHandle: target
      type: custom
      zIndex: 0
    # tool 1772527425324 -> end 1772779766541
    - data: {isInIteration: false, isInLoop: false, sourceType: tool, targetType: end}
      id: '1772527425324-source-1772779766541-target'
      selected: false
      source: '1772527425324'
      sourceHandle: source
      target: '1772779766541'
      targetHandle: target
      type: custom
      zIndex: 0
nodes:
# Start node: declares one required file input, exposed as variable `file`.
- data:
    selected: false
    title: '用户输入'
    type: start
    variables:
      - allowed_file_extensions: []
        allowed_file_types: [image, document, video]
        allowed_file_upload_methods: [local_file, remote_url]
        default: ''
        hint: ''
        label: '文件'
        max_length: 48
        options: []
        placeholder: ''
        required: true
        type: file
        variable: file
  height: 109
  id: '1770703294598'
  position: {x: 0, y: 55}
  positionAbsolute: {x: 0, y: 55}
  selected: false
  sourcePosition: right
  targetPosition: left
  type: custom
  width: 242
# If-else node with three cases, each testing the start node's `file.type`:
# image (case 'true'), video (case 93d5294c...), document (case 6556b05e...).
- data:
    cases:
      - case_id: 'true'
        conditions:
          - comparison_operator: in
            id: 'f88f279e-5736-4b1b-98cf-f8a9621531a0'
            value: [image]
            varType: file
            variable_selector: ['1770703294598', file, type]
        id: 'true'
        logical_operator: and
      - case_id: '93d5294c-5984-4bc0-b30d-cd9e2ffba28d'
        conditions:
          - comparison_operator: in
            id: '48e8d32a-59c5-4573-8e8a-355dc73a39fc'
            value: [video]
            varType: file
            variable_selector: ['1770703294598', file, type]
        id: '93d5294c-5984-4bc0-b30d-cd9e2ffba28d'
        logical_operator: and
      - case_id: '6556b05e-3266-4aa7-b196-ec41f5dd766b'
        conditions:
          - comparison_operator: in
            id: '9916110c-edf7-4a4a-b324-2f8d85c73299'
            value: [document]
            varType: file
            variable_selector: ['1770703294598', file, type]
        id: '6556b05e-3266-4aa7-b196-ec41f5dd766b'
        logical_operator: and
    selected: false
    title: '条件分支'
    type: if-else
  height: 220
  id: '1770703342256'
  position: {x: 342, y: 0}
  positionAbsolute: {x: 342, y: 0}
  selected: false
  sourcePosition: right
  targetPosition: left
  type: custom
  width: 242
# LLM node with vision enabled; the vision input is the start node's `file`.
- data:
    context:
      enabled: false
      variable_selector: []
    model:
      completion_params:
        enable_thinking: true
        temperature: 0.7
      mode: chat
      name: 'zai-org/GLM-4.6V'
      provider: 'langgenius/siliconflow/siliconflow'
    prompt_template:
      - id: '4b1706f6-3216-4fb7-a6dc-978ce43ff491'
        role: system
        text: '识别图片中所有内容和文字,并进行合理的描述编排'
    reasoning_format: separated
    selected: false
    title: '图片理解'
    type: llm
    vision:
      configs:
        detail: high
        variable_selector: ['1770703294598', file]
      enabled: true
  height: 88
  id: '1770703393190'
  position: {x: 2772, y: 82}
  positionAbsolute: {x: 2772, y: 82}
  selected: false
  sourcePosition: right
  targetPosition: left
  type: custom
  width: 242
# LLM node with vision enabled, fed the start node's `file`; no completion
# parameters are overridden for this model.
- data:
    context:
      enabled: false
      variable_selector: []
    model:
      completion_params: {}
      mode: chat
      name: 'Pro/moonshotai/Kimi-K2.5'
      provider: 'langgenius/siliconflow/siliconflow'
    prompt_template:
      - id: '497bebc3-5e75-4c2b-940c-ba485dc1e51a'
        role: system
        text: '识别视频中所有内容和文字,并进行合理的描述编排'
    reasoning_format: separated
    selected: false
    title: '视频理解'
    type: llm
    vision:
      configs:
        detail: high
        variable_selector: ['1770703294598', file]
      enabled: true
  height: 88
  id: '1770703524412'
  position: {x: 1770, y: 177}
  positionAbsolute: {x: 1770, y: 177}
  selected: false
  sourcePosition: right
  targetPosition: left
  type: custom
  width: 242
# If-else node that tests the start node's `file.extension`:
# contains "pdf" -> case 'true'; contains "doc" -> case 0b4fd2d4...
# The second case now has an `id` mirroring its `case_id`, like every other
# case in this file, and its `varType` is `string`, matching the identical
# extension check in the first case.
- data:
    cases:
      - case_id: 'true'
        conditions:
          - comparison_operator: contains
            id: '7a6d2b1e-9704-41f3-aeba-40c6e2484d56'
            value: pdf
            varType: string
            variable_selector: ['1770703294598', file, extension]
        id: 'true'
        logical_operator: and
      - case_id: '0b4fd2d4-a592-4421-acbb-822db3004219'
        conditions:
          - comparison_operator: contains
            id: '67767b34-ad03-48f4-80ef-100eb78e13ab'
            value: doc
            varType: string
            variable_selector: ['1770703294598', file, extension]
        id: '0b4fd2d4-a592-4421-acbb-822db3004219'
        logical_operator: and
    selected: false
    title: '条件分支 2'
    type: if-else
  height: 172
  id: '1772348592076'
  position: {x: 704, y: 424}
  positionAbsolute: {x: 704, y: 424}
  selected: false
  sourcePosition: right
  targetPosition: left
  type: custom
  width: 242
# Document-extractor node reading the start node's single `file`.
- data:
    is_array_file: false
    selected: false
    title: '文档提取器'
    type: document-extractor
    variable_selector: ['1770703294598', file]
  height: 104
  id: '1770703633813'
  position: {x: 1066, y: 337}
  positionAbsolute: {x: 1066, y: 337}
  selected: false
  sourcePosition: right
  targetPosition: left
  type: custom
  width: 242
# Tool node calling `word_extractor` from the samanhappy/word_process plugin;
# its `word_content` parameter is bound to the start node's `file`.
- data:
    is_team_authorization: true
    paramSchemas:
      - auto_generate: null
        default: null
        form: llm
        human_description:
          en_US: 'Word file to extract text and images from'
          ja_JP: 'Word file to extract text and images from'
          pt_BR: 'Word file to extract text and images from'
          zh_Hans: '要提取文本和图片的Word文件'
        label:
          en_US: 'Word Content'
          ja_JP: 'Word Content'
          pt_BR: 'Word Content'
          zh_Hans: 'Word 内容'
        llm_description: 'Word file content to be extracted'
        max: null
        min: null
        name: word_content
        options: []
        placeholder: null
        precision: null
        required: true
        scope: null
        template: null
        type: file
    params:
      word_content: ''
    plugin_id: 'samanhappy/word_process'
    plugin_unique_identifier: 'samanhappy/word_process:0.0.1@003ecc76645cf2d5160d4e009a29d8eba2946eaaf7134c49971c3b9fedbfab0d'
    provider_icon: 'https://dify.org.xyzh.yslg/console/api/workspaces/current/plugin/icon?tenant_id=fe3bcf55-9a04-4850-8473-7f97e1c09b97&filename=cb0643689e2f8152d38c44a267a459fae99ff208b0bc164e27ccb053fc1844cd.svg'
    provider_id: 'samanhappy/word_process/word_process'
    provider_name: 'samanhappy/word_process/word_process'
    provider_type: builtin
    selected: false
    title: 'Word提取器'
    tool_configurations: {}
    tool_description: '一个将Word文件提取为文本和图片的工具'
    tool_label: 'Word提取器'
    tool_name: word_extractor
    tool_node_version: '2'
    tool_parameters:
      word_content:
        type: variable
        value: ['1770703294598', file]
    type: tool
  height: 52
  id: '1772349027446'
  position: {x: 1066, y: 521}
  positionAbsolute: {x: 1066, y: 521}
  selected: false
  sourcePosition: right
  targetPosition: left
  type: custom
  width: 242
# Variable aggregator over the `text` outputs of the Word tool node
# (1772349027446) and the document extractor (1770703633813).
- data:
    output_type: string
    selected: false
    title: '文档提取聚合'
    type: variable-aggregator
    variables:
      - ['1772349027446', text]
      - ['1770703633813', text]
  height: 134
  id: '1772348969241'
  position: {x: 1428, y: 344}
  positionAbsolute: {x: 1428, y: 344}
  selected: false
  sourcePosition: right
  targetPosition: left
  type: custom
  width: 242
# Variable aggregator over the two LLM `text` outputs and the `output` of the
# document aggregator (1772348969241).
# Grouping is disabled (group_enabled: false), but the stored group used to
# reference node 1772349100004, which does not exist in this graph. Its third
# selector now matches the active top-level `variables` list, so turning
# grouping on would not introduce a dangling reference.
- data:
    advanced_settings:
      group_enabled: false
      groups:
        - groupId: '058efed3-3c6a-44d6-8f40-704abda8c413'
          group_name: Group1
          output_type: string
          variables:
            - ['1770703393190', text]
            - ['1770703524412', text]
            - ['1772348969241', output]
    output_type: string
    selected: false
    title: '文件提取聚合'
    type: variable-aggregator
    variables:
      - ['1770703393190', text]
      - ['1770703524412', text]
      - ['1772348969241', output]
  height: 160
  id: '1770703625287'
  position: {x: 3134, y: 291}
  positionAbsolute: {x: 3134, y: 291}
  selected: false
  sourcePosition: right
  targetPosition: left
  type: custom
  width: 242
# LLM node that receives the aggregated text ({{#1770703625287.output#}}) and
# is prompted to merge cross-page tables/paragraphs/lists, drop duplicates and
# strip helper markers.
# The system prompt is a literal block scalar (|-) so that every Markdown
# heading and list item stays on its own line; as a quoted multi-line scalar
# the line breaks fold into spaces and the prompt collapses into one paragraph.
# NOTE(review): line breaks were reconstructed from the visible layout; the
# three wrapped marker lines were rejoined with a single space — confirm
# against the original export.
- data:
    context:
      enabled: false
      variable_selector: []
    model:
      completion_params:
        temperature: 0.3
      mode: chat
      name: 'Qwen/Qwen3-32B'
      provider: 'langgenius/siliconflow/siliconflow'
    prompt_template:
      - id: '48ec1856-fdd7-4f4a-9ce5-1aa635822550'
        role: system
        text: |-
          你是一个专业的文档整理和合并专家。以下内容是从文档中分块提取并格式化的Markdown文本。由于分块处理各块之间可能存在跨页断裂和重复内容需要你进行智能合并。
          ## 你的任务
          ### 1. 合并跨页表格
          - 找到所有 `<!-- TABLE_CONTINUES -->` 和对应的 `<!-- TABLE_CONTINUED_FROM_PREV -->` 标记
          - 将前一块末尾的不完整表格和后一块开头的延续表格合并为一个完整表格
          - 确保表头只保留一份,数据行完整拼接,表格结构正确
          ### 2. 合并跨页段落
          - 找到所有 `<!-- PARA_CONTINUES -->` 和 `<!-- PARA_CONTINUED_FROM_PREV -->` 标记
          - 将被截断的段落拼接为语义完整的段落
          ### 3. 合并跨页列表
          - 找到所有 `<!-- LIST_CONTINUES -->` 和 `<!-- LIST_CONTINUED_FROM_PREV -->` 标记
          - 将被截断的列表合并为完整列表,确保编号连续
          ### 4. 去除重复内容
          - 由于分块时存在页面重叠,相邻块之间可能有重复的段落、表格行或列表项
          - 识别并去除这些重复内容,每段内容只保留一份
          ### 5. 清理所有辅助标记
          - 移除所有 `<!-- ... -->` 形式的辅助标记和块分隔符
          - 确保最终输出中不包含任何HTML注释或处理标记
          ### 6. 格式规范化
          - 确保标题层级正确且连续
          - 确保表格格式完整(有表头行和分隔行)
          - 确保列表编号连续
          - 统一全文格式风格
          直接输出最终的Markdown内容不要用```markdown```包裹。
          以下是需要整理合并的内容:
          {{#1770703625287.output#}}
    reasoning_format: separated
    selected: false
    title: '数据清洗与跨页合并'
    type: llm
    vision:
      enabled: false
  height: 88
  id: '1770703671732'
  position: {x: 3660, y: 327}
  positionAbsolute: {x: 3660, y: 327}
  selected: false
  sourcePosition: right
  targetPosition: left
  type: custom
  width: 242
# Tool node calling `md_to_md` from the bowenliang123/md_exporter plugin;
# `md_text` is filled from the text output of LLM node 1770703671732.
# NOTE(review): this node pins md_exporter 3.4.0@a5ce..., while the
# `dependencies` list in this file declares md_exporter 3.6.9@3f02... —
# confirm which version is intended.
- data:
    is_team_authorization: true
    paramSchemas:
      - auto_generate: null
        default: null
        form: llm
        human_description:
          en_US: 'Markdown text'
          ja_JP: 'Markdown text'
          pt_BR: 'Markdown text'
          zh_Hans: 'Markdown格式文本'
        label:
          en_US: 'Markdown text'
          ja_JP: 'Markdown text'
          pt_BR: 'Markdown text'
          zh_Hans: 'Markdown格式文本'
        llm_description: ''
        max: null
        min: null
        name: md_text
        options: []
        placeholder: null
        precision: null
        required: true
        scope: null
        template: null
        type: string
      - auto_generate: null
        default: null
        form: llm
        human_description:
          en_US: 'Optional custom output file name, and the filename suffix is not required.'
          ja_JP: 'Optional custom output file name, and the filename suffix is not required.'
          pt_BR: 'Optional custom output file name, and the filename suffix is not required.'
          zh_Hans: '可选的自定义输出文件名,后缀名无需指定'
        label:
          en_US: 'Output Filename'
          ja_JP: 'Output Filename'
          pt_BR: 'Output Filename'
          zh_Hans: '输出文件名'
        llm_description: ''
        max: null
        min: null
        name: output_filename
        options: []
        placeholder: null
        precision: null
        required: false
        scope: null
        template: null
        type: string
    params:
      md_text: ''
      output_filename: ''
    plugin_id: 'bowenliang123/md_exporter'
    plugin_unique_identifier: 'bowenliang123/md_exporter:3.4.0@a5ce3ac3114f3dd6ab4fe49f0bb931a31af49ff555e479ec45e8aaa5d44157ee'
    provider_icon: 'https://dify.org.xyzh.yslg/console/api/workspaces/current/plugin/icon?tenant_id=fe3bcf55-9a04-4850-8473-7f97e1c09b97&filename=f0bad95cda1671b4e49f0e05df6122ef9ec5d554e138f128795d11d3806c00ef.svg'
    provider_id: 'bowenliang123/md_exporter/md_exporter'
    provider_name: 'bowenliang123/md_exporter/md_exporter'
    provider_type: builtin
    selected: false
    title: 'Markdown ⮕ MD'
    tool_configurations: {}
    tool_description: '将 Markdown 转换为 .md 文件的工具'
    tool_label: 'Markdown ⮕ MD'
    tool_name: md_to_md
    tool_node_version: '2'
    tool_parameters:
      md_text:
        type: mixed
        value: '{{#1770703671732.text#}}'
      output_filename:
        type: mixed
        value: ''
    type: tool
  height: 52
  id: '1770704285657'
  position: {x: 4231.079190350343, y: 573.1529224498603}
  positionAbsolute: {x: 4231.079190350343, y: 573.1529224498603}
  selected: false
  sourcePosition: right
  targetPosition: left
  type: custom
  width: 242
# End node: exposes the `files` output of tool node 1770704285657 under the
# output variable named `_`.
- data:
    outputs:
      - value_selector: ['1770704285657', files]
        value_type: 'array[file]'
        variable: '_'
    selected: false
    title: '输出'
    type: end
  height: 88
  id: '1770704288628'
  position: {x: 5142.505374898874, y: 614.2288378497078}
  positionAbsolute: {x: 5142.505374898874, y: 614.2288378497078}
  selected: false
  sourcePosition: right
  targetPosition: left
  type: custom
  width: 242
# Tool node calling `pdf_to_markdown` from the yslg/pdf plugin on the start
# node's `file`, with image_dpi=150 and include_images=true as constants.
# NOTE(review): this node pins yslg/pdf 0.0.1@cc5f..., while the `dependencies`
# list in this file declares yslg/pdf 0.0.1@5e83... — confirm which build is
# intended.
# NOTE(review): tool_configurations carries a `model` entry that has no
# matching paramSchemas item, and tool_description says no LLM is needed —
# presumably a leftover; verify against the plugin's parameters.
- data:
    is_team_authorization: true
    paramSchemas:
      - auto_generate: null
        default: null
        form: llm
        human_description:
          en_US: 'PDF file to convert'
          ja_JP: '変換するPDFファイル'
          pt_BR: 'Arquivo PDF para converter'
          zh_Hans: '要转换的 PDF 文件'
        label:
          en_US: 'PDF File'
          ja_JP: 'PDFファイル'
          pt_BR: 'Arquivo PDF'
          zh_Hans: 'PDF 文件'
        llm_description: 'PDF file to convert to Markdown'
        max: null
        min: null
        name: file
        options: []
        placeholder: null
        precision: null
        required: true
        scope: null
        template: null
        type: file
      - auto_generate: null
        default: true
        form: form
        human_description:
          en_US: 'Whether to embed images as base64 (default true)'
          ja_JP: '画像をbase64として埋め込むか'
          pt_BR: 'Se deve incorporar imagens como base64'
          zh_Hans: '是否将图片以base64嵌入默认是'
        label:
          en_US: 'Include Images'
          ja_JP: '画像を含める'
          pt_BR: 'Incluir Imagens'
          zh_Hans: '包含图片'
        llm_description: 'Set to true to embed images as base64'
        max: null
        min: null
        name: include_images
        options: []
        placeholder: null
        precision: null
        required: false
        scope: null
        template: null
        type: boolean
      - auto_generate: null
        default: 150
        form: form
        human_description:
          en_US: 'DPI for rendering vector drawings (72-300)'
          ja_JP: 'ベクター描画のDPI'
          pt_BR: 'DPI para renderizar desenhos vetoriais'
          zh_Hans: '矢量图渲染DPI72-300默认150'
        label:
          en_US: 'Image DPI'
          ja_JP: '画像DPI'
          pt_BR: 'DPI da Imagem'
          zh_Hans: '图片DPI'
        llm_description: 'Resolution for rendering vector drawings'
        max: null
        min: null
        name: image_dpi
        options: []
        placeholder: null
        precision: null
        required: false
        scope: null
        template: null
        type: number
    params:
      file: ''
      image_dpi: ''
      include_images: ''
    plugin_id: 'yslg/pdf'
    plugin_unique_identifier: 'yslg/pdf:0.0.1@cc5f6665002ca7c06855ef6703ee9f6e051ddbfb3d00d2aa899f9f280f45dd61'
    provider_icon: 'https://dify.org.xyzh.yslg/console/api/workspaces/current/plugin/icon?tenant_id=fe3bcf55-9a04-4850-8473-7f97e1c09b97&filename=f1441c071a96f87326f5eb2ae2bfc5a570e9260e7d2b74c2ac15df4037231c64.svg'
    provider_id: 'yslg/pdf/pdf'
    provider_name: 'yslg/pdf/pdf'
    provider_type: builtin
    selected: true
    title: 'PDF转Markdown'
    tool_configurations:
      image_dpi:
        type: constant
        value: 150
      include_images:
        type: constant
        value: true
      model:
        type: constant
        value:
          completion_params: {}
          mode: chat
          model: 'Qwen/Qwen3-32B'
          model_type: llm
          provider: 'langgenius/siliconflow/siliconflow'
    tool_description: '将PDF转换为Markdown图片base64嵌入无需大模型'
    tool_label: 'PDF转Markdown'
    tool_name: pdf_to_markdown
    tool_node_version: '2'
    tool_parameters:
      file:
        type: variable
        value: ['1770703294598', file]
    type: tool
  height: 140
  id: '1772527425324'
  position: {x: 1881.4558888576478, y: 697.8632689662784}
  positionAbsolute: {x: 1881.4558888576478, y: 697.8632689662784}
  selected: true
  sourcePosition: right
  targetPosition: left
  type: custom
  width: 242
# End node: exposes the `files` output of tool node 1772527425324 as `files`.
- data:
    outputs:
      - value_selector: ['1772527425324', files]
        value_type: 'array[file]'
        variable: files
    selected: false
    title: '输出 2'
    type: end
  height: 88
  id: '1772779766541'
  position: {x: 2183.4558888576476, y: 697.8632689662784}
  positionAbsolute: {x: 2183.4558888576476, y: 697.8632689662784}
  selected: false
  sourcePosition: right
  targetPosition: left
  type: custom
  width: 242
# Saved canvas pan/zoom, followed by the (empty) RAG pipeline variable list.
viewport: {x: -675.5777822239224, y: 9.568461206490326, zoom: 0.7578582832552}
rag_pipeline_variables: []