# Dify workflow export ("数据清洗-大文件处理").
#
# Routing implemented by the graph below:
#   start -> if-else on file type
#     image    -> vision LLM ------------------------------.
#     video    -> vision LLM ------------------------------+-> aggregator
#     document -> if-else on file extension                |   -> cleanup LLM
#         contains "pdf" -> PDF-to-Markdown tool -> end 2   |   -> md export tool
#         contains "doc" -> Word extractor --.              |   -> end
#         otherwise      -> doc extractor ---+-> aggregator-'
app:
  description: 优化版:支持大文件PDF处理,跨页表格/段落智能识别合并
  icon: 🤖
  icon_background: '#FFEAD5'
  mode: workflow
  name: 数据清洗-大文件处理
  use_icon_as_answer_icon: false
dependencies:
- current_identifier: null
  type: marketplace
  value:
    marketplace_plugin_unique_identifier: samanhappy/word_process:0.0.1@003ecc76645cf2d5160d4e009a29d8eba2946eaaf7134c49971c3b9fedbfab0d
    version: null
- current_identifier: null
  type: marketplace
  value:
    marketplace_plugin_unique_identifier: langgenius/siliconflow:0.0.44@9dac23fe837d6da24a2cd9ef959c1c93e4e094b7562ad8a2fd3d4cc86c0e3e89
    version: null
- current_identifier: null
  type: marketplace
  value:
    marketplace_plugin_unique_identifier: bowenliang123/md_exporter:3.6.9@3f027d63e80b44d5d5a9f706871afaef37905b8f8a89a2d152dc530211a8acb1
    version: null
- current_identifier: null
  type: package
  value:
    plugin_unique_identifier: yslg/pdf:0.0.1@5e83b87d38ad55c2a1e929311d21a86cef5f9e04394b977b3ba16eb34de08b36
    version: null
kind: app
version: 0.5.0
workflow:
  conversation_variables: []
  environment_variables: []
  features:
    file_upload:
      allowed_file_extensions:
      - .JPG
      - .JPEG
      - .PNG
      - .GIF
      - .WEBP
      - .SVG
      - .PDF
      - .pdf
      allowed_file_types:
      - image
      - document
      allowed_file_upload_methods:
      - local_file
      - remote_url
      enabled: false
      fileUploadConfig:
        audio_file_size_limit: 50
        batch_count_limit: 5
        file_size_limit: 500
        image_file_batch_limit: 10
        image_file_size_limit: 10
        single_chunk_attachment_limit: 10
        video_file_size_limit: 100
        workflow_file_upload_limit: 10
      image:
        enabled: false
        number_limits: 3
        transfer_methods:
        - local_file
        - remote_url
      number_limits: 3
    opening_statement: ''
    retriever_resource:
      enabled: true
    sensitive_word_avoidance:
      enabled: false
    speech_to_text:
      enabled: false
    suggested_questions: []
    suggested_questions_after_answer:
      enabled: false
    text_to_speech:
      enabled: false
      language: ''
      voice: ''
  graph:
    edges:
    # start -> file-type branch
    - data:
        isInIteration: false
        isInLoop: false
        sourceType: start
        targetType: if-else
      id: 1770703294598-source-1770703342256-target
      selected: false
      source: '1770703294598'
      sourceHandle: source
      target: '1770703342256'
      targetHandle: target
      type: custom
      zIndex: 0
    # file-type branch (image) -> image LLM
    - data:
        isInIteration: false
        isInLoop: false
        sourceType: if-else
        targetType: llm
      id: 1770703342256-true-1770703393190-target
      selected: false
      source: '1770703342256'
      sourceHandle: 'true'
      target: '1770703393190'
      targetHandle: target
      type: custom
      zIndex: 0
    # file-type branch (video) -> video LLM
    - data:
        isInIteration: false
        isInLoop: false
        sourceType: if-else
        targetType: llm
      id: 1770703342256-93d5294c-5984-4bc0-b30d-cd9e2ffba28d-1770703524412-target
      selected: false
      source: '1770703342256'
      sourceHandle: 93d5294c-5984-4bc0-b30d-cd9e2ffba28d
      target: '1770703524412'
      targetHandle: target
      type: custom
      zIndex: 0
    # image LLM -> final aggregator
    - data:
        isInIteration: false
        isInLoop: false
        sourceType: llm
        targetType: variable-aggregator
      id: 1770703393190-source-1770703625287-target
      selected: false
      source: '1770703393190'
      sourceHandle: source
      target: '1770703625287'
      targetHandle: target
      type: custom
      zIndex: 0
    # video LLM -> final aggregator
    - data:
        isInLoop: false
        sourceType: llm
        targetType: variable-aggregator
      id: 1770703524412-source-1770703625287-target
      selected: false
      source: '1770703524412'
      sourceHandle: source
      target: '1770703625287'
      targetHandle: target
      type: custom
      zIndex: 0
    # file-type branch (document) -> extension branch
    - data:
        isInIteration: false
        isInLoop: false
        sourceType: if-else
        targetType: if-else
      id: 1770703342256-6556b05e-3266-4aa7-b196-ec41f5dd766b-1772348592076-target
      selected: false
      source: '1770703342256'
      sourceHandle: 6556b05e-3266-4aa7-b196-ec41f5dd766b
      target: '1772348592076'
      targetHandle: target
      type: custom
      zIndex: 0
    # extension branch (else) -> generic document extractor
    - data:
        isInLoop: false
        sourceType: if-else
        targetType: document-extractor
      id: 1772348592076-false-1770703633813-target
      selected: false
      source: '1772348592076'
      sourceHandle: 'false'
      target: '1770703633813'
      targetHandle: target
      type: custom
      zIndex: 0
    # extension branch (contains "doc") -> Word extractor tool
    - data:
        isInIteration: false
        isInLoop: false
        sourceType: if-else
        targetType: tool
      id: 1772348592076-0b4fd2d4-a592-4421-acbb-822db3004219-1772349027446-target
      selected: false
      source: '1772348592076'
      sourceHandle: 0b4fd2d4-a592-4421-acbb-822db3004219
      target: '1772349027446'
      targetHandle: target
      type: custom
      zIndex: 0
    # document extractor -> document aggregator
    - data:
        isInIteration: false
        isInLoop: false
        sourceType: document-extractor
        targetType: variable-aggregator
      id: 1770703633813-source-1772348969241-target
      selected: false
      source: '1770703633813'
      sourceHandle: source
      target: '1772348969241'
      targetHandle: target
      type: custom
      zIndex: 0
    # Word extractor -> document aggregator
    - data:
        isInLoop: false
        sourceType: tool
        targetType: variable-aggregator
      id: 1772349027446-source-1772348969241-target
      selected: false
      source: '1772349027446'
      sourceHandle: source
      target: '1772348969241'
      targetHandle: target
      type: custom
      zIndex: 0
    # final aggregator -> cleanup LLM
    - data:
        isInIteration: false
        isInLoop: false
        sourceType: variable-aggregator
        targetType: llm
      id: 1770703625287-source-1770703671732-target
      selected: false
      source: '1770703625287'
      sourceHandle: source
      target: '1770703671732'
      targetHandle: target
      type: custom
      zIndex: 0
    # cleanup LLM -> Markdown export tool
    - data:
        isInIteration: false
        isInLoop: false
        sourceType: llm
        targetType: tool
      id: 1770703671732-source-1770704285657-target
      selected: false
      source: '1770703671732'
      sourceHandle: source
      target: '1770704285657'
      targetHandle: target
      type: custom
      zIndex: 0
    # extension branch (contains "pdf") -> PDF-to-Markdown tool
    - data:
        isInIteration: false
        isInLoop: false
        sourceType: if-else
        targetType: tool
      id: 1772348592076-true-1772527425324-target
      selected: false
      source: '1772348592076'
      sourceHandle: 'true'
      target: '1772527425324'
      targetHandle: target
      type: custom
      zIndex: 0
    # document aggregator -> final aggregator
    - data:
        isInLoop: false
        sourceType: variable-aggregator
        targetType: variable-aggregator
      id: 1772348969241-source-1770703625287-target
      source: '1772348969241'
      sourceHandle: source
      target: '1770703625287'
      targetHandle: target
      type: custom
      zIndex: 0
    # Markdown export tool -> end
    - data:
        isInLoop: false
        sourceType: tool
        targetType: end
      id: 1770704285657-source-1770704288628-target
      source: '1770704285657'
      sourceHandle: source
      target: '1770704288628'
      targetHandle: target
      type: custom
      zIndex: 0
    # PDF-to-Markdown tool -> end 2
    - data:
        isInIteration: false
        isInLoop: false
        sourceType: tool
        targetType: end
      id: 1772527425324-source-1772779766541-target
      source: '1772527425324'
      sourceHandle: source
      target: '1772779766541'
      targetHandle: target
      type: custom
      zIndex: 0
    nodes:
    # Start node: a single required file input named "file".
    - data:
        selected: false
        title: 用户输入
        type: start
        variables:
        - allowed_file_extensions: []
          allowed_file_types:
          - image
          - document
          - video
          allowed_file_upload_methods:
          - local_file
          - remote_url
          default: ''
          hint: ''
          label: 文件
          max_length: 48
          options: []
          placeholder: ''
          required: true
          type: file
          variable: file
      height: 109
      id: '1770703294598'
      position:
        x: 0
        y: 55
      positionAbsolute:
        x: 0
        y: 55
      selected: false
      sourcePosition: right
      targetPosition: left
      type: custom
      width: 242
    # Branch on the uploaded file's type: image / video / document.
    - data:
        cases:
        - case_id: 'true'
          conditions:
          - comparison_operator: in
            id: f88f279e-5736-4b1b-98cf-f8a9621531a0
            value:
            - image
            varType: file
            variable_selector:
            - '1770703294598'
            - file
            - type
          id: 'true'
          logical_operator: and
        - case_id: 93d5294c-5984-4bc0-b30d-cd9e2ffba28d
          conditions:
          - comparison_operator: in
            id: 48e8d32a-59c5-4573-8e8a-355dc73a39fc
            value:
            - video
            varType: file
            variable_selector:
            - '1770703294598'
            - file
            - type
          id: 93d5294c-5984-4bc0-b30d-cd9e2ffba28d
          logical_operator: and
        - case_id: 6556b05e-3266-4aa7-b196-ec41f5dd766b
          conditions:
          - comparison_operator: in
            id: 9916110c-edf7-4a4a-b324-2f8d85c73299
            value:
            - document
            varType: file
            variable_selector:
            - '1770703294598'
            - file
            - type
          id: 6556b05e-3266-4aa7-b196-ec41f5dd766b
          logical_operator: and
        selected: false
        title: 条件分支
        type: if-else
      height: 220
      id: '1770703342256'
      position:
        x: 342
        y: 0
      positionAbsolute:
        x: 342
        y: 0
      selected: false
      sourcePosition: right
      targetPosition: left
      type: custom
      width: 242
    # Vision LLM for the image branch; reads the uploaded file directly.
    - data:
        context:
          enabled: false
          variable_selector: []
        model:
          completion_params:
            enable_thinking: true
            temperature: 0.7
          mode: chat
          name: zai-org/GLM-4.6V
          provider: langgenius/siliconflow/siliconflow
        prompt_template:
        - id: 4b1706f6-3216-4fb7-a6dc-978ce43ff491
          role: system
          text: 识别图片中所有内容和文字,并进行合理的描述编排
        reasoning_format: separated
        selected: false
        title: 图片理解
        type: llm
        vision:
          configs:
            detail: high
            variable_selector:
            - '1770703294598'
            - file
          enabled: true
      height: 88
      id: '1770703393190'
      position:
        x: 2772
        y: 82
      positionAbsolute:
        x: 2772
        y: 82
      selected: false
      sourcePosition: right
      targetPosition: left
      type: custom
      width: 242
    # Vision LLM for the video branch; reads the uploaded file directly.
    - data:
        context:
          enabled: false
          variable_selector: []
        model:
          completion_params: {}
          mode: chat
          name: Pro/moonshotai/Kimi-K2.5
          provider: langgenius/siliconflow/siliconflow
        prompt_template:
        - id: 497bebc3-5e75-4c2b-940c-ba485dc1e51a
          role: system
          text: 识别视频中所有内容和文字,并进行合理的描述编排
        reasoning_format: separated
        selected: false
        title: 视频理解
        type: llm
        vision:
          configs:
            detail: high
            variable_selector:
            - '1770703294598'
            - file
          enabled: true
      height: 88
      id: '1770703524412'
      position:
        x: 1770
        y: 177
      positionAbsolute:
        x: 1770
        y: 177
      selected: false
      sourcePosition: right
      targetPosition: left
      type: custom
      width: 242
    # Branch on the document's extension: "pdf" / "doc" / anything else.
    - data:
        cases:
        - case_id: 'true'
          conditions:
          - comparison_operator: contains
            id: 7a6d2b1e-9704-41f3-aeba-40c6e2484d56
            value: pdf
            varType: string
            variable_selector:
            - '1770703294598'
            - file
            - extension
          id: 'true'
          logical_operator: and
        - case_id: 0b4fd2d4-a592-4421-acbb-822db3004219
          conditions:
          - comparison_operator: contains
            id: 67767b34-ad03-48f4-80ef-100eb78e13ab
            value: doc
            # Was `file`; the sibling extension check above uses `string`.
            varType: string
            variable_selector:
            - '1770703294598'
            - file
            - extension
          # Added: every other case carries `id` equal to `case_id`, and the
          # outgoing edge uses this value as its sourceHandle.
          id: 0b4fd2d4-a592-4421-acbb-822db3004219
          logical_operator: and
        selected: false
        title: 条件分支 2
        type: if-else
      height: 172
      id: '1772348592076'
      position:
        x: 704
        y: 424
      positionAbsolute:
        x: 704
        y: 424
      selected: false
      sourcePosition: right
      targetPosition: left
      type: custom
      width: 242
    # Built-in document extractor for documents that are neither pdf nor doc.
    - data:
        is_array_file: false
        selected: false
        title: 文档提取器
        type: document-extractor
        variable_selector:
        - '1770703294598'
        - file
      height: 104
      id: '1770703633813'
      position:
        x: 1066
        y: 337
      positionAbsolute:
        x: 1066
        y: 337
      selected: false
      sourcePosition: right
      targetPosition: left
      type: custom
      width: 242
    # Word extractor plugin tool for the "doc" branch.
    - data:
        is_team_authorization: true
        paramSchemas:
        - auto_generate: null
          default: null
          form: llm
          human_description:
            en_US: Word file to extract text and images from
            ja_JP: Word file to extract text and images from
            pt_BR: Word file to extract text and images from
            zh_Hans: 要提取文本和图片的Word文件
          label:
            en_US: Word Content
            ja_JP: Word Content
            pt_BR: Word Content
            zh_Hans: Word 内容
          llm_description: Word file content to be extracted
          max: null
          min: null
          name: word_content
          options: []
          placeholder: null
          precision: null
          required: true
          scope: null
          template: null
          type: file
        params:
          word_content: ''
        plugin_id: samanhappy/word_process
        plugin_unique_identifier: samanhappy/word_process:0.0.1@003ecc76645cf2d5160d4e009a29d8eba2946eaaf7134c49971c3b9fedbfab0d
        provider_icon: https://dify.org.xyzh.yslg/console/api/workspaces/current/plugin/icon?tenant_id=fe3bcf55-9a04-4850-8473-7f97e1c09b97&filename=cb0643689e2f8152d38c44a267a459fae99ff208b0bc164e27ccb053fc1844cd.svg
        provider_id: samanhappy/word_process/word_process
        provider_name: samanhappy/word_process/word_process
        provider_type: builtin
        selected: false
        title: Word提取器
        tool_configurations: {}
        tool_description: 一个将Word文件提取为文本和图片的工具
        tool_label: Word提取器
        tool_name: word_extractor
        tool_node_version: '2'
        tool_parameters:
          word_content:
            type: variable
            value:
            - '1770703294598'
            - file
        type: tool
      height: 52
      id: '1772349027446'
      position:
        x: 1066
        y: 521
      positionAbsolute:
        x: 1066
        y: 521
      selected: false
      sourcePosition: right
      targetPosition: left
      type: custom
      width: 242
    # Merges the two document-text branches (Word tool / document extractor).
    - data:
        output_type: string
        selected: false
        title: 文档提取聚合
        type: variable-aggregator
        variables:
        - - '1772349027446'
          - text
        - - '1770703633813'
          - text
      height: 134
      id: '1772348969241'
      position:
        x: 1428
        y: 344
      positionAbsolute:
        x: 1428
        y: 344
      selected: false
      sourcePosition: right
      targetPosition: left
      type: custom
      width: 242
    # Merges image text, video text and document text into one string.
    - data:
        advanced_settings:
          group_enabled: false
          groups:
          - groupId: 058efed3-3c6a-44d6-8f40-704abda8c413
            group_name: Group1
            output_type: string
            variables:
            - - '1770703393190'
              - text
            - - '1770703524412'
              - text
            # Was ['1772349100004', result], a node id that does not exist in
            # this graph; aligned with the top-level `variables` list below.
            - - '1772348969241'
              - output
        output_type: string
        selected: false
        title: 文件提取聚合
        type: variable-aggregator
        variables:
        - - '1770703393190'
          - text
        - - '1770703524412'
          - text
        - - '1772348969241'
          - output
      height: 160
      id: '1770703625287'
      position:
        x: 3134
        y: 291
      positionAbsolute:
        x: 3134
        y: 291
      selected: false
      sourcePosition: right
      targetPosition: left
      type: custom
      width: 242
    # Cleanup LLM: merges chunked Markdown and removes duplicates/markers.
    - data:
        context:
          enabled: false
          variable_selector: []
        model:
          completion_params:
            temperature: 0.3
          mode: chat
          name: Qwen/Qwen3-32B
          provider: langgenius/siliconflow/siliconflow
        prompt_template:
        - id: 48ec1856-fdd7-4f4a-9ce5-1aa635822550
          role: system
          # NOTE(review): the marker names inside the backticks in sections
          # 1, 2, 3 and 5 are empty in the source; presumably HTML-comment
          # markers that were stripped at some point. Confirm the marker
          # names emitted upstream and restore them here.
          # NOTE(review): line breaks were reconstructed from the Markdown
          # structure; verify against the app in the Dify editor.
          text: |-
            你是一个专业的文档整理和合并专家。以下内容是从文档中分块提取并格式化的Markdown文本。由于分块处理,各块之间可能存在跨页断裂和重复内容,需要你进行智能合并。

            ## 你的任务

            ### 1. 合并跨页表格
            - 找到所有 `` 和对应的 `` 标记
            - 将前一块末尾的不完整表格和后一块开头的延续表格合并为一个完整表格
            - 确保表头只保留一份,数据行完整拼接,表格结构正确

            ### 2. 合并跨页段落
            - 找到所有 `` 和 `` 标记
            - 将被截断的段落拼接为语义完整的段落

            ### 3. 合并跨页列表
            - 找到所有 `` 和 `` 标记
            - 将被截断的列表合并为完整列表,确保编号连续

            ### 4. 去除重复内容
            - 由于分块时存在页面重叠,相邻块之间可能有重复的段落、表格行或列表项
            - 识别并去除这些重复内容,每段内容只保留一份

            ### 5. 清理所有辅助标记
            - 移除所有 `` 形式的辅助标记和块分隔符
            - 确保最终输出中不包含任何HTML注释或处理标记

            ### 6. 格式规范化
            - 确保标题层级正确且连续
            - 确保表格格式完整(有表头行和分隔行)
            - 确保列表编号连续
            - 统一全文格式风格

            直接输出最终的Markdown内容,不要用```markdown```包裹。

            以下是需要整理合并的内容:

            {{#1770703625287.output#}}
        reasoning_format: separated
        selected: false
        title: 数据清洗与跨页合并
        type: llm
        vision:
          enabled: false
      height: 88
      id: '1770703671732'
      position:
        x: 3660
        y: 327
      positionAbsolute:
        x: 3660
        y: 327
      selected: false
      sourcePosition: right
      targetPosition: left
      type: custom
      width: 242
    # Markdown export tool: writes the cleanup LLM's text to a .md file.
    - data:
        is_team_authorization: true
        paramSchemas:
        - auto_generate: null
          default: null
          form: llm
          human_description:
            en_US: Markdown text
            ja_JP: Markdown text
            pt_BR: Markdown text
            zh_Hans: Markdown格式文本
          label:
            en_US: Markdown text
            ja_JP: Markdown text
            pt_BR: Markdown text
            zh_Hans: Markdown格式文本
          llm_description: ''
          max: null
          min: null
          name: md_text
          options: []
          placeholder: null
          precision: null
          required: true
          scope: null
          template: null
          type: string
        - auto_generate: null
          default: null
          form: llm
          human_description:
            en_US: Optional custom output file name, and the filename suffix is not required.
            ja_JP: Optional custom output file name, and the filename suffix is not required.
            pt_BR: Optional custom output file name, and the filename suffix is not required.
            zh_Hans: 可选的自定义输出文件名,后缀名无需指定
          label:
            en_US: Output Filename
            ja_JP: Output Filename
            pt_BR: Output Filename
            zh_Hans: 输出文件名
          llm_description: ''
          max: null
          min: null
          name: output_filename
          options: []
          placeholder: null
          precision: null
          required: false
          scope: null
          template: null
          type: string
        params:
          md_text: ''
          output_filename: ''
        plugin_id: bowenliang123/md_exporter
        # Was pinned to 3.4.0@a5ce3ac3...; aligned with the build declared
        # under `dependencies` so the node and the dependency list agree.
        plugin_unique_identifier: bowenliang123/md_exporter:3.6.9@3f027d63e80b44d5d5a9f706871afaef37905b8f8a89a2d152dc530211a8acb1
        provider_icon: https://dify.org.xyzh.yslg/console/api/workspaces/current/plugin/icon?tenant_id=fe3bcf55-9a04-4850-8473-7f97e1c09b97&filename=f0bad95cda1671b4e49f0e05df6122ef9ec5d554e138f128795d11d3806c00ef.svg
        provider_id: bowenliang123/md_exporter/md_exporter
        provider_name: bowenliang123/md_exporter/md_exporter
        provider_type: builtin
        selected: false
        title: Markdown ⮕ MD
        tool_configurations: {}
        tool_description: 将 Markdown 转换为 .md 文件的工具
        tool_label: Markdown ⮕ MD
        tool_name: md_to_md
        tool_node_version: '2'
        tool_parameters:
          md_text:
            type: mixed
            value: '{{#1770703671732.text#}}'
          output_filename:
            type: mixed
            value: ''
        type: tool
      height: 52
      id: '1770704285657'
      position:
        x: 4231.079190350343
        y: 573.1529224498603
      positionAbsolute:
        x: 4231.079190350343
        y: 573.1529224498603
      selected: false
      sourcePosition: right
      targetPosition: left
      type: custom
      width: 242
    # End node for the image/video/doc path; exposes the exported files as "_".
    - data:
        outputs:
        - value_selector:
          - '1770704285657'
          - files
          value_type: array[file]
          variable: _
        selected: false
        title: 输出
        type: end
      height: 88
      id: '1770704288628'
      position:
        x: 5142.505374898874
        y: 614.2288378497078
      positionAbsolute:
        x: 5142.505374898874
        y: 614.2288378497078
      selected: false
      sourcePosition: right
      targetPosition: left
      type: custom
      width: 242
    # PDF-to-Markdown plugin tool for the "pdf" branch.
    - data:
        is_team_authorization: true
        paramSchemas:
        - auto_generate: null
          default: null
          form: llm
          human_description:
            en_US: PDF file to convert
            ja_JP: 変換するPDFファイル
            pt_BR: Arquivo PDF para converter
            zh_Hans: 要转换的 PDF 文件
          label:
            en_US: PDF File
            ja_JP: PDFファイル
            pt_BR: Arquivo PDF
            zh_Hans: PDF 文件
          llm_description: PDF file to convert to Markdown
          max: null
          min: null
          name: file
          options: []
          placeholder: null
          precision: null
          required: true
          scope: null
          template: null
          type: file
        - auto_generate: null
          default: true
          form: form
          human_description:
            en_US: Whether to embed images as base64 (default true)
            ja_JP: 画像をbase64として埋め込むか
            pt_BR: Se deve incorporar imagens como base64
            zh_Hans: 是否将图片以base64嵌入(默认是)
          label:
            en_US: Include Images
            ja_JP: 画像を含める
            pt_BR: Incluir Imagens
            zh_Hans: 包含图片
          llm_description: Set to true to embed images as base64
          max: null
          min: null
          name: include_images
          options: []
          placeholder: null
          precision: null
          required: false
          scope: null
          template: null
          type: boolean
        - auto_generate: null
          default: 150
          form: form
          human_description:
            en_US: DPI for rendering vector drawings (72-300)
            ja_JP: ベクター描画のDPI
            pt_BR: DPI para renderizar desenhos vetoriais
            zh_Hans: 矢量图渲染DPI(72-300,默认150)
          label:
            en_US: Image DPI
            ja_JP: 画像DPI
            pt_BR: DPI da Imagem
            zh_Hans: 图片DPI
          llm_description: Resolution for rendering vector drawings
          max: null
          min: null
          name: image_dpi
          options: []
          placeholder: null
          precision: null
          required: false
          scope: null
          template: null
          type: number
        params:
          file: ''
          image_dpi: ''
          include_images: ''
        plugin_id: yslg/pdf
        # Was pinned to 0.0.1@cc5f6665...; aligned with the package build
        # declared under `dependencies`.
        plugin_unique_identifier: yslg/pdf:0.0.1@5e83b87d38ad55c2a1e929311d21a86cef5f9e04394b977b3ba16eb34de08b36
        provider_icon: https://dify.org.xyzh.yslg/console/api/workspaces/current/plugin/icon?tenant_id=fe3bcf55-9a04-4850-8473-7f97e1c09b97&filename=f1441c071a96f87326f5eb2ae2bfc5a570e9260e7d2b74c2ac15df4037231c64.svg
        provider_id: yslg/pdf/pdf
        provider_name: yslg/pdf/pdf
        provider_type: builtin
        selected: true
        title: PDF转Markdown
        tool_configurations:
          image_dpi:
            type: constant
            value: 150
          include_images:
            type: constant
            value: true
          model:
            type: constant
            value:
              completion_params: {}
              mode: chat
              model: Qwen/Qwen3-32B
              model_type: llm
              provider: langgenius/siliconflow/siliconflow
        tool_description: 将PDF转换为Markdown,图片base64嵌入,无需大模型
        tool_label: PDF转Markdown
        tool_name: pdf_to_markdown
        tool_node_version: '2'
        tool_parameters:
          file:
            type: variable
            value:
            - '1770703294598'
            - file
        type: tool
      height: 140
      id: '1772527425324'
      position:
        x: 1881.4558888576478
        y: 697.8632689662784
      positionAbsolute:
        x: 1881.4558888576478
        y: 697.8632689662784
      selected: true
      sourcePosition: right
      targetPosition: left
      type: custom
      width: 242
    # End node for the PDF path; exposes the tool's files as "files".
    - data:
        outputs:
        - value_selector:
          - '1772527425324'
          - files
          value_type: array[file]
          variable: files
        selected: false
        title: 输出 2
        type: end
      height: 88
      id: '1772779766541'
      position:
        x: 2183.4558888576476
        y: 697.8632689662784
      positionAbsolute:
        x: 2183.4558888576476
        y: 697.8632689662784
      selected: false
      sourcePosition: right
      targetPosition: left
      type: custom
      width: 242
    viewport:
      x: -675.5777822239224
      y: 9.568461206490326
      zoom: 0.7578582832552
  rag_pipeline_variables: []