---
# RAG pipeline definition: website-crawl-general-economy
#
# Graph topology (see workflow.graph.edges):
#   Jina Reader  ('1752491761974') --+
#                                    +--> Variable Aggregator ('1752565435219')
#   Firecrawl    ('1752565402678') --+        --> General Chunker ('1752569675978')
#                                             --> Knowledge Base  ('1752477924228')
#
# User-facing form inputs are declared in workflow.rag_pipeline_variables and
# referenced from node parameters either as '{{#rag.<node_id|shared>.<var>#}}'
# templates or as [rag, <node_id|shared>, <var>] selectors.

# Marketplace plugins this pipeline needs, pinned by version and digest.
dependencies:
  - current_identifier: null
    type: marketplace
    value:
      plugin_unique_identifier: langgenius/general_chunker:0.0.1@e3da408b7277866404c3f884d599261f9d0b9003ea4ef7eb3b64489bdf39d18b
  - current_identifier: null
    type: marketplace
    value:
      plugin_unique_identifier: langgenius/firecrawl_datasource:0.0.1@f7aed0a26df0e5f4b9555371b5c9fa6db3c7dcf6a46dd1583245697bd90a539a
  - current_identifier: null
    type: marketplace
    value:
      plugin_unique_identifier: langgenius/jina_datasource:0.0.1@cf23afb2c3eeccc5a187763a1947f583f0bb10aa56461e512ac4141bf930d608

kind: rag_pipeline

rag_pipeline:
  description: ''
  icon: 📙
  icon_background: ''
  icon_type: emoji
  name: website-crawl-general-economy

# Quoted so the version can never be read as anything but a string.
version: '0.1.0'

workflow:
  conversation_variables: []
  environment_variables: []
  features: {}
  graph:
    edges:
      # Jina Reader -> Variable Aggregator
      - data:
          isInIteration: false
          isInLoop: false
          sourceType: datasource
          targetType: variable-aggregator
        id: 1752491761974-source-1752565435219-target
        source: '1752491761974'
        sourceHandle: source
        target: '1752565435219'
        targetHandle: target
        type: custom
        zIndex: 0
      # Firecrawl -> Variable Aggregator
      - data:
          isInIteration: false
          isInLoop: false
          sourceType: datasource
          targetType: variable-aggregator
        id: 1752565402678-source-1752565435219-target
        source: '1752565402678'
        sourceHandle: source
        target: '1752565435219'
        targetHandle: target
        type: custom
        zIndex: 0
      # Variable Aggregator -> General Chunker
      - data:
          isInIteration: false
          isInLoop: false
          sourceType: variable-aggregator
          targetType: tool
        id: 1752565435219-source-1752569675978-target
        source: '1752565435219'
        sourceHandle: source
        target: '1752569675978'
        targetHandle: target
        type: custom
        zIndex: 0
      # General Chunker -> Knowledge Base
      - data:
          isInIteration: false
          isInLoop: false
          sourceType: tool
          targetType: knowledge-index
        id: 1752569675978-source-1752477924228-target
        source: '1752569675978'
        sourceHandle: source
        target: '1752477924228'
        targetHandle: target
        type: custom
        zIndex: 0
    nodes:
      # Knowledge Base: indexes the chunker's `result` output.
      - data:
          chunk_structure: text_model
          embedding_model: text-embedding-ada-002
          embedding_model_provider: langgenius/openai/openai
          index_chunk_variable_selector: ['1752569675978', result]
          indexing_technique: economy
          keyword_number: 10
          retrieval_model:
            score_threshold: 0.5
            score_threshold_enabled: false
            search_method: keyword_search
            top_k: 3
            vector_setting:
              embedding_model_name: text-embedding-ada-002
              embedding_provider_name: langgenius/openai/openai
          selected: true
          title: Knowledge Base
          type: knowledge-index
        height: 114
        id: '1752477924228'
        position:
          x: 2140.4053851189346
          y: 281.3910724383104
        positionAbsolute:
          x: 2140.4053851189346
          y: 281.3910724383104
        selected: true
        sourcePosition: right
        targetPosition: left
        type: custom
        width: 242
      # Jina Reader website-crawl datasource; parameters come from the jina_* inputs.
      - data:
          datasource_configurations: {}
          datasource_label: Jina Reader
          datasource_name: jina_reader
          datasource_parameters:
            crawl_sub_pages:
              type: mixed
              value: '{{#rag.1752491761974.jina_crawl_sub_pages#}}'
            limit:
              type: variable
              value: [rag, '1752491761974', jina_limit]
            url:
              type: mixed
              value: '{{#rag.1752491761974.jina_url#}}'
            use_sitemap:
              type: mixed
              value: '{{#rag.1752491761974.jina_use_sitemap#}}'
          plugin_id: langgenius/jina_datasource
          provider_name: jinareader
          provider_type: website_crawl
          selected: false
          title: Jina Reader
          type: datasource
        height: 52
        id: '1752491761974'
        position:
          x: 1067.7526055798794
          y: 281.3910724383104
        positionAbsolute:
          x: 1067.7526055798794
          y: 281.3910724383104
        selected: false
        sourcePosition: right
        targetPosition: left
        type: custom
        width: 242
      # Firecrawl website-crawl datasource; parameters come from the firecrawl_* inputs.
      - data:
          datasource_configurations: {}
          datasource_label: Firecrawl
          datasource_name: crawl
          datasource_parameters:
            crawl_subpages:
              type: mixed
              value: '{{#rag.1752565402678.firecrawl_crawl_sub_pages#}}'
            exclude_paths:
              type: mixed
              value: '{{#rag.1752565402678.firecrawl_exclude_paths#}}'
            include_paths:
              type: mixed
              value: '{{#rag.1752565402678.firecrawl_include_only_paths#}}'
            limit:
              type: variable
              value: [rag, '1752565402678', firecrawl_limit]
            max_depth:
              type: variable
              value: [rag, '1752565402678', firecrawl_max_depth]
            only_main_content:
              type: mixed
              value: '{{#rag.1752565402678.firecrawl_extract_main_content#}}'
            url:
              type: mixed
              value: '{{#rag.1752565402678.firecrawl_url#}}'
          plugin_id: langgenius/firecrawl_datasource
          provider_name: firecrawl
          provider_type: website_crawl
          selected: false
          title: Firecrawl
          type: datasource
        height: 52
        id: '1752565402678'
        position:
          x: 1067.7526055798794
          y: 417.32608398342404
        positionAbsolute:
          x: 1067.7526055798794
          y: 417.32608398342404
        selected: false
        sourcePosition: right
        targetPosition: left
        type: custom
        width: 242
      # Variable Aggregator: lists the `content` output of both datasources.
      - data:
          output_type: string
          selected: false
          title: Variable Aggregator
          type: variable-aggregator
          variables:
            - ['1752491761974', content]
            - ['1752565402678', content]
        height: 129
        id: '1752565435219'
        position:
          x: 1505.4306671642219
          y: 281.3910724383104
        positionAbsolute:
          x: 1505.4306671642219
          y: 281.3910724383104
        selected: false
        sourcePosition: right
        targetPosition: left
        type: custom
        width: 242
      # General Chunker tool: input is the aggregator output; settings come
      # from the shared inputs (see tool_parameters below).
      - data:
          is_team_authorization: true
          output_schema:
            properties:
              result:
                description: The result of the general chunk tool.
                properties:
                  general_chunks:
                    items:
                      description: The chunk of the text.
                      type: string
                    type: array
                type: object
            type: object
          paramSchemas:
            - auto_generate: null
              default: null
              form: llm
              human_description:
                en_US: The text you want to chunk.
                ja_JP: チャンク化したいテキスト。
                pt_BR: O texto que você deseja dividir.
                zh_Hans: 你想要分块的文本。
              label:
                en_US: Input Variable
                ja_JP: 入力変数
                pt_BR: Variável de entrada
                zh_Hans: 输入变量
              llm_description: The text you want to chunk.
              max: null
              min: null
              name: input_variable
              options: []
              placeholder: null
              precision: null
              required: true
              scope: null
              template: null
              type: string
            - auto_generate: null
              default: null
              form: llm
              human_description:
                en_US: The delimiter of the chunks.
                ja_JP: チャンクの区切り記号。
                pt_BR: O delimitador dos pedaços.
                zh_Hans: 块的分隔符。
              label:
                en_US: Delimiter
                ja_JP: 区切り記号
                pt_BR: Delimitador
                zh_Hans: 分隔符
              llm_description: >-
                The delimiter of the chunks, the format of the delimiter must
                be a string.
              max: null
              min: null
              name: delimiter
              options: []
              placeholder: null
              precision: null
              required: true
              scope: null
              template: null
              type: string
            - auto_generate: null
              default: null
              form: llm
              human_description:
                en_US: The maximum chunk length.
                ja_JP: 最大長のチャンク。
                pt_BR: O comprimento máximo do bloco
                zh_Hans: 最大块的长度。
              label:
                en_US: Maximum Chunk Length
                ja_JP: チャンク最大長
                pt_BR: O comprimento máximo do bloco
                zh_Hans: 最大块的长度
              llm_description: >-
                The maximum chunk length, the format of the chunk size must be
                an integer.
              max: null
              min: null
              name: max_chunk_length
              options: []
              placeholder: null
              precision: null
              required: true
              scope: null
              template: null
              type: number
            - auto_generate: null
              default: null
              form: llm
              human_description:
                en_US: The chunk overlap length.
                ja_JP: チャンクの重複長
                pt_BR: The chunk overlap length.
                zh_Hans: 块的重叠长度。
              label:
                en_US: Chunk Overlap Length
                ja_JP: チャンク重複長
                pt_BR: Chunk Overlap Length
                zh_Hans: 块的重叠长度
              llm_description: >-
                The chunk overlap length, the format of the chunk overlap
                length must be an integer.
              max: null
              min: null
              name: chunk_overlap_length
              options: []
              placeholder: null
              precision: null
              required: false
              scope: null
              template: null
              type: number
            - auto_generate: null
              default: null
              form: llm
              human_description:
                en_US: Replace consecutive spaces, newlines and tabs
                ja_JP: 連続のスペース、改行、またはタブを置換する
                pt_BR: Replace consecutive spaces, newlines and tabs
                zh_Hans: 替换连续的空格、换行符和制表符
              label:
                en_US: Replace Consecutive Spaces, Newlines and Tabs
                ja_JP: 連続のスペース、改行、またはタブを置換する
                pt_BR: Replace Consecutive Spaces, Newlines and Tabs
                zh_Hans: 替换连续的空格、换行符和制表符
              llm_description: >-
                Replace consecutive spaces, newlines and tabs, the format of
                the replace must be a boolean.
              max: null
              min: null
              name: replace_consecutive_spaces_newlines_tabs
              options: []
              placeholder: null
              precision: null
              required: false
              scope: null
              template: null
              type: boolean
            - auto_generate: null
              default: null
              form: llm
              human_description:
                en_US: Delete all URLs and email addresses
                ja_JP: すべてのURLとメールアドレスを削除する
                pt_BR: Delete all URLs and email addresses
                zh_Hans: 删除所有URL和电子邮件地址
              label:
                en_US: Delete All URLs and Email Addresses
                ja_JP: すべてのURLとメールアドレスを削除する
                pt_BR: Delete All URLs and Email Addresses
                zh_Hans: 删除所有URL和电子邮件地址
              llm_description: >-
                Delete all URLs and email addresses, the format of the delete
                must be a boolean.
              max: null
              min: null
              name: delete_all_urls_and_email_addresses
              options: []
              placeholder: null
              precision: null
              required: false
              scope: null
              template: null
              type: boolean
          params:
            chunk_overlap_length: ''
            delete_all_urls_and_email_addresses: ''
            delimiter: ''
            input_variable: ''
            max_chunk_length: ''
            replace_consecutive_spaces_newlines_tabs: ''
          provider_id: langgenius/general_chunker/general_chunker
          provider_name: langgenius/general_chunker/general_chunker
          provider_type: builtin
          selected: false
          title: General Chunker
          tool_configurations: {}
          tool_description: >-
            A tool for general text chunking mode, the chunks retrieved and
            recalled are the same.
          tool_label: General Chunker
          tool_name: general_chunker
          tool_parameters:
            chunk_overlap_length:
              type: variable
              value: [rag, shared, chunk_overlap]
            delete_all_urls_and_email_addresses:
              type: mixed
              value: '{{#rag.shared.delete_urls_email#}}'
            delimiter:
              type: mixed
              value: '{{#rag.shared.delimiter#}}'
            input_variable:
              type: mixed
              value: '{{#1752565435219.output#}}'
            max_chunk_length:
              type: variable
              value: [rag, shared, max_chunk_length]
            replace_consecutive_spaces_newlines_tabs:
              type: mixed
              value: '{{#rag.shared.replace_consecutive_spaces#}}'
          type: tool
        height: 52
        id: '1752569675978'
        position:
          x: 1807.4306671642219
          y: 281.3910724383104
        positionAbsolute:
          x: 1807.4306671642219
          y: 281.3910724383104
        sourcePosition: right
        targetPosition: left
        type: custom
        width: 242
    viewport:
      x: -707.721097109337
      y: -93.07807382100896
      zoom: 0.9350632198875476
  # Form inputs. `belong_to_node_id` is either a datasource node id or
  # `shared`; `variable` is the name used in the rag.* references above.
  rag_pipeline_variables:
    # --- Jina Reader inputs ---
    - allow_file_extension: null
      allow_file_upload_methods: null
      allowed_file_types: null
      belong_to_node_id: '1752491761974'
      default_value: null
      label: URL
      max_length: 256
      options: []
      placeholder: https://docs.dify.ai/en/
      required: true
      tooltips: null
      type: text-input
      unit: null
      variable: jina_url
    - allow_file_extension: null
      allow_file_upload_methods: null
      allowed_file_types: null
      belong_to_node_id: '1752491761974'
      default_value: 10
      label: Limit
      max_length: 48
      options: []
      placeholder: null
      required: true
      tooltips: null
      type: number
      unit: null
      variable: jina_limit
    - allow_file_extension: null
      allow_file_upload_methods: null
      allowed_file_types: null
      belong_to_node_id: '1752491761974'
      default_value: null
      label: Crawl sub-pages
      max_length: 48
      options: []
      placeholder: null
      required: false
      tooltips: null
      type: checkbox
      unit: null
      variable: jina_crawl_sub_pages
    - allow_file_extension: null
      allow_file_upload_methods: null
      allowed_file_types: null
      belong_to_node_id: '1752491761974'
      default_value: null
      label: Use sitemap
      max_length: 48
      options: []
      placeholder: null
      required: false
      tooltips: >-
        Follow the sitemap to crawl the site. If not, Jina Reader will crawl
        iteratively based on page relevance, yielding fewer but higher-quality
        pages.
      type: checkbox
      unit: null
      variable: jina_use_sitemap
    # --- Firecrawl inputs ---
    - allow_file_extension: null
      allow_file_upload_methods: null
      allowed_file_types: null
      belong_to_node_id: '1752565402678'
      default_value: null
      label: URL
      max_length: 256
      options: []
      placeholder: https://docs.dify.ai/en/
      required: true
      tooltips: null
      type: text-input
      unit: null
      variable: firecrawl_url
    - allow_file_extension: null
      allow_file_upload_methods: null
      allowed_file_types: null
      belong_to_node_id: '1752565402678'
      default_value: true
      label: Crawl sub-pages
      max_length: 48
      options: []
      placeholder: null
      required: false
      tooltips: null
      type: checkbox
      unit: null
      variable: firecrawl_crawl_sub_pages
    - allow_file_extension: null
      allow_file_upload_methods: null
      allowed_file_types: null
      belong_to_node_id: '1752565402678'
      default_value: 10
      label: Limit
      max_length: 48
      options: []
      placeholder: null
      required: true
      tooltips: null
      type: number
      unit: null
      variable: firecrawl_limit
    - allow_file_extension: null
      allow_file_upload_methods: null
      allowed_file_types: null
      belong_to_node_id: '1752565402678'
      default_value: null
      label: Max depth
      max_length: 48
      options: []
      placeholder: ''
      required: false
      tooltips: >-
        Maximum depth to crawl relative to the entered URL. Depth 0 just
        scrapes the page of the entered url, depth 1 scrapes the url and
        everything after enteredURL + one /, and so on.
      type: number
      unit: null
      variable: firecrawl_max_depth
    - allow_file_extension: null
      allow_file_upload_methods: null
      allowed_file_types: null
      belong_to_node_id: '1752565402678'
      default_value: null
      label: Exclude paths
      max_length: 256
      options: []
      placeholder: 'blog/*, /about/*'
      required: false
      tooltips: null
      type: text-input
      unit: null
      variable: firecrawl_exclude_paths
    - allow_file_extension: null
      allow_file_upload_methods: null
      allowed_file_types: null
      belong_to_node_id: '1752565402678'
      default_value: null
      label: Include only paths
      max_length: 256
      options: []
      placeholder: 'articles/*'
      required: false
      tooltips: null
      type: text-input
      unit: null
      variable: firecrawl_include_only_paths
    - allow_file_extension: null
      allow_file_upload_methods: null
      allowed_file_types: null
      belong_to_node_id: '1752565402678'
      default_value: null
      label: Extract main content
      max_length: 48
      options: []
      placeholder: null
      required: false
      tooltips: null
      type: checkbox
      unit: null
      variable: firecrawl_extract_main_content
    # --- Shared chunking inputs (consumed by the General Chunker node) ---
    - allow_file_extension: null
      allow_file_upload_methods: null
      allowed_file_types: null
      belong_to_node_id: shared
      # Four literal characters (backslash, n, backslash, n), not two newline
      # characters; single quotes keep the backslashes unescaped.
      default_value: '\n\n'
      label: Delimiter
      max_length: 100
      options: []
      placeholder: null
      required: true
      tooltips: >-
        A delimiter is the character used to separate text. \n\n is
        recommended for splitting the original document into large parent
        chunks. You can also use special delimiters defined by yourself.
      type: text-input
      unit: null
      variable: delimiter
    - allow_file_extension: null
      allow_file_upload_methods: null
      allowed_file_types: null
      belong_to_node_id: shared
      default_value: 1024
      label: Maximum chunk length
      max_length: 48
      options: []
      placeholder: null
      required: true
      tooltips: null
      type: number
      unit: characters
      variable: max_chunk_length
    - allow_file_extension: null
      allow_file_upload_methods: null
      allowed_file_types: null
      belong_to_node_id: shared
      default_value: 50
      label: Chunk overlap
      max_length: 48
      options: []
      placeholder: null
      required: false
      tooltips: >-
        Setting the chunk overlap can maintain the semantic relevance between
        them, enhancing the retrieve effect. It is recommended to set 10%–25%
        of the maximum chunk size.
      type: number
      unit: characters
      variable: chunk_overlap
    - allow_file_extension: null
      allow_file_upload_methods: null
      allowed_file_types: null
      belong_to_node_id: shared
      default_value: null
      label: Replace consecutive spaces, newlines and tabs
      max_length: 48
      options: []
      placeholder: null
      required: false
      tooltips: null
      type: checkbox
      unit: null
      variable: replace_consecutive_spaces
    - allow_file_extension: null
      allow_file_upload_methods: null
      allowed_file_types: null
      belong_to_node_id: shared
      default_value: null
      label: Delete all URLs and email addresses
      max_length: 48
      options: []
      placeholder: null
      required: false
      tooltips: null
      type: checkbox
      unit: null
      variable: delete_urls_email