---
# Dify RAG pipeline export: "website-crawl-parentchild".
#
# Graph topology (see workflow.graph.edges):
#   Jina Reader (1752491761974) --+
#                                 +--> Variable Aggregator (1752565435219)
#   Firecrawl   (1752565402678) --+        |
#                                          v
#                          Parent-child Chunker (1752490343805)
#                                          |
#                                          v
#                              Knowledge Base (1752477924228)

# Marketplace plugins this pipeline declares as dependencies.
dependencies:
  - current_identifier: null
    type: marketplace
    value:
      plugin_unique_identifier: 'langgenius/parentchild_chunker:0.0.1@b1a28a27e33fec442ce494da2a7814edd7eb9d646c81f38bccfcf1133d486e40'
  - current_identifier: null
    type: marketplace
    value:
      plugin_unique_identifier: 'langgenius/firecrawl_datasource:0.0.1@f7aed0a26df0e5f4b9555371b5c9fa6db3c7dcf6a46dd1583245697bd90a539a'
  - current_identifier: null
    type: marketplace
    value:
      plugin_unique_identifier: 'langgenius/jina_datasource:0.0.1@cf23afb2c3eeccc5a187763a1947f583f0bb10aa56461e512ac4141bf930d608'

kind: rag_pipeline

rag_pipeline:
  description: ''
  icon: 📙
  icon_background: ''
  icon_type: emoji
  name: website-crawl-parentchild

# Quoted so it can never be read as anything but a string.
version: '0.1.0'

workflow:
  conversation_variables: []
  environment_variables: []
  features: {}
  graph:
    edges:
      # Parent-child Chunker -> Knowledge Base
      - data:
          isInLoop: false
          sourceType: tool
          targetType: knowledge-index
        id: 1752490343805-source-1752477924228-target
        source: '1752490343805'
        sourceHandle: source
        target: '1752477924228'
        targetHandle: target
        type: custom
        zIndex: 0
      # Jina Reader -> Variable Aggregator
      - data:
          isInIteration: false
          isInLoop: false
          sourceType: datasource
          targetType: variable-aggregator
        id: 1752491761974-source-1752565435219-target
        source: '1752491761974'
        sourceHandle: source
        target: '1752565435219'
        targetHandle: target
        type: custom
        zIndex: 0
      # Variable Aggregator -> Parent-child Chunker
      - data:
          isInIteration: false
          isInLoop: false
          sourceType: variable-aggregator
          targetType: tool
        id: 1752565435219-source-1752490343805-target
        source: '1752565435219'
        sourceHandle: source
        target: '1752490343805'
        targetHandle: target
        type: custom
        zIndex: 0
      # Firecrawl -> Variable Aggregator
      - data:
          isInLoop: false
          sourceType: datasource
          targetType: variable-aggregator
        id: 1752565402678-source-1752565435219-target
        source: '1752565402678'
        sourceHandle: source
        target: '1752565435219'
        targetHandle: target
        type: custom
        zIndex: 0
    nodes:
      # Knowledge Base: indexes the chunker's `result` output.
      - data:
          chunk_structure: hierarchical_model
          embedding_model: text-embedding-ada-002
          embedding_model_provider: langgenius/openai/openai
          index_chunk_variable_selector:
            - '1752490343805'
            - result
          indexing_technique: high_quality
          keyword_number: 10
          retrieval_model:
            score_threshold: 0.5
            score_threshold_enabled: false
            search_method: semantic_search
            top_k: 3
            vector_setting:
              embedding_model_name: text-embedding-ada-002
              embedding_provider_name: langgenius/openai/openai
          selected: false
          title: Knowledge Base
          type: knowledge-index
        height: 114
        id: '1752477924228'
        position:
          x: 2215.5544306817387
          y: 281.3910724383104
        positionAbsolute:
          x: 2215.5544306817387
          y: 281.3910724383104
        selected: false
        sourcePosition: right
        targetPosition: left
        type: custom
        width: 242
      # Parent-child Chunker tool: takes the aggregator output as input_text
      # and reads its other parameters from the shared pipeline variables
      # (see tool_parameters below).
      - data:
          is_team_authorization: true
          output_schema:
            properties:
              result:
                description: Parent child chunks result
                items:
                  type: object
                type: array
            type: object
          # Parameter declarations for the chunker tool, kept verbatim.
          paramSchemas:
            - auto_generate: null
              default: null
              form: llm
              human_description:
                en_US: The text you want to chunk.
                ja_JP: チャンク化したいテキスト。
                pt_BR: O texto que você deseja dividir.
                zh_Hans: 你想要分块的文本。
              label:
                en_US: Input text
                ja_JP: 入力テキスト
                pt_BR: Texto de entrada
                zh_Hans: 输入文本
              llm_description: The text you want to chunk.
              max: null
              min: null
              name: input_text
              options: []
              placeholder: null
              precision: null
              required: true
              scope: null
              template: null
              type: string
            - auto_generate: null
              default: 1024
              form: llm
              human_description:
                en_US: Maximum length for chunking
                ja_JP: チャンク分割の最大長
                pt_BR: Comprimento máximo para divisão
                zh_Hans: 用于分块的最大长度
              label:
                en_US: Maximum Length
                ja_JP: 最大長
                pt_BR: Comprimento Máximo
                zh_Hans: 最大长度
              llm_description: Maximum length allowed per chunk
              max: null
              min: null
              name: max_length
              options: []
              placeholder: null
              precision: null
              required: false
              scope: null
              template: null
              type: number
            - auto_generate: null
              # NOTE(review): this export holds a single space here; newline
              # characters may have been lost when the file was flattened.
              # Confirm against the plugin manifest.
              default: ' '
              form: llm
              human_description:
                en_US: Separator used for chunking
                ja_JP: チャンク分割に使用する区切り文字
                pt_BR: Separador usado para divisão
                zh_Hans: 用于分块的分隔符
              label:
                en_US: Chunk Separator
                ja_JP: チャンク区切り文字
                pt_BR: Separador de Divisão
                zh_Hans: 分块分隔符
              llm_description: The separator used to split chunks
              max: null
              min: null
              name: separator
              options: []
              placeholder: null
              precision: null
              required: false
              scope: null
              template: null
              type: string
            - auto_generate: null
              default: 512
              form: llm
              human_description:
                en_US: Maximum length for subchunking
                ja_JP: サブチャンク分割の最大長
                pt_BR: Comprimento máximo para subdivisão
                zh_Hans: 用于子分块的最大长度
              label:
                en_US: Subchunk Maximum Length
                ja_JP: サブチャンク最大長
                pt_BR: Comprimento Máximo de Subdivisão
                zh_Hans: 子分块最大长度
              llm_description: Maximum length allowed per subchunk
              max: null
              min: null
              name: subchunk_max_length
              options: []
              placeholder: null
              precision: null
              required: false
              scope: null
              template: null
              type: number
            - auto_generate: null
              # NOTE(review): same caveat as the chunk separator default;
              # confirm the exact whitespace against the plugin manifest.
              default: '. '
              form: llm
              human_description:
                en_US: Separator used for subchunking
                ja_JP: サブチャンク分割に使用する区切り文字
                pt_BR: Separador usado para subdivisão
                zh_Hans: 用于子分块的分隔符
              label:
                en_US: Subchunk Separator
                ja_JP: サブチャンキング用セパレーター
                pt_BR: Separador de Subdivisão
                zh_Hans: 子分块分隔符
              llm_description: The separator used to split subchunks
              max: null
              min: null
              name: subchunk_separator
              options: []
              placeholder: null
              precision: null
              required: false
              scope: null
              template: null
              type: string
            - auto_generate: null
              default: paragraph
              form: llm
              human_description:
                en_US: >-
                  Split text into paragraphs based on separator and maximum
                  chunk length, using split text as parent block or entire
                  document as parent block and directly retrieve.
                # The line break below folds to the single space that the
                # stored value contains at this position.
                ja_JP: >-
                  セパレーターと最大チャンク長に基づいてテキストを段落に分割し、分割されたテキスト
                  を親ブロックとして使用するか、文書全体を親ブロックとして使用して直接取得します。
                pt_BR: >-
                  Dividir texto em parágrafos com base no separador e no
                  comprimento máximo do bloco, usando o texto dividido como
                  bloco pai ou documento completo como bloco pai e diretamente
                  recuperá-lo.
                zh_Hans: 根据分隔符和最大块长度将文本拆分为段落,使用拆分文本作为检索的父块或整个文档用作父块并直接检索。
              label:
                en_US: Parent Mode
                ja_JP: 親子モード
                pt_BR: Modo Pai
                zh_Hans: 父块模式
              llm_description: >-
                Split text into paragraphs based on separator and maximum
                chunk length, using split text as parent block or entire
                document as parent block and directly retrieve.
              max: null
              min: null
              name: parent_mode
              options:
                - icon: ''
                  label:
                    en_US: Paragraph
                    ja_JP: 段落
                    pt_BR: Parágrafo
                    zh_Hans: 段落
                  value: paragraph
                - icon: ''
                  label:
                    en_US: Full Document
                    ja_JP: 全文
                    pt_BR: Documento Completo
                    zh_Hans: 全文
                  value: full_doc
              placeholder: null
              precision: null
              required: true
              scope: null
              template: null
              type: select
            - auto_generate: null
              default: 0
              form: llm
              human_description:
                en_US: Whether to remove extra spaces in the text
                ja_JP: テキスト内の余分なスペースを削除するかどうか
                pt_BR: Se deve remover espaços extras no texto
                zh_Hans: 是否移除文本中的多余空格
              label:
                en_US: Remove Extra Spaces
                ja_JP: 余分なスペースを削除
                pt_BR: Remover Espaços Extras
                zh_Hans: 移除多余空格
              llm_description: Whether to remove extra spaces in the text
              max: null
              min: null
              name: remove_extra_spaces
              options: []
              placeholder: null
              precision: null
              required: false
              scope: null
              template: null
              type: boolean
            - auto_generate: null
              default: 0
              form: llm
              human_description:
                en_US: Whether to remove URLs and emails in the text
                ja_JP: テキスト内のURLやメールアドレスを削除するかどうか
                pt_BR: Se deve remover URLs e e-mails no texto
                zh_Hans: 是否移除文本中的URL和电子邮件地址
              label:
                en_US: Remove URLs and Emails
                ja_JP: URLとメールアドレスを削除
                pt_BR: Remover URLs e E-mails
                zh_Hans: 移除URL和电子邮件地址
              llm_description: Whether to remove URLs and emails in the text
              max: null
              min: null
              name: remove_urls_emails
              options: []
              placeholder: null
              precision: null
              required: false
              scope: null
              template: null
              type: boolean
          params:
            input_text: ''
            max_length: ''
            parent_mode: ''
            remove_extra_spaces: ''
            remove_urls_emails: ''
            separator: ''
            subchunk_max_length: ''
            subchunk_separator: ''
          provider_id: langgenius/parentchild_chunker/parentchild_chunker
          provider_name: langgenius/parentchild_chunker/parentchild_chunker
          provider_type: builtin
          selected: true
          title: Parent-child Chunker
          tool_configurations: {}
          tool_description: Parent-child Chunk Structure
          tool_label: Parent-child Chunker
          tool_name: parentchild_chunker
          # Bindings: input_text comes from the Variable Aggregator node;
          # everything else references a `shared` entry in
          # workflow.rag_pipeline_variables.
          tool_parameters:
            input_text:
              type: mixed
              value: '{{#1752565435219.output#}}'
            max_length:
              type: variable
              value:
                - rag
                - shared
                - max_chunk_length
            parent_mode:
              type: variable
              value:
                - rag
                - shared
                - parent_mode
            remove_extra_spaces:
              type: mixed
              value: '{{#rag.shared.replace_consecutive_spaces#}}'
            remove_urls_emails:
              type: mixed
              value: '{{#rag.shared.delete_urls_email#}}'
            separator:
              type: mixed
              value: '{{#rag.shared.delimiter#}}'
            subchunk_max_length:
              type: variable
              value:
                - rag
                - shared
                - child_max_chunk_length
            subchunk_separator:
              type: mixed
              value: '{{#rag.shared.child_delimiter#}}'
          type: tool
        height: 52
        id: '1752490343805'
        position:
          x: 1853.5260563244174
          y: 281.3910724383104
        positionAbsolute:
          x: 1853.5260563244174
          y: 281.3910724383104
        selected: true
        sourcePosition: right
        targetPosition: left
        type: custom
        width: 242
      # Jina Reader datasource: parameters bound to the jina_* variables.
      - data:
          datasource_configurations: {}
          datasource_label: Jina Reader
          datasource_name: jina_reader
          datasource_parameters:
            crawl_sub_pages:
              type: mixed
              value: '{{#rag.1752491761974.jina_crawl_sub_pages#}}'
            limit:
              type: variable
              value:
                - rag
                - '1752491761974'
                - jina_limit
            url:
              type: mixed
              value: '{{#rag.1752491761974.jina_url#}}'
            use_sitemap:
              type: mixed
              value: '{{#rag.1752491761974.jina_use_sitemap#}}'
          plugin_id: langgenius/jina_datasource
          provider_name: jinareader
          provider_type: website_crawl
          selected: false
          title: Jina Reader
          type: datasource
        height: 52
        id: '1752491761974'
        position:
          x: 1067.7526055798794
          y: 281.3910724383104
        positionAbsolute:
          x: 1067.7526055798794
          y: 281.3910724383104
        selected: false
        sourcePosition: right
        targetPosition: left
        type: custom
        width: 242
      # Firecrawl datasource: parameters bound to the firecrawl_* variables.
      - data:
          datasource_configurations: {}
          datasource_label: Firecrawl
          datasource_name: crawl
          datasource_parameters:
            crawl_subpages:
              type: mixed
              value: '{{#rag.1752565402678.firecrawl_crawl_sub_pages#}}'
            exclude_paths:
              type: mixed
              value: '{{#rag.1752565402678.firecrawl_exclude_paths#}}'
            include_paths:
              type: mixed
              value: '{{#rag.1752565402678.firecrawl_include_only_paths#}}'
            limit:
              type: variable
              value:
                - rag
                - '1752565402678'
                - firecrawl_limit
            max_depth:
              type: variable
              value:
                - rag
                - '1752565402678'
                - firecrawl_max_depth
            only_main_content:
              type: mixed
              value: '{{#rag.1752565402678.firecrawl_extract_main_content#}}'
            url:
              type: mixed
              value: '{{#rag.1752565402678.firecrawl_url#}}'
          plugin_id: langgenius/firecrawl_datasource
          provider_name: firecrawl
          provider_type: website_crawl
          selected: false
          title: Firecrawl
          type: datasource
        height: 52
        id: '1752565402678'
        position:
          x: 1067.7526055798794
          y: 417.32608398342404
        positionAbsolute:
          x: 1067.7526055798794
          y: 417.32608398342404
        selected: false
        sourcePosition: right
        targetPosition: left
        type: custom
        width: 242
      # Variable Aggregator: selects the `content` output of either
      # datasource node and exposes it as a single string output.
      - data:
          output_type: string
          selected: false
          title: Variable Aggregator
          type: variable-aggregator
          variables:
            - - '1752491761974'
              - content
            - - '1752565402678'
              - content
        height: 129
        id: '1752565435219'
        position:
          x: 1505.4306671642219
          y: 281.3910724383104
        positionAbsolute:
          x: 1505.4306671642219
          y: 281.3910724383104
        selected: false
        sourcePosition: right
        targetPosition: left
        type: custom
        width: 242
    viewport:
      x: -826.1791044466438
      y: -71.91725474841303
      zoom: 0.9980166672552107
  # User-facing input variables. `belong_to_node_id` is either a datasource
  # node id or `shared`; the node parameters above reference them as
  # rag.<belong_to_node_id>.<variable>.
  rag_pipeline_variables:
    # --- Jina Reader inputs ---
    - allow_file_extension: null
      allow_file_upload_methods: null
      allowed_file_types: null
      belong_to_node_id: '1752491761974'
      default_value: null
      label: URL
      max_length: 256
      options: []
      placeholder: 'https://docs.dify.ai/en/'
      required: true
      tooltips: null
      type: text-input
      unit: null
      variable: jina_url
    - allow_file_extension: null
      allow_file_upload_methods: null
      allowed_file_types: null
      belong_to_node_id: '1752491761974'
      default_value: 10
      label: Limit
      max_length: 48
      options: []
      placeholder: null
      required: true
      tooltips: null
      type: number
      unit: null
      variable: jina_limit
    - allow_file_extension: null
      allow_file_upload_methods: null
      allowed_file_types: null
      belong_to_node_id: '1752491761974'
      default_value: null
      label: Crawl sub-pages
      max_length: 48
      options: []
      placeholder: null
      required: false
      tooltips: null
      type: checkbox
      unit: null
      variable: jina_crawl_sub_pages
    - allow_file_extension: null
      allow_file_upload_methods: null
      allowed_file_types: null
      belong_to_node_id: '1752491761974'
      default_value: null
      label: Use sitemap
      max_length: 48
      options: []
      placeholder: null
      required: false
      tooltips: >-
        Follow the sitemap to crawl the site. If not, Jina Reader will crawl
        iteratively based on page relevance, yielding fewer but higher-quality
        pages.
      type: checkbox
      unit: null
      variable: jina_use_sitemap
    # --- Firecrawl inputs ---
    - allow_file_extension: null
      allow_file_upload_methods: null
      allowed_file_types: null
      belong_to_node_id: '1752565402678'
      default_value: null
      label: URL
      max_length: 256
      options: []
      placeholder: 'https://docs.dify.ai/en/'
      required: true
      tooltips: null
      type: text-input
      unit: null
      variable: firecrawl_url
    - allow_file_extension: null
      allow_file_upload_methods: null
      allowed_file_types: null
      belong_to_node_id: '1752565402678'
      default_value: true
      label: Crawl sub-pages
      max_length: 48
      options: []
      placeholder: null
      required: false
      tooltips: null
      type: checkbox
      unit: null
      variable: firecrawl_crawl_sub_pages
    - allow_file_extension: null
      allow_file_upload_methods: null
      allowed_file_types: null
      belong_to_node_id: '1752565402678'
      default_value: 10
      label: Limit
      max_length: 48
      options: []
      placeholder: null
      required: true
      tooltips: null
      type: number
      unit: null
      variable: firecrawl_limit
    - allow_file_extension: null
      allow_file_upload_methods: null
      allowed_file_types: null
      belong_to_node_id: '1752565402678'
      default_value: null
      label: Max depth
      max_length: 48
      options: []
      placeholder: ''
      required: false
      tooltips: >-
        Maximum depth to crawl relative to the entered URL. Depth 0 just
        scrapes the page of the entered url, depth 1 scrapes the url and
        everything after enteredURL + one /, and so on.
      type: number
      unit: null
      variable: firecrawl_max_depth
    - allow_file_extension: null
      allow_file_upload_methods: null
      allowed_file_types: null
      belong_to_node_id: '1752565402678'
      default_value: null
      label: Exclude paths
      max_length: 256
      options: []
      placeholder: 'blog/*, /about/*'
      required: false
      tooltips: null
      type: text-input
      unit: null
      variable: firecrawl_exclude_paths
    - allow_file_extension: null
      allow_file_upload_methods: null
      allowed_file_types: null
      belong_to_node_id: '1752565402678'
      default_value: null
      label: Include only paths
      max_length: 256
      options: []
      placeholder: 'articles/*'
      required: false
      tooltips: null
      type: text-input
      unit: null
      variable: firecrawl_include_only_paths
    - allow_file_extension: null
      allow_file_upload_methods: null
      allowed_file_types: null
      belong_to_node_id: '1752565402678'
      default_value: null
      # Human-readable label (was the raw variable name), matching the
      # sibling Firecrawl inputs.
      label: Extract main content
      max_length: 48
      options: []
      placeholder: null
      required: false
      tooltips: null
      type: checkbox
      unit: null
      variable: firecrawl_extract_main_content
    # --- Shared chunking inputs (consumed by the Parent-child Chunker) ---
    - allow_file_extension: null
      allow_file_upload_methods: null
      allowed_file_types: null
      belong_to_node_id: shared
      # Single-quoted: the value is the literal characters backslash + n
      # (twice), exactly as the unquoted form parsed.
      default_value: '\n\n'
      label: Delimiter
      max_length: 100
      options: []
      placeholder: null
      required: true
      tooltips: >-
        A delimiter is the character used to separate text. \n\n is
        recommended for splitting the original document into large parent
        chunks. You can also use special delimiters defined by yourself.
      type: text-input
      unit: null
      variable: delimiter
    - allow_file_extension: null
      allow_file_upload_methods: null
      allowed_file_types: null
      belong_to_node_id: shared
      default_value: 1024
      label: Maximum chunk length
      max_length: 48
      options: []
      placeholder: null
      required: true
      tooltips: null
      type: number
      unit: characters
      variable: max_chunk_length
    - allow_file_extension: null
      allow_file_upload_methods: null
      allowed_file_types: null
      belong_to_node_id: shared
      # Literal backslash + n, as with the parent delimiter above.
      default_value: '\n'
      label: Child delimiter
      max_length: 199
      options: []
      placeholder: null
      required: true
      # Tooltip now describes the child delimiter; it previously repeated
      # the parent-delimiter text recommending \n\n for parent chunks.
      tooltips: >-
        A delimiter is the character used to separate text. \n is
        recommended for splitting parent chunks into small child chunks.
        You can also use special delimiters defined by yourself.
      type: text-input
      unit: null
      variable: child_delimiter
    - allow_file_extension: null
      allow_file_upload_methods: null
      allowed_file_types: null
      belong_to_node_id: shared
      default_value: 512
      label: Child max chunk length
      max_length: 48
      options: []
      placeholder: null
      required: true
      tooltips: null
      type: number
      unit: characters
      variable: child_max_chunk_length
    - allow_file_extension: null
      allow_file_upload_methods: null
      allowed_file_types: null
      belong_to_node_id: shared
      default_value: paragraph
      label: Parent mode
      max_length: 48
      options:
        - full_doc
        - paragraph
      placeholder: null
      required: true
      tooltips: null
      type: select
      unit: null
      variable: parent_mode
    - allow_file_extension: null
      allow_file_upload_methods: null
      allowed_file_types: null
      belong_to_node_id: shared
      default_value: null
      label: Replace consecutive spaces, newlines and tabs
      max_length: 48
      options: []
      placeholder: null
      required: false
      tooltips: null
      type: checkbox
      unit: null
      variable: replace_consecutive_spaces
    - allow_file_extension: null
      allow_file_upload_methods: null
      allowed_file_types: null
      belong_to_node_id: shared
      default_value: null
      label: Delete all URLs and email addresses
      max_length: 48
      options: []
      placeholder: null
      required: false
      tooltips: null
      type: checkbox
      unit: null
      variable: delete_urls_email