675 lines
20 KiB
YAML
675 lines
20 KiB
YAML
dependencies:
|
||
- current_identifier: null
|
||
type: marketplace
|
||
value:
|
||
plugin_unique_identifier: langgenius/general_chunker:0.0.1@e3da408b7277866404c3f884d599261f9d0b9003ea4ef7eb3b64489bdf39d18b
|
||
- current_identifier: null
|
||
type: marketplace
|
||
value:
|
||
plugin_unique_identifier: langgenius/firecrawl_datasource:0.0.1@f7aed0a26df0e5f4b9555371b5c9fa6db3c7dcf6a46dd1583245697bd90a539a
|
||
- current_identifier: null
|
||
type: marketplace
|
||
value:
|
||
plugin_unique_identifier: langgenius/jina_datasource:0.0.1@cf23afb2c3eeccc5a187763a1947f583f0bb10aa56461e512ac4141bf930d608
|
||
kind: rag_pipeline
|
||
rag_pipeline:
|
||
description: ''
|
||
icon: 📙
|
||
icon_background: ''
|
||
icon_type: emoji
|
||
name: website-crawl-general-economy
|
||
version: 0.1.0
|
||
workflow:
|
||
conversation_variables: []
|
||
environment_variables: []
|
||
features: {}
|
||
graph:
|
||
edges:
|
||
- data:
|
||
isInIteration: false
|
||
isInLoop: false
|
||
sourceType: datasource
|
||
targetType: variable-aggregator
|
||
id: 1752491761974-source-1752565435219-target
|
||
source: '1752491761974'
|
||
sourceHandle: source
|
||
target: '1752565435219'
|
||
targetHandle: target
|
||
type: custom
|
||
zIndex: 0
|
||
- data:
|
||
isInLoop: false
|
||
sourceType: datasource
|
||
targetType: variable-aggregator
|
||
id: 1752565402678-source-1752565435219-target
|
||
source: '1752565402678'
|
||
sourceHandle: source
|
||
target: '1752565435219'
|
||
targetHandle: target
|
||
type: custom
|
||
zIndex: 0
|
||
- data:
|
||
isInIteration: false
|
||
isInLoop: false
|
||
sourceType: variable-aggregator
|
||
targetType: tool
|
||
id: 1752565435219-source-1752569675978-target
|
||
source: '1752565435219'
|
||
sourceHandle: source
|
||
target: '1752569675978'
|
||
targetHandle: target
|
||
type: custom
|
||
zIndex: 0
|
||
- data:
|
||
isInLoop: false
|
||
sourceType: tool
|
||
targetType: knowledge-index
|
||
id: 1752569675978-source-1752477924228-target
|
||
source: '1752569675978'
|
||
sourceHandle: source
|
||
target: '1752477924228'
|
||
targetHandle: target
|
||
type: custom
|
||
zIndex: 0
|
||
nodes:
|
||
- data:
|
||
chunk_structure: text_model
|
||
embedding_model: text-embedding-ada-002
|
||
embedding_model_provider: langgenius/openai/openai
|
||
index_chunk_variable_selector:
|
||
- '1752569675978'
|
||
- result
|
||
indexing_technique: economy
|
||
keyword_number: 10
|
||
retrieval_model:
|
||
score_threshold: 0.5
|
||
score_threshold_enabled: false
|
||
search_method: keyword_search
|
||
top_k: 3
|
||
vector_setting:
|
||
embedding_model_name: text-embedding-ada-002
|
||
embedding_provider_name: langgenius/openai/openai
|
||
selected: true
|
||
title: Knowledge Base
|
||
type: knowledge-index
|
||
height: 114
|
||
id: '1752477924228'
|
||
position:
|
||
x: 2140.4053851189346
|
||
y: 281.3910724383104
|
||
positionAbsolute:
|
||
x: 2140.4053851189346
|
||
y: 281.3910724383104
|
||
selected: true
|
||
sourcePosition: right
|
||
targetPosition: left
|
||
type: custom
|
||
width: 242
|
||
- data:
|
||
datasource_configurations: {}
|
||
datasource_label: Jina Reader
|
||
datasource_name: jina_reader
|
||
datasource_parameters:
|
||
crawl_sub_pages:
|
||
type: mixed
|
||
value: '{{#rag.1752491761974.jina_crawl_sub_pages#}}'
|
||
limit:
|
||
type: variable
|
||
value:
|
||
- rag
|
||
- '1752491761974'
|
||
- jina_limit
|
||
url:
|
||
type: mixed
|
||
value: '{{#rag.1752491761974.jina_url#}}'
|
||
use_sitemap:
|
||
type: mixed
|
||
value: '{{#rag.1752491761974.jina_use_sitemap#}}'
|
||
plugin_id: langgenius/jina_datasource
|
||
provider_name: jinareader
|
||
provider_type: website_crawl
|
||
selected: false
|
||
title: Jina Reader
|
||
type: datasource
|
||
height: 52
|
||
id: '1752491761974'
|
||
position:
|
||
x: 1067.7526055798794
|
||
y: 281.3910724383104
|
||
positionAbsolute:
|
||
x: 1067.7526055798794
|
||
y: 281.3910724383104
|
||
selected: false
|
||
sourcePosition: right
|
||
targetPosition: left
|
||
type: custom
|
||
width: 242
|
||
- data:
|
||
datasource_configurations: {}
|
||
datasource_label: Firecrawl
|
||
datasource_name: crawl
|
||
datasource_parameters:
|
||
crawl_subpages:
|
||
type: mixed
|
||
value: '{{#rag.1752565402678.firecrawl_crawl_sub_pages#}}'
|
||
exclude_paths:
|
||
type: mixed
|
||
value: '{{#rag.1752565402678.firecrawl_exclude_paths#}}'
|
||
include_paths:
|
||
type: mixed
|
||
value: '{{#rag.1752565402678.firecrawl_include_only_paths#}}'
|
||
limit:
|
||
type: variable
|
||
value:
|
||
- rag
|
||
- '1752565402678'
|
||
- firecrawl_limit
|
||
max_depth:
|
||
type: variable
|
||
value:
|
||
- rag
|
||
- '1752565402678'
|
||
- firecrawl_max_depth
|
||
only_main_content:
|
||
type: mixed
|
||
value: '{{#rag.1752565402678.firecrawl_extract_main_content#}}'
|
||
url:
|
||
type: mixed
|
||
value: '{{#rag.1752565402678.firecrawl_url#}}'
|
||
plugin_id: langgenius/firecrawl_datasource
|
||
provider_name: firecrawl
|
||
provider_type: website_crawl
|
||
selected: false
|
||
title: Firecrawl
|
||
type: datasource
|
||
height: 52
|
||
id: '1752565402678'
|
||
position:
|
||
x: 1067.7526055798794
|
||
y: 417.32608398342404
|
||
positionAbsolute:
|
||
x: 1067.7526055798794
|
||
y: 417.32608398342404
|
||
selected: false
|
||
sourcePosition: right
|
||
targetPosition: left
|
||
type: custom
|
||
width: 242
|
||
- data:
|
||
output_type: string
|
||
selected: false
|
||
title: Variable Aggregator
|
||
type: variable-aggregator
|
||
variables:
|
||
- - '1752491761974'
|
||
- content
|
||
- - '1752565402678'
|
||
- content
|
||
height: 129
|
||
id: '1752565435219'
|
||
position:
|
||
x: 1505.4306671642219
|
||
y: 281.3910724383104
|
||
positionAbsolute:
|
||
x: 1505.4306671642219
|
||
y: 281.3910724383104
|
||
selected: false
|
||
sourcePosition: right
|
||
targetPosition: left
|
||
type: custom
|
||
width: 242
|
||
- data:
|
||
is_team_authorization: true
|
||
output_schema:
|
||
properties:
|
||
result:
|
||
description: The result of the general chunk tool.
|
||
properties:
|
||
general_chunks:
|
||
items:
|
||
description: The chunk of the text.
|
||
type: string
|
||
type: array
|
||
type: object
|
||
type: object
|
||
paramSchemas:
|
||
- auto_generate: null
|
||
default: null
|
||
form: llm
|
||
human_description:
|
||
en_US: The text you want to chunk.
|
||
ja_JP: チャンク化したいテキスト。
|
||
pt_BR: O texto que você deseja dividir.
|
||
zh_Hans: 你想要分块的文本。
|
||
label:
|
||
en_US: Input Variable
|
||
ja_JP: 入力変数
|
||
pt_BR: Variável de entrada
|
||
zh_Hans: 输入变量
|
||
llm_description: The text you want to chunk.
|
||
max: null
|
||
min: null
|
||
name: input_variable
|
||
options: []
|
||
placeholder: null
|
||
precision: null
|
||
required: true
|
||
scope: null
|
||
template: null
|
||
type: string
|
||
- auto_generate: null
|
||
default: null
|
||
form: llm
|
||
human_description:
|
||
en_US: The delimiter of the chunks.
|
||
ja_JP: チャンクの区切り記号。
|
||
pt_BR: O delimitador dos pedaços.
|
||
zh_Hans: 块的分隔符。
|
||
label:
|
||
en_US: Delimiter
|
||
ja_JP: 区切り記号
|
||
pt_BR: Delimitador
|
||
zh_Hans: 分隔符
|
||
llm_description: The delimiter of the chunks, the format of the delimiter
|
||
must be a string.
|
||
max: null
|
||
min: null
|
||
name: delimiter
|
||
options: []
|
||
placeholder: null
|
||
precision: null
|
||
required: true
|
||
scope: null
|
||
template: null
|
||
type: string
|
||
- auto_generate: null
|
||
default: null
|
||
form: llm
|
||
human_description:
|
||
en_US: The maximum chunk length.
|
||
ja_JP: 最大長のチャンク。
|
||
pt_BR: O comprimento máximo do bloco
|
||
zh_Hans: 最大块的长度。
|
||
label:
|
||
en_US: Maximum Chunk Length
|
||
ja_JP: チャンク最大長
|
||
pt_BR: O comprimento máximo do bloco
|
||
zh_Hans: 最大块的长度
|
||
llm_description: The maximum chunk length, the format of the chunk size
|
||
must be an integer.
|
||
max: null
|
||
min: null
|
||
name: max_chunk_length
|
||
options: []
|
||
placeholder: null
|
||
precision: null
|
||
required: true
|
||
scope: null
|
||
template: null
|
||
type: number
|
||
- auto_generate: null
|
||
default: null
|
||
form: llm
|
||
human_description:
|
||
en_US: The chunk overlap length.
|
||
ja_JP: チャンクの重複長
|
||
pt_BR: O comprimento de sobreposição dos blocos.
|
||
zh_Hans: 块的重叠长度。
|
||
label:
|
||
en_US: Chunk Overlap Length
|
||
ja_JP: チャンク重複長
|
||
pt_BR: Comprimento de sobreposição dos blocos
|
||
zh_Hans: 块的重叠长度
|
||
llm_description: The chunk overlap length, the format of the chunk overlap
|
||
length must be an integer.
|
||
max: null
|
||
min: null
|
||
name: chunk_overlap_length
|
||
options: []
|
||
placeholder: null
|
||
precision: null
|
||
required: false
|
||
scope: null
|
||
template: null
|
||
type: number
|
||
- auto_generate: null
|
||
default: null
|
||
form: llm
|
||
human_description:
|
||
en_US: Replace consecutive spaces, newlines and tabs
|
||
ja_JP: 連続のスペース、改行、またはタブを置換する
|
||
pt_BR: Substituir espaços, quebras de linha e tabulações consecutivos
|
||
zh_Hans: 替换连续的空格、换行符和制表符
|
||
label:
|
||
en_US: Replace Consecutive Spaces, Newlines and Tabs
|
||
ja_JP: 連続のスペース、改行、またはタブを置換する
|
||
pt_BR: Substituir espaços, quebras de linha e tabulações consecutivos
|
||
zh_Hans: 替换连续的空格、换行符和制表符
|
||
llm_description: Replace consecutive spaces, newlines and tabs, the format
|
||
of the replace must be a boolean.
|
||
max: null
|
||
min: null
|
||
name: replace_consecutive_spaces_newlines_tabs
|
||
options: []
|
||
placeholder: null
|
||
precision: null
|
||
required: false
|
||
scope: null
|
||
template: null
|
||
type: boolean
|
||
- auto_generate: null
|
||
default: null
|
||
form: llm
|
||
human_description:
|
||
en_US: Delete all URLs and email addresses
|
||
ja_JP: すべてのURLとメールアドレスを削除する
|
||
pt_BR: Excluir todos os URLs e endereços de e-mail
|
||
zh_Hans: 删除所有URL和电子邮件地址
|
||
label:
|
||
en_US: Delete All URLs and Email Addresses
|
||
ja_JP: すべてのURLとメールアドレスを削除する
|
||
pt_BR: Excluir todos os URLs e endereços de e-mail
|
||
zh_Hans: 删除所有URL和电子邮件地址
|
||
llm_description: Delete all URLs and email addresses, the format of the
|
||
delete must be a boolean.
|
||
max: null
|
||
min: null
|
||
name: delete_all_urls_and_email_addresses
|
||
options: []
|
||
placeholder: null
|
||
precision: null
|
||
required: false
|
||
scope: null
|
||
template: null
|
||
type: boolean
|
||
params:
|
||
chunk_overlap_length: ''
|
||
delete_all_urls_and_email_addresses: ''
|
||
delimiter: ''
|
||
input_variable: ''
|
||
max_chunk_length: ''
|
||
replace_consecutive_spaces_newlines_tabs: ''
|
||
provider_id: langgenius/general_chunker/general_chunker
|
||
provider_name: langgenius/general_chunker/general_chunker
|
||
provider_type: builtin
|
||
selected: false
|
||
title: General Chunker
|
||
tool_configurations: {}
|
||
tool_description: A tool for general text chunking mode, the chunks retrieved and recalled are the same.
|
||
tool_label: General Chunker
|
||
tool_name: general_chunker
|
||
tool_parameters:
|
||
chunk_overlap_length:
|
||
type: variable
|
||
value:
|
||
- rag
|
||
- shared
|
||
- chunk_overlap
|
||
delete_all_urls_and_email_addresses:
|
||
type: mixed
|
||
value: '{{#rag.shared.delete_urls_email#}}'
|
||
delimiter:
|
||
type: mixed
|
||
value: '{{#rag.shared.delimiter#}}'
|
||
input_variable:
|
||
type: mixed
|
||
value: '{{#1752565435219.output#}}'
|
||
max_chunk_length:
|
||
type: variable
|
||
value:
|
||
- rag
|
||
- shared
|
||
- max_chunk_length
|
||
replace_consecutive_spaces_newlines_tabs:
|
||
type: mixed
|
||
value: '{{#rag.shared.replace_consecutive_spaces#}}'
|
||
type: tool
|
||
height: 52
|
||
id: '1752569675978'
|
||
position:
|
||
x: 1807.4306671642219
|
||
y: 281.3910724383104
|
||
positionAbsolute:
|
||
x: 1807.4306671642219
|
||
y: 281.3910724383104
|
||
sourcePosition: right
|
||
targetPosition: left
|
||
type: custom
|
||
width: 242
|
||
viewport:
|
||
x: -707.721097109337
|
||
y: -93.07807382100896
|
||
zoom: 0.9350632198875476
|
||
rag_pipeline_variables:
|
||
- allow_file_extension: null
|
||
allow_file_upload_methods: null
|
||
allowed_file_types: null
|
||
belong_to_node_id: '1752491761974'
|
||
default_value: null
|
||
label: URL
|
||
max_length: 256
|
||
options: []
|
||
placeholder: https://docs.dify.ai/en/
|
||
required: true
|
||
tooltips: null
|
||
type: text-input
|
||
unit: null
|
||
variable: jina_url
|
||
- allow_file_extension: null
|
||
allow_file_upload_methods: null
|
||
allowed_file_types: null
|
||
belong_to_node_id: '1752491761974'
|
||
default_value: 10
|
||
label: Limit
|
||
max_length: 48
|
||
options: []
|
||
placeholder: null
|
||
required: true
|
||
tooltips: null
|
||
type: number
|
||
unit: null
|
||
variable: jina_limit
|
||
- allow_file_extension: null
|
||
allow_file_upload_methods: null
|
||
allowed_file_types: null
|
||
belong_to_node_id: '1752491761974'
|
||
default_value: null
|
||
label: Crawl sub-pages
|
||
max_length: 48
|
||
options: []
|
||
placeholder: null
|
||
required: false
|
||
tooltips: null
|
||
type: checkbox
|
||
unit: null
|
||
variable: jina_crawl_sub_pages
|
||
- allow_file_extension: null
|
||
allow_file_upload_methods: null
|
||
allowed_file_types: null
|
||
belong_to_node_id: '1752491761974'
|
||
default_value: null
|
||
label: Use sitemap
|
||
max_length: 48
|
||
options: []
|
||
placeholder: null
|
||
required: false
|
||
tooltips: Follow the sitemap to crawl the site. If not, Jina Reader will crawl
|
||
iteratively based on page relevance, yielding fewer but higher-quality pages.
|
||
type: checkbox
|
||
unit: null
|
||
variable: jina_use_sitemap
|
||
- allow_file_extension: null
|
||
allow_file_upload_methods: null
|
||
allowed_file_types: null
|
||
belong_to_node_id: '1752565402678'
|
||
default_value: null
|
||
label: URL
|
||
max_length: 256
|
||
options: []
|
||
placeholder: https://docs.dify.ai/en/
|
||
required: true
|
||
tooltips: null
|
||
type: text-input
|
||
unit: null
|
||
variable: firecrawl_url
|
||
- allow_file_extension: null
|
||
allow_file_upload_methods: null
|
||
allowed_file_types: null
|
||
belong_to_node_id: '1752565402678'
|
||
default_value: true
|
||
label: Crawl sub-pages
|
||
max_length: 48
|
||
options: []
|
||
placeholder: null
|
||
required: false
|
||
tooltips: null
|
||
type: checkbox
|
||
unit: null
|
||
variable: firecrawl_crawl_sub_pages
|
||
- allow_file_extension: null
|
||
allow_file_upload_methods: null
|
||
allowed_file_types: null
|
||
belong_to_node_id: '1752565402678'
|
||
default_value: 10
|
||
label: Limit
|
||
max_length: 48
|
||
options: []
|
||
placeholder: null
|
||
required: true
|
||
tooltips: null
|
||
type: number
|
||
unit: null
|
||
variable: firecrawl_limit
|
||
- allow_file_extension: null
|
||
allow_file_upload_methods: null
|
||
allowed_file_types: null
|
||
belong_to_node_id: '1752565402678'
|
||
default_value: null
|
||
label: Max depth
|
||
max_length: 48
|
||
options: []
|
||
placeholder: ''
|
||
required: false
|
||
tooltips: Maximum depth to crawl relative to the entered URL. Depth 0 just scrapes
|
||
the page of the entered URL; depth 1 scrapes the URL and everything after the entered URL
|
||
+ one /, and so on.
|
||
type: number
|
||
unit: null
|
||
variable: firecrawl_max_depth
|
||
- allow_file_extension: null
|
||
allow_file_upload_methods: null
|
||
allowed_file_types: null
|
||
belong_to_node_id: '1752565402678'
|
||
default_value: null
|
||
label: Exclude paths
|
||
max_length: 256
|
||
options: []
|
||
placeholder: blog/*, /about/*
|
||
required: false
|
||
tooltips: null
|
||
type: text-input
|
||
unit: null
|
||
variable: firecrawl_exclude_paths
|
||
- allow_file_extension: null
|
||
allow_file_upload_methods: null
|
||
allowed_file_types: null
|
||
belong_to_node_id: '1752565402678'
|
||
default_value: null
|
||
label: Include only paths
|
||
max_length: 256
|
||
options: []
|
||
placeholder: articles/*
|
||
required: false
|
||
tooltips: null
|
||
type: text-input
|
||
unit: null
|
||
variable: firecrawl_include_only_paths
|
||
- allow_file_extension: null
|
||
allow_file_upload_methods: null
|
||
allowed_file_types: null
|
||
belong_to_node_id: '1752565402678'
|
||
default_value: null
|
||
label: Extract main content
|
||
max_length: 48
|
||
options: []
|
||
placeholder: null
|
||
required: false
|
||
tooltips: null
|
||
type: checkbox
|
||
unit: null
|
||
variable: firecrawl_extract_main_content
|
||
- allow_file_extension: null
|
||
allow_file_upload_methods: null
|
||
allowed_file_types: null
|
||
belong_to_node_id: shared
|
||
default_value: \n\n
|
||
label: Delimiter
|
||
max_length: 100
|
||
options: []
|
||
placeholder: null
|
||
required: true
|
||
tooltips: A delimiter is the character used to separate text. \n\n is recommended
|
||
for splitting the original document into chunks. You can also use
|
||
special delimiters defined by yourself.
|
||
type: text-input
|
||
unit: null
|
||
variable: delimiter
|
||
- allow_file_extension: null
|
||
allow_file_upload_methods: null
|
||
allowed_file_types: null
|
||
belong_to_node_id: shared
|
||
default_value: 1024
|
||
label: Maximum chunk length
|
||
max_length: 48
|
||
options: []
|
||
placeholder: null
|
||
required: true
|
||
tooltips: null
|
||
type: number
|
||
unit: characters
|
||
variable: max_chunk_length
|
||
- allow_file_extension: null
|
||
allow_file_upload_methods: null
|
||
allowed_file_types: null
|
||
belong_to_node_id: shared
|
||
default_value: 50
|
||
label: Chunk overlap
|
||
max_length: 48
|
||
options: []
|
||
placeholder: null
|
||
required: false
|
||
tooltips: Setting the chunk overlap can maintain the semantic relevance between
|
||
chunks, improving retrieval quality. It is recommended to set it to 10%–25% of the
|
||
maximum chunk size.
|
||
type: number
|
||
unit: characters
|
||
variable: chunk_overlap
|
||
- allow_file_extension: null
|
||
allow_file_upload_methods: null
|
||
allowed_file_types: null
|
||
belong_to_node_id: shared
|
||
default_value: null
|
||
label: Replace consecutive spaces, newlines and tabs
|
||
max_length: 48
|
||
options: []
|
||
placeholder: null
|
||
required: false
|
||
tooltips: null
|
||
type: checkbox
|
||
unit: null
|
||
variable: replace_consecutive_spaces
|
||
- allow_file_extension: null
|
||
allow_file_upload_methods: null
|
||
allowed_file_types: null
|
||
belong_to_node_id: shared
|
||
default_value: null
|
||
label: Delete all URLs and email addresses
|
||
max_length: 48
|
||
options: []
|
||
placeholder: null
|
||
required: false
|
||
tooltips: null
|
||
type: checkbox
|
||
unit: null
|
||
variable: delete_urls_email
|