This commit is contained in:
2025-12-01 17:21:38 +08:00
parent 32fee2b8ab
commit fab8c13cb3
7511 changed files with 996300 additions and 0 deletions

View File

@@ -0,0 +1,22 @@
from collections.abc import Mapping
from typing import Any
from pydantic import BaseModel
class DatasourceNodeRunApiEntity(BaseModel):
    """Pydantic model carrying the fields of a datasource node run request."""

    pipeline_id: str
    node_id: str
    inputs: dict[str, Any]
    datasource_type: str
    # Optional; defaults to None when the caller does not supply it.
    credential_id: str | None = None
    is_published: bool
class PipelineRunApiEntity(BaseModel):
    """Pydantic model carrying the fields of a pipeline run request."""

    inputs: Mapping[str, Any]
    datasource_type: str
    # One mapping per datasource entry; the schema of each mapping is not defined here.
    datasource_info_list: list[Mapping[str, Any]]
    start_node_id: str
    is_published: bool
    response_mode: str

View File

@@ -0,0 +1,115 @@
from collections.abc import Mapping
from typing import Any, Union
from configs import dify_config
from core.app.apps.pipeline.pipeline_generator import PipelineGenerator
from core.app.entities.app_invoke_entities import InvokeFrom
from extensions.ext_database import db
from models.dataset import Document, Pipeline
from models.model import Account, App, EndUser
from models.workflow import Workflow
from services.rag_pipeline.rag_pipeline import RagPipelineService
class PipelineGenerateService:
    """Service helpers that resolve a pipeline's workflow and hand it to PipelineGenerator."""

    @classmethod
    def generate(
        cls,
        pipeline: Pipeline,
        user: Union[Account, EndUser],
        args: Mapping[str, Any],
        invoke_from: InvokeFrom,
        streaming: bool = True,
    ):
        """
        Run the pipeline and return its output converted to an event stream.

        When ``args`` carries a truthy ``original_document_id``, that document is
        first marked as waiting via :meth:`update_document_status`.

        :param pipeline: pipeline to run
        :param user: account or end user on whose behalf the run happens
        :param args: run arguments, forwarded unchanged to the generator
        :param invoke_from: selects the draft (DEBUGGER) or published workflow
        :param streaming: forwarded to the generator
        :return: result of PipelineGenerator.convert_to_event_stream
        :raises ValueError: if the required workflow does not exist
        """
        workflow = cls._get_workflow(pipeline, invoke_from)

        original_document_id = args.get("original_document_id")
        if original_document_id:
            # update document status to waiting
            cls.update_document_status(original_document_id)

        generated = PipelineGenerator().generate(
            pipeline=pipeline,
            workflow=workflow,
            user=user,
            args=args,
            invoke_from=invoke_from,
            streaming=streaming,
            call_depth=0,
            workflow_thread_pool_id=None,
        )
        return PipelineGenerator.convert_to_event_stream(generated)

    @staticmethod
    def _get_max_active_requests(app_model: App) -> int:
        """Return the app's own max_active_requests, or the configured default when it is None."""
        configured = app_model.max_active_requests
        if configured is not None:
            return configured
        return int(dify_config.APP_MAX_ACTIVE_REQUESTS)

    @classmethod
    def generate_single_iteration(
        cls, pipeline: Pipeline, user: Account, node_id: str, args: Any, streaming: bool = True
    ):
        """Run a single iteration node against the draft workflow and return the event stream."""
        draft_workflow = cls._get_workflow(pipeline, InvokeFrom.DEBUGGER)
        generated = PipelineGenerator().single_iteration_generate(
            pipeline=pipeline,
            workflow=draft_workflow,
            node_id=node_id,
            user=user,
            args=args,
            streaming=streaming,
        )
        return PipelineGenerator.convert_to_event_stream(generated)

    @classmethod
    def generate_single_loop(cls, pipeline: Pipeline, user: Account, node_id: str, args: Any, streaming: bool = True):
        """Run a single loop node against the draft workflow and return the event stream."""
        draft_workflow = cls._get_workflow(pipeline, InvokeFrom.DEBUGGER)
        generated = PipelineGenerator().single_loop_generate(
            pipeline=pipeline,
            workflow=draft_workflow,
            node_id=node_id,
            user=user,
            args=args,
            streaming=streaming,
        )
        return PipelineGenerator.convert_to_event_stream(generated)

    @classmethod
    def _get_workflow(cls, pipeline: Pipeline, invoke_from: InvokeFrom) -> Workflow:
        """
        Fetch the workflow to execute for the pipeline.

        :param pipeline: pipeline whose workflow is needed
        :param invoke_from: DEBUGGER selects the draft workflow, anything else the published one
        :return: the resolved workflow
        :raises ValueError: if the selected workflow does not exist
        """
        service = RagPipelineService()
        if invoke_from == InvokeFrom.DEBUGGER:
            workflow = service.get_draft_workflow(pipeline=pipeline)
            missing_message = "Workflow not initialized"
        else:
            workflow = service.get_published_workflow(pipeline=pipeline)
            missing_message = "Workflow not published"

        if not workflow:
            raise ValueError(missing_message)
        return workflow

    @classmethod
    def update_document_status(cls, document_id: str):
        """
        Set the document's indexing status to "waiting" and commit.

        Does nothing when no document with that id is found.

        :param document_id: document id
        """
        document = db.session.query(Document).where(Document.id == document_id).first()
        if not document:
            return
        document.indexing_status = "waiting"
        db.session.add(document)
        db.session.commit()

View File

@@ -0,0 +1,63 @@
import json
from os import path
from pathlib import Path
from flask import current_app
from services.rag_pipeline.pipeline_template.pipeline_template_base import PipelineTemplateRetrievalBase
from services.rag_pipeline.pipeline_template.pipeline_template_type import PipelineTemplateType
class BuiltInPipelineTemplateRetrieval(PipelineTemplateRetrievalBase):
    """
    Serve pipeline templates from the bundled file constants/pipeline_templates.json
    """

    # Parsed JSON document, stored on the class after it has been read.
    builtin_data: dict | None = None

    def get_type(self) -> str:
        return PipelineTemplateType.BUILTIN

    def get_pipeline_templates(self, language: str) -> dict:
        return self.fetch_pipeline_templates_from_builtin(language)

    def get_pipeline_template_detail(self, template_id: str):
        return self.fetch_pipeline_template_detail_from_builtin(template_id)

    @classmethod
    def _get_builtin_data(cls) -> dict:
        """
        Return the parsed built-in template document, reading the file if needed.

        A falsy cached value (None or an empty document) is not treated as cached,
        so the file is read again on the next call in that case.

        :return: parsed document, or an empty dict when the parsed value is falsy
        """
        cached = cls.builtin_data
        if cached:
            return cached

        template_file = Path(path.join(current_app.root_path, "constants", "pipeline_templates.json"))
        cls.builtin_data = json.loads(template_file.read_text(encoding="utf-8"))
        return cls.builtin_data or {}

    @classmethod
    def fetch_pipeline_templates_from_builtin(cls, language: str) -> dict:
        """
        Look up the templates stored for a language.

        :param language: language key inside the "pipeline_templates" mapping
        :return: the entry for that language, or an empty dict when absent
        """
        document: dict[str, dict[str, dict]] = cls._get_builtin_data()
        templates_section = document.get("pipeline_templates", {})
        return templates_section.get(language, {})

    @classmethod
    def fetch_pipeline_template_detail_from_builtin(cls, template_id: str) -> dict | None:
        """
        Look up the detail entry for one template.

        :param template_id: Template ID
        :return: the entry for that id, or None when absent
        """
        # NOTE(review): this reads the same "pipeline_templates" mapping that
        # fetch_pipeline_templates_from_builtin indexes by language - confirm the
        # JSON layout really keys details by template id at this level.
        document: dict[str, dict[str, dict]] = cls._get_builtin_data()
        templates_section = document.get("pipeline_templates", {})
        return templates_section.get(template_id)

View File

@@ -0,0 +1,80 @@
import yaml
from extensions.ext_database import db
from libs.login import current_account_with_tenant
from models.dataset import PipelineCustomizedTemplate
from services.rag_pipeline.pipeline_template.pipeline_template_base import PipelineTemplateRetrievalBase
from services.rag_pipeline.pipeline_template.pipeline_template_type import PipelineTemplateType
class CustomizedPipelineTemplateRetrieval(PipelineTemplateRetrievalBase):
    """
    Retrieve tenant-customized pipeline templates from the database
    """

    def get_pipeline_templates(self, language: str) -> dict:
        # Only the tenant id of the current account is needed here.
        _, tenant_id = current_account_with_tenant()
        return self.fetch_pipeline_templates_from_customized(tenant_id=tenant_id, language=language)

    def get_pipeline_template_detail(self, template_id: str):
        return self.fetch_pipeline_template_detail_from_db(template_id)

    def get_type(self) -> str:
        return PipelineTemplateType.CUSTOMIZED

    @classmethod
    def fetch_pipeline_templates_from_customized(cls, tenant_id: str, language: str) -> dict:
        """
        List the customized templates of a tenant for one language.

        Rows are ordered by position ascending, then creation time descending.

        :param tenant_id: tenant id
        :param language: language
        :return: dict with a single "pipeline_templates" list of summary dicts
        """
        template_query = (
            db.session.query(PipelineCustomizedTemplate)
            .where(PipelineCustomizedTemplate.tenant_id == tenant_id, PipelineCustomizedTemplate.language == language)
            .order_by(PipelineCustomizedTemplate.position.asc(), PipelineCustomizedTemplate.created_at.desc())
        )
        summaries = [
            {
                "id": template.id,
                "name": template.name,
                "description": template.description,
                "icon": template.icon,
                "position": template.position,
                "chunk_structure": template.chunk_structure,
            }
            for template in template_query.all()
        ]
        return {"pipeline_templates": summaries}

    @classmethod
    def fetch_pipeline_template_detail_from_db(cls, template_id: str) -> dict | None:
        """
        Load one customized template and expose its DSL and workflow graph.

        :param template_id: Template ID
        :return: detail dict, or None when no row has that id
        """
        template = (
            db.session.query(PipelineCustomizedTemplate).where(PipelineCustomizedTemplate.id == template_id).first()
        )
        if not template:
            return None

        # The graph is taken from the "workflow" section of the stored YAML DSL.
        dsl = yaml.safe_load(template.yaml_content)
        workflow_section = dsl.get("workflow", {})
        graph = workflow_section.get("graph", {})

        return {
            "id": template.id,
            "name": template.name,
            "icon_info": template.icon,
            "description": template.description,
            "chunk_structure": template.chunk_structure,
            "export_data": template.yaml_content,
            "graph": graph,
            "created_by": template.created_user_name,
        }

View File

@@ -0,0 +1,77 @@
import yaml
from extensions.ext_database import db
from models.dataset import PipelineBuiltInTemplate
from services.rag_pipeline.pipeline_template.pipeline_template_base import PipelineTemplateRetrievalBase
from services.rag_pipeline.pipeline_template.pipeline_template_type import PipelineTemplateType
class DatabasePipelineTemplateRetrieval(PipelineTemplateRetrievalBase):
    """
    Retrieve built-in pipeline templates stored in the database
    """

    def get_pipeline_templates(self, language: str) -> dict:
        return self.fetch_pipeline_templates_from_db(language)

    def get_pipeline_template_detail(self, template_id: str):
        return self.fetch_pipeline_template_detail_from_db(template_id)

    def get_type(self) -> str:
        return PipelineTemplateType.DATABASE

    @classmethod
    def fetch_pipeline_templates_from_db(cls, language: str) -> dict:
        """
        List the built-in templates stored for one language.

        :param language: language
        :return: dict with a single "pipeline_templates" list of summary dicts
        """
        templates: list[PipelineBuiltInTemplate] = (
            db.session.query(PipelineBuiltInTemplate).where(PipelineBuiltInTemplate.language == language).all()
        )
        summaries = [
            {
                "id": template.id,
                "name": template.name,
                "description": template.description,
                "icon": template.icon,
                "copyright": template.copyright,
                "privacy_policy": template.privacy_policy,
                "position": template.position,
                "chunk_structure": template.chunk_structure,
            }
            for template in templates
        ]
        return {"pipeline_templates": summaries}

    @classmethod
    def fetch_pipeline_template_detail_from_db(cls, template_id: str) -> dict | None:
        """
        Load one built-in template and expose its DSL and workflow graph.

        :param template_id: Template ID
        :return: detail dict, or None when no row has that id
        """
        template = db.session.query(PipelineBuiltInTemplate).where(PipelineBuiltInTemplate.id == template_id).first()
        if not template:
            return None

        # The graph is taken from the "workflow" section of the stored YAML DSL.
        dsl = yaml.safe_load(template.yaml_content)
        workflow_section = dsl.get("workflow", {})
        graph = workflow_section.get("graph", {})

        return {
            "id": template.id,
            "name": template.name,
            "icon_info": template.icon,
            "description": template.description,
            "chunk_structure": template.chunk_structure,
            "export_data": template.yaml_content,
            "graph": graph,
        }

View File

@@ -0,0 +1,17 @@
from abc import ABC, abstractmethod
class PipelineTemplateRetrievalBase(ABC):
"""Interface for pipeline template retrieval."""
@abstractmethod
def get_pipeline_templates(self, language: str) -> dict:
raise NotImplementedError
@abstractmethod
def get_pipeline_template_detail(self, template_id: str) -> dict | None:
raise NotImplementedError
@abstractmethod
def get_type(self) -> str:
raise NotImplementedError

View File

@@ -0,0 +1,26 @@
from services.rag_pipeline.pipeline_template.built_in.built_in_retrieval import BuiltInPipelineTemplateRetrieval
from services.rag_pipeline.pipeline_template.customized.customized_retrieval import CustomizedPipelineTemplateRetrieval
from services.rag_pipeline.pipeline_template.database.database_retrieval import DatabasePipelineTemplateRetrieval
from services.rag_pipeline.pipeline_template.pipeline_template_base import PipelineTemplateRetrievalBase
from services.rag_pipeline.pipeline_template.pipeline_template_type import PipelineTemplateType
from services.rag_pipeline.pipeline_template.remote.remote_retrieval import RemotePipelineTemplateRetrieval
class PipelineTemplateRetrievalFactory:
@staticmethod
def get_pipeline_template_factory(mode: str) -> type[PipelineTemplateRetrievalBase]:
match mode:
case PipelineTemplateType.REMOTE:
return RemotePipelineTemplateRetrieval
case PipelineTemplateType.CUSTOMIZED:
return CustomizedPipelineTemplateRetrieval
case PipelineTemplateType.DATABASE:
return DatabasePipelineTemplateRetrieval
case PipelineTemplateType.BUILTIN:
return BuiltInPipelineTemplateRetrieval
case _:
raise ValueError(f"invalid fetch recommended apps mode: {mode}")
@staticmethod
def get_built_in_pipeline_template_retrieval():
return BuiltInPipelineTemplateRetrieval

View File

@@ -0,0 +1,8 @@
from enum import StrEnum
class PipelineTemplateType(StrEnum):
    """Identifiers of the sources pipeline templates can be retrieved from."""

    REMOTE = "remote"
    DATABASE = "database"
    CUSTOMIZED = "customized"
    BUILTIN = "builtin"

View File

@@ -0,0 +1,67 @@
import logging
import httpx
from configs import dify_config
from services.rag_pipeline.pipeline_template.database.database_retrieval import DatabasePipelineTemplateRetrieval
from services.rag_pipeline.pipeline_template.pipeline_template_base import PipelineTemplateRetrievalBase
from services.rag_pipeline.pipeline_template.pipeline_template_type import PipelineTemplateType
# Module-level logger; used below to report failed remote fetches.
logger = logging.getLogger(__name__)
class RemotePipelineTemplateRetrieval(PipelineTemplateRetrievalBase):
    """
    Retrieve pipeline templates from the dify official remote service,
    switching to the database when the remote fetch raises.
    """

    def get_pipeline_template_detail(self, template_id: str):
        """
        Fetch one template's detail remotely, falling back to the database on any exception.

        A non-200 remote response is not an exception here: it yields None without a fallback.
        """
        try:
            result = self.fetch_pipeline_template_detail_from_dify_official(template_id)
        except Exception as e:
            logger.warning("fetch pipeline template detail from dify official failed: %r, switch to database.", e)
            result = DatabasePipelineTemplateRetrieval.fetch_pipeline_template_detail_from_db(template_id)
        return result

    def get_pipeline_templates(self, language: str) -> dict:
        """Fetch the template list remotely, falling back to the database on any exception."""
        try:
            result = self.fetch_pipeline_templates_from_dify_official(language)
        except Exception as e:
            logger.warning("fetch pipeline templates from dify official failed: %r, switch to database.", e)
            result = DatabasePipelineTemplateRetrieval.fetch_pipeline_templates_from_db(language)
        return result

    def get_type(self) -> str:
        return PipelineTemplateType.REMOTE

    @classmethod
    def fetch_pipeline_template_detail_from_dify_official(cls, template_id: str) -> dict | None:
        """
        Fetch pipeline template detail from dify official.

        :param template_id: Template ID
        :return: decoded JSON body, or None when the response status is not 200
        """
        from urllib.parse import quote

        domain = dify_config.HOSTED_FETCH_PIPELINE_TEMPLATES_REMOTE_DOMAIN
        # Percent-encode the id so characters such as "/", "?" or "#" cannot
        # change the path or query of the outbound request.
        encoded_id = quote(str(template_id), safe="")
        url = f"{domain}/pipeline-templates/{encoded_id}"
        response = httpx.get(url, timeout=httpx.Timeout(10.0, connect=3.0))
        if response.status_code != 200:
            return None
        data: dict = response.json()
        return data

    @classmethod
    def fetch_pipeline_templates_from_dify_official(cls, language: str) -> dict:
        """
        Fetch pipeline templates from dify official.

        :param language: language
        :return: decoded JSON body
        :raises ValueError: when the response status is not 200
        """
        domain = dify_config.HOSTED_FETCH_PIPELINE_TEMPLATES_REMOTE_DOMAIN
        url = f"{domain}/pipeline-templates"
        # Let httpx encode the query string instead of interpolating the raw value.
        response = httpx.get(url, params={"language": language}, timeout=httpx.Timeout(10.0, connect=3.0))
        if response.status_code != 200:
            raise ValueError(f"fetch pipeline templates failed, status code: {response.status_code}")
        result: dict = response.json()
        return result

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,945 @@
import base64
import hashlib
import json
import logging
import uuid
from collections.abc import Mapping
from datetime import UTC, datetime
from enum import StrEnum
from typing import cast
from urllib.parse import urlparse
from uuid import uuid4
import yaml # type: ignore
from Crypto.Cipher import AES
from Crypto.Util.Padding import pad, unpad
from flask_login import current_user
from packaging import version
from pydantic import BaseModel, Field
from sqlalchemy import select
from sqlalchemy.orm import Session
from core.helper import ssrf_proxy
from core.helper.name_generator import generate_incremental_name
from core.model_runtime.utils.encoders import jsonable_encoder
from core.plugin.entities.plugin import PluginDependency
from core.workflow.enums import NodeType
from core.workflow.nodes.datasource.entities import DatasourceNodeData
from core.workflow.nodes.knowledge_retrieval.entities import KnowledgeRetrievalNodeData
from core.workflow.nodes.llm.entities import LLMNodeData
from core.workflow.nodes.parameter_extractor.entities import ParameterExtractorNodeData
from core.workflow.nodes.question_classifier.entities import QuestionClassifierNodeData
from core.workflow.nodes.tool.entities import ToolNodeData
from extensions.ext_redis import redis_client
from factories import variable_factory
from models import Account
from models.dataset import Dataset, DatasetCollectionBinding, Pipeline
from models.workflow import Workflow, WorkflowType
from services.entities.knowledge_entities.rag_pipeline_entities import (
IconInfo,
KnowledgeConfiguration,
RagPipelineDatasetCreateEntity,
)
from services.plugin.dependencies_analysis import DependenciesAnalysisService
logger = logging.getLogger(__name__)
# Redis key prefix for pending import payloads; the import id is appended.
IMPORT_INFO_REDIS_KEY_PREFIX = "app_import_info:"
# Redis key prefix read by check_dependencies; the pipeline id is appended.
CHECK_DEPENDENCIES_REDIS_KEY_PREFIX = "app_check_dependencies:"
IMPORT_INFO_REDIS_EXPIRY = 10 * 60 # 10 minutes
DSL_MAX_SIZE = 10 * 1024 * 1024 # 10MB
# DSL version this service compares imported versions against and defaults to.
CURRENT_DSL_VERSION = "0.1.0"
class ImportMode(StrEnum):
    """Where the YAML for an import comes from: inline content or a URL to fetch."""

    YAML_CONTENT = "yaml-content"
    YAML_URL = "yaml-url"
class ImportStatus(StrEnum):
    """Outcome of an import attempt as reported in RagPipelineImportInfo."""

    COMPLETED = "completed"
    COMPLETED_WITH_WARNINGS = "completed-with-warnings"
    # The import was stored in Redis and awaits confirm_import.
    PENDING = "pending"
    FAILED = "failed"
class RagPipelineImportInfo(BaseModel):
    """Result object returned by the import and confirm operations of the DSL service."""

    # Identifier of this import attempt; also the suffix of the Redis key for pending imports.
    id: str
    status: ImportStatus
    pipeline_id: str | None = None
    current_dsl_version: str = CURRENT_DSL_VERSION
    imported_dsl_version: str = ""
    # Human-readable failure reason; empty unless one is set by the caller.
    error: str = ""
    dataset_id: str | None = None
class CheckDependenciesResult(BaseModel):
    """Result of a dependency check; the list defaults to empty."""

    # Populated from DependenciesAnalysisService.get_leaked_dependencies.
    leaked_dependencies: list[PluginDependency] = Field(default_factory=list)
def _check_version_compatibility(imported_version: str) -> ImportStatus:
    """
    Map an imported DSL version to an import status by comparing it with CURRENT_DSL_VERSION.

    - unparsable version: FAILED
    - newer than current, or older major version: PENDING
    - same major but older minor version: COMPLETED_WITH_WARNINGS
    - anything else: COMPLETED
    """
    try:
        supported = version.parse(CURRENT_DSL_VERSION)
        candidate = version.parse(imported_version)
    except version.InvalidVersion:
        return ImportStatus.FAILED

    # Both "too new" and "older major" require explicit confirmation.
    needs_confirmation = candidate > supported or candidate.major < supported.major
    if needs_confirmation:
        return ImportStatus.PENDING

    if candidate.minor < supported.minor:
        return ImportStatus.COMPLETED_WITH_WARNINGS

    # Only the micro part (or nothing) differs at this point.
    return ImportStatus.COMPLETED
class RagPipelinePendingData(BaseModel):
    """Payload serialized to Redis for an import that is waiting for confirmation."""

    import_mode: str
    # Full YAML text of the import, re-parsed when the import is confirmed.
    yaml_content: str
    pipeline_id: str | None
class CheckDependenciesPendingData(BaseModel):
    """Payload that check_dependencies deserializes from Redis."""

    dependencies: list[PluginDependency]
    pipeline_id: str | None
class RagPipelineDslService:
def __init__(self, session: Session):
    """Keep the SQLAlchemy session that this service's queries and commits go through."""
    self._session = session
def import_rag_pipeline(
    self,
    *,
    account: Account,
    import_mode: str,
    yaml_content: str | None = None,
    yaml_url: str | None = None,
    pipeline_id: str | None = None,
    dataset: Dataset | None = None,
    dataset_name: str | None = None,
    icon_info: IconInfo | None = None,
) -> RagPipelineImportInfo:
    """
    Import a RAG pipeline from YAML content or from a YAML URL.

    The YAML is fetched (URL mode) or taken as given (content mode), parsed and
    checked against CURRENT_DSL_VERSION. If the version check yields PENDING the
    payload is stored in Redis and nothing is created. Otherwise the pipeline is
    created or updated, and for every "knowledge-index" node in the workflow graph
    a dataset is created (when none exists yet) and linked to the pipeline.

    :param account: account performing the import; its current tenant scopes all lookups
    :param import_mode: an ImportMode value ("yaml-content" or "yaml-url")
    :param yaml_content: YAML text, required in yaml-content mode
    :param yaml_url: URL to fetch, required in yaml-url mode
    :param pipeline_id: existing pipeline to update; when given, its dataset replaces ``dataset``
    :param dataset: existing dataset to attach instead of creating one
    :param dataset_name: accepted for interface compatibility; not read by this method
    :param icon_info: icon settings for a newly created dataset; falls back to the YAML values
    :return: import result; every failure except an invalid ``import_mode`` is
        reported as a FAILED result instead of an exception
    :raises ValueError: if ``import_mode`` is not a valid ImportMode
    """
    import_id = str(uuid.uuid4())
    # Validate import mode
    try:
        mode = ImportMode(import_mode)
    except ValueError:
        raise ValueError(f"Invalid import_mode: {import_mode}")
    # Get YAML content
    content: str = ""
    if mode == ImportMode.YAML_URL:
        if not yaml_url:
            return RagPipelineImportInfo(
                id=import_id,
                status=ImportStatus.FAILED,
                error="yaml_url is required when import_mode is yaml-url",
            )
        try:
            parsed_url = urlparse(yaml_url)
            # Rewrite a github.com file page URL to the raw file URL.
            if (
                parsed_url.scheme == "https"
                and parsed_url.netloc == "github.com"
                and parsed_url.path.endswith((".yml", ".yaml"))
            ):
                yaml_url = yaml_url.replace("https://github.com", "https://raw.githubusercontent.com")
                yaml_url = yaml_url.replace("/blob/", "/")
            response = ssrf_proxy.get(yaml_url.strip(), follow_redirects=True, timeout=(10, 10))
            response.raise_for_status()
            raw_content = response.content
            # Enforce the limit on the downloaded byte size: the decoded character
            # count can be much smaller than the payload for multi-byte text.
            if len(raw_content) > DSL_MAX_SIZE:
                return RagPipelineImportInfo(
                    id=import_id,
                    status=ImportStatus.FAILED,
                    error="File size exceeds the limit of 10MB",
                )
            content = raw_content.decode()
            if not content:
                return RagPipelineImportInfo(
                    id=import_id,
                    status=ImportStatus.FAILED,
                    error="Empty content from url",
                )
        except Exception as e:
            return RagPipelineImportInfo(
                id=import_id,
                status=ImportStatus.FAILED,
                error=f"Error fetching YAML from URL: {str(e)}",
            )
    elif mode == ImportMode.YAML_CONTENT:
        if not yaml_content:
            return RagPipelineImportInfo(
                id=import_id,
                status=ImportStatus.FAILED,
                error="yaml_content is required when import_mode is yaml-content",
            )
        content = yaml_content
    # Process YAML content
    try:
        # Parse YAML to validate format
        data = yaml.safe_load(content)
        if not isinstance(data, dict):
            return RagPipelineImportInfo(
                id=import_id,
                status=ImportStatus.FAILED,
                error="Invalid YAML format: content must be a mapping",
            )
        # Validate and fix DSL version
        if not data.get("version"):
            data["version"] = "0.1.0"
        if not data.get("kind") or data.get("kind") != "rag_pipeline":
            data["kind"] = "rag_pipeline"
        imported_version = data.get("version", "0.1.0")
        # YAML may parse an unquoted version such as 0.1 as a float; only strings are accepted
        if not isinstance(imported_version, str):
            raise ValueError(f"Invalid version type, expected str, got {type(imported_version)}")
        status = _check_version_compatibility(imported_version)
        # Extract pipeline data
        pipeline_data = data.get("rag_pipeline")
        if not pipeline_data:
            return RagPipelineImportInfo(
                id=import_id,
                status=ImportStatus.FAILED,
                error="Missing rag_pipeline data in YAML content",
            )
        # If pipeline_id is provided, check that it exists within the account's tenant
        pipeline = None
        if pipeline_id:
            stmt = select(Pipeline).where(
                Pipeline.id == pipeline_id,
                Pipeline.tenant_id == account.current_tenant_id,
            )
            pipeline = self._session.scalar(stmt)
            if not pipeline:
                return RagPipelineImportInfo(
                    id=import_id,
                    status=ImportStatus.FAILED,
                    error="Pipeline not found",
                )
            # The dataset of the existing pipeline takes precedence over the argument
            dataset = pipeline.retrieve_dataset(session=self._session)
        # If the version check asks for confirmation, store import info in Redis
        if status == ImportStatus.PENDING:
            pending_data = RagPipelinePendingData(
                import_mode=import_mode,
                yaml_content=content,
                pipeline_id=pipeline_id,
            )
            redis_client.setex(
                f"{IMPORT_INFO_REDIS_KEY_PREFIX}{import_id}",
                IMPORT_INFO_REDIS_EXPIRY,
                pending_data.model_dump_json(),
            )
            return RagPipelineImportInfo(
                id=import_id,
                status=status,
                pipeline_id=pipeline_id,
                imported_dsl_version=imported_version,
            )
        # Extract dependencies
        dependencies = data.get("dependencies", [])
        check_dependencies_pending_data = None
        if dependencies:
            check_dependencies_pending_data = [PluginDependency.model_validate(d) for d in dependencies]
        # Create or update pipeline
        pipeline = self._create_or_update_pipeline(
            pipeline=pipeline,
            data=data,
            account=account,
            dependencies=check_dependencies_pending_data,
        )
        # create dataset
        name = pipeline.name or "Untitled"
        description = pipeline.description
        if icon_info:
            icon_type = icon_info.icon_type
            icon = icon_info.icon
            icon_background = icon_info.icon_background
            icon_url = icon_info.icon_url
        else:
            icon_type = data.get("rag_pipeline", {}).get("icon_type")
            icon = data.get("rag_pipeline", {}).get("icon")
            icon_background = data.get("rag_pipeline", {}).get("icon_background")
            icon_url = data.get("rag_pipeline", {}).get("icon_url")
        workflow = data.get("workflow", {})
        graph = workflow.get("graph", {})
        nodes = graph.get("nodes", [])
        dataset_id = None
        for node in nodes:
            if node.get("data", {}).get("type") == "knowledge-index":
                knowledge_configuration = KnowledgeConfiguration.model_validate(node.get("data", {}))
                if (
                    dataset
                    and pipeline.is_published
                    and dataset.chunk_structure != knowledge_configuration.chunk_structure
                ):
                    raise ValueError("Chunk structure is not compatible with the published pipeline")
                if not dataset:
                    # Derive a dataset name from the tenant's existing dataset names
                    datasets = self._session.query(Dataset).filter_by(tenant_id=account.current_tenant_id).all()
                    names = [existing.name for existing in datasets]
                    generate_name = generate_incremental_name(names, name)
                    dataset = Dataset(
                        tenant_id=account.current_tenant_id,
                        name=generate_name,
                        description=description,
                        icon_info={
                            "icon_type": icon_type,
                            "icon": icon,
                            "icon_background": icon_background,
                            "icon_url": icon_url,
                        },
                        indexing_technique=knowledge_configuration.indexing_technique,
                        created_by=account.id,
                        retrieval_model=knowledge_configuration.retrieval_model.model_dump(),
                        runtime_mode="rag_pipeline",
                        chunk_structure=knowledge_configuration.chunk_structure,
                    )
                if knowledge_configuration.indexing_technique == "high_quality":
                    # Reuse the oldest collection binding for this embedding model, or create one
                    dataset_collection_binding = (
                        self._session.query(DatasetCollectionBinding)
                        .where(
                            DatasetCollectionBinding.provider_name
                            == knowledge_configuration.embedding_model_provider,
                            DatasetCollectionBinding.model_name == knowledge_configuration.embedding_model,
                            DatasetCollectionBinding.type == "dataset",
                        )
                        .order_by(DatasetCollectionBinding.created_at)
                        .first()
                    )
                    if not dataset_collection_binding:
                        dataset_collection_binding = DatasetCollectionBinding(
                            provider_name=knowledge_configuration.embedding_model_provider,
                            model_name=knowledge_configuration.embedding_model,
                            collection_name=Dataset.gen_collection_name_by_id(str(uuid.uuid4())),
                            type="dataset",
                        )
                        self._session.add(dataset_collection_binding)
                        self._session.commit()
                    dataset_collection_binding_id = dataset_collection_binding.id
                    dataset.collection_binding_id = dataset_collection_binding_id
                    dataset.embedding_model = knowledge_configuration.embedding_model
                    dataset.embedding_model_provider = knowledge_configuration.embedding_model_provider
                elif knowledge_configuration.indexing_technique == "economy":
                    dataset.keyword_number = knowledge_configuration.keyword_number
                dataset.pipeline_id = pipeline.id
                self._session.add(dataset)
                self._session.commit()
                dataset_id = dataset.id
        # Without a knowledge-index node no dataset was linked, so the DSL is rejected
        if not dataset_id:
            raise ValueError("DSL is not valid, please check the Knowledge Index node.")
        return RagPipelineImportInfo(
            id=import_id,
            status=status,
            pipeline_id=pipeline.id,
            dataset_id=dataset_id,
            imported_dsl_version=imported_version,
        )
    except yaml.YAMLError as e:
        return RagPipelineImportInfo(
            id=import_id,
            status=ImportStatus.FAILED,
            error=f"Invalid YAML format: {str(e)}",
        )
    except Exception as e:
        logger.exception("Failed to import app")
        return RagPipelineImportInfo(
            id=import_id,
            status=ImportStatus.FAILED,
            error=str(e),
        )
def confirm_import(self, *, import_id: str, account: Account) -> RagPipelineImportInfo:
    """
    Confirm an import that requires confirmation

    Loads the pending payload stored in Redis under the import id, creates or
    updates the pipeline from its YAML, then creates or updates the dataset for
    every "knowledge-index" node of the workflow graph. The Redis entry is
    deleted only after everything succeeded.

    :param import_id: id of the pending import
    :param account: account confirming the import; its current tenant scopes the pipeline lookup
    :return: COMPLETED result on success, FAILED result (with error text) otherwise
    """
    redis_key = f"{IMPORT_INFO_REDIS_KEY_PREFIX}{import_id}"
    pending_data = redis_client.get(redis_key)
    if not pending_data:
        return RagPipelineImportInfo(
            id=import_id,
            status=ImportStatus.FAILED,
            error="Import information expired or does not exist",
        )
    try:
        if not isinstance(pending_data, str | bytes):
            return RagPipelineImportInfo(
                id=import_id,
                status=ImportStatus.FAILED,
                error="Invalid import information",
            )
        pending_data = RagPipelinePendingData.model_validate_json(pending_data)
        data = yaml.safe_load(pending_data.yaml_content)
        pipeline = None
        if pending_data.pipeline_id:
            stmt = select(Pipeline).where(
                Pipeline.id == pending_data.pipeline_id,
                Pipeline.tenant_id == account.current_tenant_id,
            )
            # NOTE(review): a missing pipeline is not rejected here, so pipeline stays
            # None and is passed on as such - confirm this is intended.
            pipeline = self._session.scalar(stmt)
        # Create or update pipeline (no dependencies are passed on this path)
        pipeline = self._create_or_update_pipeline(
            pipeline=pipeline,
            data=data,
            account=account,
        )
        dataset = pipeline.retrieve_dataset(session=self._session)
        # create dataset
        name = pipeline.name
        description = pipeline.description
        icon_type = data.get("rag_pipeline", {}).get("icon_type")
        icon = data.get("rag_pipeline", {}).get("icon")
        icon_background = data.get("rag_pipeline", {}).get("icon_background")
        icon_url = data.get("rag_pipeline", {}).get("icon_url")
        workflow = data.get("workflow", {})
        graph = workflow.get("graph", {})
        nodes = graph.get("nodes", [])
        dataset_id = None
        for node in nodes:
            if node.get("data", {}).get("type") == "knowledge-index":
                knowledge_configuration = KnowledgeConfiguration.model_validate(node.get("data", {}))
                if not dataset:
                    dataset = Dataset(
                        tenant_id=account.current_tenant_id,
                        name=name,
                        description=description,
                        icon_info={
                            "icon_type": icon_type,
                            "icon": icon,
                            "icon_background": icon_background,
                            "icon_url": icon_url,
                        },
                        indexing_technique=knowledge_configuration.indexing_technique,
                        created_by=account.id,
                        retrieval_model=knowledge_configuration.retrieval_model.model_dump(),
                        runtime_mode="rag_pipeline",
                        chunk_structure=knowledge_configuration.chunk_structure,
                    )
                else:
                    # An existing dataset is overwritten with the node's configuration
                    dataset.indexing_technique = knowledge_configuration.indexing_technique
                    dataset.retrieval_model = knowledge_configuration.retrieval_model.model_dump()
                    dataset.runtime_mode = "rag_pipeline"
                    dataset.chunk_structure = knowledge_configuration.chunk_structure
                if knowledge_configuration.indexing_technique == "high_quality":
                    # Reuse the oldest collection binding for this embedding model, or create one
                    dataset_collection_binding = (
                        self._session.query(DatasetCollectionBinding)
                        .where(
                            DatasetCollectionBinding.provider_name
                            == knowledge_configuration.embedding_model_provider,
                            DatasetCollectionBinding.model_name == knowledge_configuration.embedding_model,
                            DatasetCollectionBinding.type == "dataset",
                        )
                        .order_by(DatasetCollectionBinding.created_at)
                        .first()
                    )
                    if not dataset_collection_binding:
                        dataset_collection_binding = DatasetCollectionBinding(
                            provider_name=knowledge_configuration.embedding_model_provider,
                            model_name=knowledge_configuration.embedding_model,
                            collection_name=Dataset.gen_collection_name_by_id(str(uuid.uuid4())),
                            type="dataset",
                        )
                        self._session.add(dataset_collection_binding)
                        self._session.commit()
                    dataset_collection_binding_id = dataset_collection_binding.id
                    dataset.collection_binding_id = dataset_collection_binding_id
                    dataset.embedding_model = knowledge_configuration.embedding_model
                    dataset.embedding_model_provider = knowledge_configuration.embedding_model_provider
                elif knowledge_configuration.indexing_technique == "economy":
                    dataset.keyword_number = knowledge_configuration.keyword_number
                dataset.pipeline_id = pipeline.id
                self._session.add(dataset)
                self._session.commit()
                dataset_id = dataset.id
        # Without a knowledge-index node no dataset was linked, so the DSL is rejected
        if not dataset_id:
            raise ValueError("DSL is not valid, please check the Knowledge Index node.")
        # Delete import info from Redis
        redis_client.delete(redis_key)
        return RagPipelineImportInfo(
            id=import_id,
            status=ImportStatus.COMPLETED,
            pipeline_id=pipeline.id,
            dataset_id=dataset_id,
            current_dsl_version=CURRENT_DSL_VERSION,
            imported_dsl_version=data.get("version", "0.1.0"),
        )
    except Exception as e:
        # Any failure is logged and reported as a FAILED result rather than raised
        logger.exception("Error confirming import")
        return RagPipelineImportInfo(
            id=import_id,
            status=ImportStatus.FAILED,
            error=str(e),
        )
def check_dependencies(
    self,
    *,
    pipeline: Pipeline,
) -> CheckDependenciesResult:
    """
    Report which stored dependencies of the pipeline are leaked.

    Reads the dependency payload stored in Redis for the pipeline id; when none
    is stored, an empty result is returned.

    :param pipeline: pipeline whose id and tenant id drive the lookup
    :return: result holding the leaked dependencies
    """
    cached_payload = redis_client.get(f"{CHECK_DEPENDENCIES_REDIS_KEY_PREFIX}{pipeline.id}")
    if not cached_payload:
        return CheckDependenciesResult()

    pending = CheckDependenciesPendingData.model_validate_json(cached_payload)
    leaked = DependenciesAnalysisService.get_leaked_dependencies(
        tenant_id=pipeline.tenant_id, dependencies=pending.dependencies
    )
    return CheckDependenciesResult(leaked_dependencies=leaked)
def _create_or_update_pipeline(
    self,
    *,
    pipeline: Pipeline | None,
    data: dict,
    account: Account,
    dependencies: list[PluginDependency] | None = None,
) -> Pipeline:
    """
    Create a new pipeline or update an existing one from DSL data

    Encrypted dataset ids on knowledge-retrieval nodes of ``data`` are
    decrypted in place; ids that cannot be decrypted are dropped.

    :param pipeline: existing Pipeline to update, or None to create one
    :param data: parsed DSL; must contain a ``workflow`` mapping
    :param account: account performing the import
    :param dependencies: plugin dependencies to store in Redis for a later check
    :return: the created or updated Pipeline
    :raises ValueError: if the account has no current tenant or the
        workflow data is missing
    """
    tenant_id = account.current_tenant_id
    if not tenant_id:
        raise ValueError("Tenant id is required")

    pipeline_data = data.get("rag_pipeline", {})
    workflow_data = data.get("workflow")
    if not workflow_data or not isinstance(workflow_data, dict):
        raise ValueError("Missing workflow data for rag pipeline")

    environment_variables = [
        variable_factory.build_environment_variable_from_mapping(obj)
        for obj in workflow_data.get("environment_variables", [])
    ]
    conversation_variables = [
        variable_factory.build_conversation_variable_from_mapping(obj)
        for obj in workflow_data.get("conversation_variables", [])
    ]
    rag_pipeline_variables_list = workflow_data.get("rag_pipeline_variables", [])

    graph = workflow_data.get("graph", {})
    for node in graph.get("nodes", []):
        if node.get("data", {}).get("type", "") != NodeType.KNOWLEDGE_RETRIEVAL:
            continue
        # decrypt_dataset_id returns None on failure; such ids are filtered out
        node["data"]["dataset_ids"] = [
            decrypted_id
            for dataset_id in node["data"].get("dataset_ids", [])
            if (
                decrypted_id := self.decrypt_dataset_id(
                    encrypted_data=dataset_id,
                    tenant_id=tenant_id,
                )
            )
        ]

    if pipeline:
        # Update existing pipeline
        pipeline.name = pipeline_data.get("name", pipeline.name)
        pipeline.description = pipeline_data.get("description", pipeline.description)
        pipeline.updated_by = account.id
    else:
        # Create new pipeline (the tenant id was already validated above)
        pipeline = Pipeline(
            tenant_id=tenant_id,
            name=pipeline_data.get("name", ""),
            description=pipeline_data.get("description", ""),
            created_by=account.id,
            updated_by=account.id,
        )
        pipeline.id = str(uuid4())
        self._session.add(pipeline)

    self._session.commit()

    # save dependencies
    if dependencies:
        redis_client.setex(
            f"{CHECK_DEPENDENCIES_REDIS_KEY_PREFIX}{pipeline.id}",
            IMPORT_INFO_REDIS_EXPIRY,
            CheckDependenciesPendingData(pipeline_id=pipeline.id, dependencies=dependencies).model_dump_json(),
        )

    workflow = (
        self._session.query(Workflow)
        .where(
            Workflow.tenant_id == pipeline.tenant_id,
            Workflow.app_id == pipeline.id,
            Workflow.version == "draft",
        )
        .first()
    )

    if not workflow:
        # create draft workflow if not found
        workflow = Workflow(
            tenant_id=pipeline.tenant_id,
            app_id=pipeline.id,
            features="{}",
            type=WorkflowType.RAG_PIPELINE,
            version="draft",
            graph=json.dumps(graph),
            created_by=account.id,
            environment_variables=environment_variables,
            conversation_variables=conversation_variables,
            rag_pipeline_variables=rag_pipeline_variables_list,
        )
        self._session.add(workflow)
        # flush so that workflow.id is available for the pipeline
        self._session.flush()
        pipeline.workflow_id = workflow.id
    else:
        workflow.graph = json.dumps(graph)
        workflow.updated_by = account.id
        workflow.updated_at = datetime.now(UTC).replace(tzinfo=None)
        workflow.environment_variables = environment_variables
        workflow.conversation_variables = conversation_variables
        workflow.rag_pipeline_variables = rag_pipeline_variables_list

    # commit db session changes
    self._session.commit()
    return pipeline
def export_rag_pipeline_dsl(self, pipeline: Pipeline, include_secret: bool = False) -> str:
    """
    Export a pipeline as a YAML DSL document

    :param pipeline: Pipeline instance
    :param include_secret: Whether include secret variable
    :return: YAML text of the export data
    :raises ValueError: if the pipeline has no dataset
    """
    dataset = pipeline.retrieve_dataset(session=self._session)
    if not dataset:
        raise ValueError("Missing dataset for rag pipeline")

    # A missing/empty icon_info falls back to the defaults below.
    icon = dataset.icon_info or {}
    rag_pipeline_section = {
        "name": dataset.name,
        "icon": icon.get("icon", "📙"),
        "icon_type": icon.get("icon_type", "emoji"),
        "icon_background": icon.get("icon_background", "#FFEAD5"),
        "icon_url": icon.get("icon_url"),
        "description": pipeline.description,
    }
    export_data = {
        "version": CURRENT_DSL_VERSION,
        "kind": "rag_pipeline",
        "rag_pipeline": rag_pipeline_section,
    }

    # Adds the "workflow" and "dependencies" sections in place.
    self._append_workflow_export_data(export_data=export_data, pipeline=pipeline, include_secret=include_secret)

    return yaml.dump(export_data, allow_unicode=True)  # type: ignore
def _append_workflow_export_data(self, *, export_data: dict, pipeline: Pipeline, include_secret: bool) -> None:
    """
    Append the draft workflow and its dependencies to the export data

    :param export_data: export data, extended in place with "workflow" and "dependencies"
    :param pipeline: Pipeline instance
    :param include_secret: when False, credential ids are removed from tool and agent nodes
    :raises ValueError: if the pipeline has no draft workflow
    """
    draft_workflow = (
        self._session.query(Workflow)
        .where(
            Workflow.tenant_id == pipeline.tenant_id,
            Workflow.app_id == pipeline.id,
            Workflow.version == "draft",
        )
        .first()
    )
    if not draft_workflow:
        raise ValueError("Missing draft workflow configuration, please check.")

    workflow_dict = draft_workflow.to_dict(include_secret=include_secret)
    strip_credentials = not include_secret

    for node in workflow_dict.get("graph", {}).get("nodes", []):
        node_data = node.get("data", {})
        if not node_data:
            continue
        node_type = node_data.get("type", "")

        if node_type == NodeType.KNOWLEDGE_RETRIEVAL:
            # dataset ids are exported in encrypted form
            node["data"]["dataset_ids"] = [
                self.encrypt_dataset_id(dataset_id=raw_id, tenant_id=pipeline.tenant_id)
                for raw_id in node_data.get("dataset_ids", [])
            ]

        if not strip_credentials:
            continue

        # filter credential id from tool node
        if node_type == NodeType.TOOL:
            node_data.pop("credential_id", None)
        # filter credential id from agent node
        if node_type == NodeType.AGENT:
            for tool in node_data.get("agent_parameters", {}).get("tools", {}).get("value", []):
                tool.pop("credential_id", None)

    export_data["workflow"] = workflow_dict

    raw_dependencies = self._extract_dependencies_from_workflow(draft_workflow)
    generated = DependenciesAnalysisService.generate_dependencies(
        tenant_id=pipeline.tenant_id, dependencies=raw_dependencies
    )
    export_data["dependencies"] = [jsonable_encoder(dep.model_dump()) for dep in generated]
def _extract_dependencies_from_workflow(self, workflow: Workflow) -> list[str]:
"""
Extract dependencies from workflow
:param workflow: Workflow instance
:return: dependencies list format like ["langgenius/google"]
"""
graph = workflow.graph_dict
dependencies = self._extract_dependencies_from_workflow_graph(graph)
return dependencies
def _extract_dependencies_from_workflow_graph(self, graph: Mapping) -> list[str]:
"""
Extract dependencies from workflow graph
:param graph: Workflow graph
:return: dependencies list format like ["langgenius/google"]
"""
dependencies = []
for node in graph.get("nodes", []):
try:
typ = node.get("data", {}).get("type")
match typ:
case NodeType.TOOL:
tool_entity = ToolNodeData.model_validate(node["data"])
dependencies.append(
DependenciesAnalysisService.analyze_tool_dependency(tool_entity.provider_id),
)
case NodeType.DATASOURCE:
datasource_entity = DatasourceNodeData.model_validate(node["data"])
if datasource_entity.provider_type != "local_file":
dependencies.append(datasource_entity.plugin_id)
case NodeType.LLM:
llm_entity = LLMNodeData.model_validate(node["data"])
dependencies.append(
DependenciesAnalysisService.analyze_model_provider_dependency(llm_entity.model.provider),
)
case NodeType.QUESTION_CLASSIFIER:
question_classifier_entity = QuestionClassifierNodeData.model_validate(node["data"])
dependencies.append(
DependenciesAnalysisService.analyze_model_provider_dependency(
question_classifier_entity.model.provider
),
)
case NodeType.PARAMETER_EXTRACTOR:
parameter_extractor_entity = ParameterExtractorNodeData.model_validate(node["data"])
dependencies.append(
DependenciesAnalysisService.analyze_model_provider_dependency(
parameter_extractor_entity.model.provider
),
)
case NodeType.KNOWLEDGE_INDEX:
knowledge_index_entity = KnowledgeConfiguration.model_validate(node["data"])
if knowledge_index_entity.indexing_technique == "high_quality":
if knowledge_index_entity.embedding_model_provider:
dependencies.append(
DependenciesAnalysisService.analyze_model_provider_dependency(
knowledge_index_entity.embedding_model_provider
),
)
if knowledge_index_entity.retrieval_model.reranking_mode == "reranking_model":
if knowledge_index_entity.retrieval_model.reranking_enable:
if (
knowledge_index_entity.retrieval_model.reranking_model
and knowledge_index_entity.retrieval_model.reranking_mode == "reranking_model"
):
if knowledge_index_entity.retrieval_model.reranking_model.reranking_provider_name:
dependencies.append(
DependenciesAnalysisService.analyze_model_provider_dependency(
knowledge_index_entity.retrieval_model.reranking_model.reranking_provider_name
),
)
case NodeType.KNOWLEDGE_RETRIEVAL:
knowledge_retrieval_entity = KnowledgeRetrievalNodeData.model_validate(node["data"])
if knowledge_retrieval_entity.retrieval_mode == "multiple":
if knowledge_retrieval_entity.multiple_retrieval_config:
if (
knowledge_retrieval_entity.multiple_retrieval_config.reranking_mode
== "reranking_model"
):
if knowledge_retrieval_entity.multiple_retrieval_config.reranking_model:
dependencies.append(
DependenciesAnalysisService.analyze_model_provider_dependency(
knowledge_retrieval_entity.multiple_retrieval_config.reranking_model.provider
),
)
elif (
knowledge_retrieval_entity.multiple_retrieval_config.reranking_mode
== "weighted_score"
):
if knowledge_retrieval_entity.multiple_retrieval_config.weights:
vector_setting = (
knowledge_retrieval_entity.multiple_retrieval_config.weights.vector_setting
)
dependencies.append(
DependenciesAnalysisService.analyze_model_provider_dependency(
vector_setting.embedding_provider_name
),
)
elif knowledge_retrieval_entity.retrieval_mode == "single":
model_config = knowledge_retrieval_entity.single_retrieval_config
if model_config:
dependencies.append(
DependenciesAnalysisService.analyze_model_provider_dependency(
model_config.model.provider
),
)
case _:
# TODO: Handle default case or unknown node types
pass
except Exception as e:
logger.exception("Error extracting node dependency", exc_info=e)
return dependencies
@classmethod
def _extract_dependencies_from_model_config(cls, model_config: Mapping) -> list[str]:
    """
    Extract dependencies from model config

    Failures are logged and whatever was collected so far is returned.

    :param model_config: model config dict
    :return: dependencies list format like ["langgenius/google"]
    """
    dependencies = []

    try:
        # completion model
        model_dict = model_config.get("model", {})
        if model_dict:
            dependencies.append(
                DependenciesAnalysisService.analyze_model_provider_dependency(model_dict.get("provider", ""))
            )

        # reranking model
        dataset_configs = model_config.get("dataset_configs", {})
        if dataset_configs:
            for dataset_config in dataset_configs.get("datasets", {}).get("datasets", []):
                reranking_model = dataset_config.get("reranking_model")
                if not reranking_model:
                    continue
                provider_name = reranking_model.get("reranking_provider_name", {})
                # NOTE(review): elsewhere in this service reranking_provider_name is used
                # directly as the provider string, so accept both a mapping with a
                # "provider" key and a plain value instead of failing on `.get` -- confirm
                # which shape model configs actually carry.
                if isinstance(provider_name, Mapping):
                    provider = provider_name.get("provider")
                else:
                    provider = provider_name
                dependencies.append(DependenciesAnalysisService.analyze_model_provider_dependency(provider))

        # tools
        agent_configs = model_config.get("agent_mode", {})
        if agent_configs:
            for agent_config in agent_configs.get("tools", []):
                dependencies.append(
                    DependenciesAnalysisService.analyze_tool_dependency(agent_config.get("provider_id"))
                )

    except Exception:
        # logger.exception already records the active exception and traceback
        logger.exception("Error extracting model config dependency")

    return dependencies
@classmethod
def get_leaked_dependencies(cls, tenant_id: str, dsl_dependencies: list[dict]) -> list[PluginDependency]:
    """
    Returns the leaked dependencies in current workspace

    :param tenant_id: workspace (tenant) id
    :param dsl_dependencies: raw dependency dicts, validated as PluginDependency
    :return: leaked dependencies; an empty list when no dependencies are given
    """
    parsed = [PluginDependency.model_validate(raw) for raw in dsl_dependencies]
    if parsed:
        return DependenciesAnalysisService.get_leaked_dependencies(tenant_id=tenant_id, dependencies=parsed)
    return []
def _generate_aes_key(self, tenant_id: str) -> bytes:
"""Generate AES key based on tenant_id"""
return hashlib.sha256(tenant_id.encode()).digest()
def encrypt_dataset_id(self, dataset_id: str, tenant_id: str) -> str:
    """
    Encrypt dataset_id using AES-CBC mode

    The IV is the first 16 bytes of the tenant-derived key, so the same
    dataset_id and tenant_id always produce the same output.

    :param dataset_id: plain dataset id
    :param tenant_id: tenant id the key is derived from
    :return: base64-encoded ciphertext
    """
    key = self._generate_aes_key(tenant_id)
    cipher = AES.new(key, AES.MODE_CBC, key[:16])
    padded_plaintext = pad(dataset_id.encode(), AES.block_size)
    ciphertext = cipher.encrypt(padded_plaintext)
    return base64.b64encode(ciphertext).decode()
def decrypt_dataset_id(self, encrypted_data: str, tenant_id: str) -> str | None:
    """
    AES-CBC decryption of a dataset id (counterpart of encrypt_dataset_id)

    :param encrypted_data: base64-encoded ciphertext
    :param tenant_id: tenant id the key is derived from
    :return: the decrypted dataset id, or None if any step fails
    """
    try:
        key = self._generate_aes_key(tenant_id)
        cipher = AES.new(key, AES.MODE_CBC, key[:16])
        ciphertext = base64.b64decode(encrypted_data)
        plaintext = unpad(cipher.decrypt(ciphertext), AES.block_size)
        return plaintext.decode()
    except Exception:
        # Any failure (bad base64, wrong key, bad padding, ...) yields None.
        return None
def create_rag_pipeline_dataset(
    self,
    tenant_id: str,
    rag_pipeline_dataset_create_entity: RagPipelineDatasetCreateEntity,
):
    """
    Create a dataset by importing the YAML pipeline carried by the entity

    When the entity has no name, an incremental "Untitled" name is generated
    and written back onto the entity.

    :param tenant_id: tenant the dataset belongs to
    :param rag_pipeline_dataset_create_entity: name, icon info and YAML content
    :return: dict with the fields of the resulting import info
    :raises ValueError: if a dataset with the given name already exists in the tenant
    """
    entity = rag_pipeline_dataset_create_entity

    if not entity.name:
        # generate a random name as Untitled 1 2 3 ...
        tenant_datasets = self._session.query(Dataset).filter_by(tenant_id=tenant_id).all()
        entity.name = generate_incremental_name(
            [existing.name for existing in tenant_datasets],
            "Untitled",
        )
    elif self._session.query(Dataset).filter_by(name=entity.name, tenant_id=tenant_id).first():
        # check if dataset name already exists
        raise ValueError(f"Dataset with name {entity.name} already exists.")

    account = cast(Account, current_user)
    import_info: RagPipelineImportInfo = self.import_rag_pipeline(
        account=account,
        import_mode=ImportMode.YAML_CONTENT,
        yaml_content=entity.yaml_content,
        dataset=None,
        dataset_name=entity.name,
        icon_info=entity.icon_info,
    )

    return {
        "id": import_info.id,
        "dataset_id": import_info.dataset_id,
        "pipeline_id": import_info.pipeline_id,
        "status": import_info.status,
        "imported_dsl_version": import_info.imported_dsl_version,
        "current_dsl_version": import_info.current_dsl_version,
        "error": import_info.error,
    }

View File

@@ -0,0 +1,23 @@
from core.plugin.entities.plugin_daemon import PluginDatasourceProviderEntity
from core.plugin.impl.datasource import PluginDatasourceManager
from services.datasource_provider_service import DatasourceProviderService
class RagPipelineManageService:
    """Read-only helpers for listing rag pipeline datasources."""

    @staticmethod
    def list_rag_pipeline_datasources(tenant_id: str) -> list[PluginDatasourceProviderEntity]:
        """
        list rag pipeline datasources

        Providers for which the tenant has stored credentials are flagged
        with ``is_authorized = True``.

        :param tenant_id: tenant whose datasource providers are listed
        :return: the providers fetched from the plugin datasource manager
        """
        # get all builtin providers
        providers = PluginDatasourceManager().fetch_datasource_providers(tenant_id)

        for provider in providers:
            stored_credentials = DatasourceProviderService().get_datasource_credentials(
                tenant_id=tenant_id,
                provider=provider.provider,
                plugin_id=provider.plugin_id,
            )
            if stored_credentials:
                provider.is_authorized = True

        return providers

View File

@@ -0,0 +1,106 @@
import json
import logging
from collections.abc import Callable, Sequence
from functools import cached_property
from core.app.entities.rag_pipeline_invoke_entities import RagPipelineInvokeEntity
from core.rag.pipeline.queue import TenantIsolatedTaskQueue
from enums.cloud_plan import CloudPlan
from extensions.ext_database import db
from services.feature_service import FeatureService
from services.file_service import FileService
from tasks.rag_pipeline.priority_rag_pipeline_run_task import priority_rag_pipeline_run_task
from tasks.rag_pipeline.rag_pipeline_run_task import rag_pipeline_run_task
logger = logging.getLogger(__name__)
class RagPipelineTaskProxy:
    """Uploads rag pipeline invoke entities as a file and dispatches a run task for it."""

    # Default uploaded file name for rag pipeline invoke entities
    _RAG_PIPELINE_INVOKE_ENTITIES_FILE_NAME = "rag_pipeline_invoke_entities.json"

    def __init__(
        self, dataset_tenant_id: str, user_id: str, rag_pipeline_invoke_entities: Sequence[RagPipelineInvokeEntity]
    ):
        self._dataset_tenant_id = dataset_tenant_id
        self._user_id = user_id
        self._rag_pipeline_invoke_entities = rag_pipeline_invoke_entities
        self._tenant_isolated_task_queue = TenantIsolatedTaskQueue(dataset_tenant_id, "pipeline")

    @cached_property
    def features(self):
        """Features of the dataset tenant, fetched once per proxy instance."""
        return FeatureService.get_features(self._dataset_tenant_id)

    def _upload_invoke_entities(self) -> str:
        """Serialize the invoke entities to JSON, upload them and return the upload file id."""
        payload = json.dumps([entity.model_dump() for entity in self._rag_pipeline_invoke_entities])
        uploaded = FileService(db.engine).upload_text(
            payload,
            self._RAG_PIPELINE_INVOKE_ENTITIES_FILE_NAME,
            self._user_id,
            self._dataset_tenant_id,
        )
        return uploaded.id

    def _send_to_direct_queue(self, upload_file_id: str, task_func: Callable[[str, str], None]):
        """Submit the task immediately via ``task_func.delay``."""
        logger.info("send file %s to direct queue", upload_file_id)
        task_func.delay(  # type: ignore
            rag_pipeline_invoke_entities_file_id=upload_file_id,
            tenant_id=self._dataset_tenant_id,
        )

    def _send_to_tenant_queue(self, upload_file_id: str, task_func: Callable[[str, str], None]):
        """Queue the file behind the tenant's task key if one is set, else set the flag and submit."""
        logger.info("send file %s to tenant queue", upload_file_id)
        tenant_queue = self._tenant_isolated_task_queue

        if tenant_queue.get_task_key():
            # Add to waiting queue using List operations (lpush)
            tenant_queue.push_tasks([upload_file_id])
            logger.info("push tasks: %s", upload_file_id)
            return

        # Set flag and execute task
        tenant_queue.set_task_waiting_time()
        task_func.delay(  # type: ignore
            rag_pipeline_invoke_entities_file_id=upload_file_id,
            tenant_id=self._dataset_tenant_id,
        )
        logger.info("init tasks: %s", upload_file_id)

    def _send_to_default_tenant_queue(self, upload_file_id: str):
        self._send_to_tenant_queue(upload_file_id, rag_pipeline_run_task)

    def _send_to_priority_tenant_queue(self, upload_file_id: str):
        self._send_to_tenant_queue(upload_file_id, priority_rag_pipeline_run_task)

    def _send_to_priority_direct_queue(self, upload_file_id: str):
        self._send_to_direct_queue(upload_file_id, priority_rag_pipeline_run_task)

    def _dispatch(self):
        """Upload the entities and route the resulting file by billing state and plan."""
        upload_file_id = self._upload_invoke_entities()
        if not upload_file_id:
            raise ValueError("upload_file_id is empty")

        billing = self.features.billing
        logger.info(
            "dispatch args: %s - %s - %s",
            self._dataset_tenant_id,
            billing.enabled,
            billing.subscription.plan,
        )

        if not billing.enabled:
            # dispatch to priority pipeline queue without tenant isolation for others, e.g.: self-hosted or enterprise
            self._send_to_priority_direct_queue(upload_file_id)
        elif billing.subscription.plan == CloudPlan.SANDBOX:
            # dispatch to normal pipeline queue with tenant isolation for sandbox plan
            self._send_to_default_tenant_queue(upload_file_id)
        else:
            # dispatch to priority pipeline queue with tenant isolation for other plans
            self._send_to_priority_tenant_queue(upload_file_id)

    def delay(self):
        """Dispatch the invoke entities; warns and does nothing when there are none."""
        if self._rag_pipeline_invoke_entities:
            self._dispatch()
            return
        logger.warning(
            "Received empty rag pipeline invoke entities, no tasks delivered: %s %s",
            self._dataset_tenant_id,
            self._user_id,
        )

View File

@@ -0,0 +1,387 @@
import json
import logging
from datetime import UTC, datetime
from pathlib import Path
from uuid import uuid4
import yaml
from flask_login import current_user
from constants import DOCUMENT_EXTENSIONS
from core.plugin.impl.plugin import PluginInstaller
from core.rag.retrieval.retrieval_methods import RetrievalMethod
from extensions.ext_database import db
from factories import variable_factory
from models.dataset import Dataset, Document, DocumentPipelineExecutionLog, Pipeline
from models.model import UploadFile
from models.workflow import Workflow, WorkflowType
from services.entities.knowledge_entities.rag_pipeline_entities import KnowledgeConfiguration, RetrievalSetting
from services.plugin.plugin_migration import PluginMigration
from services.plugin.plugin_service import PluginService
logger = logging.getLogger(__name__)
class RagPipelineTransformService:
def transform_dataset(self, dataset_id: str):
    """
    Convert a dataset to the rag_pipeline runtime mode.

    A dataset that already has a pipeline in rag_pipeline mode is returned
    unchanged. A dataset lacking both datasource type and indexing technique,
    or lacking a doc form, gets an empty pipeline. Otherwise a pipeline is
    built from the bundled transform template and documents are migrated.

    :param dataset_id: id of the dataset to transform
    :return: dict with "pipeline_id", "dataset_id" and "status"
    :raises ValueError: dataset not found, external dataset, missing workflow
        data in the template, or unsupported doc form
    """
    dataset = db.session.query(Dataset).where(Dataset.id == dataset_id).first()
    if not dataset:
        raise ValueError("Dataset not found")

    already_transformed = dataset.pipeline_id and dataset.runtime_mode == "rag_pipeline"
    if already_transformed:
        return {
            "pipeline_id": dataset.pipeline_id,
            "dataset_id": dataset_id,
            "status": "success",
        }

    if dataset.provider != "vendor":
        raise ValueError("External dataset is not supported")

    datasource_type = dataset.data_source_type
    indexing_technique = dataset.indexing_technique
    if not (datasource_type or indexing_technique):
        return self._transform_to_empty_pipeline(dataset)

    doc_form = dataset.doc_form
    if not doc_form:
        return self._transform_to_empty_pipeline(dataset)

    retrieval_model = dataset.retrieval_model
    pipeline_yaml = self._get_transform_yaml(doc_form, datasource_type, indexing_technique)

    # deal dependencies
    self._deal_dependencies(pipeline_yaml, dataset.tenant_id)

    # Extract app data
    workflow_data = pipeline_yaml.get("workflow")
    if not workflow_data:
        raise ValueError("Missing workflow data for rag pipeline")

    graph = workflow_data.get("graph", {})
    adjusted_nodes = []
    for node in graph.get("nodes", []):
        node_data = node.get("data", {})
        if node_data.get("type") == "datasource" and node_data.get("provider_type") == "local_file":
            node = self._deal_file_extensions(node)
        # re-read the type from the (possibly replaced) node
        if node.get("data", {}).get("type") == "knowledge-index":
            node = self._deal_knowledge_index(dataset, doc_form, indexing_technique, retrieval_model, node)
        adjusted_nodes.append(node)

    if adjusted_nodes:
        graph["nodes"] = adjusted_nodes
        workflow_data["graph"] = graph
        pipeline_yaml["workflow"] = workflow_data

    # create pipeline
    pipeline = self._create_pipeline(pipeline_yaml)

    # save chunk structure to dataset
    if doc_form not in ("hierarchical_model", "text_model"):
        raise ValueError("Unsupported doc form")
    dataset.chunk_structure = doc_form
    dataset.runtime_mode = "rag_pipeline"
    dataset.pipeline_id = pipeline.id

    # deal document data
    self._deal_document_data(dataset)

    db.session.commit()
    return {
        "pipeline_id": pipeline.id,
        "dataset_id": dataset_id,
        "status": "success",
    }
def _get_transform_yaml(self, doc_form: str, datasource_type: str, indexing_technique: str | None):
pipeline_yaml = {}
if doc_form == "text_model":
match datasource_type:
case "upload_file":
if indexing_technique == "high_quality":
# get graph from transform.file-general-high-quality.yml
with open(f"{Path(__file__).parent}/transform/file-general-high-quality.yml") as f:
pipeline_yaml = yaml.safe_load(f)
if indexing_technique == "economy":
# get graph from transform.file-general-economy.yml
with open(f"{Path(__file__).parent}/transform/file-general-economy.yml") as f:
pipeline_yaml = yaml.safe_load(f)
case "notion_import":
if indexing_technique == "high_quality":
# get graph from transform.notion-general-high-quality.yml
with open(f"{Path(__file__).parent}/transform/notion-general-high-quality.yml") as f:
pipeline_yaml = yaml.safe_load(f)
if indexing_technique == "economy":
# get graph from transform.notion-general-economy.yml
with open(f"{Path(__file__).parent}/transform/notion-general-economy.yml") as f:
pipeline_yaml = yaml.safe_load(f)
case "website_crawl":
if indexing_technique == "high_quality":
# get graph from transform.website-crawl-general-high-quality.yml
with open(f"{Path(__file__).parent}/transform/website-crawl-general-high-quality.yml") as f:
pipeline_yaml = yaml.safe_load(f)
if indexing_technique == "economy":
# get graph from transform.website-crawl-general-economy.yml
with open(f"{Path(__file__).parent}/transform/website-crawl-general-economy.yml") as f:
pipeline_yaml = yaml.safe_load(f)
case _:
raise ValueError("Unsupported datasource type")
elif doc_form == "hierarchical_model":
match datasource_type:
case "upload_file":
# get graph from transform.file-parentchild.yml
with open(f"{Path(__file__).parent}/transform/file-parentchild.yml") as f:
pipeline_yaml = yaml.safe_load(f)
case "notion_import":
# get graph from transform.notion-parentchild.yml
with open(f"{Path(__file__).parent}/transform/notion-parentchild.yml") as f:
pipeline_yaml = yaml.safe_load(f)
case "website_crawl":
# get graph from transform.website-crawl-parentchild.yml
with open(f"{Path(__file__).parent}/transform/website-crawl-parentchild.yml") as f:
pipeline_yaml = yaml.safe_load(f)
case _:
raise ValueError("Unsupported datasource type")
else:
raise ValueError("Unsupported doc form")
return pipeline_yaml
def _deal_file_extensions(self, node: dict):
file_extensions = node.get("data", {}).get("fileExtensions", [])
if not file_extensions:
return node
node["data"]["fileExtensions"] = [ext.lower() for ext in file_extensions if ext in DOCUMENT_EXTENSIONS]
return node
def _deal_knowledge_index(
    self, dataset: Dataset, doc_form: str, indexing_technique: str | None, retrieval_model: dict, node: dict
):
    """
    Align a knowledge-index node with the dataset's existing settings.

    For "high_quality" indexing the dataset's embedding model and provider
    are copied onto the node. If the dataset has a retrieval model it
    overrides the node's (forced to keyword search for "economy"); otherwise
    the node's retrieval model is written back to the dataset.

    :param dataset: dataset being transformed (may be updated)
    :param doc_form: not used by this method
    :param indexing_technique: dataset indexing technique
    :param retrieval_model: dataset retrieval model, may be empty
    :param node: knowledge-index graph node, updated in place
    :return: the updated node
    """
    node_data = node.get("data", {})
    configuration = KnowledgeConfiguration.model_validate(node_data)

    if indexing_technique == "high_quality":
        configuration.embedding_model = dataset.embedding_model
        configuration.embedding_model_provider = dataset.embedding_model_provider

    if not retrieval_model:
        # the dataset has none: adopt the template's retrieval model
        dataset.retrieval_model = configuration.retrieval_model.model_dump()
    else:
        dataset_retrieval = RetrievalSetting.model_validate(retrieval_model)
        if indexing_technique == "economy":
            dataset_retrieval.search_method = RetrievalMethod.KEYWORD_SEARCH
        configuration.retrieval_model = dataset_retrieval

    node_data.update(configuration.model_dump())
    node["data"] = node_data
    return node
def _create_pipeline(
    self,
    data: dict,
) -> Pipeline:
    """
    Create a published, public pipeline with a draft and a published workflow.

    Both workflows are built from the same graph and variables; the pipeline's
    workflow_id points at the published one. Changes are flushed, not committed.

    :param data: parsed pipeline DSL; must contain a ``workflow`` mapping
    :return: the new Pipeline
    :raises ValueError: if the workflow data is missing or not a dict
    """
    pipeline_data = data.get("rag_pipeline", {})
    workflow_data = data.get("workflow")
    if not workflow_data or not isinstance(workflow_data, dict):
        raise ValueError("Missing workflow data for rag pipeline")

    environment_variables = [
        variable_factory.build_environment_variable_from_mapping(mapping)
        for mapping in workflow_data.get("environment_variables", [])
    ]
    conversation_variables = [
        variable_factory.build_conversation_variable_from_mapping(mapping)
        for mapping in workflow_data.get("conversation_variables", [])
    ]
    rag_pipeline_variables_list = workflow_data.get("rag_pipeline_variables", [])
    graph = workflow_data.get("graph", {})

    # Create new pipeline
    pipeline = Pipeline(
        tenant_id=current_user.current_tenant_id,
        name=pipeline_data.get("name", ""),
        description=pipeline_data.get("description", ""),
        created_by=current_user.id,
        updated_by=current_user.id,
        is_published=True,
        is_public=True,
    )
    pipeline.id = str(uuid4())
    db.session.add(pipeline)
    db.session.flush()

    # Fields shared by the draft and the published workflow.
    shared_fields = {
        "tenant_id": pipeline.tenant_id,
        "app_id": pipeline.id,
        "features": "{}",
        "type": WorkflowType.RAG_PIPELINE,
        "graph": json.dumps(graph),
        "created_by": current_user.id,
        "environment_variables": environment_variables,
        "conversation_variables": conversation_variables,
        "rag_pipeline_variables": rag_pipeline_variables_list,
    }
    # create draft workflow
    draft_workflow = Workflow(version="draft", **shared_fields)
    # the published version is labelled with the current naive UTC timestamp
    published_workflow = Workflow(version=str(datetime.now(UTC).replace(tzinfo=None)), **shared_fields)

    db.session.add(draft_workflow)
    db.session.add(published_workflow)
    # flush so that published_workflow.id is available
    db.session.flush()

    pipeline.workflow_id = published_workflow.id
    db.session.add(pipeline)
    return pipeline
def _deal_dependencies(self, pipeline_yaml: dict, tenant_id: str):
    """
    Install marketplace plugins the pipeline template needs but the tenant lacks.

    Only dependencies of type "marketplace" are considered. Entries without a
    ``plugin_unique_identifier`` are skipped with a warning instead of failing.

    :param pipeline_yaml: parsed pipeline template; its "dependencies" list is read
    :param tenant_id: tenant whose installed plugins are compared against
    """
    installer_manager = PluginInstaller()
    # a set gives O(1) membership tests
    installed_plugin_ids = {plugin.plugin_id for plugin in installer_manager.list_plugins(tenant_id)}
    plugin_migration = PluginMigration()

    need_install_plugin_unique_identifiers = []
    # `or []` tolerates an explicit `dependencies: null` in the template
    for dependency in pipeline_yaml.get("dependencies") or []:
        if dependency.get("type") != "marketplace":
            continue

        declared_identifier = (dependency.get("value") or {}).get("plugin_unique_identifier")
        if not declared_identifier:
            logger.warning("Skipping marketplace dependency without plugin_unique_identifier: %s", dependency)
            continue

        # the plugin id is the part of the identifier before the first ":"
        plugin_id = declared_identifier.split(":")[0]
        if plugin_id in installed_plugin_ids:
            continue

        plugin_unique_identifier = plugin_migration._fetch_plugin_unique_identifier(plugin_id)  # type: ignore
        if plugin_unique_identifier:
            need_install_plugin_unique_identifiers.append(plugin_unique_identifier)

    if need_install_plugin_unique_identifiers:
        logger.debug("Installing missing pipeline plugins %s", need_install_plugin_unique_identifiers)
        PluginService.install_from_marketplace_pkg(tenant_id, need_install_plugin_unique_identifiers)
def _transform_to_empty_pipeline(self, dataset: Dataset):
    """
    Attach a new pipeline without any workflow to the dataset and commit.

    :param dataset: dataset to switch to the rag_pipeline runtime mode
    :return: dict with "pipeline_id", "dataset_id" and "status"
    """
    empty_pipeline = Pipeline(
        tenant_id=dataset.tenant_id,
        name=dataset.name,
        description=dataset.description,
        created_by=current_user.id,
    )
    db.session.add(empty_pipeline)
    # flush so that the pipeline id is available
    db.session.flush()

    dataset.pipeline_id = empty_pipeline.id
    dataset.runtime_mode = "rag_pipeline"
    dataset.updated_by = current_user.id
    dataset.updated_at = datetime.now(UTC).replace(tzinfo=None)
    db.session.add(dataset)
    db.session.commit()

    result = {"pipeline_id": empty_pipeline.id, "dataset_id": dataset.id, "status": "success"}
    return result
def _deal_document_data(self, dataset: Dataset):
file_node_id = "1752479895761"
notion_node_id = "1752489759475"
jina_node_id = "1752491761974"
firecrawl_node_id = "1752565402678"
documents = db.session.query(Document).where(Document.dataset_id == dataset.id).all()
for document in documents:
data_source_info_dict = document.data_source_info_dict
if not data_source_info_dict:
continue
if document.data_source_type == "upload_file":
document.data_source_type = "local_file"
file_id = data_source_info_dict.get("upload_file_id")
if file_id:
file = db.session.query(UploadFile).where(UploadFile.id == file_id).first()
if file:
data_source_info = json.dumps(
{
"real_file_id": file_id,
"name": file.name,
"size": file.size,
"extension": file.extension,
"mime_type": file.mime_type,
"url": "",
"transfer_method": "local_file",
}
)
document.data_source_info = data_source_info
document_pipeline_execution_log = DocumentPipelineExecutionLog(
document_id=document.id,
pipeline_id=dataset.pipeline_id,
datasource_type="local_file",
datasource_info=data_source_info,
input_data={},
created_by=document.created_by,
datasource_node_id=file_node_id,
)
document_pipeline_execution_log.created_at = document.created_at
db.session.add(document)
db.session.add(document_pipeline_execution_log)
elif document.data_source_type == "notion_import":
document.data_source_type = "online_document"
data_source_info = json.dumps(
{
"workspace_id": data_source_info_dict.get("notion_workspace_id"),
"page": {
"page_id": data_source_info_dict.get("notion_page_id"),
"page_name": document.name,
"page_icon": data_source_info_dict.get("notion_page_icon"),
"type": data_source_info_dict.get("type"),
"last_edited_time": data_source_info_dict.get("last_edited_time"),
"parent_id": None,
},
}
)
document.data_source_info = data_source_info
document_pipeline_execution_log = DocumentPipelineExecutionLog(
document_id=document.id,
pipeline_id=dataset.pipeline_id,
datasource_type="online_document",
datasource_info=data_source_info,
input_data={},
created_by=document.created_by,
datasource_node_id=notion_node_id,
)
document_pipeline_execution_log.created_at = document.created_at
db.session.add(document)
db.session.add(document_pipeline_execution_log)
elif document.data_source_type == "website_crawl":
document.data_source_type = "website_crawl"
data_source_info = json.dumps(
{
"source_url": data_source_info_dict.get("url"),
"content": "",
"title": document.name,
"description": "",
}
)
document.data_source_info = data_source_info
if data_source_info_dict.get("provider") == "firecrawl":
datasource_node_id = firecrawl_node_id
elif data_source_info_dict.get("provider") == "jinareader":
datasource_node_id = jina_node_id
else:
continue
document_pipeline_execution_log = DocumentPipelineExecutionLog(
document_id=document.id,
pipeline_id=dataset.pipeline_id,
datasource_type="website_crawl",
datasource_info=data_source_info,
input_data={},
created_by=document.created_by,
datasource_node_id=datasource_node_id,
)
document_pipeline_execution_log.created_at = document.created_at
db.session.add(document)
db.session.add(document_pipeline_execution_log)

View File

@@ -0,0 +1,709 @@
dependencies:
- current_identifier: null
type: marketplace
value:
plugin_unique_identifier: langgenius/general_chunker:0.0.1@e3da408b7277866404c3f884d599261f9d0b9003ea4ef7eb3b64489bdf39d18b
- current_identifier: null
type: marketplace
value:
plugin_unique_identifier: langgenius/dify_extractor:0.0.1@50103421d4e002f059b662d21ad2d7a1cf34869abdbe320299d7e382516ebb1c
kind: rag_pipeline
rag_pipeline:
description: ''
icon: 📙
icon_background: ''
icon_type: emoji
name: file-general-economy
version: 0.1.0
workflow:
conversation_variables: []
environment_variables: []
features: {}
graph:
edges:
- data:
isInIteration: false
isInLoop: false
sourceType: datasource
targetType: if-else
id: 1752479895761-source-1752481129417-target
source: '1752479895761'
sourceHandle: source
target: '1752481129417'
targetHandle: target
type: custom
zIndex: 0
- data:
isInLoop: false
sourceType: if-else
targetType: tool
id: 1752481129417-24e47cad-f1e2-4f74-9884-3f49d5bb37b7-1752480460682-target
source: '1752481129417'
sourceHandle: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
target: '1752480460682'
targetHandle: target
type: custom
zIndex: 0
- data:
isInLoop: false
sourceType: if-else
targetType: document-extractor
id: 1752481129417-false-1752481112180-target
source: '1752481129417'
sourceHandle: 'false'
target: '1752481112180'
targetHandle: target
type: custom
zIndex: 0
- data:
isInIteration: false
isInLoop: false
sourceType: tool
targetType: variable-aggregator
id: 1752480460682-source-1752482022496-target
source: '1752480460682'
sourceHandle: source
target: '1752482022496'
targetHandle: target
type: custom
zIndex: 0
- data:
isInLoop: false
sourceType: document-extractor
targetType: variable-aggregator
id: 1752481112180-source-1752482022496-target
source: '1752481112180'
sourceHandle: source
target: '1752482022496'
targetHandle: target
type: custom
zIndex: 0
- data:
isInIteration: false
isInLoop: false
sourceType: variable-aggregator
targetType: tool
id: 1752482022496-source-1752482151668-target
source: '1752482022496'
sourceHandle: source
target: '1752482151668'
targetHandle: target
type: custom
zIndex: 0
- data:
isInIteration: false
isInLoop: false
sourceType: tool
targetType: knowledge-index
id: 1752482151668-source-1752477924228-target
source: '1752482151668'
sourceHandle: source
target: '1752477924228'
targetHandle: target
type: custom
zIndex: 0
nodes:
- data:
chunk_structure: text_model
embedding_model: text-embedding-ada-002
embedding_model_provider: langgenius/openai/openai
index_chunk_variable_selector:
- '1752482151668'
- result
indexing_technique: economy
keyword_number: 10
retrieval_model:
score_threshold: 0.5
score_threshold_enabled: false
search_method: keyword_search
top_k: 3
vector_setting:
embedding_model_name: text-embedding-ada-002
embedding_provider_name: langgenius/openai/openai
selected: true
title: Knowledge Base
type: knowledge-index
height: 114
id: '1752477924228'
position:
x: 1076.4656678451215
y: 281.3910724383104
positionAbsolute:
x: 1076.4656678451215
y: 281.3910724383104
selected: true
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
datasource_configurations: {}
datasource_label: File
datasource_name: upload-file
datasource_parameters: {}
fileExtensions:
- txt
- markdown
- mdx
- pdf
- html
- xlsx
- xls
- vtt
- properties
- doc
- docx
- csv
- eml
- msg
- pptx
- xml
- epub
- ppt
- md
plugin_id: langgenius/file
provider_name: file
provider_type: local_file
selected: false
title: File
type: datasource
height: 52
id: '1752479895761'
position:
x: -839.8603427660498
y: 251.3910724383104
positionAbsolute:
x: -839.8603427660498
y: 251.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
is_team_authorization: true
output_schema:
properties:
documents:
description: the documents extracted from the file
items:
type: object
type: array
images:
description: The images extracted from the file
items:
type: object
type: array
type: object
paramSchemas:
- auto_generate: null
default: null
form: llm
human_description:
en_US: the file to be parsed (supports pdf, ppt, pptx, doc, docx, png, jpg,
jpeg)
ja_JP: 解析するファイル(pdf, ppt, pptx, doc, docx, png, jpg, jpegをサポート)
pt_BR: o arquivo a ser analisado (suporta pdf, ppt, pptx, doc, docx, png,
jpg, jpeg)
zh_Hans: 用于解析的文件(支持 pdf, ppt, pptx, doc, docx, png, jpg, jpeg)
label:
en_US: file
ja_JP: ファイル
pt_BR: arquivo
zh_Hans: file
llm_description: the file to be parsed (support pdf, ppt, pptx, doc, docx,
png, jpg, jpeg)
max: null
min: null
name: file
options: []
placeholder: null
precision: null
required: true
scope: null
template: null
type: file
params:
file: ''
provider_id: langgenius/dify_extractor/dify_extractor
provider_name: langgenius/dify_extractor/dify_extractor
provider_type: builtin
selected: false
title: Dify Extractor
tool_configurations: {}
tool_description: Dify Extractor
tool_label: Dify Extractor
tool_name: dify_extractor
tool_parameters:
file:
type: variable
value:
- '1752479895761'
- file
type: tool
height: 52
id: '1752480460682'
position:
x: -108.28652292656551
y: 281.3910724383104
positionAbsolute:
x: -108.28652292656551
y: 281.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
is_array_file: false
selected: false
title: 文档提取器
type: document-extractor
variable_selector:
- '1752479895761'
- file
height: 90
id: '1752481112180'
position:
x: -108.28652292656551
y: 390.6576481692478
positionAbsolute:
x: -108.28652292656551
y: 390.6576481692478
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
cases:
- case_id: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
conditions:
- comparison_operator: is
id: 9da88d93-3ff6-463f-abfd-6bcafbf2554d
value: .xlsx
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: d0e88f5e-dfe3-4bae-af0c-dbec267500de
value: .xls
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: a957e91e-1ed7-4c6b-9c80-2f0948858f1d
value: .md
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: 870c3c39-8d3f-474a-ab8b-9c0ccf53db73
value: .markdown
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: f9541513-1e71-4dc1-9db5-35dc84a39e3c
value: .mdx
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: 4c7f455b-ac20-40ca-9495-6cc44ffcb35d
value: .html
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: 2e12d9c7-8057-4a09-8851-f9fd1d0718d1
value: .htm
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: 73a995a9-d8b9-4aef-89f7-306e2ddcbce2
value: .docx
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: 8a2e8772-0426-458b-a1f9-9eaaec0f27c8
value: .csv
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: aa2cb6b6-a2fc-462a-a9f5-c9c3f33a1602
value: .txt
varType: file
variable_selector:
- '1752479895761'
- file
- extension
id: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
logical_operator: or
selected: false
title: IF/ELSE
type: if-else
height: 358
id: '1752481129417'
position:
x: -489.57009543377865
y: 251.3910724383104
positionAbsolute:
x: -489.57009543377865
y: 251.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
advanced_settings:
group_enabled: false
groups:
- groupId: f4cf07b4-914d-4544-8ef8-0c5d9e4f21a7
group_name: Group1
output_type: string
variables:
- - '1752481112180'
- text
- - '1752480460682'
- text
output_type: string
selected: false
title: Variable Aggregator
type: variable-aggregator
variables:
- - '1752481112180'
- text
- - '1752480460682'
- text
height: 129
id: '1752482022496'
position:
x: 319.441649575055
y: 281.3910724383104
positionAbsolute:
x: 319.441649575055
y: 281.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
is_team_authorization: true
output_schema:
properties:
result:
description: The result of the general chunk tool.
properties:
general_chunks:
items:
description: The chunk of the text.
type: string
type: array
type: object
type: object
paramSchemas:
- auto_generate: null
default: null
form: llm
human_description:
en_US: The text you want to chunk.
ja_JP: チャンク化したいテキスト。
pt_BR: O texto que você deseja dividir.
zh_Hans: 你想要分块的文本。
label:
en_US: Input Variable
ja_JP: 入力変数
pt_BR: Variável de entrada
zh_Hans: 输入变量
llm_description: The text you want to chunk.
max: null
min: null
name: input_variable
options: []
placeholder: null
precision: null
required: true
scope: null
template: null
type: string
- auto_generate: null
default: null
form: llm
human_description:
en_US: The delimiter of the chunks.
ja_JP: チャンクの区切り記号。
pt_BR: O delimitador dos blocos.
zh_Hans: 块的分隔符。
label:
en_US: Delimiter
ja_JP: 区切り記号
pt_BR: Delimitador
zh_Hans: 分隔符
llm_description: The delimiter of the chunks, the format of the delimiter
must be a string.
max: null
min: null
name: delimiter
options: []
placeholder: null
precision: null
required: true
scope: null
template: null
type: string
- auto_generate: null
default: null
form: llm
human_description:
en_US: The maximum chunk length.
ja_JP: チャンクの最大長。
pt_BR: O comprimento máximo do bloco
zh_Hans: 最大块的长度。
label:
en_US: Maximum Chunk Length
ja_JP: チャンク最大長
pt_BR: O comprimento máximo do bloco
zh_Hans: 最大块的长度
llm_description: The maximum chunk length, the format of the chunk size
must be an integer.
max: null
min: null
name: max_chunk_length
options: []
placeholder: null
precision: null
required: true
scope: null
template: null
type: number
- auto_generate: null
default: null
form: llm
human_description:
en_US: The chunk overlap length.
ja_JP: チャンクの重複長
pt_BR: O comprimento de sobreposição dos fragmentos
zh_Hans: 块的重叠长度。
label:
en_US: Chunk Overlap Length
ja_JP: チャンク重複長
pt_BR: Comprimento de sobreposição do bloco
zh_Hans: 块的重叠长度
llm_description: The chunk overlap length, the format of the chunk overlap
length must be an integer.
max: null
min: null
name: chunk_overlap_length
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: number
- auto_generate: null
default: null
form: llm
human_description:
en_US: Replace consecutive spaces, newlines and tabs
ja_JP: 連続のスペース、改行、またはタブを置換する
pt_BR: Substituir espaços consecutivos, novas linhas e tabulações
zh_Hans: 替换连续的空格、换行符和制表符
label:
en_US: Replace Consecutive Spaces, Newlines and Tabs
ja_JP: 連続のスペース、改行、またはタブを置換する
pt_BR: Substituir espaços consecutivos, novas linhas e tabulações
zh_Hans: 替换连续的空格、换行符和制表符
llm_description: Replace consecutive spaces, newlines and tabs, the format
of the replace must be a boolean.
max: null
min: null
name: replace_consecutive_spaces_newlines_tabs
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: boolean
- auto_generate: null
default: null
form: llm
human_description:
en_US: Delete all URLs and email addresses
ja_JP: すべてのURLとメールアドレスを削除する
pt_BR: Excluir todos os URLs e endereços de e-mail
zh_Hans: 删除所有URL和电子邮件地址
label:
en_US: Delete All URLs and Email Addresses
ja_JP: すべてのURLとメールアドレスを削除する
pt_BR: Excluir todos os URLs e endereços de e-mail
zh_Hans: 删除所有URL和电子邮件地址
llm_description: Delete all URLs and email addresses, the format of the
delete must be a boolean.
max: null
min: null
name: delete_all_urls_and_email_addresses
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: boolean
params:
chunk_overlap_length: ''
delete_all_urls_and_email_addresses: ''
delimiter: ''
input_variable: ''
max_chunk_length: ''
replace_consecutive_spaces_newlines_tabs: ''
provider_id: langgenius/general_chunker/general_chunker
provider_name: langgenius/general_chunker/general_chunker
provider_type: builtin
selected: false
title: General Chunker
tool_configurations: {}
tool_description: A tool for general text chunking mode, the chunks retrieved and recalled are the same.
tool_label: General Chunker
tool_name: general_chunker
tool_parameters:
chunk_overlap_length:
type: variable
value:
- rag
- shared
- chunk_overlap
delete_all_urls_and_email_addresses:
type: mixed
value: '{{#rag.shared.delete_urls_email#}}'
delimiter:
type: mixed
value: '{{#rag.shared.delimiter#}}'
input_variable:
type: mixed
value: '{{#1752482022496.output#}}'
max_chunk_length:
type: variable
value:
- rag
- shared
- max_chunk_length
replace_consecutive_spaces_newlines_tabs:
type: mixed
value: '{{#rag.shared.replace_consecutive_spaces#}}'
type: tool
height: 52
id: '1752482151668'
position:
x: 693.5300771507484
y: 281.3910724383104
positionAbsolute:
x: 693.5300771507484
y: 281.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
viewport:
x: 701.4999626224237
y: 128.33739021504016
zoom: 0.48941689643726966
rag_pipeline_variables:
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: \n\n
label: Delimiter
max_length: 100
options: []
placeholder: null
required: true
tooltips: A delimiter is the character used to separate text. \n\n is recommended
for splitting the original document into chunks. You can also use custom
delimiters that you define yourself.
type: text-input
unit: null
variable: delimiter
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Maximum chunk length
max_length: 48
options: []
placeholder: null
required: true
tooltips: null
type: number
unit: characters
variable: max_chunk_length
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Chunk overlap
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: number
unit: characters
variable: chunk_overlap
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Replace consecutive spaces, newlines and tabs
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: replace_consecutive_spaces
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Delete all URLs and email addresses
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: delete_urls_email

View File

@@ -0,0 +1,709 @@
dependencies:
- current_identifier: null
type: marketplace
value:
plugin_unique_identifier: langgenius/general_chunker:0.0.1@e3da408b7277866404c3f884d599261f9d0b9003ea4ef7eb3b64489bdf39d18b
- current_identifier: null
type: marketplace
value:
plugin_unique_identifier: langgenius/dify_extractor:0.0.1@50103421d4e002f059b662d21ad2d7a1cf34869abdbe320299d7e382516ebb1c
kind: rag_pipeline
rag_pipeline:
description: ''
icon: 📙
icon_background: '#FFF4ED'
icon_type: emoji
name: file-general-high-quality
version: 0.1.0
workflow:
conversation_variables: []
environment_variables: []
features: {}
graph:
edges:
- data:
isInIteration: false
isInLoop: false
sourceType: datasource
targetType: if-else
id: 1752479895761-source-1752481129417-target
source: '1752479895761'
sourceHandle: source
target: '1752481129417'
targetHandle: target
type: custom
zIndex: 0
- data:
isInLoop: false
sourceType: if-else
targetType: tool
id: 1752481129417-24e47cad-f1e2-4f74-9884-3f49d5bb37b7-1752480460682-target
source: '1752481129417'
sourceHandle: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
target: '1752480460682'
targetHandle: target
type: custom
zIndex: 0
- data:
isInLoop: false
sourceType: if-else
targetType: document-extractor
id: 1752481129417-false-1752481112180-target
source: '1752481129417'
sourceHandle: 'false'
target: '1752481112180'
targetHandle: target
type: custom
zIndex: 0
- data:
isInIteration: false
isInLoop: false
sourceType: tool
targetType: variable-aggregator
id: 1752480460682-source-1752482022496-target
source: '1752480460682'
sourceHandle: source
target: '1752482022496'
targetHandle: target
type: custom
zIndex: 0
- data:
isInLoop: false
sourceType: document-extractor
targetType: variable-aggregator
id: 1752481112180-source-1752482022496-target
source: '1752481112180'
sourceHandle: source
target: '1752482022496'
targetHandle: target
type: custom
zIndex: 0
- data:
isInIteration: false
isInLoop: false
sourceType: variable-aggregator
targetType: tool
id: 1752482022496-source-1752482151668-target
source: '1752482022496'
sourceHandle: source
target: '1752482151668'
targetHandle: target
type: custom
zIndex: 0
- data:
isInIteration: false
isInLoop: false
sourceType: tool
targetType: knowledge-index
id: 1752482151668-source-1752477924228-target
source: '1752482151668'
sourceHandle: source
target: '1752477924228'
targetHandle: target
type: custom
zIndex: 0
nodes:
- data:
chunk_structure: text_model
embedding_model: text-embedding-ada-002
embedding_model_provider: langgenius/openai/openai
index_chunk_variable_selector:
- '1752482151668'
- result
indexing_technique: high_quality
keyword_number: 10
retrieval_model:
score_threshold: 0.5
score_threshold_enabled: false
search_method: semantic_search
top_k: 3
vector_setting:
embedding_model_name: text-embedding-ada-002
embedding_provider_name: langgenius/openai/openai
selected: false
title: Knowledge Base
type: knowledge-index
height: 114
id: '1752477924228'
position:
x: 1076.4656678451215
y: 281.3910724383104
positionAbsolute:
x: 1076.4656678451215
y: 281.3910724383104
selected: true
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
datasource_configurations: {}
datasource_label: File
datasource_name: upload-file
datasource_parameters: {}
fileExtensions:
- txt
- markdown
- mdx
- pdf
- html
- xlsx
- xls
- vtt
- properties
- doc
- docx
- csv
- eml
- msg
- pptx
- xml
- epub
- ppt
- md
plugin_id: langgenius/file
provider_name: file
provider_type: local_file
selected: false
title: File
type: datasource
height: 52
id: '1752479895761'
position:
x: -839.8603427660498
y: 251.3910724383104
positionAbsolute:
x: -839.8603427660498
y: 251.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
is_team_authorization: true
output_schema:
properties:
documents:
description: the documents extracted from the file
items:
type: object
type: array
images:
description: The images extracted from the file
items:
type: object
type: array
type: object
paramSchemas:
- auto_generate: null
default: null
form: llm
human_description:
en_US: the file to be parsed (supports pdf, ppt, pptx, doc, docx, png, jpg,
jpeg)
ja_JP: 解析するファイル(pdf, ppt, pptx, doc, docx, png, jpg, jpegをサポート)
pt_BR: o arquivo a ser analisado (suporta pdf, ppt, pptx, doc, docx, png,
jpg, jpeg)
zh_Hans: 用于解析的文件(支持 pdf, ppt, pptx, doc, docx, png, jpg, jpeg)
label:
en_US: file
ja_JP: ファイル
pt_BR: arquivo
zh_Hans: file
llm_description: the file to be parsed (support pdf, ppt, pptx, doc, docx,
png, jpg, jpeg)
max: null
min: null
name: file
options: []
placeholder: null
precision: null
required: true
scope: null
template: null
type: file
params:
file: ''
provider_id: langgenius/dify_extractor/dify_extractor
provider_name: langgenius/dify_extractor/dify_extractor
provider_type: builtin
selected: false
title: Dify Extractor
tool_configurations: {}
tool_description: Dify Extractor
tool_label: Dify Extractor
tool_name: dify_extractor
tool_parameters:
file:
type: variable
value:
- '1752479895761'
- file
type: tool
height: 52
id: '1752480460682'
position:
x: -108.28652292656551
y: 281.3910724383104
positionAbsolute:
x: -108.28652292656551
y: 281.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
is_array_file: false
selected: false
title: 文档提取器
type: document-extractor
variable_selector:
- '1752479895761'
- file
height: 90
id: '1752481112180'
position:
x: -108.28652292656551
y: 390.6576481692478
positionAbsolute:
x: -108.28652292656551
y: 390.6576481692478
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
cases:
- case_id: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
conditions:
- comparison_operator: is
id: 9da88d93-3ff6-463f-abfd-6bcafbf2554d
value: .xlsx
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: d0e88f5e-dfe3-4bae-af0c-dbec267500de
value: .xls
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: a957e91e-1ed7-4c6b-9c80-2f0948858f1d
value: .md
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: 870c3c39-8d3f-474a-ab8b-9c0ccf53db73
value: .markdown
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: f9541513-1e71-4dc1-9db5-35dc84a39e3c
value: .mdx
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: 4c7f455b-ac20-40ca-9495-6cc44ffcb35d
value: .html
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: 2e12d9c7-8057-4a09-8851-f9fd1d0718d1
value: .htm
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: 73a995a9-d8b9-4aef-89f7-306e2ddcbce2
value: .docx
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: 8a2e8772-0426-458b-a1f9-9eaaec0f27c8
value: .csv
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: aa2cb6b6-a2fc-462a-a9f5-c9c3f33a1602
value: .txt
varType: file
variable_selector:
- '1752479895761'
- file
- extension
id: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
logical_operator: or
selected: false
title: IF/ELSE
type: if-else
height: 358
id: '1752481129417'
position:
x: -489.57009543377865
y: 251.3910724383104
positionAbsolute:
x: -489.57009543377865
y: 251.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
advanced_settings:
group_enabled: false
groups:
- groupId: f4cf07b4-914d-4544-8ef8-0c5d9e4f21a7
group_name: Group1
output_type: string
variables:
- - '1752481112180'
- text
- - '1752480460682'
- text
output_type: string
selected: false
title: Variable Aggregator
type: variable-aggregator
variables:
- - '1752481112180'
- text
- - '1752480460682'
- text
height: 129
id: '1752482022496'
position:
x: 319.441649575055
y: 281.3910724383104
positionAbsolute:
x: 319.441649575055
y: 281.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
is_team_authorization: true
output_schema:
properties:
result:
description: The result of the general chunk tool.
properties:
general_chunks:
items:
description: The chunk of the text.
type: string
type: array
type: object
type: object
paramSchemas:
- auto_generate: null
default: null
form: llm
human_description:
en_US: The text you want to chunk.
ja_JP: チャンク化したいテキスト。
pt_BR: O texto que você deseja dividir.
zh_Hans: 你想要分块的文本。
label:
en_US: Input Variable
ja_JP: 入力変数
pt_BR: Variável de entrada
zh_Hans: 输入变量
llm_description: The text you want to chunk.
max: null
min: null
name: input_variable
options: []
placeholder: null
precision: null
required: true
scope: null
template: null
type: string
- auto_generate: null
default: null
form: llm
human_description:
en_US: The delimiter of the chunks.
ja_JP: チャンクの区切り記号。
pt_BR: O delimitador dos pedaços.
zh_Hans: 块的分隔符。
label:
en_US: Delimiter
ja_JP: 区切り記号
pt_BR: Delimitador
zh_Hans: 分隔符
llm_description: The delimiter of the chunks, the format of the delimiter
must be a string.
max: null
min: null
name: delimiter
options: []
placeholder: null
precision: null
required: true
scope: null
template: null
type: string
- auto_generate: null
default: null
form: llm
human_description:
en_US: The maximum chunk length.
ja_JP: チャンクの最大長。
pt_BR: O comprimento máximo do bloco
zh_Hans: 最大块的长度。
label:
en_US: Maximum Chunk Length
ja_JP: チャンク最大長
pt_BR: O comprimento máximo do bloco
zh_Hans: 最大块的长度
llm_description: The maximum chunk length, the format of the chunk size
must be an integer.
max: null
min: null
name: max_chunk_length
options: []
placeholder: null
precision: null
required: true
scope: null
template: null
type: number
- auto_generate: null
default: null
form: llm
human_description:
en_US: The chunk overlap length.
ja_JP: チャンクの重複長
pt_BR: O comprimento de sobreposição dos fragmentos
zh_Hans: 块的重叠长度。
label:
en_US: Chunk Overlap Length
ja_JP: チャンク重複長
pt_BR: Comprimento de sobreposição do bloco
zh_Hans: 块的重叠长度
llm_description: The chunk overlap length, the format of the chunk overlap
length must be an integer.
max: null
min: null
name: chunk_overlap_length
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: number
- auto_generate: null
default: null
form: llm
human_description:
en_US: Replace consecutive spaces, newlines and tabs
ja_JP: 連続のスペース、改行、またはタブを置換する
pt_BR: Substituir espaços consecutivos, novas linhas e tabulações
zh_Hans: 替换连续的空格、换行符和制表符
label:
en_US: Replace Consecutive Spaces, Newlines and Tabs
ja_JP: 連続のスペース、改行、またはタブを置換する
pt_BR: Substituir espaços consecutivos, novas linhas e tabulações
zh_Hans: 替换连续的空格、换行符和制表符
llm_description: Replace consecutive spaces, newlines and tabs, the format
of the replace must be a boolean.
max: null
min: null
name: replace_consecutive_spaces_newlines_tabs
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: boolean
- auto_generate: null
default: null
form: llm
human_description:
en_US: Delete all URLs and email addresses
ja_JP: すべてのURLとメールアドレスを削除する
pt_BR: Excluir todos os URLs e endereços de e-mail
zh_Hans: 删除所有URL和电子邮件地址
label:
en_US: Delete All URLs and Email Addresses
ja_JP: すべてのURLとメールアドレスを削除する
pt_BR: Excluir todos os URLs e endereços de e-mail
zh_Hans: 删除所有URL和电子邮件地址
llm_description: Delete all URLs and email addresses, the format of the
delete must be a boolean.
max: null
min: null
name: delete_all_urls_and_email_addresses
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: boolean
params:
chunk_overlap_length: ''
delete_all_urls_and_email_addresses: ''
delimiter: ''
input_variable: ''
max_chunk_length: ''
replace_consecutive_spaces_newlines_tabs: ''
provider_id: langgenius/general_chunker/general_chunker
provider_name: langgenius/general_chunker/general_chunker
provider_type: builtin
selected: false
title: General Chunker
tool_configurations: {}
tool_description: A tool for general text chunking mode, the chunks retrieved and recalled are the same.
tool_label: General Chunker
tool_name: general_chunker
tool_parameters:
chunk_overlap_length:
type: variable
value:
- rag
- shared
- chunk_overlap
delete_all_urls_and_email_addresses:
type: mixed
value: '{{#rag.shared.delete_urls_email#}}'
delimiter:
type: mixed
value: '{{#rag.shared.delimiter#}}'
input_variable:
type: mixed
value: '{{#1752482022496.output#}}'
max_chunk_length:
type: variable
value:
- rag
- shared
- max_chunk_length
replace_consecutive_spaces_newlines_tabs:
type: mixed
value: '{{#rag.shared.replace_consecutive_spaces#}}'
type: tool
height: 52
id: '1752482151668'
position:
x: 693.5300771507484
y: 281.3910724383104
positionAbsolute:
x: 693.5300771507484
y: 281.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
viewport:
x: 701.4999626224237
y: 128.33739021504016
zoom: 0.48941689643726966
rag_pipeline_variables:
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: \n\n
label: Delimiter
max_length: 100
options: []
placeholder: null
required: true
tooltips: A delimiter is the character used to separate text. \n\n is recommended
for splitting the original document into chunks. You can also use custom
delimiters that you define yourself.
type: text-input
unit: null
variable: delimiter
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Maximum chunk length
max_length: 48
options: []
placeholder: null
required: true
tooltips: null
type: number
unit: characters
variable: max_chunk_length
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Chunk overlap
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: number
unit: characters
variable: chunk_overlap
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Replace consecutive spaces, newlines and tabs
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: replace_consecutive_spaces
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Delete all URLs and email addresses
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: delete_urls_email

View File

@@ -0,0 +1,814 @@
dependencies:
- current_identifier: null
type: marketplace
value:
plugin_unique_identifier: langgenius/parentchild_chunker:0.0.1@b1a28a27e33fec442ce494da2a7814edd7eb9d646c81f38bccfcf1133d486e40
- current_identifier: null
type: marketplace
value:
plugin_unique_identifier: langgenius/dify_extractor:0.0.1@50103421d4e002f059b662d21ad2d7a1cf34869abdbe320299d7e382516ebb1c
kind: rag_pipeline
rag_pipeline:
description: ''
icon: 📙
icon_background: '#FFF4ED'
icon_type: emoji
name: file-parentchild
version: 0.1.0
workflow:
conversation_variables: []
environment_variables: []
features: {}
graph:
edges:
- data:
isInIteration: false
isInLoop: false
sourceType: datasource
targetType: if-else
id: 1752479895761-source-1752481129417-target
source: '1752479895761'
sourceHandle: source
target: '1752481129417'
targetHandle: target
type: custom
zIndex: 0
- data:
isInLoop: false
sourceType: if-else
targetType: tool
id: 1752481129417-24e47cad-f1e2-4f74-9884-3f49d5bb37b7-1752480460682-target
source: '1752481129417'
sourceHandle: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
target: '1752480460682'
targetHandle: target
type: custom
zIndex: 0
- data:
isInLoop: false
sourceType: if-else
targetType: document-extractor
id: 1752481129417-false-1752481112180-target
source: '1752481129417'
sourceHandle: 'false'
target: '1752481112180'
targetHandle: target
type: custom
zIndex: 0
- data:
isInIteration: false
isInLoop: false
sourceType: tool
targetType: variable-aggregator
id: 1752480460682-source-1752482022496-target
source: '1752480460682'
sourceHandle: source
target: '1752482022496'
targetHandle: target
type: custom
zIndex: 0
- data:
isInLoop: false
sourceType: document-extractor
targetType: variable-aggregator
id: 1752481112180-source-1752482022496-target
source: '1752481112180'
sourceHandle: source
target: '1752482022496'
targetHandle: target
type: custom
zIndex: 0
- data:
isInIteration: false
isInLoop: false
sourceType: variable-aggregator
targetType: tool
id: 1752482022496-source-1752575473519-target
source: '1752482022496'
sourceHandle: source
target: '1752575473519'
targetHandle: target
type: custom
zIndex: 0
- data:
isInLoop: false
sourceType: tool
targetType: knowledge-index
id: 1752575473519-source-1752477924228-target
source: '1752575473519'
sourceHandle: source
target: '1752477924228'
targetHandle: target
type: custom
zIndex: 0
nodes:
- data:
chunk_structure: hierarchical_model
embedding_model: text-embedding-ada-002
embedding_model_provider: langgenius/openai/openai
index_chunk_variable_selector:
- '1752575473519'
- result
indexing_technique: high_quality
keyword_number: 10
retrieval_model:
score_threshold: 0.5
score_threshold_enabled: false
search_method: semantic_search
top_k: 3
vector_setting:
embedding_model_name: text-embedding-ada-002
embedding_provider_name: langgenius/openai/openai
selected: false
title: Knowledge Base
type: knowledge-index
height: 114
id: '1752477924228'
position:
x: 994.3774545394483
y: 281.3910724383104
positionAbsolute:
x: 994.3774545394483
y: 281.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
datasource_configurations: {}
datasource_label: File
datasource_name: upload-file
datasource_parameters: {}
fileExtensions:
- txt
- markdown
- mdx
- pdf
- html
- xlsx
- xls
- vtt
- properties
- doc
- docx
- csv
- eml
- msg
- pptx
- xml
- epub
- ppt
- md
plugin_id: langgenius/file
provider_name: file
provider_type: local_file
selected: false
title: File
type: datasource
height: 52
id: '1752479895761'
position:
x: -839.8603427660498
y: 251.3910724383104
positionAbsolute:
x: -839.8603427660498
y: 251.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
is_team_authorization: true
output_schema:
properties:
documents:
description: the documents extracted from the file
items:
type: object
type: array
images:
description: The images extracted from the file
items:
type: object
type: array
type: object
paramSchemas:
- auto_generate: null
default: null
form: llm
human_description:
en_US: the file to be parsed (supports pdf, ppt, pptx, doc, docx, png, jpg,
jpeg)
ja_JP: 解析するファイル(pdf, ppt, pptx, doc, docx, png, jpg, jpegをサポート)
pt_BR: o arquivo a ser analisado (suporta pdf, ppt, pptx, doc, docx, png,
jpg, jpeg)
zh_Hans: 用于解析的文件(支持 pdf, ppt, pptx, doc, docx, png, jpg, jpeg)
label:
en_US: file
ja_JP: ファイル
pt_BR: arquivo
zh_Hans: file
llm_description: the file to be parsed (support pdf, ppt, pptx, doc, docx,
png, jpg, jpeg)
max: null
min: null
name: file
options: []
placeholder: null
precision: null
required: true
scope: null
template: null
type: file
params:
file: ''
provider_id: langgenius/dify_extractor/dify_extractor
provider_name: langgenius/dify_extractor/dify_extractor
provider_type: builtin
selected: false
title: Dify Extractor
tool_configurations: {}
tool_description: Dify Extractor
tool_label: Dify Extractor
tool_name: dify_extractor
tool_parameters:
file:
type: variable
value:
- '1752479895761'
- file
type: tool
height: 52
id: '1752480460682'
position:
x: -108.28652292656551
y: 281.3910724383104
positionAbsolute:
x: -108.28652292656551
y: 281.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
is_array_file: false
selected: false
title: 文档提取器
type: document-extractor
variable_selector:
- '1752479895761'
- file
height: 90
id: '1752481112180'
position:
x: -108.28652292656551
y: 390.6576481692478
positionAbsolute:
x: -108.28652292656551
y: 390.6576481692478
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
cases:
- case_id: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
conditions:
- comparison_operator: is
id: 9da88d93-3ff6-463f-abfd-6bcafbf2554d
value: .xlsx
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: d0e88f5e-dfe3-4bae-af0c-dbec267500de
value: .xls
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: a957e91e-1ed7-4c6b-9c80-2f0948858f1d
value: .md
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: 870c3c39-8d3f-474a-ab8b-9c0ccf53db73
value: .markdown
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: f9541513-1e71-4dc1-9db5-35dc84a39e3c
value: .mdx
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: 4c7f455b-ac20-40ca-9495-6cc44ffcb35d
value: .html
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: 2e12d9c7-8057-4a09-8851-f9fd1d0718d1
value: .htm
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: 73a995a9-d8b9-4aef-89f7-306e2ddcbce2
value: .docx
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: 8a2e8772-0426-458b-a1f9-9eaaec0f27c8
value: .csv
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: aa2cb6b6-a2fc-462a-a9f5-c9c3f33a1602
value: .txt
varType: file
variable_selector:
- '1752479895761'
- file
- extension
id: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
logical_operator: or
selected: false
title: IF/ELSE
type: if-else
height: 358
id: '1752481129417'
position:
x: -512.2335487893622
y: 251.3910724383104
positionAbsolute:
x: -512.2335487893622
y: 251.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
advanced_settings:
group_enabled: false
groups:
- groupId: f4cf07b4-914d-4544-8ef8-0c5d9e4f21a7
group_name: Group1
output_type: string
variables:
- - '1752481112180'
- text
- - '1752480460682'
- text
output_type: string
selected: false
title: Variable Aggregator
type: variable-aggregator
variables:
- - '1752481112180'
- text
- - '1752480460682'
- text
height: 129
id: '1752482022496'
position:
x: 319.441649575055
y: 281.3910724383104
positionAbsolute:
x: 319.441649575055
y: 281.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
is_team_authorization: true
output_schema:
properties:
result:
description: Parent child chunks result
items:
type: object
type: array
type: object
paramSchemas:
- auto_generate: null
default: null
form: llm
human_description:
en_US: The text you want to chunk.
ja_JP: チャンク化したいテキスト。
pt_BR: O texto que você deseja dividir.
zh_Hans: 你想要分块的文本。
label:
en_US: Input text
ja_JP: 入力テキスト
pt_BR: Texto de entrada
zh_Hans: 输入文本
llm_description: The text you want to chunk.
max: null
min: null
name: input_text
options: []
placeholder: null
precision: null
required: true
scope: null
template: null
type: string
- auto_generate: null
default: 1024
form: llm
human_description:
en_US: Maximum length for chunking
ja_JP: チャンク分割の最大長
pt_BR: Comprimento máximo para divisão
zh_Hans: 用于分块的最大长度
label:
en_US: Maximum Length
ja_JP: 最大長
pt_BR: Comprimento Máximo
zh_Hans: 最大长度
llm_description: Maximum length allowed per chunk
max: null
min: null
name: max_length
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: number
- auto_generate: null
default: '
'
form: llm
human_description:
en_US: Separator used for chunking
ja_JP: チャンク分割に使用する区切り文字
pt_BR: Separador usado para divisão
zh_Hans: 用于分块的分隔符
label:
en_US: Chunk Separator
ja_JP: チャンク区切り文字
pt_BR: Separador de Divisão
zh_Hans: 分块分隔符
llm_description: The separator used to split chunks
max: null
min: null
name: separator
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: string
- auto_generate: null
default: 512
form: llm
human_description:
en_US: Maximum length for subchunking
ja_JP: サブチャンク分割の最大長
pt_BR: Comprimento máximo para subdivisão
zh_Hans: 用于子分块的最大长度
label:
en_US: Subchunk Maximum Length
ja_JP: サブチャンク最大長
pt_BR: Comprimento Máximo de Subdivisão
zh_Hans: 子分块最大长度
llm_description: Maximum length allowed per subchunk
max: null
min: null
name: subchunk_max_length
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: number
- auto_generate: null
default: '. '
form: llm
human_description:
en_US: Separator used for subchunking
ja_JP: サブチャンク分割に使用する区切り文字
pt_BR: Separador usado para subdivisão
zh_Hans: 用于子分块的分隔符
label:
en_US: Subchunk Separator
ja_JP: サブチャンキング用セパレーター
pt_BR: Separador de Subdivisão
zh_Hans: 子分块分隔符
llm_description: The separator used to split subchunks
max: null
min: null
name: subchunk_separator
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: string
- auto_generate: null
default: paragraph
form: llm
human_description:
en_US: Split text into paragraphs based on separator and maximum chunk
length, using split text as parent block or entire document as parent
block and directly retrieve.
ja_JP: セパレーターと最大チャンク長に基づいてテキストを段落に分割し、分割されたテキスト
を親ブロックとして使用するか、文書全体を親ブロックとして使用して直接取得します。
pt_BR: Dividir texto em parágrafos com base no separador e no comprimento
máximo do bloco, usando o texto dividido como bloco pai ou documento
completo como bloco pai e diretamente recuperá-lo.
zh_Hans: 根据分隔符和最大块长度将文本拆分为段落,使用拆分文本作为检索的父块或整个文档用作父块并直接检索。
label:
en_US: Parent Mode
ja_JP: 親子モード
pt_BR: Modo Pai
zh_Hans: 父块模式
llm_description: Split text into paragraphs based on separator and maximum
chunk length, using split text as parent block or entire document as parent
block and directly retrieve.
max: null
min: null
name: parent_mode
options:
- icon: ''
label:
en_US: Paragraph
ja_JP: 段落
pt_BR: Parágrafo
zh_Hans: 段落
value: paragraph
- icon: ''
label:
en_US: Full Document
ja_JP: 全文
pt_BR: Documento Completo
zh_Hans: 全文
value: full_doc
placeholder: null
precision: null
required: true
scope: null
template: null
type: select
- auto_generate: null
default: 0
form: llm
human_description:
en_US: Whether to remove extra spaces in the text
ja_JP: テキスト内の余分なスペースを削除するかどうか
pt_BR: Se deve remover espaços extras no texto
zh_Hans: 是否移除文本中的多余空格
label:
en_US: Remove Extra Spaces
ja_JP: 余分なスペースを削除
pt_BR: Remover Espaços Extras
zh_Hans: 移除多余空格
llm_description: Whether to remove extra spaces in the text
max: null
min: null
name: remove_extra_spaces
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: boolean
- auto_generate: null
default: 0
form: llm
human_description:
en_US: Whether to remove URLs and emails in the text
ja_JP: テキスト内のURLやメールアドレスを削除するかどうか
pt_BR: Se deve remover URLs e e-mails no texto
zh_Hans: 是否移除文本中的URL和电子邮件地址
label:
en_US: Remove URLs and Emails
ja_JP: URLとメールアドレスを削除
pt_BR: Remover URLs e E-mails
zh_Hans: 移除URL和电子邮件地址
llm_description: Whether to remove URLs and emails in the text
max: null
min: null
name: remove_urls_emails
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: boolean
params:
input_text: ''
max_length: ''
parent_mode: ''
remove_extra_spaces: ''
remove_urls_emails: ''
separator: ''
subchunk_max_length: ''
subchunk_separator: ''
provider_id: langgenius/parentchild_chunker/parentchild_chunker
provider_name: langgenius/parentchild_chunker/parentchild_chunker
provider_type: builtin
selected: false
title: Parent-child Chunker
tool_configurations: {}
tool_description: Parent-child Chunk Structure
tool_label: Parent-child Chunker
tool_name: parentchild_chunker
tool_parameters:
input_text:
type: mixed
value: '{{#1752482022496.output#}}'
max_length:
type: variable
value:
- rag
- shared
- max_chunk_length
parent_mode:
type: variable
value:
- rag
- shared
- parent_mode
remove_extra_spaces:
type: mixed
value: '{{#rag.shared.replace_consecutive_spaces#}}'
remove_urls_emails:
type: mixed
value: '{{#rag.shared.delete_urls_email#}}'
separator:
type: mixed
value: '{{#rag.shared.delimiter#}}'
subchunk_max_length:
type: variable
value:
- rag
- shared
- child_max_chunk_length
subchunk_separator:
type: mixed
value: '{{#rag.shared.child_delimiter#}}'
type: tool
height: 52
id: '1752575473519'
position:
x: 637.9241611063885
y: 281.3910724383104
positionAbsolute:
x: 637.9241611063885
y: 281.3910724383104
selected: true
sourcePosition: right
targetPosition: left
type: custom
width: 242
viewport:
x: 948.6766333808323
y: -102.06757184183238
zoom: 0.8375774577380971
rag_pipeline_variables:
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: \n\n
label: Delimiter
max_length: 256
options: []
placeholder: null
required: true
tooltips: A delimiter is the character used to separate text. \n\n is recommended
for splitting the original document into large parent chunks. You can also use
special delimiters defined by yourself.
type: text-input
unit: null
variable: delimiter
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: 1024
label: Maximum chunk length
max_length: 48
options: []
placeholder: null
required: true
tooltips: null
type: number
unit: characters
variable: max_chunk_length
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: \n
label: Child delimiter
max_length: 256
options: []
placeholder: null
required: true
tooltips: A delimiter is the character used to separate text. \n is recommended
for splitting parent chunks into small child chunks. You can also use
special delimiters defined by yourself.
type: text-input
unit: null
variable: child_delimiter
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: 512
label: Child max chunk length
max_length: 48
options: []
placeholder: null
required: true
tooltips: null
type: number
unit: characters
variable: child_max_chunk_length
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: paragraph
label: Parent mode
max_length: 48
options:
- full_doc
- paragraph
placeholder: null
required: true
tooltips: null
type: select
unit: null
variable: parent_mode
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Replace consecutive spaces, newlines and tabs
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: replace_consecutive_spaces
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Delete all URLs and email addresses
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: delete_urls_email

View File

@@ -0,0 +1,400 @@
dependencies:
- current_identifier: null
type: marketplace
value:
plugin_unique_identifier: langgenius/general_chunker:0.0.1@e3da408b7277866404c3f884d599261f9d0b9003ea4ef7eb3b64489bdf39d18b
- current_identifier: null
type: marketplace
value:
plugin_unique_identifier: langgenius/notion_datasource:0.0.1@2dd49c2c3ffff976be8d22efb1ac0f63522a8d0f24ef8c44729d0a50a94ec039
kind: rag_pipeline
rag_pipeline:
description: ''
icon: 📙
icon_background: ''
icon_type: emoji
name: notion-general-economy
version: 0.1.0
workflow:
conversation_variables: []
environment_variables: []
features: {}
graph:
edges:
- data:
isInIteration: false
isInLoop: false
sourceType: tool
targetType: knowledge-index
id: 1752482151668-source-1752477924228-target
source: '1752482151668'
sourceHandle: source
target: '1752477924228'
targetHandle: target
type: custom
zIndex: 0
- data:
isInIteration: false
isInLoop: false
sourceType: datasource
targetType: tool
id: 1752489759475-source-1752482151668-target
source: '1752489759475'
sourceHandle: source
target: '1752482151668'
targetHandle: target
type: custom
zIndex: 0
nodes:
- data:
chunk_structure: text_model
embedding_model: text-embedding-ada-002
embedding_model_provider: langgenius/openai/openai
index_chunk_variable_selector:
- '1752482151668'
- result
indexing_technique: economy
keyword_number: 10
retrieval_model:
score_threshold: 0.5
score_threshold_enabled: false
search_method: keyword_search
top_k: 3
vector_setting:
embedding_model_name: text-embedding-ada-002
embedding_provider_name: langgenius/openai/openai
selected: true
title: Knowledge Base
type: knowledge-index
height: 114
id: '1752477924228'
position:
x: 1444.5503479271906
y: 281.3910724383104
positionAbsolute:
x: 1444.5503479271906
y: 281.3910724383104
selected: true
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
is_team_authorization: true
output_schema:
properties:
result:
description: The result of the general chunk tool.
properties:
general_chunks:
items:
description: The chunk of the text.
type: string
type: array
type: object
type: object
paramSchemas:
- auto_generate: null
default: null
form: llm
human_description:
en_US: The text you want to chunk.
ja_JP: チャンク化したいテキスト。
pt_BR: O texto que você deseja dividir.
zh_Hans: 你想要分块的文本。
label:
en_US: Input Variable
ja_JP: 入力変数
pt_BR: Variável de entrada
zh_Hans: 输入变量
llm_description: The text you want to chunk.
max: null
min: null
name: input_variable
options: []
placeholder: null
precision: null
required: true
scope: null
template: null
type: string
- auto_generate: null
default: null
form: llm
human_description:
en_US: The delimiter of the chunks.
ja_JP: チャンクの区切り記号。
pt_BR: O delimitador dos pedaços.
zh_Hans: 块的分隔符。
label:
en_US: Delimiter
ja_JP: 区切り記号
pt_BR: Delimitador
zh_Hans: 分隔符
llm_description: The delimiter of the chunks, the format of the delimiter
must be a string.
max: null
min: null
name: delimiter
options: []
placeholder: null
precision: null
required: true
scope: null
template: null
type: string
- auto_generate: null
default: null
form: llm
human_description:
en_US: The maximum chunk length.
ja_JP: 最大長のチャンク。
pt_BR: O comprimento máximo do bloco
zh_Hans: 最大块的长度。
label:
en_US: Maximum Chunk Length
ja_JP: チャンク最大長
pt_BR: O comprimento máximo do bloco
zh_Hans: 最大块的长度
llm_description: The maximum chunk length, the format of the chunk size
must be an integer.
max: null
min: null
name: max_chunk_length
options: []
placeholder: null
precision: null
required: true
scope: null
template: null
type: number
- auto_generate: null
default: null
form: llm
human_description:
en_US: The chunk overlap length.
ja_JP: チャンクの重複長
pt_BR: The chunk overlap length.
zh_Hans: 块的重叠长度。
label:
en_US: Chunk Overlap Length
ja_JP: チャンク重複長
pt_BR: Chunk Overlap Length
zh_Hans: 块的重叠长度
llm_description: The chunk overlap length, the format of the chunk overlap
length must be an integer.
max: null
min: null
name: chunk_overlap_length
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: number
- auto_generate: null
default: null
form: llm
human_description:
en_US: Replace consecutive spaces, newlines and tabs
ja_JP: 連続のスペース、改行、またはタブを置換する
pt_BR: Replace consecutive spaces, newlines and tabs
zh_Hans: 替换连续的空格、换行符和制表符
label:
en_US: Replace Consecutive Spaces, Newlines and Tabs
ja_JP: 連続のスペース、改行、またはタブを置換する
pt_BR: Replace Consecutive Spaces, Newlines and Tabs
zh_Hans: 替换连续的空格、换行符和制表符
llm_description: Replace consecutive spaces, newlines and tabs, the format
of the replace must be a boolean.
max: null
min: null
name: replace_consecutive_spaces_newlines_tabs
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: boolean
- auto_generate: null
default: null
form: llm
human_description:
en_US: Delete all URLs and email addresses
ja_JP: すべてのURLとメールアドレスを削除する
pt_BR: Delete all URLs and email addresses
zh_Hans: 删除所有URL和电子邮件地址
label:
en_US: Delete All URLs and Email Addresses
ja_JP: すべてのURLとメールアドレスを削除する
pt_BR: Delete All URLs and Email Addresses
zh_Hans: 删除所有URL和电子邮件地址
llm_description: Delete all URLs and email addresses, the format of the
delete must be a boolean.
max: null
min: null
name: delete_all_urls_and_email_addresses
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: boolean
params:
chunk_overlap_length: ''
delete_all_urls_and_email_addresses: ''
delimiter: ''
input_variable: ''
max_chunk_length: ''
replace_consecutive_spaces_newlines_tabs: ''
provider_id: langgenius/general_chunker/general_chunker
provider_name: langgenius/general_chunker/general_chunker
provider_type: builtin
selected: false
title: General Chunker
tool_configurations: {}
tool_description: A tool for general text chunking mode, the chunks retrieved and recalled are the same.
tool_label: General Chunker
tool_name: general_chunker
tool_parameters:
chunk_overlap_length:
type: variable
value:
- rag
- shared
- chunk_overlap
delete_all_urls_and_email_addresses:
type: mixed
value: '{{#rag.shared.delete_urls_email#}}'
delimiter:
type: mixed
value: '{{#rag.shared.delimiter#}}'
input_variable:
type: mixed
value: '{{#1752489759475.content#}}'
max_chunk_length:
type: variable
value:
- rag
- shared
- max_chunk_length
replace_consecutive_spaces_newlines_tabs:
type: mixed
value: '{{#rag.shared.replace_consecutive_spaces#}}'
type: tool
height: 52
id: '1752482151668'
position:
x: 1063.6922916384628
y: 281.3910724383104
positionAbsolute:
x: 1063.6922916384628
y: 281.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
datasource_configurations: {}
datasource_label: Notion数据源
datasource_name: notion_datasource
datasource_parameters: {}
plugin_id: langgenius/notion_datasource
provider_name: notion_datasource
provider_type: online_document
selected: false
title: Notion数据源
type: datasource
height: 52
id: '1752489759475'
position:
x: 736.9082104000458
y: 281.3910724383104
positionAbsolute:
x: 736.9082104000458
y: 281.3910724383104
sourcePosition: right
targetPosition: left
type: custom
width: 242
viewport:
x: -838.569649323166
y: -168.94656489167426
zoom: 1.286925643857699
rag_pipeline_variables:
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: \n\n
label: Delimiter
max_length: 100
options: []
placeholder: null
required: true
tooltips: A delimiter is the character used to separate text. \n\n is recommended
for splitting the original document into paragraph-level chunks. You can also use
special delimiters defined by yourself.
type: text-input
unit: null
variable: delimiter
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Maximum chunk length
max_length: 48
options: []
placeholder: null
required: true
tooltips: null
type: number
unit: characters
variable: max_chunk_length
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Chunk overlap
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: number
unit: characters
variable: chunk_overlap
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Replace consecutive spaces, newlines and tabs
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: replace_consecutive_spaces
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Delete all URLs and email addresses
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: delete_urls_email

View File

@@ -0,0 +1,400 @@
dependencies:
- current_identifier: null
type: marketplace
value:
plugin_unique_identifier: langgenius/general_chunker:0.0.1@e3da408b7277866404c3f884d599261f9d0b9003ea4ef7eb3b64489bdf39d18b
- current_identifier: null
type: marketplace
value:
plugin_unique_identifier: langgenius/notion_datasource:0.0.1@2dd49c2c3ffff976be8d22efb1ac0f63522a8d0f24ef8c44729d0a50a94ec039
kind: rag_pipeline
rag_pipeline:
description: ''
icon: 📙
icon_background: '#FFF4ED'
icon_type: emoji
name: notion-general-high-quality
version: 0.1.0
workflow:
conversation_variables: []
environment_variables: []
features: {}
graph:
edges:
- data:
isInIteration: false
isInLoop: false
sourceType: tool
targetType: knowledge-index
id: 1752482151668-source-1752477924228-target
source: '1752482151668'
sourceHandle: source
target: '1752477924228'
targetHandle: target
type: custom
zIndex: 0
- data:
isInIteration: false
isInLoop: false
sourceType: datasource
targetType: tool
id: 1752489759475-source-1752482151668-target
source: '1752489759475'
sourceHandle: source
target: '1752482151668'
targetHandle: target
type: custom
zIndex: 0
nodes:
- data:
chunk_structure: text_model
embedding_model: text-embedding-ada-002
embedding_model_provider: langgenius/openai/openai
index_chunk_variable_selector:
- '1752482151668'
- result
indexing_technique: high_quality
keyword_number: 10
retrieval_model:
score_threshold: 0.5
score_threshold_enabled: false
search_method: semantic_search
top_k: 3
vector_setting:
embedding_model_name: text-embedding-ada-002
embedding_provider_name: langgenius/openai/openai
selected: true
title: Knowledge Base
type: knowledge-index
height: 114
id: '1752477924228'
position:
x: 1444.5503479271906
y: 281.3910724383104
positionAbsolute:
x: 1444.5503479271906
y: 281.3910724383104
selected: true
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
is_team_authorization: true
output_schema:
properties:
result:
description: The result of the general chunk tool.
properties:
general_chunks:
items:
description: The chunk of the text.
type: string
type: array
type: object
type: object
paramSchemas:
- auto_generate: null
default: null
form: llm
human_description:
en_US: The text you want to chunk.
ja_JP: チャンク化したいテキスト。
pt_BR: O texto que você deseja dividir.
zh_Hans: 你想要分块的文本。
label:
en_US: Input Variable
ja_JP: 入力変数
pt_BR: Variável de entrada
zh_Hans: 输入变量
llm_description: The text you want to chunk.
max: null
min: null
name: input_variable
options: []
placeholder: null
precision: null
required: true
scope: null
template: null
type: string
- auto_generate: null
default: null
form: llm
human_description:
en_US: The delimiter of the chunks.
ja_JP: チャンクの区切り記号。
pt_BR: O delimitador dos pedaços.
zh_Hans: 块的分隔符。
label:
en_US: Delimiter
ja_JP: 区切り記号
pt_BR: Delimitador
zh_Hans: 分隔符
llm_description: The delimiter of the chunks, the format of the delimiter
must be a string.
max: null
min: null
name: delimiter
options: []
placeholder: null
precision: null
required: true
scope: null
template: null
type: string
- auto_generate: null
default: null
form: llm
human_description:
en_US: The maximum chunk length.
ja_JP: 最大長のチャンク。
pt_BR: O comprimento máximo do bloco
zh_Hans: 最大块的长度。
label:
en_US: Maximum Chunk Length
ja_JP: チャンク最大長
pt_BR: O comprimento máximo do bloco
zh_Hans: 最大块的长度
llm_description: The maximum chunk length, the format of the chunk size
must be an integer.
max: null
min: null
name: max_chunk_length
options: []
placeholder: null
precision: null
required: true
scope: null
template: null
type: number
- auto_generate: null
default: null
form: llm
human_description:
en_US: The chunk overlap length.
ja_JP: チャンクの重複長
pt_BR: The chunk overlap length.
zh_Hans: 块的重叠长度。
label:
en_US: Chunk Overlap Length
ja_JP: チャンク重複長
pt_BR: Chunk Overlap Length
zh_Hans: 块的重叠长度
llm_description: The chunk overlap length, the format of the chunk overlap
length must be an integer.
max: null
min: null
name: chunk_overlap_length
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: number
- auto_generate: null
default: null
form: llm
human_description:
en_US: Replace consecutive spaces, newlines and tabs
ja_JP: 連続のスペース、改行、またはタブを置換する
pt_BR: Replace consecutive spaces, newlines and tabs
zh_Hans: 替换连续的空格、换行符和制表符
label:
en_US: Replace Consecutive Spaces, Newlines and Tabs
ja_JP: 連続のスペース、改行、またはタブを置換する
pt_BR: Replace Consecutive Spaces, Newlines and Tabs
zh_Hans: 替换连续的空格、换行符和制表符
llm_description: Replace consecutive spaces, newlines and tabs, the format
of the replace must be a boolean.
max: null
min: null
name: replace_consecutive_spaces_newlines_tabs
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: boolean
- auto_generate: null
default: null
form: llm
human_description:
en_US: Delete all URLs and email addresses
ja_JP: すべてのURLとメールアドレスを削除する
pt_BR: Delete all URLs and email addresses
zh_Hans: 删除所有URL和电子邮件地址
label:
en_US: Delete All URLs and Email Addresses
ja_JP: すべてのURLとメールアドレスを削除する
pt_BR: Delete All URLs and Email Addresses
zh_Hans: 删除所有URL和电子邮件地址
llm_description: Delete all URLs and email addresses, the format of the
delete must be a boolean.
max: null
min: null
name: delete_all_urls_and_email_addresses
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: boolean
params:
chunk_overlap_length: ''
delete_all_urls_and_email_addresses: ''
delimiter: ''
input_variable: ''
max_chunk_length: ''
replace_consecutive_spaces_newlines_tabs: ''
provider_id: langgenius/general_chunker/general_chunker
provider_name: langgenius/general_chunker/general_chunker
provider_type: builtin
selected: false
title: General Chunker
tool_configurations: {}
tool_description: A tool for general text chunking mode, the chunks retrieved and recalled are the same.
tool_label: General Chunker
tool_name: general_chunker
tool_parameters:
chunk_overlap_length:
type: variable
value:
- rag
- shared
- chunk_overlap
delete_all_urls_and_email_addresses:
type: mixed
value: '{{#rag.shared.delete_urls_email#}}'
delimiter:
type: mixed
value: '{{#rag.shared.delimiter#}}'
input_variable:
type: mixed
value: '{{#1752489759475.content#}}'
max_chunk_length:
type: variable
value:
- rag
- shared
- max_chunk_length
replace_consecutive_spaces_newlines_tabs:
type: mixed
value: '{{#rag.shared.replace_consecutive_spaces#}}'
type: tool
height: 52
id: '1752482151668'
position:
x: 1063.6922916384628
y: 281.3910724383104
positionAbsolute:
x: 1063.6922916384628
y: 281.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
datasource_configurations: {}
datasource_label: Notion数据源
datasource_name: notion_datasource
datasource_parameters: {}
plugin_id: langgenius/notion_datasource
provider_name: notion_datasource
provider_type: online_document
selected: false
title: Notion数据源
type: datasource
height: 52
id: '1752489759475'
position:
x: 736.9082104000458
y: 281.3910724383104
positionAbsolute:
x: 736.9082104000458
y: 281.3910724383104
sourcePosition: right
targetPosition: left
type: custom
width: 242
viewport:
x: -838.569649323166
y: -168.94656489167426
zoom: 1.286925643857699
rag_pipeline_variables:
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: \n\n
label: Delimiter
max_length: 100
options: []
placeholder: null
required: true
tooltips: A delimiter is the character used to separate text. \n\n is recommended
for splitting the original document into paragraph-level chunks. You can also use
special delimiters defined by yourself.
type: text-input
unit: null
variable: delimiter
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Maximum chunk length
max_length: 48
options: []
placeholder: null
required: true
tooltips: null
type: number
unit: characters
variable: max_chunk_length
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Chunk overlap
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: number
unit: characters
variable: chunk_overlap
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Replace consecutive spaces, newlines and tabs
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: replace_consecutive_spaces
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Delete all URLs and email addresses
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: delete_urls_email

View File

@@ -0,0 +1,506 @@
dependencies:
- current_identifier: null
type: marketplace
value:
plugin_unique_identifier: langgenius/parentchild_chunker:0.0.1@b1a28a27e33fec442ce494da2a7814edd7eb9d646c81f38bccfcf1133d486e40
- current_identifier: null
type: marketplace
value:
plugin_unique_identifier: langgenius/notion_datasource:0.0.1@2dd49c2c3ffff976be8d22efb1ac0f63522a8d0f24ef8c44729d0a50a94ec039
kind: rag_pipeline
rag_pipeline:
description: ''
icon: 📙
icon_background: ''
icon_type: emoji
name: notion-parentchild
version: 0.1.0
workflow:
conversation_variables: []
environment_variables: []
features: {}
graph:
edges:
- data:
isInIteration: false
isInLoop: false
sourceType: datasource
targetType: tool
id: 1752489759475-source-1752490343805-target
source: '1752489759475'
sourceHandle: source
target: '1752490343805'
targetHandle: target
type: custom
zIndex: 0
- data:
isInLoop: false
sourceType: tool
targetType: knowledge-index
id: 1752490343805-source-1752477924228-target
source: '1752490343805'
sourceHandle: source
target: '1752477924228'
targetHandle: target
type: custom
zIndex: 0
nodes:
- data:
chunk_structure: hierarchical_model
embedding_model: text-embedding-ada-002
embedding_model_provider: langgenius/openai/openai
index_chunk_variable_selector:
- '1752490343805'
- result
indexing_technique: high_quality
keyword_number: 10
retrieval_model:
score_threshold: 0.5
score_threshold_enabled: false
search_method: semantic_search
top_k: 3
vector_setting:
embedding_model_name: text-embedding-ada-002
embedding_provider_name: langgenius/openai/openai
selected: false
title: Knowledge Base
type: knowledge-index
height: 114
id: '1752477924228'
position:
x: 1486.2052698032674
y: 281.3910724383104
positionAbsolute:
x: 1486.2052698032674
y: 281.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
datasource_configurations: {}
datasource_label: Notion数据源
datasource_name: notion_datasource
datasource_parameters: {}
plugin_id: langgenius/notion_datasource
provider_name: notion_datasource
provider_type: online_document
selected: false
title: Notion数据源
type: datasource
height: 52
id: '1752489759475'
position:
x: 736.9082104000458
y: 281.3910724383104
positionAbsolute:
x: 736.9082104000458
y: 281.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
is_team_authorization: true
output_schema:
properties:
result:
description: Parent child chunks result
items:
type: object
type: array
type: object
paramSchemas:
- auto_generate: null
default: null
form: llm
human_description:
en_US: The text you want to chunk.
ja_JP: チャンク化したいテキスト。
pt_BR: O texto que você deseja dividir.
zh_Hans: 你想要分块的文本。
label:
en_US: Input text
ja_JP: 入力テキスト
pt_BR: Texto de entrada
zh_Hans: 输入文本
llm_description: The text you want to chunk.
max: null
min: null
name: input_text
options: []
placeholder: null
precision: null
required: true
scope: null
template: null
type: string
- auto_generate: null
default: 1024
form: llm
human_description:
en_US: Maximum length for chunking
ja_JP: チャンク分割の最大長
pt_BR: Comprimento máximo para divisão
zh_Hans: 用于分块的最大长度
label:
en_US: Maximum Length
ja_JP: 最大長
pt_BR: Comprimento Máximo
zh_Hans: 最大长度
llm_description: Maximum length allowed per chunk
max: null
min: null
name: max_length
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: number
- auto_generate: null
default: '
'
form: llm
human_description:
en_US: Separator used for chunking
ja_JP: チャンク分割に使用する区切り文字
pt_BR: Separador usado para divisão
zh_Hans: 用于分块的分隔符
label:
en_US: Chunk Separator
ja_JP: チャンク区切り文字
pt_BR: Separador de Divisão
zh_Hans: 分块分隔符
llm_description: The separator used to split chunks
max: null
min: null
name: separator
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: string
- auto_generate: null
default: 512
form: llm
human_description:
en_US: Maximum length for subchunking
ja_JP: サブチャンク分割の最大長
pt_BR: Comprimento máximo para subdivisão
zh_Hans: 用于子分块的最大长度
label:
en_US: Subchunk Maximum Length
ja_JP: サブチャンク最大長
pt_BR: Comprimento Máximo de Subdivisão
zh_Hans: 子分块最大长度
llm_description: Maximum length allowed per subchunk
max: null
min: null
name: subchunk_max_length
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: number
- auto_generate: null
default: '. '
form: llm
human_description:
en_US: Separator used for subchunking
ja_JP: サブチャンク分割に使用する区切り文字
pt_BR: Separador usado para subdivisão
zh_Hans: 用于子分块的分隔符
label:
en_US: Subchunk Separator
ja_JP: サブチャンキング用セパレーター
pt_BR: Separador de Subdivisão
zh_Hans: 子分块分隔符
llm_description: The separator used to split subchunks
max: null
min: null
name: subchunk_separator
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: string
- auto_generate: null
default: paragraph
form: llm
human_description:
en_US: Split text into paragraphs based on separator and maximum chunk
length, using split text as parent block or entire document as parent
block and directly retrieve.
ja_JP: セパレーターと最大チャンク長に基づいてテキストを段落に分割し、分割されたテキスト
を親ブロックとして使用するか、文書全体を親ブロックとして使用して直接取得します。
pt_BR: Dividir texto em parágrafos com base no separador e no comprimento
máximo do bloco, usando o texto dividido como bloco pai ou documento
completo como bloco pai e diretamente recuperá-lo.
zh_Hans: 根据分隔符和最大块长度将文本拆分为段落,使用拆分文本作为检索的父块或整个文档用作父块并直接检索。
label:
en_US: Parent Mode
ja_JP: 親子モード
pt_BR: Modo Pai
zh_Hans: 父块模式
llm_description: Split text into paragraphs based on separator and maximum
chunk length, using split text as parent block or entire document as parent
block and directly retrieve.
max: null
min: null
name: parent_mode
options:
- icon: ''
label:
en_US: Paragraph
ja_JP: 段落
pt_BR: Parágrafo
zh_Hans: 段落
value: paragraph
- icon: ''
label:
en_US: Full Document
ja_JP: 全文
pt_BR: Documento Completo
zh_Hans: 全文
value: full_doc
placeholder: null
precision: null
required: true
scope: null
template: null
type: select
- auto_generate: null
default: 0
form: llm
human_description:
en_US: Whether to remove extra spaces in the text
ja_JP: テキスト内の余分なスペースを削除するかどうか
pt_BR: Se deve remover espaços extras no texto
zh_Hans: 是否移除文本中的多余空格
label:
en_US: Remove Extra Spaces
ja_JP: 余分なスペースを削除
pt_BR: Remover Espaços Extras
zh_Hans: 移除多余空格
llm_description: Whether to remove extra spaces in the text
max: null
min: null
name: remove_extra_spaces
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: boolean
- auto_generate: null
default: 0
form: llm
human_description:
en_US: Whether to remove URLs and emails in the text
ja_JP: テキスト内のURLやメールアドレスを削除するかどうか
pt_BR: Se deve remover URLs e e-mails no texto
zh_Hans: 是否移除文本中的URL和电子邮件地址
label:
en_US: Remove URLs and Emails
ja_JP: URLとメールアドレスを削除
pt_BR: Remover URLs e E-mails
zh_Hans: 移除URL和电子邮件地址
llm_description: Whether to remove URLs and emails in the text
max: null
min: null
name: remove_urls_emails
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: boolean
params:
input_text: ''
max_length: ''
parent_mode: ''
remove_extra_spaces: ''
remove_urls_emails: ''
separator: ''
subchunk_max_length: ''
subchunk_separator: ''
provider_id: langgenius/parentchild_chunker/parentchild_chunker
provider_name: langgenius/parentchild_chunker/parentchild_chunker
provider_type: builtin
selected: true
title: Parent-child Chunker
tool_configurations: {}
tool_description: Parent-child Chunk Structure
tool_label: Parent-child Chunker
tool_name: parentchild_chunker
tool_parameters:
input_text:
type: mixed
value: '{{#1752489759475.content#}}'
max_length:
type: variable
value:
- rag
- shared
- max_chunk_length
parent_mode:
type: variable
value:
- rag
- shared
- parent_mode
remove_extra_spaces:
type: mixed
value: '{{#rag.shared.replace_consecutive_spaces#}}'
remove_urls_emails:
type: mixed
value: '{{#rag.shared.delete_urls_email#}}'
separator:
type: mixed
value: '{{#rag.shared.delimiter#}}'
subchunk_max_length:
type: variable
value:
- rag
- shared
- child_max_chunk_length
subchunk_separator:
type: mixed
value: '{{#rag.shared.child_delimiter#}}'
type: tool
height: 52
id: '1752490343805'
position:
x: 1077.0240183162543
y: 281.3910724383104
positionAbsolute:
x: 1077.0240183162543
y: 281.3910724383104
selected: true
sourcePosition: right
targetPosition: left
type: custom
width: 242
viewport:
x: -487.2912544090391
y: -54.7029301848807
zoom: 0.9994011715768695
rag_pipeline_variables:
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: \n\n
label: Delimiter
max_length: 100
options: []
placeholder: null
required: true
tooltips: A delimiter is the character used to separate text. \n\n is recommended
for splitting the original document into large parent chunks. You can also use
special delimiters defined by yourself.
type: text-input
unit: null
variable: delimiter
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: 1024
label: Maximum chunk length
max_length: 48
options: []
placeholder: null
required: true
tooltips: null
type: number
unit: characters
variable: max_chunk_length
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: \n
label: Child delimiter
max_length: 199
options: []
placeholder: null
required: true
    tooltips: A delimiter is the character used to separate text. \n is recommended
      for splitting parent chunks into small child chunks. You can also use special
      delimiters defined by yourself.
type: text-input
unit: null
variable: child_delimiter
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: 512
label: Child max chunk length
max_length: 48
options: []
placeholder: null
required: true
tooltips: null
type: number
unit: characters
variable: child_max_chunk_length
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: paragraph
label: Parent mode
max_length: 48
options:
- full_doc
- paragraph
placeholder: null
required: true
tooltips: null
type: select
unit: null
variable: parent_mode
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Replace consecutive spaces, newlines and tabs
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: replace_consecutive_spaces
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Delete all URLs and email addresses
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: delete_urls_email

View File

@@ -0,0 +1,674 @@
dependencies:
- current_identifier: null
type: marketplace
value:
plugin_unique_identifier: langgenius/general_chunker:0.0.1@e3da408b7277866404c3f884d599261f9d0b9003ea4ef7eb3b64489bdf39d18b
- current_identifier: null
type: marketplace
value:
plugin_unique_identifier: langgenius/firecrawl_datasource:0.0.1@f7aed0a26df0e5f4b9555371b5c9fa6db3c7dcf6a46dd1583245697bd90a539a
- current_identifier: null
type: marketplace
value:
plugin_unique_identifier: langgenius/jina_datasource:0.0.1@cf23afb2c3eeccc5a187763a1947f583f0bb10aa56461e512ac4141bf930d608
kind: rag_pipeline
rag_pipeline:
description: ''
icon: 📙
icon_background: ''
icon_type: emoji
name: website-crawl-general-economy
version: 0.1.0
workflow:
conversation_variables: []
environment_variables: []
features: {}
graph:
edges:
- data:
isInIteration: false
isInLoop: false
sourceType: datasource
targetType: variable-aggregator
id: 1752491761974-source-1752565435219-target
source: '1752491761974'
sourceHandle: source
target: '1752565435219'
targetHandle: target
type: custom
zIndex: 0
- data:
isInLoop: false
sourceType: datasource
targetType: variable-aggregator
id: 1752565402678-source-1752565435219-target
source: '1752565402678'
sourceHandle: source
target: '1752565435219'
targetHandle: target
type: custom
zIndex: 0
- data:
isInIteration: false
isInLoop: false
sourceType: variable-aggregator
targetType: tool
id: 1752565435219-source-1752569675978-target
source: '1752565435219'
sourceHandle: source
target: '1752569675978'
targetHandle: target
type: custom
zIndex: 0
- data:
isInLoop: false
sourceType: tool
targetType: knowledge-index
id: 1752569675978-source-1752477924228-target
source: '1752569675978'
sourceHandle: source
target: '1752477924228'
targetHandle: target
type: custom
zIndex: 0
nodes:
- data:
chunk_structure: text_model
embedding_model: text-embedding-ada-002
embedding_model_provider: langgenius/openai/openai
index_chunk_variable_selector:
- '1752569675978'
- result
indexing_technique: economy
keyword_number: 10
retrieval_model:
score_threshold: 0.5
score_threshold_enabled: false
search_method: keyword_search
top_k: 3
vector_setting:
embedding_model_name: text-embedding-ada-002
embedding_provider_name: langgenius/openai/openai
selected: true
title: Knowledge Base
type: knowledge-index
height: 114
id: '1752477924228'
position:
x: 2140.4053851189346
y: 281.3910724383104
positionAbsolute:
x: 2140.4053851189346
y: 281.3910724383104
selected: true
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
datasource_configurations: {}
datasource_label: Jina Reader
datasource_name: jina_reader
datasource_parameters:
crawl_sub_pages:
type: mixed
value: '{{#rag.1752491761974.jina_crawl_sub_pages#}}'
limit:
type: variable
value:
- rag
- '1752491761974'
- jina_limit
url:
type: mixed
value: '{{#rag.1752491761974.jina_url#}}'
use_sitemap:
type: mixed
value: '{{#rag.1752491761974.jina_use_sitemap#}}'
plugin_id: langgenius/jina_datasource
provider_name: jinareader
provider_type: website_crawl
selected: false
title: Jina Reader
type: datasource
height: 52
id: '1752491761974'
position:
x: 1067.7526055798794
y: 281.3910724383104
positionAbsolute:
x: 1067.7526055798794
y: 281.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
datasource_configurations: {}
datasource_label: Firecrawl
datasource_name: crawl
datasource_parameters:
crawl_subpages:
type: mixed
value: '{{#rag.1752565402678.firecrawl_crawl_sub_pages#}}'
exclude_paths:
type: mixed
value: '{{#rag.1752565402678.firecrawl_exclude_paths#}}'
include_paths:
type: mixed
value: '{{#rag.1752565402678.firecrawl_include_only_paths#}}'
limit:
type: variable
value:
- rag
- '1752565402678'
- firecrawl_limit
max_depth:
type: variable
value:
- rag
- '1752565402678'
- firecrawl_max_depth
only_main_content:
type: mixed
value: '{{#rag.1752565402678.firecrawl_extract_main_content#}}'
url:
type: mixed
value: '{{#rag.1752565402678.firecrawl_url#}}'
plugin_id: langgenius/firecrawl_datasource
provider_name: firecrawl
provider_type: website_crawl
selected: false
title: Firecrawl
type: datasource
height: 52
id: '1752565402678'
position:
x: 1067.7526055798794
y: 417.32608398342404
positionAbsolute:
x: 1067.7526055798794
y: 417.32608398342404
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
output_type: string
selected: false
title: Variable Aggregator
type: variable-aggregator
variables:
- - '1752491761974'
- content
- - '1752565402678'
- content
height: 129
id: '1752565435219'
position:
x: 1505.4306671642219
y: 281.3910724383104
positionAbsolute:
x: 1505.4306671642219
y: 281.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
is_team_authorization: true
output_schema:
properties:
result:
description: The result of the general chunk tool.
properties:
general_chunks:
items:
description: The chunk of the text.
type: string
type: array
type: object
type: object
paramSchemas:
- auto_generate: null
default: null
form: llm
human_description:
en_US: The text you want to chunk.
ja_JP: チャンク化したいテキスト。
pt_BR: O texto que você deseja dividir.
zh_Hans: 你想要分块的文本。
label:
en_US: Input Variable
ja_JP: 入力変数
pt_BR: Variável de entrada
zh_Hans: 输入变量
llm_description: The text you want to chunk.
max: null
min: null
name: input_variable
options: []
placeholder: null
precision: null
required: true
scope: null
template: null
type: string
- auto_generate: null
default: null
form: llm
human_description:
en_US: The delimiter of the chunks.
ja_JP: チャンクの区切り記号。
pt_BR: O delimitador dos pedaços.
zh_Hans: 块的分隔符。
label:
en_US: Delimiter
ja_JP: 区切り記号
pt_BR: Delimitador
zh_Hans: 分隔符
llm_description: The delimiter of the chunks, the format of the delimiter
must be a string.
max: null
min: null
name: delimiter
options: []
placeholder: null
precision: null
required: true
scope: null
template: null
type: string
- auto_generate: null
default: null
form: llm
human_description:
en_US: The maximum chunk length.
ja_JP: 最大長のチャンク。
pt_BR: O comprimento máximo do bloco
zh_Hans: 最大块的长度。
label:
en_US: Maximum Chunk Length
ja_JP: チャンク最大長
pt_BR: O comprimento máximo do bloco
zh_Hans: 最大块的长度
llm_description: The maximum chunk length, the format of the chunk size
must be an integer.
max: null
min: null
name: max_chunk_length
options: []
placeholder: null
precision: null
required: true
scope: null
template: null
type: number
- auto_generate: null
default: null
form: llm
human_description:
en_US: The chunk overlap length.
            ja_JP: チャンクの重複長。
pt_BR: The chunk overlap length.
zh_Hans: 块的重叠长度。
label:
en_US: Chunk Overlap Length
ja_JP: チャンク重複長
pt_BR: Chunk Overlap Length
zh_Hans: 块的重叠长度
llm_description: The chunk overlap length, the format of the chunk overlap
length must be an integer.
max: null
min: null
name: chunk_overlap_length
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: number
- auto_generate: null
default: null
form: llm
human_description:
en_US: Replace consecutive spaces, newlines and tabs
            ja_JP: 連続のスペース、改行、またはタブを置換する
pt_BR: Replace consecutive spaces, newlines and tabs
zh_Hans: 替换连续的空格、换行符和制表符
label:
en_US: Replace Consecutive Spaces, Newlines and Tabs
            ja_JP: 連続のスペース、改行、またはタブを置換する
pt_BR: Replace Consecutive Spaces, Newlines and Tabs
zh_Hans: 替换连续的空格、换行符和制表符
llm_description: Replace consecutive spaces, newlines and tabs, the format
of the replace must be a boolean.
max: null
min: null
name: replace_consecutive_spaces_newlines_tabs
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: boolean
- auto_generate: null
default: null
form: llm
human_description:
en_US: Delete all URLs and email addresses
ja_JP: すべてのURLとメールアドレスを削除する
pt_BR: Delete all URLs and email addresses
zh_Hans: 删除所有URL和电子邮件地址
label:
en_US: Delete All URLs and Email Addresses
ja_JP: すべてのURLとメールアドレスを削除する
pt_BR: Delete All URLs and Email Addresses
zh_Hans: 删除所有URL和电子邮件地址
llm_description: Delete all URLs and email addresses, the format of the
delete must be a boolean.
max: null
min: null
name: delete_all_urls_and_email_addresses
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: boolean
params:
chunk_overlap_length: ''
delete_all_urls_and_email_addresses: ''
delimiter: ''
input_variable: ''
max_chunk_length: ''
replace_consecutive_spaces_newlines_tabs: ''
provider_id: langgenius/general_chunker/general_chunker
provider_name: langgenius/general_chunker/general_chunker
provider_type: builtin
selected: false
title: General Chunker
tool_configurations: {}
tool_description: A tool for general text chunking mode, the chunks retrieved and recalled are the same.
tool_label: General Chunker
tool_name: general_chunker
tool_parameters:
chunk_overlap_length:
type: variable
value:
- rag
- shared
- chunk_overlap
delete_all_urls_and_email_addresses:
type: mixed
value: '{{#rag.shared.delete_urls_email#}}'
delimiter:
type: mixed
value: '{{#rag.shared.delimiter#}}'
input_variable:
type: mixed
value: '{{#1752565435219.output#}}'
max_chunk_length:
type: variable
value:
- rag
- shared
- max_chunk_length
replace_consecutive_spaces_newlines_tabs:
type: mixed
value: '{{#rag.shared.replace_consecutive_spaces#}}'
type: tool
height: 52
id: '1752569675978'
position:
x: 1807.4306671642219
y: 281.3910724383104
positionAbsolute:
x: 1807.4306671642219
y: 281.3910724383104
sourcePosition: right
targetPosition: left
type: custom
width: 242
viewport:
x: -707.721097109337
y: -93.07807382100896
zoom: 0.9350632198875476
rag_pipeline_variables:
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752491761974'
default_value: null
label: URL
max_length: 256
options: []
placeholder: https://docs.dify.ai/en/
required: true
tooltips: null
type: text-input
unit: null
variable: jina_url
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752491761974'
default_value: 10
label: Limit
max_length: 48
options: []
placeholder: null
required: true
tooltips: null
type: number
unit: null
variable: jina_limit
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752491761974'
default_value: null
label: Crawl sub-pages
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: jina_crawl_sub_pages
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752491761974'
default_value: null
label: Use sitemap
max_length: 48
options: []
placeholder: null
required: false
tooltips: Follow the sitemap to crawl the site. If not, Jina Reader will crawl
iteratively based on page relevance, yielding fewer but higher-quality pages.
type: checkbox
unit: null
variable: jina_use_sitemap
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752565402678'
default_value: null
label: URL
max_length: 256
options: []
placeholder: https://docs.dify.ai/en/
required: true
tooltips: null
type: text-input
unit: null
variable: firecrawl_url
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752565402678'
default_value: true
label: Crawl sub-pages
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: firecrawl_crawl_sub_pages
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752565402678'
default_value: 10
label: Limit
max_length: 48
options: []
placeholder: null
required: true
tooltips: null
type: number
unit: null
variable: firecrawl_limit
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752565402678'
default_value: null
label: Max depth
max_length: 48
options: []
placeholder: ''
required: false
tooltips: Maximum depth to crawl relative to the entered URL. Depth 0 just scrapes
the page of the entered url, depth 1 scrapes the url and everything after enteredURL
+ one /, and so on.
type: number
unit: null
variable: firecrawl_max_depth
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752565402678'
default_value: null
label: Exclude paths
max_length: 256
options: []
placeholder: blog/*, /about/*
required: false
tooltips: null
type: text-input
unit: null
variable: firecrawl_exclude_paths
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752565402678'
default_value: null
label: Include only paths
max_length: 256
options: []
placeholder: articles/*
required: false
tooltips: null
type: text-input
unit: null
variable: firecrawl_include_only_paths
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752565402678'
default_value: null
    label: Extract main content only
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: firecrawl_extract_main_content
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: \n\n
label: Delimiter
max_length: 100
options: []
placeholder: null
required: true
tooltips: A delimiter is the character used to separate text. \n\n is recommended
for splitting the original document into large parent chunks. You can also use
special delimiters defined by yourself.
type: text-input
unit: null
variable: delimiter
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: 1024
label: Maximum chunk length
max_length: 48
options: []
placeholder: null
required: true
tooltips: null
type: number
unit: characters
variable: max_chunk_length
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: 50
    label: Chunk overlap
max_length: 48
options: []
placeholder: null
required: false
tooltips: Setting the chunk overlap can maintain the semantic relevance between
      them, enhancing the retrieval effect. It is recommended to set 10%–25% of the
maximum chunk size.
type: number
unit: characters
variable: chunk_overlap
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
    label: Replace consecutive spaces, newlines and tabs
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: replace_consecutive_spaces
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Delete all URLs and email addresses
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: delete_urls_email

View File

@@ -0,0 +1,674 @@
dependencies:
- current_identifier: null
type: marketplace
value:
plugin_unique_identifier: langgenius/general_chunker:0.0.1@e3da408b7277866404c3f884d599261f9d0b9003ea4ef7eb3b64489bdf39d18b
- current_identifier: null
type: marketplace
value:
plugin_unique_identifier: langgenius/firecrawl_datasource:0.0.1@f7aed0a26df0e5f4b9555371b5c9fa6db3c7dcf6a46dd1583245697bd90a539a
- current_identifier: null
type: marketplace
value:
plugin_unique_identifier: langgenius/jina_datasource:0.0.1@cf23afb2c3eeccc5a187763a1947f583f0bb10aa56461e512ac4141bf930d608
kind: rag_pipeline
rag_pipeline:
description: ''
icon: 📙
icon_background: '#FFF4ED'
icon_type: emoji
name: website-crawl-general-high-quality
version: 0.1.0
workflow:
conversation_variables: []
environment_variables: []
features: {}
graph:
edges:
- data:
isInIteration: false
isInLoop: false
sourceType: datasource
targetType: variable-aggregator
id: 1752491761974-source-1752565435219-target
source: '1752491761974'
sourceHandle: source
target: '1752565435219'
targetHandle: target
type: custom
zIndex: 0
- data:
isInLoop: false
sourceType: datasource
targetType: variable-aggregator
id: 1752565402678-source-1752565435219-target
source: '1752565402678'
sourceHandle: source
target: '1752565435219'
targetHandle: target
type: custom
zIndex: 0
- data:
isInIteration: false
isInLoop: false
sourceType: variable-aggregator
targetType: tool
id: 1752565435219-source-1752569675978-target
source: '1752565435219'
sourceHandle: source
target: '1752569675978'
targetHandle: target
type: custom
zIndex: 0
- data:
isInLoop: false
sourceType: tool
targetType: knowledge-index
id: 1752569675978-source-1752477924228-target
source: '1752569675978'
sourceHandle: source
target: '1752477924228'
targetHandle: target
type: custom
zIndex: 0
nodes:
- data:
chunk_structure: text_model
embedding_model: text-embedding-ada-002
embedding_model_provider: langgenius/openai/openai
index_chunk_variable_selector:
- '1752569675978'
- result
indexing_technique: high_quality
keyword_number: 10
retrieval_model:
score_threshold: 0.5
score_threshold_enabled: false
search_method: semantic_search
top_k: 3
vector_setting:
embedding_model_name: text-embedding-ada-002
embedding_provider_name: langgenius/openai/openai
selected: false
title: Knowledge Base
type: knowledge-index
height: 114
id: '1752477924228'
position:
x: 2140.4053851189346
y: 281.3910724383104
positionAbsolute:
x: 2140.4053851189346
y: 281.3910724383104
selected: true
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
datasource_configurations: {}
datasource_label: Jina Reader
datasource_name: jina_reader
datasource_parameters:
crawl_sub_pages:
type: mixed
value: '{{#rag.1752491761974.jina_crawl_sub_pages#}}'
limit:
type: variable
value:
- rag
- '1752491761974'
- jina_limit
url:
type: mixed
value: '{{#rag.1752491761974.jina_url#}}'
use_sitemap:
type: mixed
value: '{{#rag.1752491761974.jina_use_sitemap#}}'
plugin_id: langgenius/jina_datasource
provider_name: jinareader
provider_type: website_crawl
selected: false
title: Jina Reader
type: datasource
height: 52
id: '1752491761974'
position:
x: 1067.7526055798794
y: 281.3910724383104
positionAbsolute:
x: 1067.7526055798794
y: 281.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
datasource_configurations: {}
datasource_label: Firecrawl
datasource_name: crawl
datasource_parameters:
crawl_subpages:
type: mixed
value: '{{#rag.1752565402678.firecrawl_crawl_sub_pages#}}'
exclude_paths:
type: mixed
value: '{{#rag.1752565402678.firecrawl_exclude_paths#}}'
include_paths:
type: mixed
value: '{{#rag.1752565402678.firecrawl_include_only_paths#}}'
limit:
type: variable
value:
- rag
- '1752565402678'
- firecrawl_limit
max_depth:
type: variable
value:
- rag
- '1752565402678'
- firecrawl_max_depth
only_main_content:
type: mixed
value: '{{#rag.1752565402678.firecrawl_extract_main_content#}}'
url:
type: mixed
value: '{{#rag.1752565402678.firecrawl_url#}}'
plugin_id: langgenius/firecrawl_datasource
provider_name: firecrawl
provider_type: website_crawl
selected: false
title: Firecrawl
type: datasource
height: 52
id: '1752565402678'
position:
x: 1067.7526055798794
y: 417.32608398342404
positionAbsolute:
x: 1067.7526055798794
y: 417.32608398342404
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
output_type: string
selected: false
title: Variable Aggregator
type: variable-aggregator
variables:
- - '1752491761974'
- content
- - '1752565402678'
- content
height: 129
id: '1752565435219'
position:
x: 1505.4306671642219
y: 281.3910724383104
positionAbsolute:
x: 1505.4306671642219
y: 281.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
is_team_authorization: true
output_schema:
properties:
result:
description: The result of the general chunk tool.
properties:
general_chunks:
items:
description: The chunk of the text.
type: string
type: array
type: object
type: object
paramSchemas:
- auto_generate: null
default: null
form: llm
human_description:
en_US: The text you want to chunk.
ja_JP: チャンク化したいテキスト。
pt_BR: O texto que você deseja dividir.
zh_Hans: 你想要分块的文本。
label:
en_US: Input Variable
ja_JP: 入力変数
pt_BR: Variável de entrada
zh_Hans: 输入变量
llm_description: The text you want to chunk.
max: null
min: null
name: input_variable
options: []
placeholder: null
precision: null
required: true
scope: null
template: null
type: string
- auto_generate: null
default: null
form: llm
human_description:
en_US: The delimiter of the chunks.
ja_JP: チャンクの区切り記号。
pt_BR: O delimitador dos pedaços.
zh_Hans: 块的分隔符。
label:
en_US: Delimiter
ja_JP: 区切り記号
pt_BR: Delimitador
zh_Hans: 分隔符
llm_description: The delimiter of the chunks, the format of the delimiter
must be a string.
max: null
min: null
name: delimiter
options: []
placeholder: null
precision: null
required: true
scope: null
template: null
type: string
- auto_generate: null
default: null
form: llm
human_description:
en_US: The maximum chunk length.
ja_JP: 最大長のチャンク。
pt_BR: O comprimento máximo do bloco
zh_Hans: 最大块的长度。
label:
en_US: Maximum Chunk Length
ja_JP: チャンク最大長
pt_BR: O comprimento máximo do bloco
zh_Hans: 最大块的长度
llm_description: The maximum chunk length, the format of the chunk size
must be an integer.
max: null
min: null
name: max_chunk_length
options: []
placeholder: null
precision: null
required: true
scope: null
template: null
type: number
- auto_generate: null
default: null
form: llm
human_description:
en_US: The chunk overlap length.
ja_JP: チャンクの重複長。
pt_BR: The chunk overlap length.
zh_Hans: 块的重叠长度。
label:
en_US: Chunk Overlap Length
ja_JP: チャンク重複長
pt_BR: Chunk Overlap Length
zh_Hans: 块的重叠长度
llm_description: The chunk overlap length, the format of the chunk overlap
length must be an integer.
max: null
min: null
name: chunk_overlap_length
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: number
- auto_generate: null
default: null
form: llm
human_description:
en_US: Replace consecutive spaces, newlines and tabs
            ja_JP: 連続のスペース、改行、またはタブを置換する
pt_BR: Replace consecutive spaces, newlines and tabs
zh_Hans: 替换连续的空格、换行符和制表符
label:
en_US: Replace Consecutive Spaces, Newlines and Tabs
            ja_JP: 連続のスペース、改行、またはタブを置換する
pt_BR: Replace Consecutive Spaces, Newlines and Tabs
zh_Hans: 替换连续的空格、换行符和制表符
llm_description: Replace consecutive spaces, newlines and tabs, the format
of the replace must be a boolean.
max: null
min: null
name: replace_consecutive_spaces_newlines_tabs
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: boolean
- auto_generate: null
default: null
form: llm
human_description:
en_US: Delete all URLs and email addresses
ja_JP: すべてのURLとメールアドレスを削除する
pt_BR: Delete all URLs and email addresses
zh_Hans: 删除所有URL和电子邮件地址
label:
en_US: Delete All URLs and Email Addresses
ja_JP: すべてのURLとメールアドレスを削除する
pt_BR: Delete All URLs and Email Addresses
zh_Hans: 删除所有URL和电子邮件地址
llm_description: Delete all URLs and email addresses, the format of the
delete must be a boolean.
max: null
min: null
name: delete_all_urls_and_email_addresses
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: boolean
params:
chunk_overlap_length: ''
delete_all_urls_and_email_addresses: ''
delimiter: ''
input_variable: ''
max_chunk_length: ''
replace_consecutive_spaces_newlines_tabs: ''
provider_id: langgenius/general_chunker/general_chunker
provider_name: langgenius/general_chunker/general_chunker
provider_type: builtin
selected: false
title: General Chunker
tool_configurations: {}
tool_description: A tool for general text chunking mode, the chunks retrieved and recalled are the same.
tool_label: General Chunker
tool_name: general_chunker
tool_parameters:
chunk_overlap_length:
type: variable
value:
- rag
- shared
- chunk_overlap
delete_all_urls_and_email_addresses:
type: mixed
value: '{{#rag.shared.delete_urls_email#}}'
delimiter:
type: mixed
value: '{{#rag.shared.delimiter#}}'
input_variable:
type: mixed
value: '{{#1752565435219.output#}}'
max_chunk_length:
type: variable
value:
- rag
- shared
- max_chunk_length
replace_consecutive_spaces_newlines_tabs:
type: mixed
value: '{{#rag.shared.replace_consecutive_spaces#}}'
type: tool
height: 52
id: '1752569675978'
position:
x: 1807.4306671642219
y: 281.3910724383104
positionAbsolute:
x: 1807.4306671642219
y: 281.3910724383104
sourcePosition: right
targetPosition: left
type: custom
width: 242
viewport:
x: -707.721097109337
y: -93.07807382100896
zoom: 0.9350632198875476
rag_pipeline_variables:
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752491761974'
default_value: null
label: URL
max_length: 256
options: []
placeholder: https://docs.dify.ai/en/
required: true
tooltips: null
type: text-input
unit: null
variable: jina_url
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752491761974'
default_value: 10
label: Limit
max_length: 48
options: []
placeholder: null
required: true
tooltips: null
type: number
unit: null
variable: jina_limit
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752491761974'
default_value: null
label: Crawl sub-pages
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: jina_crawl_sub_pages
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752491761974'
default_value: null
label: Use sitemap
max_length: 48
options: []
placeholder: null
required: false
tooltips: Follow the sitemap to crawl the site. If not, Jina Reader will crawl
iteratively based on page relevance, yielding fewer but higher-quality pages.
type: checkbox
unit: null
variable: jina_use_sitemap
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752565402678'
default_value: null
label: URL
max_length: 256
options: []
placeholder: https://docs.dify.ai/en/
required: true
tooltips: null
type: text-input
unit: null
variable: firecrawl_url
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752565402678'
default_value: true
label: Crawl sub-pages
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: firecrawl_crawl_sub_pages
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752565402678'
default_value: 10
label: Limit
max_length: 48
options: []
placeholder: null
required: true
tooltips: null
type: number
unit: null
variable: firecrawl_limit
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752565402678'
default_value: null
label: Max depth
max_length: 48
options: []
placeholder: ''
required: false
tooltips: Maximum depth to crawl relative to the entered URL. Depth 0 just scrapes
the page of the entered URL, depth 1 scrapes the URL and everything after the entered
URL + one /, and so on.
type: number
unit: null
variable: firecrawl_max_depth
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752565402678'
default_value: null
label: Exclude paths
max_length: 256
options: []
placeholder: blog/*, /about/*
required: false
tooltips: null
type: text-input
unit: null
variable: firecrawl_exclude_paths
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752565402678'
default_value: null
label: Include only paths
max_length: 256
options: []
placeholder: articles/*
required: false
tooltips: null
type: text-input
unit: null
variable: firecrawl_include_only_paths
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752565402678'
default_value: null
label: Extract only main content
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: firecrawl_extract_main_content
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: \n\n
label: Delimiter
max_length: 100
options: []
placeholder: null
required: true
tooltips: A delimiter is the character used to separate text. \n\n is recommended
for splitting the original document into large parent chunks. You can also use
special delimiters defined by yourself.
type: text-input
unit: null
variable: delimiter
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: 1024
label: Maximum chunk length
max_length: 48
options: []
placeholder: null
required: true
tooltips: null
type: number
unit: characters
variable: max_chunk_length
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: 50
label: Chunk overlap
max_length: 48
options: []
placeholder: null
required: false
tooltips: Setting the chunk overlap can maintain the semantic relevance between
chunks, enhancing the retrieval effect. It is recommended to set 10%-25% of the
maximum chunk size.
type: number
unit: characters
variable: chunk_overlap
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Replace consecutive spaces, newlines and tabs
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: replace_consecutive_spaces
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Delete all URLs and email addresses
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: delete_urls_email

View File

@@ -0,0 +1,779 @@
dependencies:
- current_identifier: null
type: marketplace
value:
plugin_unique_identifier: langgenius/parentchild_chunker:0.0.1@b1a28a27e33fec442ce494da2a7814edd7eb9d646c81f38bccfcf1133d486e40
- current_identifier: null
type: marketplace
value:
plugin_unique_identifier: langgenius/firecrawl_datasource:0.0.1@f7aed0a26df0e5f4b9555371b5c9fa6db3c7dcf6a46dd1583245697bd90a539a
- current_identifier: null
type: marketplace
value:
plugin_unique_identifier: langgenius/jina_datasource:0.0.1@cf23afb2c3eeccc5a187763a1947f583f0bb10aa56461e512ac4141bf930d608
kind: rag_pipeline
rag_pipeline:
description: ''
icon: 📙
icon_background: ''
icon_type: emoji
name: website-crawl-parentchild
version: 0.1.0
workflow:
conversation_variables: []
environment_variables: []
features: {}
graph:
edges:
- data:
isInLoop: false
sourceType: tool
targetType: knowledge-index
id: 1752490343805-source-1752477924228-target
source: '1752490343805'
sourceHandle: source
target: '1752477924228'
targetHandle: target
type: custom
zIndex: 0
- data:
isInIteration: false
isInLoop: false
sourceType: datasource
targetType: variable-aggregator
id: 1752491761974-source-1752565435219-target
source: '1752491761974'
sourceHandle: source
target: '1752565435219'
targetHandle: target
type: custom
zIndex: 0
- data:
isInIteration: false
isInLoop: false
sourceType: variable-aggregator
targetType: tool
id: 1752565435219-source-1752490343805-target
source: '1752565435219'
sourceHandle: source
target: '1752490343805'
targetHandle: target
type: custom
zIndex: 0
- data:
isInLoop: false
sourceType: datasource
targetType: variable-aggregator
id: 1752565402678-source-1752565435219-target
source: '1752565402678'
sourceHandle: source
target: '1752565435219'
targetHandle: target
type: custom
zIndex: 0
nodes:
- data:
chunk_structure: hierarchical_model
embedding_model: text-embedding-ada-002
embedding_model_provider: langgenius/openai/openai
index_chunk_variable_selector:
- '1752490343805'
- result
indexing_technique: high_quality
keyword_number: 10
retrieval_model:
score_threshold: 0.5
score_threshold_enabled: false
search_method: semantic_search
top_k: 3
vector_setting:
embedding_model_name: text-embedding-ada-002
embedding_provider_name: langgenius/openai/openai
selected: false
title: Knowledge Base
type: knowledge-index
height: 114
id: '1752477924228'
position:
x: 2215.5544306817387
y: 281.3910724383104
positionAbsolute:
x: 2215.5544306817387
y: 281.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
is_team_authorization: true
output_schema:
properties:
result:
description: Parent child chunks result
items:
type: object
type: array
type: object
paramSchemas:
- auto_generate: null
default: null
form: llm
human_description:
en_US: The text you want to chunk.
ja_JP: チャンク化したいテキスト。
pt_BR: O texto que você deseja dividir.
zh_Hans: 你想要分块的文本。
label:
en_US: Input text
ja_JP: 入力テキスト
pt_BR: Texto de entrada
zh_Hans: 输入文本
llm_description: The text you want to chunk.
max: null
min: null
name: input_text
options: []
placeholder: null
precision: null
required: true
scope: null
template: null
type: string
- auto_generate: null
default: 1024
form: llm
human_description:
en_US: Maximum length for chunking
ja_JP: チャンク分割の最大長
pt_BR: Comprimento máximo para divisão
zh_Hans: 用于分块的最大长度
label:
en_US: Maximum Length
ja_JP: 最大長
pt_BR: Comprimento Máximo
zh_Hans: 最大长度
llm_description: Maximum length allowed per chunk
max: null
min: null
name: max_length
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: number
- auto_generate: null
default: '
'
form: llm
human_description:
en_US: Separator used for chunking
ja_JP: チャンク分割に使用する区切り文字
pt_BR: Separador usado para divisão
zh_Hans: 用于分块的分隔符
label:
en_US: Chunk Separator
ja_JP: チャンク区切り文字
pt_BR: Separador de Divisão
zh_Hans: 分块分隔符
llm_description: The separator used to split chunks
max: null
min: null
name: separator
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: string
- auto_generate: null
default: 512
form: llm
human_description:
en_US: Maximum length for subchunking
ja_JP: サブチャンク分割の最大長
pt_BR: Comprimento máximo para subdivisão
zh_Hans: 用于子分块的最大长度
label:
en_US: Subchunk Maximum Length
ja_JP: サブチャンク最大長
pt_BR: Comprimento Máximo de Subdivisão
zh_Hans: 子分块最大长度
llm_description: Maximum length allowed per subchunk
max: null
min: null
name: subchunk_max_length
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: number
- auto_generate: null
default: '. '
form: llm
human_description:
en_US: Separator used for subchunking
ja_JP: サブチャンク分割に使用する区切り文字
pt_BR: Separador usado para subdivisão
zh_Hans: 用于子分块的分隔符
label:
en_US: Subchunk Separator
ja_JP: サブチャンキング用セパレーター
pt_BR: Separador de Subdivisão
zh_Hans: 子分块分隔符
llm_description: The separator used to split subchunks
max: null
min: null
name: subchunk_separator
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: string
- auto_generate: null
default: paragraph
form: llm
human_description:
en_US: Split text into paragraphs based on separator and maximum chunk
length, using split text as parent block or entire document as parent
block and directly retrieve.
ja_JP: セパレーターと最大チャンク長に基づいてテキストを段落に分割し、分割されたテキスト
を親ブロックとして使用するか、文書全体を親ブロックとして使用して直接取得します。
pt_BR: Dividir texto em parágrafos com base no separador e no comprimento
máximo do bloco, usando o texto dividido como bloco pai ou documento
completo como bloco pai e diretamente recuperá-lo.
zh_Hans: 根据分隔符和最大块长度将文本拆分为段落,使用拆分文本作为检索的父块或整个文档用作父块并直接检索。
label:
en_US: Parent Mode
ja_JP: 親子モード
pt_BR: Modo Pai
zh_Hans: 父块模式
llm_description: Split text into paragraphs based on separator and maximum
chunk length, using split text as parent block or entire document as parent
block and directly retrieve.
max: null
min: null
name: parent_mode
options:
- icon: ''
label:
en_US: Paragraph
ja_JP: 段落
pt_BR: Parágrafo
zh_Hans: 段落
value: paragraph
- icon: ''
label:
en_US: Full Document
ja_JP: 全文
pt_BR: Documento Completo
zh_Hans: 全文
value: full_doc
placeholder: null
precision: null
required: true
scope: null
template: null
type: select
- auto_generate: null
default: 0
form: llm
human_description:
en_US: Whether to remove extra spaces in the text
ja_JP: テキスト内の余分なスペースを削除するかどうか
pt_BR: Se deve remover espaços extras no texto
zh_Hans: 是否移除文本中的多余空格
label:
en_US: Remove Extra Spaces
ja_JP: 余分なスペースを削除
pt_BR: Remover Espaços Extras
zh_Hans: 移除多余空格
llm_description: Whether to remove extra spaces in the text
max: null
min: null
name: remove_extra_spaces
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: boolean
- auto_generate: null
default: 0
form: llm
human_description:
en_US: Whether to remove URLs and emails in the text
ja_JP: テキスト内のURLやメールアドレスを削除するかどうか
pt_BR: Se deve remover URLs e e-mails no texto
zh_Hans: 是否移除文本中的URL和电子邮件地址
label:
en_US: Remove URLs and Emails
ja_JP: URLとメールアドレスを削除
pt_BR: Remover URLs e E-mails
zh_Hans: 移除URL和电子邮件地址
llm_description: Whether to remove URLs and emails in the text
max: null
min: null
name: remove_urls_emails
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: boolean
params:
input_text: ''
max_length: ''
parent_mode: ''
remove_extra_spaces: ''
remove_urls_emails: ''
separator: ''
subchunk_max_length: ''
subchunk_separator: ''
provider_id: langgenius/parentchild_chunker/parentchild_chunker
provider_name: langgenius/parentchild_chunker/parentchild_chunker
provider_type: builtin
selected: true
title: Parent-child Chunker
tool_configurations: {}
tool_description: Parent-child Chunk Structure
tool_label: Parent-child Chunker
tool_name: parentchild_chunker
tool_parameters:
input_text:
type: mixed
value: '{{#1752565435219.output#}}'
max_length:
type: variable
value:
- rag
- shared
- max_chunk_length
parent_mode:
type: variable
value:
- rag
- shared
- parent_mode
remove_extra_spaces:
type: mixed
value: '{{#rag.shared.replace_consecutive_spaces#}}'
remove_urls_emails:
type: mixed
value: '{{#rag.shared.delete_urls_email#}}'
separator:
type: mixed
value: '{{#rag.shared.delimiter#}}'
subchunk_max_length:
type: variable
value:
- rag
- shared
- child_max_chunk_length
subchunk_separator:
type: mixed
value: '{{#rag.shared.child_delimiter#}}'
type: tool
height: 52
id: '1752490343805'
position:
x: 1853.5260563244174
y: 281.3910724383104
positionAbsolute:
x: 1853.5260563244174
y: 281.3910724383104
selected: true
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
datasource_configurations: {}
datasource_label: Jina Reader
datasource_name: jina_reader
datasource_parameters:
crawl_sub_pages:
type: mixed
value: '{{#rag.1752491761974.jina_crawl_sub_pages#}}'
limit:
type: variable
value:
- rag
- '1752491761974'
- jina_limit
url:
type: mixed
value: '{{#rag.1752491761974.jina_url#}}'
use_sitemap:
type: mixed
value: '{{#rag.1752491761974.jina_use_sitemap#}}'
plugin_id: langgenius/jina_datasource
provider_name: jinareader
provider_type: website_crawl
selected: false
title: Jina Reader
type: datasource
height: 52
id: '1752491761974'
position:
x: 1067.7526055798794
y: 281.3910724383104
positionAbsolute:
x: 1067.7526055798794
y: 281.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
datasource_configurations: {}
datasource_label: Firecrawl
datasource_name: crawl
datasource_parameters:
crawl_subpages:
type: mixed
value: '{{#rag.1752565402678.firecrawl_crawl_sub_pages#}}'
exclude_paths:
type: mixed
value: '{{#rag.1752565402678.firecrawl_exclude_paths#}}'
include_paths:
type: mixed
value: '{{#rag.1752565402678.firecrawl_include_only_paths#}}'
limit:
type: variable
value:
- rag
- '1752565402678'
- firecrawl_limit
max_depth:
type: variable
value:
- rag
- '1752565402678'
- firecrawl_max_depth
only_main_content:
type: mixed
value: '{{#rag.1752565402678.firecrawl_extract_main_content#}}'
url:
type: mixed
value: '{{#rag.1752565402678.firecrawl_url#}}'
plugin_id: langgenius/firecrawl_datasource
provider_name: firecrawl
provider_type: website_crawl
selected: false
title: Firecrawl
type: datasource
height: 52
id: '1752565402678'
position:
x: 1067.7526055798794
y: 417.32608398342404
positionAbsolute:
x: 1067.7526055798794
y: 417.32608398342404
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
output_type: string
selected: false
title: Variable Aggregator
type: variable-aggregator
variables:
- - '1752491761974'
- content
- - '1752565402678'
- content
height: 129
id: '1752565435219'
position:
x: 1505.4306671642219
y: 281.3910724383104
positionAbsolute:
x: 1505.4306671642219
y: 281.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
viewport:
x: -826.1791044466438
y: -71.91725474841303
zoom: 0.9980166672552107
rag_pipeline_variables:
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752491761974'
default_value: null
label: URL
max_length: 256
options: []
placeholder: https://docs.dify.ai/en/
required: true
tooltips: null
type: text-input
unit: null
variable: jina_url
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752491761974'
default_value: 10
label: Limit
max_length: 48
options: []
placeholder: null
required: true
tooltips: null
type: number
unit: null
variable: jina_limit
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752491761974'
default_value: null
label: Crawl sub-pages
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: jina_crawl_sub_pages
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752491761974'
default_value: null
label: Use sitemap
max_length: 48
options: []
placeholder: null
required: false
tooltips: Follow the sitemap to crawl the site. If not, Jina Reader will crawl
iteratively based on page relevance, yielding fewer but higher-quality pages.
type: checkbox
unit: null
variable: jina_use_sitemap
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752565402678'
default_value: null
label: URL
max_length: 256
options: []
placeholder: https://docs.dify.ai/en/
required: true
tooltips: null
type: text-input
unit: null
variable: firecrawl_url
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752565402678'
default_value: true
label: Crawl sub-pages
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: firecrawl_crawl_sub_pages
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752565402678'
default_value: 10
label: Limit
max_length: 48
options: []
placeholder: null
required: true
tooltips: null
type: number
unit: null
variable: firecrawl_limit
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752565402678'
default_value: null
label: Max depth
max_length: 48
options: []
placeholder: ''
required: false
tooltips: Maximum depth to crawl relative to the entered URL. Depth 0 just scrapes
the page of the entered URL, depth 1 scrapes the URL and everything after the entered
URL + one /, and so on.
type: number
unit: null
variable: firecrawl_max_depth
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752565402678'
default_value: null
label: Exclude paths
max_length: 256
options: []
placeholder: blog/*, /about/*
required: false
tooltips: null
type: text-input
unit: null
variable: firecrawl_exclude_paths
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752565402678'
default_value: null
label: Include only paths
max_length: 256
options: []
placeholder: articles/*
required: false
tooltips: null
type: text-input
unit: null
variable: firecrawl_include_only_paths
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752565402678'
default_value: null
label: Extract only main content
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: firecrawl_extract_main_content
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: \n\n
label: Delimiter
max_length: 100
options: []
placeholder: null
required: true
tooltips: A delimiter is the character used to separate text. \n\n is recommended
for splitting the original document into large parent chunks. You can also use
special delimiters defined by yourself.
type: text-input
unit: null
variable: delimiter
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: 1024
label: Maximum chunk length
max_length: 48
options: []
placeholder: null
required: true
tooltips: null
type: number
unit: characters
variable: max_chunk_length
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: \n
label: Child delimiter
max_length: 199
options: []
placeholder: null
required: true
tooltips: A delimiter is the character used to separate text. \n is recommended
for splitting parent chunks into small child chunks. You can also use special
delimiters defined by yourself.
type: text-input
unit: null
variable: child_delimiter
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: 512
label: Child max chunk length
max_length: 48
options: []
placeholder: null
required: true
tooltips: null
type: number
unit: characters
variable: child_max_chunk_length
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: paragraph
label: Parent mode
max_length: 48
options:
- full_doc
- paragraph
placeholder: null
required: true
tooltips: null
type: select
unit: null
variable: parent_mode
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Replace consecutive spaces, newlines and tabs
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: replace_consecutive_spaces
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Delete all URLs and email addresses
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: delete_urls_email