dify

2025-12-01 17:21:38 +08:00
parent 32fee2b8ab
commit fab8c13cb3
7511 changed files with 996300 additions and 0 deletions
--- a/dify/api/services/rag_pipeline/entity/pipeline_service_api_entities.py
+++ b/dify/api/services/rag_pipeline/entity/pipeline_service_api_entities.py
@@ -0,0 +1,22 @@
+from collections.abc import Mapping
+from typing import Any
+
+from pydantic import BaseModel
+
+
+class DatasourceNodeRunApiEntity(BaseModel):
+    """Pydantic model holding the parameters for running one datasource node."""
+
+    # NOTE(review): presumably the ID of the pipeline that owns the node — confirm against caller.
+    pipeline_id: str
+    # NOTE(review): presumably the ID of the datasource node inside the pipeline graph — confirm.
+    node_id: str
+    # Free-form inputs for the node, keyed by string.
+    inputs: dict[str, Any]
+    datasource_type: str
+    # Optional; defaults to None when the caller supplies no credential.
+    credential_id: str | None = None
+    # Required (no default). NOTE(review): presumably selects the published vs. draft workflow — confirm.
+    is_published: bool
+
+
+class PipelineRunApiEntity(BaseModel):
+    """Pydantic model holding the parameters of a pipeline run request."""
+
+    # Free-form run inputs, keyed by string.
+    inputs: Mapping[str, Any]
+    datasource_type: str
+    # One mapping per datasource item; the schema of each mapping is not defined here.
+    datasource_info_list: list[Mapping[str, Any]]
+    # NOTE(review): presumably the node the run starts from — confirm against caller.
+    start_node_id: str
+    # Required (no default). NOTE(review): presumably selects the published vs. draft workflow — confirm.
+    is_published: bool
+    # NOTE(review): accepted values are not constrained here (plain str) — confirm the allowed modes.
+    response_mode: str
--- a/dify/api/services/rag_pipeline/pipeline_generate_service.py
+++ b/dify/api/services/rag_pipeline/pipeline_generate_service.py
@@ -0,0 +1,115 @@
+from collections.abc import Mapping
+from typing import Any, Union
+
+from configs import dify_config
+from core.app.apps.pipeline.pipeline_generator import PipelineGenerator
+from core.app.entities.app_invoke_entities import InvokeFrom
+from extensions.ext_database import db
+from models.dataset import Document, Pipeline
+from models.model import Account, App, EndUser
+from models.workflow import Workflow
+from services.rag_pipeline.rag_pipeline import RagPipelineService
+
+
+class PipelineGenerateService:
+    """Service helpers that run a RAG pipeline workflow through ``PipelineGenerator``."""
+
+    @classmethod
+    def generate(
+        cls,
+        pipeline: Pipeline,
+        user: Union[Account, EndUser],
+        args: Mapping[str, Any],
+        invoke_from: InvokeFrom,
+        streaming: bool = True,
+    ):
+        """
+        Run the pipeline workflow and return the generator output as an event stream.
+
+        The draft workflow is used for ``InvokeFrom.DEBUGGER``; any other origin uses the
+        published workflow. When ``args`` carries a truthy ``original_document_id``, that
+        document is set to ``waiting`` before generation starts. Exceptions propagate to
+        the caller unchanged.
+
+        :param pipeline: pipeline to run
+        :param user: account or end user that triggers the run
+        :param args: run arguments, forwarded unchanged to ``PipelineGenerator.generate``
+        :param invoke_from: origin of the invocation; selects draft vs. published workflow
+        :param streaming: forwarded to the generator as its ``streaming`` flag
+        :return: the value returned by ``PipelineGenerator.convert_to_event_stream``
+        :raises ValueError: if the required workflow does not exist
+        """
+        workflow = cls._get_workflow(pipeline, invoke_from)
+        if original_document_id := args.get("original_document_id"):
+            # update document status to waiting
+            cls.update_document_status(original_document_id)
+        return PipelineGenerator.convert_to_event_stream(
+            PipelineGenerator().generate(
+                pipeline=pipeline,
+                workflow=workflow,
+                user=user,
+                args=args,
+                invoke_from=invoke_from,
+                streaming=streaming,
+                call_depth=0,
+                workflow_thread_pool_id=None,
+            ),
+        )
+
+    @staticmethod
+    def _get_max_active_requests(app_model: App) -> int:
+        """
+        Return the app's ``max_active_requests``, falling back to the global setting.
+
+        :param app_model: app whose limit is read
+        :return: ``app_model.max_active_requests`` or, when that is ``None``,
+            ``int(dify_config.APP_MAX_ACTIVE_REQUESTS)``
+        """
+        max_active_requests = app_model.max_active_requests
+        if max_active_requests is None:
+            max_active_requests = int(dify_config.APP_MAX_ACTIVE_REQUESTS)
+        return max_active_requests
+
+    @classmethod
+    def generate_single_iteration(
+        cls, pipeline: Pipeline, user: Account, node_id: str, args: Any, streaming: bool = True
+    ):
+        """
+        Run a single iteration node of the draft workflow and return an event stream.
+
+        :param pipeline: pipeline whose draft workflow is used
+        :param user: account that triggers the run
+        :param node_id: ID of the node to run
+        :param args: run arguments, forwarded unchanged to the generator
+        :param streaming: forwarded to the generator as its ``streaming`` flag
+        :return: the value returned by ``PipelineGenerator.convert_to_event_stream``
+        :raises ValueError: if the pipeline has no draft workflow
+        """
+        workflow = cls._get_workflow(pipeline, InvokeFrom.DEBUGGER)
+        return PipelineGenerator.convert_to_event_stream(
+            PipelineGenerator().single_iteration_generate(
+                pipeline=pipeline, workflow=workflow, node_id=node_id, user=user, args=args, streaming=streaming
+            )
+        )
+
+    @classmethod
+    def generate_single_loop(cls, pipeline: Pipeline, user: Account, node_id: str, args: Any, streaming: bool = True):
+        """
+        Run a single loop node of the draft workflow and return an event stream.
+
+        :param pipeline: pipeline whose draft workflow is used
+        :param user: account that triggers the run
+        :param node_id: ID of the node to run
+        :param args: run arguments, forwarded unchanged to the generator
+        :param streaming: forwarded to the generator as its ``streaming`` flag
+        :return: the value returned by ``PipelineGenerator.convert_to_event_stream``
+        :raises ValueError: if the pipeline has no draft workflow
+        """
+        workflow = cls._get_workflow(pipeline, InvokeFrom.DEBUGGER)
+        return PipelineGenerator.convert_to_event_stream(
+            PipelineGenerator().single_loop_generate(
+                pipeline=pipeline, workflow=workflow, node_id=node_id, user=user, args=args, streaming=streaming
+            )
+        )
+
+    @classmethod
+    def _get_workflow(cls, pipeline: Pipeline, invoke_from: InvokeFrom) -> Workflow:
+        """
+        Fetch the workflow that matches the invocation origin.
+
+        :param pipeline: pipeline whose workflow is fetched
+        :param invoke_from: ``InvokeFrom.DEBUGGER`` selects the draft workflow, anything
+            else the published one
+        :return: the draft or published workflow
+        :raises ValueError: if the selected workflow does not exist
+        """
+        rag_pipeline_service = RagPipelineService()
+        if invoke_from == InvokeFrom.DEBUGGER:
+            # fetch draft workflow by pipeline
+            workflow = rag_pipeline_service.get_draft_workflow(pipeline=pipeline)
+            if not workflow:
+                raise ValueError("Workflow not initialized")
+            return workflow
+
+        # fetch published workflow by pipeline
+        workflow = rag_pipeline_service.get_published_workflow(pipeline=pipeline)
+        if not workflow:
+            raise ValueError("Workflow not published")
+        return workflow
+
+    @classmethod
+    def update_document_status(cls, document_id: str):
+        """
+        Set the document's indexing status to ``waiting`` and commit the change.
+
+        Does nothing when no document with the given ID exists.
+
+        :param document_id: document id
+        """
+        document = db.session.query(Document).where(Document.id == document_id).first()
+        if document:
+            document.indexing_status = "waiting"
+            db.session.add(document)
+            db.session.commit()
--- a/dify/api/services/rag_pipeline/pipeline_template/__init__.py
+++ b/dify/api/services/rag_pipeline/pipeline_template/__init__.py
--- a/dify/api/services/rag_pipeline/pipeline_template/built_in/__init__.py
+++ b/dify/api/services/rag_pipeline/pipeline_template/built_in/__init__.py
--- a/dify/api/services/rag_pipeline/pipeline_template/built_in/built_in_retrieval.py
+++ b/dify/api/services/rag_pipeline/pipeline_template/built_in/built_in_retrieval.py
@@ -0,0 +1,63 @@
+import json
+from os import path
+from pathlib import Path
+
+from flask import current_app
+
+from services.rag_pipeline.pipeline_template.pipeline_template_base import PipelineTemplateRetrievalBase
+from services.rag_pipeline.pipeline_template.pipeline_template_type import PipelineTemplateType
+
+
+class BuiltInPipelineTemplateRetrieval(PipelineTemplateRetrievalBase):
+    """
+    Retrieve pipeline templates from the built-in file constants/pipeline_templates.json.
+    """
+
+    # Parsed content of the built-in JSON file, cached on the class after the first load.
+    builtin_data: dict | None = None
+
+    def get_type(self) -> str:
+        """Return the template source type handled by this class (``builtin``)."""
+        return PipelineTemplateType.BUILTIN
+
+    def get_pipeline_templates(self, language: str) -> dict:
+        """
+        Return the built-in templates stored for the given language.
+        :param language: language key to look up
+        :return: the entry stored for ``language``, or an empty dict when absent
+        """
+        return self.fetch_pipeline_templates_from_builtin(language)
+
+    def get_pipeline_template_detail(self, template_id: str):
+        """
+        Return the built-in detail entry for the given template.
+        :param template_id: Template ID
+        :return: the stored entry, or None when absent
+        """
+        return self.fetch_pipeline_template_detail_from_builtin(template_id)
+
+    @classmethod
+    def _get_builtin_data(cls) -> dict:
+        """
+        Load the built-in template data from disk and cache it on the class.
+        :return: parsed JSON content, or an empty dict when the file parses to a falsy value
+        """
+        # Compare against None (not truthiness) so an empty-but-valid JSON object is
+        # cached as well instead of being re-read from disk on every call.
+        if cls.builtin_data is None:
+            template_file = Path(current_app.root_path) / "constants" / "pipeline_templates.json"
+            cls.builtin_data = json.loads(template_file.read_text(encoding="utf-8"))
+
+        return cls.builtin_data or {}
+
+    @classmethod
+    def fetch_pipeline_templates_from_builtin(cls, language: str) -> dict:
+        """
+        Fetch pipeline templates from builtin.
+        :param language: language
+        :return: the entry under ``pipeline_templates`` for ``language``, or an empty dict
+        """
+        builtin_data: dict[str, dict[str, dict]] = cls._get_builtin_data()
+        return builtin_data.get("pipeline_templates", {}).get(language, {})
+
+    @classmethod
+    def fetch_pipeline_template_detail_from_builtin(cls, template_id: str) -> dict | None:
+        """
+        Fetch pipeline template detail from builtin.
+        :param template_id: Template ID
+        :return: the entry under ``pipeline_templates`` for ``template_id``, or None
+        """
+        # NOTE(review): this reads the same "pipeline_templates" mapping that
+        # fetch_pipeline_templates_from_builtin indexes by language — confirm that the
+        # JSON file really keys template details by template ID at this level.
+        builtin_data: dict[str, dict[str, dict]] = cls._get_builtin_data()
+        return builtin_data.get("pipeline_templates", {}).get(template_id)
--- a/dify/api/services/rag_pipeline/pipeline_template/customized/__init__.py
+++ b/dify/api/services/rag_pipeline/pipeline_template/customized/__init__.py
--- a/dify/api/services/rag_pipeline/pipeline_template/customized/customized_retrieval.py
+++ b/dify/api/services/rag_pipeline/pipeline_template/customized/customized_retrieval.py
@@ -0,0 +1,80 @@
+import yaml
+
+from extensions.ext_database import db
+from libs.login import current_account_with_tenant
+from models.dataset import PipelineCustomizedTemplate
+from services.rag_pipeline.pipeline_template.pipeline_template_base import PipelineTemplateRetrievalBase
+from services.rag_pipeline.pipeline_template.pipeline_template_type import PipelineTemplateType
+
+
+class CustomizedPipelineTemplateRetrieval(PipelineTemplateRetrievalBase):
+    """
+    Retrieve tenant-customized pipeline templates from the database
+    """
+
+    def get_pipeline_templates(self, language: str) -> dict:
+        """
+        List the customized templates of the current tenant for a language.
+        :param language: language
+        :return: ``{"pipeline_templates": [...]}``
+        """
+        _, current_tenant_id = current_account_with_tenant()
+        return self.fetch_pipeline_templates_from_customized(tenant_id=current_tenant_id, language=language)
+
+    def get_pipeline_template_detail(self, template_id: str):
+        """
+        Return the detail of one customized template.
+        :param template_id: Template ID
+        :return: detail dict, or None when the template does not exist
+        """
+        return self.fetch_pipeline_template_detail_from_db(template_id)
+
+    def get_type(self) -> str:
+        """Return the template source type handled by this class (``customized``)."""
+        return PipelineTemplateType.CUSTOMIZED
+
+    @classmethod
+    def fetch_pipeline_templates_from_customized(cls, tenant_id: str, language: str) -> dict:
+        """
+        Fetch pipeline templates from db.
+
+        Rows are ordered by ``position`` ascending, then ``created_at`` descending.
+
+        :param tenant_id: tenant id
+        :param language: language
+        :return: ``{"pipeline_templates": [...]}`` with one summary dict per template
+        """
+        pipeline_customized_templates = (
+            db.session.query(PipelineCustomizedTemplate)
+            .where(PipelineCustomizedTemplate.tenant_id == tenant_id, PipelineCustomizedTemplate.language == language)
+            .order_by(PipelineCustomizedTemplate.position.asc(), PipelineCustomizedTemplate.created_at.desc())
+            .all()
+        )
+        pipeline_templates = [
+            {
+                "id": template.id,
+                "name": template.name,
+                "description": template.description,
+                "icon": template.icon,
+                "position": template.position,
+                "chunk_structure": template.chunk_structure,
+            }
+            for template in pipeline_customized_templates
+        ]
+
+        return {"pipeline_templates": pipeline_templates}
+
+    @classmethod
+    def fetch_pipeline_template_detail_from_db(cls, template_id: str) -> dict | None:
+        """
+        Fetch pipeline template detail from db.
+
+        NOTE(review): the lookup filters by template ID only, not by tenant — confirm
+        that callers enforce tenant isolation.
+
+        :param template_id: Template ID
+        :return: detail dict, or None when no template has this ID
+        """
+        pipeline_template = (
+            db.session.query(PipelineCustomizedTemplate).where(PipelineCustomizedTemplate.id == template_id).first()
+        )
+        if not pipeline_template:
+            return None
+
+        dsl_data = yaml.safe_load(pipeline_template.yaml_content)
+        # An empty or non-mapping YAML document has no graph: fall back to {} instead of
+        # failing with AttributeError on a missing ``.get``.
+        workflow_data = dsl_data.get("workflow", {}) if isinstance(dsl_data, dict) else None
+        graph_data = workflow_data.get("graph", {}) if isinstance(workflow_data, dict) else {}
+
+        return {
+            "id": pipeline_template.id,
+            "name": pipeline_template.name,
+            "icon_info": pipeline_template.icon,
+            "description": pipeline_template.description,
+            "chunk_structure": pipeline_template.chunk_structure,
+            "export_data": pipeline_template.yaml_content,
+            "graph": graph_data,
+            "created_by": pipeline_template.created_user_name,
+        }
--- a/dify/api/services/rag_pipeline/pipeline_template/database/__init__.py
+++ b/dify/api/services/rag_pipeline/pipeline_template/database/__init__.py
--- a/dify/api/services/rag_pipeline/pipeline_template/database/database_retrieval.py
+++ b/dify/api/services/rag_pipeline/pipeline_template/database/database_retrieval.py
@@ -0,0 +1,77 @@
+import yaml
+
+from extensions.ext_database import db
+from models.dataset import PipelineBuiltInTemplate
+from services.rag_pipeline.pipeline_template.pipeline_template_base import PipelineTemplateRetrievalBase
+from services.rag_pipeline.pipeline_template.pipeline_template_type import PipelineTemplateType
+
+
+class DatabasePipelineTemplateRetrieval(PipelineTemplateRetrievalBase):
+    """
+    Retrieve built-in pipeline templates from the database
+    """
+
+    def get_pipeline_templates(self, language: str) -> dict:
+        """
+        List the built-in templates stored in the database for a language.
+        :param language: language
+        :return: ``{"pipeline_templates": [...]}``
+        """
+        return self.fetch_pipeline_templates_from_db(language)
+
+    def get_pipeline_template_detail(self, template_id: str):
+        """
+        Return the detail of one built-in template.
+        :param template_id: Template ID
+        :return: detail dict, or None when the template does not exist
+        """
+        return self.fetch_pipeline_template_detail_from_db(template_id)
+
+    def get_type(self) -> str:
+        """Return the template source type handled by this class (``database``)."""
+        return PipelineTemplateType.DATABASE
+
+    @classmethod
+    def fetch_pipeline_templates_from_db(cls, language: str) -> dict:
+        """
+        Fetch pipeline templates from db.
+        :param language: language
+        :return: ``{"pipeline_templates": [...]}`` with one summary dict per template
+        """
+        pipeline_built_in_templates: list[PipelineBuiltInTemplate] = (
+            db.session.query(PipelineBuiltInTemplate).where(PipelineBuiltInTemplate.language == language).all()
+        )
+
+        pipeline_templates = [
+            {
+                "id": template.id,
+                "name": template.name,
+                "description": template.description,
+                "icon": template.icon,
+                "copyright": template.copyright,
+                "privacy_policy": template.privacy_policy,
+                "position": template.position,
+                "chunk_structure": template.chunk_structure,
+            }
+            for template in pipeline_built_in_templates
+        ]
+
+        return {"pipeline_templates": pipeline_templates}
+
+    @classmethod
+    def fetch_pipeline_template_detail_from_db(cls, template_id: str) -> dict | None:
+        """
+        Fetch pipeline template detail from db.
+        :param template_id: Template ID
+        :return: detail dict, or None when no template has this ID
+        """
+        pipeline_template = (
+            db.session.query(PipelineBuiltInTemplate).where(PipelineBuiltInTemplate.id == template_id).first()
+        )
+
+        if not pipeline_template:
+            return None
+
+        dsl_data = yaml.safe_load(pipeline_template.yaml_content)
+        # An empty or non-mapping YAML document has no graph: fall back to {} instead of
+        # failing with AttributeError on a missing ``.get``.
+        workflow_data = dsl_data.get("workflow", {}) if isinstance(dsl_data, dict) else None
+        graph_data = workflow_data.get("graph", {}) if isinstance(workflow_data, dict) else {}
+
+        return {
+            "id": pipeline_template.id,
+            "name": pipeline_template.name,
+            "icon_info": pipeline_template.icon,
+            "description": pipeline_template.description,
+            "chunk_structure": pipeline_template.chunk_structure,
+            "export_data": pipeline_template.yaml_content,
+            "graph": graph_data,
+        }
--- a/dify/api/services/rag_pipeline/pipeline_template/pipeline_template_base.py
+++ b/dify/api/services/rag_pipeline/pipeline_template/pipeline_template_base.py
@@ -0,0 +1,17 @@
+from abc import ABC, abstractmethod
+
+
+class PipelineTemplateRetrievalBase(ABC):
+    """Interface for pipeline template retrieval."""
+
+    @abstractmethod
+    def get_pipeline_templates(self, language: str) -> dict:
+        """Return the pipeline templates available for ``language``."""
+        raise NotImplementedError
+
+    @abstractmethod
+    def get_pipeline_template_detail(self, template_id: str) -> dict | None:
+        """Return the detail of the template with ``template_id``, or None."""
+        raise NotImplementedError
+
+    @abstractmethod
+    def get_type(self) -> str:
+        """Return the identifier of the template source an implementation serves."""
+        raise NotImplementedError
--- a/dify/api/services/rag_pipeline/pipeline_template/pipeline_template_factory.py
+++ b/dify/api/services/rag_pipeline/pipeline_template/pipeline_template_factory.py
@@ -0,0 +1,26 @@
+from services.rag_pipeline.pipeline_template.built_in.built_in_retrieval import BuiltInPipelineTemplateRetrieval
+from services.rag_pipeline.pipeline_template.customized.customized_retrieval import CustomizedPipelineTemplateRetrieval
+from services.rag_pipeline.pipeline_template.database.database_retrieval import DatabasePipelineTemplateRetrieval
+from services.rag_pipeline.pipeline_template.pipeline_template_base import PipelineTemplateRetrievalBase
+from services.rag_pipeline.pipeline_template.pipeline_template_type import PipelineTemplateType
+from services.rag_pipeline.pipeline_template.remote.remote_retrieval import RemotePipelineTemplateRetrieval
+
+
+class PipelineTemplateRetrievalFactory:
+    """Resolve a template source mode to the retrieval class that implements it."""
+
+    @staticmethod
+    def get_pipeline_template_factory(mode: str) -> type[PipelineTemplateRetrievalBase]:
+        """
+        Return the retrieval class (not an instance) for the given mode.
+        :param mode: one of the ``PipelineTemplateType`` values
+        :return: the matching ``PipelineTemplateRetrievalBase`` subclass
+        :raises ValueError: if ``mode`` matches no ``PipelineTemplateType`` value
+        """
+        match mode:
+            case PipelineTemplateType.REMOTE:
+                return RemotePipelineTemplateRetrieval
+            case PipelineTemplateType.CUSTOMIZED:
+                return CustomizedPipelineTemplateRetrieval
+            case PipelineTemplateType.DATABASE:
+                return DatabasePipelineTemplateRetrieval
+            case PipelineTemplateType.BUILTIN:
+                return BuiltInPipelineTemplateRetrieval
+            case _:
+                # The message names pipeline templates, which is what this factory resolves.
+                raise ValueError(f"invalid fetch pipeline templates mode: {mode}")
+
+    @staticmethod
+    def get_built_in_pipeline_template_retrieval() -> type[BuiltInPipelineTemplateRetrieval]:
+        """Return the built-in retrieval class (not an instance)."""
+        return BuiltInPipelineTemplateRetrieval
--- a/dify/api/services/rag_pipeline/pipeline_template/pipeline_template_type.py
+++ b/dify/api/services/rag_pipeline/pipeline_template/pipeline_template_type.py
@@ -0,0 +1,8 @@
+from enum import StrEnum
+
+
+class PipelineTemplateType(StrEnum):
+    """Identifiers of the sources a pipeline template can be retrieved from."""
+
+    REMOTE = "remote"
+    DATABASE = "database"
+    CUSTOMIZED = "customized"
+    BUILTIN = "builtin"
--- a/dify/api/services/rag_pipeline/pipeline_template/remote/__init__.py
+++ b/dify/api/services/rag_pipeline/pipeline_template/remote/__init__.py
--- a/dify/api/services/rag_pipeline/pipeline_template/remote/remote_retrieval.py
+++ b/dify/api/services/rag_pipeline/pipeline_template/remote/remote_retrieval.py
@@ -0,0 +1,67 @@
+import logging
+
+import httpx
+
+from configs import dify_config
+from services.rag_pipeline.pipeline_template.database.database_retrieval import DatabasePipelineTemplateRetrieval
+from services.rag_pipeline.pipeline_template.pipeline_template_base import PipelineTemplateRetrievalBase
+from services.rag_pipeline.pipeline_template.pipeline_template_type import PipelineTemplateType
+
+logger = logging.getLogger(__name__)
+
+
+class RemotePipelineTemplateRetrieval(PipelineTemplateRetrievalBase):
+    """
+    Retrieve pipeline templates from dify official, falling back to the database on errors
+    """
+
+    def get_pipeline_template_detail(self, template_id: str):
+        """
+        Return the detail of one template, preferring the remote source.
+
+        Any exception raised by the remote fetch triggers a database lookup instead.
+        A non-200 remote response yields None and does not trigger the fallback.
+
+        :param template_id: Template ID
+        :return: detail dict, or None
+        """
+        try:
+            result = self.fetch_pipeline_template_detail_from_dify_official(template_id)
+        except Exception as e:
+            # Deliberate best-effort: the remote source being unavailable must not break the caller.
+            logger.warning("fetch pipeline template detail from dify official failed: %r, switch to database.", e)
+            result = DatabasePipelineTemplateRetrieval.fetch_pipeline_template_detail_from_db(template_id)
+        return result
+
+    def get_pipeline_templates(self, language: str) -> dict:
+        """
+        List the templates for a language, preferring the remote source.
+
+        Any exception raised by the remote fetch (including a non-200 response)
+        triggers a database lookup instead.
+
+        :param language: language
+        :return: templates dict from the remote service or from the database
+        """
+        try:
+            result = self.fetch_pipeline_templates_from_dify_official(language)
+        except Exception as e:
+            # Deliberate best-effort: the remote source being unavailable must not break the caller.
+            logger.warning("fetch pipeline templates from dify official failed: %r, switch to database.", e)
+            result = DatabasePipelineTemplateRetrieval.fetch_pipeline_templates_from_db(language)
+        return result
+
+    def get_type(self) -> str:
+        """Return the template source type handled by this class (``remote``)."""
+        return PipelineTemplateType.REMOTE
+
+    @classmethod
+    def fetch_pipeline_template_detail_from_dify_official(cls, template_id: str) -> dict | None:
+        """
+        Fetch pipeline template detail from dify official.
+        :param template_id: Template ID
+        :return: decoded JSON body, or None when the response status is not 200
+        """
+        domain = dify_config.HOSTED_FETCH_PIPELINE_TEMPLATES_REMOTE_DOMAIN
+        url = f"{domain}/pipeline-templates/{template_id}"
+        response = httpx.get(url, timeout=httpx.Timeout(10.0, connect=3.0))
+        if response.status_code != 200:
+            return None
+        data: dict = response.json()
+        return data
+
+    @classmethod
+    def fetch_pipeline_templates_from_dify_official(cls, language: str) -> dict:
+        """
+        Fetch pipeline templates from dify official.
+        :param language: language
+        :return: decoded JSON body
+        :raises ValueError: if the response status is not 200
+        """
+        domain = dify_config.HOSTED_FETCH_PIPELINE_TEMPLATES_REMOTE_DOMAIN
+        url = f"{domain}/pipeline-templates"
+        # Pass the language through ``params`` so httpx URL-encodes it; interpolating it
+        # into the query string would let characters such as ``&`` or ``#`` alter the request.
+        response = httpx.get(url, params={"language": language}, timeout=httpx.Timeout(10.0, connect=3.0))
+        if response.status_code != 200:
+            raise ValueError(f"fetch pipeline templates failed, status code: {response.status_code}")
+
+        result: dict = response.json()
+
+        return result
--- a/dify/api/services/rag_pipeline/rag_pipeline.py
+++ b/dify/api/services/rag_pipeline/rag_pipeline.py
--- a/dify/api/services/rag_pipeline/rag_pipeline_dsl_service.py
+++ b/dify/api/services/rag_pipeline/rag_pipeline_dsl_service.py
@@ -0,0 +1,945 @@
+import base64
+import hashlib
+import json
+import logging
+import uuid
+from collections.abc import Mapping
+from datetime import UTC, datetime
+from enum import StrEnum
+from typing import cast
+from urllib.parse import urlparse
+from uuid import uuid4
+
+import yaml  # type: ignore
+from Crypto.Cipher import AES
+from Crypto.Util.Padding import pad, unpad
+from flask_login import current_user
+from packaging import version
+from pydantic import BaseModel, Field
+from sqlalchemy import select
+from sqlalchemy.orm import Session
+
+from core.helper import ssrf_proxy
+from core.helper.name_generator import generate_incremental_name
+from core.model_runtime.utils.encoders import jsonable_encoder
+from core.plugin.entities.plugin import PluginDependency
+from core.workflow.enums import NodeType
+from core.workflow.nodes.datasource.entities import DatasourceNodeData
+from core.workflow.nodes.knowledge_retrieval.entities import KnowledgeRetrievalNodeData
+from core.workflow.nodes.llm.entities import LLMNodeData
+from core.workflow.nodes.parameter_extractor.entities import ParameterExtractorNodeData
+from core.workflow.nodes.question_classifier.entities import QuestionClassifierNodeData
+from core.workflow.nodes.tool.entities import ToolNodeData
+from extensions.ext_redis import redis_client
+from factories import variable_factory
+from models import Account
+from models.dataset import Dataset, DatasetCollectionBinding, Pipeline
+from models.workflow import Workflow, WorkflowType
+from services.entities.knowledge_entities.rag_pipeline_entities import (
+    IconInfo,
+    KnowledgeConfiguration,
+    RagPipelineDatasetCreateEntity,
+)
+from services.plugin.dependencies_analysis import DependenciesAnalysisService
+
+logger = logging.getLogger(__name__)
+
+# Redis key prefix under which pending import data is stored, followed by the import id.
+IMPORT_INFO_REDIS_KEY_PREFIX = "app_import_info:"
+# NOTE(review): presumably the Redis key prefix for pending dependency checks — usage not visible here.
+CHECK_DEPENDENCIES_REDIS_KEY_PREFIX = "app_check_dependencies:"
+# Expiry, in seconds, applied to the pending import data stored in Redis.
+IMPORT_INFO_REDIS_EXPIRY = 10 * 60  # 10 minutes
+# Upper bound compared against the length of DSL content fetched from a URL.
+DSL_MAX_SIZE = 10 * 1024 * 1024  # 10MB
+# DSL version supported by this service; imported versions are compared against it.
+CURRENT_DSL_VERSION = "0.1.0"
+
+
+class ImportMode(StrEnum):
+    """How the DSL is supplied to an import: inline YAML text or a URL to fetch it from."""
+
+    YAML_CONTENT = "yaml-content"
+    YAML_URL = "yaml-url"
+
+
+class ImportStatus(StrEnum):
+    """Outcome of a DSL import attempt."""
+
+    COMPLETED = "completed"
+    # Returned when the imported DSL's minor version is older than the current one.
+    COMPLETED_WITH_WARNINGS = "completed-with-warnings"
+    # Returned when the imported DSL is newer than the current version or has an older major version.
+    PENDING = "pending"
+    FAILED = "failed"
+
+
+class RagPipelineImportInfo(BaseModel):
+    """Result of a RAG pipeline DSL import, returned to the caller."""
+
+    # Identifier generated for this import attempt.
+    id: str
+    status: ImportStatus
+    pipeline_id: str | None = None
+    # Defaults to the DSL version supported by this service.
+    current_dsl_version: str = CURRENT_DSL_VERSION
+    imported_dsl_version: str = ""
+    # Human-readable failure reason; empty when not set.
+    error: str = ""
+    dataset_id: str | None = None
+
+
+class CheckDependenciesResult(BaseModel):
+    """Result of a dependency check."""
+
+    # Defaults to an empty list.
+    # NOTE(review): presumably the plugin dependencies found missing — confirm against the producer.
+    leaked_dependencies: list[PluginDependency] = Field(default_factory=list)
+
+
+def _check_version_compatibility(imported_version: str) -> ImportStatus:
+    """
+    Map an imported DSL version to the import status it should produce.
+
+    :param imported_version: version string taken from the imported DSL
+    :return: ``FAILED`` if a version cannot be parsed; ``PENDING`` if the imported
+        version is newer than the current one or has an older major version;
+        ``COMPLETED_WITH_WARNINGS`` if only its minor version is older;
+        ``COMPLETED`` otherwise
+    """
+    try:
+        supported = version.parse(CURRENT_DSL_VERSION)
+        incoming = version.parse(imported_version)
+    except version.InvalidVersion:
+        return ImportStatus.FAILED
+
+    # Newer DSLs and DSLs from an older major line both require confirmation.
+    needs_confirmation = incoming > supported or incoming.major < supported.major
+    if needs_confirmation:
+        return ImportStatus.PENDING
+
+    # Same major line: an older minor imports with warnings, anything else
+    # (equal version or older micro) imports cleanly.
+    if incoming.minor < supported.minor:
+        return ImportStatus.COMPLETED_WITH_WARNINGS
+    return ImportStatus.COMPLETED
+
+
+class RagPipelinePendingData(BaseModel):
+    """Import request data serialized to JSON and stored in Redis while an import is pending."""
+
+    import_mode: str
+    # The DSL text as received or fetched at import time.
+    yaml_content: str
+    # Required field, but may be None.
+    pipeline_id: str | None
+
+
+class CheckDependenciesPendingData(BaseModel):
+    """Plugin dependencies recorded together with an optional pipeline ID."""
+
+    # NOTE(review): presumably stored for a later dependency check — the producer is not visible here.
+    dependencies: list[PluginDependency]
+    # Required field, but may be None.
+    pipeline_id: str | None
+
+
+class RagPipelineDslService:
+    def __init__(self, session: Session):
+        """
+        Keep the SQLAlchemy session used by this service's database queries.
+        :param session: session the service runs its queries on
+        """
+        self._session = session
+
+    def import_rag_pipeline(
+        self,
+        *,
+        account: Account,
+        import_mode: str,
+        yaml_content: str | None = None,
+        yaml_url: str | None = None,
+        pipeline_id: str | None = None,
+        dataset: Dataset | None = None,
+        dataset_name: str | None = None,
+        icon_info: IconInfo | None = None,
+    ) -> RagPipelineImportInfo:
+        """Import an app from YAML content or URL."""
+        import_id = str(uuid.uuid4())
+
+        # Validate import mode
+        try:
+            mode = ImportMode(import_mode)
+        except ValueError:
+            raise ValueError(f"Invalid import_mode: {import_mode}")
+
+        # Get YAML content
+        content: str = ""
+        if mode == ImportMode.YAML_URL:
+            if not yaml_url:
+                return RagPipelineImportInfo(
+                    id=import_id,
+                    status=ImportStatus.FAILED,
+                    error="yaml_url is required when import_mode is yaml-url",
+                )
+            try:
+                parsed_url = urlparse(yaml_url)
+                if (
+                    parsed_url.scheme == "https"
+                    and parsed_url.netloc == "github.com"
+                    and parsed_url.path.endswith((".yml", ".yaml"))
+                ):
+                    yaml_url = yaml_url.replace("https://github.com", "https://raw.githubusercontent.com")
+                    yaml_url = yaml_url.replace("/blob/", "/")
+                response = ssrf_proxy.get(yaml_url.strip(), follow_redirects=True, timeout=(10, 10))
+                response.raise_for_status()
+                content = response.content.decode()
+
+                if len(content) > DSL_MAX_SIZE:
+                    return RagPipelineImportInfo(
+                        id=import_id,
+                        status=ImportStatus.FAILED,
+                        error="File size exceeds the limit of 10MB",
+                    )
+
+                if not content:
+                    return RagPipelineImportInfo(
+                        id=import_id,
+                        status=ImportStatus.FAILED,
+                        error="Empty content from url",
+                    )
+            except Exception as e:
+                return RagPipelineImportInfo(
+                    id=import_id,
+                    status=ImportStatus.FAILED,
+                    error=f"Error fetching YAML from URL: {str(e)}",
+                )
+        elif mode == ImportMode.YAML_CONTENT:
+            if not yaml_content:
+                return RagPipelineImportInfo(
+                    id=import_id,
+                    status=ImportStatus.FAILED,
+                    error="yaml_content is required when import_mode is yaml-content",
+                )
+            content = yaml_content
+
+        # Process YAML content
+        try:
+            # Parse YAML to validate format
+            data = yaml.safe_load(content)
+            if not isinstance(data, dict):
+                return RagPipelineImportInfo(
+                    id=import_id,
+                    status=ImportStatus.FAILED,
+                    error="Invalid YAML format: content must be a mapping",
+                )
+
+            # Validate and fix DSL version
+            if not data.get("version"):
+                data["version"] = "0.1.0"
+            if not data.get("kind") or data.get("kind") != "rag_pipeline":
+                data["kind"] = "rag_pipeline"
+
+            imported_version = data.get("version", "0.1.0")
+            # check if imported_version is a float-like string
+            if not isinstance(imported_version, str):
+                raise ValueError(f"Invalid version type, expected str, got {type(imported_version)}")
+            status = _check_version_compatibility(imported_version)
+
+            # Extract app data
+            pipeline_data = data.get("rag_pipeline")
+            if not pipeline_data:
+                return RagPipelineImportInfo(
+                    id=import_id,
+                    status=ImportStatus.FAILED,
+                    error="Missing rag_pipeline data in YAML content",
+                )
+
+            # If app_id is provided, check if it exists
+            pipeline = None
+            if pipeline_id:
+                stmt = select(Pipeline).where(
+                    Pipeline.id == pipeline_id,
+                    Pipeline.tenant_id == account.current_tenant_id,
+                )
+                pipeline = self._session.scalar(stmt)
+
+                if not pipeline:
+                    return RagPipelineImportInfo(
+                        id=import_id,
+                        status=ImportStatus.FAILED,
+                        error="Pipeline not found",
+                    )
+                dataset = pipeline.retrieve_dataset(session=self._session)
+
+            # If major version mismatch, store import info in Redis
+            if status == ImportStatus.PENDING:
+                pending_data = RagPipelinePendingData(
+                    import_mode=import_mode,
+                    yaml_content=content,
+                    pipeline_id=pipeline_id,
+                )
+                redis_client.setex(
+                    f"{IMPORT_INFO_REDIS_KEY_PREFIX}{import_id}",
+                    IMPORT_INFO_REDIS_EXPIRY,
+                    pending_data.model_dump_json(),
+                )
+
+                return RagPipelineImportInfo(
+                    id=import_id,
+                    status=status,
+                    pipeline_id=pipeline_id,
+                    imported_dsl_version=imported_version,
+                )
+
+            # Extract dependencies
+            dependencies = data.get("dependencies", [])
+            check_dependencies_pending_data = None
+            if dependencies:
+                check_dependencies_pending_data = [PluginDependency.model_validate(d) for d in dependencies]
+
+            # Create or update pipeline
+            pipeline = self._create_or_update_pipeline(
+                pipeline=pipeline,
+                data=data,
+                account=account,
+                dependencies=check_dependencies_pending_data,
+            )
+            # create dataset
+            name = pipeline.name or "Untitled"
+            description = pipeline.description
+            if icon_info:
+                icon_type = icon_info.icon_type
+                icon = icon_info.icon
+                icon_background = icon_info.icon_background
+                icon_url = icon_info.icon_url
+            else:
+                icon_type = data.get("rag_pipeline", {}).get("icon_type")
+                icon = data.get("rag_pipeline", {}).get("icon")
+                icon_background = data.get("rag_pipeline", {}).get("icon_background")
+                icon_url = data.get("rag_pipeline", {}).get("icon_url")
+            workflow = data.get("workflow", {})
+            graph = workflow.get("graph", {})
+            nodes = graph.get("nodes", [])
+            dataset_id = None
+            for node in nodes:
+                if node.get("data", {}).get("type") == "knowledge-index":
+                    knowledge_configuration = KnowledgeConfiguration.model_validate(node.get("data", {}))
+                    if (
+                        dataset
+                        and pipeline.is_published
+                        and dataset.chunk_structure != knowledge_configuration.chunk_structure
+                    ):
+                        raise ValueError("Chunk structure is not compatible with the published pipeline")
+                    if not dataset:
+                        datasets = self._session.query(Dataset).filter_by(tenant_id=account.current_tenant_id).all()
+                        names = [dataset.name for dataset in datasets]
+                        generate_name = generate_incremental_name(names, name)
+                        dataset = Dataset(
+                            tenant_id=account.current_tenant_id,
+                            name=generate_name,
+                            description=description,
+                            icon_info={
+                                "icon_type": icon_type,
+                                "icon": icon,
+                                "icon_background": icon_background,
+                                "icon_url": icon_url,
+                            },
+                            indexing_technique=knowledge_configuration.indexing_technique,
+                            created_by=account.id,
+                            retrieval_model=knowledge_configuration.retrieval_model.model_dump(),
+                            runtime_mode="rag_pipeline",
+                            chunk_structure=knowledge_configuration.chunk_structure,
+                        )
+                    if knowledge_configuration.indexing_technique == "high_quality":
+                        dataset_collection_binding = (
+                            self._session.query(DatasetCollectionBinding)
+                            .where(
+                                DatasetCollectionBinding.provider_name
+                                == knowledge_configuration.embedding_model_provider,
+                                DatasetCollectionBinding.model_name == knowledge_configuration.embedding_model,
+                                DatasetCollectionBinding.type == "dataset",
+                            )
+                            .order_by(DatasetCollectionBinding.created_at)
+                            .first()
+                        )
+
+                        if not dataset_collection_binding:
+                            dataset_collection_binding = DatasetCollectionBinding(
+                                provider_name=knowledge_configuration.embedding_model_provider,
+                                model_name=knowledge_configuration.embedding_model,
+                                collection_name=Dataset.gen_collection_name_by_id(str(uuid.uuid4())),
+                                type="dataset",
+                            )
+                            self._session.add(dataset_collection_binding)
+                            self._session.commit()
+                        dataset_collection_binding_id = dataset_collection_binding.id
+                        dataset.collection_binding_id = dataset_collection_binding_id
+                        dataset.embedding_model = knowledge_configuration.embedding_model
+                        dataset.embedding_model_provider = knowledge_configuration.embedding_model_provider
+                    elif knowledge_configuration.indexing_technique == "economy":
+                        dataset.keyword_number = knowledge_configuration.keyword_number
+                    dataset.pipeline_id = pipeline.id
+                    self._session.add(dataset)
+                    self._session.commit()
+                    dataset_id = dataset.id
+            if not dataset_id:
+                raise ValueError("DSL is not valid, please check the Knowledge Index node.")
+
+            return RagPipelineImportInfo(
+                id=import_id,
+                status=status,
+                pipeline_id=pipeline.id,
+                dataset_id=dataset_id,
+                imported_dsl_version=imported_version,
+            )
+
+        except yaml.YAMLError as e:
+            return RagPipelineImportInfo(
+                id=import_id,
+                status=ImportStatus.FAILED,
+                error=f"Invalid YAML format: {str(e)}",
+            )
+
+        except Exception as e:
+            logger.exception("Failed to import app")
+            return RagPipelineImportInfo(
+                id=import_id,
+                status=ImportStatus.FAILED,
+                error=str(e),
+            )
+
+    def confirm_import(self, *, import_id: str, account: Account) -> RagPipelineImportInfo:
+        """
+        Confirm a pending import and apply it.
+
+        Loads the pending payload stored in Redis under
+        ``IMPORT_INFO_REDIS_KEY_PREFIX + import_id``, creates or updates the pipeline
+        and its draft workflow from the stored YAML, then creates or updates the
+        dataset described by every ``knowledge-index`` node of the workflow graph.
+
+        :param import_id: identifier of the pending import
+        :param account: account confirming the import; its current tenant scopes the
+            pipeline lookup and owns any newly created dataset
+        :return: import info with status COMPLETED on success, or FAILED with an
+            ``error`` message when the payload is missing/invalid or when applying
+            it raises (the exception is logged, not propagated)
+        """
+        redis_key = f"{IMPORT_INFO_REDIS_KEY_PREFIX}{import_id}"
+        pending_data = redis_client.get(redis_key)
+
+        # Nothing stored under this key (never written, expired, or already consumed)
+        if not pending_data:
+            return RagPipelineImportInfo(
+                id=import_id,
+                status=ImportStatus.FAILED,
+                error="Import information expired or does not exist",
+            )
+
+        try:
+            # Only str/bytes payloads can be parsed as JSON below
+            if not isinstance(pending_data, str | bytes):
+                return RagPipelineImportInfo(
+                    id=import_id,
+                    status=ImportStatus.FAILED,
+                    error="Invalid import information",
+                )
+            pending_data = RagPipelinePendingData.model_validate_json(pending_data)
+            data = yaml.safe_load(pending_data.yaml_content)
+
+            # Resolve the pipeline to update, restricted to the account's current tenant;
+            # when no pipeline id was stored (or no row matches) a new pipeline is created
+            pipeline = None
+            if pending_data.pipeline_id:
+                stmt = select(Pipeline).where(
+                    Pipeline.id == pending_data.pipeline_id,
+                    Pipeline.tenant_id == account.current_tenant_id,
+                )
+                pipeline = self._session.scalar(stmt)
+
+            # Create or update the pipeline and its draft workflow
+            # (no plugin dependencies are passed on this path)
+            pipeline = self._create_or_update_pipeline(
+                pipeline=pipeline,
+                data=data,
+                account=account,
+            )
+            dataset = pipeline.retrieve_dataset(session=self._session)
+
+            # Create or update the dataset from the knowledge-index node(s)
+            name = pipeline.name
+            description = pipeline.description
+            icon_type = data.get("rag_pipeline", {}).get("icon_type")
+            icon = data.get("rag_pipeline", {}).get("icon")
+            icon_background = data.get("rag_pipeline", {}).get("icon_background")
+            icon_url = data.get("rag_pipeline", {}).get("icon_url")
+            workflow = data.get("workflow", {})
+            graph = workflow.get("graph", {})
+            nodes = graph.get("nodes", [])
+            # Stays None unless at least one knowledge-index node is processed
+            dataset_id = None
+            for node in nodes:
+                if node.get("data", {}).get("type") == "knowledge-index":
+                    knowledge_configuration = KnowledgeConfiguration.model_validate(node.get("data", {}))
+                    if not dataset:
+                        # No dataset is attached to the pipeline yet: build one from the node
+                        dataset = Dataset(
+                            tenant_id=account.current_tenant_id,
+                            name=name,
+                            description=description,
+                            icon_info={
+                                "icon_type": icon_type,
+                                "icon": icon,
+                                "icon_background": icon_background,
+                                "icon_url": icon_url,
+                            },
+                            indexing_technique=knowledge_configuration.indexing_technique,
+                            created_by=account.id,
+                            retrieval_model=knowledge_configuration.retrieval_model.model_dump(),
+                            runtime_mode="rag_pipeline",
+                            chunk_structure=knowledge_configuration.chunk_structure,
+                        )
+                    else:
+                        # Existing dataset: overwrite its indexing settings with the node's
+                        dataset.indexing_technique = knowledge_configuration.indexing_technique
+                        dataset.retrieval_model = knowledge_configuration.retrieval_model.model_dump()
+                        dataset.runtime_mode = "rag_pipeline"
+                        dataset.chunk_structure = knowledge_configuration.chunk_structure
+                    if knowledge_configuration.indexing_technique == "high_quality":
+                        # Reuse the oldest collection binding for this embedding
+                        # provider/model pair, or create one if none exists
+                        dataset_collection_binding = (
+                            self._session.query(DatasetCollectionBinding)
+                            .where(
+                                DatasetCollectionBinding.provider_name
+                                == knowledge_configuration.embedding_model_provider,
+                                DatasetCollectionBinding.model_name == knowledge_configuration.embedding_model,
+                                DatasetCollectionBinding.type == "dataset",
+                            )
+                            .order_by(DatasetCollectionBinding.created_at)
+                            .first()
+                        )
+
+                        if not dataset_collection_binding:
+                            dataset_collection_binding = DatasetCollectionBinding(
+                                provider_name=knowledge_configuration.embedding_model_provider,
+                                model_name=knowledge_configuration.embedding_model,
+                                collection_name=Dataset.gen_collection_name_by_id(str(uuid.uuid4())),
+                                type="dataset",
+                            )
+                            self._session.add(dataset_collection_binding)
+                            # Committed here so the binding id is available just below
+                            self._session.commit()
+                        dataset_collection_binding_id = dataset_collection_binding.id
+                        dataset.collection_binding_id = dataset_collection_binding_id
+                        dataset.embedding_model = knowledge_configuration.embedding_model
+                        dataset.embedding_model_provider = knowledge_configuration.embedding_model_provider
+                    elif knowledge_configuration.indexing_technique == "economy":
+                        dataset.keyword_number = knowledge_configuration.keyword_number
+                    # Link the dataset to the pipeline and persist it
+                    dataset.pipeline_id = pipeline.id
+                    self._session.add(dataset)
+                    self._session.commit()
+                    dataset_id = dataset.id
+            # No knowledge-index node was found, so no dataset was created or updated
+            if not dataset_id:
+                raise ValueError("DSL is not valid, please check the Knowledge Index node.")
+
+            # Delete import info from Redis (only reached on success)
+            redis_client.delete(redis_key)
+
+            return RagPipelineImportInfo(
+                id=import_id,
+                status=ImportStatus.COMPLETED,
+                pipeline_id=pipeline.id,
+                dataset_id=dataset_id,
+                current_dsl_version=CURRENT_DSL_VERSION,
+                imported_dsl_version=data.get("version", "0.1.0"),
+            )
+
+        except Exception as e:
+            # Any failure while applying the import is logged and reported as FAILED
+            logger.exception("Error confirming import")
+            return RagPipelineImportInfo(
+                id=import_id,
+                status=ImportStatus.FAILED,
+                error=str(e),
+            )
+
+    def check_dependencies(
+        self,
+        *,
+        pipeline: Pipeline,
+    ) -> CheckDependenciesResult:
+        """Check dependencies"""
+        # Get dependencies from Redis
+        redis_key = f"{CHECK_DEPENDENCIES_REDIS_KEY_PREFIX}{pipeline.id}"
+        dependencies = redis_client.get(redis_key)
+        if not dependencies:
+            return CheckDependenciesResult()
+
+        # Extract dependencies
+        dependencies = CheckDependenciesPendingData.model_validate_json(dependencies)
+
+        # Get leaked dependencies
+        leaked_dependencies = DependenciesAnalysisService.get_leaked_dependencies(
+            tenant_id=pipeline.tenant_id, dependencies=dependencies.dependencies
+        )
+        return CheckDependenciesResult(
+            leaked_dependencies=leaked_dependencies,
+        )
+
+    def _create_or_update_pipeline(
+        self,
+        *,
+        pipeline: Pipeline | None,
+        data: dict,
+        account: Account,
+        dependencies: list[PluginDependency] | None = None,
+    ) -> Pipeline:
+        """Create a new app or update an existing one."""
+        if not account.current_tenant_id:
+            raise ValueError("Tenant id is required")
+        pipeline_data = data.get("rag_pipeline", {})
+        # Initialize pipeline based on mode
+        workflow_data = data.get("workflow")
+        if not workflow_data or not isinstance(workflow_data, dict):
+            raise ValueError("Missing workflow data for rag pipeline")
+
+        environment_variables_list = workflow_data.get("environment_variables", [])
+        environment_variables = [
+            variable_factory.build_environment_variable_from_mapping(obj) for obj in environment_variables_list
+        ]
+        conversation_variables_list = workflow_data.get("conversation_variables", [])
+        conversation_variables = [
+            variable_factory.build_conversation_variable_from_mapping(obj) for obj in conversation_variables_list
+        ]
+        rag_pipeline_variables_list = workflow_data.get("rag_pipeline_variables", [])
+
+        graph = workflow_data.get("graph", {})
+        for node in graph.get("nodes", []):
+            if node.get("data", {}).get("type", "") == NodeType.KNOWLEDGE_RETRIEVAL:
+                dataset_ids = node["data"].get("dataset_ids", [])
+                node["data"]["dataset_ids"] = [
+                    decrypted_id
+                    for dataset_id in dataset_ids
+                    if (
+                        decrypted_id := self.decrypt_dataset_id(
+                            encrypted_data=dataset_id,
+                            tenant_id=account.current_tenant_id,
+                        )
+                    )
+                ]
+
+        if pipeline:
+            # Update existing pipeline
+            pipeline.name = pipeline_data.get("name", pipeline.name)
+            pipeline.description = pipeline_data.get("description", pipeline.description)
+            pipeline.updated_by = account.id
+
+        else:
+            if account.current_tenant_id is None:
+                raise ValueError("Current tenant is not set")
+
+            # Create new app
+            pipeline = Pipeline(
+                tenant_id=account.current_tenant_id,
+                name=pipeline_data.get("name", ""),
+                description=pipeline_data.get("description", ""),
+                created_by=account.id,
+                updated_by=account.id,
+            )
+            pipeline.id = str(uuid4())
+
+            self._session.add(pipeline)
+            self._session.commit()
+        # save dependencies
+        if dependencies:
+            redis_client.setex(
+                f"{CHECK_DEPENDENCIES_REDIS_KEY_PREFIX}{pipeline.id}",
+                IMPORT_INFO_REDIS_EXPIRY,
+                CheckDependenciesPendingData(pipeline_id=pipeline.id, dependencies=dependencies).model_dump_json(),
+            )
+        workflow = (
+            self._session.query(Workflow)
+            .where(
+                Workflow.tenant_id == pipeline.tenant_id,
+                Workflow.app_id == pipeline.id,
+                Workflow.version == "draft",
+            )
+            .first()
+        )
+
+        # create draft workflow if not found
+        if not workflow:
+            workflow = Workflow(
+                tenant_id=pipeline.tenant_id,
+                app_id=pipeline.id,
+                features="{}",
+                type=WorkflowType.RAG_PIPELINE,
+                version="draft",
+                graph=json.dumps(graph),
+                created_by=account.id,
+                environment_variables=environment_variables,
+                conversation_variables=conversation_variables,
+                rag_pipeline_variables=rag_pipeline_variables_list,
+            )
+            self._session.add(workflow)
+            self._session.flush()
+            pipeline.workflow_id = workflow.id
+        else:
+            workflow.graph = json.dumps(graph)
+            workflow.updated_by = account.id
+            workflow.updated_at = datetime.now(UTC).replace(tzinfo=None)
+            workflow.environment_variables = environment_variables
+            workflow.conversation_variables = conversation_variables
+            workflow.rag_pipeline_variables = rag_pipeline_variables_list
+        # commit db session changes
+        self._session.commit()
+
+        return pipeline
+
+    def export_rag_pipeline_dsl(self, pipeline: Pipeline, include_secret: bool = False) -> str:
+        """
+        Export pipeline
+        :param pipeline: Pipeline instance
+        :param include_secret: Whether include secret variable
+        :return:
+        """
+        dataset = pipeline.retrieve_dataset(session=self._session)
+        if not dataset:
+            raise ValueError("Missing dataset for rag pipeline")
+        icon_info = dataset.icon_info
+        export_data = {
+            "version": CURRENT_DSL_VERSION,
+            "kind": "rag_pipeline",
+            "rag_pipeline": {
+                "name": dataset.name,
+                "icon": icon_info.get("icon", "📙") if icon_info else "📙",
+                "icon_type": icon_info.get("icon_type", "emoji") if icon_info else "emoji",
+                "icon_background": icon_info.get("icon_background", "#FFEAD5") if icon_info else "#FFEAD5",
+                "icon_url": icon_info.get("icon_url") if icon_info else None,
+                "description": pipeline.description,
+            },
+        }
+
+        self._append_workflow_export_data(export_data=export_data, pipeline=pipeline, include_secret=include_secret)
+
+        return yaml.dump(export_data, allow_unicode=True)  # type: ignore
+
+    def _append_workflow_export_data(self, *, export_data: dict, pipeline: Pipeline, include_secret: bool) -> None:
+        """
+        Append workflow export data
+        :param export_data: export data
+        :param pipeline: Pipeline instance
+        """
+
+        workflow = (
+            self._session.query(Workflow)
+            .where(
+                Workflow.tenant_id == pipeline.tenant_id,
+                Workflow.app_id == pipeline.id,
+                Workflow.version == "draft",
+            )
+            .first()
+        )
+        if not workflow:
+            raise ValueError("Missing draft workflow configuration, please check.")
+
+        workflow_dict = workflow.to_dict(include_secret=include_secret)
+        for node in workflow_dict.get("graph", {}).get("nodes", []):
+            node_data = node.get("data", {})
+            if not node_data:
+                continue
+            data_type = node_data.get("type", "")
+            if data_type == NodeType.KNOWLEDGE_RETRIEVAL:
+                dataset_ids = node_data.get("dataset_ids", [])
+                node["data"]["dataset_ids"] = [
+                    self.encrypt_dataset_id(dataset_id=dataset_id, tenant_id=pipeline.tenant_id)
+                    for dataset_id in dataset_ids
+                ]
+            # filter credential id from tool node
+            if not include_secret and data_type == NodeType.TOOL:
+                node_data.pop("credential_id", None)
+            # filter credential id from agent node
+            if not include_secret and data_type == NodeType.AGENT:
+                for tool in node_data.get("agent_parameters", {}).get("tools", {}).get("value", []):
+                    tool.pop("credential_id", None)
+
+        export_data["workflow"] = workflow_dict
+        dependencies = self._extract_dependencies_from_workflow(workflow)
+        export_data["dependencies"] = [
+            jsonable_encoder(d.model_dump())
+            for d in DependenciesAnalysisService.generate_dependencies(
+                tenant_id=pipeline.tenant_id, dependencies=dependencies
+            )
+        ]
+
+    def _extract_dependencies_from_workflow(self, workflow: Workflow) -> list[str]:
+        """
+        Extract dependencies from workflow
+        :param workflow: Workflow instance
+        :return: dependencies list format like ["langgenius/google"]
+        """
+        graph = workflow.graph_dict
+        dependencies = self._extract_dependencies_from_workflow_graph(graph)
+        return dependencies
+
+    def _extract_dependencies_from_workflow_graph(self, graph: Mapping) -> list[str]:
+        """
+        Extract dependencies from workflow graph
+        :param graph: Workflow graph
+        :return: dependencies list format like ["langgenius/google"]
+        """
+        dependencies = []
+        for node in graph.get("nodes", []):
+            try:
+                typ = node.get("data", {}).get("type")
+                match typ:
+                    case NodeType.TOOL:
+                        tool_entity = ToolNodeData.model_validate(node["data"])
+                        dependencies.append(
+                            DependenciesAnalysisService.analyze_tool_dependency(tool_entity.provider_id),
+                        )
+                    case NodeType.DATASOURCE:
+                        datasource_entity = DatasourceNodeData.model_validate(node["data"])
+                        if datasource_entity.provider_type != "local_file":
+                            dependencies.append(datasource_entity.plugin_id)
+                    case NodeType.LLM:
+                        llm_entity = LLMNodeData.model_validate(node["data"])
+                        dependencies.append(
+                            DependenciesAnalysisService.analyze_model_provider_dependency(llm_entity.model.provider),
+                        )
+                    case NodeType.QUESTION_CLASSIFIER:
+                        question_classifier_entity = QuestionClassifierNodeData.model_validate(node["data"])
+                        dependencies.append(
+                            DependenciesAnalysisService.analyze_model_provider_dependency(
+                                question_classifier_entity.model.provider
+                            ),
+                        )
+                    case NodeType.PARAMETER_EXTRACTOR:
+                        parameter_extractor_entity = ParameterExtractorNodeData.model_validate(node["data"])
+                        dependencies.append(
+                            DependenciesAnalysisService.analyze_model_provider_dependency(
+                                parameter_extractor_entity.model.provider
+                            ),
+                        )
+                    case NodeType.KNOWLEDGE_INDEX:
+                        knowledge_index_entity = KnowledgeConfiguration.model_validate(node["data"])
+                        if knowledge_index_entity.indexing_technique == "high_quality":
+                            if knowledge_index_entity.embedding_model_provider:
+                                dependencies.append(
+                                    DependenciesAnalysisService.analyze_model_provider_dependency(
+                                        knowledge_index_entity.embedding_model_provider
+                                    ),
+                                )
+                        if knowledge_index_entity.retrieval_model.reranking_mode == "reranking_model":
+                            if knowledge_index_entity.retrieval_model.reranking_enable:
+                                if (
+                                    knowledge_index_entity.retrieval_model.reranking_model
+                                    and knowledge_index_entity.retrieval_model.reranking_mode == "reranking_model"
+                                ):
+                                    if knowledge_index_entity.retrieval_model.reranking_model.reranking_provider_name:
+                                        dependencies.append(
+                                            DependenciesAnalysisService.analyze_model_provider_dependency(
+                                                knowledge_index_entity.retrieval_model.reranking_model.reranking_provider_name
+                                            ),
+                                        )
+                    case NodeType.KNOWLEDGE_RETRIEVAL:
+                        knowledge_retrieval_entity = KnowledgeRetrievalNodeData.model_validate(node["data"])
+                        if knowledge_retrieval_entity.retrieval_mode == "multiple":
+                            if knowledge_retrieval_entity.multiple_retrieval_config:
+                                if (
+                                    knowledge_retrieval_entity.multiple_retrieval_config.reranking_mode
+                                    == "reranking_model"
+                                ):
+                                    if knowledge_retrieval_entity.multiple_retrieval_config.reranking_model:
+                                        dependencies.append(
+                                            DependenciesAnalysisService.analyze_model_provider_dependency(
+                                                knowledge_retrieval_entity.multiple_retrieval_config.reranking_model.provider
+                                            ),
+                                        )
+                                elif (
+                                    knowledge_retrieval_entity.multiple_retrieval_config.reranking_mode
+                                    == "weighted_score"
+                                ):
+                                    if knowledge_retrieval_entity.multiple_retrieval_config.weights:
+                                        vector_setting = (
+                                            knowledge_retrieval_entity.multiple_retrieval_config.weights.vector_setting
+                                        )
+                                        dependencies.append(
+                                            DependenciesAnalysisService.analyze_model_provider_dependency(
+                                                vector_setting.embedding_provider_name
+                                            ),
+                                        )
+                        elif knowledge_retrieval_entity.retrieval_mode == "single":
+                            model_config = knowledge_retrieval_entity.single_retrieval_config
+                            if model_config:
+                                dependencies.append(
+                                    DependenciesAnalysisService.analyze_model_provider_dependency(
+                                        model_config.model.provider
+                                    ),
+                                )
+                    case _:
+                        # TODO: Handle default case or unknown node types
+                        pass
+            except Exception as e:
+                logger.exception("Error extracting node dependency", exc_info=e)
+
+        return dependencies
+
+    @classmethod
+    def _extract_dependencies_from_model_config(cls, model_config: Mapping) -> list[str]:
+        """
+        Extract dependencies from model config
+        :param model_config: model config dict
+        :return: dependencies list format like ["langgenius/google"]
+        """
+        dependencies = []
+
+        try:
+            # completion model
+            model_dict = model_config.get("model", {})
+            if model_dict:
+                dependencies.append(
+                    DependenciesAnalysisService.analyze_model_provider_dependency(model_dict.get("provider", ""))
+                )
+
+            # reranking model
+            dataset_configs = model_config.get("dataset_configs", {})
+            if dataset_configs:
+                for dataset_config in dataset_configs.get("datasets", {}).get("datasets", []):
+                    if dataset_config.get("reranking_model"):
+                        dependencies.append(
+                            DependenciesAnalysisService.analyze_model_provider_dependency(
+                                dataset_config.get("reranking_model", {})
+                                .get("reranking_provider_name", {})
+                                .get("provider")
+                            )
+                        )
+
+            # tools
+            agent_configs = model_config.get("agent_mode", {})
+            if agent_configs:
+                for agent_config in agent_configs.get("tools", []):
+                    dependencies.append(
+                        DependenciesAnalysisService.analyze_tool_dependency(agent_config.get("provider_id"))
+                    )
+
+        except Exception as e:
+            logger.exception("Error extracting model config dependency", exc_info=e)
+
+        return dependencies
+
+    @classmethod
+    def get_leaked_dependencies(cls, tenant_id: str, dsl_dependencies: list[dict]) -> list[PluginDependency]:
+        """
+        Returns the leaked dependencies in current workspace
+        """
+        dependencies = [PluginDependency.model_validate(dep) for dep in dsl_dependencies]
+        if not dependencies:
+            return []
+
+        return DependenciesAnalysisService.get_leaked_dependencies(tenant_id=tenant_id, dependencies=dependencies)
+
+    def _generate_aes_key(self, tenant_id: str) -> bytes:
+        """Generate AES key based on tenant_id"""
+        return hashlib.sha256(tenant_id.encode()).digest()
+
+    def encrypt_dataset_id(self, dataset_id: str, tenant_id: str) -> str:
+        """Encrypt dataset_id using AES-CBC mode"""
+        key = self._generate_aes_key(tenant_id)
+        iv = key[:16]
+        cipher = AES.new(key, AES.MODE_CBC, iv)
+        ct_bytes = cipher.encrypt(pad(dataset_id.encode(), AES.block_size))
+        return base64.b64encode(ct_bytes).decode()
+
+    def decrypt_dataset_id(self, encrypted_data: str, tenant_id: str) -> str | None:
+        """AES decryption"""
+        try:
+            key = self._generate_aes_key(tenant_id)
+            iv = key[:16]
+            cipher = AES.new(key, AES.MODE_CBC, iv)
+            pt = unpad(cipher.decrypt(base64.b64decode(encrypted_data)), AES.block_size)
+            return pt.decode()
+        except Exception:
+            return None
+
+    def create_rag_pipeline_dataset(
+        self,
+        tenant_id: str,
+        rag_pipeline_dataset_create_entity: RagPipelineDatasetCreateEntity,
+    ):
+        if rag_pipeline_dataset_create_entity.name:
+            # check if dataset name already exists
+            if (
+                self._session.query(Dataset)
+                .filter_by(name=rag_pipeline_dataset_create_entity.name, tenant_id=tenant_id)
+                .first()
+            ):
+                raise ValueError(f"Dataset with name {rag_pipeline_dataset_create_entity.name} already exists.")
+        else:
+            # generate a random name as Untitled 1 2 3 ...
+            datasets = self._session.query(Dataset).filter_by(tenant_id=tenant_id).all()
+            names = [dataset.name for dataset in datasets]
+            rag_pipeline_dataset_create_entity.name = generate_incremental_name(
+                names,
+                "Untitled",
+            )
+
+        account = cast(Account, current_user)
+        rag_pipeline_import_info: RagPipelineImportInfo = self.import_rag_pipeline(
+            account=account,
+            import_mode=ImportMode.YAML_CONTENT,
+            yaml_content=rag_pipeline_dataset_create_entity.yaml_content,
+            dataset=None,
+            dataset_name=rag_pipeline_dataset_create_entity.name,
+            icon_info=rag_pipeline_dataset_create_entity.icon_info,
+        )
+        return {
+            "id": rag_pipeline_import_info.id,
+            "dataset_id": rag_pipeline_import_info.dataset_id,
+            "pipeline_id": rag_pipeline_import_info.pipeline_id,
+            "status": rag_pipeline_import_info.status,
+            "imported_dsl_version": rag_pipeline_import_info.imported_dsl_version,
+            "current_dsl_version": rag_pipeline_import_info.current_dsl_version,
+            "error": rag_pipeline_import_info.error,
+        }
--- a/dify/api/services/rag_pipeline/rag_pipeline_manage_service.py
+++ b/dify/api/services/rag_pipeline/rag_pipeline_manage_service.py
@@ -0,0 +1,23 @@
+from core.plugin.entities.plugin_daemon import PluginDatasourceProviderEntity
+from core.plugin.impl.datasource import PluginDatasourceManager
+from services.datasource_provider_service import DatasourceProviderService
+
+
+class RagPipelineManageService:
+    @staticmethod
+    def list_rag_pipeline_datasources(tenant_id: str) -> list[PluginDatasourceProviderEntity]:
+        """
+        list rag pipeline datasources
+        """
+
+        # get all builtin providers
+        manager = PluginDatasourceManager()
+        datasources = manager.fetch_datasource_providers(tenant_id)
+        for datasource in datasources:
+            datasource_provider_service = DatasourceProviderService()
+            credentials = datasource_provider_service.get_datasource_credentials(
+                tenant_id=tenant_id, provider=datasource.provider, plugin_id=datasource.plugin_id
+            )
+            if credentials:
+                datasource.is_authorized = True
+        return datasources
--- a/dify/api/services/rag_pipeline/rag_pipeline_task_proxy.py
+++ b/dify/api/services/rag_pipeline/rag_pipeline_task_proxy.py
@@ -0,0 +1,106 @@
+import json
+import logging
+from collections.abc import Callable, Sequence
+from functools import cached_property
+
+from core.app.entities.rag_pipeline_invoke_entities import RagPipelineInvokeEntity
+from core.rag.pipeline.queue import TenantIsolatedTaskQueue
+from enums.cloud_plan import CloudPlan
+from extensions.ext_database import db
+from services.feature_service import FeatureService
+from services.file_service import FileService
+from tasks.rag_pipeline.priority_rag_pipeline_run_task import priority_rag_pipeline_run_task
+from tasks.rag_pipeline.rag_pipeline_run_task import rag_pipeline_run_task
+
+logger = logging.getLogger(__name__)
+
+
+class RagPipelineTaskProxy:
+    # Default uploaded file name for rag pipeline invoke entities
+    _RAG_PIPELINE_INVOKE_ENTITIES_FILE_NAME = "rag_pipeline_invoke_entities.json"
+
+    def __init__(
+        self, dataset_tenant_id: str, user_id: str, rag_pipeline_invoke_entities: Sequence[RagPipelineInvokeEntity]
+    ):
+        self._dataset_tenant_id = dataset_tenant_id
+        self._user_id = user_id
+        self._rag_pipeline_invoke_entities = rag_pipeline_invoke_entities
+        self._tenant_isolated_task_queue = TenantIsolatedTaskQueue(dataset_tenant_id, "pipeline")
+
+    @cached_property
+    def features(self):
+        return FeatureService.get_features(self._dataset_tenant_id)
+
+    def _upload_invoke_entities(self) -> str:
+        text = [item.model_dump() for item in self._rag_pipeline_invoke_entities]
+        # Convert list to proper JSON string
+        json_text = json.dumps(text)
+        upload_file = FileService(db.engine).upload_text(
+            json_text, self._RAG_PIPELINE_INVOKE_ENTITIES_FILE_NAME, self._user_id, self._dataset_tenant_id
+        )
+        return upload_file.id
+
+    def _send_to_direct_queue(self, upload_file_id: str, task_func: Callable[[str, str], None]):
+        logger.info("send file %s to direct queue", upload_file_id)
+        task_func.delay(  # type: ignore
+            rag_pipeline_invoke_entities_file_id=upload_file_id,
+            tenant_id=self._dataset_tenant_id,
+        )
+
+    def _send_to_tenant_queue(self, upload_file_id: str, task_func: Callable[[str, str], None]):
+        logger.info("send file %s to tenant queue", upload_file_id)
+        if self._tenant_isolated_task_queue.get_task_key():
+            # Add to waiting queue using List operations (lpush)
+            self._tenant_isolated_task_queue.push_tasks([upload_file_id])
+            logger.info("push tasks: %s", upload_file_id)
+        else:
+            # Set flag and execute task
+            self._tenant_isolated_task_queue.set_task_waiting_time()
+            task_func.delay(  # type: ignore
+                rag_pipeline_invoke_entities_file_id=upload_file_id,
+                tenant_id=self._dataset_tenant_id,
+            )
+            logger.info("init tasks: %s", upload_file_id)
+
+    def _send_to_default_tenant_queue(self, upload_file_id: str):
+        self._send_to_tenant_queue(upload_file_id, rag_pipeline_run_task)
+
+    def _send_to_priority_tenant_queue(self, upload_file_id: str):
+        self._send_to_tenant_queue(upload_file_id, priority_rag_pipeline_run_task)
+
+    def _send_to_priority_direct_queue(self, upload_file_id: str):
+        self._send_to_direct_queue(upload_file_id, priority_rag_pipeline_run_task)
+
+    def _dispatch(self):
+        upload_file_id = self._upload_invoke_entities()
+        if not upload_file_id:
+            raise ValueError("upload_file_id is empty")
+
+        logger.info(
+            "dispatch args: %s - %s - %s",
+            self._dataset_tenant_id,
+            self.features.billing.enabled,
+            self.features.billing.subscription.plan,
+        )
+
+        # dispatch to different pipeline queue with tenant isolation when billing enabled
+        if self.features.billing.enabled:
+            if self.features.billing.subscription.plan == CloudPlan.SANDBOX:
+                # dispatch to normal pipeline queue with tenant isolation for sandbox plan
+                self._send_to_default_tenant_queue(upload_file_id)
+            else:
+                # dispatch to priority pipeline queue with tenant isolation for other plans
+                self._send_to_priority_tenant_queue(upload_file_id)
+        else:
+            # dispatch to priority pipeline queue without tenant isolation for others, e.g.: self-hosted or enterprise
+            self._send_to_priority_direct_queue(upload_file_id)
+
+    def delay(self):
+        if not self._rag_pipeline_invoke_entities:
+            logger.warning(
+                "Received empty rag pipeline invoke entities, no tasks delivered: %s %s",
+                self._dataset_tenant_id,
+                self._user_id,
+            )
+            return
+        self._dispatch()
--- a/dify/api/services/rag_pipeline/rag_pipeline_transform_service.py
+++ b/dify/api/services/rag_pipeline/rag_pipeline_transform_service.py
@@ -0,0 +1,387 @@
+import json
+import logging
+from datetime import UTC, datetime
+from pathlib import Path
+from uuid import uuid4
+
+import yaml
+from flask_login import current_user
+
+from constants import DOCUMENT_EXTENSIONS
+from core.plugin.impl.plugin import PluginInstaller
+from core.rag.retrieval.retrieval_methods import RetrievalMethod
+from extensions.ext_database import db
+from factories import variable_factory
+from models.dataset import Dataset, Document, DocumentPipelineExecutionLog, Pipeline
+from models.model import UploadFile
+from models.workflow import Workflow, WorkflowType
+from services.entities.knowledge_entities.rag_pipeline_entities import KnowledgeConfiguration, RetrievalSetting
+from services.plugin.plugin_migration import PluginMigration
+from services.plugin.plugin_service import PluginService
+
+logger = logging.getLogger(__name__)
+
+
+class RagPipelineTransformService:
+    def transform_dataset(self, dataset_id: str):
+        dataset = db.session.query(Dataset).where(Dataset.id == dataset_id).first()
+        if not dataset:
+            raise ValueError("Dataset not found")
+        if dataset.pipeline_id and dataset.runtime_mode == "rag_pipeline":
+            return {
+                "pipeline_id": dataset.pipeline_id,
+                "dataset_id": dataset_id,
+                "status": "success",
+            }
+        if dataset.provider != "vendor":
+            raise ValueError("External dataset is not supported")
+        datasource_type = dataset.data_source_type
+        indexing_technique = dataset.indexing_technique
+
+        if not datasource_type and not indexing_technique:
+            return self._transform_to_empty_pipeline(dataset)
+
+        doc_form = dataset.doc_form
+        if not doc_form:
+            return self._transform_to_empty_pipeline(dataset)
+        retrieval_model = dataset.retrieval_model
+        pipeline_yaml = self._get_transform_yaml(doc_form, datasource_type, indexing_technique)
+        # deal dependencies
+        self._deal_dependencies(pipeline_yaml, dataset.tenant_id)
+        # Extract app data
+        workflow_data = pipeline_yaml.get("workflow")
+        if not workflow_data:
+            raise ValueError("Missing workflow data for rag pipeline")
+        graph = workflow_data.get("graph", {})
+        nodes = graph.get("nodes", [])
+        new_nodes = []
+
+        for node in nodes:
+            if (
+                node.get("data", {}).get("type") == "datasource"
+                and node.get("data", {}).get("provider_type") == "local_file"
+            ):
+                node = self._deal_file_extensions(node)
+            if node.get("data", {}).get("type") == "knowledge-index":
+                node = self._deal_knowledge_index(dataset, doc_form, indexing_technique, retrieval_model, node)
+            new_nodes.append(node)
+        if new_nodes:
+            graph["nodes"] = new_nodes
+            workflow_data["graph"] = graph
+            pipeline_yaml["workflow"] = workflow_data
+        # create pipeline
+        pipeline = self._create_pipeline(pipeline_yaml)
+
+        # save chunk structure to dataset
+        if doc_form == "hierarchical_model":
+            dataset.chunk_structure = "hierarchical_model"
+        elif doc_form == "text_model":
+            dataset.chunk_structure = "text_model"
+        else:
+            raise ValueError("Unsupported doc form")
+
+        dataset.runtime_mode = "rag_pipeline"
+        dataset.pipeline_id = pipeline.id
+
+        # deal document data
+        self._deal_document_data(dataset)
+
+        db.session.commit()
+        return {
+            "pipeline_id": pipeline.id,
+            "dataset_id": dataset_id,
+            "status": "success",
+        }
+
+    def _get_transform_yaml(self, doc_form: str, datasource_type: str, indexing_technique: str | None):
+        pipeline_yaml = {}
+        if doc_form == "text_model":
+            match datasource_type:
+                case "upload_file":
+                    if indexing_technique == "high_quality":
+                        # get graph from transform.file-general-high-quality.yml
+                        with open(f"{Path(__file__).parent}/transform/file-general-high-quality.yml") as f:
+                            pipeline_yaml = yaml.safe_load(f)
+                    if indexing_technique == "economy":
+                        # get graph from transform.file-general-economy.yml
+                        with open(f"{Path(__file__).parent}/transform/file-general-economy.yml") as f:
+                            pipeline_yaml = yaml.safe_load(f)
+                case "notion_import":
+                    if indexing_technique == "high_quality":
+                        # get graph from transform.notion-general-high-quality.yml
+                        with open(f"{Path(__file__).parent}/transform/notion-general-high-quality.yml") as f:
+                            pipeline_yaml = yaml.safe_load(f)
+                    if indexing_technique == "economy":
+                        # get graph from transform.notion-general-economy.yml
+                        with open(f"{Path(__file__).parent}/transform/notion-general-economy.yml") as f:
+                            pipeline_yaml = yaml.safe_load(f)
+                case "website_crawl":
+                    if indexing_technique == "high_quality":
+                        # get graph from transform.website-crawl-general-high-quality.yml
+                        with open(f"{Path(__file__).parent}/transform/website-crawl-general-high-quality.yml") as f:
+                            pipeline_yaml = yaml.safe_load(f)
+                    if indexing_technique == "economy":
+                        # get graph from transform.website-crawl-general-economy.yml
+                        with open(f"{Path(__file__).parent}/transform/website-crawl-general-economy.yml") as f:
+                            pipeline_yaml = yaml.safe_load(f)
+                case _:
+                    raise ValueError("Unsupported datasource type")
+        elif doc_form == "hierarchical_model":
+            match datasource_type:
+                case "upload_file":
+                    # get graph from transform.file-parentchild.yml
+                    with open(f"{Path(__file__).parent}/transform/file-parentchild.yml") as f:
+                        pipeline_yaml = yaml.safe_load(f)
+                case "notion_import":
+                    # get graph from transform.notion-parentchild.yml
+                    with open(f"{Path(__file__).parent}/transform/notion-parentchild.yml") as f:
+                        pipeline_yaml = yaml.safe_load(f)
+                case "website_crawl":
+                    # get graph from transform.website-crawl-parentchild.yml
+                    with open(f"{Path(__file__).parent}/transform/website-crawl-parentchild.yml") as f:
+                        pipeline_yaml = yaml.safe_load(f)
+                case _:
+                    raise ValueError("Unsupported datasource type")
+        else:
+            raise ValueError("Unsupported doc form")
+        return pipeline_yaml
+
+    def _deal_file_extensions(self, node: dict):
+        file_extensions = node.get("data", {}).get("fileExtensions", [])
+        if not file_extensions:
+            return node
+        node["data"]["fileExtensions"] = [ext.lower() for ext in file_extensions if ext in DOCUMENT_EXTENSIONS]
+        return node
+
+    def _deal_knowledge_index(
+        self, dataset: Dataset, doc_form: str, indexing_technique: str | None, retrieval_model: dict, node: dict
+    ):
+        knowledge_configuration_dict = node.get("data", {})
+        knowledge_configuration = KnowledgeConfiguration.model_validate(knowledge_configuration_dict)
+
+        if indexing_technique == "high_quality":
+            knowledge_configuration.embedding_model = dataset.embedding_model
+            knowledge_configuration.embedding_model_provider = dataset.embedding_model_provider
+        if retrieval_model:
+            retrieval_setting = RetrievalSetting.model_validate(retrieval_model)
+            if indexing_technique == "economy":
+                retrieval_setting.search_method = RetrievalMethod.KEYWORD_SEARCH
+            knowledge_configuration.retrieval_model = retrieval_setting
+        else:
+            dataset.retrieval_model = knowledge_configuration.retrieval_model.model_dump()
+
+        knowledge_configuration_dict.update(knowledge_configuration.model_dump())
+        node["data"] = knowledge_configuration_dict
+        return node
+
+    def _create_pipeline(
+        self,
+        data: dict,
+    ) -> Pipeline:
+        """Create a new app or update an existing one."""
+        pipeline_data = data.get("rag_pipeline", {})
+        # Initialize pipeline based on mode
+        workflow_data = data.get("workflow")
+        if not workflow_data or not isinstance(workflow_data, dict):
+            raise ValueError("Missing workflow data for rag pipeline")
+
+        environment_variables_list = workflow_data.get("environment_variables", [])
+        environment_variables = [
+            variable_factory.build_environment_variable_from_mapping(obj) for obj in environment_variables_list
+        ]
+        conversation_variables_list = workflow_data.get("conversation_variables", [])
+        conversation_variables = [
+            variable_factory.build_conversation_variable_from_mapping(obj) for obj in conversation_variables_list
+        ]
+        rag_pipeline_variables_list = workflow_data.get("rag_pipeline_variables", [])
+
+        graph = workflow_data.get("graph", {})
+
+        # Create new app
+        pipeline = Pipeline(
+            tenant_id=current_user.current_tenant_id,
+            name=pipeline_data.get("name", ""),
+            description=pipeline_data.get("description", ""),
+            created_by=current_user.id,
+            updated_by=current_user.id,
+            is_published=True,
+            is_public=True,
+        )
+        pipeline.id = str(uuid4())
+
+        db.session.add(pipeline)
+        db.session.flush()
+        # create draft workflow
+        draft_workflow = Workflow(
+            tenant_id=pipeline.tenant_id,
+            app_id=pipeline.id,
+            features="{}",
+            type=WorkflowType.RAG_PIPELINE,
+            version="draft",
+            graph=json.dumps(graph),
+            created_by=current_user.id,
+            environment_variables=environment_variables,
+            conversation_variables=conversation_variables,
+            rag_pipeline_variables=rag_pipeline_variables_list,
+        )
+        published_workflow = Workflow(
+            tenant_id=pipeline.tenant_id,
+            app_id=pipeline.id,
+            features="{}",
+            type=WorkflowType.RAG_PIPELINE,
+            version=str(datetime.now(UTC).replace(tzinfo=None)),
+            graph=json.dumps(graph),
+            created_by=current_user.id,
+            environment_variables=environment_variables,
+            conversation_variables=conversation_variables,
+            rag_pipeline_variables=rag_pipeline_variables_list,
+        )
+        db.session.add(draft_workflow)
+        db.session.add(published_workflow)
+        db.session.flush()
+        pipeline.workflow_id = published_workflow.id
+        db.session.add(pipeline)
+        return pipeline
+
+    def _deal_dependencies(self, pipeline_yaml: dict, tenant_id: str):
+        installer_manager = PluginInstaller()
+        installed_plugins = installer_manager.list_plugins(tenant_id)
+
+        plugin_migration = PluginMigration()
+
+        installed_plugins_ids = [plugin.plugin_id for plugin in installed_plugins]
+        dependencies = pipeline_yaml.get("dependencies", [])
+        need_install_plugin_unique_identifiers = []
+        for dependency in dependencies:
+            if dependency.get("type") == "marketplace":
+                plugin_unique_identifier = dependency.get("value", {}).get("plugin_unique_identifier")
+                plugin_id = plugin_unique_identifier.split(":")[0]
+                if plugin_id not in installed_plugins_ids:
+                    plugin_unique_identifier = plugin_migration._fetch_plugin_unique_identifier(plugin_id)  # type: ignore
+                    if plugin_unique_identifier:
+                        need_install_plugin_unique_identifiers.append(plugin_unique_identifier)
+        if need_install_plugin_unique_identifiers:
+            logger.debug("Installing missing pipeline plugins %s", need_install_plugin_unique_identifiers)
+            PluginService.install_from_marketplace_pkg(tenant_id, need_install_plugin_unique_identifiers)
+
+    def _transform_to_empty_pipeline(self, dataset: Dataset):
+        pipeline = Pipeline(
+            tenant_id=dataset.tenant_id,
+            name=dataset.name,
+            description=dataset.description,
+            created_by=current_user.id,
+        )
+        db.session.add(pipeline)
+        db.session.flush()
+
+        dataset.pipeline_id = pipeline.id
+        dataset.runtime_mode = "rag_pipeline"
+        dataset.updated_by = current_user.id
+        dataset.updated_at = datetime.now(UTC).replace(tzinfo=None)
+        db.session.add(dataset)
+        db.session.commit()
+        return {
+            "pipeline_id": pipeline.id,
+            "dataset_id": dataset.id,
+            "status": "success",
+        }
+
+    def _deal_document_data(self, dataset: Dataset):
+        file_node_id = "1752479895761"
+        notion_node_id = "1752489759475"
+        jina_node_id = "1752491761974"
+        firecrawl_node_id = "1752565402678"
+
+        documents = db.session.query(Document).where(Document.dataset_id == dataset.id).all()
+
+        for document in documents:
+            data_source_info_dict = document.data_source_info_dict
+            if not data_source_info_dict:
+                continue
+            if document.data_source_type == "upload_file":
+                document.data_source_type = "local_file"
+                file_id = data_source_info_dict.get("upload_file_id")
+                if file_id:
+                    file = db.session.query(UploadFile).where(UploadFile.id == file_id).first()
+                    if file:
+                        data_source_info = json.dumps(
+                            {
+                                "real_file_id": file_id,
+                                "name": file.name,
+                                "size": file.size,
+                                "extension": file.extension,
+                                "mime_type": file.mime_type,
+                                "url": "",
+                                "transfer_method": "local_file",
+                            }
+                        )
+                        document.data_source_info = data_source_info
+                        document_pipeline_execution_log = DocumentPipelineExecutionLog(
+                            document_id=document.id,
+                            pipeline_id=dataset.pipeline_id,
+                            datasource_type="local_file",
+                            datasource_info=data_source_info,
+                            input_data={},
+                            created_by=document.created_by,
+                            datasource_node_id=file_node_id,
+                        )
+                        document_pipeline_execution_log.created_at = document.created_at
+                        db.session.add(document)
+                        db.session.add(document_pipeline_execution_log)
+            elif document.data_source_type == "notion_import":
+                document.data_source_type = "online_document"
+                data_source_info = json.dumps(
+                    {
+                        "workspace_id": data_source_info_dict.get("notion_workspace_id"),
+                        "page": {
+                            "page_id": data_source_info_dict.get("notion_page_id"),
+                            "page_name": document.name,
+                            "page_icon": data_source_info_dict.get("notion_page_icon"),
+                            "type": data_source_info_dict.get("type"),
+                            "last_edited_time": data_source_info_dict.get("last_edited_time"),
+                            "parent_id": None,
+                        },
+                    }
+                )
+                document.data_source_info = data_source_info
+                document_pipeline_execution_log = DocumentPipelineExecutionLog(
+                    document_id=document.id,
+                    pipeline_id=dataset.pipeline_id,
+                    datasource_type="online_document",
+                    datasource_info=data_source_info,
+                    input_data={},
+                    created_by=document.created_by,
+                    datasource_node_id=notion_node_id,
+                )
+                document_pipeline_execution_log.created_at = document.created_at
+                db.session.add(document)
+                db.session.add(document_pipeline_execution_log)
+            elif document.data_source_type == "website_crawl":
+                document.data_source_type = "website_crawl"
+                data_source_info = json.dumps(
+                    {
+                        "source_url": data_source_info_dict.get("url"),
+                        "content": "",
+                        "title": document.name,
+                        "description": "",
+                    }
+                )
+                document.data_source_info = data_source_info
+                if data_source_info_dict.get("provider") == "firecrawl":
+                    datasource_node_id = firecrawl_node_id
+                elif data_source_info_dict.get("provider") == "jinareader":
+                    datasource_node_id = jina_node_id
+                else:
+                    continue
+                document_pipeline_execution_log = DocumentPipelineExecutionLog(
+                    document_id=document.id,
+                    pipeline_id=dataset.pipeline_id,
+                    datasource_type="website_crawl",
+                    datasource_info=data_source_info,
+                    input_data={},
+                    created_by=document.created_by,
+                    datasource_node_id=datasource_node_id,
+                )
+                document_pipeline_execution_log.created_at = document.created_at
+                db.session.add(document)
+                db.session.add(document_pipeline_execution_log)
--- a/dify/api/services/rag_pipeline/transform/file-general-economy.yml
+++ b/dify/api/services/rag_pipeline/transform/file-general-economy.yml
@@ -0,0 +1,709 @@
+dependencies:
+- current_identifier: null
+  type: marketplace
+  value:
+    plugin_unique_identifier: langgenius/general_chunker:0.0.1@e3da408b7277866404c3f884d599261f9d0b9003ea4ef7eb3b64489bdf39d18b
+- current_identifier: null
+  type: marketplace
+  value:
+    plugin_unique_identifier: langgenius/dify_extractor:0.0.1@50103421d4e002f059b662d21ad2d7a1cf34869abdbe320299d7e382516ebb1c
+kind: rag_pipeline
+rag_pipeline:
+  description: ''
+  icon: 📙
+  icon_background: ''
+  icon_type: emoji
+  name: file-general-economy
+version: 0.1.0
+workflow:
+  conversation_variables: []
+  environment_variables: []
+  features: {}
+  graph:
+    edges:
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: datasource
+        targetType: if-else
+      id: 1752479895761-source-1752481129417-target
+      source: '1752479895761'
+      sourceHandle: source
+      target: '1752481129417'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInLoop: false
+        sourceType: if-else
+        targetType: tool
+      id: 1752481129417-24e47cad-f1e2-4f74-9884-3f49d5bb37b7-1752480460682-target
+      source: '1752481129417'
+      sourceHandle: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
+      target: '1752480460682'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInLoop: false
+        sourceType: if-else
+        targetType: document-extractor
+      id: 1752481129417-false-1752481112180-target
+      source: '1752481129417'
+      sourceHandle: 'false'
+      target: '1752481112180'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: tool
+        targetType: variable-aggregator
+      id: 1752480460682-source-1752482022496-target
+      source: '1752480460682'
+      sourceHandle: source
+      target: '1752482022496'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInLoop: false
+        sourceType: document-extractor
+        targetType: variable-aggregator
+      id: 1752481112180-source-1752482022496-target
+      source: '1752481112180'
+      sourceHandle: source
+      target: '1752482022496'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: variable-aggregator
+        targetType: tool
+      id: 1752482022496-source-1752482151668-target
+      source: '1752482022496'
+      sourceHandle: source
+      target: '1752482151668'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: tool
+        targetType: knowledge-index
+      id: 1752482151668-source-1752477924228-target
+      source: '1752482151668'
+      sourceHandle: source
+      target: '1752477924228'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    nodes:
+    - data:
+        chunk_structure: text_model
+        embedding_model: text-embedding-ada-002
+        embedding_model_provider: langgenius/openai/openai
+        index_chunk_variable_selector:
+        - '1752482151668'
+        - result
+        indexing_technique: economy
+        keyword_number: 10
+        retrieval_model:
+          score_threshold: 0.5
+          score_threshold_enabled: false
+          search_method: keyword_search
+          top_k: 3
+          vector_setting:
+            embedding_model_name: text-embedding-ada-002
+            embedding_provider_name: langgenius/openai/openai
+        selected: true
+        title: Knowledge Base
+        type: knowledge-index
+      height: 114
+      id: '1752477924228'
+      position:
+        x: 1076.4656678451215
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1076.4656678451215
+        y: 281.3910724383104
+      selected: true
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        datasource_configurations: {}
+        datasource_label: File
+        datasource_name: upload-file
+        datasource_parameters: {}
+        fileExtensions:
+        - txt
+        - markdown
+        - mdx
+        - pdf
+        - html
+        - xlsx
+        - xls
+        - vtt
+        - properties
+        - doc
+        - docx
+        - csv
+        - eml
+        - msg
+        - pptx
+        - xml
+        - epub
+        - ppt
+        - md
+        plugin_id: langgenius/file
+        provider_name: file
+        provider_type: local_file
+        selected: false
+        title: File
+        type: datasource
+      height: 52
+      id: '1752479895761'
+      position:
+        x: -839.8603427660498
+        y: 251.3910724383104
+      positionAbsolute:
+        x: -839.8603427660498
+        y: 251.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        is_team_authorization: true
+        output_schema:
+          properties:
+            documents:
+              description: the documents extracted from the file
+              items:
+                type: object
+              type: array
+            images:
+              description: The images extracted from the file
+              items:
+                type: object
+              type: array
+          type: object
+        paramSchemas:
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: the file to be parsed(support pdf, ppt, pptx, doc, docx, png, jpg,
+              jpeg)
+            ja_JP: 解析するファイル(pdf, ppt, pptx, doc, docx, png, jpg, jpegをサポート)
+            pt_BR: o arquivo a ser analisado (suporta pdf, ppt, pptx, doc, docx, png,
+              jpg, jpeg)
+            zh_Hans: 用于解析的文件(支持 pdf, ppt, pptx, doc, docx, png, jpg, jpeg)
+          label:
+            en_US: file
+            ja_JP: ファイル
+            pt_BR: arquivo
+            zh_Hans: file
+          llm_description: the file to be parsed (support pdf, ppt, pptx, doc, docx,
+            png, jpg, jpeg)
+          max: null
+          min: null
+          name: file
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: file
+        params:
+          file: ''
+        provider_id: langgenius/dify_extractor/dify_extractor
+        provider_name: langgenius/dify_extractor/dify_extractor
+        provider_type: builtin
+        selected: false
+        title: Dify Extractor
+        tool_configurations: {}
+        tool_description: Dify Extractor
+        tool_label: Dify Extractor
+        tool_name: dify_extractor
+        tool_parameters:
+          file:
+            type: variable
+            value:
+            - '1752479895761'
+            - file
+        type: tool
+      height: 52
+      id: '1752480460682'
+      position:
+        x: -108.28652292656551
+        y: 281.3910724383104
+      positionAbsolute:
+        x: -108.28652292656551
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        is_array_file: false
+        selected: false
+        title: 文档提取器
+        type: document-extractor
+        variable_selector:
+        - '1752479895761'
+        - file
+      height: 90
+      id: '1752481112180'
+      position:
+        x: -108.28652292656551
+        y: 390.6576481692478
+      positionAbsolute:
+        x: -108.28652292656551
+        y: 390.6576481692478
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        cases:
+        - case_id: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
+          conditions:
+          - comparison_operator: is
+            id: 9da88d93-3ff6-463f-abfd-6bcafbf2554d
+            value: .xlsx
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: d0e88f5e-dfe3-4bae-af0c-dbec267500de
+            value: .xls
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: a957e91e-1ed7-4c6b-9c80-2f0948858f1d
+            value: .md
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: 870c3c39-8d3f-474a-ab8b-9c0ccf53db73
+            value: .markdown
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: f9541513-1e71-4dc1-9db5-35dc84a39e3c
+            value: .mdx
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: 4c7f455b-ac20-40ca-9495-6cc44ffcb35d
+            value: .html
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: 2e12d9c7-8057-4a09-8851-f9fd1d0718d1
+            value: .htm
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: 73a995a9-d8b9-4aef-89f7-306e2ddcbce2
+            value: .docx
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: 8a2e8772-0426-458b-a1f9-9eaaec0f27c8
+            value: .csv
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: aa2cb6b6-a2fc-462a-a9f5-c9c3f33a1602
+            value: .txt
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          id: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
+          logical_operator: or
+        selected: false
+        title: IF/ELSE
+        type: if-else
+      height: 358
+      id: '1752481129417'
+      position:
+        x: -489.57009543377865
+        y: 251.3910724383104
+      positionAbsolute:
+        x: -489.57009543377865
+        y: 251.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        advanced_settings:
+          group_enabled: false
+          groups:
+          - groupId: f4cf07b4-914d-4544-8ef8-0c5d9e4f21a7
+            group_name: Group1
+            output_type: string
+            variables:
+            - - '1752481112180'
+              - text
+            - - '1752480460682'
+              - text
+        output_type: string
+        selected: false
+        title: Variable Aggregator
+        type: variable-aggregator
+        variables:
+        - - '1752481112180'
+          - text
+        - - '1752480460682'
+          - text
+      height: 129
+      id: '1752482022496'
+      position:
+        x: 319.441649575055
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 319.441649575055
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        is_team_authorization: true
+        output_schema:
+          properties:
+            result:
+              description: The result of the general chunk tool.
+              properties:
+                general_chunks:
+                  items:
+                    description: The chunk of the text.
+                    type: string
+                  type: array
+              type: object
+          type: object
+        paramSchemas:
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The text you want to chunk.
+            ja_JP: チャンク化したいテキスト。
+            pt_BR: O texto que você deseja dividir.
+            zh_Hans: 你想要分块的文本。
+          label:
+            en_US: Input Variable
+            ja_JP: 入力変数
+            pt_BR: Variável de entrada
+            zh_Hans: 输入变量
+          llm_description: The text you want to chunk.
+          max: null
+          min: null
+          name: input_variable
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The delimiter of the chunks.
+            ja_JP: チャンクの区切り記号。
+            pt_BR: O delimitador dos blocos.
+            zh_Hans: 块的分隔符。
+          label:
+            en_US: Delimiter
+            ja_JP: 区切り記号
+            pt_BR: Delimitador
+            zh_Hans: 分隔符
+          llm_description: The delimiter of the chunks, the format of the delimiter
+            must be a string.
+          max: null
+          min: null
+          name: delimiter
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The maximum chunk length.
+            ja_JP: 最大長のチャンク。
+            pt_BR: O comprimento máximo do bloco
+            zh_Hans: 最大块的长度。
+          label:
+            en_US: Maximum Chunk Length
+            ja_JP: チャンク最大長
+            pt_BR: O comprimento máximo do bloco
+            zh_Hans: 最大块的长度
+          llm_description: The maximum chunk length, the format of the chunk size
+            must be an integer.
+          max: null
+          min: null
+          name: max_chunk_length
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The chunk overlap length.
+            ja_JP: チャンクの重複長
+            pt_BR: O comprimento de sobreposição dos fragmentos
+            zh_Hans: 块的重叠长度。
+          label:
+            en_US: Chunk Overlap Length
+            ja_JP: チャンク重複長
+            pt_BR: Comprimento de sobreposição do bloco
+            zh_Hans: 块的重叠长度
+          llm_description: The chunk overlap length, the format of the chunk overlap
+            length must be an integer.
+          max: null
+          min: null
+          name: chunk_overlap_length
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: Replace consecutive spaces, newlines and tabs
+            ja_JP: 連続のスペース、改行、またはタブを置換する
+            pt_BR: Substituir espaços consecutivos, novas linhas e tabulações
+            zh_Hans: 替换连续的空格、换行符和制表符
+          label:
+            en_US: Replace Consecutive Spaces, Newlines and Tabs
+            ja_JP: 連続のスペース、改行、またはタブを置換する
+            pt_BR: Substituir espaços consecutivos, novas linhas e tabulações
+            zh_Hans: 替换连续的空格、换行符和制表符
+          llm_description: Replace consecutive spaces, newlines and tabs, the format
+            of the replace must be a boolean.
+          max: null
+          min: null
+          name: replace_consecutive_spaces_newlines_tabs
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: Delete all URLs and email addresses
+            ja_JP: すべてのURLとメールアドレスを削除する
+            pt_BR: Excluir todos os URLs e endereços de e-mail
+            zh_Hans: 删除所有URL和电子邮件地址
+          label:
+            en_US: Delete All URLs and Email Addresses
+            ja_JP: すべてのURLとメールアドレスを削除する
+            pt_BR: Excluir todos os URLs e endereços de e-mail
+            zh_Hans: 删除所有URL和电子邮件地址
+          llm_description: Delete all URLs and email addresses, the format of the
+            delete must be a boolean.
+          max: null
+          min: null
+          name: delete_all_urls_and_email_addresses
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        params:
+          chunk_overlap_length: ''
+          delete_all_urls_and_email_addresses: ''
+          delimiter: ''
+          input_variable: ''
+          max_chunk_length: ''
+          replace_consecutive_spaces_newlines_tabs: ''
+        provider_id: langgenius/general_chunker/general_chunker
+        provider_name: langgenius/general_chunker/general_chunker
+        provider_type: builtin
+        selected: false
+        title: General Chunker
+        tool_configurations: {}
+        tool_description: A tool for general text chunking mode, the chunks retrieved and recalled are the same.
+        tool_label: General Chunker
+        tool_name: general_chunker
+        tool_parameters:
+          chunk_overlap_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - chunk_overlap
+          delete_all_urls_and_email_addresses:
+            type: mixed
+            value: '{{#rag.shared.delete_urls_email#}}'
+          delimiter:
+            type: mixed
+            value: '{{#rag.shared.delimiter#}}'
+          input_variable:
+            type: mixed
+            value: '{{#1752482022496.output#}}'
+          max_chunk_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - max_chunk_length
+          replace_consecutive_spaces_newlines_tabs:
+            type: mixed
+            value: '{{#rag.shared.replace_consecutive_spaces#}}'
+        type: tool
+      height: 52
+      id: '1752482151668'
+      position:
+        x: 693.5300771507484
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 693.5300771507484
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    viewport:
+      x: 701.4999626224237
+      y: 128.33739021504016
+      zoom: 0.48941689643726966
+  rag_pipeline_variables:
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: \n\n
+    label: Delimiter
+    max_length: 100
+    options: []
+    placeholder: null
+    required: true
+    tooltips: A delimiter is the character used to separate text. \n\n is recommended
+      for splitting the original document into large parent chunks. You can also use
+      special delimiters defined by yourself.
+    type: text-input
+    unit: null
+    variable: delimiter
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Maximum chunk length
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: characters
+    variable: max_chunk_length
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Chunk overlap
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: number
+    unit: characters
+    variable: chunk_overlap
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Replace consecutive spaces, newlines and tabs
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: replace_consecutive_spaces
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Delete all URLs and email addresses
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: delete_urls_email
--- a/dify/api/services/rag_pipeline/transform/file-general-high-quality.yml
+++ b/dify/api/services/rag_pipeline/transform/file-general-high-quality.yml
@@ -0,0 +1,709 @@
+dependencies:
+- current_identifier: null
+  type: marketplace
+  value:
+    plugin_unique_identifier: langgenius/general_chunker:0.0.1@e3da408b7277866404c3f884d599261f9d0b9003ea4ef7eb3b64489bdf39d18b
+- current_identifier: null
+  type: marketplace
+  value:
+    plugin_unique_identifier: langgenius/dify_extractor:0.0.1@50103421d4e002f059b662d21ad2d7a1cf34869abdbe320299d7e382516ebb1c
+kind: rag_pipeline
+rag_pipeline:
+  description: ''
+  icon: 📙
+  icon_background: '#FFF4ED'
+  icon_type: emoji
+  name: file-general-high-quality
+version: 0.1.0
+workflow:
+  conversation_variables: []
+  environment_variables: []
+  features: {}
+  graph:
+    edges:
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: datasource
+        targetType: if-else
+      id: 1752479895761-source-1752481129417-target
+      source: '1752479895761'
+      sourceHandle: source
+      target: '1752481129417'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInLoop: false
+        sourceType: if-else
+        targetType: tool
+      id: 1752481129417-24e47cad-f1e2-4f74-9884-3f49d5bb37b7-1752480460682-target
+      source: '1752481129417'
+      sourceHandle: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
+      target: '1752480460682'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInLoop: false
+        sourceType: if-else
+        targetType: document-extractor
+      id: 1752481129417-false-1752481112180-target
+      source: '1752481129417'
+      sourceHandle: 'false'
+      target: '1752481112180'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: tool
+        targetType: variable-aggregator
+      id: 1752480460682-source-1752482022496-target
+      source: '1752480460682'
+      sourceHandle: source
+      target: '1752482022496'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInLoop: false
+        sourceType: document-extractor
+        targetType: variable-aggregator
+      id: 1752481112180-source-1752482022496-target
+      source: '1752481112180'
+      sourceHandle: source
+      target: '1752482022496'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: variable-aggregator
+        targetType: tool
+      id: 1752482022496-source-1752482151668-target
+      source: '1752482022496'
+      sourceHandle: source
+      target: '1752482151668'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: tool
+        targetType: knowledge-index
+      id: 1752482151668-source-1752477924228-target
+      source: '1752482151668'
+      sourceHandle: source
+      target: '1752477924228'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    nodes:
+    - data:
+        chunk_structure: text_model
+        embedding_model: text-embedding-ada-002
+        embedding_model_provider: langgenius/openai/openai
+        index_chunk_variable_selector:
+        - '1752482151668'
+        - result
+        indexing_technique: high_quality
+        keyword_number: 10
+        retrieval_model:
+          score_threshold: 0.5
+          score_threshold_enabled: false
+          search_method: semantic_search
+          top_k: 3
+          vector_setting:
+            embedding_model_name: text-embedding-ada-002
+            embedding_provider_name: langgenius/openai/openai
+        selected: false
+        title: Knowledge Base
+        type: knowledge-index
+      height: 114
+      id: '1752477924228'
+      position:
+        x: 1076.4656678451215
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1076.4656678451215
+        y: 281.3910724383104
+      selected: true
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        datasource_configurations: {}
+        datasource_label: File
+        datasource_name: upload-file
+        datasource_parameters: {}
+        fileExtensions:
+        - txt
+        - markdown
+        - mdx
+        - pdf
+        - html
+        - xlsx
+        - xls
+        - vtt
+        - properties
+        - doc
+        - docx
+        - csv
+        - eml
+        - msg
+        - pptx
+        - xml
+        - epub
+        - ppt
+        - md
+        plugin_id: langgenius/file
+        provider_name: file
+        provider_type: local_file
+        selected: false
+        title: File
+        type: datasource
+      height: 52
+      id: '1752479895761'
+      position:
+        x: -839.8603427660498
+        y: 251.3910724383104
+      positionAbsolute:
+        x: -839.8603427660498
+        y: 251.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        is_team_authorization: true
+        output_schema:
+          properties:
+            documents:
+              description: the documents extracted from the file
+              items:
+                type: object
+              type: array
+            images:
+              description: The images extracted from the file
+              items:
+                type: object
+              type: array
+          type: object
+        paramSchemas:
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: the file to be parsed (support pdf, ppt, pptx, doc, docx, png, jpg,
+              jpeg)
+            ja_JP: 解析するファイル(pdf, ppt, pptx, doc, docx, png, jpg, jpegをサポート)
+            pt_BR: o arquivo a ser analisado (suporta pdf, ppt, pptx, doc, docx, png,
+              jpg, jpeg)
+            zh_Hans: 用于解析的文件(支持 pdf, ppt, pptx, doc, docx, png, jpg, jpeg)
+          label:
+            en_US: file
+            ja_JP: ファイル
+            pt_BR: arquivo
+            zh_Hans: file
+          llm_description: the file to be parsed (support pdf, ppt, pptx, doc, docx,
+            png, jpg, jpeg)
+          max: null
+          min: null
+          name: file
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: file
+        params:
+          file: ''
+        provider_id: langgenius/dify_extractor/dify_extractor
+        provider_name: langgenius/dify_extractor/dify_extractor
+        provider_type: builtin
+        selected: false
+        title: Dify Extractor
+        tool_configurations: {}
+        tool_description: Dify Extractor
+        tool_label: Dify Extractor
+        tool_name: dify_extractor
+        tool_parameters:
+          file:
+            type: variable
+            value:
+            - '1752479895761'
+            - file
+        type: tool
+      height: 52
+      id: '1752480460682'
+      position:
+        x: -108.28652292656551
+        y: 281.3910724383104
+      positionAbsolute:
+        x: -108.28652292656551
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        is_array_file: false
+        selected: false
+        title: 文档提取器
+        type: document-extractor
+        variable_selector:
+        - '1752479895761'
+        - file
+      height: 90
+      id: '1752481112180'
+      position:
+        x: -108.28652292656551
+        y: 390.6576481692478
+      positionAbsolute:
+        x: -108.28652292656551
+        y: 390.6576481692478
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        cases:
+        - case_id: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
+          conditions:
+          - comparison_operator: is
+            id: 9da88d93-3ff6-463f-abfd-6bcafbf2554d
+            value: .xlsx
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: d0e88f5e-dfe3-4bae-af0c-dbec267500de
+            value: .xls
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: a957e91e-1ed7-4c6b-9c80-2f0948858f1d
+            value: .md
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: 870c3c39-8d3f-474a-ab8b-9c0ccf53db73
+            value: .markdown
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: f9541513-1e71-4dc1-9db5-35dc84a39e3c
+            value: .mdx
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: 4c7f455b-ac20-40ca-9495-6cc44ffcb35d
+            value: .html
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: 2e12d9c7-8057-4a09-8851-f9fd1d0718d1
+            value: .htm
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: 73a995a9-d8b9-4aef-89f7-306e2ddcbce2
+            value: .docx
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: 8a2e8772-0426-458b-a1f9-9eaaec0f27c8
+            value: .csv
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: aa2cb6b6-a2fc-462a-a9f5-c9c3f33a1602
+            value: .txt
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          id: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
+          logical_operator: or
+        selected: false
+        title: IF/ELSE
+        type: if-else
+      height: 358
+      id: '1752481129417'
+      position:
+        x: -489.57009543377865
+        y: 251.3910724383104
+      positionAbsolute:
+        x: -489.57009543377865
+        y: 251.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        advanced_settings:
+          group_enabled: false
+          groups:
+          - groupId: f4cf07b4-914d-4544-8ef8-0c5d9e4f21a7
+            group_name: Group1
+            output_type: string
+            variables:
+            - - '1752481112180'
+              - text
+            - - '1752480460682'
+              - text
+        output_type: string
+        selected: false
+        title: Variable Aggregator
+        type: variable-aggregator
+        variables:
+        - - '1752481112180'
+          - text
+        - - '1752480460682'
+          - text
+      height: 129
+      id: '1752482022496'
+      position:
+        x: 319.441649575055
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 319.441649575055
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        is_team_authorization: true
+        output_schema:
+          properties:
+            result:
+              description: The result of the general chunk tool.
+              properties:
+                general_chunks:
+                  items:
+                    description: The chunk of the text.
+                    type: string
+                  type: array
+              type: object
+          type: object
+        paramSchemas:
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The text you want to chunk.
+            ja_JP: チャンク化したいテキスト。
+            pt_BR: O texto que você deseja dividir.
+            zh_Hans: 你想要分块的文本。
+          label:
+            en_US: Input Variable
+            ja_JP: 入力変数
+            pt_BR: Variável de entrada
+            zh_Hans: 输入变量
+          llm_description: The text you want to chunk.
+          max: null
+          min: null
+          name: input_variable
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The delimiter of the chunks.
+            ja_JP: チャンクの区切り記号。
+            pt_BR: O delimitador dos pedaços.
+            zh_Hans: 块的分隔符。
+          label:
+            en_US: Delimiter
+            ja_JP: 区切り記号
+            pt_BR: Delimitador
+            zh_Hans: 分隔符
+          llm_description: The delimiter of the chunks, the format of the delimiter
+            must be a string.
+          max: null
+          min: null
+          name: delimiter
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The maximum chunk length.
+            ja_JP: 最大長のチャンク。
+            pt_BR: O comprimento máximo do bloco
+            zh_Hans: 最大块的长度。
+          label:
+            en_US: Maximum Chunk Length
+            ja_JP: チャンク最大長
+            pt_BR: O comprimento máximo do bloco
+            zh_Hans: 最大块的长度
+          llm_description: The maximum chunk length, the format of the chunk size
+            must be an integer.
+          max: null
+          min: null
+          name: max_chunk_length
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The chunk overlap length.
+            ja_JP: チャンクの重複長
+            pt_BR: O comprimento de sobreposição dos fragmentos
+            zh_Hans: 块的重叠长度。
+          label:
+            en_US: Chunk Overlap Length
+            ja_JP: チャンク重複長
+            pt_BR: Comprimento de sobreposição do bloco
+            zh_Hans: 块的重叠长度
+          llm_description: The chunk overlap length, the format of the chunk overlap
+            length must be an integer.
+          max: null
+          min: null
+          name: chunk_overlap_length
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: Replace consecutive spaces, newlines and tabs
+            ja_JP: 連続のスペース、改行、またはタブを置換する
+            pt_BR: Substituir espaços consecutivos, novas linhas e tabulações
+            zh_Hans: 替换连续的空格、换行符和制表符
+          label:
+            en_US: Replace Consecutive Spaces, Newlines and Tabs
+            ja_JP: 連続のスペース、改行、またはタブを置換する
+            pt_BR: Substituir espaços consecutivos, novas linhas e tabulações
+            zh_Hans: 替换连续的空格、换行符和制表符
+          llm_description: Replace consecutive spaces, newlines and tabs, the format
+            of the replace must be a boolean.
+          max: null
+          min: null
+          name: replace_consecutive_spaces_newlines_tabs
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: Delete all URLs and email addresses
+            ja_JP: すべてのURLとメールアドレスを削除する
+            pt_BR: Excluir todos os URLs e endereços de e-mail
+            zh_Hans: 删除所有URL和电子邮件地址
+          label:
+            en_US: Delete All URLs and Email Addresses
+            ja_JP: すべてのURLとメールアドレスを削除する
+            pt_BR: Excluir todos os URLs e endereços de e-mail
+            zh_Hans: 删除所有URL和电子邮件地址
+          llm_description: Delete all URLs and email addresses, the format of the
+            delete must be a boolean.
+          max: null
+          min: null
+          name: delete_all_urls_and_email_addresses
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        params:
+          chunk_overlap_length: ''
+          delete_all_urls_and_email_addresses: ''
+          delimiter: ''
+          input_variable: ''
+          max_chunk_length: ''
+          replace_consecutive_spaces_newlines_tabs: ''
+        provider_id: langgenius/general_chunker/general_chunker
+        provider_name: langgenius/general_chunker/general_chunker
+        provider_type: builtin
+        selected: false
+        title: General Chunker
+        tool_configurations: {}
+        tool_description: A tool for general text chunking mode, the chunks retrieved and recalled are the same.
+        tool_label: General Chunker
+        tool_name: general_chunker
+        tool_parameters:
+          chunk_overlap_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - chunk_overlap
+          delete_all_urls_and_email_addresses:
+            type: mixed
+            value: '{{#rag.shared.delete_urls_email#}}'
+          delimiter:
+            type: mixed
+            value: '{{#rag.shared.delimiter#}}'
+          input_variable:
+            type: mixed
+            value: '{{#1752482022496.output#}}'
+          max_chunk_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - max_chunk_length
+          replace_consecutive_spaces_newlines_tabs:
+            type: mixed
+            value: '{{#rag.shared.replace_consecutive_spaces#}}'
+        type: tool
+      height: 52
+      id: '1752482151668'
+      position:
+        x: 693.5300771507484
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 693.5300771507484
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    viewport:
+      x: 701.4999626224237
+      y: 128.33739021504016
+      zoom: 0.48941689643726966
+  rag_pipeline_variables:
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: \n\n
+    label: Delimiter
+    max_length: 100
+    options: []
+    placeholder: null
+    required: true
+    tooltips: A delimiter is the character used to separate text. \n\n is recommended
+      for splitting the original document into chunks. You can also use special
+      delimiters defined by yourself.
+    type: text-input
+    unit: null
+    variable: delimiter
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Maximum chunk length
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: characters
+    variable: max_chunk_length
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Chunk overlap
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: number
+    unit: characters
+    variable: chunk_overlap
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Replace consecutive spaces, newlines and tabs
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: replace_consecutive_spaces
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Delete all URLs and email addresses
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: delete_urls_email
--- a/dify/api/services/rag_pipeline/transform/file-parentchild.yml
+++ b/dify/api/services/rag_pipeline/transform/file-parentchild.yml
@@ -0,0 +1,814 @@
+dependencies:
+- current_identifier: null
+  type: marketplace
+  value:
+    plugin_unique_identifier: langgenius/parentchild_chunker:0.0.1@b1a28a27e33fec442ce494da2a7814edd7eb9d646c81f38bccfcf1133d486e40
+- current_identifier: null
+  type: marketplace
+  value:
+    plugin_unique_identifier: langgenius/dify_extractor:0.0.1@50103421d4e002f059b662d21ad2d7a1cf34869abdbe320299d7e382516ebb1c
+kind: rag_pipeline
+rag_pipeline:
+  description: ''
+  icon: 📙
+  icon_background: '#FFF4ED'
+  icon_type: emoji
+  name: file-parentchild
+version: 0.1.0
+workflow:
+  conversation_variables: []
+  environment_variables: []
+  features: {}
+  graph:
+    edges:
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: datasource
+        targetType: if-else
+      id: 1752479895761-source-1752481129417-target
+      source: '1752479895761'
+      sourceHandle: source
+      target: '1752481129417'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInLoop: false
+        sourceType: if-else
+        targetType: tool
+      id: 1752481129417-24e47cad-f1e2-4f74-9884-3f49d5bb37b7-1752480460682-target
+      source: '1752481129417'
+      sourceHandle: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
+      target: '1752480460682'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInLoop: false
+        sourceType: if-else
+        targetType: document-extractor
+      id: 1752481129417-false-1752481112180-target
+      source: '1752481129417'
+      sourceHandle: 'false'
+      target: '1752481112180'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: tool
+        targetType: variable-aggregator
+      id: 1752480460682-source-1752482022496-target
+      source: '1752480460682'
+      sourceHandle: source
+      target: '1752482022496'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInLoop: false
+        sourceType: document-extractor
+        targetType: variable-aggregator
+      id: 1752481112180-source-1752482022496-target
+      source: '1752481112180'
+      sourceHandle: source
+      target: '1752482022496'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: variable-aggregator
+        targetType: tool
+      id: 1752482022496-source-1752575473519-target
+      source: '1752482022496'
+      sourceHandle: source
+      target: '1752575473519'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInLoop: false
+        sourceType: tool
+        targetType: knowledge-index
+      id: 1752575473519-source-1752477924228-target
+      source: '1752575473519'
+      sourceHandle: source
+      target: '1752477924228'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    nodes:
+    - data:
+        chunk_structure: hierarchical_model
+        embedding_model: text-embedding-ada-002
+        embedding_model_provider: langgenius/openai/openai
+        index_chunk_variable_selector:
+        - '1752575473519'
+        - result
+        indexing_technique: high_quality
+        keyword_number: 10
+        retrieval_model:
+          score_threshold: 0.5
+          score_threshold_enabled: false
+          search_method: semantic_search
+          top_k: 3
+          vector_setting:
+            embedding_model_name: text-embedding-ada-002
+            embedding_provider_name: langgenius/openai/openai
+        selected: false
+        title: Knowledge Base
+        type: knowledge-index
+      height: 114
+      id: '1752477924228'
+      position:
+        x: 994.3774545394483
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 994.3774545394483
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        datasource_configurations: {}
+        datasource_label: File
+        datasource_name: upload-file
+        datasource_parameters: {}
+        fileExtensions:
+        - txt
+        - markdown
+        - mdx
+        - pdf
+        - html
+        - xlsx
+        - xls
+        - vtt
+        - properties
+        - doc
+        - docx
+        - csv
+        - eml
+        - msg
+        - pptx
+        - xml
+        - epub
+        - ppt
+        - md
+        plugin_id: langgenius/file
+        provider_name: file
+        provider_type: local_file
+        selected: false
+        title: File
+        type: datasource
+      height: 52
+      id: '1752479895761'
+      position:
+        x: -839.8603427660498
+        y: 251.3910724383104
+      positionAbsolute:
+        x: -839.8603427660498
+        y: 251.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        is_team_authorization: true
+        output_schema:
+          properties:
+            documents:
+              description: the documents extracted from the file
+              items:
+                type: object
+              type: array
+            images:
+              description: The images extracted from the file
+              items:
+                type: object
+              type: array
+          type: object
+        paramSchemas:
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: the file to be parsed (supports pdf, ppt, pptx, doc, docx, png,
+              jpg, jpeg)
+            ja_JP: 解析するファイル(pdf, ppt, pptx, doc, docx, png, jpg, jpegをサポート)
+            pt_BR: o arquivo a ser analisado (suporta pdf, ppt, pptx, doc, docx, png,
+              jpg, jpeg)
+            zh_Hans: 用于解析的文件(支持 pdf, ppt, pptx, doc, docx, png, jpg, jpeg)
+          label:
+            en_US: file
+            ja_JP: ファイル
+            pt_BR: arquivo
+            zh_Hans: file
+          llm_description: the file to be parsed (support pdf, ppt, pptx, doc, docx,
+            png, jpg, jpeg)
+          max: null
+          min: null
+          name: file
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: file
+        params:
+          file: ''
+        provider_id: langgenius/dify_extractor/dify_extractor
+        provider_name: langgenius/dify_extractor/dify_extractor
+        provider_type: builtin
+        selected: false
+        title: Dify Extractor
+        tool_configurations: {}
+        tool_description: Dify Extractor
+        tool_label: Dify Extractor
+        tool_name: dify_extractor
+        tool_parameters:
+          file:
+            type: variable
+            value:
+            - '1752479895761'
+            - file
+        type: tool
+      height: 52
+      id: '1752480460682'
+      position:
+        x: -108.28652292656551
+        y: 281.3910724383104
+      positionAbsolute:
+        x: -108.28652292656551
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        is_array_file: false
+        selected: false
+        title: Document Extractor
+        type: document-extractor
+        variable_selector:
+        - '1752479895761'
+        - file
+      height: 90
+      id: '1752481112180'
+      position:
+        x: -108.28652292656551
+        y: 390.6576481692478
+      positionAbsolute:
+        x: -108.28652292656551
+        y: 390.6576481692478
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        cases:
+        - case_id: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
+          conditions:
+          - comparison_operator: is
+            id: 9da88d93-3ff6-463f-abfd-6bcafbf2554d
+            value: .xlsx
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: d0e88f5e-dfe3-4bae-af0c-dbec267500de
+            value: .xls
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: a957e91e-1ed7-4c6b-9c80-2f0948858f1d
+            value: .md
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: 870c3c39-8d3f-474a-ab8b-9c0ccf53db73
+            value: .markdown
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: f9541513-1e71-4dc1-9db5-35dc84a39e3c
+            value: .mdx
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: 4c7f455b-ac20-40ca-9495-6cc44ffcb35d
+            value: .html
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: 2e12d9c7-8057-4a09-8851-f9fd1d0718d1
+            value: .htm
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: 73a995a9-d8b9-4aef-89f7-306e2ddcbce2
+            value: .docx
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: 8a2e8772-0426-458b-a1f9-9eaaec0f27c8
+            value: .csv
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: aa2cb6b6-a2fc-462a-a9f5-c9c3f33a1602
+            value: .txt
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          id: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
+          logical_operator: or
+        selected: false
+        title: IF/ELSE
+        type: if-else
+      height: 358
+      id: '1752481129417'
+      position:
+        x: -512.2335487893622
+        y: 251.3910724383104
+      positionAbsolute:
+        x: -512.2335487893622
+        y: 251.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        advanced_settings:
+          group_enabled: false
+          groups:
+          - groupId: f4cf07b4-914d-4544-8ef8-0c5d9e4f21a7
+            group_name: Group1
+            output_type: string
+            variables:
+            - - '1752481112180'
+              - text
+            - - '1752480460682'
+              - text
+        output_type: string
+        selected: false
+        title: Variable Aggregator
+        type: variable-aggregator
+        variables:
+        - - '1752481112180'
+          - text
+        - - '1752480460682'
+          - text
+      height: 129
+      id: '1752482022496'
+      position:
+        x: 319.441649575055
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 319.441649575055
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        is_team_authorization: true
+        output_schema:
+          properties:
+            result:
+              description: Parent child chunks result
+              items:
+                type: object
+              type: array
+          type: object
+        paramSchemas:
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The text you want to chunk.
+            ja_JP: チャンク化したいテキスト。
+            pt_BR: O texto que você deseja dividir.
+            zh_Hans: 你想要分块的文本。
+          label:
+            en_US: Input text
+            ja_JP: 入力テキスト
+            pt_BR: Texto de entrada
+            zh_Hans: 输入文本
+          llm_description: The text you want to chunk.
+          max: null
+          min: null
+          name: input_text
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: 1024
+          form: llm
+          human_description:
+            en_US: Maximum length for chunking
+            ja_JP: チャンク分割の最大長
+            pt_BR: Comprimento máximo para divisão
+            zh_Hans: 用于分块的最大长度
+          label:
+            en_US: Maximum Length
+            ja_JP: 最大長
+            pt_BR: Comprimento Máximo
+            zh_Hans: 最大长度
+          llm_description: Maximum length allowed per chunk
+          max: null
+          min: null
+          name: max_length
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: '
+
+
+            '
+          form: llm
+          human_description:
+            en_US: Separator used for chunking
+            ja_JP: チャンク分割に使用する区切り文字
+            pt_BR: Separador usado para divisão
+            zh_Hans: 用于分块的分隔符
+          label:
+            en_US: Chunk Separator
+            ja_JP: チャンク区切り文字
+            pt_BR: Separador de Divisão
+            zh_Hans: 分块分隔符
+          llm_description: The separator used to split chunks
+          max: null
+          min: null
+          name: separator
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: 512
+          form: llm
+          human_description:
+            en_US: Maximum length for subchunking
+            ja_JP: サブチャンク分割の最大長
+            pt_BR: Comprimento máximo para subdivisão
+            zh_Hans: 用于子分块的最大长度
+          label:
+            en_US: Subchunk Maximum Length
+            ja_JP: サブチャンク最大長
+            pt_BR: Comprimento Máximo de Subdivisão
+            zh_Hans: 子分块最大长度
+          llm_description: Maximum length allowed per subchunk
+          max: null
+          min: null
+          name: subchunk_max_length
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: '. '
+          form: llm
+          human_description:
+            en_US: Separator used for subchunking
+            ja_JP: サブチャンク分割に使用する区切り文字
+            pt_BR: Separador usado para subdivisão
+            zh_Hans: 用于子分块的分隔符
+          label:
+            en_US: Subchunk Separator
+            ja_JP: サブチャンキング用セパレーター
+            pt_BR: Separador de Subdivisão
+            zh_Hans: 子分块分隔符
+          llm_description: The separator used to split subchunks
+          max: null
+          min: null
+          name: subchunk_separator
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: paragraph
+          form: llm
+          human_description:
+            en_US: Split text into paragraphs based on separator and maximum chunk
+              length, using split text as parent block or entire document as parent
+              block and directly retrieve.
+            ja_JP: セパレーターと最大チャンク長に基づいてテキストを段落に分割し、分割されたテキスト
+              を親ブロックとして使用するか、文書全体を親ブロックとして使用して直接取得します。
+            pt_BR: Dividir texto em parágrafos com base no separador e no comprimento
+              máximo do bloco, usando o texto dividido como bloco pai ou documento
+              completo como bloco pai e diretamente recuperá-lo.
+            zh_Hans: 根据分隔符和最大块长度将文本拆分为段落，使用拆分文本作为检索的父块或整个文档用作父块并直接检索。
+          label:
+            en_US: Parent Mode
+            ja_JP: 親モード
+            pt_BR: Modo Pai
+            zh_Hans: 父块模式
+          llm_description: Split text into paragraphs based on separator and maximum
+            chunk length, using split text as parent block or entire document as parent
+            block and directly retrieve.
+          max: null
+          min: null
+          name: parent_mode
+          options:
+          - icon: ''
+            label:
+              en_US: Paragraph
+              ja_JP: 段落
+              pt_BR: Parágrafo
+              zh_Hans: 段落
+            value: paragraph
+          - icon: ''
+            label:
+              en_US: Full Document
+              ja_JP: 全文
+              pt_BR: Documento Completo
+              zh_Hans: 全文
+            value: full_doc
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: select
+        - auto_generate: null
+          default: 0
+          form: llm
+          human_description:
+            en_US: Whether to remove extra spaces in the text
+            ja_JP: テキスト内の余分なスペースを削除するかどうか
+            pt_BR: Se deve remover espaços extras no texto
+            zh_Hans: 是否移除文本中的多余空格
+          label:
+            en_US: Remove Extra Spaces
+            ja_JP: 余分なスペースを削除
+            pt_BR: Remover Espaços Extras
+            zh_Hans: 移除多余空格
+          llm_description: Whether to remove extra spaces in the text
+          max: null
+          min: null
+          name: remove_extra_spaces
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        - auto_generate: null
+          default: 0
+          form: llm
+          human_description:
+            en_US: Whether to remove URLs and emails in the text
+            ja_JP: テキスト内のURLやメールアドレスを削除するかどうか
+            pt_BR: Se deve remover URLs e e-mails no texto
+            zh_Hans: 是否移除文本中的URL和电子邮件地址
+          label:
+            en_US: Remove URLs and Emails
+            ja_JP: URLとメールアドレスを削除
+            pt_BR: Remover URLs e E-mails
+            zh_Hans: 移除URL和电子邮件地址
+          llm_description: Whether to remove URLs and emails in the text
+          max: null
+          min: null
+          name: remove_urls_emails
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        params:
+          input_text: ''
+          max_length: ''
+          parent_mode: ''
+          remove_extra_spaces: ''
+          remove_urls_emails: ''
+          separator: ''
+          subchunk_max_length: ''
+          subchunk_separator: ''
+        provider_id: langgenius/parentchild_chunker/parentchild_chunker
+        provider_name: langgenius/parentchild_chunker/parentchild_chunker
+        provider_type: builtin
+        selected: false
+        title: Parent-child Chunker
+        tool_configurations: {}
+        tool_description: Parent-child Chunk Structure
+        tool_label: Parent-child Chunker
+        tool_name: parentchild_chunker
+        tool_parameters:
+          input_text:
+            type: mixed
+            value: '{{#1752482022496.output#}}'
+          max_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - max_chunk_length
+          parent_mode:
+            type: variable
+            value:
+            - rag
+            - shared
+            - parent_mode
+          remove_extra_spaces:
+            type: mixed
+            value: '{{#rag.shared.replace_consecutive_spaces#}}'
+          remove_urls_emails:
+            type: mixed
+            value: '{{#rag.shared.delete_urls_email#}}'
+          separator:
+            type: mixed
+            value: '{{#rag.shared.delimiter#}}'
+          subchunk_max_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - child_max_chunk_length
+          subchunk_separator:
+            type: mixed
+            value: '{{#rag.shared.child_delimiter#}}'
+        type: tool
+      height: 52
+      id: '1752575473519'
+      position:
+        x: 637.9241611063885
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 637.9241611063885
+        y: 281.3910724383104
+      selected: true
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    viewport:
+      x: 948.6766333808323
+      y: -102.06757184183238
+      zoom: 0.8375774577380971
+  rag_pipeline_variables:
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: \n\n
+    label: Delimiter
+    max_length: 256
+    options: []
+    placeholder: null
+    required: true
+    tooltips: A delimiter is the character used to separate text. \n\n is recommended
+      for splitting the original document into large parent chunks. You can also use
+      special delimiters defined by yourself.
+    type: text-input
+    unit: null
+    variable: delimiter
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: 1024
+    label: Maximum chunk length
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: characters
+    variable: max_chunk_length
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: \n
+    label: Child delimiter
+    max_length: 256
+    options: []
+    placeholder: null
+    required: true
+    tooltips: A delimiter is the character used to separate text. \n is recommended
+      for splitting parent chunks into small child chunks. You can also use special
+      delimiters defined by yourself.
+    type: text-input
+    unit: null
+    variable: child_delimiter
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: 512
+    label: Child max chunk length
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: characters
+    variable: child_max_chunk_length
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: paragraph
+    label: Parent mode
+    max_length: 48
+    options:
+    - full_doc
+    - paragraph
+    placeholder: null
+    required: true
+    tooltips: null
+    type: select
+    unit: null
+    variable: parent_mode
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Replace consecutive spaces, newlines and tabs
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: replace_consecutive_spaces
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Delete all URLs and email addresses
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: delete_urls_email
--- a/dify/api/services/rag_pipeline/transform/notion-general-economy.yml
+++ b/dify/api/services/rag_pipeline/transform/notion-general-economy.yml
@@ -0,0 +1,400 @@
+dependencies:
+- current_identifier: null
+  type: marketplace
+  value:
+    plugin_unique_identifier: langgenius/general_chunker:0.0.1@e3da408b7277866404c3f884d599261f9d0b9003ea4ef7eb3b64489bdf39d18b
+- current_identifier: null
+  type: marketplace
+  value:
+    plugin_unique_identifier: langgenius/notion_datasource:0.0.1@2dd49c2c3ffff976be8d22efb1ac0f63522a8d0f24ef8c44729d0a50a94ec039
+kind: rag_pipeline
+rag_pipeline:
+  description: ''
+  icon: 📙
+  icon_background: ''
+  icon_type: emoji
+  name: notion-general-economy
+version: 0.1.0
+workflow:
+  conversation_variables: []
+  environment_variables: []
+  features: {}
+  graph:
+    edges:
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: tool
+        targetType: knowledge-index
+      id: 1752482151668-source-1752477924228-target
+      source: '1752482151668'
+      sourceHandle: source
+      target: '1752477924228'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: datasource
+        targetType: tool
+      id: 1752489759475-source-1752482151668-target
+      source: '1752489759475'
+      sourceHandle: source
+      target: '1752482151668'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    nodes:
+    - data:
+        chunk_structure: text_model
+        embedding_model: text-embedding-ada-002
+        embedding_model_provider: langgenius/openai/openai
+        index_chunk_variable_selector:
+        - '1752482151668'
+        - result
+        indexing_technique: economy
+        keyword_number: 10
+        retrieval_model:
+          score_threshold: 0.5
+          score_threshold_enabled: false
+          search_method: keyword_search
+          top_k: 3
+          vector_setting:
+            embedding_model_name: text-embedding-ada-002
+            embedding_provider_name: langgenius/openai/openai
+        selected: true
+        title: Knowledge Base
+        type: knowledge-index
+      height: 114
+      id: '1752477924228'
+      position:
+        x: 1444.5503479271906
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1444.5503479271906
+        y: 281.3910724383104
+      selected: true
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        is_team_authorization: true
+        output_schema:
+          properties:
+            result:
+              description: The result of the general chunk tool.
+              properties:
+                general_chunks:
+                  items:
+                    description: The chunk of the text.
+                    type: string
+                  type: array
+              type: object
+          type: object
+        paramSchemas:
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The text you want to chunk.
+            ja_JP: チャンク化したいテキスト。
+            pt_BR: O texto que você deseja dividir.
+            zh_Hans: 你想要分块的文本。
+          label:
+            en_US: Input Variable
+            ja_JP: 入力変数
+            pt_BR: Variável de entrada
+            zh_Hans: 输入变量
+          llm_description: The text you want to chunk.
+          max: null
+          min: null
+          name: input_variable
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The delimiter of the chunks.
+            ja_JP: チャンクの区切り記号。
+            pt_BR: O delimitador dos pedaços.
+            zh_Hans: 块的分隔符。
+          label:
+            en_US: Delimiter
+            ja_JP: 区切り記号
+            pt_BR: Delimitador
+            zh_Hans: 分隔符
+          llm_description: The delimiter of the chunks, the format of the delimiter
+            must be a string.
+          max: null
+          min: null
+          name: delimiter
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The maximum chunk length.
+            ja_JP: チャンクの最大長。
+            pt_BR: O comprimento máximo do bloco
+            zh_Hans: 最大块的长度。
+          label:
+            en_US: Maximum Chunk Length
+            ja_JP: チャンク最大長
+            pt_BR: O comprimento máximo do bloco
+            zh_Hans: 最大块的长度
+          llm_description: The maximum chunk length, the format of the chunk size
+            must be an integer.
+          max: null
+          min: null
+          name: max_chunk_length
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The chunk overlap length.
+            ja_JP: チャンクの重複長
+            pt_BR: The chunk overlap length.
+            zh_Hans: 块的重叠长度。
+          label:
+            en_US: Chunk Overlap Length
+            ja_JP: チャンク重複長
+            pt_BR: Chunk Overlap Length
+            zh_Hans: 块的重叠长度
+          llm_description: The chunk overlap length, the format of the chunk overlap
+            length must be an integer.
+          max: null
+          min: null
+          name: chunk_overlap_length
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: Replace consecutive spaces, newlines and tabs
+            ja_JP: 連続のスペース、改行、またはタブを置換する
+            pt_BR: Replace consecutive spaces, newlines and tabs
+            zh_Hans: 替换连续的空格、换行符和制表符
+          label:
+            en_US: Replace Consecutive Spaces, Newlines and Tabs
+            ja_JP: 連続のスペース、改行、またはタブを置換する
+            pt_BR: Replace Consecutive Spaces, Newlines and Tabs
+            zh_Hans: 替换连续的空格、换行符和制表符
+          llm_description: Replace consecutive spaces, newlines and tabs, the format
+            of the replace must be a boolean.
+          max: null
+          min: null
+          name: replace_consecutive_spaces_newlines_tabs
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: Delete all URLs and email addresses
+            ja_JP: すべてのURLとメールアドレスを削除する
+            pt_BR: Delete all URLs and email addresses
+            zh_Hans: 删除所有URL和电子邮件地址
+          label:
+            en_US: Delete All URLs and Email Addresses
+            ja_JP: すべてのURLとメールアドレスを削除する
+            pt_BR: Delete All URLs and Email Addresses
+            zh_Hans: 删除所有URL和电子邮件地址
+          llm_description: Delete all URLs and email addresses, the format of the
+            delete must be a boolean.
+          max: null
+          min: null
+          name: delete_all_urls_and_email_addresses
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        params:
+          chunk_overlap_length: ''
+          delete_all_urls_and_email_addresses: ''
+          delimiter: ''
+          input_variable: ''
+          max_chunk_length: ''
+          replace_consecutive_spaces_newlines_tabs: ''
+        provider_id: langgenius/general_chunker/general_chunker
+        provider_name: langgenius/general_chunker/general_chunker
+        provider_type: builtin
+        selected: false
+        title: General Chunker
+        tool_configurations: {}
+        tool_description: A tool for general text chunking mode, the chunks retrieved and recalled are the same.
+        tool_label: General Chunker
+        tool_name: general_chunker
+        tool_parameters:
+          chunk_overlap_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - chunk_overlap
+          delete_all_urls_and_email_addresses:
+            type: mixed
+            value: '{{#rag.shared.delete_urls_email#}}'
+          delimiter:
+            type: mixed
+            value: '{{#rag.shared.delimiter#}}'
+          input_variable:
+            type: mixed
+            value: '{{#1752489759475.content#}}'
+          max_chunk_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - max_chunk_length
+          replace_consecutive_spaces_newlines_tabs:
+            type: mixed
+            value: '{{#rag.shared.replace_consecutive_spaces#}}'
+        type: tool
+      height: 52
+      id: '1752482151668'
+      position:
+        x: 1063.6922916384628
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1063.6922916384628
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        datasource_configurations: {}
+        datasource_label: Notion数据源
+        datasource_name: notion_datasource
+        datasource_parameters: {}
+        plugin_id: langgenius/notion_datasource
+        provider_name: notion_datasource
+        provider_type: online_document
+        selected: false
+        title: Notion数据源
+        type: datasource
+      height: 52
+      id: '1752489759475'
+      position:
+        x: 736.9082104000458
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 736.9082104000458
+        y: 281.3910724383104
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    viewport:
+      x: -838.569649323166
+      y: -168.94656489167426
+      zoom: 1.286925643857699
+  rag_pipeline_variables:
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: \n\n
+    label: Delimiter
+    max_length: 100
+    options: []
+    placeholder: null
+    required: true
+    tooltips: A delimiter is the character used to separate text. \n\n is recommended
+      for splitting the original document into large parent chunks. You can also use
+      special delimiters defined by yourself.
+    type: text-input
+    unit: null
+    variable: delimiter
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Maximum chunk length
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: characters
+    variable: max_chunk_length
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Chunk overlap
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: number
+    unit: characters
+    variable: chunk_overlap
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Replace consecutive spaces, newlines and tabs
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: replace_consecutive_spaces
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Delete all URLs and email addresses
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: delete_urls_email
--- a/dify/api/services/rag_pipeline/transform/notion-general-high-quality.yml
+++ b/dify/api/services/rag_pipeline/transform/notion-general-high-quality.yml
@@ -0,0 +1,400 @@
+dependencies:
+- current_identifier: null
+  type: marketplace
+  value:
+    plugin_unique_identifier: langgenius/general_chunker:0.0.1@e3da408b7277866404c3f884d599261f9d0b9003ea4ef7eb3b64489bdf39d18b
+- current_identifier: null
+  type: marketplace
+  value:
+    plugin_unique_identifier: langgenius/notion_datasource:0.0.1@2dd49c2c3ffff976be8d22efb1ac0f63522a8d0f24ef8c44729d0a50a94ec039
+kind: rag_pipeline
+rag_pipeline:
+  description: ''
+  icon: 📙
+  icon_background: '#FFF4ED'
+  icon_type: emoji
+  name: notion-general-high-quality
+version: 0.1.0
+workflow:
+  conversation_variables: []
+  environment_variables: []
+  features: {}
+  graph:
+    edges:
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: tool
+        targetType: knowledge-index
+      id: 1752482151668-source-1752477924228-target
+      source: '1752482151668'
+      sourceHandle: source
+      target: '1752477924228'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: datasource
+        targetType: tool
+      id: 1752489759475-source-1752482151668-target
+      source: '1752489759475'
+      sourceHandle: source
+      target: '1752482151668'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    nodes:
+    - data:
+        chunk_structure: text_model
+        embedding_model: text-embedding-ada-002
+        embedding_model_provider: langgenius/openai/openai
+        index_chunk_variable_selector:
+        - '1752482151668'
+        - result
+        indexing_technique: high_quality
+        keyword_number: 10
+        retrieval_model:
+          score_threshold: 0.5
+          score_threshold_enabled: false
+          search_method: semantic_search
+          top_k: 3
+          vector_setting:
+            embedding_model_name: text-embedding-ada-002
+            embedding_provider_name: langgenius/openai/openai
+        selected: true
+        title: Knowledge Base
+        type: knowledge-index
+      height: 114
+      id: '1752477924228'
+      position:
+        x: 1444.5503479271906
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1444.5503479271906
+        y: 281.3910724383104
+      selected: true
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        is_team_authorization: true
+        output_schema:
+          properties:
+            result:
+              description: The result of the general chunk tool.
+              properties:
+                general_chunks:
+                  items:
+                    description: The chunk of the text.
+                    type: string
+                  type: array
+              type: object
+          type: object
+        paramSchemas:
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The text you want to chunk.
+            ja_JP: チャンク化したいテキスト。
+            pt_BR: O texto que você deseja dividir.
+            zh_Hans: 你想要分块的文本。
+          label:
+            en_US: Input Variable
+            ja_JP: 入力変数
+            pt_BR: Variável de entrada
+            zh_Hans: 输入变量
+          llm_description: The text you want to chunk.
+          max: null
+          min: null
+          name: input_variable
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The delimiter of the chunks.
+            ja_JP: チャンクの区切り記号。
+            pt_BR: O delimitador dos pedaços.
+            zh_Hans: 块的分隔符。
+          label:
+            en_US: Delimiter
+            ja_JP: 区切り記号
+            pt_BR: Delimitador
+            zh_Hans: 分隔符
+          llm_description: The delimiter of the chunks, the format of the delimiter
+            must be a string.
+          max: null
+          min: null
+          name: delimiter
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The maximum chunk length.
+            ja_JP: 最大長のチャンク。
+            pt_BR: O comprimento máximo do bloco
+            zh_Hans: 最大块的长度。
+          label:
+            en_US: Maximum Chunk Length
+            ja_JP: チャンク最大長
+            pt_BR: O comprimento máximo do bloco
+            zh_Hans: 最大块的长度
+          llm_description: The maximum chunk length, the format of the chunk size
+            must be an integer.
+          max: null
+          min: null
+          name: max_chunk_length
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The chunk overlap length.
+            ja_JP: チャンクの重複長
+            pt_BR: The chunk overlap length.
+            zh_Hans: 块的重叠长度。
+          label:
+            en_US: Chunk Overlap Length
+            ja_JP: チャンク重複長
+            pt_BR: Chunk Overlap Length
+            zh_Hans: 块的重叠长度
+          llm_description: The chunk overlap length, the format of the chunk overlap
+            length must be an integer.
+          max: null
+          min: null
+          name: chunk_overlap_length
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: Replace consecutive spaces, newlines and tabs
+            ja_JP: 連続のスペース、改行、またはタブを置換する
+            pt_BR: Replace consecutive spaces, newlines and tabs
+            zh_Hans: 替换连续的空格、换行符和制表符
+          label:
+            en_US: Replace Consecutive Spaces, Newlines and Tabs
+            ja_JP: 連続のスペース、改行、またはタブを置換する
+            pt_BR: Replace Consecutive Spaces, Newlines and Tabs
+            zh_Hans: 替换连续的空格、换行符和制表符
+          llm_description: Replace consecutive spaces, newlines and tabs, the format
+            of the replace must be a boolean.
+          max: null
+          min: null
+          name: replace_consecutive_spaces_newlines_tabs
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: Delete all URLs and email addresses
+            ja_JP: すべてのURLとメールアドレスを削除する
+            pt_BR: Delete all URLs and email addresses
+            zh_Hans: 删除所有URL和电子邮件地址
+          label:
+            en_US: Delete All URLs and Email Addresses
+            ja_JP: すべてのURLとメールアドレスを削除する
+            pt_BR: Delete All URLs and Email Addresses
+            zh_Hans: 删除所有URL和电子邮件地址
+          llm_description: Delete all URLs and email addresses, the format of the
+            delete must be a boolean.
+          max: null
+          min: null
+          name: delete_all_urls_and_email_addresses
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        params:
+          chunk_overlap_length: ''
+          delete_all_urls_and_email_addresses: ''
+          delimiter: ''
+          input_variable: ''
+          max_chunk_length: ''
+          replace_consecutive_spaces_newlines_tabs: ''
+        provider_id: langgenius/general_chunker/general_chunker
+        provider_name: langgenius/general_chunker/general_chunker
+        provider_type: builtin
+        selected: false
+        title: General Chunker
+        tool_configurations: {}
+        tool_description: A tool for general text chunking mode, the chunks retrieved and recalled are the same.
+        tool_label: General Chunker
+        tool_name: general_chunker
+        tool_parameters:
+          chunk_overlap_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - chunk_overlap
+          delete_all_urls_and_email_addresses:
+            type: mixed
+            value: '{{#rag.shared.delete_urls_email#}}'
+          delimiter:
+            type: mixed
+            value: '{{#rag.shared.delimiter#}}'
+          input_variable:
+            type: mixed
+            value: '{{#1752489759475.content#}}'
+          max_chunk_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - max_chunk_length
+          replace_consecutive_spaces_newlines_tabs:
+            type: mixed
+            value: '{{#rag.shared.replace_consecutive_spaces#}}'
+        type: tool
+      height: 52
+      id: '1752482151668'
+      position:
+        x: 1063.6922916384628
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1063.6922916384628
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        datasource_configurations: {}
+        datasource_label: Notion数据源
+        datasource_name: notion_datasource
+        datasource_parameters: {}
+        plugin_id: langgenius/notion_datasource
+        provider_name: notion_datasource
+        provider_type: online_document
+        selected: false
+        title: Notion数据源
+        type: datasource
+      height: 52
+      id: '1752489759475'
+      position:
+        x: 736.9082104000458
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 736.9082104000458
+        y: 281.3910724383104
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    viewport:
+      x: -838.569649323166
+      y: -168.94656489167426
+      zoom: 1.286925643857699
+  rag_pipeline_variables:
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: \n\n
+    label: Delimiter
+    max_length: 100
+    options: []
+    placeholder: null
+    required: true
+    tooltips: A delimiter is the character used to separate text. \n\n is recommended
+      for splitting the original document into large parent chunks. You can also use
+      special delimiters defined by yourself.
+    type: text-input
+    unit: null
+    variable: delimiter
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Maximum chunk length
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: characters
+    variable: max_chunk_length
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Chunk overlap
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: number
+    unit: characters
+    variable: chunk_overlap
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Replace consecutive spaces, newlines and tabs
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: replace_consecutive_spaces
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Delete all URLs and email addresses
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: delete_urls_email
--- a/dify/api/services/rag_pipeline/transform/notion-parentchild.yml
+++ b/dify/api/services/rag_pipeline/transform/notion-parentchild.yml
@@ -0,0 +1,506 @@
+dependencies:
+- current_identifier: null
+  type: marketplace
+  value:
+    plugin_unique_identifier: langgenius/parentchild_chunker:0.0.1@b1a28a27e33fec442ce494da2a7814edd7eb9d646c81f38bccfcf1133d486e40
+- current_identifier: null
+  type: marketplace
+  value:
+    plugin_unique_identifier: langgenius/notion_datasource:0.0.1@2dd49c2c3ffff976be8d22efb1ac0f63522a8d0f24ef8c44729d0a50a94ec039
+kind: rag_pipeline
+rag_pipeline:
+  description: ''
+  icon: 📙
+  icon_background: ''
+  icon_type: emoji
+  name: notion-parentchild
+version: 0.1.0
+workflow:
+  conversation_variables: []
+  environment_variables: []
+  features: {}
+  graph:
+    edges:
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: datasource
+        targetType: tool
+      id: 1752489759475-source-1752490343805-target
+      source: '1752489759475'
+      sourceHandle: source
+      target: '1752490343805'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInLoop: false
+        sourceType: tool
+        targetType: knowledge-index
+      id: 1752490343805-source-1752477924228-target
+      source: '1752490343805'
+      sourceHandle: source
+      target: '1752477924228'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    nodes:
+    - data:
+        chunk_structure: hierarchical_model
+        embedding_model: text-embedding-ada-002
+        embedding_model_provider: langgenius/openai/openai
+        index_chunk_variable_selector:
+        - '1752490343805'
+        - result
+        indexing_technique: high_quality
+        keyword_number: 10
+        retrieval_model:
+          score_threshold: 0.5
+          score_threshold_enabled: false
+          search_method: semantic_search
+          top_k: 3
+          vector_setting:
+            embedding_model_name: text-embedding-ada-002
+            embedding_provider_name: langgenius/openai/openai
+        selected: false
+        title: Knowledge Base
+        type: knowledge-index
+      height: 114
+      id: '1752477924228'
+      position:
+        x: 1486.2052698032674
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1486.2052698032674
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        datasource_configurations: {}
+        datasource_label: Notion数据源
+        datasource_name: notion_datasource
+        datasource_parameters: {}
+        plugin_id: langgenius/notion_datasource
+        provider_name: notion_datasource
+        provider_type: online_document
+        selected: false
+        title: Notion数据源
+        type: datasource
+      height: 52
+      id: '1752489759475'
+      position:
+        x: 736.9082104000458
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 736.9082104000458
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        is_team_authorization: true
+        output_schema:
+          properties:
+            result:
+              description: Parent child chunks result
+              items:
+                type: object
+              type: array
+          type: object
+        paramSchemas:
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The text you want to chunk.
+            ja_JP: チャンク化したいテキスト。
+            pt_BR: O texto que você deseja dividir.
+            zh_Hans: 你想要分块的文本。
+          label:
+            en_US: Input text
+            ja_JP: 入力テキスト
+            pt_BR: Texto de entrada
+            zh_Hans: 输入文本
+          llm_description: The text you want to chunk.
+          max: null
+          min: null
+          name: input_text
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: 1024
+          form: llm
+          human_description:
+            en_US: Maximum length for chunking
+            ja_JP: チャンク分割の最大長
+            pt_BR: Comprimento máximo para divisão
+            zh_Hans: 用于分块的最大长度
+          label:
+            en_US: Maximum Length
+            ja_JP: 最大長
+            pt_BR: Comprimento Máximo
+            zh_Hans: 最大长度
+          llm_description: Maximum length allowed per chunk
+          max: null
+          min: null
+          name: max_length
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: '
+
+
+            '
+          form: llm
+          human_description:
+            en_US: Separator used for chunking
+            ja_JP: チャンク分割に使用する区切り文字
+            pt_BR: Separador usado para divisão
+            zh_Hans: 用于分块的分隔符
+          label:
+            en_US: Chunk Separator
+            ja_JP: チャンク区切り文字
+            pt_BR: Separador de Divisão
+            zh_Hans: 分块分隔符
+          llm_description: The separator used to split chunks
+          max: null
+          min: null
+          name: separator
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: 512
+          form: llm
+          human_description:
+            en_US: Maximum length for subchunking
+            ja_JP: サブチャンク分割の最大長
+            pt_BR: Comprimento máximo para subdivisão
+            zh_Hans: 用于子分块的最大长度
+          label:
+            en_US: Subchunk Maximum Length
+            ja_JP: サブチャンク最大長
+            pt_BR: Comprimento Máximo de Subdivisão
+            zh_Hans: 子分块最大长度
+          llm_description: Maximum length allowed per subchunk
+          max: null
+          min: null
+          name: subchunk_max_length
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: '. '
+          form: llm
+          human_description:
+            en_US: Separator used for subchunking
+            ja_JP: サブチャンク分割に使用する区切り文字
+            pt_BR: Separador usado para subdivisão
+            zh_Hans: 用于子分块的分隔符
+          label:
+            en_US: Subchunk Separator
+            ja_JP: サブチャンキング用セパレーター
+            pt_BR: Separador de Subdivisão
+            zh_Hans: 子分块分隔符
+          llm_description: The separator used to split subchunks
+          max: null
+          min: null
+          name: subchunk_separator
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: paragraph
+          form: llm
+          human_description:
+            en_US: Split text into paragraphs based on separator and maximum chunk
+              length, using split text as parent block or entire document as parent
+              block and directly retrieve.
+            ja_JP: セパレーターと最大チャンク長に基づいてテキストを段落に分割し、分割されたテキスト
+              を親ブロックとして使用するか、文書全体を親ブロックとして使用して直接取得します。
+            pt_BR: Dividir texto em parágrafos com base no separador e no comprimento
+              máximo do bloco, usando o texto dividido como bloco pai ou documento
+              completo como bloco pai e diretamente recuperá-lo.
+            zh_Hans: 根据分隔符和最大块长度将文本拆分为段落，使用拆分文本作为检索的父块或整个文档用作父块并直接检索。
+          label:
+            en_US: Parent Mode
+            ja_JP: 親子モード
+            pt_BR: Modo Pai
+            zh_Hans: 父块模式
+          llm_description: Split text into paragraphs based on separator and maximum
+            chunk length, using split text as parent block or entire document as parent
+            block and directly retrieve.
+          max: null
+          min: null
+          name: parent_mode
+          options:
+          - icon: ''
+            label:
+              en_US: Paragraph
+              ja_JP: 段落
+              pt_BR: Parágrafo
+              zh_Hans: 段落
+            value: paragraph
+          - icon: ''
+            label:
+              en_US: Full Document
+              ja_JP: 全文
+              pt_BR: Documento Completo
+              zh_Hans: 全文
+            value: full_doc
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: select
+        - auto_generate: null
+          default: 0
+          form: llm
+          human_description:
+            en_US: Whether to remove extra spaces in the text
+            ja_JP: テキスト内の余分なスペースを削除するかどうか
+            pt_BR: Se deve remover espaços extras no texto
+            zh_Hans: 是否移除文本中的多余空格
+          label:
+            en_US: Remove Extra Spaces
+            ja_JP: 余分なスペースを削除
+            pt_BR: Remover Espaços Extras
+            zh_Hans: 移除多余空格
+          llm_description: Whether to remove extra spaces in the text
+          max: null
+          min: null
+          name: remove_extra_spaces
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        - auto_generate: null
+          default: 0
+          form: llm
+          human_description:
+            en_US: Whether to remove URLs and emails in the text
+            ja_JP: テキスト内のURLやメールアドレスを削除するかどうか
+            pt_BR: Se deve remover URLs e e-mails no texto
+            zh_Hans: 是否移除文本中的URL和电子邮件地址
+          label:
+            en_US: Remove URLs and Emails
+            ja_JP: URLとメールアドレスを削除
+            pt_BR: Remover URLs e E-mails
+            zh_Hans: 移除URL和电子邮件地址
+          llm_description: Whether to remove URLs and emails in the text
+          max: null
+          min: null
+          name: remove_urls_emails
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        params:
+          input_text: ''
+          max_length: ''
+          parent_mode: ''
+          remove_extra_spaces: ''
+          remove_urls_emails: ''
+          separator: ''
+          subchunk_max_length: ''
+          subchunk_separator: ''
+        provider_id: langgenius/parentchild_chunker/parentchild_chunker
+        provider_name: langgenius/parentchild_chunker/parentchild_chunker
+        provider_type: builtin
+        selected: true
+        title: Parent-child Chunker
+        tool_configurations: {}
+        tool_description: Parent-child Chunk Structure
+        tool_label: Parent-child Chunker
+        tool_name: parentchild_chunker
+        tool_parameters:
+          input_text:
+            type: mixed
+            value: '{{#1752489759475.content#}}'
+          max_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - max_chunk_length
+          parent_mode:
+            type: variable
+            value:
+            - rag
+            - shared
+            - parent_mode
+          remove_extra_spaces:
+            type: mixed
+            value: '{{#rag.shared.replace_consecutive_spaces#}}'
+          remove_urls_emails:
+            type: mixed
+            value: '{{#rag.shared.delete_urls_email#}}'
+          separator:
+            type: mixed
+            value: '{{#rag.shared.delimiter#}}'
+          subchunk_max_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - child_max_chunk_length
+          subchunk_separator:
+            type: mixed
+            value: '{{#rag.shared.child_delimiter#}}'
+        type: tool
+      height: 52
+      id: '1752490343805'
+      position:
+        x: 1077.0240183162543
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1077.0240183162543
+        y: 281.3910724383104
+      selected: true
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    viewport:
+      x: -487.2912544090391
+      y: -54.7029301848807
+      zoom: 0.9994011715768695
+  rag_pipeline_variables:
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: \n\n
+    label: Delimiter
+    max_length: 100
+    options: []
+    placeholder: null
+    required: true
+    tooltips: A delimiter is the character used to separate text. \n\n is recommended
+      for splitting the original document into large parent chunks. You can also use
+      special delimiters defined by yourself.
+    type: text-input
+    unit: null
+    variable: delimiter
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: 1024
+    label: Maximum chunk length
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: characters
+    variable: max_chunk_length
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: \n
+    label: Child delimiter
+    max_length: 199
+    options: []
+    placeholder: null
+    required: true
+    tooltips: A delimiter is the character used to separate text. \n is recommended
+      for splitting parent chunks into small child chunks. You can also use special
+      delimiters defined by yourself.
+    type: text-input
+    unit: null
+    variable: child_delimiter
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: 512
+    label: Child max chunk length
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: characters
+    variable: child_max_chunk_length
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: paragraph
+    label: Parent mode
+    max_length: 48
+    options:
+    - full_doc
+    - paragraph
+    placeholder: null
+    required: true
+    tooltips: null
+    type: select
+    unit: null
+    variable: parent_mode
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Replace consecutive spaces, newlines and tabs
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: replace_consecutive_spaces
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Delete all URLs and email addresses
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: delete_urls_email
--- a/dify/api/services/rag_pipeline/transform/website-crawl-general-economy.yml
+++ b/dify/api/services/rag_pipeline/transform/website-crawl-general-economy.yml
@@ -0,0 +1,674 @@
+dependencies:
+- current_identifier: null
+  type: marketplace
+  value:
+    plugin_unique_identifier: langgenius/general_chunker:0.0.1@e3da408b7277866404c3f884d599261f9d0b9003ea4ef7eb3b64489bdf39d18b
+- current_identifier: null
+  type: marketplace
+  value:
+    plugin_unique_identifier: langgenius/firecrawl_datasource:0.0.1@f7aed0a26df0e5f4b9555371b5c9fa6db3c7dcf6a46dd1583245697bd90a539a
+- current_identifier: null
+  type: marketplace
+  value:
+    plugin_unique_identifier: langgenius/jina_datasource:0.0.1@cf23afb2c3eeccc5a187763a1947f583f0bb10aa56461e512ac4141bf930d608
+kind: rag_pipeline
+rag_pipeline:
+  description: ''
+  icon: 📙
+  icon_background: ''
+  icon_type: emoji
+  name: website-crawl-general-economy
+version: 0.1.0
+workflow:
+  conversation_variables: []
+  environment_variables: []
+  features: {}
+  graph:
+    edges:
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: datasource
+        targetType: variable-aggregator
+      id: 1752491761974-source-1752565435219-target
+      source: '1752491761974'
+      sourceHandle: source
+      target: '1752565435219'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInLoop: false
+        sourceType: datasource
+        targetType: variable-aggregator
+      id: 1752565402678-source-1752565435219-target
+      source: '1752565402678'
+      sourceHandle: source
+      target: '1752565435219'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: variable-aggregator
+        targetType: tool
+      id: 1752565435219-source-1752569675978-target
+      source: '1752565435219'
+      sourceHandle: source
+      target: '1752569675978'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInLoop: false
+        sourceType: tool
+        targetType: knowledge-index
+      id: 1752569675978-source-1752477924228-target
+      source: '1752569675978'
+      sourceHandle: source
+      target: '1752477924228'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    nodes:
+    - data:
+        chunk_structure: text_model
+        embedding_model: text-embedding-ada-002
+        embedding_model_provider: langgenius/openai/openai
+        index_chunk_variable_selector:
+        - '1752569675978'
+        - result
+        indexing_technique: economy
+        keyword_number: 10
+        retrieval_model:
+          score_threshold: 0.5
+          score_threshold_enabled: false
+          search_method: keyword_search
+          top_k: 3
+          vector_setting:
+            embedding_model_name: text-embedding-ada-002
+            embedding_provider_name: langgenius/openai/openai
+        selected: true
+        title: Knowledge Base
+        type: knowledge-index
+      height: 114
+      id: '1752477924228'
+      position:
+        x: 2140.4053851189346
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 2140.4053851189346
+        y: 281.3910724383104
+      selected: true
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        datasource_configurations: {}
+        datasource_label: Jina Reader
+        datasource_name: jina_reader
+        datasource_parameters:
+          crawl_sub_pages:
+            type: mixed
+            value: '{{#rag.1752491761974.jina_crawl_sub_pages#}}'
+          limit:
+            type: variable
+            value:
+            - rag
+            - '1752491761974'
+            - jina_limit
+          url:
+            type: mixed
+            value: '{{#rag.1752491761974.jina_url#}}'
+          use_sitemap:
+            type: mixed
+            value: '{{#rag.1752491761974.jina_use_sitemap#}}'
+        plugin_id: langgenius/jina_datasource
+        provider_name: jinareader
+        provider_type: website_crawl
+        selected: false
+        title: Jina Reader
+        type: datasource
+      height: 52
+      id: '1752491761974'
+      position:
+        x: 1067.7526055798794
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1067.7526055798794
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        datasource_configurations: {}
+        datasource_label: Firecrawl
+        datasource_name: crawl
+        datasource_parameters:
+          crawl_subpages:
+            type: mixed
+            value: '{{#rag.1752565402678.firecrawl_crawl_sub_pages#}}'
+          exclude_paths:
+            type: mixed
+            value: '{{#rag.1752565402678.firecrawl_exclude_paths#}}'
+          include_paths:
+            type: mixed
+            value: '{{#rag.1752565402678.firecrawl_include_only_paths#}}'
+          limit:
+            type: variable
+            value:
+            - rag
+            - '1752565402678'
+            - firecrawl_limit
+          max_depth:
+            type: variable
+            value:
+            - rag
+            - '1752565402678'
+            - firecrawl_max_depth
+          only_main_content:
+            type: mixed
+            value: '{{#rag.1752565402678.firecrawl_extract_main_content#}}'
+          url:
+            type: mixed
+            value: '{{#rag.1752565402678.firecrawl_url#}}'
+        plugin_id: langgenius/firecrawl_datasource
+        provider_name: firecrawl
+        provider_type: website_crawl
+        selected: false
+        title: Firecrawl
+        type: datasource
+      height: 52
+      id: '1752565402678'
+      position:
+        x: 1067.7526055798794
+        y: 417.32608398342404
+      positionAbsolute:
+        x: 1067.7526055798794
+        y: 417.32608398342404
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        output_type: string
+        selected: false
+        title: Variable Aggregator
+        type: variable-aggregator
+        variables:
+        - - '1752491761974'
+          - content
+        - - '1752565402678'
+          - content
+      height: 129
+      id: '1752565435219'
+      position:
+        x: 1505.4306671642219
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1505.4306671642219
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        is_team_authorization: true
+        output_schema:
+          properties:
+            result:
+              description: The result of the general chunk tool.
+              properties:
+                general_chunks:
+                  items:
+                    description: The chunk of the text.
+                    type: string
+                  type: array
+              type: object
+          type: object
+        paramSchemas:
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The text you want to chunk.
+            ja_JP: チャンク化したいテキスト。
+            pt_BR: O texto que você deseja dividir.
+            zh_Hans: 你想要分块的文本。
+          label:
+            en_US: Input Variable
+            ja_JP: 入力変数
+            pt_BR: Variável de entrada
+            zh_Hans: 输入变量
+          llm_description: The text you want to chunk.
+          max: null
+          min: null
+          name: input_variable
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The delimiter of the chunks.
+            ja_JP: チャンクの区切り記号。
+            pt_BR: O delimitador dos pedaços.
+            zh_Hans: 块的分隔符。
+          label:
+            en_US: Delimiter
+            ja_JP: 区切り記号
+            pt_BR: Delimitador
+            zh_Hans: 分隔符
+          llm_description: The delimiter of the chunks, the format of the delimiter
+            must be a string.
+          max: null
+          min: null
+          name: delimiter
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The maximum chunk length.
+            ja_JP: チャンクの最大長。
+            pt_BR: O comprimento máximo do bloco
+            zh_Hans: 最大块的长度。
+          label:
+            en_US: Maximum Chunk Length
+            ja_JP: チャンク最大長
+            pt_BR: O comprimento máximo do bloco
+            zh_Hans: 最大块的长度
+          llm_description: The maximum chunk length, the format of the chunk size
+            must be an integer.
+          max: null
+          min: null
+          name: max_chunk_length
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The chunk overlap length.
+            ja_JP: チャンクの重複長
+            pt_BR: The chunk overlap length.
+            zh_Hans: 块的重叠长度。
+          label:
+            en_US: Chunk Overlap Length
+            ja_JP: チャンク重複長
+            pt_BR: Chunk Overlap Length
+            zh_Hans: 块的重叠长度
+          llm_description: The chunk overlap length, the format of the chunk overlap
+            length must be an integer.
+          max: null
+          min: null
+          name: chunk_overlap_length
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: Replace consecutive spaces, newlines and tabs
+            ja_JP: 連続のスペース、改行、またはタブを置換する
+            pt_BR: Replace consecutive spaces, newlines and tabs
+            zh_Hans: 替换连续的空格、换行符和制表符
+          label:
+            en_US: Replace Consecutive Spaces, Newlines and Tabs
+            ja_JP: 連続のスペース、改行、またはタブを置換する
+            pt_BR: Replace Consecutive Spaces, Newlines and Tabs
+            zh_Hans: 替换连续的空格、换行符和制表符
+          llm_description: Replace consecutive spaces, newlines and tabs, the format
+            of the replace must be a boolean.
+          max: null
+          min: null
+          name: replace_consecutive_spaces_newlines_tabs
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: Delete all URLs and email addresses
+            ja_JP: すべてのURLとメールアドレスを削除する
+            pt_BR: Delete all URLs and email addresses
+            zh_Hans: 删除所有URL和电子邮件地址
+          label:
+            en_US: Delete All URLs and Email Addresses
+            ja_JP: すべてのURLとメールアドレスを削除する
+            pt_BR: Delete All URLs and Email Addresses
+            zh_Hans: 删除所有URL和电子邮件地址
+          llm_description: Delete all URLs and email addresses, the format of the
+            delete must be a boolean.
+          max: null
+          min: null
+          name: delete_all_urls_and_email_addresses
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        params:
+          chunk_overlap_length: ''
+          delete_all_urls_and_email_addresses: ''
+          delimiter: ''
+          input_variable: ''
+          max_chunk_length: ''
+          replace_consecutive_spaces_newlines_tabs: ''
+        provider_id: langgenius/general_chunker/general_chunker
+        provider_name: langgenius/general_chunker/general_chunker
+        provider_type: builtin
+        selected: false
+        title: General Chunker
+        tool_configurations: {}
+        tool_description: A tool for general text chunking mode, the chunks retrieved and recalled are the same.
+        tool_label: General Chunker
+        tool_name: general_chunker
+        tool_parameters:
+          chunk_overlap_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - chunk_overlap
+          delete_all_urls_and_email_addresses:
+            type: mixed
+            value: '{{#rag.shared.delete_urls_email#}}'
+          delimiter:
+            type: mixed
+            value: '{{#rag.shared.delimiter#}}'
+          input_variable:
+            type: mixed
+            value: '{{#1752565435219.output#}}'
+          max_chunk_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - max_chunk_length
+          replace_consecutive_spaces_newlines_tabs:
+            type: mixed
+            value: '{{#rag.shared.replace_consecutive_spaces#}}'
+        type: tool
+      height: 52
+      id: '1752569675978'
+      position:
+        x: 1807.4306671642219
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1807.4306671642219
+        y: 281.3910724383104
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    viewport:
+      x: -707.721097109337
+      y: -93.07807382100896
+      zoom: 0.9350632198875476
+  rag_pipeline_variables:
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752491761974'
+    default_value: null
+    label: URL
+    max_length: 256
+    options: []
+    placeholder: https://docs.dify.ai/en/
+    required: true
+    tooltips: null
+    type: text-input
+    unit: null
+    variable: jina_url
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752491761974'
+    default_value: 10
+    label: Limit
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: null
+    variable: jina_limit
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752491761974'
+    default_value: null
+    label: Crawl sub-pages
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: jina_crawl_sub_pages
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752491761974'
+    default_value: null
+    label: Use sitemap
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: Follow the sitemap to crawl the site. If not, Jina Reader will crawl
+      iteratively based on page relevance, yielding fewer but higher-quality pages.
+    type: checkbox
+    unit: null
+    variable: jina_use_sitemap
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: null
+    label: URL
+    max_length: 256
+    options: []
+    placeholder: https://docs.dify.ai/en/
+    required: true
+    tooltips: null
+    type: text-input
+    unit: null
+    variable: firecrawl_url
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: true
+    label: Crawl sub-pages
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: firecrawl_crawl_sub_pages
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: 10
+    label: Limit
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: null
+    variable: firecrawl_limit
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: null
+    label: Max depth
+    max_length: 48
+    options: []
+    placeholder: ''
+    required: false
+    tooltips: Maximum depth to crawl relative to the entered URL. Depth 0 scrapes
+      only the page at the entered URL, depth 1 scrapes that URL and everything one
+      path segment below it, and so on.
+    type: number
+    unit: null
+    variable: firecrawl_max_depth
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: null
+    label: Exclude paths
+    max_length: 256
+    options: []
+    placeholder: blog/*, /about/*
+    required: false
+    tooltips: null
+    type: text-input
+    unit: null
+    variable: firecrawl_exclude_paths
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: null
+    label: Include only paths
+    max_length: 256
+    options: []
+    placeholder: articles/*
+    required: false
+    tooltips: null
+    type: text-input
+    unit: null
+    variable: firecrawl_include_only_paths
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: null
+    label: Extract only main content
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: firecrawl_extract_main_content
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: \n\n
+    label: Delimiter
+    max_length: 100
+    options: []
+    placeholder: null
+    required: true
+    tooltips: A delimiter is the character used to separate text. \n\n is recommended
+      for splitting the original document into chunks. You can also use special
+      delimiters defined by yourself.
+    type: text-input
+    unit: null
+    variable: delimiter
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: 1024
+    label: Maximum chunk length
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: characters
+    variable: max_chunk_length
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: 50
+    label: Chunk overlap
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: Setting the chunk overlap can maintain the semantic relevance between
+      them, enhancing the retrieve effect. It is recommended to set 10%–25% of the
+      maximum chunk size.
+    type: number
+    unit: characters
+    variable: chunk_overlap
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Replace consecutive spaces, newlines and tabs
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: replace_consecutive_spaces
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Delete all URLs and email addresses
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: delete_urls_email
--- a/dify/api/services/rag_pipeline/transform/website-crawl-general-high-quality.yml
+++ b/dify/api/services/rag_pipeline/transform/website-crawl-general-high-quality.yml
@@ -0,0 +1,674 @@
+dependencies:
+- current_identifier: null
+  type: marketplace
+  value:
+    plugin_unique_identifier: langgenius/general_chunker:0.0.1@e3da408b7277866404c3f884d599261f9d0b9003ea4ef7eb3b64489bdf39d18b
+- current_identifier: null
+  type: marketplace
+  value:
+    plugin_unique_identifier: langgenius/firecrawl_datasource:0.0.1@f7aed0a26df0e5f4b9555371b5c9fa6db3c7dcf6a46dd1583245697bd90a539a
+- current_identifier: null
+  type: marketplace
+  value:
+    plugin_unique_identifier: langgenius/jina_datasource:0.0.1@cf23afb2c3eeccc5a187763a1947f583f0bb10aa56461e512ac4141bf930d608
+kind: rag_pipeline
+rag_pipeline:
+  description: ''
+  icon: 📙
+  icon_background: '#FFF4ED'
+  icon_type: emoji
+  name: website-crawl-general-high-quality
+version: 0.1.0
+workflow:
+  conversation_variables: []
+  environment_variables: []
+  features: {}
+  graph:
+    edges:
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: datasource
+        targetType: variable-aggregator
+      id: 1752491761974-source-1752565435219-target
+      source: '1752491761974'
+      sourceHandle: source
+      target: '1752565435219'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInLoop: false
+        sourceType: datasource
+        targetType: variable-aggregator
+      id: 1752565402678-source-1752565435219-target
+      source: '1752565402678'
+      sourceHandle: source
+      target: '1752565435219'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: variable-aggregator
+        targetType: tool
+      id: 1752565435219-source-1752569675978-target
+      source: '1752565435219'
+      sourceHandle: source
+      target: '1752569675978'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInLoop: false
+        sourceType: tool
+        targetType: knowledge-index
+      id: 1752569675978-source-1752477924228-target
+      source: '1752569675978'
+      sourceHandle: source
+      target: '1752477924228'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    nodes:
+    - data:
+        chunk_structure: text_model
+        embedding_model: text-embedding-ada-002
+        embedding_model_provider: langgenius/openai/openai
+        index_chunk_variable_selector:
+        - '1752569675978'
+        - result
+        indexing_technique: high_quality
+        keyword_number: 10
+        retrieval_model:
+          score_threshold: 0.5
+          score_threshold_enabled: false
+          search_method: semantic_search
+          top_k: 3
+          vector_setting:
+            embedding_model_name: text-embedding-ada-002
+            embedding_provider_name: langgenius/openai/openai
+        selected: false
+        title: Knowledge Base
+        type: knowledge-index
+      height: 114
+      id: '1752477924228'
+      position:
+        x: 2140.4053851189346
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 2140.4053851189346
+        y: 281.3910724383104
+      selected: true
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        datasource_configurations: {}
+        datasource_label: Jina Reader
+        datasource_name: jina_reader
+        datasource_parameters:
+          crawl_sub_pages:
+            type: mixed
+            value: '{{#rag.1752491761974.jina_crawl_sub_pages#}}'
+          limit:
+            type: variable
+            value:
+            - rag
+            - '1752491761974'
+            - jina_limit
+          url:
+            type: mixed
+            value: '{{#rag.1752491761974.jina_url#}}'
+          use_sitemap:
+            type: mixed
+            value: '{{#rag.1752491761974.jina_use_sitemap#}}'
+        plugin_id: langgenius/jina_datasource
+        provider_name: jinareader
+        provider_type: website_crawl
+        selected: false
+        title: Jina Reader
+        type: datasource
+      height: 52
+      id: '1752491761974'
+      position:
+        x: 1067.7526055798794
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1067.7526055798794
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        datasource_configurations: {}
+        datasource_label: Firecrawl
+        datasource_name: crawl
+        datasource_parameters:
+          crawl_subpages:
+            type: mixed
+            value: '{{#rag.1752565402678.firecrawl_crawl_sub_pages#}}'
+          exclude_paths:
+            type: mixed
+            value: '{{#rag.1752565402678.firecrawl_exclude_paths#}}'
+          include_paths:
+            type: mixed
+            value: '{{#rag.1752565402678.firecrawl_include_only_paths#}}'
+          limit:
+            type: variable
+            value:
+            - rag
+            - '1752565402678'
+            - firecrawl_limit
+          max_depth:
+            type: variable
+            value:
+            - rag
+            - '1752565402678'
+            - firecrawl_max_depth
+          only_main_content:
+            type: mixed
+            value: '{{#rag.1752565402678.firecrawl_extract_main_content#}}'
+          url:
+            type: mixed
+            value: '{{#rag.1752565402678.firecrawl_url#}}'
+        plugin_id: langgenius/firecrawl_datasource
+        provider_name: firecrawl
+        provider_type: website_crawl
+        selected: false
+        title: Firecrawl
+        type: datasource
+      height: 52
+      id: '1752565402678'
+      position:
+        x: 1067.7526055798794
+        y: 417.32608398342404
+      positionAbsolute:
+        x: 1067.7526055798794
+        y: 417.32608398342404
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        output_type: string
+        selected: false
+        title: Variable Aggregator
+        type: variable-aggregator
+        variables:
+        - - '1752491761974'
+          - content
+        - - '1752565402678'
+          - content
+      height: 129
+      id: '1752565435219'
+      position:
+        x: 1505.4306671642219
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1505.4306671642219
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        is_team_authorization: true
+        output_schema:
+          properties:
+            result:
+              description: The result of the general chunk tool.
+              properties:
+                general_chunks:
+                  items:
+                    description: The chunk of the text.
+                    type: string
+                  type: array
+              type: object
+          type: object
+        paramSchemas:
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The text you want to chunk.
+            ja_JP: チャンク化したいテキスト。
+            pt_BR: O texto que você deseja dividir.
+            zh_Hans: 你想要分块的文本。
+          label:
+            en_US: Input Variable
+            ja_JP: 入力変数
+            pt_BR: Variável de entrada
+            zh_Hans: 输入变量
+          llm_description: The text you want to chunk.
+          max: null
+          min: null
+          name: input_variable
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The delimiter of the chunks.
+            ja_JP: チャンクの区切り記号。
+            pt_BR: O delimitador dos pedaços.
+            zh_Hans: 块的分隔符。
+          label:
+            en_US: Delimiter
+            ja_JP: 区切り記号
+            pt_BR: Delimitador
+            zh_Hans: 分隔符
+          llm_description: The delimiter of the chunks, the format of the delimiter
+            must be a string.
+          max: null
+          min: null
+          name: delimiter
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The maximum chunk length.
+            ja_JP: チャンクの最大長。
+            pt_BR: O comprimento máximo do bloco
+            zh_Hans: 最大块的长度。
+          label:
+            en_US: Maximum Chunk Length
+            ja_JP: チャンク最大長
+            pt_BR: O comprimento máximo do bloco
+            zh_Hans: 最大块的长度
+          llm_description: The maximum chunk length, the format of the chunk size
+            must be an integer.
+          max: null
+          min: null
+          name: max_chunk_length
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The chunk overlap length.
+            ja_JP: チャンクの重複長。
+            pt_BR: The chunk overlap length.
+            zh_Hans: 块的重叠长度。
+          label:
+            en_US: Chunk Overlap Length
+            ja_JP: チャンク重複長
+            pt_BR: Chunk Overlap Length
+            zh_Hans: 块的重叠长度
+          llm_description: The chunk overlap length, the format of the chunk overlap
+            length must be an integer.
+          max: null
+          min: null
+          name: chunk_overlap_length
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: Replace consecutive spaces, newlines and tabs
+            ja_JP: 連続のスペース、改行、またはタブを置換する
+            pt_BR: Replace consecutive spaces, newlines and tabs
+            zh_Hans: 替换连续的空格、换行符和制表符
+          label:
+            en_US: Replace Consecutive Spaces, Newlines and Tabs
+            ja_JP: 連続のスペース、改行、またはタブを置換する
+            pt_BR: Replace Consecutive Spaces, Newlines and Tabs
+            zh_Hans: 替换连续的空格、换行符和制表符
+          llm_description: Replace consecutive spaces, newlines and tabs, the format
+            of the replace must be a boolean.
+          max: null
+          min: null
+          name: replace_consecutive_spaces_newlines_tabs
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: Delete all URLs and email addresses
+            ja_JP: すべてのURLとメールアドレスを削除する
+            pt_BR: Delete all URLs and email addresses
+            zh_Hans: 删除所有URL和电子邮件地址
+          label:
+            en_US: Delete All URLs and Email Addresses
+            ja_JP: すべてのURLとメールアドレスを削除する
+            pt_BR: Delete All URLs and Email Addresses
+            zh_Hans: 删除所有URL和电子邮件地址
+          llm_description: Delete all URLs and email addresses, the format of the
+            delete must be a boolean.
+          max: null
+          min: null
+          name: delete_all_urls_and_email_addresses
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        params:
+          chunk_overlap_length: ''
+          delete_all_urls_and_email_addresses: ''
+          delimiter: ''
+          input_variable: ''
+          max_chunk_length: ''
+          replace_consecutive_spaces_newlines_tabs: ''
+        provider_id: langgenius/general_chunker/general_chunker
+        provider_name: langgenius/general_chunker/general_chunker
+        provider_type: builtin
+        selected: false
+        title: General Chunker
+        tool_configurations: {}
+        tool_description: A tool for general text chunking mode, the chunks retrieved and recalled are the same.
+        tool_label: General Chunker
+        tool_name: general_chunker
+        tool_parameters:
+          chunk_overlap_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - chunk_overlap
+          delete_all_urls_and_email_addresses:
+            type: mixed
+            value: '{{#rag.shared.delete_urls_email#}}'
+          delimiter:
+            type: mixed
+            value: '{{#rag.shared.delimiter#}}'
+          input_variable:
+            type: mixed
+            value: '{{#1752565435219.output#}}'
+          max_chunk_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - max_chunk_length
+          replace_consecutive_spaces_newlines_tabs:
+            type: mixed
+            value: '{{#rag.shared.replace_consecutive_spaces#}}'
+        type: tool
+      height: 52
+      id: '1752569675978'
+      position:
+        x: 1807.4306671642219
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1807.4306671642219
+        y: 281.3910724383104
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    viewport:
+      x: -707.721097109337
+      y: -93.07807382100896
+      zoom: 0.9350632198875476
+  rag_pipeline_variables:
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752491761974'
+    default_value: null
+    label: URL
+    max_length: 256
+    options: []
+    placeholder: https://docs.dify.ai/en/
+    required: true
+    tooltips: null
+    type: text-input
+    unit: null
+    variable: jina_url
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752491761974'
+    default_value: 10
+    label: Limit
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: null
+    variable: jina_limit
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752491761974'
+    default_value: null
+    label: Crawl sub-pages
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: jina_crawl_sub_pages
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752491761974'
+    default_value: null
+    label: Use sitemap
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: Follow the sitemap to crawl the site. If not, Jina Reader will crawl
+      iteratively based on page relevance, yielding fewer but higher-quality pages.
+    type: checkbox
+    unit: null
+    variable: jina_use_sitemap
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: null
+    label: URL
+    max_length: 256
+    options: []
+    placeholder: https://docs.dify.ai/en/
+    required: true
+    tooltips: null
+    type: text-input
+    unit: null
+    variable: firecrawl_url
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: true
+    label: Crawl sub-pages
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: firecrawl_crawl_sub_pages
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: 10
+    label: Limit
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: null
+    variable: firecrawl_limit
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: null
+    label: Max depth
+    max_length: 48
+    options: []
+    placeholder: ''
+    required: false
+    tooltips: Maximum depth to crawl relative to the entered URL. Depth 0 scrapes only
+      the page at the entered URL, depth 1 scrapes that URL and every page one path
+      segment (one /) below it, and so on.
+    type: number
+    unit: null
+    variable: firecrawl_max_depth
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: null
+    label: Exclude paths
+    max_length: 256
+    options: []
+    placeholder: blog/*, /about/*
+    required: false
+    tooltips: null
+    type: text-input
+    unit: null
+    variable: firecrawl_exclude_paths
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: null
+    label: Include only paths
+    max_length: 256
+    options: []
+    placeholder: articles/*
+    required: false
+    tooltips: null
+    type: text-input
+    unit: null
+    variable: firecrawl_include_only_paths
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: null
+    label: firecrawl_extract_main_content
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: firecrawl_extract_main_content
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: \n\n
+    label: Delimiter
+    max_length: 100
+    options: []
+    placeholder: null
+    required: true
+    tooltips: A delimiter is the character used to separate text. \n\n is recommended
+      for splitting the original document into large parent chunks. You can also use
+      special delimiters defined by yourself.
+    type: text-input
+    unit: null
+    variable: delimiter
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: 1024
+    label: Maximum chunk length
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: characters
+    variable: max_chunk_length
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: 50
+    label: chunk_overlap
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: Setting the chunk overlap helps maintain semantic relevance between
+      adjacent chunks, improving retrieval. It is recommended to set it to 10%–25%
+      of the maximum chunk size.
+    type: number
+    unit: characters
+    variable: chunk_overlap
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: replace_consecutive_spaces
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: replace_consecutive_spaces
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Delete all URLs and email addresses
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: delete_urls_email
--- a/dify/api/services/rag_pipeline/transform/website-crawl-parentchild.yml
+++ b/dify/api/services/rag_pipeline/transform/website-crawl-parentchild.yml
@@ -0,0 +1,779 @@
+dependencies:
+- current_identifier: null
+  type: marketplace
+  value:
+    plugin_unique_identifier: langgenius/parentchild_chunker:0.0.1@b1a28a27e33fec442ce494da2a7814edd7eb9d646c81f38bccfcf1133d486e40
+- current_identifier: null
+  type: marketplace
+  value:
+    plugin_unique_identifier: langgenius/firecrawl_datasource:0.0.1@f7aed0a26df0e5f4b9555371b5c9fa6db3c7dcf6a46dd1583245697bd90a539a
+- current_identifier: null
+  type: marketplace
+  value:
+    plugin_unique_identifier: langgenius/jina_datasource:0.0.1@cf23afb2c3eeccc5a187763a1947f583f0bb10aa56461e512ac4141bf930d608
+kind: rag_pipeline
+rag_pipeline:
+  description: ''
+  icon: 📙
+  icon_background: ''
+  icon_type: emoji
+  name: website-crawl-parentchild
+version: 0.1.0
+workflow:
+  conversation_variables: []
+  environment_variables: []
+  features: {}
+  graph:
+    edges:
+    - data:
+        isInLoop: false
+        sourceType: tool
+        targetType: knowledge-index
+      id: 1752490343805-source-1752477924228-target
+      source: '1752490343805'
+      sourceHandle: source
+      target: '1752477924228'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: datasource
+        targetType: variable-aggregator
+      id: 1752491761974-source-1752565435219-target
+      source: '1752491761974'
+      sourceHandle: source
+      target: '1752565435219'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: variable-aggregator
+        targetType: tool
+      id: 1752565435219-source-1752490343805-target
+      source: '1752565435219'
+      sourceHandle: source
+      target: '1752490343805'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInLoop: false
+        sourceType: datasource
+        targetType: variable-aggregator
+      id: 1752565402678-source-1752565435219-target
+      source: '1752565402678'
+      sourceHandle: source
+      target: '1752565435219'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    nodes:
+    - data:
+        chunk_structure: hierarchical_model
+        embedding_model: text-embedding-ada-002
+        embedding_model_provider: langgenius/openai/openai
+        index_chunk_variable_selector:
+        - '1752490343805'
+        - result
+        indexing_technique: high_quality
+        keyword_number: 10
+        retrieval_model:
+          score_threshold: 0.5
+          score_threshold_enabled: false
+          search_method: semantic_search
+          top_k: 3
+          vector_setting:
+            embedding_model_name: text-embedding-ada-002
+            embedding_provider_name: langgenius/openai/openai
+        selected: false
+        title: Knowledge Base
+        type: knowledge-index
+      height: 114
+      id: '1752477924228'
+      position:
+        x: 2215.5544306817387
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 2215.5544306817387
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        is_team_authorization: true
+        output_schema:
+          properties:
+            result:
+              description: Parent child chunks result
+              items:
+                type: object
+              type: array
+          type: object
+        paramSchemas:
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The text you want to chunk.
+            ja_JP: チャンク化したいテキスト。
+            pt_BR: O texto que você deseja dividir.
+            zh_Hans: 你想要分块的文本。
+          label:
+            en_US: Input text
+            ja_JP: 入力テキスト
+            pt_BR: Texto de entrada
+            zh_Hans: 输入文本
+          llm_description: The text you want to chunk.
+          max: null
+          min: null
+          name: input_text
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: 1024
+          form: llm
+          human_description:
+            en_US: Maximum length for chunking
+            ja_JP: チャンク分割の最大長
+            pt_BR: Comprimento máximo para divisão
+            zh_Hans: 用于分块的最大长度
+          label:
+            en_US: Maximum Length
+            ja_JP: 最大長
+            pt_BR: Comprimento Máximo
+            zh_Hans: 最大长度
+          llm_description: Maximum length allowed per chunk
+          max: null
+          min: null
+          name: max_length
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: '
+
+
+            '
+          form: llm
+          human_description:
+            en_US: Separator used for chunking
+            ja_JP: チャンク分割に使用する区切り文字
+            pt_BR: Separador usado para divisão
+            zh_Hans: 用于分块的分隔符
+          label:
+            en_US: Chunk Separator
+            ja_JP: チャンク区切り文字
+            pt_BR: Separador de Divisão
+            zh_Hans: 分块分隔符
+          llm_description: The separator used to split chunks
+          max: null
+          min: null
+          name: separator
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: 512
+          form: llm
+          human_description:
+            en_US: Maximum length for subchunking
+            ja_JP: サブチャンク分割の最大長
+            pt_BR: Comprimento máximo para subdivisão
+            zh_Hans: 用于子分块的最大长度
+          label:
+            en_US: Subchunk Maximum Length
+            ja_JP: サブチャンク最大長
+            pt_BR: Comprimento Máximo de Subdivisão
+            zh_Hans: 子分块最大长度
+          llm_description: Maximum length allowed per subchunk
+          max: null
+          min: null
+          name: subchunk_max_length
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: '. '
+          form: llm
+          human_description:
+            en_US: Separator used for subchunking
+            ja_JP: サブチャンク分割に使用する区切り文字
+            pt_BR: Separador usado para subdivisão
+            zh_Hans: 用于子分块的分隔符
+          label:
+            en_US: Subchunk Separator
+            ja_JP: サブチャンキング用セパレーター
+            pt_BR: Separador de Subdivisão
+            zh_Hans: 子分块分隔符
+          llm_description: The separator used to split subchunks
+          max: null
+          min: null
+          name: subchunk_separator
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: paragraph
+          form: llm
+          human_description:
+            en_US: Split text into paragraphs based on separator and maximum chunk
+              length, using split text as parent block or entire document as parent
+              block and directly retrieve.
+            ja_JP: セパレーターと最大チャンク長に基づいてテキストを段落に分割し、分割されたテキスト
+              を親ブロックとして使用するか、文書全体を親ブロックとして使用して直接取得します。
+            pt_BR: Dividir texto em parágrafos com base no separador e no comprimento
+              máximo do bloco, usando o texto dividido como bloco pai ou documento
+              completo como bloco pai e diretamente recuperá-lo.
+            zh_Hans: 根据分隔符和最大块长度将文本拆分为段落，使用拆分文本作为检索的父块或整个文档用作父块并直接检索。
+          label:
+            en_US: Parent Mode
+            ja_JP: 親子モード
+            pt_BR: Modo Pai
+            zh_Hans: 父块模式
+          llm_description: Split text into paragraphs based on separator and maximum
+            chunk length, using split text as parent block or entire document as parent
+            block and directly retrieve.
+          max: null
+          min: null
+          name: parent_mode
+          options:
+          - icon: ''
+            label:
+              en_US: Paragraph
+              ja_JP: 段落
+              pt_BR: Parágrafo
+              zh_Hans: 段落
+            value: paragraph
+          - icon: ''
+            label:
+              en_US: Full Document
+              ja_JP: 全文
+              pt_BR: Documento Completo
+              zh_Hans: 全文
+            value: full_doc
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: select
+        - auto_generate: null
+          default: 0
+          form: llm
+          human_description:
+            en_US: Whether to remove extra spaces in the text
+            ja_JP: テキスト内の余分なスペースを削除するかどうか
+            pt_BR: Se deve remover espaços extras no texto
+            zh_Hans: 是否移除文本中的多余空格
+          label:
+            en_US: Remove Extra Spaces
+            ja_JP: 余分なスペースを削除
+            pt_BR: Remover Espaços Extras
+            zh_Hans: 移除多余空格
+          llm_description: Whether to remove extra spaces in the text
+          max: null
+          min: null
+          name: remove_extra_spaces
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        - auto_generate: null
+          default: 0
+          form: llm
+          human_description:
+            en_US: Whether to remove URLs and emails in the text
+            ja_JP: テキスト内のURLやメールアドレスを削除するかどうか
+            pt_BR: Se deve remover URLs e e-mails no texto
+            zh_Hans: 是否移除文本中的URL和电子邮件地址
+          label:
+            en_US: Remove URLs and Emails
+            ja_JP: URLとメールアドレスを削除
+            pt_BR: Remover URLs e E-mails
+            zh_Hans: 移除URL和电子邮件地址
+          llm_description: Whether to remove URLs and emails in the text
+          max: null
+          min: null
+          name: remove_urls_emails
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        params:
+          input_text: ''
+          max_length: ''
+          parent_mode: ''
+          remove_extra_spaces: ''
+          remove_urls_emails: ''
+          separator: ''
+          subchunk_max_length: ''
+          subchunk_separator: ''
+        provider_id: langgenius/parentchild_chunker/parentchild_chunker
+        provider_name: langgenius/parentchild_chunker/parentchild_chunker
+        provider_type: builtin
+        selected: true
+        title: Parent-child Chunker
+        tool_configurations: {}
+        tool_description: Parent-child Chunk Structure
+        tool_label: Parent-child Chunker
+        tool_name: parentchild_chunker
+        tool_parameters:
+          input_text:
+            type: mixed
+            value: '{{#1752565435219.output#}}'
+          max_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - max_chunk_length
+          parent_mode:
+            type: variable
+            value:
+            - rag
+            - shared
+            - parent_mode
+          remove_extra_spaces:
+            type: mixed
+            value: '{{#rag.shared.replace_consecutive_spaces#}}'
+          remove_urls_emails:
+            type: mixed
+            value: '{{#rag.shared.delete_urls_email#}}'
+          separator:
+            type: mixed
+            value: '{{#rag.shared.delimiter#}}'
+          subchunk_max_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - child_max_chunk_length
+          subchunk_separator:
+            type: mixed
+            value: '{{#rag.shared.child_delimiter#}}'
+        type: tool
+      height: 52
+      id: '1752490343805'
+      position:
+        x: 1853.5260563244174
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1853.5260563244174
+        y: 281.3910724383104
+      selected: true
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        datasource_configurations: {}
+        datasource_label: Jina Reader
+        datasource_name: jina_reader
+        datasource_parameters:
+          crawl_sub_pages:
+            type: mixed
+            value: '{{#rag.1752491761974.jina_crawl_sub_pages#}}'
+          limit:
+            type: variable
+            value:
+            - rag
+            - '1752491761974'
+            - jina_limit
+          url:
+            type: mixed
+            value: '{{#rag.1752491761974.jina_url#}}'
+          use_sitemap:
+            type: mixed
+            value: '{{#rag.1752491761974.jina_use_sitemap#}}'
+        plugin_id: langgenius/jina_datasource
+        provider_name: jinareader
+        provider_type: website_crawl
+        selected: false
+        title: Jina Reader
+        type: datasource
+      height: 52
+      id: '1752491761974'
+      position:
+        x: 1067.7526055798794
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1067.7526055798794
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        datasource_configurations: {}
+        datasource_label: Firecrawl
+        datasource_name: crawl
+        datasource_parameters:
+          crawl_subpages:
+            type: mixed
+            value: '{{#rag.1752565402678.firecrawl_crawl_sub_pages#}}'
+          exclude_paths:
+            type: mixed
+            value: '{{#rag.1752565402678.firecrawl_exclude_paths#}}'
+          include_paths:
+            type: mixed
+            value: '{{#rag.1752565402678.firecrawl_include_only_paths#}}'
+          limit:
+            type: variable
+            value:
+            - rag
+            - '1752565402678'
+            - firecrawl_limit
+          max_depth:
+            type: variable
+            value:
+            - rag
+            - '1752565402678'
+            - firecrawl_max_depth
+          only_main_content:
+            type: mixed
+            value: '{{#rag.1752565402678.firecrawl_extract_main_content#}}'
+          url:
+            type: mixed
+            value: '{{#rag.1752565402678.firecrawl_url#}}'
+        plugin_id: langgenius/firecrawl_datasource
+        provider_name: firecrawl
+        provider_type: website_crawl
+        selected: false
+        title: Firecrawl
+        type: datasource
+      height: 52
+      id: '1752565402678'
+      position:
+        x: 1067.7526055798794
+        y: 417.32608398342404
+      positionAbsolute:
+        x: 1067.7526055798794
+        y: 417.32608398342404
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        output_type: string
+        selected: false
+        title: Variable Aggregator
+        type: variable-aggregator
+        variables:
+        - - '1752491761974'
+          - content
+        - - '1752565402678'
+          - content
+      height: 129
+      id: '1752565435219'
+      position:
+        x: 1505.4306671642219
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1505.4306671642219
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    viewport:
+      x: -826.1791044466438
+      y: -71.91725474841303
+      zoom: 0.9980166672552107
+  rag_pipeline_variables:
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752491761974'
+    default_value: null
+    label: URL
+    max_length: 256
+    options: []
+    placeholder: https://docs.dify.ai/en/
+    required: true
+    tooltips: null
+    type: text-input
+    unit: null
+    variable: jina_url
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752491761974'
+    default_value: 10
+    label: Limit
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: null
+    variable: jina_limit
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752491761974'
+    default_value: null
+    label: Crawl sub-pages
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: jina_crawl_sub_pages
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752491761974'
+    default_value: null
+    label: Use sitemap
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: Follow the sitemap to crawl the site. If not, Jina Reader will crawl
+      iteratively based on page relevance, yielding fewer but higher-quality pages.
+    type: checkbox
+    unit: null
+    variable: jina_use_sitemap
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: null
+    label: URL
+    max_length: 256
+    options: []
+    placeholder: https://docs.dify.ai/en/
+    required: true
+    tooltips: null
+    type: text-input
+    unit: null
+    variable: firecrawl_url
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: true
+    label: Crawl sub-pages
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: firecrawl_crawl_sub_pages
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: 10
+    label: Limit
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: null
+    variable: firecrawl_limit
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: null
+    label: Max depth
+    max_length: 48
+    options: []
+    placeholder: ''
+    required: false
+    tooltips: Maximum depth to crawl relative to the entered URL. Depth 0 just scrapes
+      the page of the entered URL, depth 1 scrapes the URL and everything after the
+      entered URL + one /, and so on.
+    type: number
+    unit: null
+    variable: firecrawl_max_depth
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: null
+    label: Exclude paths
+    max_length: 256
+    options: []
+    placeholder: blog/*, /about/*
+    required: false
+    tooltips: null
+    type: text-input
+    unit: null
+    variable: firecrawl_exclude_paths
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: null
+    label: Include only paths
+    max_length: 256
+    options: []
+    placeholder: articles/*
+    required: false
+    tooltips: null
+    type: text-input
+    unit: null
+    variable: firecrawl_include_only_paths
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: null
+    label: Extract only main content
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: firecrawl_extract_main_content
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: \n\n
+    label: Delimiter
+    max_length: 100
+    options: []
+    placeholder: null
+    required: true
+    tooltips: A delimiter is the character used to separate text. \n\n is recommended
+      for splitting the original document into large parent chunks. You can also use
+      special delimiters defined by yourself.
+    type: text-input
+    unit: null
+    variable: delimiter
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: 1024
+    label: Maximum chunk length
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: characters
+    variable: max_chunk_length
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: \n
+    label: Child delimiter
+    max_length: 199
+    options: []
+    placeholder: null
+    required: true
+    tooltips: A delimiter is the character used to separate text. \n is recommended
+      for splitting parent chunks into small child chunks. You can also use special
+      delimiters defined by yourself.
+    type: text-input
+    unit: null
+    variable: child_delimiter
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: 512
+    label: Child max chunk length
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: characters
+    variable: child_max_chunk_length
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: paragraph
+    label: Parent mode
+    max_length: 48
+    options:
+    - full_doc
+    - paragraph
+    placeholder: null
+    required: true
+    tooltips: null
+    type: select
+    unit: null
+    variable: parent_mode
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Replace consecutive spaces, newlines and tabs
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: replace_consecutive_spaces
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Delete all URLs and email addresses
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: delete_urls_email