dify
@@ -0,0 +1,22 @@
from collections.abc import Mapping
from typing import Any

from pydantic import BaseModel


class DatasourceNodeRunApiEntity(BaseModel):
    pipeline_id: str
    node_id: str
    inputs: dict[str, Any]
    datasource_type: str
    credential_id: str | None = None
    is_published: bool


class PipelineRunApiEntity(BaseModel):
    inputs: Mapping[str, Any]
    datasource_type: str
    datasource_info_list: list[Mapping[str, Any]]
    start_node_id: str
    is_published: bool
    response_mode: str
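For reference, a minimal sketch of how the run entity above might be populated by a calling controller; the concrete field values are illustrative assumptions, not part of this commit.

# Illustrative only: building a PipelineRunApiEntity from a hypothetical request payload.
run_args = PipelineRunApiEntity(
    inputs={"query": "index quarterly reports"},  # assumed example input
    datasource_type="online_document",  # assumed datasource type string
    datasource_info_list=[{"workspace_id": "w-1", "page_id": "p-42"}],
    start_node_id="datasource-node-1",
    is_published=True,
    response_mode="streaming",
)
print(run_args.model_dump())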
dify/api/services/rag_pipeline/pipeline_generate_service.py (new file, 115 lines)
@@ -0,0 +1,115 @@
from collections.abc import Mapping
from typing import Any, Union

from configs import dify_config
from core.app.apps.pipeline.pipeline_generator import PipelineGenerator
from core.app.entities.app_invoke_entities import InvokeFrom
from extensions.ext_database import db
from models.dataset import Document, Pipeline
from models.model import Account, App, EndUser
from models.workflow import Workflow
from services.rag_pipeline.rag_pipeline import RagPipelineService


class PipelineGenerateService:
    @classmethod
    def generate(
        cls,
        pipeline: Pipeline,
        user: Union[Account, EndUser],
        args: Mapping[str, Any],
        invoke_from: InvokeFrom,
        streaming: bool = True,
    ):
        """
        Pipeline Content Generate
        :param pipeline: pipeline
        :param user: user
        :param args: args
        :param invoke_from: invoke from
        :param streaming: streaming
        :return:
        """
        try:
            workflow = cls._get_workflow(pipeline, invoke_from)
            if original_document_id := args.get("original_document_id"):
                # update document status to waiting
                cls.update_document_status(original_document_id)
            return PipelineGenerator.convert_to_event_stream(
                PipelineGenerator().generate(
                    pipeline=pipeline,
                    workflow=workflow,
                    user=user,
                    args=args,
                    invoke_from=invoke_from,
                    streaming=streaming,
                    call_depth=0,
                    workflow_thread_pool_id=None,
                ),
            )

        except Exception:
            raise

    @staticmethod
    def _get_max_active_requests(app_model: App) -> int:
        max_active_requests = app_model.max_active_requests
        if max_active_requests is None:
            max_active_requests = int(dify_config.APP_MAX_ACTIVE_REQUESTS)
        return max_active_requests

    @classmethod
    def generate_single_iteration(
        cls, pipeline: Pipeline, user: Account, node_id: str, args: Any, streaming: bool = True
    ):
        workflow = cls._get_workflow(pipeline, InvokeFrom.DEBUGGER)
        return PipelineGenerator.convert_to_event_stream(
            PipelineGenerator().single_iteration_generate(
                pipeline=pipeline, workflow=workflow, node_id=node_id, user=user, args=args, streaming=streaming
            )
        )

    @classmethod
    def generate_single_loop(cls, pipeline: Pipeline, user: Account, node_id: str, args: Any, streaming: bool = True):
        workflow = cls._get_workflow(pipeline, InvokeFrom.DEBUGGER)
        return PipelineGenerator.convert_to_event_stream(
            PipelineGenerator().single_loop_generate(
                pipeline=pipeline, workflow=workflow, node_id=node_id, user=user, args=args, streaming=streaming
            )
        )

    @classmethod
    def _get_workflow(cls, pipeline: Pipeline, invoke_from: InvokeFrom) -> Workflow:
        """
        Get workflow
        :param pipeline: pipeline
        :param invoke_from: invoke from
        :return:
        """
        rag_pipeline_service = RagPipelineService()
        if invoke_from == InvokeFrom.DEBUGGER:
            # fetch draft workflow by app_model
            workflow = rag_pipeline_service.get_draft_workflow(pipeline=pipeline)

            if not workflow:
                raise ValueError("Workflow not initialized")
        else:
            # fetch published workflow by app_model
            workflow = rag_pipeline_service.get_published_workflow(pipeline=pipeline)

            if not workflow:
                raise ValueError("Workflow not published")

        return workflow

    @classmethod
    def update_document_status(cls, document_id: str):
        """
        Update document status to waiting
        :param document_id: document id
        """
        document = db.session.query(Document).where(Document.id == document_id).first()
        if document:
            document.indexing_status = "waiting"
            db.session.add(document)
            db.session.commit()
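A hedged usage sketch of the service above: the caller hands over a pipeline, a user, and raw args, then consumes the returned event stream. The `pipeline` and `current_user` objects are placeholders loaded elsewhere; iterating the return value assumes convert_to_event_stream yields serialized events.

# Illustrative only: invoking the generate service and draining the event stream.
response = PipelineGenerateService.generate(
    pipeline=pipeline,  # assumed: a Pipeline row loaded by the calling controller
    user=current_user,  # assumed: an Account or EndUser
    args={"inputs": {}, "datasource_type": "local_file", "start_node_id": "node-1"},
    invoke_from=InvokeFrom.DEBUGGER,
    streaming=True,
)
for chunk in response:
    # assuming convert_to_event_stream yields SSE-style serialized events
    print(chunk)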
@@ -0,0 +1,63 @@
import json
from os import path
from pathlib import Path

from flask import current_app

from services.rag_pipeline.pipeline_template.pipeline_template_base import PipelineTemplateRetrievalBase
from services.rag_pipeline.pipeline_template.pipeline_template_type import PipelineTemplateType


class BuiltInPipelineTemplateRetrieval(PipelineTemplateRetrievalBase):
    """
    Retrieve pipeline templates from the built-in source, located at constants/pipeline_templates.json.
    """

    builtin_data: dict | None = None

    def get_type(self) -> str:
        return PipelineTemplateType.BUILTIN

    def get_pipeline_templates(self, language: str) -> dict:
        result = self.fetch_pipeline_templates_from_builtin(language)
        return result

    def get_pipeline_template_detail(self, template_id: str):
        result = self.fetch_pipeline_template_detail_from_builtin(template_id)
        return result

    @classmethod
    def _get_builtin_data(cls) -> dict:
        """
        Get builtin data.
        :return:
        """
        if cls.builtin_data:
            return cls.builtin_data

        root_path = current_app.root_path
        cls.builtin_data = json.loads(
            Path(path.join(root_path, "constants", "pipeline_templates.json")).read_text(encoding="utf-8")
        )

        return cls.builtin_data or {}

    @classmethod
    def fetch_pipeline_templates_from_builtin(cls, language: str) -> dict:
        """
        Fetch pipeline templates from builtin.
        :param language: language
        :return:
        """
        builtin_data: dict[str, dict[str, dict]] = cls._get_builtin_data()
        return builtin_data.get("pipeline_templates", {}).get(language, {})

    @classmethod
    def fetch_pipeline_template_detail_from_builtin(cls, template_id: str) -> dict | None:
        """
        Fetch pipeline template detail from builtin.
        :param template_id: Template ID
        :return:
        """
        builtin_data: dict[str, dict[str, dict]] = cls._get_builtin_data()
        return builtin_data.get("pipeline_templates", {}).get(template_id)
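A hedged usage sketch of the class above: because _get_builtin_data reads from current_app.root_path, calls must run inside a Flask application context. The `app` instance is an assumed placeholder.

# Illustrative only: using the built-in retrieval inside a Flask application context.
retrieval = BuiltInPipelineTemplateRetrieval()
with app.app_context():  # `app` is an assumed Flask application instance
    templates = retrieval.get_pipeline_templates(language="en-US")
    print(retrieval.get_type(), len(templates))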
@@ -0,0 +1,80 @@
import yaml

from extensions.ext_database import db
from libs.login import current_account_with_tenant
from models.dataset import PipelineCustomizedTemplate
from services.rag_pipeline.pipeline_template.pipeline_template_base import PipelineTemplateRetrievalBase
from services.rag_pipeline.pipeline_template.pipeline_template_type import PipelineTemplateType


class CustomizedPipelineTemplateRetrieval(PipelineTemplateRetrievalBase):
    """
    Retrieve customized pipeline templates from the database.
    """

    def get_pipeline_templates(self, language: str) -> dict:
        _, current_tenant_id = current_account_with_tenant()
        result = self.fetch_pipeline_templates_from_customized(tenant_id=current_tenant_id, language=language)
        return result

    def get_pipeline_template_detail(self, template_id: str):
        result = self.fetch_pipeline_template_detail_from_db(template_id)
        return result

    def get_type(self) -> str:
        return PipelineTemplateType.CUSTOMIZED

    @classmethod
    def fetch_pipeline_templates_from_customized(cls, tenant_id: str, language: str) -> dict:
        """
        Fetch pipeline templates from db.
        :param tenant_id: tenant id
        :param language: language
        :return:
        """
        pipeline_customized_templates = (
            db.session.query(PipelineCustomizedTemplate)
            .where(PipelineCustomizedTemplate.tenant_id == tenant_id, PipelineCustomizedTemplate.language == language)
            .order_by(PipelineCustomizedTemplate.position.asc(), PipelineCustomizedTemplate.created_at.desc())
            .all()
        )
        recommended_pipelines_results = []
        for pipeline_customized_template in pipeline_customized_templates:
            recommended_pipeline_result = {
                "id": pipeline_customized_template.id,
                "name": pipeline_customized_template.name,
                "description": pipeline_customized_template.description,
                "icon": pipeline_customized_template.icon,
                "position": pipeline_customized_template.position,
                "chunk_structure": pipeline_customized_template.chunk_structure,
            }
            recommended_pipelines_results.append(recommended_pipeline_result)

        return {"pipeline_templates": recommended_pipelines_results}

    @classmethod
    def fetch_pipeline_template_detail_from_db(cls, template_id: str) -> dict | None:
        """
        Fetch pipeline template detail from db.
        :param template_id: Template ID
        :return:
        """
        pipeline_template = (
            db.session.query(PipelineCustomizedTemplate).where(PipelineCustomizedTemplate.id == template_id).first()
        )
        if not pipeline_template:
            return None

        dsl_data = yaml.safe_load(pipeline_template.yaml_content)
        graph_data = dsl_data.get("workflow", {}).get("graph", {})

        return {
            "id": pipeline_template.id,
            "name": pipeline_template.name,
            "icon_info": pipeline_template.icon,
            "description": pipeline_template.description,
            "chunk_structure": pipeline_template.chunk_structure,
            "export_data": pipeline_template.yaml_content,
            "graph": graph_data,
            "created_by": pipeline_template.created_user_name,
        }
@@ -0,0 +1,77 @@
import yaml

from extensions.ext_database import db
from models.dataset import PipelineBuiltInTemplate
from services.rag_pipeline.pipeline_template.pipeline_template_base import PipelineTemplateRetrievalBase
from services.rag_pipeline.pipeline_template.pipeline_template_type import PipelineTemplateType


class DatabasePipelineTemplateRetrieval(PipelineTemplateRetrievalBase):
    """
    Retrieve pipeline templates from the database.
    """

    def get_pipeline_templates(self, language: str) -> dict:
        result = self.fetch_pipeline_templates_from_db(language)
        return result

    def get_pipeline_template_detail(self, template_id: str):
        result = self.fetch_pipeline_template_detail_from_db(template_id)
        return result

    def get_type(self) -> str:
        return PipelineTemplateType.DATABASE

    @classmethod
    def fetch_pipeline_templates_from_db(cls, language: str) -> dict:
        """
        Fetch pipeline templates from db.
        :param language: language
        :return:
        """

        pipeline_built_in_templates: list[PipelineBuiltInTemplate] = (
            db.session.query(PipelineBuiltInTemplate).where(PipelineBuiltInTemplate.language == language).all()
        )

        recommended_pipelines_results = []
        for pipeline_built_in_template in pipeline_built_in_templates:
            recommended_pipeline_result = {
                "id": pipeline_built_in_template.id,
                "name": pipeline_built_in_template.name,
                "description": pipeline_built_in_template.description,
                "icon": pipeline_built_in_template.icon,
                "copyright": pipeline_built_in_template.copyright,
                "privacy_policy": pipeline_built_in_template.privacy_policy,
                "position": pipeline_built_in_template.position,
                "chunk_structure": pipeline_built_in_template.chunk_structure,
            }
            recommended_pipelines_results.append(recommended_pipeline_result)

        return {"pipeline_templates": recommended_pipelines_results}

    @classmethod
    def fetch_pipeline_template_detail_from_db(cls, template_id: str) -> dict | None:
        """
        Fetch pipeline template detail from db.
        :param template_id: Template ID
        :return:
        """
        # is in public recommended list
        pipeline_template = (
            db.session.query(PipelineBuiltInTemplate).where(PipelineBuiltInTemplate.id == template_id).first()
        )

        if not pipeline_template:
            return None
        dsl_data = yaml.safe_load(pipeline_template.yaml_content)
        graph_data = dsl_data.get("workflow", {}).get("graph", {})
        return {
            "id": pipeline_template.id,
            "name": pipeline_template.name,
            "icon_info": pipeline_template.icon,
            "description": pipeline_template.description,
            "chunk_structure": pipeline_template.chunk_structure,
            "export_data": pipeline_template.yaml_content,
            "graph": graph_data,
        }
@@ -0,0 +1,17 @@
from abc import ABC, abstractmethod


class PipelineTemplateRetrievalBase(ABC):
    """Interface for pipeline template retrieval."""

    @abstractmethod
    def get_pipeline_templates(self, language: str) -> dict:
        raise NotImplementedError

    @abstractmethod
    def get_pipeline_template_detail(self, template_id: str) -> dict | None:
        raise NotImplementedError

    @abstractmethod
    def get_type(self) -> str:
        raise NotImplementedError
@@ -0,0 +1,26 @@
from services.rag_pipeline.pipeline_template.built_in.built_in_retrieval import BuiltInPipelineTemplateRetrieval
from services.rag_pipeline.pipeline_template.customized.customized_retrieval import CustomizedPipelineTemplateRetrieval
from services.rag_pipeline.pipeline_template.database.database_retrieval import DatabasePipelineTemplateRetrieval
from services.rag_pipeline.pipeline_template.pipeline_template_base import PipelineTemplateRetrievalBase
from services.rag_pipeline.pipeline_template.pipeline_template_type import PipelineTemplateType
from services.rag_pipeline.pipeline_template.remote.remote_retrieval import RemotePipelineTemplateRetrieval


class PipelineTemplateRetrievalFactory:
    @staticmethod
    def get_pipeline_template_factory(mode: str) -> type[PipelineTemplateRetrievalBase]:
        match mode:
            case PipelineTemplateType.REMOTE:
                return RemotePipelineTemplateRetrieval
            case PipelineTemplateType.CUSTOMIZED:
                return CustomizedPipelineTemplateRetrieval
            case PipelineTemplateType.DATABASE:
                return DatabasePipelineTemplateRetrieval
            case PipelineTemplateType.BUILTIN:
                return BuiltInPipelineTemplateRetrieval
            case _:
                raise ValueError(f"invalid fetch recommended apps mode: {mode}")

    @staticmethod
    def get_built_in_pipeline_template_retrieval():
        return BuiltInPipelineTemplateRetrieval
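A hedged sketch of how the factory above might be used by a calling service; the mode value normally comes from configuration elsewhere and the template id is a placeholder.

# Illustrative only: resolving a retrieval class by mode and fetching templates.
retrieval_cls = PipelineTemplateRetrievalFactory.get_pipeline_template_factory(PipelineTemplateType.DATABASE)
retrieval = retrieval_cls()
templates = retrieval.get_pipeline_templates(language="en-US")
detail = retrieval.get_pipeline_template_detail(template_id="some-template-id")  # placeholder id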
@@ -0,0 +1,8 @@
from enum import StrEnum


class PipelineTemplateType(StrEnum):
    REMOTE = "remote"
    DATABASE = "database"
    CUSTOMIZED = "customized"
    BUILTIN = "builtin"
@@ -0,0 +1,67 @@
import logging

import httpx

from configs import dify_config
from services.rag_pipeline.pipeline_template.database.database_retrieval import DatabasePipelineTemplateRetrieval
from services.rag_pipeline.pipeline_template.pipeline_template_base import PipelineTemplateRetrievalBase
from services.rag_pipeline.pipeline_template.pipeline_template_type import PipelineTemplateType

logger = logging.getLogger(__name__)


class RemotePipelineTemplateRetrieval(PipelineTemplateRetrievalBase):
    """
    Retrieve pipeline templates from the Dify official endpoint, falling back to the database on failure.
    """

    def get_pipeline_template_detail(self, template_id: str):
        try:
            result = self.fetch_pipeline_template_detail_from_dify_official(template_id)
        except Exception as e:
            logger.warning("fetch recommended app detail from dify official failed: %r, switch to database.", e)
            result = DatabasePipelineTemplateRetrieval.fetch_pipeline_template_detail_from_db(template_id)
        return result

    def get_pipeline_templates(self, language: str) -> dict:
        try:
            result = self.fetch_pipeline_templates_from_dify_official(language)
        except Exception as e:
            logger.warning("fetch pipeline templates from dify official failed: %r, switch to database.", e)
            result = DatabasePipelineTemplateRetrieval.fetch_pipeline_templates_from_db(language)
        return result

    def get_type(self) -> str:
        return PipelineTemplateType.REMOTE

    @classmethod
    def fetch_pipeline_template_detail_from_dify_official(cls, template_id: str) -> dict | None:
        """
        Fetch pipeline template detail from dify official.
        :param template_id: Template ID
        :return:
        """
        domain = dify_config.HOSTED_FETCH_PIPELINE_TEMPLATES_REMOTE_DOMAIN
        url = f"{domain}/pipeline-templates/{template_id}"
        response = httpx.get(url, timeout=httpx.Timeout(10.0, connect=3.0))
        if response.status_code != 200:
            return None
        data: dict = response.json()
        return data

    @classmethod
    def fetch_pipeline_templates_from_dify_official(cls, language: str) -> dict:
        """
        Fetch pipeline templates from dify official.
        :param language: language
        :return:
        """
        domain = dify_config.HOSTED_FETCH_PIPELINE_TEMPLATES_REMOTE_DOMAIN
        url = f"{domain}/pipeline-templates?language={language}"
        response = httpx.get(url, timeout=httpx.Timeout(10.0, connect=3.0))
        if response.status_code != 200:
            raise ValueError(f"fetch pipeline templates failed, status code: {response.status_code}")

        result: dict = response.json()

        return result
dify/api/services/rag_pipeline/rag_pipeline.py (new file, 1423 lines): file diff suppressed because it is too large
dify/api/services/rag_pipeline/rag_pipeline_dsl_service.py (new file, 945 lines)
@@ -0,0 +1,945 @@
import base64
import hashlib
import json
import logging
import uuid
from collections.abc import Mapping
from datetime import UTC, datetime
from enum import StrEnum
from typing import cast
from urllib.parse import urlparse
from uuid import uuid4

import yaml  # type: ignore
from Crypto.Cipher import AES
from Crypto.Util.Padding import pad, unpad
from flask_login import current_user
from packaging import version
from pydantic import BaseModel, Field
from sqlalchemy import select
from sqlalchemy.orm import Session

from core.helper import ssrf_proxy
from core.helper.name_generator import generate_incremental_name
from core.model_runtime.utils.encoders import jsonable_encoder
from core.plugin.entities.plugin import PluginDependency
from core.workflow.enums import NodeType
from core.workflow.nodes.datasource.entities import DatasourceNodeData
from core.workflow.nodes.knowledge_retrieval.entities import KnowledgeRetrievalNodeData
from core.workflow.nodes.llm.entities import LLMNodeData
from core.workflow.nodes.parameter_extractor.entities import ParameterExtractorNodeData
from core.workflow.nodes.question_classifier.entities import QuestionClassifierNodeData
from core.workflow.nodes.tool.entities import ToolNodeData
from extensions.ext_redis import redis_client
from factories import variable_factory
from models import Account
from models.dataset import Dataset, DatasetCollectionBinding, Pipeline
from models.workflow import Workflow, WorkflowType
from services.entities.knowledge_entities.rag_pipeline_entities import (
    IconInfo,
    KnowledgeConfiguration,
    RagPipelineDatasetCreateEntity,
)
from services.plugin.dependencies_analysis import DependenciesAnalysisService

logger = logging.getLogger(__name__)

IMPORT_INFO_REDIS_KEY_PREFIX = "app_import_info:"
CHECK_DEPENDENCIES_REDIS_KEY_PREFIX = "app_check_dependencies:"
IMPORT_INFO_REDIS_EXPIRY = 10 * 60  # 10 minutes
DSL_MAX_SIZE = 10 * 1024 * 1024  # 10MB
CURRENT_DSL_VERSION = "0.1.0"


class ImportMode(StrEnum):
    YAML_CONTENT = "yaml-content"
    YAML_URL = "yaml-url"


class ImportStatus(StrEnum):
    COMPLETED = "completed"
    COMPLETED_WITH_WARNINGS = "completed-with-warnings"
    PENDING = "pending"
    FAILED = "failed"


class RagPipelineImportInfo(BaseModel):
    id: str
    status: ImportStatus
    pipeline_id: str | None = None
    current_dsl_version: str = CURRENT_DSL_VERSION
    imported_dsl_version: str = ""
    error: str = ""
    dataset_id: str | None = None


class CheckDependenciesResult(BaseModel):
    leaked_dependencies: list[PluginDependency] = Field(default_factory=list)


def _check_version_compatibility(imported_version: str) -> ImportStatus:
    """Determine import status based on version comparison"""
    try:
        current_ver = version.parse(CURRENT_DSL_VERSION)
        imported_ver = version.parse(imported_version)
    except version.InvalidVersion:
        return ImportStatus.FAILED

    # If imported version is newer than current, always return PENDING
    if imported_ver > current_ver:
        return ImportStatus.PENDING

    # If imported version is older than current's major, return PENDING
    if imported_ver.major < current_ver.major:
        return ImportStatus.PENDING

    # If imported version is older than current's minor, return COMPLETED_WITH_WARNINGS
    if imported_ver.minor < current_ver.minor:
        return ImportStatus.COMPLETED_WITH_WARNINGS

    # If imported version equals or is older than current's micro, return COMPLETED
    return ImportStatus.COMPLETED

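A short sketch of the outcomes the helper above produces with CURRENT_DSL_VERSION = "0.1.0"; the sample version strings are illustrative.

# Illustrative only: expected results of _check_version_compatibility against "0.1.0".
assert _check_version_compatibility("0.2.0") == ImportStatus.PENDING  # newer than current
assert _check_version_compatibility("0.1.0") == ImportStatus.COMPLETED  # same version
assert _check_version_compatibility("0.0.9") == ImportStatus.COMPLETED_WITH_WARNINGS  # older minor
assert _check_version_compatibility("not-a-version") == ImportStatus.FAILED  # unparsable version string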

class RagPipelinePendingData(BaseModel):
    import_mode: str
    yaml_content: str
    pipeline_id: str | None


class CheckDependenciesPendingData(BaseModel):
    dependencies: list[PluginDependency]
    pipeline_id: str | None


class RagPipelineDslService:
    def __init__(self, session: Session):
        self._session = session

    def import_rag_pipeline(
        self,
        *,
        account: Account,
        import_mode: str,
        yaml_content: str | None = None,
        yaml_url: str | None = None,
        pipeline_id: str | None = None,
        dataset: Dataset | None = None,
        dataset_name: str | None = None,
        icon_info: IconInfo | None = None,
    ) -> RagPipelineImportInfo:
        """Import a RAG pipeline from YAML content or a URL."""
        import_id = str(uuid.uuid4())

        # Validate import mode
        try:
            mode = ImportMode(import_mode)
        except ValueError:
            raise ValueError(f"Invalid import_mode: {import_mode}")

        # Get YAML content
        content: str = ""
        if mode == ImportMode.YAML_URL:
            if not yaml_url:
                return RagPipelineImportInfo(
                    id=import_id,
                    status=ImportStatus.FAILED,
                    error="yaml_url is required when import_mode is yaml-url",
                )
            try:
                parsed_url = urlparse(yaml_url)
                if (
                    parsed_url.scheme == "https"
                    and parsed_url.netloc == "github.com"
                    and parsed_url.path.endswith((".yml", ".yaml"))
                ):
                    yaml_url = yaml_url.replace("https://github.com", "https://raw.githubusercontent.com")
                    yaml_url = yaml_url.replace("/blob/", "/")
                response = ssrf_proxy.get(yaml_url.strip(), follow_redirects=True, timeout=(10, 10))
                response.raise_for_status()
                content = response.content.decode()

                if len(content) > DSL_MAX_SIZE:
                    return RagPipelineImportInfo(
                        id=import_id,
                        status=ImportStatus.FAILED,
                        error="File size exceeds the limit of 10MB",
                    )

                if not content:
                    return RagPipelineImportInfo(
                        id=import_id,
                        status=ImportStatus.FAILED,
                        error="Empty content from url",
                    )
            except Exception as e:
                return RagPipelineImportInfo(
                    id=import_id,
                    status=ImportStatus.FAILED,
                    error=f"Error fetching YAML from URL: {str(e)}",
                )
        elif mode == ImportMode.YAML_CONTENT:
            if not yaml_content:
                return RagPipelineImportInfo(
                    id=import_id,
                    status=ImportStatus.FAILED,
                    error="yaml_content is required when import_mode is yaml-content",
                )
            content = yaml_content

        # Process YAML content
        try:
            # Parse YAML to validate format
            data = yaml.safe_load(content)
            if not isinstance(data, dict):
                return RagPipelineImportInfo(
                    id=import_id,
                    status=ImportStatus.FAILED,
                    error="Invalid YAML format: content must be a mapping",
                )

            # Validate and fix DSL version
            if not data.get("version"):
                data["version"] = "0.1.0"
            if not data.get("kind") or data.get("kind") != "rag_pipeline":
                data["kind"] = "rag_pipeline"

            imported_version = data.get("version", "0.1.0")
            # check if imported_version is a float-like string
            if not isinstance(imported_version, str):
                raise ValueError(f"Invalid version type, expected str, got {type(imported_version)}")
            status = _check_version_compatibility(imported_version)

            # Extract app data
            pipeline_data = data.get("rag_pipeline")
            if not pipeline_data:
                return RagPipelineImportInfo(
                    id=import_id,
                    status=ImportStatus.FAILED,
                    error="Missing rag_pipeline data in YAML content",
                )

            # If app_id is provided, check if it exists
            pipeline = None
            if pipeline_id:
                stmt = select(Pipeline).where(
                    Pipeline.id == pipeline_id,
                    Pipeline.tenant_id == account.current_tenant_id,
                )
                pipeline = self._session.scalar(stmt)

                if not pipeline:
                    return RagPipelineImportInfo(
                        id=import_id,
                        status=ImportStatus.FAILED,
                        error="Pipeline not found",
                    )
                dataset = pipeline.retrieve_dataset(session=self._session)

            # If major version mismatch, store import info in Redis
            if status == ImportStatus.PENDING:
                pending_data = RagPipelinePendingData(
                    import_mode=import_mode,
                    yaml_content=content,
                    pipeline_id=pipeline_id,
                )
                redis_client.setex(
                    f"{IMPORT_INFO_REDIS_KEY_PREFIX}{import_id}",
                    IMPORT_INFO_REDIS_EXPIRY,
                    pending_data.model_dump_json(),
                )

                return RagPipelineImportInfo(
                    id=import_id,
                    status=status,
                    pipeline_id=pipeline_id,
                    imported_dsl_version=imported_version,
                )

            # Extract dependencies
            dependencies = data.get("dependencies", [])
            check_dependencies_pending_data = None
            if dependencies:
                check_dependencies_pending_data = [PluginDependency.model_validate(d) for d in dependencies]

            # Create or update pipeline
            pipeline = self._create_or_update_pipeline(
                pipeline=pipeline,
                data=data,
                account=account,
                dependencies=check_dependencies_pending_data,
            )
            # create dataset
            name = pipeline.name or "Untitled"
            description = pipeline.description
            if icon_info:
                icon_type = icon_info.icon_type
                icon = icon_info.icon
                icon_background = icon_info.icon_background
                icon_url = icon_info.icon_url
            else:
                icon_type = data.get("rag_pipeline", {}).get("icon_type")
                icon = data.get("rag_pipeline", {}).get("icon")
                icon_background = data.get("rag_pipeline", {}).get("icon_background")
                icon_url = data.get("rag_pipeline", {}).get("icon_url")
            workflow = data.get("workflow", {})
            graph = workflow.get("graph", {})
            nodes = graph.get("nodes", [])
            dataset_id = None
            for node in nodes:
                if node.get("data", {}).get("type") == "knowledge-index":
                    knowledge_configuration = KnowledgeConfiguration.model_validate(node.get("data", {}))
                    if (
                        dataset
                        and pipeline.is_published
                        and dataset.chunk_structure != knowledge_configuration.chunk_structure
                    ):
                        raise ValueError("Chunk structure is not compatible with the published pipeline")
                    if not dataset:
                        datasets = self._session.query(Dataset).filter_by(tenant_id=account.current_tenant_id).all()
                        names = [dataset.name for dataset in datasets]
                        generate_name = generate_incremental_name(names, name)
                        dataset = Dataset(
                            tenant_id=account.current_tenant_id,
                            name=generate_name,
                            description=description,
                            icon_info={
                                "icon_type": icon_type,
                                "icon": icon,
                                "icon_background": icon_background,
                                "icon_url": icon_url,
                            },
                            indexing_technique=knowledge_configuration.indexing_technique,
                            created_by=account.id,
                            retrieval_model=knowledge_configuration.retrieval_model.model_dump(),
                            runtime_mode="rag_pipeline",
                            chunk_structure=knowledge_configuration.chunk_structure,
                        )
                    if knowledge_configuration.indexing_technique == "high_quality":
                        dataset_collection_binding = (
                            self._session.query(DatasetCollectionBinding)
                            .where(
                                DatasetCollectionBinding.provider_name
                                == knowledge_configuration.embedding_model_provider,
                                DatasetCollectionBinding.model_name == knowledge_configuration.embedding_model,
                                DatasetCollectionBinding.type == "dataset",
                            )
                            .order_by(DatasetCollectionBinding.created_at)
                            .first()
                        )

                        if not dataset_collection_binding:
                            dataset_collection_binding = DatasetCollectionBinding(
                                provider_name=knowledge_configuration.embedding_model_provider,
                                model_name=knowledge_configuration.embedding_model,
                                collection_name=Dataset.gen_collection_name_by_id(str(uuid.uuid4())),
                                type="dataset",
                            )
                            self._session.add(dataset_collection_binding)
                            self._session.commit()
                        dataset_collection_binding_id = dataset_collection_binding.id
                        dataset.collection_binding_id = dataset_collection_binding_id
                        dataset.embedding_model = knowledge_configuration.embedding_model
                        dataset.embedding_model_provider = knowledge_configuration.embedding_model_provider
                    elif knowledge_configuration.indexing_technique == "economy":
                        dataset.keyword_number = knowledge_configuration.keyword_number
                    dataset.pipeline_id = pipeline.id
                    self._session.add(dataset)
                    self._session.commit()
                    dataset_id = dataset.id
            if not dataset_id:
                raise ValueError("DSL is not valid, please check the Knowledge Index node.")

            return RagPipelineImportInfo(
                id=import_id,
                status=status,
                pipeline_id=pipeline.id,
                dataset_id=dataset_id,
                imported_dsl_version=imported_version,
            )

        except yaml.YAMLError as e:
            return RagPipelineImportInfo(
                id=import_id,
                status=ImportStatus.FAILED,
                error=f"Invalid YAML format: {str(e)}",
            )

        except Exception as e:
            logger.exception("Failed to import app")
            return RagPipelineImportInfo(
                id=import_id,
                status=ImportStatus.FAILED,
                error=str(e),
            )

    def confirm_import(self, *, import_id: str, account: Account) -> RagPipelineImportInfo:
        """
        Confirm an import that requires confirmation
        """
        redis_key = f"{IMPORT_INFO_REDIS_KEY_PREFIX}{import_id}"
        pending_data = redis_client.get(redis_key)

        if not pending_data:
            return RagPipelineImportInfo(
                id=import_id,
                status=ImportStatus.FAILED,
                error="Import information expired or does not exist",
            )

        try:
            if not isinstance(pending_data, str | bytes):
                return RagPipelineImportInfo(
                    id=import_id,
                    status=ImportStatus.FAILED,
                    error="Invalid import information",
                )
            pending_data = RagPipelinePendingData.model_validate_json(pending_data)
            data = yaml.safe_load(pending_data.yaml_content)

            pipeline = None
            if pending_data.pipeline_id:
                stmt = select(Pipeline).where(
                    Pipeline.id == pending_data.pipeline_id,
                    Pipeline.tenant_id == account.current_tenant_id,
                )
                pipeline = self._session.scalar(stmt)

            # Create or update app
            pipeline = self._create_or_update_pipeline(
                pipeline=pipeline,
                data=data,
                account=account,
            )
            dataset = pipeline.retrieve_dataset(session=self._session)

            # create dataset
            name = pipeline.name
            description = pipeline.description
            icon_type = data.get("rag_pipeline", {}).get("icon_type")
            icon = data.get("rag_pipeline", {}).get("icon")
            icon_background = data.get("rag_pipeline", {}).get("icon_background")
            icon_url = data.get("rag_pipeline", {}).get("icon_url")
            workflow = data.get("workflow", {})
            graph = workflow.get("graph", {})
            nodes = graph.get("nodes", [])
            dataset_id = None
            for node in nodes:
                if node.get("data", {}).get("type") == "knowledge-index":
                    knowledge_configuration = KnowledgeConfiguration.model_validate(node.get("data", {}))
                    if not dataset:
                        dataset = Dataset(
                            tenant_id=account.current_tenant_id,
                            name=name,
                            description=description,
                            icon_info={
                                "icon_type": icon_type,
                                "icon": icon,
                                "icon_background": icon_background,
                                "icon_url": icon_url,
                            },
                            indexing_technique=knowledge_configuration.indexing_technique,
                            created_by=account.id,
                            retrieval_model=knowledge_configuration.retrieval_model.model_dump(),
                            runtime_mode="rag_pipeline",
                            chunk_structure=knowledge_configuration.chunk_structure,
                        )
                    else:
                        dataset.indexing_technique = knowledge_configuration.indexing_technique
                        dataset.retrieval_model = knowledge_configuration.retrieval_model.model_dump()
                        dataset.runtime_mode = "rag_pipeline"
                        dataset.chunk_structure = knowledge_configuration.chunk_structure
                    if knowledge_configuration.indexing_technique == "high_quality":
                        dataset_collection_binding = (
                            self._session.query(DatasetCollectionBinding)
                            .where(
                                DatasetCollectionBinding.provider_name
                                == knowledge_configuration.embedding_model_provider,
                                DatasetCollectionBinding.model_name == knowledge_configuration.embedding_model,
                                DatasetCollectionBinding.type == "dataset",
                            )
                            .order_by(DatasetCollectionBinding.created_at)
                            .first()
                        )

                        if not dataset_collection_binding:
                            dataset_collection_binding = DatasetCollectionBinding(
                                provider_name=knowledge_configuration.embedding_model_provider,
                                model_name=knowledge_configuration.embedding_model,
                                collection_name=Dataset.gen_collection_name_by_id(str(uuid.uuid4())),
                                type="dataset",
                            )
                            self._session.add(dataset_collection_binding)
                            self._session.commit()
                        dataset_collection_binding_id = dataset_collection_binding.id
                        dataset.collection_binding_id = dataset_collection_binding_id
                        dataset.embedding_model = knowledge_configuration.embedding_model
                        dataset.embedding_model_provider = knowledge_configuration.embedding_model_provider
                    elif knowledge_configuration.indexing_technique == "economy":
                        dataset.keyword_number = knowledge_configuration.keyword_number
                    dataset.pipeline_id = pipeline.id
                    self._session.add(dataset)
                    self._session.commit()
                    dataset_id = dataset.id
            if not dataset_id:
                raise ValueError("DSL is not valid, please check the Knowledge Index node.")

            # Delete import info from Redis
            redis_client.delete(redis_key)

            return RagPipelineImportInfo(
                id=import_id,
                status=ImportStatus.COMPLETED,
                pipeline_id=pipeline.id,
                dataset_id=dataset_id,
                current_dsl_version=CURRENT_DSL_VERSION,
                imported_dsl_version=data.get("version", "0.1.0"),
            )

        except Exception as e:
            logger.exception("Error confirming import")
            return RagPipelineImportInfo(
                id=import_id,
                status=ImportStatus.FAILED,
                error=str(e),
            )

    def check_dependencies(
        self,
        *,
        pipeline: Pipeline,
    ) -> CheckDependenciesResult:
        """Check dependencies"""
        # Get dependencies from Redis
        redis_key = f"{CHECK_DEPENDENCIES_REDIS_KEY_PREFIX}{pipeline.id}"
        dependencies = redis_client.get(redis_key)
        if not dependencies:
            return CheckDependenciesResult()

        # Extract dependencies
        dependencies = CheckDependenciesPendingData.model_validate_json(dependencies)

        # Get leaked dependencies
        leaked_dependencies = DependenciesAnalysisService.get_leaked_dependencies(
            tenant_id=pipeline.tenant_id, dependencies=dependencies.dependencies
        )
        return CheckDependenciesResult(
            leaked_dependencies=leaked_dependencies,
        )

    def _create_or_update_pipeline(
        self,
        *,
        pipeline: Pipeline | None,
        data: dict,
        account: Account,
        dependencies: list[PluginDependency] | None = None,
    ) -> Pipeline:
        """Create a new app or update an existing one."""
        if not account.current_tenant_id:
            raise ValueError("Tenant id is required")
        pipeline_data = data.get("rag_pipeline", {})
        # Initialize pipeline based on mode
        workflow_data = data.get("workflow")
        if not workflow_data or not isinstance(workflow_data, dict):
            raise ValueError("Missing workflow data for rag pipeline")

        environment_variables_list = workflow_data.get("environment_variables", [])
        environment_variables = [
            variable_factory.build_environment_variable_from_mapping(obj) for obj in environment_variables_list
        ]
        conversation_variables_list = workflow_data.get("conversation_variables", [])
        conversation_variables = [
            variable_factory.build_conversation_variable_from_mapping(obj) for obj in conversation_variables_list
        ]
        rag_pipeline_variables_list = workflow_data.get("rag_pipeline_variables", [])

        graph = workflow_data.get("graph", {})
        for node in graph.get("nodes", []):
            if node.get("data", {}).get("type", "") == NodeType.KNOWLEDGE_RETRIEVAL:
                dataset_ids = node["data"].get("dataset_ids", [])
                node["data"]["dataset_ids"] = [
                    decrypted_id
                    for dataset_id in dataset_ids
                    if (
                        decrypted_id := self.decrypt_dataset_id(
                            encrypted_data=dataset_id,
                            tenant_id=account.current_tenant_id,
                        )
                    )
                ]

        if pipeline:
            # Update existing pipeline
            pipeline.name = pipeline_data.get("name", pipeline.name)
            pipeline.description = pipeline_data.get("description", pipeline.description)
            pipeline.updated_by = account.id

        else:
            if account.current_tenant_id is None:
                raise ValueError("Current tenant is not set")

            # Create new app
            pipeline = Pipeline(
                tenant_id=account.current_tenant_id,
                name=pipeline_data.get("name", ""),
                description=pipeline_data.get("description", ""),
                created_by=account.id,
                updated_by=account.id,
            )
            pipeline.id = str(uuid4())

        self._session.add(pipeline)
        self._session.commit()
        # save dependencies
        if dependencies:
            redis_client.setex(
                f"{CHECK_DEPENDENCIES_REDIS_KEY_PREFIX}{pipeline.id}",
                IMPORT_INFO_REDIS_EXPIRY,
                CheckDependenciesPendingData(pipeline_id=pipeline.id, dependencies=dependencies).model_dump_json(),
            )
        workflow = (
            self._session.query(Workflow)
            .where(
                Workflow.tenant_id == pipeline.tenant_id,
                Workflow.app_id == pipeline.id,
                Workflow.version == "draft",
            )
            .first()
        )

        # create draft workflow if not found
        if not workflow:
            workflow = Workflow(
                tenant_id=pipeline.tenant_id,
                app_id=pipeline.id,
                features="{}",
                type=WorkflowType.RAG_PIPELINE,
                version="draft",
                graph=json.dumps(graph),
                created_by=account.id,
                environment_variables=environment_variables,
                conversation_variables=conversation_variables,
                rag_pipeline_variables=rag_pipeline_variables_list,
            )
            self._session.add(workflow)
            self._session.flush()
            pipeline.workflow_id = workflow.id
        else:
            workflow.graph = json.dumps(graph)
            workflow.updated_by = account.id
            workflow.updated_at = datetime.now(UTC).replace(tzinfo=None)
            workflow.environment_variables = environment_variables
            workflow.conversation_variables = conversation_variables
            workflow.rag_pipeline_variables = rag_pipeline_variables_list
        # commit db session changes
        self._session.commit()

        return pipeline

    def export_rag_pipeline_dsl(self, pipeline: Pipeline, include_secret: bool = False) -> str:
        """
        Export pipeline
        :param pipeline: Pipeline instance
        :param include_secret: Whether include secret variable
        :return:
        """
        dataset = pipeline.retrieve_dataset(session=self._session)
        if not dataset:
            raise ValueError("Missing dataset for rag pipeline")
        icon_info = dataset.icon_info
        export_data = {
            "version": CURRENT_DSL_VERSION,
            "kind": "rag_pipeline",
            "rag_pipeline": {
                "name": dataset.name,
                "icon": icon_info.get("icon", "📙") if icon_info else "📙",
                "icon_type": icon_info.get("icon_type", "emoji") if icon_info else "emoji",
                "icon_background": icon_info.get("icon_background", "#FFEAD5") if icon_info else "#FFEAD5",
                "icon_url": icon_info.get("icon_url") if icon_info else None,
                "description": pipeline.description,
            },
        }

        self._append_workflow_export_data(export_data=export_data, pipeline=pipeline, include_secret=include_secret)

        return yaml.dump(export_data, allow_unicode=True)  # type: ignore

    def _append_workflow_export_data(self, *, export_data: dict, pipeline: Pipeline, include_secret: bool) -> None:
        """
        Append workflow export data
        :param export_data: export data
        :param pipeline: Pipeline instance
        """

        workflow = (
            self._session.query(Workflow)
            .where(
                Workflow.tenant_id == pipeline.tenant_id,
                Workflow.app_id == pipeline.id,
                Workflow.version == "draft",
            )
            .first()
        )
        if not workflow:
            raise ValueError("Missing draft workflow configuration, please check.")

        workflow_dict = workflow.to_dict(include_secret=include_secret)
        for node in workflow_dict.get("graph", {}).get("nodes", []):
            node_data = node.get("data", {})
            if not node_data:
                continue
            data_type = node_data.get("type", "")
            if data_type == NodeType.KNOWLEDGE_RETRIEVAL:
                dataset_ids = node_data.get("dataset_ids", [])
                node["data"]["dataset_ids"] = [
                    self.encrypt_dataset_id(dataset_id=dataset_id, tenant_id=pipeline.tenant_id)
                    for dataset_id in dataset_ids
                ]
            # filter credential id from tool node
            if not include_secret and data_type == NodeType.TOOL:
                node_data.pop("credential_id", None)
            # filter credential id from agent node
            if not include_secret and data_type == NodeType.AGENT:
                for tool in node_data.get("agent_parameters", {}).get("tools", {}).get("value", []):
                    tool.pop("credential_id", None)

        export_data["workflow"] = workflow_dict
        dependencies = self._extract_dependencies_from_workflow(workflow)
        export_data["dependencies"] = [
            jsonable_encoder(d.model_dump())
            for d in DependenciesAnalysisService.generate_dependencies(
                tenant_id=pipeline.tenant_id, dependencies=dependencies
            )
        ]

    def _extract_dependencies_from_workflow(self, workflow: Workflow) -> list[str]:
        """
        Extract dependencies from workflow
        :param workflow: Workflow instance
        :return: dependencies list format like ["langgenius/google"]
        """
        graph = workflow.graph_dict
        dependencies = self._extract_dependencies_from_workflow_graph(graph)
        return dependencies

    def _extract_dependencies_from_workflow_graph(self, graph: Mapping) -> list[str]:
        """
        Extract dependencies from workflow graph
        :param graph: Workflow graph
        :return: dependencies list format like ["langgenius/google"]
        """
        dependencies = []
        for node in graph.get("nodes", []):
            try:
                typ = node.get("data", {}).get("type")
                match typ:
                    case NodeType.TOOL:
                        tool_entity = ToolNodeData.model_validate(node["data"])
                        dependencies.append(
                            DependenciesAnalysisService.analyze_tool_dependency(tool_entity.provider_id),
                        )
                    case NodeType.DATASOURCE:
                        datasource_entity = DatasourceNodeData.model_validate(node["data"])
                        if datasource_entity.provider_type != "local_file":
                            dependencies.append(datasource_entity.plugin_id)
                    case NodeType.LLM:
                        llm_entity = LLMNodeData.model_validate(node["data"])
                        dependencies.append(
                            DependenciesAnalysisService.analyze_model_provider_dependency(llm_entity.model.provider),
                        )
                    case NodeType.QUESTION_CLASSIFIER:
                        question_classifier_entity = QuestionClassifierNodeData.model_validate(node["data"])
                        dependencies.append(
                            DependenciesAnalysisService.analyze_model_provider_dependency(
                                question_classifier_entity.model.provider
                            ),
                        )
                    case NodeType.PARAMETER_EXTRACTOR:
                        parameter_extractor_entity = ParameterExtractorNodeData.model_validate(node["data"])
                        dependencies.append(
                            DependenciesAnalysisService.analyze_model_provider_dependency(
                                parameter_extractor_entity.model.provider
                            ),
                        )
                    case NodeType.KNOWLEDGE_INDEX:
                        knowledge_index_entity = KnowledgeConfiguration.model_validate(node["data"])
                        if knowledge_index_entity.indexing_technique == "high_quality":
                            if knowledge_index_entity.embedding_model_provider:
                                dependencies.append(
                                    DependenciesAnalysisService.analyze_model_provider_dependency(
                                        knowledge_index_entity.embedding_model_provider
                                    ),
                                )
                        if knowledge_index_entity.retrieval_model.reranking_mode == "reranking_model":
                            if knowledge_index_entity.retrieval_model.reranking_enable:
                                if (
                                    knowledge_index_entity.retrieval_model.reranking_model
                                    and knowledge_index_entity.retrieval_model.reranking_mode == "reranking_model"
                                ):
                                    if knowledge_index_entity.retrieval_model.reranking_model.reranking_provider_name:
                                        dependencies.append(
                                            DependenciesAnalysisService.analyze_model_provider_dependency(
                                                knowledge_index_entity.retrieval_model.reranking_model.reranking_provider_name
                                            ),
                                        )
                    case NodeType.KNOWLEDGE_RETRIEVAL:
                        knowledge_retrieval_entity = KnowledgeRetrievalNodeData.model_validate(node["data"])
                        if knowledge_retrieval_entity.retrieval_mode == "multiple":
                            if knowledge_retrieval_entity.multiple_retrieval_config:
                                if (
                                    knowledge_retrieval_entity.multiple_retrieval_config.reranking_mode
                                    == "reranking_model"
                                ):
                                    if knowledge_retrieval_entity.multiple_retrieval_config.reranking_model:
                                        dependencies.append(
                                            DependenciesAnalysisService.analyze_model_provider_dependency(
                                                knowledge_retrieval_entity.multiple_retrieval_config.reranking_model.provider
                                            ),
                                        )
                                elif (
                                    knowledge_retrieval_entity.multiple_retrieval_config.reranking_mode
                                    == "weighted_score"
                                ):
                                    if knowledge_retrieval_entity.multiple_retrieval_config.weights:
                                        vector_setting = (
                                            knowledge_retrieval_entity.multiple_retrieval_config.weights.vector_setting
                                        )
                                        dependencies.append(
                                            DependenciesAnalysisService.analyze_model_provider_dependency(
                                                vector_setting.embedding_provider_name
                                            ),
                                        )
                        elif knowledge_retrieval_entity.retrieval_mode == "single":
                            model_config = knowledge_retrieval_entity.single_retrieval_config
                            if model_config:
                                dependencies.append(
                                    DependenciesAnalysisService.analyze_model_provider_dependency(
                                        model_config.model.provider
                                    ),
                                )
                    case _:
                        # TODO: Handle default case or unknown node types
                        pass
            except Exception as e:
                logger.exception("Error extracting node dependency", exc_info=e)

        return dependencies

    @classmethod
    def _extract_dependencies_from_model_config(cls, model_config: Mapping) -> list[str]:
        """
        Extract dependencies from model config
        :param model_config: model config dict
        :return: dependencies list format like ["langgenius/google"]
        """
        dependencies = []

        try:
            # completion model
            model_dict = model_config.get("model", {})
            if model_dict:
                dependencies.append(
                    DependenciesAnalysisService.analyze_model_provider_dependency(model_dict.get("provider", ""))
                )

            # reranking model
            dataset_configs = model_config.get("dataset_configs", {})
            if dataset_configs:
                for dataset_config in dataset_configs.get("datasets", {}).get("datasets", []):
                    if dataset_config.get("reranking_model"):
                        dependencies.append(
                            DependenciesAnalysisService.analyze_model_provider_dependency(
                                dataset_config.get("reranking_model", {})
                                .get("reranking_provider_name", {})
                                .get("provider")
                            )
                        )

            # tools
            agent_configs = model_config.get("agent_mode", {})
            if agent_configs:
                for agent_config in agent_configs.get("tools", []):
                    dependencies.append(
                        DependenciesAnalysisService.analyze_tool_dependency(agent_config.get("provider_id"))
                    )

        except Exception as e:
            logger.exception("Error extracting model config dependency", exc_info=e)

        return dependencies

    @classmethod
    def get_leaked_dependencies(cls, tenant_id: str, dsl_dependencies: list[dict]) -> list[PluginDependency]:
        """
        Returns the leaked dependencies in current workspace
        """
        dependencies = [PluginDependency.model_validate(dep) for dep in dsl_dependencies]
        if not dependencies:
            return []

        return DependenciesAnalysisService.get_leaked_dependencies(tenant_id=tenant_id, dependencies=dependencies)

    def _generate_aes_key(self, tenant_id: str) -> bytes:
        """Generate AES key based on tenant_id"""
        return hashlib.sha256(tenant_id.encode()).digest()

    def encrypt_dataset_id(self, dataset_id: str, tenant_id: str) -> str:
        """Encrypt dataset_id using AES-CBC mode"""
        key = self._generate_aes_key(tenant_id)
        iv = key[:16]
        cipher = AES.new(key, AES.MODE_CBC, iv)
        ct_bytes = cipher.encrypt(pad(dataset_id.encode(), AES.block_size))
        return base64.b64encode(ct_bytes).decode()

    def decrypt_dataset_id(self, encrypted_data: str, tenant_id: str) -> str | None:
        """AES decryption"""
        try:
            key = self._generate_aes_key(tenant_id)
            iv = key[:16]
            cipher = AES.new(key, AES.MODE_CBC, iv)
            pt = unpad(cipher.decrypt(base64.b64decode(encrypted_data)), AES.block_size)
            return pt.decode()
        except Exception:
            return None

    def create_rag_pipeline_dataset(
        self,
        tenant_id: str,
        rag_pipeline_dataset_create_entity: RagPipelineDatasetCreateEntity,
    ):
        if rag_pipeline_dataset_create_entity.name:
            # check if dataset name already exists
            if (
                self._session.query(Dataset)
                .filter_by(name=rag_pipeline_dataset_create_entity.name, tenant_id=tenant_id)
                .first()
            ):
                raise ValueError(f"Dataset with name {rag_pipeline_dataset_create_entity.name} already exists.")
        else:
            # generate a random name as Untitled 1 2 3 ...
            datasets = self._session.query(Dataset).filter_by(tenant_id=tenant_id).all()
            names = [dataset.name for dataset in datasets]
            rag_pipeline_dataset_create_entity.name = generate_incremental_name(
                names,
                "Untitled",
            )

        account = cast(Account, current_user)
        rag_pipeline_import_info: RagPipelineImportInfo = self.import_rag_pipeline(
            account=account,
            import_mode=ImportMode.YAML_CONTENT,
            yaml_content=rag_pipeline_dataset_create_entity.yaml_content,
            dataset=None,
            dataset_name=rag_pipeline_dataset_create_entity.name,
            icon_info=rag_pipeline_dataset_create_entity.icon_info,
        )
        return {
            "id": rag_pipeline_import_info.id,
            "dataset_id": rag_pipeline_import_info.dataset_id,
            "pipeline_id": rag_pipeline_import_info.pipeline_id,
            "status": rag_pipeline_import_info.status,
            "imported_dsl_version": rag_pipeline_import_info.imported_dsl_version,
            "current_dsl_version": rag_pipeline_import_info.current_dsl_version,
            "error": rag_pipeline_import_info.error,
        }
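A hedged round-trip sketch for the dataset id encryption helpers defined in this file; the tenant and dataset ids are placeholders, and the service is constructed with an assumed SQLAlchemy session.

# Illustrative only: encrypt_dataset_id / decrypt_dataset_id round-trip.
# The key is derived from the tenant id (SHA-256) and its first 16 bytes double as the CBC IV,
# so only the same tenant id can recover the plaintext.
service = RagPipelineDslService(session=some_session)  # `some_session` is an assumed sqlalchemy.orm.Session
token = service.encrypt_dataset_id(dataset_id="11111111-2222-3333-4444-555555555555", tenant_id="tenant-a")
assert service.decrypt_dataset_id(encrypted_data=token, tenant_id="tenant-a") == "11111111-2222-3333-4444-555555555555"
# Decrypting with a different tenant id almost always fails the padding check and returns None.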
@@ -0,0 +1,23 @@
from core.plugin.entities.plugin_daemon import PluginDatasourceProviderEntity
from core.plugin.impl.datasource import PluginDatasourceManager
from services.datasource_provider_service import DatasourceProviderService


class RagPipelineManageService:
    @staticmethod
    def list_rag_pipeline_datasources(tenant_id: str) -> list[PluginDatasourceProviderEntity]:
        """
        list rag pipeline datasources
        """

        # get all builtin providers
        manager = PluginDatasourceManager()
        datasources = manager.fetch_datasource_providers(tenant_id)
        for datasource in datasources:
            datasource_provider_service = DatasourceProviderService()
            credentials = datasource_provider_service.get_datasource_credentials(
                tenant_id=tenant_id, provider=datasource.provider, plugin_id=datasource.plugin_id
            )
            if credentials:
                datasource.is_authorized = True
        return datasources
dify/api/services/rag_pipeline/rag_pipeline_task_proxy.py (new file, 106 lines)
@@ -0,0 +1,106 @@
import json
import logging
from collections.abc import Callable, Sequence
from functools import cached_property

from core.app.entities.rag_pipeline_invoke_entities import RagPipelineInvokeEntity
from core.rag.pipeline.queue import TenantIsolatedTaskQueue
from enums.cloud_plan import CloudPlan
from extensions.ext_database import db
from services.feature_service import FeatureService
from services.file_service import FileService
from tasks.rag_pipeline.priority_rag_pipeline_run_task import priority_rag_pipeline_run_task
from tasks.rag_pipeline.rag_pipeline_run_task import rag_pipeline_run_task

logger = logging.getLogger(__name__)


class RagPipelineTaskProxy:
    # Default uploaded file name for rag pipeline invoke entities
    _RAG_PIPELINE_INVOKE_ENTITIES_FILE_NAME = "rag_pipeline_invoke_entities.json"

    def __init__(
        self, dataset_tenant_id: str, user_id: str, rag_pipeline_invoke_entities: Sequence[RagPipelineInvokeEntity]
    ):
        self._dataset_tenant_id = dataset_tenant_id
        self._user_id = user_id
        self._rag_pipeline_invoke_entities = rag_pipeline_invoke_entities
        self._tenant_isolated_task_queue = TenantIsolatedTaskQueue(dataset_tenant_id, "pipeline")

    @cached_property
    def features(self):
        return FeatureService.get_features(self._dataset_tenant_id)

    def _upload_invoke_entities(self) -> str:
        text = [item.model_dump() for item in self._rag_pipeline_invoke_entities]
        # Convert list to proper JSON string
        json_text = json.dumps(text)
        upload_file = FileService(db.engine).upload_text(
            json_text, self._RAG_PIPELINE_INVOKE_ENTITIES_FILE_NAME, self._user_id, self._dataset_tenant_id
        )
        return upload_file.id

    def _send_to_direct_queue(self, upload_file_id: str, task_func: Callable[[str, str], None]):
        logger.info("send file %s to direct queue", upload_file_id)
        task_func.delay(  # type: ignore
            rag_pipeline_invoke_entities_file_id=upload_file_id,
            tenant_id=self._dataset_tenant_id,
        )

    def _send_to_tenant_queue(self, upload_file_id: str, task_func: Callable[[str, str], None]):
        logger.info("send file %s to tenant queue", upload_file_id)
        if self._tenant_isolated_task_queue.get_task_key():
            # Add to waiting queue using List operations (lpush)
            self._tenant_isolated_task_queue.push_tasks([upload_file_id])
            logger.info("push tasks: %s", upload_file_id)
        else:
            # Set flag and execute task
            self._tenant_isolated_task_queue.set_task_waiting_time()
            task_func.delay(  # type: ignore
                rag_pipeline_invoke_entities_file_id=upload_file_id,
                tenant_id=self._dataset_tenant_id,
            )
            logger.info("init tasks: %s", upload_file_id)

    def _send_to_default_tenant_queue(self, upload_file_id: str):
        self._send_to_tenant_queue(upload_file_id, rag_pipeline_run_task)

    def _send_to_priority_tenant_queue(self, upload_file_id: str):
        self._send_to_tenant_queue(upload_file_id, priority_rag_pipeline_run_task)

    def _send_to_priority_direct_queue(self, upload_file_id: str):
        self._send_to_direct_queue(upload_file_id, priority_rag_pipeline_run_task)

    def _dispatch(self):
        upload_file_id = self._upload_invoke_entities()
        if not upload_file_id:
            raise ValueError("upload_file_id is empty")

        logger.info(
            "dispatch args: %s - %s - %s",
            self._dataset_tenant_id,
            self.features.billing.enabled,
            self.features.billing.subscription.plan,
        )

        # dispatch to different pipeline queue with tenant isolation when billing enabled
        if self.features.billing.enabled:
            if self.features.billing.subscription.plan == CloudPlan.SANDBOX:
                # dispatch to normal pipeline queue with tenant isolation for sandbox plan
                self._send_to_default_tenant_queue(upload_file_id)
            else:
                # dispatch to priority pipeline queue with tenant isolation for other plans
                self._send_to_priority_tenant_queue(upload_file_id)
        else:
            # dispatch to priority pipeline queue without tenant isolation for others, e.g.: self-hosted or enterprise
            self._send_to_priority_direct_queue(upload_file_id)

    def delay(self):
        if not self._rag_pipeline_invoke_entities:
            logger.warning(
                "Received empty rag pipeline invoke entities, no tasks delivered: %s %s",
                self._dataset_tenant_id,
                self._user_id,
            )
            return
        self._dispatch()
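A hedged sketch of how RagPipelineTaskProxy above is intended to be driven. Only the proxy and the RagPipelineInvokeEntity import come from this commit; building the invoke entities themselves is elided because their fields are not part of this excerpt.

# Usage sketch; module path taken from the diff header above.
from collections.abc import Sequence

from core.app.entities.rag_pipeline_invoke_entities import RagPipelineInvokeEntity
from services.rag_pipeline.rag_pipeline_task_proxy import RagPipelineTaskProxy


def enqueue_pipeline_runs(
    tenant_id: str, user_id: str, invoke_entities: Sequence[RagPipelineInvokeEntity]
) -> None:
    # delay() is a no-op for an empty batch; otherwise the batch is uploaded as JSON and
    # routed to the sandbox tenant queue, the priority tenant queue, or the priority
    # direct queue depending on the tenant's billing plan.
    RagPipelineTaskProxy(tenant_id, user_id, invoke_entities).delay()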
387
dify/api/services/rag_pipeline/rag_pipeline_transform_service.py
Normal file
@@ -0,0 +1,387 @@
import json
import logging
from datetime import UTC, datetime
from pathlib import Path
from uuid import uuid4

import yaml
from flask_login import current_user

from constants import DOCUMENT_EXTENSIONS
from core.plugin.impl.plugin import PluginInstaller
from core.rag.retrieval.retrieval_methods import RetrievalMethod
from extensions.ext_database import db
from factories import variable_factory
from models.dataset import Dataset, Document, DocumentPipelineExecutionLog, Pipeline
from models.model import UploadFile
from models.workflow import Workflow, WorkflowType
from services.entities.knowledge_entities.rag_pipeline_entities import KnowledgeConfiguration, RetrievalSetting
from services.plugin.plugin_migration import PluginMigration
from services.plugin.plugin_service import PluginService

logger = logging.getLogger(__name__)


class RagPipelineTransformService:
    def transform_dataset(self, dataset_id: str):
        dataset = db.session.query(Dataset).where(Dataset.id == dataset_id).first()
        if not dataset:
            raise ValueError("Dataset not found")
        if dataset.pipeline_id and dataset.runtime_mode == "rag_pipeline":
            return {
                "pipeline_id": dataset.pipeline_id,
                "dataset_id": dataset_id,
                "status": "success",
            }
        if dataset.provider != "vendor":
            raise ValueError("External dataset is not supported")
        datasource_type = dataset.data_source_type
        indexing_technique = dataset.indexing_technique

        if not datasource_type and not indexing_technique:
            return self._transform_to_empty_pipeline(dataset)

        doc_form = dataset.doc_form
        if not doc_form:
            return self._transform_to_empty_pipeline(dataset)
        retrieval_model = dataset.retrieval_model
        pipeline_yaml = self._get_transform_yaml(doc_form, datasource_type, indexing_technique)
        # install any missing plugin dependencies declared by the template
        self._deal_dependencies(pipeline_yaml, dataset.tenant_id)
        # Extract workflow data
        workflow_data = pipeline_yaml.get("workflow")
        if not workflow_data:
            raise ValueError("Missing workflow data for rag pipeline")
        graph = workflow_data.get("graph", {})
        nodes = graph.get("nodes", [])
        new_nodes = []

        for node in nodes:
            if (
                node.get("data", {}).get("type") == "datasource"
                and node.get("data", {}).get("provider_type") == "local_file"
            ):
                node = self._deal_file_extensions(node)
            if node.get("data", {}).get("type") == "knowledge-index":
                node = self._deal_knowledge_index(dataset, doc_form, indexing_technique, retrieval_model, node)
            new_nodes.append(node)
        if new_nodes:
            graph["nodes"] = new_nodes
            workflow_data["graph"] = graph
            pipeline_yaml["workflow"] = workflow_data
        # create pipeline
        pipeline = self._create_pipeline(pipeline_yaml)

        # save chunk structure to dataset
        if doc_form == "hierarchical_model":
            dataset.chunk_structure = "hierarchical_model"
        elif doc_form == "text_model":
            dataset.chunk_structure = "text_model"
        else:
            raise ValueError("Unsupported doc form")

        dataset.runtime_mode = "rag_pipeline"
        dataset.pipeline_id = pipeline.id

        # rewrite existing document data to the new datasource format
        self._deal_document_data(dataset)

        db.session.commit()
        return {
            "pipeline_id": pipeline.id,
            "dataset_id": dataset_id,
            "status": "success",
        }

    def _get_transform_yaml(self, doc_form: str, datasource_type: str, indexing_technique: str | None):
        pipeline_yaml = {}
        if doc_form == "text_model":
            match datasource_type:
                case "upload_file":
                    if indexing_technique == "high_quality":
                        # get graph from transform/file-general-high-quality.yml
                        with open(f"{Path(__file__).parent}/transform/file-general-high-quality.yml") as f:
                            pipeline_yaml = yaml.safe_load(f)
                    if indexing_technique == "economy":
                        # get graph from transform/file-general-economy.yml
                        with open(f"{Path(__file__).parent}/transform/file-general-economy.yml") as f:
                            pipeline_yaml = yaml.safe_load(f)
                case "notion_import":
                    if indexing_technique == "high_quality":
                        # get graph from transform/notion-general-high-quality.yml
                        with open(f"{Path(__file__).parent}/transform/notion-general-high-quality.yml") as f:
                            pipeline_yaml = yaml.safe_load(f)
                    if indexing_technique == "economy":
                        # get graph from transform/notion-general-economy.yml
                        with open(f"{Path(__file__).parent}/transform/notion-general-economy.yml") as f:
                            pipeline_yaml = yaml.safe_load(f)
                case "website_crawl":
                    if indexing_technique == "high_quality":
                        # get graph from transform/website-crawl-general-high-quality.yml
                        with open(f"{Path(__file__).parent}/transform/website-crawl-general-high-quality.yml") as f:
                            pipeline_yaml = yaml.safe_load(f)
                    if indexing_technique == "economy":
                        # get graph from transform/website-crawl-general-economy.yml
                        with open(f"{Path(__file__).parent}/transform/website-crawl-general-economy.yml") as f:
                            pipeline_yaml = yaml.safe_load(f)
                case _:
                    raise ValueError("Unsupported datasource type")
        elif doc_form == "hierarchical_model":
            match datasource_type:
                case "upload_file":
                    # get graph from transform/file-parentchild.yml
                    with open(f"{Path(__file__).parent}/transform/file-parentchild.yml") as f:
                        pipeline_yaml = yaml.safe_load(f)
                case "notion_import":
                    # get graph from transform/notion-parentchild.yml
                    with open(f"{Path(__file__).parent}/transform/notion-parentchild.yml") as f:
                        pipeline_yaml = yaml.safe_load(f)
                case "website_crawl":
                    # get graph from transform/website-crawl-parentchild.yml
                    with open(f"{Path(__file__).parent}/transform/website-crawl-parentchild.yml") as f:
                        pipeline_yaml = yaml.safe_load(f)
                case _:
                    raise ValueError("Unsupported datasource type")
        else:
            raise ValueError("Unsupported doc form")
        return pipeline_yaml

    def _deal_file_extensions(self, node: dict):
        file_extensions = node.get("data", {}).get("fileExtensions", [])
        if not file_extensions:
            return node
        node["data"]["fileExtensions"] = [ext.lower() for ext in file_extensions if ext in DOCUMENT_EXTENSIONS]
        return node

    def _deal_knowledge_index(
        self, dataset: Dataset, doc_form: str, indexing_technique: str | None, retrieval_model: dict, node: dict
    ):
        knowledge_configuration_dict = node.get("data", {})
        knowledge_configuration = KnowledgeConfiguration.model_validate(knowledge_configuration_dict)

        if indexing_technique == "high_quality":
            knowledge_configuration.embedding_model = dataset.embedding_model
            knowledge_configuration.embedding_model_provider = dataset.embedding_model_provider
        if retrieval_model:
            retrieval_setting = RetrievalSetting.model_validate(retrieval_model)
            if indexing_technique == "economy":
                retrieval_setting.search_method = RetrievalMethod.KEYWORD_SEARCH
            knowledge_configuration.retrieval_model = retrieval_setting
        else:
            dataset.retrieval_model = knowledge_configuration.retrieval_model.model_dump()

        knowledge_configuration_dict.update(knowledge_configuration.model_dump())
        node["data"] = knowledge_configuration_dict
        return node

    def _create_pipeline(
        self,
        data: dict,
    ) -> Pipeline:
        """Create a new rag pipeline with draft and published workflows from the template data."""
        pipeline_data = data.get("rag_pipeline", {})
        # Initialize pipeline from the imported workflow data
        workflow_data = data.get("workflow")
        if not workflow_data or not isinstance(workflow_data, dict):
            raise ValueError("Missing workflow data for rag pipeline")

        environment_variables_list = workflow_data.get("environment_variables", [])
        environment_variables = [
            variable_factory.build_environment_variable_from_mapping(obj) for obj in environment_variables_list
        ]
        conversation_variables_list = workflow_data.get("conversation_variables", [])
        conversation_variables = [
            variable_factory.build_conversation_variable_from_mapping(obj) for obj in conversation_variables_list
        ]
        rag_pipeline_variables_list = workflow_data.get("rag_pipeline_variables", [])

        graph = workflow_data.get("graph", {})

        # Create new pipeline
        pipeline = Pipeline(
            tenant_id=current_user.current_tenant_id,
            name=pipeline_data.get("name", ""),
            description=pipeline_data.get("description", ""),
            created_by=current_user.id,
            updated_by=current_user.id,
            is_published=True,
            is_public=True,
        )
        pipeline.id = str(uuid4())

        db.session.add(pipeline)
        db.session.flush()
        # create draft workflow
        draft_workflow = Workflow(
            tenant_id=pipeline.tenant_id,
            app_id=pipeline.id,
            features="{}",
            type=WorkflowType.RAG_PIPELINE,
            version="draft",
            graph=json.dumps(graph),
            created_by=current_user.id,
            environment_variables=environment_variables,
            conversation_variables=conversation_variables,
            rag_pipeline_variables=rag_pipeline_variables_list,
        )
        published_workflow = Workflow(
            tenant_id=pipeline.tenant_id,
            app_id=pipeline.id,
            features="{}",
            type=WorkflowType.RAG_PIPELINE,
            version=str(datetime.now(UTC).replace(tzinfo=None)),
            graph=json.dumps(graph),
            created_by=current_user.id,
            environment_variables=environment_variables,
            conversation_variables=conversation_variables,
            rag_pipeline_variables=rag_pipeline_variables_list,
        )
        db.session.add(draft_workflow)
        db.session.add(published_workflow)
        db.session.flush()
        pipeline.workflow_id = published_workflow.id
        db.session.add(pipeline)
        return pipeline

    def _deal_dependencies(self, pipeline_yaml: dict, tenant_id: str):
        installer_manager = PluginInstaller()
        installed_plugins = installer_manager.list_plugins(tenant_id)

        plugin_migration = PluginMigration()

        installed_plugins_ids = [plugin.plugin_id for plugin in installed_plugins]
        dependencies = pipeline_yaml.get("dependencies", [])
        need_install_plugin_unique_identifiers = []
        for dependency in dependencies:
            if dependency.get("type") == "marketplace":
                plugin_unique_identifier = dependency.get("value", {}).get("plugin_unique_identifier")
                plugin_id = plugin_unique_identifier.split(":")[0]
                if plugin_id not in installed_plugins_ids:
                    plugin_unique_identifier = plugin_migration._fetch_plugin_unique_identifier(plugin_id)  # type: ignore
                    if plugin_unique_identifier:
                        need_install_plugin_unique_identifiers.append(plugin_unique_identifier)
        if need_install_plugin_unique_identifiers:
            logger.debug("Installing missing pipeline plugins %s", need_install_plugin_unique_identifiers)
            PluginService.install_from_marketplace_pkg(tenant_id, need_install_plugin_unique_identifiers)

    def _transform_to_empty_pipeline(self, dataset: Dataset):
        pipeline = Pipeline(
            tenant_id=dataset.tenant_id,
            name=dataset.name,
            description=dataset.description,
            created_by=current_user.id,
        )
        db.session.add(pipeline)
        db.session.flush()

        dataset.pipeline_id = pipeline.id
        dataset.runtime_mode = "rag_pipeline"
        dataset.updated_by = current_user.id
        dataset.updated_at = datetime.now(UTC).replace(tzinfo=None)
        db.session.add(dataset)
        db.session.commit()
        return {
            "pipeline_id": pipeline.id,
            "dataset_id": dataset.id,
            "status": "success",
        }

    def _deal_document_data(self, dataset: Dataset):
        file_node_id = "1752479895761"
        notion_node_id = "1752489759475"
        jina_node_id = "1752491761974"
        firecrawl_node_id = "1752565402678"

        documents = db.session.query(Document).where(Document.dataset_id == dataset.id).all()

        for document in documents:
            data_source_info_dict = document.data_source_info_dict
            if not data_source_info_dict:
                continue
            if document.data_source_type == "upload_file":
                document.data_source_type = "local_file"
                file_id = data_source_info_dict.get("upload_file_id")
                if file_id:
                    file = db.session.query(UploadFile).where(UploadFile.id == file_id).first()
                    if file:
                        data_source_info = json.dumps(
                            {
                                "real_file_id": file_id,
                                "name": file.name,
                                "size": file.size,
                                "extension": file.extension,
                                "mime_type": file.mime_type,
                                "url": "",
                                "transfer_method": "local_file",
                            }
                        )
                        document.data_source_info = data_source_info
                        document_pipeline_execution_log = DocumentPipelineExecutionLog(
                            document_id=document.id,
                            pipeline_id=dataset.pipeline_id,
                            datasource_type="local_file",
                            datasource_info=data_source_info,
                            input_data={},
                            created_by=document.created_by,
                            datasource_node_id=file_node_id,
                        )
                        document_pipeline_execution_log.created_at = document.created_at
                        db.session.add(document)
                        db.session.add(document_pipeline_execution_log)
            elif document.data_source_type == "notion_import":
                document.data_source_type = "online_document"
                data_source_info = json.dumps(
                    {
                        "workspace_id": data_source_info_dict.get("notion_workspace_id"),
                        "page": {
                            "page_id": data_source_info_dict.get("notion_page_id"),
                            "page_name": document.name,
                            "page_icon": data_source_info_dict.get("notion_page_icon"),
                            "type": data_source_info_dict.get("type"),
                            "last_edited_time": data_source_info_dict.get("last_edited_time"),
                            "parent_id": None,
                        },
                    }
                )
                document.data_source_info = data_source_info
                document_pipeline_execution_log = DocumentPipelineExecutionLog(
                    document_id=document.id,
                    pipeline_id=dataset.pipeline_id,
                    datasource_type="online_document",
                    datasource_info=data_source_info,
                    input_data={},
                    created_by=document.created_by,
                    datasource_node_id=notion_node_id,
                )
                document_pipeline_execution_log.created_at = document.created_at
                db.session.add(document)
                db.session.add(document_pipeline_execution_log)
            elif document.data_source_type == "website_crawl":
                document.data_source_type = "website_crawl"
                data_source_info = json.dumps(
                    {
                        "source_url": data_source_info_dict.get("url"),
                        "content": "",
                        "title": document.name,
                        "description": "",
                    }
                )
                document.data_source_info = data_source_info
                if data_source_info_dict.get("provider") == "firecrawl":
                    datasource_node_id = firecrawl_node_id
                elif data_source_info_dict.get("provider") == "jinareader":
                    datasource_node_id = jina_node_id
                else:
                    continue
                document_pipeline_execution_log = DocumentPipelineExecutionLog(
                    document_id=document.id,
                    pipeline_id=dataset.pipeline_id,
                    datasource_type="website_crawl",
                    datasource_info=data_source_info,
                    input_data={},
                    created_by=document.created_by,
                    datasource_node_id=datasource_node_id,
                )
                document_pipeline_execution_log.created_at = document.created_at
                db.session.add(document)
                db.session.add(document_pipeline_execution_log)
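A hedged usage sketch for the transform service above. The module path is inferred from this commit's file layout, and an application/request context that provides current_user and a database session is assumed.

# Sketch only; module path assumed from the diff layout of this commit.
from services.rag_pipeline.rag_pipeline_transform_service import RagPipelineTransformService


def migrate_dataset_to_rag_pipeline(dataset_id: str) -> dict:
    # Returns {"pipeline_id", "dataset_id", "status"}; datasets already running in
    # rag_pipeline mode return early with their existing pipeline_id.
    return RagPipelineTransformService().transform_dataset(dataset_id)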
@@ -0,0 +1,709 @@
|
||||
dependencies:
|
||||
- current_identifier: null
|
||||
type: marketplace
|
||||
value:
|
||||
plugin_unique_identifier: langgenius/general_chunker:0.0.1@e3da408b7277866404c3f884d599261f9d0b9003ea4ef7eb3b64489bdf39d18b
|
||||
- current_identifier: null
|
||||
type: marketplace
|
||||
value:
|
||||
plugin_unique_identifier: langgenius/dify_extractor:0.0.1@50103421d4e002f059b662d21ad2d7a1cf34869abdbe320299d7e382516ebb1c
|
||||
kind: rag_pipeline
|
||||
rag_pipeline:
|
||||
description: ''
|
||||
icon: 📙
|
||||
icon_background: ''
|
||||
icon_type: emoji
|
||||
name: file-general-economy
|
||||
version: 0.1.0
|
||||
workflow:
|
||||
conversation_variables: []
|
||||
environment_variables: []
|
||||
features: {}
|
||||
graph:
|
||||
edges:
|
||||
- data:
|
||||
isInIteration: false
|
||||
isInLoop: false
|
||||
sourceType: datasource
|
||||
targetType: if-else
|
||||
id: 1752479895761-source-1752481129417-target
|
||||
source: '1752479895761'
|
||||
sourceHandle: source
|
||||
target: '1752481129417'
|
||||
targetHandle: target
|
||||
type: custom
|
||||
zIndex: 0
|
||||
- data:
|
||||
isInLoop: false
|
||||
sourceType: if-else
|
||||
targetType: tool
|
||||
id: 1752481129417-24e47cad-f1e2-4f74-9884-3f49d5bb37b7-1752480460682-target
|
||||
source: '1752481129417'
|
||||
sourceHandle: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
|
||||
target: '1752480460682'
|
||||
targetHandle: target
|
||||
type: custom
|
||||
zIndex: 0
|
||||
- data:
|
||||
isInLoop: false
|
||||
sourceType: if-else
|
||||
targetType: document-extractor
|
||||
id: 1752481129417-false-1752481112180-target
|
||||
source: '1752481129417'
|
||||
sourceHandle: 'false'
|
||||
target: '1752481112180'
|
||||
targetHandle: target
|
||||
type: custom
|
||||
zIndex: 0
|
||||
- data:
|
||||
isInIteration: false
|
||||
isInLoop: false
|
||||
sourceType: tool
|
||||
targetType: variable-aggregator
|
||||
id: 1752480460682-source-1752482022496-target
|
||||
source: '1752480460682'
|
||||
sourceHandle: source
|
||||
target: '1752482022496'
|
||||
targetHandle: target
|
||||
type: custom
|
||||
zIndex: 0
|
||||
- data:
|
||||
isInLoop: false
|
||||
sourceType: document-extractor
|
||||
targetType: variable-aggregator
|
||||
id: 1752481112180-source-1752482022496-target
|
||||
source: '1752481112180'
|
||||
sourceHandle: source
|
||||
target: '1752482022496'
|
||||
targetHandle: target
|
||||
type: custom
|
||||
zIndex: 0
|
||||
- data:
|
||||
isInIteration: false
|
||||
isInLoop: false
|
||||
sourceType: variable-aggregator
|
||||
targetType: tool
|
||||
id: 1752482022496-source-1752482151668-target
|
||||
source: '1752482022496'
|
||||
sourceHandle: source
|
||||
target: '1752482151668'
|
||||
targetHandle: target
|
||||
type: custom
|
||||
zIndex: 0
|
||||
- data:
|
||||
isInIteration: false
|
||||
isInLoop: false
|
||||
sourceType: tool
|
||||
targetType: knowledge-index
|
||||
id: 1752482151668-source-1752477924228-target
|
||||
source: '1752482151668'
|
||||
sourceHandle: source
|
||||
target: '1752477924228'
|
||||
targetHandle: target
|
||||
type: custom
|
||||
zIndex: 0
|
||||
nodes:
|
||||
- data:
|
||||
chunk_structure: text_model
|
||||
embedding_model: text-embedding-ada-002
|
||||
embedding_model_provider: langgenius/openai/openai
|
||||
index_chunk_variable_selector:
|
||||
- '1752482151668'
|
||||
- result
|
||||
indexing_technique: economy
|
||||
keyword_number: 10
|
||||
retrieval_model:
|
||||
score_threshold: 0.5
|
||||
score_threshold_enabled: false
|
||||
search_method: keyword_search
|
||||
top_k: 3
|
||||
vector_setting:
|
||||
embedding_model_name: text-embedding-ada-002
|
||||
embedding_provider_name: langgenius/openai/openai
|
||||
selected: true
|
||||
title: Knowledge Base
|
||||
type: knowledge-index
|
||||
height: 114
|
||||
id: '1752477924228'
|
||||
position:
|
||||
x: 1076.4656678451215
|
||||
y: 281.3910724383104
|
||||
positionAbsolute:
|
||||
x: 1076.4656678451215
|
||||
y: 281.3910724383104
|
||||
selected: true
|
||||
sourcePosition: right
|
||||
targetPosition: left
|
||||
type: custom
|
||||
width: 242
|
||||
- data:
|
||||
datasource_configurations: {}
|
||||
datasource_label: File
|
||||
datasource_name: upload-file
|
||||
datasource_parameters: {}
|
||||
fileExtensions:
|
||||
- txt
|
||||
- markdown
|
||||
- mdx
|
||||
- pdf
|
||||
- html
|
||||
- xlsx
|
||||
- xls
|
||||
- vtt
|
||||
- properties
|
||||
- doc
|
||||
- docx
|
||||
- csv
|
||||
- eml
|
||||
- msg
|
||||
- pptx
|
||||
- xml
|
||||
- epub
|
||||
- ppt
|
||||
- md
|
||||
plugin_id: langgenius/file
|
||||
provider_name: file
|
||||
provider_type: local_file
|
||||
selected: false
|
||||
title: File
|
||||
type: datasource
|
||||
height: 52
|
||||
id: '1752479895761'
|
||||
position:
|
||||
x: -839.8603427660498
|
||||
y: 251.3910724383104
|
||||
positionAbsolute:
|
||||
x: -839.8603427660498
|
||||
y: 251.3910724383104
|
||||
selected: false
|
||||
sourcePosition: right
|
||||
targetPosition: left
|
||||
type: custom
|
||||
width: 242
|
||||
- data:
|
||||
is_team_authorization: true
|
||||
output_schema:
|
||||
properties:
|
||||
documents:
|
||||
description: the documents extracted from the file
|
||||
items:
|
||||
type: object
|
||||
type: array
|
||||
images:
|
||||
description: The images extracted from the file
|
||||
items:
|
||||
type: object
|
||||
type: array
|
||||
type: object
|
||||
paramSchemas:
|
||||
- auto_generate: null
|
||||
default: null
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: the file to be parsed(support pdf, ppt, pptx, doc, docx, png, jpg,
|
||||
jpeg)
|
||||
ja_JP: 解析するファイル(pdf, ppt, pptx, doc, docx, png, jpg, jpegをサポート)
|
||||
pt_BR: o arquivo a ser analisado (suporta pdf, ppt, pptx, doc, docx, png,
|
||||
jpg, jpeg)
|
||||
zh_Hans: 用于解析的文件(支持 pdf, ppt, pptx, doc, docx, png, jpg, jpeg)
|
||||
label:
|
||||
en_US: file
|
||||
ja_JP: ファイル
|
||||
pt_BR: arquivo
|
||||
zh_Hans: file
|
||||
llm_description: the file to be parsed (support pdf, ppt, pptx, doc, docx,
|
||||
png, jpg, jpeg)
|
||||
max: null
|
||||
min: null
|
||||
name: file
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: true
|
||||
scope: null
|
||||
template: null
|
||||
type: file
|
||||
params:
|
||||
file: ''
|
||||
provider_id: langgenius/dify_extractor/dify_extractor
|
||||
provider_name: langgenius/dify_extractor/dify_extractor
|
||||
provider_type: builtin
|
||||
selected: false
|
||||
title: Dify Extractor
|
||||
tool_configurations: {}
|
||||
tool_description: Dify Extractor
|
||||
tool_label: Dify Extractor
|
||||
tool_name: dify_extractor
|
||||
tool_parameters:
|
||||
file:
|
||||
type: variable
|
||||
value:
|
||||
- '1752479895761'
|
||||
- file
|
||||
type: tool
|
||||
height: 52
|
||||
id: '1752480460682'
|
||||
position:
|
||||
x: -108.28652292656551
|
||||
y: 281.3910724383104
|
||||
positionAbsolute:
|
||||
x: -108.28652292656551
|
||||
y: 281.3910724383104
|
||||
selected: false
|
||||
sourcePosition: right
|
||||
targetPosition: left
|
||||
type: custom
|
||||
width: 242
|
||||
- data:
|
||||
is_array_file: false
|
||||
selected: false
|
||||
title: 文档提取器
|
||||
type: document-extractor
|
||||
variable_selector:
|
||||
- '1752479895761'
|
||||
- file
|
||||
height: 90
|
||||
id: '1752481112180'
|
||||
position:
|
||||
x: -108.28652292656551
|
||||
y: 390.6576481692478
|
||||
positionAbsolute:
|
||||
x: -108.28652292656551
|
||||
y: 390.6576481692478
|
||||
selected: false
|
||||
sourcePosition: right
|
||||
targetPosition: left
|
||||
type: custom
|
||||
width: 242
|
||||
- data:
|
||||
cases:
|
||||
- case_id: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
|
||||
conditions:
|
||||
- comparison_operator: is
|
||||
id: 9da88d93-3ff6-463f-abfd-6bcafbf2554d
|
||||
value: .xlsx
|
||||
varType: file
|
||||
variable_selector:
|
||||
- '1752479895761'
|
||||
- file
|
||||
- extension
|
||||
- comparison_operator: is
|
||||
id: d0e88f5e-dfe3-4bae-af0c-dbec267500de
|
||||
value: .xls
|
||||
varType: file
|
||||
variable_selector:
|
||||
- '1752479895761'
|
||||
- file
|
||||
- extension
|
||||
- comparison_operator: is
|
||||
id: a957e91e-1ed7-4c6b-9c80-2f0948858f1d
|
||||
value: .md
|
||||
varType: file
|
||||
variable_selector:
|
||||
- '1752479895761'
|
||||
- file
|
||||
- extension
|
||||
- comparison_operator: is
|
||||
id: 870c3c39-8d3f-474a-ab8b-9c0ccf53db73
|
||||
value: .markdown
|
||||
varType: file
|
||||
variable_selector:
|
||||
- '1752479895761'
|
||||
- file
|
||||
- extension
|
||||
- comparison_operator: is
|
||||
id: f9541513-1e71-4dc1-9db5-35dc84a39e3c
|
||||
value: .mdx
|
||||
varType: file
|
||||
variable_selector:
|
||||
- '1752479895761'
|
||||
- file
|
||||
- extension
|
||||
- comparison_operator: is
|
||||
id: 4c7f455b-ac20-40ca-9495-6cc44ffcb35d
|
||||
value: .html
|
||||
varType: file
|
||||
variable_selector:
|
||||
- '1752479895761'
|
||||
- file
|
||||
- extension
|
||||
- comparison_operator: is
|
||||
id: 2e12d9c7-8057-4a09-8851-f9fd1d0718d1
|
||||
value: .htm
|
||||
varType: file
|
||||
variable_selector:
|
||||
- '1752479895761'
|
||||
- file
|
||||
- extension
|
||||
- comparison_operator: is
|
||||
id: 73a995a9-d8b9-4aef-89f7-306e2ddcbce2
|
||||
value: .docx
|
||||
varType: file
|
||||
variable_selector:
|
||||
- '1752479895761'
|
||||
- file
|
||||
- extension
|
||||
- comparison_operator: is
|
||||
id: 8a2e8772-0426-458b-a1f9-9eaaec0f27c8
|
||||
value: .csv
|
||||
varType: file
|
||||
variable_selector:
|
||||
- '1752479895761'
|
||||
- file
|
||||
- extension
|
||||
- comparison_operator: is
|
||||
id: aa2cb6b6-a2fc-462a-a9f5-c9c3f33a1602
|
||||
value: .txt
|
||||
varType: file
|
||||
variable_selector:
|
||||
- '1752479895761'
|
||||
- file
|
||||
- extension
|
||||
id: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
|
||||
logical_operator: or
|
||||
selected: false
|
||||
title: IF/ELSE
|
||||
type: if-else
|
||||
height: 358
|
||||
id: '1752481129417'
|
||||
position:
|
||||
x: -489.57009543377865
|
||||
y: 251.3910724383104
|
||||
positionAbsolute:
|
||||
x: -489.57009543377865
|
||||
y: 251.3910724383104
|
||||
selected: false
|
||||
sourcePosition: right
|
||||
targetPosition: left
|
||||
type: custom
|
||||
width: 242
|
||||
- data:
|
||||
advanced_settings:
|
||||
group_enabled: false
|
||||
groups:
|
||||
- groupId: f4cf07b4-914d-4544-8ef8-0c5d9e4f21a7
|
||||
group_name: Group1
|
||||
output_type: string
|
||||
variables:
|
||||
- - '1752481112180'
|
||||
- text
|
||||
- - '1752480460682'
|
||||
- text
|
||||
output_type: string
|
||||
selected: false
|
||||
title: Variable Aggregator
|
||||
type: variable-aggregator
|
||||
variables:
|
||||
- - '1752481112180'
|
||||
- text
|
||||
- - '1752480460682'
|
||||
- text
|
||||
height: 129
|
||||
id: '1752482022496'
|
||||
position:
|
||||
x: 319.441649575055
|
||||
y: 281.3910724383104
|
||||
positionAbsolute:
|
||||
x: 319.441649575055
|
||||
y: 281.3910724383104
|
||||
selected: false
|
||||
sourcePosition: right
|
||||
targetPosition: left
|
||||
type: custom
|
||||
width: 242
|
||||
- data:
|
||||
is_team_authorization: true
|
||||
output_schema:
|
||||
properties:
|
||||
result:
|
||||
description: The result of the general chunk tool.
|
||||
properties:
|
||||
general_chunks:
|
||||
items:
|
||||
description: The chunk of the text.
|
||||
type: string
|
||||
type: array
|
||||
type: object
|
||||
type: object
|
||||
paramSchemas:
|
||||
- auto_generate: null
|
||||
default: null
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: The text you want to chunk.
|
||||
ja_JP: チャンク化したいテキスト。
|
||||
pt_BR: O texto que você deseja dividir.
|
||||
zh_Hans: 你想要分块的文本。
|
||||
label:
|
||||
en_US: Input Variable
|
||||
ja_JP: 入力変数
|
||||
pt_BR: Variável de entrada
|
||||
zh_Hans: 输入变量
|
||||
llm_description: The text you want to chunk.
|
||||
max: null
|
||||
min: null
|
||||
name: input_variable
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: true
|
||||
scope: null
|
||||
template: null
|
||||
type: string
|
||||
- auto_generate: null
|
||||
default: null
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: The delimiter of the chunks.
|
||||
ja_JP: チャンクの区切り記号。
|
||||
pt_BR: O delimitador dos blocos.
|
||||
zh_Hans: 块的分隔符。
|
||||
label:
|
||||
en_US: Delimiter
|
||||
ja_JP: 区切り記号
|
||||
pt_BR: Delimitador
|
||||
zh_Hans: 分隔符
|
||||
llm_description: The delimiter of the chunks, the format of the delimiter
|
||||
must be a string.
|
||||
max: null
|
||||
min: null
|
||||
name: delimiter
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: true
|
||||
scope: null
|
||||
template: null
|
||||
type: string
|
||||
- auto_generate: null
|
||||
default: null
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: The maximum chunk length.
|
||||
ja_JP: 最大長のチャンク。
|
||||
pt_BR: O comprimento máximo do bloco
|
||||
zh_Hans: 最大块的长度。
|
||||
label:
|
||||
en_US: Maximum Chunk Length
|
||||
ja_JP: チャンク最大長
|
||||
pt_BR: O comprimento máximo do bloco
|
||||
zh_Hans: 最大块的长度
|
||||
llm_description: The maximum chunk length, the format of the chunk size
|
||||
must be an integer.
|
||||
max: null
|
||||
min: null
|
||||
name: max_chunk_length
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: true
|
||||
scope: null
|
||||
template: null
|
||||
type: number
|
||||
- auto_generate: null
|
||||
default: null
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: The chunk overlap length.
|
||||
ja_JP: チャンクの重複長
|
||||
pt_BR: O comprimento de sobreposição dos fragmentos
|
||||
zh_Hans: 块的重叠长度。
|
||||
label:
|
||||
en_US: Chunk Overlap Length
|
||||
ja_JP: チャンク重複長
|
||||
pt_BR: Comprimento de sobreposição do bloco
|
||||
zh_Hans: 块的重叠长度
|
||||
llm_description: The chunk overlap length, the format of the chunk overlap
|
||||
length must be an integer.
|
||||
max: null
|
||||
min: null
|
||||
name: chunk_overlap_length
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: false
|
||||
scope: null
|
||||
template: null
|
||||
type: number
|
||||
- auto_generate: null
|
||||
default: null
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: Replace consecutive spaces, newlines and tabs
|
||||
ja_JP: 連続のスペース、改行、まだはタブを置換する
|
||||
pt_BR: Substituir espaços consecutivos, novas linhas e tabulações
|
||||
zh_Hans: 替换连续的空格、换行符和制表符
|
||||
label:
|
||||
en_US: Replace Consecutive Spaces, Newlines and Tabs
|
||||
ja_JP: 連続のスペース、改行、まだはタブを置換する
|
||||
pt_BR: Substituir espaços consecutivos, novas linhas e tabulações
|
||||
zh_Hans: 替换连续的空格、换行符和制表符
|
||||
llm_description: Replace consecutive spaces, newlines and tabs, the format
|
||||
of the replace must be a boolean.
|
||||
max: null
|
||||
min: null
|
||||
name: replace_consecutive_spaces_newlines_tabs
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: false
|
||||
scope: null
|
||||
template: null
|
||||
type: boolean
|
||||
- auto_generate: null
|
||||
default: null
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: Delete all URLs and email addresses
|
||||
ja_JP: すべてのURLとメールアドレスを削除する
|
||||
pt_BR: Excluir todos os URLs e endereços de e-mail
|
||||
zh_Hans: 删除所有URL和电子邮件地址
|
||||
label:
|
||||
en_US: Delete All URLs and Email Addresses
|
||||
ja_JP: すべてのURLとメールアドレスを削除する
|
||||
pt_BR: Excluir todos os URLs e endereços de e-mail
|
||||
zh_Hans: 删除所有URL和电子邮件地址
|
||||
llm_description: Delete all URLs and email addresses, the format of the
|
||||
delete must be a boolean.
|
||||
max: null
|
||||
min: null
|
||||
name: delete_all_urls_and_email_addresses
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: false
|
||||
scope: null
|
||||
template: null
|
||||
type: boolean
|
||||
params:
|
||||
chunk_overlap_length: ''
|
||||
delete_all_urls_and_email_addresses: ''
|
||||
delimiter: ''
|
||||
input_variable: ''
|
||||
max_chunk_length: ''
|
||||
replace_consecutive_spaces_newlines_tabs: ''
|
||||
provider_id: langgenius/general_chunker/general_chunker
|
||||
provider_name: langgenius/general_chunker/general_chunker
|
||||
provider_type: builtin
|
||||
selected: false
|
||||
title: General Chunker
|
||||
tool_configurations: {}
|
||||
tool_description: A tool for general text chunking mode, the chunks retrieved and recalled are the same.
|
||||
tool_label: General Chunker
|
||||
tool_name: general_chunker
|
||||
tool_parameters:
|
||||
chunk_overlap_length:
|
||||
type: variable
|
||||
value:
|
||||
- rag
|
||||
- shared
|
||||
- chunk_overlap
|
||||
delete_all_urls_and_email_addresses:
|
||||
type: mixed
|
||||
value: '{{#rag.shared.delete_urls_email#}}'
|
||||
delimiter:
|
||||
type: mixed
|
||||
value: '{{#rag.shared.delimiter#}}'
|
||||
input_variable:
|
||||
type: mixed
|
||||
value: '{{#1752482022496.output#}}'
|
||||
max_chunk_length:
|
||||
type: variable
|
||||
value:
|
||||
- rag
|
||||
- shared
|
||||
- max_chunk_length
|
||||
replace_consecutive_spaces_newlines_tabs:
|
||||
type: mixed
|
||||
value: '{{#rag.shared.replace_consecutive_spaces#}}'
|
||||
type: tool
|
||||
height: 52
|
||||
id: '1752482151668'
|
||||
position:
|
||||
x: 693.5300771507484
|
||||
y: 281.3910724383104
|
||||
positionAbsolute:
|
||||
x: 693.5300771507484
|
||||
y: 281.3910724383104
|
||||
selected: false
|
||||
sourcePosition: right
|
||||
targetPosition: left
|
||||
type: custom
|
||||
width: 242
|
||||
viewport:
|
||||
x: 701.4999626224237
|
||||
y: 128.33739021504016
|
||||
zoom: 0.48941689643726966
|
||||
rag_pipeline_variables:
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: shared
|
||||
default_value: \n\n
|
||||
label: Delimiter
|
||||
max_length: 100
|
||||
options: []
|
||||
placeholder: null
|
||||
required: true
|
||||
tooltips: A delimiter is the character used to separate text. \n\n is recommended
|
||||
for splitting the original document into large parent chunks. You can also use
|
||||
special delimiters defined by yourself.
|
||||
type: text-input
|
||||
unit: null
|
||||
variable: delimiter
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: shared
|
||||
default_value: null
|
||||
label: Maximum chunk length
|
||||
max_length: 48
|
||||
options: []
|
||||
placeholder: null
|
||||
required: true
|
||||
tooltips: null
|
||||
type: number
|
||||
unit: characters
|
||||
variable: max_chunk_length
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: shared
|
||||
default_value: null
|
||||
label: Chunk overlap
|
||||
max_length: 48
|
||||
options: []
|
||||
placeholder: null
|
||||
required: false
|
||||
tooltips: null
|
||||
type: number
|
||||
unit: characters
|
||||
variable: chunk_overlap
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: shared
|
||||
default_value: null
|
||||
label: Replace consecutive spaces, newlines and tabs
|
||||
max_length: 48
|
||||
options: []
|
||||
placeholder: null
|
||||
required: false
|
||||
tooltips: null
|
||||
type: checkbox
|
||||
unit: null
|
||||
variable: replace_consecutive_spaces
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: shared
|
||||
default_value: null
|
||||
label: Delete all URLs and email addresses
|
||||
max_length: 48
|
||||
options: []
|
||||
placeholder: null
|
||||
required: false
|
||||
tooltips: null
|
||||
type: checkbox
|
||||
unit: null
|
||||
variable: delete_urls_email
|
||||
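The YAML above is one of the transform templates read by _get_transform_yaml. A small, hedged sketch of how such a template can be loaded and its knowledge-index node located, mirroring the node walk in transform_dataset; the template directory argument is an assumption.

# Sketch: load a transform template the way the service does and find its knowledge-index node.
from pathlib import Path

import yaml


def load_knowledge_index_node(template_dir: Path, template_name: str = "file-general-economy.yml") -> dict:
    with open(template_dir / template_name) as f:
        pipeline_yaml = yaml.safe_load(f)
    nodes = pipeline_yaml.get("workflow", {}).get("graph", {}).get("nodes", [])
    # transform_dataset rewrites this node's embedding and retrieval settings per dataset.
    return next(node for node in nodes if node.get("data", {}).get("type") == "knowledge-index")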
@@ -0,0 +1,709 @@
|
||||
dependencies:
|
||||
- current_identifier: null
|
||||
type: marketplace
|
||||
value:
|
||||
plugin_unique_identifier: langgenius/general_chunker:0.0.1@e3da408b7277866404c3f884d599261f9d0b9003ea4ef7eb3b64489bdf39d18b
|
||||
- current_identifier: null
|
||||
type: marketplace
|
||||
value:
|
||||
plugin_unique_identifier: langgenius/dify_extractor:0.0.1@50103421d4e002f059b662d21ad2d7a1cf34869abdbe320299d7e382516ebb1c
|
||||
kind: rag_pipeline
|
||||
rag_pipeline:
|
||||
description: ''
|
||||
icon: 📙
|
||||
icon_background: '#FFF4ED'
|
||||
icon_type: emoji
|
||||
name: file-general-high-quality
|
||||
version: 0.1.0
|
||||
workflow:
|
||||
conversation_variables: []
|
||||
environment_variables: []
|
||||
features: {}
|
||||
graph:
|
||||
edges:
|
||||
- data:
|
||||
isInIteration: false
|
||||
isInLoop: false
|
||||
sourceType: datasource
|
||||
targetType: if-else
|
||||
id: 1752479895761-source-1752481129417-target
|
||||
source: '1752479895761'
|
||||
sourceHandle: source
|
||||
target: '1752481129417'
|
||||
targetHandle: target
|
||||
type: custom
|
||||
zIndex: 0
|
||||
- data:
|
||||
isInLoop: false
|
||||
sourceType: if-else
|
||||
targetType: tool
|
||||
id: 1752481129417-24e47cad-f1e2-4f74-9884-3f49d5bb37b7-1752480460682-target
|
||||
source: '1752481129417'
|
||||
sourceHandle: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
|
||||
target: '1752480460682'
|
||||
targetHandle: target
|
||||
type: custom
|
||||
zIndex: 0
|
||||
- data:
|
||||
isInLoop: false
|
||||
sourceType: if-else
|
||||
targetType: document-extractor
|
||||
id: 1752481129417-false-1752481112180-target
|
||||
source: '1752481129417'
|
||||
sourceHandle: 'false'
|
||||
target: '1752481112180'
|
||||
targetHandle: target
|
||||
type: custom
|
||||
zIndex: 0
|
||||
- data:
|
||||
isInIteration: false
|
||||
isInLoop: false
|
||||
sourceType: tool
|
||||
targetType: variable-aggregator
|
||||
id: 1752480460682-source-1752482022496-target
|
||||
source: '1752480460682'
|
||||
sourceHandle: source
|
||||
target: '1752482022496'
|
||||
targetHandle: target
|
||||
type: custom
|
||||
zIndex: 0
|
||||
- data:
|
||||
isInLoop: false
|
||||
sourceType: document-extractor
|
||||
targetType: variable-aggregator
|
||||
id: 1752481112180-source-1752482022496-target
|
||||
source: '1752481112180'
|
||||
sourceHandle: source
|
||||
target: '1752482022496'
|
||||
targetHandle: target
|
||||
type: custom
|
||||
zIndex: 0
|
||||
- data:
|
||||
isInIteration: false
|
||||
isInLoop: false
|
||||
sourceType: variable-aggregator
|
||||
targetType: tool
|
||||
id: 1752482022496-source-1752482151668-target
|
||||
source: '1752482022496'
|
||||
sourceHandle: source
|
||||
target: '1752482151668'
|
||||
targetHandle: target
|
||||
type: custom
|
||||
zIndex: 0
|
||||
- data:
|
||||
isInIteration: false
|
||||
isInLoop: false
|
||||
sourceType: tool
|
||||
targetType: knowledge-index
|
||||
id: 1752482151668-source-1752477924228-target
|
||||
source: '1752482151668'
|
||||
sourceHandle: source
|
||||
target: '1752477924228'
|
||||
targetHandle: target
|
||||
type: custom
|
||||
zIndex: 0
|
||||
nodes:
|
||||
- data:
|
||||
chunk_structure: text_model
|
||||
embedding_model: text-embedding-ada-002
|
||||
embedding_model_provider: langgenius/openai/openai
|
||||
index_chunk_variable_selector:
|
||||
- '1752482151668'
|
||||
- result
|
||||
indexing_technique: high_quality
|
||||
keyword_number: 10
|
||||
retrieval_model:
|
||||
score_threshold: 0.5
|
||||
score_threshold_enabled: false
|
||||
search_method: semantic_search
|
||||
top_k: 3
|
||||
vector_setting:
|
||||
embedding_model_name: text-embedding-ada-002
|
||||
embedding_provider_name: langgenius/openai/openai
|
||||
selected: false
|
||||
title: Knowledge Base
|
||||
type: knowledge-index
|
||||
height: 114
|
||||
id: '1752477924228'
|
||||
position:
|
||||
x: 1076.4656678451215
|
||||
y: 281.3910724383104
|
||||
positionAbsolute:
|
||||
x: 1076.4656678451215
|
||||
y: 281.3910724383104
|
||||
selected: true
|
||||
sourcePosition: right
|
||||
targetPosition: left
|
||||
type: custom
|
||||
width: 242
|
||||
- data:
|
||||
datasource_configurations: {}
|
||||
datasource_label: File
|
||||
datasource_name: upload-file
|
||||
datasource_parameters: {}
|
||||
fileExtensions:
|
||||
- txt
|
||||
- markdown
|
||||
- mdx
|
||||
- pdf
|
||||
- html
|
||||
- xlsx
|
||||
- xls
|
||||
- vtt
|
||||
- properties
|
||||
- doc
|
||||
- docx
|
||||
- csv
|
||||
- eml
|
||||
- msg
|
||||
- pptx
|
||||
- xml
|
||||
- epub
|
||||
- ppt
|
||||
- md
|
||||
plugin_id: langgenius/file
|
||||
provider_name: file
|
||||
provider_type: local_file
|
||||
selected: false
|
||||
title: File
|
||||
type: datasource
|
||||
height: 52
|
||||
id: '1752479895761'
|
||||
position:
|
||||
x: -839.8603427660498
|
||||
y: 251.3910724383104
|
||||
positionAbsolute:
|
||||
x: -839.8603427660498
|
||||
y: 251.3910724383104
|
||||
selected: false
|
||||
sourcePosition: right
|
||||
targetPosition: left
|
||||
type: custom
|
||||
width: 242
|
||||
- data:
|
||||
is_team_authorization: true
|
||||
output_schema:
|
||||
properties:
|
||||
documents:
|
||||
description: the documents extracted from the file
|
||||
items:
|
||||
type: object
|
||||
type: array
|
||||
images:
|
||||
description: The images extracted from the file
|
||||
items:
|
||||
type: object
|
||||
type: array
|
||||
type: object
|
||||
paramSchemas:
|
||||
- auto_generate: null
|
||||
default: null
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: the file to be parsed(support pdf, ppt, pptx, doc, docx, png, jpg,
|
||||
jpeg)
|
||||
ja_JP: 解析するファイル(pdf, ppt, pptx, doc, docx, png, jpg, jpegをサポート)
|
||||
pt_BR: o arquivo a ser analisado (suporta pdf, ppt, pptx, doc, docx, png,
|
||||
jpg, jpeg)
|
||||
zh_Hans: 用于解析的文件(支持 pdf, ppt, pptx, doc, docx, png, jpg, jpeg)
|
||||
label:
|
||||
en_US: file
|
||||
ja_JP: ファイル
|
||||
pt_BR: arquivo
|
||||
zh_Hans: file
|
||||
llm_description: the file to be parsed (support pdf, ppt, pptx, doc, docx,
|
||||
png, jpg, jpeg)
|
||||
max: null
|
||||
min: null
|
||||
name: file
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: true
|
||||
scope: null
|
||||
template: null
|
||||
type: file
|
||||
params:
|
||||
file: ''
|
||||
provider_id: langgenius/dify_extractor/dify_extractor
|
||||
provider_name: langgenius/dify_extractor/dify_extractor
|
||||
provider_type: builtin
|
||||
selected: false
|
||||
title: Dify Extractor
|
||||
tool_configurations: {}
|
||||
tool_description: Dify Extractor
|
||||
tool_label: Dify Extractor
|
||||
tool_name: dify_extractor
|
||||
tool_parameters:
|
||||
file:
|
||||
type: variable
|
||||
value:
|
||||
- '1752479895761'
|
||||
- file
|
||||
type: tool
|
||||
height: 52
|
||||
id: '1752480460682'
|
||||
position:
|
||||
x: -108.28652292656551
|
||||
y: 281.3910724383104
|
||||
positionAbsolute:
|
||||
x: -108.28652292656551
|
||||
y: 281.3910724383104
|
||||
selected: false
|
||||
sourcePosition: right
|
||||
targetPosition: left
|
||||
type: custom
|
||||
width: 242
|
||||
- data:
|
||||
is_array_file: false
|
||||
selected: false
|
||||
title: 文档提取器
|
||||
type: document-extractor
|
||||
variable_selector:
|
||||
- '1752479895761'
|
||||
- file
|
||||
height: 90
|
||||
id: '1752481112180'
|
||||
position:
|
||||
x: -108.28652292656551
|
||||
y: 390.6576481692478
|
||||
positionAbsolute:
|
||||
x: -108.28652292656551
|
||||
y: 390.6576481692478
|
||||
selected: false
|
||||
sourcePosition: right
|
||||
targetPosition: left
|
||||
type: custom
|
||||
width: 242
|
||||
- data:
|
||||
cases:
|
||||
- case_id: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
|
||||
conditions:
|
||||
- comparison_operator: is
|
||||
id: 9da88d93-3ff6-463f-abfd-6bcafbf2554d
|
||||
value: .xlsx
|
||||
varType: file
|
||||
variable_selector:
|
||||
- '1752479895761'
|
||||
- file
|
||||
- extension
|
||||
- comparison_operator: is
|
||||
id: d0e88f5e-dfe3-4bae-af0c-dbec267500de
|
||||
value: .xls
|
||||
varType: file
|
||||
variable_selector:
|
||||
- '1752479895761'
|
||||
- file
|
||||
- extension
|
||||
- comparison_operator: is
|
||||
id: a957e91e-1ed7-4c6b-9c80-2f0948858f1d
|
||||
value: .md
|
||||
varType: file
|
||||
variable_selector:
|
||||
- '1752479895761'
|
||||
- file
|
||||
- extension
|
||||
- comparison_operator: is
|
||||
id: 870c3c39-8d3f-474a-ab8b-9c0ccf53db73
|
||||
value: .markdown
|
||||
varType: file
|
||||
variable_selector:
|
||||
- '1752479895761'
|
||||
- file
|
||||
- extension
|
||||
- comparison_operator: is
|
||||
id: f9541513-1e71-4dc1-9db5-35dc84a39e3c
|
||||
value: .mdx
|
||||
varType: file
|
||||
variable_selector:
|
||||
- '1752479895761'
|
||||
- file
|
||||
- extension
|
||||
- comparison_operator: is
|
||||
id: 4c7f455b-ac20-40ca-9495-6cc44ffcb35d
|
||||
value: .html
|
||||
varType: file
|
||||
variable_selector:
|
||||
- '1752479895761'
|
||||
- file
|
||||
- extension
|
||||
- comparison_operator: is
|
||||
id: 2e12d9c7-8057-4a09-8851-f9fd1d0718d1
|
||||
value: .htm
|
||||
varType: file
|
||||
variable_selector:
|
||||
- '1752479895761'
|
||||
- file
|
||||
- extension
|
||||
- comparison_operator: is
|
||||
id: 73a995a9-d8b9-4aef-89f7-306e2ddcbce2
|
||||
value: .docx
|
||||
varType: file
|
||||
variable_selector:
|
||||
- '1752479895761'
|
||||
- file
|
||||
- extension
|
||||
- comparison_operator: is
|
||||
id: 8a2e8772-0426-458b-a1f9-9eaaec0f27c8
|
||||
value: .csv
|
||||
varType: file
|
||||
variable_selector:
|
||||
- '1752479895761'
|
||||
- file
|
||||
- extension
|
||||
- comparison_operator: is
|
||||
id: aa2cb6b6-a2fc-462a-a9f5-c9c3f33a1602
|
||||
value: .txt
|
||||
varType: file
|
||||
variable_selector:
|
||||
- '1752479895761'
|
||||
- file
|
||||
- extension
|
||||
id: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
|
||||
logical_operator: or
|
||||
selected: false
|
||||
title: IF/ELSE
|
||||
type: if-else
|
||||
height: 358
|
||||
id: '1752481129417'
|
||||
position:
|
||||
x: -489.57009543377865
|
||||
y: 251.3910724383104
|
||||
positionAbsolute:
|
||||
x: -489.57009543377865
|
||||
y: 251.3910724383104
|
||||
selected: false
|
||||
sourcePosition: right
|
||||
targetPosition: left
|
||||
type: custom
|
||||
width: 242
|
||||
- data:
|
||||
advanced_settings:
|
||||
group_enabled: false
|
||||
groups:
|
||||
- groupId: f4cf07b4-914d-4544-8ef8-0c5d9e4f21a7
|
||||
group_name: Group1
|
||||
output_type: string
|
||||
variables:
|
||||
- - '1752481112180'
|
||||
- text
|
||||
- - '1752480460682'
|
||||
- text
|
||||
output_type: string
|
||||
selected: false
|
||||
title: Variable Aggregator
|
||||
type: variable-aggregator
|
||||
variables:
|
||||
- - '1752481112180'
|
||||
- text
|
||||
- - '1752480460682'
|
||||
- text
|
||||
height: 129
|
||||
id: '1752482022496'
|
||||
position:
|
||||
x: 319.441649575055
|
||||
y: 281.3910724383104
|
||||
positionAbsolute:
|
||||
x: 319.441649575055
|
||||
y: 281.3910724383104
|
||||
selected: false
|
||||
sourcePosition: right
|
||||
targetPosition: left
|
||||
type: custom
|
||||
width: 242
|
||||
- data:
|
||||
is_team_authorization: true
|
||||
output_schema:
|
||||
properties:
|
||||
result:
|
||||
description: The result of the general chunk tool.
|
||||
properties:
|
||||
general_chunks:
|
||||
items:
|
||||
description: The chunk of the text.
|
||||
type: string
|
||||
type: array
|
||||
type: object
|
||||
type: object
|
||||
paramSchemas:
|
||||
- auto_generate: null
|
||||
default: null
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: The text you want to chunk.
|
||||
ja_JP: チャンク化したいテキスト。
|
||||
pt_BR: O texto que você deseja dividir.
|
||||
zh_Hans: 你想要分块的文本。
|
||||
label:
|
||||
en_US: Input Variable
|
||||
ja_JP: 入力変数
|
||||
pt_BR: Variável de entrada
|
||||
zh_Hans: 输入变量
|
||||
llm_description: The text you want to chunk.
|
||||
max: null
|
||||
min: null
|
||||
name: input_variable
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: true
|
||||
scope: null
|
||||
template: null
|
||||
type: string
|
||||
- auto_generate: null
|
||||
default: null
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: The delimiter of the chunks.
|
||||
ja_JP: チャンクの区切り記号。
|
||||
pt_BR: O delimitador dos pedaços.
|
||||
zh_Hans: 块的分隔符。
|
||||
label:
|
||||
en_US: Delimiter
|
||||
ja_JP: 区切り記号
|
||||
pt_BR: Delimitador
|
||||
zh_Hans: 分隔符
|
||||
llm_description: The delimiter of the chunks, the format of the delimiter
|
||||
must be a string.
|
||||
max: null
|
||||
min: null
|
||||
name: delimiter
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: true
|
||||
scope: null
|
||||
template: null
|
||||
type: string
|
||||
- auto_generate: null
|
||||
default: null
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: The maximum chunk length.
|
||||
ja_JP: 最大長のチャンク。
|
||||
pt_BR: O comprimento máximo do bloco
|
||||
zh_Hans: 最大块的长度。
|
||||
label:
|
||||
en_US: Maximum Chunk Length
|
||||
ja_JP: チャンク最大長
|
||||
pt_BR: O comprimento máximo do bloco
|
||||
zh_Hans: 最大块的长度
|
||||
llm_description: The maximum chunk length, the format of the chunk size
|
||||
must be an integer.
|
||||
max: null
|
||||
min: null
|
||||
name: max_chunk_length
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: true
|
||||
scope: null
|
||||
template: null
|
||||
type: number
|
||||
- auto_generate: null
|
||||
default: null
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: The chunk overlap length.
|
||||
ja_JP: チャンクの重複長
|
||||
pt_BR: The chunk overlap length.
|
||||
zh_Hans: 块的重叠长度。
|
||||
label:
|
||||
en_US: Chunk Overlap Length
|
||||
ja_JP: チャンク重複長
|
||||
pt_BR: Chunk Overlap Length
|
||||
zh_Hans: 块的重叠长度
|
||||
llm_description: The chunk overlap length, the format of the chunk overlap
|
||||
length must be an integer.
|
||||
max: null
|
||||
min: null
|
||||
name: chunk_overlap_length
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: false
|
||||
scope: null
|
||||
template: null
|
||||
type: number
|
||||
- auto_generate: null
|
||||
default: null
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: Replace consecutive spaces, newlines and tabs
|
||||
ja_JP: 連続のスペース、改行、まだはタブを置換する
|
||||
pt_BR: Replace consecutive spaces, newlines and tabs
|
||||
zh_Hans: 替换连续的空格、换行符和制表符
|
||||
label:
|
||||
en_US: Replace Consecutive Spaces, Newlines and Tabs
|
||||
ja_JP: 連続のスペース、改行、まだはタブを置換する
|
||||
pt_BR: Replace Consecutive Spaces, Newlines and Tabs
|
||||
zh_Hans: 替换连续的空格、换行符和制表符
|
||||
llm_description: Replace consecutive spaces, newlines and tabs, the format
|
||||
of the replace must be a boolean.
|
||||
max: null
|
||||
min: null
|
||||
name: replace_consecutive_spaces_newlines_tabs
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: false
|
||||
scope: null
|
||||
template: null
|
||||
type: boolean
|
||||
- auto_generate: null
|
||||
default: null
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: Delete all URLs and email addresses
|
||||
ja_JP: すべてのURLとメールアドレスを削除する
|
||||
pt_BR: Delete all URLs and email addresses
|
||||
zh_Hans: 删除所有URL和电子邮件地址
|
||||
label:
|
||||
en_US: Delete All URLs and Email Addresses
|
||||
ja_JP: すべてのURLとメールアドレスを削除する
|
||||
pt_BR: Delete All URLs and Email Addresses
|
||||
zh_Hans: 删除所有URL和电子邮件地址
|
||||
llm_description: Delete all URLs and email addresses, the format of the
|
||||
delete must be a boolean.
|
||||
max: null
|
||||
min: null
|
||||
name: delete_all_urls_and_email_addresses
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: false
|
||||
scope: null
|
||||
template: null
|
||||
type: boolean
|
||||
params:
|
||||
chunk_overlap_length: ''
|
||||
delete_all_urls_and_email_addresses: ''
|
||||
delimiter: ''
|
||||
input_variable: ''
|
||||
max_chunk_length: ''
|
||||
replace_consecutive_spaces_newlines_tabs: ''
|
||||
provider_id: langgenius/general_chunker/general_chunker
|
||||
provider_name: langgenius/general_chunker/general_chunker
|
||||
provider_type: builtin
|
||||
selected: false
|
||||
title: General Chunker
|
||||
tool_configurations: {}
|
||||
tool_description: A tool for general text chunking mode, the chunks retrieved and recalled are the same.
|
||||
tool_label: General Chunker
|
||||
tool_name: general_chunker
|
||||
tool_parameters:
|
||||
chunk_overlap_length:
|
||||
type: variable
|
||||
value:
|
||||
- rag
|
||||
- shared
|
||||
- chunk_overlap
|
||||
delete_all_urls_and_email_addresses:
|
||||
type: mixed
|
||||
value: '{{#rag.shared.delete_urls_email#}}'
|
||||
delimiter:
|
||||
type: mixed
|
||||
value: '{{#rag.shared.delimiter#}}'
|
||||
input_variable:
|
||||
type: mixed
|
||||
value: '{{#1752482022496.output#}}'
|
||||
max_chunk_length:
|
||||
type: variable
|
||||
value:
|
||||
- rag
|
||||
- shared
|
||||
- max_chunk_length
|
||||
replace_consecutive_spaces_newlines_tabs:
|
||||
type: mixed
|
||||
value: '{{#rag.shared.replace_consecutive_spaces#}}'
|
||||
type: tool
|
||||
height: 52
|
||||
id: '1752482151668'
|
||||
position:
|
||||
x: 693.5300771507484
|
||||
y: 281.3910724383104
|
||||
positionAbsolute:
|
||||
x: 693.5300771507484
|
||||
y: 281.3910724383104
|
||||
selected: false
|
||||
sourcePosition: right
|
||||
targetPosition: left
|
||||
type: custom
|
||||
width: 242
|
||||
viewport:
|
||||
x: 701.4999626224237
|
||||
y: 128.33739021504016
|
||||
zoom: 0.48941689643726966
|
||||
rag_pipeline_variables:
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: shared
|
||||
default_value: \n\n
|
||||
label: Delimiter
|
||||
max_length: 100
|
||||
options: []
|
||||
placeholder: null
|
||||
required: true
|
||||
tooltips: A delimiter is the character used to separate text. \n\n is recommended
|
||||
for splitting the original document into large parent chunks. You can also use
|
||||
special delimiters defined by yourself.
|
||||
type: text-input
|
||||
unit: null
|
||||
variable: delimiter
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: shared
|
||||
default_value: null
|
||||
label: Maximum chunk length
|
||||
max_length: 48
|
||||
options: []
|
||||
placeholder: null
|
||||
required: true
|
||||
tooltips: null
|
||||
type: number
|
||||
unit: characters
|
||||
variable: max_chunk_length
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: shared
|
||||
default_value: null
|
||||
label: Chunk overlap
|
||||
max_length: 48
|
||||
options: []
|
||||
placeholder: null
|
||||
required: false
|
||||
tooltips: null
|
||||
type: number
|
||||
unit: characters
|
||||
variable: chunk_overlap
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: shared
|
||||
default_value: null
|
||||
label: Replace consecutive spaces, newlines and tabs
|
||||
max_length: 48
|
||||
options: []
|
||||
placeholder: null
|
||||
required: false
|
||||
tooltips: null
|
||||
type: checkbox
|
||||
unit: null
|
||||
variable: replace_consecutive_spaces
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: shared
|
||||
default_value: null
|
||||
label: Delete all URLs and email addresses
|
||||
max_length: 48
|
||||
options: []
|
||||
placeholder: null
|
||||
required: false
|
||||
tooltips: null
|
||||
type: checkbox
|
||||
unit: null
|
||||
variable: delete_urls_email
|
||||
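In all of these templates, tool_parameters entries of type mixed pull their values from the shared inputs declared under rag_pipeline_variables through {{#rag.shared.<variable>#}} references. Below is a minimal, hypothetical sketch of resolving such a reference; resolve_shared_refs and shared_inputs are illustrative names and not part of this commit.

import re
from typing import Any

# Matches tokens of the form {{#rag.shared.<name>#}} as used in the templates above.
_SHARED_REF = re.compile(r"\{\{#rag\.shared\.([A-Za-z0-9_]+)#\}\}")


def resolve_shared_refs(template: str, shared_inputs: dict[str, Any]) -> str:
    # Substitute every {{#rag.shared.<name>#}} token with the matching shared input value.
    return _SHARED_REF.sub(lambda m: str(shared_inputs.get(m.group(1), "")), template)


# Example: resolving the General Chunker's delimiter parameter from the shared inputs.
shared_inputs = {"delimiter": "\n\n", "replace_consecutive_spaces": True, "delete_urls_email": False}
print(resolve_shared_refs("{{#rag.shared.delimiter#}}", shared_inputs))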
814
dify/api/services/rag_pipeline/transform/file-parentchild.yml
Normal file
@@ -0,0 +1,814 @@
|
||||
dependencies:
|
||||
- current_identifier: null
|
||||
type: marketplace
|
||||
value:
|
||||
plugin_unique_identifier: langgenius/parentchild_chunker:0.0.1@b1a28a27e33fec442ce494da2a7814edd7eb9d646c81f38bccfcf1133d486e40
|
||||
- current_identifier: null
|
||||
type: marketplace
|
||||
value:
|
||||
plugin_unique_identifier: langgenius/dify_extractor:0.0.1@50103421d4e002f059b662d21ad2d7a1cf34869abdbe320299d7e382516ebb1c
|
||||
kind: rag_pipeline
|
||||
rag_pipeline:
|
||||
description: ''
|
||||
icon: 📙
|
||||
icon_background: '#FFF4ED'
|
||||
icon_type: emoji
|
||||
name: file-parentchild
|
||||
version: 0.1.0
|
||||
workflow:
|
||||
conversation_variables: []
|
||||
environment_variables: []
|
||||
features: {}
|
||||
graph:
|
||||
edges:
|
||||
- data:
|
||||
isInIteration: false
|
||||
isInLoop: false
|
||||
sourceType: datasource
|
||||
targetType: if-else
|
||||
id: 1752479895761-source-1752481129417-target
|
||||
source: '1752479895761'
|
||||
sourceHandle: source
|
||||
target: '1752481129417'
|
||||
targetHandle: target
|
||||
type: custom
|
||||
zIndex: 0
|
||||
- data:
|
||||
isInLoop: false
|
||||
sourceType: if-else
|
||||
targetType: tool
|
||||
id: 1752481129417-24e47cad-f1e2-4f74-9884-3f49d5bb37b7-1752480460682-target
|
||||
source: '1752481129417'
|
||||
sourceHandle: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
|
||||
target: '1752480460682'
|
||||
targetHandle: target
|
||||
type: custom
|
||||
zIndex: 0
|
||||
- data:
|
||||
isInLoop: false
|
||||
sourceType: if-else
|
||||
targetType: document-extractor
|
||||
id: 1752481129417-false-1752481112180-target
|
||||
source: '1752481129417'
|
||||
sourceHandle: 'false'
|
||||
target: '1752481112180'
|
||||
targetHandle: target
|
||||
type: custom
|
||||
zIndex: 0
|
||||
- data:
|
||||
isInIteration: false
|
||||
isInLoop: false
|
||||
sourceType: tool
|
||||
targetType: variable-aggregator
|
||||
id: 1752480460682-source-1752482022496-target
|
||||
source: '1752480460682'
|
||||
sourceHandle: source
|
||||
target: '1752482022496'
|
||||
targetHandle: target
|
||||
type: custom
|
||||
zIndex: 0
|
||||
- data:
|
||||
isInLoop: false
|
||||
sourceType: document-extractor
|
||||
targetType: variable-aggregator
|
||||
id: 1752481112180-source-1752482022496-target
|
||||
source: '1752481112180'
|
||||
sourceHandle: source
|
||||
target: '1752482022496'
|
||||
targetHandle: target
|
||||
type: custom
|
||||
zIndex: 0
|
||||
- data:
|
||||
isInIteration: false
|
||||
isInLoop: false
|
||||
sourceType: variable-aggregator
|
||||
targetType: tool
|
||||
id: 1752482022496-source-1752575473519-target
|
||||
source: '1752482022496'
|
||||
sourceHandle: source
|
||||
target: '1752575473519'
|
||||
targetHandle: target
|
||||
type: custom
|
||||
zIndex: 0
|
||||
- data:
|
||||
isInLoop: false
|
||||
sourceType: tool
|
||||
targetType: knowledge-index
|
||||
id: 1752575473519-source-1752477924228-target
|
||||
source: '1752575473519'
|
||||
sourceHandle: source
|
||||
target: '1752477924228'
|
||||
targetHandle: target
|
||||
type: custom
|
||||
zIndex: 0
|
||||
nodes:
|
||||
- data:
|
||||
chunk_structure: hierarchical_model
|
||||
embedding_model: text-embedding-ada-002
|
||||
embedding_model_provider: langgenius/openai/openai
|
||||
index_chunk_variable_selector:
|
||||
- '1752575473519'
|
||||
- result
|
||||
indexing_technique: high_quality
|
||||
keyword_number: 10
|
||||
retrieval_model:
|
||||
score_threshold: 0.5
|
||||
score_threshold_enabled: false
|
||||
search_method: semantic_search
|
||||
top_k: 3
|
||||
vector_setting:
|
||||
embedding_model_name: text-embedding-ada-002
|
||||
embedding_provider_name: langgenius/openai/openai
|
||||
selected: false
|
||||
title: Knowledge Base
|
||||
type: knowledge-index
|
||||
height: 114
|
||||
id: '1752477924228'
|
||||
position:
|
||||
x: 994.3774545394483
|
||||
y: 281.3910724383104
|
||||
positionAbsolute:
|
||||
x: 994.3774545394483
|
||||
y: 281.3910724383104
|
||||
selected: false
|
||||
sourcePosition: right
|
||||
targetPosition: left
|
||||
type: custom
|
||||
width: 242
|
||||
- data:
|
||||
datasource_configurations: {}
|
||||
datasource_label: File
|
||||
datasource_name: upload-file
|
||||
datasource_parameters: {}
|
||||
fileExtensions:
|
||||
- txt
|
||||
- markdown
|
||||
- mdx
|
||||
- pdf
|
||||
- html
|
||||
- xlsx
|
||||
- xls
|
||||
- vtt
|
||||
- properties
|
||||
- doc
|
||||
- docx
|
||||
- csv
|
||||
- eml
|
||||
- msg
|
||||
- pptx
|
||||
- xml
|
||||
- epub
|
||||
- ppt
|
||||
- md
|
||||
plugin_id: langgenius/file
|
||||
provider_name: file
|
||||
provider_type: local_file
|
||||
selected: false
|
||||
title: File
|
||||
type: datasource
|
||||
height: 52
|
||||
id: '1752479895761'
|
||||
position:
|
||||
x: -839.8603427660498
|
||||
y: 251.3910724383104
|
||||
positionAbsolute:
|
||||
x: -839.8603427660498
|
||||
y: 251.3910724383104
|
||||
selected: false
|
||||
sourcePosition: right
|
||||
targetPosition: left
|
||||
type: custom
|
||||
width: 242
|
||||
- data:
|
||||
is_team_authorization: true
|
||||
output_schema:
|
||||
properties:
|
||||
documents:
|
||||
description: the documents extracted from the file
|
||||
items:
|
||||
type: object
|
||||
type: array
|
||||
images:
|
||||
description: The images extracted from the file
|
||||
items:
|
||||
type: object
|
||||
type: array
|
||||
type: object
|
||||
paramSchemas:
|
||||
- auto_generate: null
|
||||
default: null
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: the file to be parsed(support pdf, ppt, pptx, doc, docx, png, jpg,
|
||||
jpeg)
|
||||
ja_JP: 解析するファイル(pdf, ppt, pptx, doc, docx, png, jpg, jpegをサポート)
|
||||
pt_BR: o arquivo a ser analisado (suporta pdf, ppt, pptx, doc, docx, png,
|
||||
jpg, jpeg)
|
||||
zh_Hans: 用于解析的文件(支持 pdf, ppt, pptx, doc, docx, png, jpg, jpeg)
|
||||
label:
|
||||
en_US: file
|
||||
ja_JP: ファイル
|
||||
pt_BR: arquivo
|
||||
zh_Hans: file
|
||||
llm_description: the file to be parsed (support pdf, ppt, pptx, doc, docx,
|
||||
png, jpg, jpeg)
|
||||
max: null
|
||||
min: null
|
||||
name: file
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: true
|
||||
scope: null
|
||||
template: null
|
||||
type: file
|
||||
params:
|
||||
file: ''
|
||||
provider_id: langgenius/dify_extractor/dify_extractor
|
||||
provider_name: langgenius/dify_extractor/dify_extractor
|
||||
provider_type: builtin
|
||||
selected: false
|
||||
title: Dify Extractor
|
||||
tool_configurations: {}
|
||||
tool_description: Dify Extractor
|
||||
tool_label: Dify Extractor
|
||||
tool_name: dify_extractor
|
||||
tool_parameters:
|
||||
file:
|
||||
type: variable
|
||||
value:
|
||||
- '1752479895761'
|
||||
- file
|
||||
type: tool
|
||||
height: 52
|
||||
id: '1752480460682'
|
||||
position:
|
||||
x: -108.28652292656551
|
||||
y: 281.3910724383104
|
||||
positionAbsolute:
|
||||
x: -108.28652292656551
|
||||
y: 281.3910724383104
|
||||
selected: false
|
||||
sourcePosition: right
|
||||
targetPosition: left
|
||||
type: custom
|
||||
width: 242
|
||||
- data:
|
||||
is_array_file: false
|
||||
selected: false
|
||||
title: Document Extractor
|
||||
type: document-extractor
|
||||
variable_selector:
|
||||
- '1752479895761'
|
||||
- file
|
||||
height: 90
|
||||
id: '1752481112180'
|
||||
position:
|
||||
x: -108.28652292656551
|
||||
y: 390.6576481692478
|
||||
positionAbsolute:
|
||||
x: -108.28652292656551
|
||||
y: 390.6576481692478
|
||||
selected: false
|
||||
sourcePosition: right
|
||||
targetPosition: left
|
||||
type: custom
|
||||
width: 242
|
||||
- data:
|
||||
cases:
|
||||
- case_id: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
|
||||
conditions:
|
||||
- comparison_operator: is
|
||||
id: 9da88d93-3ff6-463f-abfd-6bcafbf2554d
|
||||
value: .xlsx
|
||||
varType: file
|
||||
variable_selector:
|
||||
- '1752479895761'
|
||||
- file
|
||||
- extension
|
||||
- comparison_operator: is
|
||||
id: d0e88f5e-dfe3-4bae-af0c-dbec267500de
|
||||
value: .xls
|
||||
varType: file
|
||||
variable_selector:
|
||||
- '1752479895761'
|
||||
- file
|
||||
- extension
|
||||
- comparison_operator: is
|
||||
id: a957e91e-1ed7-4c6b-9c80-2f0948858f1d
|
||||
value: .md
|
||||
varType: file
|
||||
variable_selector:
|
||||
- '1752479895761'
|
||||
- file
|
||||
- extension
|
||||
- comparison_operator: is
|
||||
id: 870c3c39-8d3f-474a-ab8b-9c0ccf53db73
|
||||
value: .markdown
|
||||
varType: file
|
||||
variable_selector:
|
||||
- '1752479895761'
|
||||
- file
|
||||
- extension
|
||||
- comparison_operator: is
|
||||
id: f9541513-1e71-4dc1-9db5-35dc84a39e3c
|
||||
value: .mdx
|
||||
varType: file
|
||||
variable_selector:
|
||||
- '1752479895761'
|
||||
- file
|
||||
- extension
|
||||
- comparison_operator: is
|
||||
id: 4c7f455b-ac20-40ca-9495-6cc44ffcb35d
|
||||
value: .html
|
||||
varType: file
|
||||
variable_selector:
|
||||
- '1752479895761'
|
||||
- file
|
||||
- extension
|
||||
- comparison_operator: is
|
||||
id: 2e12d9c7-8057-4a09-8851-f9fd1d0718d1
|
||||
value: .htm
|
||||
varType: file
|
||||
variable_selector:
|
||||
- '1752479895761'
|
||||
- file
|
||||
- extension
|
||||
- comparison_operator: is
|
||||
id: 73a995a9-d8b9-4aef-89f7-306e2ddcbce2
|
||||
value: .docx
|
||||
varType: file
|
||||
variable_selector:
|
||||
- '1752479895761'
|
||||
- file
|
||||
- extension
|
||||
- comparison_operator: is
|
||||
id: 8a2e8772-0426-458b-a1f9-9eaaec0f27c8
|
||||
value: .csv
|
||||
varType: file
|
||||
variable_selector:
|
||||
- '1752479895761'
|
||||
- file
|
||||
- extension
|
||||
- comparison_operator: is
|
||||
id: aa2cb6b6-a2fc-462a-a9f5-c9c3f33a1602
|
||||
value: .txt
|
||||
varType: file
|
||||
variable_selector:
|
||||
- '1752479895761'
|
||||
- file
|
||||
- extension
|
||||
id: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
|
||||
logical_operator: or
|
||||
selected: false
|
||||
title: IF/ELSE
|
||||
type: if-else
|
||||
height: 358
|
||||
id: '1752481129417'
|
||||
position:
|
||||
x: -512.2335487893622
|
||||
y: 251.3910724383104
|
||||
positionAbsolute:
|
||||
x: -512.2335487893622
|
||||
y: 251.3910724383104
|
||||
selected: false
|
||||
sourcePosition: right
|
||||
targetPosition: left
|
||||
type: custom
|
||||
width: 242
|
||||
- data:
|
||||
advanced_settings:
|
||||
group_enabled: false
|
||||
groups:
|
||||
- groupId: f4cf07b4-914d-4544-8ef8-0c5d9e4f21a7
|
||||
group_name: Group1
|
||||
output_type: string
|
||||
variables:
|
||||
- - '1752481112180'
|
||||
- text
|
||||
- - '1752480460682'
|
||||
- text
|
||||
output_type: string
|
||||
selected: false
|
||||
title: Variable Aggregator
|
||||
type: variable-aggregator
|
||||
variables:
|
||||
- - '1752481112180'
|
||||
- text
|
||||
- - '1752480460682'
|
||||
- text
|
||||
height: 129
|
||||
id: '1752482022496'
|
||||
position:
|
||||
x: 319.441649575055
|
||||
y: 281.3910724383104
|
||||
positionAbsolute:
|
||||
x: 319.441649575055
|
||||
y: 281.3910724383104
|
||||
selected: false
|
||||
sourcePosition: right
|
||||
targetPosition: left
|
||||
type: custom
|
||||
width: 242
|
||||
- data:
|
||||
is_team_authorization: true
|
||||
output_schema:
|
||||
properties:
|
||||
result:
|
||||
description: Parent child chunks result
|
||||
items:
|
||||
type: object
|
||||
type: array
|
||||
type: object
|
||||
paramSchemas:
|
||||
- auto_generate: null
|
||||
default: null
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: The text you want to chunk.
|
||||
ja_JP: チャンク化したいテキスト。
|
||||
pt_BR: O texto que você deseja dividir.
|
||||
zh_Hans: 你想要分块的文本。
|
||||
label:
|
||||
en_US: Input text
|
||||
ja_JP: 入力テキスト
|
||||
pt_BR: Texto de entrada
|
||||
zh_Hans: 输入文本
|
||||
llm_description: The text you want to chunk.
|
||||
max: null
|
||||
min: null
|
||||
name: input_text
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: true
|
||||
scope: null
|
||||
template: null
|
||||
type: string
|
||||
- auto_generate: null
|
||||
default: 1024
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: Maximum length for chunking
|
||||
ja_JP: チャンク分割の最大長
|
||||
pt_BR: Comprimento máximo para divisão
|
||||
zh_Hans: 用于分块的最大长度
|
||||
label:
|
||||
en_US: Maximum Length
|
||||
ja_JP: 最大長
|
||||
pt_BR: Comprimento Máximo
|
||||
zh_Hans: 最大长度
|
||||
llm_description: Maximum length allowed per chunk
|
||||
max: null
|
||||
min: null
|
||||
name: max_length
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: false
|
||||
scope: null
|
||||
template: null
|
||||
type: number
|
||||
- auto_generate: null
|
||||
default: '
|
||||
|
||||
|
||||
'
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: Separator used for chunking
|
||||
ja_JP: チャンク分割に使用する区切り文字
|
||||
pt_BR: Separador usado para divisão
|
||||
zh_Hans: 用于分块的分隔符
|
||||
label:
|
||||
en_US: Chunk Separator
|
||||
ja_JP: チャンク区切り文字
|
||||
pt_BR: Separador de Divisão
|
||||
zh_Hans: 分块分隔符
|
||||
llm_description: The separator used to split chunks
|
||||
max: null
|
||||
min: null
|
||||
name: separator
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: false
|
||||
scope: null
|
||||
template: null
|
||||
type: string
|
||||
- auto_generate: null
|
||||
default: 512
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: Maximum length for subchunking
|
||||
ja_JP: サブチャンク分割の最大長
|
||||
pt_BR: Comprimento máximo para subdivisão
|
||||
zh_Hans: 用于子分块的最大长度
|
||||
label:
|
||||
en_US: Subchunk Maximum Length
|
||||
ja_JP: サブチャンク最大長
|
||||
pt_BR: Comprimento Máximo de Subdivisão
|
||||
zh_Hans: 子分块最大长度
|
||||
llm_description: Maximum length allowed per subchunk
|
||||
max: null
|
||||
min: null
|
||||
name: subchunk_max_length
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: false
|
||||
scope: null
|
||||
template: null
|
||||
type: number
|
||||
- auto_generate: null
|
||||
default: '. '
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: Separator used for subchunking
|
||||
ja_JP: サブチャンク分割に使用する区切り文字
|
||||
pt_BR: Separador usado para subdivisão
|
||||
zh_Hans: 用于子分块的分隔符
|
||||
label:
|
||||
en_US: Subchunk Separator
|
||||
ja_JP: サブチャンキング用セパレーター
|
||||
pt_BR: Separador de Subdivisão
|
||||
zh_Hans: 子分块分隔符
|
||||
llm_description: The separator used to split subchunks
|
||||
max: null
|
||||
min: null
|
||||
name: subchunk_separator
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: false
|
||||
scope: null
|
||||
template: null
|
||||
type: string
|
||||
- auto_generate: null
|
||||
default: paragraph
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: Split text into paragraphs based on separator and maximum chunk
|
||||
length, using split text as parent block or entire document as parent
|
||||
block and directly retrieve.
|
||||
ja_JP: セパレーターと最大チャンク長に基づいてテキストを段落に分割し、分割されたテキスト
|
||||
を親ブロックとして使用するか、文書全体を親ブロックとして使用して直接取得します。
|
||||
pt_BR: Dividir texto em parágrafos com base no separador e no comprimento
|
||||
máximo do bloco, usando o texto dividido como bloco pai ou documento
|
||||
completo como bloco pai e diretamente recuperá-lo.
|
||||
zh_Hans: 根据分隔符和最大块长度将文本拆分为段落,使用拆分文本作为检索的父块或整个文档用作父块并直接检索。
|
||||
label:
|
||||
en_US: Parent Mode
|
||||
ja_JP: 親子モード
|
||||
pt_BR: Modo Pai
|
||||
zh_Hans: 父块模式
|
||||
llm_description: Split text into paragraphs based on separator and maximum
|
||||
chunk length, using split text as parent block or entire document as parent
|
||||
block and directly retrieve.
|
||||
max: null
|
||||
min: null
|
||||
name: parent_mode
|
||||
options:
|
||||
- icon: ''
|
||||
label:
|
||||
en_US: Paragraph
|
||||
ja_JP: 段落
|
||||
pt_BR: Parágrafo
|
||||
zh_Hans: 段落
|
||||
value: paragraph
|
||||
- icon: ''
|
||||
label:
|
||||
en_US: Full Document
|
||||
ja_JP: 全文
|
||||
pt_BR: Documento Completo
|
||||
zh_Hans: 全文
|
||||
value: full_doc
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: true
|
||||
scope: null
|
||||
template: null
|
||||
type: select
|
||||
- auto_generate: null
|
||||
default: 0
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: Whether to remove extra spaces in the text
|
||||
ja_JP: テキスト内の余分なスペースを削除するかどうか
|
||||
pt_BR: Se deve remover espaços extras no texto
|
||||
zh_Hans: 是否移除文本中的多余空格
|
||||
label:
|
||||
en_US: Remove Extra Spaces
|
||||
ja_JP: 余分なスペースを削除
|
||||
pt_BR: Remover Espaços Extras
|
||||
zh_Hans: 移除多余空格
|
||||
llm_description: Whether to remove extra spaces in the text
|
||||
max: null
|
||||
min: null
|
||||
name: remove_extra_spaces
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: false
|
||||
scope: null
|
||||
template: null
|
||||
type: boolean
|
||||
- auto_generate: null
|
||||
default: 0
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: Whether to remove URLs and emails in the text
|
||||
ja_JP: テキスト内のURLやメールアドレスを削除するかどうか
|
||||
pt_BR: Se deve remover URLs e e-mails no texto
|
||||
zh_Hans: 是否移除文本中的URL和电子邮件地址
|
||||
label:
|
||||
en_US: Remove URLs and Emails
|
||||
ja_JP: URLとメールアドレスを削除
|
||||
pt_BR: Remover URLs e E-mails
|
||||
zh_Hans: 移除URL和电子邮件地址
|
||||
llm_description: Whether to remove URLs and emails in the text
|
||||
max: null
|
||||
min: null
|
||||
name: remove_urls_emails
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: false
|
||||
scope: null
|
||||
template: null
|
||||
type: boolean
|
||||
params:
|
||||
input_text: ''
|
||||
max_length: ''
|
||||
parent_mode: ''
|
||||
remove_extra_spaces: ''
|
||||
remove_urls_emails: ''
|
||||
separator: ''
|
||||
subchunk_max_length: ''
|
||||
subchunk_separator: ''
|
||||
provider_id: langgenius/parentchild_chunker/parentchild_chunker
|
||||
provider_name: langgenius/parentchild_chunker/parentchild_chunker
|
||||
provider_type: builtin
|
||||
selected: false
|
||||
title: Parent-child Chunker
|
||||
tool_configurations: {}
|
||||
tool_description: Parent-child Chunk Structure
|
||||
tool_label: Parent-child Chunker
|
||||
tool_name: parentchild_chunker
|
||||
tool_parameters:
|
||||
input_text:
|
||||
type: mixed
|
||||
value: '{{#1752482022496.output#}}'
|
||||
max_length:
|
||||
type: variable
|
||||
value:
|
||||
- rag
|
||||
- shared
|
||||
- max_chunk_length
|
||||
parent_mode:
|
||||
type: variable
|
||||
value:
|
||||
- rag
|
||||
- shared
|
||||
- parent_mode
|
||||
remove_extra_spaces:
|
||||
type: mixed
|
||||
value: '{{#rag.shared.replace_consecutive_spaces#}}'
|
||||
remove_urls_emails:
|
||||
type: mixed
|
||||
value: '{{#rag.shared.delete_urls_email#}}'
|
||||
separator:
|
||||
type: mixed
|
||||
value: '{{#rag.shared.delimiter#}}'
|
||||
subchunk_max_length:
|
||||
type: variable
|
||||
value:
|
||||
- rag
|
||||
- shared
|
||||
- child_max_chunk_length
|
||||
subchunk_separator:
|
||||
type: mixed
|
||||
value: '{{#rag.shared.child_delimiter#}}'
|
||||
type: tool
|
||||
height: 52
|
||||
id: '1752575473519'
|
||||
position:
|
||||
x: 637.9241611063885
|
||||
y: 281.3910724383104
|
||||
positionAbsolute:
|
||||
x: 637.9241611063885
|
||||
y: 281.3910724383104
|
||||
selected: true
|
||||
sourcePosition: right
|
||||
targetPosition: left
|
||||
type: custom
|
||||
width: 242
|
||||
viewport:
|
||||
x: 948.6766333808323
|
||||
y: -102.06757184183238
|
||||
zoom: 0.8375774577380971
|
||||
rag_pipeline_variables:
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: shared
|
||||
default_value: \n\n
|
||||
label: Delimiter
|
||||
max_length: 256
|
||||
options: []
|
||||
placeholder: null
|
||||
required: true
|
||||
tooltips: A delimiter is the character used to separate text. \n\n is recommended
|
||||
for splitting the original document into large parent chunks. You can also use
|
||||
special delimiters defined by yourself.
|
||||
type: text-input
|
||||
unit: null
|
||||
variable: delimiter
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: shared
|
||||
default_value: 1024
|
||||
label: Maximum chunk length
|
||||
max_length: 48
|
||||
options: []
|
||||
placeholder: null
|
||||
required: true
|
||||
tooltips: null
|
||||
type: number
|
||||
unit: characters
|
||||
variable: max_chunk_length
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: shared
|
||||
default_value: \n
|
||||
label: Child delimiter
|
||||
max_length: 256
|
||||
options: []
|
||||
placeholder: null
|
||||
required: true
|
||||
tooltips: A delimiter is the character used to separate text. \n\n is recommended
|
||||
for splitting the original document into large parent chunks. You can also use
|
||||
special delimiters defined by yourself.
|
||||
type: text-input
|
||||
unit: null
|
||||
variable: child_delimiter
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: shared
|
||||
default_value: 512
|
||||
label: Child max chunk length
|
||||
max_length: 48
|
||||
options: []
|
||||
placeholder: null
|
||||
required: true
|
||||
tooltips: null
|
||||
type: number
|
||||
unit: characters
|
||||
variable: child_max_chunk_length
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: shared
|
||||
default_value: paragraph
|
||||
label: Parent mode
|
||||
max_length: 48
|
||||
options:
|
||||
- full_doc
|
||||
- paragraph
|
||||
placeholder: null
|
||||
required: true
|
||||
tooltips: null
|
||||
type: select
|
||||
unit: null
|
||||
variable: parent_mode
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: shared
|
||||
default_value: null
|
||||
label: Replace consecutive spaces, newlines and tabs
|
||||
max_length: 48
|
||||
options: []
|
||||
placeholder: null
|
||||
required: false
|
||||
tooltips: null
|
||||
type: checkbox
|
||||
unit: null
|
||||
variable: replace_consecutive_spaces
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: shared
|
||||
default_value: null
|
||||
label: Delete all URLs and email addresses
|
||||
max_length: 48
|
||||
options: []
|
||||
placeholder: null
|
||||
required: false
|
||||
tooltips: null
|
||||
type: checkbox
|
||||
unit: null
|
||||
variable: delete_urls_email
|
||||
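The file-parentchild.yml template above feeds the extracted text into the Parent-child Chunker, which splits parents by separator/max_length and children by subchunk_separator/subchunk_max_length, with parent_mode choosing between paragraph parents and the full document as a single parent. The following is a simplified, assumed sketch of that two-level split; the real langgenius/parentchild_chunker plugin may behave differently.

def split_with_limit(text: str, separator: str, max_length: int) -> list[str]:
    # Split on the separator, then hard-wrap any piece that still exceeds max_length.
    chunks: list[str] = []
    for piece in text.split(separator):
        piece = piece.strip()
        while len(piece) > max_length:
            chunks.append(piece[:max_length])
            piece = piece[max_length:]
        if piece:
            chunks.append(piece)
    return chunks


def parent_child_chunks(text: str, parent_mode: str = "paragraph",
                        separator: str = "\n\n", max_length: int = 1024,
                        subchunk_separator: str = ". ", subchunk_max_length: int = 512) -> list[dict]:
    # full_doc: the entire document is one parent; paragraph: split parents first.
    parents = [text] if parent_mode == "full_doc" else split_with_limit(text, separator, max_length)
    return [
        {"parent": parent, "children": split_with_limit(parent, subchunk_separator, subchunk_max_length)}
        for parent in parents
    ]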
@@ -0,0 +1,400 @@
|
||||
dependencies:
|
||||
- current_identifier: null
|
||||
type: marketplace
|
||||
value:
|
||||
plugin_unique_identifier: langgenius/general_chunker:0.0.1@e3da408b7277866404c3f884d599261f9d0b9003ea4ef7eb3b64489bdf39d18b
|
||||
- current_identifier: null
|
||||
type: marketplace
|
||||
value:
|
||||
plugin_unique_identifier: langgenius/notion_datasource:0.0.1@2dd49c2c3ffff976be8d22efb1ac0f63522a8d0f24ef8c44729d0a50a94ec039
|
||||
kind: rag_pipeline
|
||||
rag_pipeline:
|
||||
description: ''
|
||||
icon: 📙
|
||||
icon_background: ''
|
||||
icon_type: emoji
|
||||
name: notion-general-economy
|
||||
version: 0.1.0
|
||||
workflow:
|
||||
conversation_variables: []
|
||||
environment_variables: []
|
||||
features: {}
|
||||
graph:
|
||||
edges:
|
||||
- data:
|
||||
isInIteration: false
|
||||
isInLoop: false
|
||||
sourceType: tool
|
||||
targetType: knowledge-index
|
||||
id: 1752482151668-source-1752477924228-target
|
||||
source: '1752482151668'
|
||||
sourceHandle: source
|
||||
target: '1752477924228'
|
||||
targetHandle: target
|
||||
type: custom
|
||||
zIndex: 0
|
||||
- data:
|
||||
isInIteration: false
|
||||
isInLoop: false
|
||||
sourceType: datasource
|
||||
targetType: tool
|
||||
id: 1752489759475-source-1752482151668-target
|
||||
source: '1752489759475'
|
||||
sourceHandle: source
|
||||
target: '1752482151668'
|
||||
targetHandle: target
|
||||
type: custom
|
||||
zIndex: 0
|
||||
nodes:
|
||||
- data:
|
||||
chunk_structure: text_model
|
||||
embedding_model: text-embedding-ada-002
|
||||
embedding_model_provider: langgenius/openai/openai
|
||||
index_chunk_variable_selector:
|
||||
- '1752482151668'
|
||||
- result
|
||||
indexing_technique: economy
|
||||
keyword_number: 10
|
||||
retrieval_model:
|
||||
score_threshold: 0.5
|
||||
score_threshold_enabled: false
|
||||
search_method: keyword_search
|
||||
top_k: 3
|
||||
vector_setting:
|
||||
embedding_model_name: text-embedding-ada-002
|
||||
embedding_provider_name: langgenius/openai/openai
|
||||
selected: true
|
||||
title: Knowledge Base
|
||||
type: knowledge-index
|
||||
height: 114
|
||||
id: '1752477924228'
|
||||
position:
|
||||
x: 1444.5503479271906
|
||||
y: 281.3910724383104
|
||||
positionAbsolute:
|
||||
x: 1444.5503479271906
|
||||
y: 281.3910724383104
|
||||
selected: true
|
||||
sourcePosition: right
|
||||
targetPosition: left
|
||||
type: custom
|
||||
width: 242
|
||||
- data:
|
||||
is_team_authorization: true
|
||||
output_schema:
|
||||
properties:
|
||||
result:
|
||||
description: The result of the general chunk tool.
|
||||
properties:
|
||||
general_chunks:
|
||||
items:
|
||||
description: The chunk of the text.
|
||||
type: string
|
||||
type: array
|
||||
type: object
|
||||
type: object
|
||||
paramSchemas:
|
||||
- auto_generate: null
|
||||
default: null
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: The text you want to chunk.
|
||||
ja_JP: チャンク化したいテキスト。
|
||||
pt_BR: O texto que você deseja dividir.
|
||||
zh_Hans: 你想要分块的文本。
|
||||
label:
|
||||
en_US: Input Variable
|
||||
ja_JP: 入力変数
|
||||
pt_BR: Variável de entrada
|
||||
zh_Hans: 输入变量
|
||||
llm_description: The text you want to chunk.
|
||||
max: null
|
||||
min: null
|
||||
name: input_variable
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: true
|
||||
scope: null
|
||||
template: null
|
||||
type: string
|
||||
- auto_generate: null
|
||||
default: null
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: The delimiter of the chunks.
|
||||
ja_JP: チャンクの区切り記号。
|
||||
pt_BR: O delimitador dos pedaços.
|
||||
zh_Hans: 块的分隔符。
|
||||
label:
|
||||
en_US: Delimiter
|
||||
ja_JP: 区切り記号
|
||||
pt_BR: Delimitador
|
||||
zh_Hans: 分隔符
|
||||
llm_description: The delimiter of the chunks, the format of the delimiter
|
||||
must be a string.
|
||||
max: null
|
||||
min: null
|
||||
name: delimiter
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: true
|
||||
scope: null
|
||||
template: null
|
||||
type: string
|
||||
- auto_generate: null
|
||||
default: null
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: The maximum chunk length.
|
||||
ja_JP: 最大長のチャンク。
|
||||
pt_BR: O comprimento máximo do bloco
|
||||
zh_Hans: 最大块的长度。
|
||||
label:
|
||||
en_US: Maximum Chunk Length
|
||||
ja_JP: チャンク最大長
|
||||
pt_BR: O comprimento máximo do bloco
|
||||
zh_Hans: 最大块的长度
|
||||
llm_description: The maximum chunk length, the format of the chunk size
|
||||
must be an integer.
|
||||
max: null
|
||||
min: null
|
||||
name: max_chunk_length
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: true
|
||||
scope: null
|
||||
template: null
|
||||
type: number
|
||||
- auto_generate: null
|
||||
default: null
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: The chunk overlap length.
|
||||
ja_JP: チャンクの重複長
|
||||
pt_BR: The chunk overlap length.
|
||||
zh_Hans: 块的重叠长度。
|
||||
label:
|
||||
en_US: Chunk Overlap Length
|
||||
ja_JP: チャンク重複長
|
||||
pt_BR: Chunk Overlap Length
|
||||
zh_Hans: 块的重叠长度
|
||||
llm_description: The chunk overlap length, the format of the chunk overlap
|
||||
length must be an integer.
|
||||
max: null
|
||||
min: null
|
||||
name: chunk_overlap_length
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: false
|
||||
scope: null
|
||||
template: null
|
||||
type: number
|
||||
- auto_generate: null
|
||||
default: null
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: Replace consecutive spaces, newlines and tabs
|
||||
ja_JP: 連続のスペース、改行、またはタブを置換する
|
||||
pt_BR: Replace consecutive spaces, newlines and tabs
|
||||
zh_Hans: 替换连续的空格、换行符和制表符
|
||||
label:
|
||||
en_US: Replace Consecutive Spaces, Newlines and Tabs
|
||||
ja_JP: 連続のスペース、改行、またはタブを置換する
|
||||
pt_BR: Replace Consecutive Spaces, Newlines and Tabs
|
||||
zh_Hans: 替换连续的空格、换行符和制表符
|
||||
llm_description: Replace consecutive spaces, newlines and tabs, the format
|
||||
of the replace must be a boolean.
|
||||
max: null
|
||||
min: null
|
||||
name: replace_consecutive_spaces_newlines_tabs
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: false
|
||||
scope: null
|
||||
template: null
|
||||
type: boolean
|
||||
- auto_generate: null
|
||||
default: null
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: Delete all URLs and email addresses
|
||||
ja_JP: すべてのURLとメールアドレスを削除する
|
||||
pt_BR: Delete all URLs and email addresses
|
||||
zh_Hans: 删除所有URL和电子邮件地址
|
||||
label:
|
||||
en_US: Delete All URLs and Email Addresses
|
||||
ja_JP: すべてのURLとメールアドレスを削除する
|
||||
pt_BR: Delete All URLs and Email Addresses
|
||||
zh_Hans: 删除所有URL和电子邮件地址
|
||||
llm_description: Delete all URLs and email addresses, the format of the
|
||||
delete must be a boolean.
|
||||
max: null
|
||||
min: null
|
||||
name: delete_all_urls_and_email_addresses
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: false
|
||||
scope: null
|
||||
template: null
|
||||
type: boolean
|
||||
params:
|
||||
chunk_overlap_length: ''
|
||||
delete_all_urls_and_email_addresses: ''
|
||||
delimiter: ''
|
||||
input_variable: ''
|
||||
max_chunk_length: ''
|
||||
replace_consecutive_spaces_newlines_tabs: ''
|
||||
provider_id: langgenius/general_chunker/general_chunker
|
||||
provider_name: langgenius/general_chunker/general_chunker
|
||||
provider_type: builtin
|
||||
selected: false
|
||||
title: General Chunker
|
||||
tool_configurations: {}
|
||||
tool_description: A tool for general text chunking mode, the chunks retrieved and recalled are the same.
|
||||
tool_label: General Chunker
|
||||
tool_name: general_chunker
|
||||
tool_parameters:
|
||||
chunk_overlap_length:
|
||||
type: variable
|
||||
value:
|
||||
- rag
|
||||
- shared
|
||||
- chunk_overlap
|
||||
delete_all_urls_and_email_addresses:
|
||||
type: mixed
|
||||
value: '{{#rag.shared.delete_urls_email#}}'
|
||||
delimiter:
|
||||
type: mixed
|
||||
value: '{{#rag.shared.delimiter#}}'
|
||||
input_variable:
|
||||
type: mixed
|
||||
value: '{{#1752489759475.content#}}'
|
||||
max_chunk_length:
|
||||
type: variable
|
||||
value:
|
||||
- rag
|
||||
- shared
|
||||
- max_chunk_length
|
||||
replace_consecutive_spaces_newlines_tabs:
|
||||
type: mixed
|
||||
value: '{{#rag.shared.replace_consecutive_spaces#}}'
|
||||
type: tool
|
||||
height: 52
|
||||
id: '1752482151668'
|
||||
position:
|
||||
x: 1063.6922916384628
|
||||
y: 281.3910724383104
|
||||
positionAbsolute:
|
||||
x: 1063.6922916384628
|
||||
y: 281.3910724383104
|
||||
selected: false
|
||||
sourcePosition: right
|
||||
targetPosition: left
|
||||
type: custom
|
||||
width: 242
|
||||
- data:
|
||||
datasource_configurations: {}
|
||||
datasource_label: Notion Data Source
|
||||
datasource_name: notion_datasource
|
||||
datasource_parameters: {}
|
||||
plugin_id: langgenius/notion_datasource
|
||||
provider_name: notion_datasource
|
||||
provider_type: online_document
|
||||
selected: false
|
||||
title: Notion Data Source
|
||||
type: datasource
|
||||
height: 52
|
||||
id: '1752489759475'
|
||||
position:
|
||||
x: 736.9082104000458
|
||||
y: 281.3910724383104
|
||||
positionAbsolute:
|
||||
x: 736.9082104000458
|
||||
y: 281.3910724383104
|
||||
sourcePosition: right
|
||||
targetPosition: left
|
||||
type: custom
|
||||
width: 242
|
||||
viewport:
|
||||
x: -838.569649323166
|
||||
y: -168.94656489167426
|
||||
zoom: 1.286925643857699
|
||||
rag_pipeline_variables:
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: shared
|
||||
default_value: \n\n
|
||||
label: Delimiter
|
||||
max_length: 100
|
||||
options: []
|
||||
placeholder: null
|
||||
required: true
|
||||
tooltips: A delimiter is the character used to separate text. \n\n is recommended
|
||||
for splitting the original document into large parent chunks. You can also use
|
||||
special delimiters defined by yourself.
|
||||
type: text-input
|
||||
unit: null
|
||||
variable: delimiter
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: shared
|
||||
default_value: null
|
||||
label: Maximum chunk length
|
||||
max_length: 48
|
||||
options: []
|
||||
placeholder: null
|
||||
required: true
|
||||
tooltips: null
|
||||
type: number
|
||||
unit: characters
|
||||
variable: max_chunk_length
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: shared
|
||||
default_value: null
|
||||
label: Chunk overlap
|
||||
max_length: 48
|
||||
options: []
|
||||
placeholder: null
|
||||
required: false
|
||||
tooltips: null
|
||||
type: number
|
||||
unit: characters
|
||||
variable: chunk_overlap
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: shared
|
||||
default_value: null
|
||||
label: Replace consecutive spaces, newlines and tabs
|
||||
max_length: 48
|
||||
options: []
|
||||
placeholder: null
|
||||
required: false
|
||||
tooltips: null
|
||||
type: checkbox
|
||||
unit: null
|
||||
variable: replace_consecutive_spaces
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: shared
|
||||
default_value: null
|
||||
label: Delete all URLs and email addresses
|
||||
max_length: 48
|
||||
options: []
|
||||
placeholder: null
|
||||
required: false
|
||||
tooltips: null
|
||||
type: checkbox
|
||||
unit: null
|
||||
variable: delete_urls_email
|
||||
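The template above (notion-general-economy) and the next one (notion-general-high-quality) share the same graph; the practical difference sits in the Knowledge Base node, where economy indexing pairs with keyword_search (keyword_number: 10) and high_quality indexing pairs with semantic_search plus an embedding model. A hedged, illustrative sketch of that mapping follows; it mirrors only the defaults visible in these templates and is not the dify implementation.

def retrieval_settings(indexing_technique: str) -> dict:
    # Mirror the knowledge-index node defaults used in these templates.
    if indexing_technique == "economy":
        return {"search_method": "keyword_search", "keyword_number": 10, "top_k": 3}
    return {
        "search_method": "semantic_search",
        "embedding_model_provider": "langgenius/openai/openai",
        "embedding_model": "text-embedding-ada-002",
        "top_k": 3,
        "score_threshold_enabled": False,
    }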
@@ -0,0 +1,400 @@
|
||||
dependencies:
|
||||
- current_identifier: null
|
||||
type: marketplace
|
||||
value:
|
||||
plugin_unique_identifier: langgenius/general_chunker:0.0.1@e3da408b7277866404c3f884d599261f9d0b9003ea4ef7eb3b64489bdf39d18b
|
||||
- current_identifier: null
|
||||
type: marketplace
|
||||
value:
|
||||
plugin_unique_identifier: langgenius/notion_datasource:0.0.1@2dd49c2c3ffff976be8d22efb1ac0f63522a8d0f24ef8c44729d0a50a94ec039
|
||||
kind: rag_pipeline
|
||||
rag_pipeline:
|
||||
description: ''
|
||||
icon: 📙
|
||||
icon_background: '#FFF4ED'
|
||||
icon_type: emoji
|
||||
name: notion-general-high-quality
|
||||
version: 0.1.0
|
||||
workflow:
|
||||
conversation_variables: []
|
||||
environment_variables: []
|
||||
features: {}
|
||||
graph:
|
||||
edges:
|
||||
- data:
|
||||
isInIteration: false
|
||||
isInLoop: false
|
||||
sourceType: tool
|
||||
targetType: knowledge-index
|
||||
id: 1752482151668-source-1752477924228-target
|
||||
source: '1752482151668'
|
||||
sourceHandle: source
|
||||
target: '1752477924228'
|
||||
targetHandle: target
|
||||
type: custom
|
||||
zIndex: 0
|
||||
- data:
|
||||
isInIteration: false
|
||||
isInLoop: false
|
||||
sourceType: datasource
|
||||
targetType: tool
|
||||
id: 1752489759475-source-1752482151668-target
|
||||
source: '1752489759475'
|
||||
sourceHandle: source
|
||||
target: '1752482151668'
|
||||
targetHandle: target
|
||||
type: custom
|
||||
zIndex: 0
|
||||
nodes:
|
||||
- data:
|
||||
chunk_structure: text_model
|
||||
embedding_model: text-embedding-ada-002
|
||||
embedding_model_provider: langgenius/openai/openai
|
||||
index_chunk_variable_selector:
|
||||
- '1752482151668'
|
||||
- result
|
||||
indexing_technique: high_quality
|
||||
keyword_number: 10
|
||||
retrieval_model:
|
||||
score_threshold: 0.5
|
||||
score_threshold_enabled: false
|
||||
search_method: semantic_search
|
||||
top_k: 3
|
||||
vector_setting:
|
||||
embedding_model_name: text-embedding-ada-002
|
||||
embedding_provider_name: langgenius/openai/openai
|
||||
selected: true
|
||||
title: Knowledge Base
|
||||
type: knowledge-index
|
||||
height: 114
|
||||
id: '1752477924228'
|
||||
position:
|
||||
x: 1444.5503479271906
|
||||
y: 281.3910724383104
|
||||
positionAbsolute:
|
||||
x: 1444.5503479271906
|
||||
y: 281.3910724383104
|
||||
selected: true
|
||||
sourcePosition: right
|
||||
targetPosition: left
|
||||
type: custom
|
||||
width: 242
|
||||
- data:
|
||||
is_team_authorization: true
|
||||
output_schema:
|
||||
properties:
|
||||
result:
|
||||
description: The result of the general chunk tool.
|
||||
properties:
|
||||
general_chunks:
|
||||
items:
|
||||
description: The chunk of the text.
|
||||
type: string
|
||||
type: array
|
||||
type: object
|
||||
type: object
|
||||
paramSchemas:
|
||||
- auto_generate: null
|
||||
default: null
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: The text you want to chunk.
|
||||
ja_JP: チャンク化したいテキスト。
|
||||
pt_BR: O texto que você deseja dividir.
|
||||
zh_Hans: 你想要分块的文本。
|
||||
label:
|
||||
en_US: Input Variable
|
||||
ja_JP: 入力変数
|
||||
pt_BR: Variável de entrada
|
||||
zh_Hans: 输入变量
|
||||
llm_description: The text you want to chunk.
|
||||
max: null
|
||||
min: null
|
||||
name: input_variable
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: true
|
||||
scope: null
|
||||
template: null
|
||||
type: string
|
||||
- auto_generate: null
|
||||
default: null
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: The delimiter of the chunks.
|
||||
ja_JP: チャンクの区切り記号。
|
||||
pt_BR: O delimitador dos pedaços.
|
||||
zh_Hans: 块的分隔符。
|
||||
label:
|
||||
en_US: Delimiter
|
||||
ja_JP: 区切り記号
|
||||
pt_BR: Delimitador
|
||||
zh_Hans: 分隔符
|
||||
llm_description: The delimiter of the chunks, the format of the delimiter
|
||||
must be a string.
|
||||
max: null
|
||||
min: null
|
||||
name: delimiter
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: true
|
||||
scope: null
|
||||
template: null
|
||||
type: string
|
||||
- auto_generate: null
|
||||
default: null
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: The maximum chunk length.
|
||||
ja_JP: 最大長のチャンク。
|
||||
pt_BR: O comprimento máximo do bloco
|
||||
zh_Hans: 最大块的长度。
|
||||
label:
|
||||
en_US: Maximum Chunk Length
|
||||
ja_JP: チャンク最大長
|
||||
pt_BR: O comprimento máximo do bloco
|
||||
zh_Hans: 最大块的长度
|
||||
llm_description: The maximum chunk length, the format of the chunk size
|
||||
must be an integer.
|
||||
max: null
|
||||
min: null
|
||||
name: max_chunk_length
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: true
|
||||
scope: null
|
||||
template: null
|
||||
type: number
|
||||
- auto_generate: null
|
||||
default: null
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: The chunk overlap length.
|
||||
ja_JP: チャンクの重複長
|
||||
pt_BR: The chunk overlap length.
|
||||
zh_Hans: 块的重叠长度。
|
||||
label:
|
||||
en_US: Chunk Overlap Length
|
||||
ja_JP: チャンク重複長
|
||||
pt_BR: Chunk Overlap Length
|
||||
zh_Hans: 块的重叠长度
|
||||
llm_description: The chunk overlap length, the format of the chunk overlap
|
||||
length must be an integer.
|
||||
max: null
|
||||
min: null
|
||||
name: chunk_overlap_length
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: false
|
||||
scope: null
|
||||
template: null
|
||||
type: number
|
||||
- auto_generate: null
|
||||
default: null
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: Replace consecutive spaces, newlines and tabs
|
||||
ja_JP: 連続のスペース、改行、またはタブを置換する
|
||||
pt_BR: Replace consecutive spaces, newlines and tabs
|
||||
zh_Hans: 替换连续的空格、换行符和制表符
|
||||
label:
|
||||
en_US: Replace Consecutive Spaces, Newlines and Tabs
|
||||
ja_JP: 連続のスペース、改行、またはタブを置換する
|
||||
pt_BR: Replace Consecutive Spaces, Newlines and Tabs
|
||||
zh_Hans: 替换连续的空格、换行符和制表符
|
||||
llm_description: Replace consecutive spaces, newlines and tabs, the format
|
||||
of the replace must be a boolean.
|
||||
max: null
|
||||
min: null
|
||||
name: replace_consecutive_spaces_newlines_tabs
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: false
|
||||
scope: null
|
||||
template: null
|
||||
type: boolean
|
||||
- auto_generate: null
|
||||
default: null
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: Delete all URLs and email addresses
|
||||
ja_JP: すべてのURLとメールアドレスを削除する
|
||||
pt_BR: Delete all URLs and email addresses
|
||||
zh_Hans: 删除所有URL和电子邮件地址
|
||||
label:
|
||||
en_US: Delete All URLs and Email Addresses
|
||||
ja_JP: すべてのURLとメールアドレスを削除する
|
||||
pt_BR: Delete All URLs and Email Addresses
|
||||
zh_Hans: 删除所有URL和电子邮件地址
|
||||
llm_description: Delete all URLs and email addresses, the format of the
|
||||
delete must be a boolean.
|
||||
max: null
|
||||
min: null
|
||||
name: delete_all_urls_and_email_addresses
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: false
|
||||
scope: null
|
||||
template: null
|
||||
type: boolean
|
||||
params:
|
||||
chunk_overlap_length: ''
|
||||
delete_all_urls_and_email_addresses: ''
|
||||
delimiter: ''
|
||||
input_variable: ''
|
||||
max_chunk_length: ''
|
||||
replace_consecutive_spaces_newlines_tabs: ''
|
||||
provider_id: langgenius/general_chunker/general_chunker
|
||||
provider_name: langgenius/general_chunker/general_chunker
|
||||
provider_type: builtin
|
||||
selected: false
|
||||
title: General Chunker
|
||||
tool_configurations: {}
|
||||
tool_description: A tool for general text chunking mode, the chunks retrieved and recalled are the same.
|
||||
tool_label: General Chunker
|
||||
tool_name: general_chunker
|
||||
tool_parameters:
|
||||
chunk_overlap_length:
|
||||
type: variable
|
||||
value:
|
||||
- rag
|
||||
- shared
|
||||
- chunk_overlap
|
||||
delete_all_urls_and_email_addresses:
|
||||
type: mixed
|
||||
value: '{{#rag.shared.delete_urls_email#}}'
|
||||
delimiter:
|
||||
type: mixed
|
||||
value: '{{#rag.shared.delimiter#}}'
|
||||
input_variable:
|
||||
type: mixed
|
||||
value: '{{#1752489759475.content#}}'
|
||||
max_chunk_length:
|
||||
type: variable
|
||||
value:
|
||||
- rag
|
||||
- shared
|
||||
- max_chunk_length
|
||||
replace_consecutive_spaces_newlines_tabs:
|
||||
type: mixed
|
||||
value: '{{#rag.shared.replace_consecutive_spaces#}}'
|
||||
type: tool
|
||||
height: 52
|
||||
id: '1752482151668'
|
||||
position:
|
||||
x: 1063.6922916384628
|
||||
y: 281.3910724383104
|
||||
positionAbsolute:
|
||||
x: 1063.6922916384628
|
||||
y: 281.3910724383104
|
||||
selected: false
|
||||
sourcePosition: right
|
||||
targetPosition: left
|
||||
type: custom
|
||||
width: 242
|
||||
- data:
|
||||
datasource_configurations: {}
|
||||
datasource_label: Notion Data Source
|
||||
datasource_name: notion_datasource
|
||||
datasource_parameters: {}
|
||||
plugin_id: langgenius/notion_datasource
|
||||
provider_name: notion_datasource
|
||||
provider_type: online_document
|
||||
selected: false
|
||||
title: Notion Data Source
|
||||
type: datasource
|
||||
height: 52
|
||||
id: '1752489759475'
|
||||
position:
|
||||
x: 736.9082104000458
|
||||
y: 281.3910724383104
|
||||
positionAbsolute:
|
||||
x: 736.9082104000458
|
||||
y: 281.3910724383104
|
||||
sourcePosition: right
|
||||
targetPosition: left
|
||||
type: custom
|
||||
width: 242
|
||||
viewport:
|
||||
x: -838.569649323166
|
||||
y: -168.94656489167426
|
||||
zoom: 1.286925643857699
|
||||
rag_pipeline_variables:
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: shared
|
||||
default_value: \n\n
|
||||
label: Delimiter
|
||||
max_length: 100
|
||||
options: []
|
||||
placeholder: null
|
||||
required: true
|
||||
tooltips: A delimiter is the character used to separate text. \n\n is recommended
|
||||
for splitting the original document into large parent chunks. You can also use
|
||||
special delimiters defined by yourself.
|
||||
type: text-input
|
||||
unit: null
|
||||
variable: delimiter
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: shared
|
||||
default_value: null
|
||||
label: Maximum chunk length
|
||||
max_length: 48
|
||||
options: []
|
||||
placeholder: null
|
||||
required: true
|
||||
tooltips: null
|
||||
type: number
|
||||
unit: characters
|
||||
variable: max_chunk_length
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: shared
|
||||
default_value: null
|
||||
label: Chunk overlap
|
||||
max_length: 48
|
||||
options: []
|
||||
placeholder: null
|
||||
required: false
|
||||
tooltips: null
|
||||
type: number
|
||||
unit: characters
|
||||
variable: chunk_overlap
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: shared
|
||||
default_value: null
|
||||
label: Replace consecutive spaces, newlines and tabs
|
||||
max_length: 48
|
||||
options: []
|
||||
placeholder: null
|
||||
required: false
|
||||
tooltips: null
|
||||
type: checkbox
|
||||
unit: null
|
||||
variable: replace_consecutive_spaces
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: shared
|
||||
default_value: null
|
||||
label: Delete all URLs and email addresses
|
||||
max_length: 48
|
||||
options: []
|
||||
placeholder: null
|
||||
required: false
|
||||
tooltips: null
|
||||
type: checkbox
|
||||
unit: null
|
||||
variable: delete_urls_email
|
||||
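The two Notion general templates above drive the General Chunker, whose parameters include chunk_overlap_length in addition to the delimiter and maximum chunk length. As a rough, assumed illustration of delimiter-based chunking with overlap (not the plugin's actual code), before the parent-child Notion template below:

def general_chunks(text: str, delimiter: str = "\n\n",
                   max_chunk_length: int = 1024, chunk_overlap_length: int = 0) -> list[str]:
    # Split on the delimiter, hard-wrap any segment longer than max_chunk_length,
    # then prefix each chunk after the first with the tail of the previous one as overlap.
    base: list[str] = []
    for segment in text.split(delimiter):
        segment = segment.strip()
        while len(segment) > max_chunk_length:
            base.append(segment[:max_chunk_length])
            segment = segment[max_chunk_length:]
        if segment:
            base.append(segment)
    if chunk_overlap_length <= 0 or len(base) < 2:
        return base
    return [base[0]] + [base[i - 1][-chunk_overlap_length:] + base[i] for i in range(1, len(base))]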
506
dify/api/services/rag_pipeline/transform/notion-parentchild.yml
Normal file
@@ -0,0 +1,506 @@
|
||||
dependencies:
|
||||
- current_identifier: null
|
||||
type: marketplace
|
||||
value:
|
||||
plugin_unique_identifier: langgenius/parentchild_chunker:0.0.1@b1a28a27e33fec442ce494da2a7814edd7eb9d646c81f38bccfcf1133d486e40
|
||||
- current_identifier: null
|
||||
type: marketplace
|
||||
value:
|
||||
plugin_unique_identifier: langgenius/notion_datasource:0.0.1@2dd49c2c3ffff976be8d22efb1ac0f63522a8d0f24ef8c44729d0a50a94ec039
|
||||
kind: rag_pipeline
|
||||
rag_pipeline:
|
||||
description: ''
|
||||
icon: 📙
|
||||
icon_background: ''
|
||||
icon_type: emoji
|
||||
name: notion-parentchild
|
||||
version: 0.1.0
|
||||
workflow:
|
||||
conversation_variables: []
|
||||
environment_variables: []
|
||||
features: {}
|
||||
graph:
|
||||
edges:
|
||||
- data:
|
||||
isInIteration: false
|
||||
isInLoop: false
|
||||
sourceType: datasource
|
||||
targetType: tool
|
||||
id: 1752489759475-source-1752490343805-target
|
||||
source: '1752489759475'
|
||||
sourceHandle: source
|
||||
target: '1752490343805'
|
||||
targetHandle: target
|
||||
type: custom
|
||||
zIndex: 0
|
||||
- data:
|
||||
isInLoop: false
|
||||
sourceType: tool
|
||||
targetType: knowledge-index
|
||||
id: 1752490343805-source-1752477924228-target
|
||||
source: '1752490343805'
|
||||
sourceHandle: source
|
||||
target: '1752477924228'
|
||||
targetHandle: target
|
||||
type: custom
|
||||
zIndex: 0
|
||||
nodes:
|
||||
- data:
|
||||
chunk_structure: hierarchical_model
|
||||
embedding_model: text-embedding-ada-002
|
||||
embedding_model_provider: langgenius/openai/openai
|
||||
index_chunk_variable_selector:
|
||||
- '1752490343805'
|
||||
- result
|
||||
indexing_technique: high_quality
|
||||
keyword_number: 10
|
||||
retrieval_model:
|
||||
score_threshold: 0.5
|
||||
score_threshold_enabled: false
|
||||
search_method: semantic_search
|
||||
top_k: 3
|
||||
vector_setting:
|
||||
embedding_model_name: text-embedding-ada-002
|
||||
embedding_provider_name: langgenius/openai/openai
|
||||
selected: false
|
||||
title: Knowledge Base
|
||||
type: knowledge-index
|
||||
height: 114
|
||||
id: '1752477924228'
|
||||
position:
|
||||
x: 1486.2052698032674
|
||||
y: 281.3910724383104
|
||||
positionAbsolute:
|
||||
x: 1486.2052698032674
|
||||
y: 281.3910724383104
|
||||
selected: false
|
||||
sourcePosition: right
|
||||
targetPosition: left
|
||||
type: custom
|
||||
width: 242
|
||||
- data:
|
||||
datasource_configurations: {}
|
||||
datasource_label: Notion Data Source
|
||||
datasource_name: notion_datasource
|
||||
datasource_parameters: {}
|
||||
plugin_id: langgenius/notion_datasource
|
||||
provider_name: notion_datasource
|
||||
provider_type: online_document
|
||||
selected: false
|
||||
title: Notion Data Source
|
||||
type: datasource
|
||||
height: 52
|
||||
id: '1752489759475'
|
||||
position:
|
||||
x: 736.9082104000458
|
||||
y: 281.3910724383104
|
||||
positionAbsolute:
|
||||
x: 736.9082104000458
|
||||
y: 281.3910724383104
|
||||
selected: false
|
||||
sourcePosition: right
|
||||
targetPosition: left
|
||||
type: custom
|
||||
width: 242
|
||||
- data:
|
||||
is_team_authorization: true
|
||||
output_schema:
|
||||
properties:
|
||||
result:
|
||||
description: Parent child chunks result
|
||||
items:
|
||||
type: object
|
||||
type: array
|
||||
type: object
|
||||
paramSchemas:
|
||||
- auto_generate: null
|
||||
default: null
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: The text you want to chunk.
|
||||
ja_JP: チャンク化したいテキスト。
|
||||
pt_BR: O texto que você deseja dividir.
|
||||
zh_Hans: 你想要分块的文本。
|
||||
label:
|
||||
en_US: Input text
|
||||
ja_JP: 入力テキスト
|
||||
pt_BR: Texto de entrada
|
||||
zh_Hans: 输入文本
|
||||
llm_description: The text you want to chunk.
|
||||
max: null
|
||||
min: null
|
||||
name: input_text
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: true
|
||||
scope: null
|
||||
template: null
|
||||
type: string
|
||||
- auto_generate: null
|
||||
default: 1024
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: Maximum length for chunking
|
||||
ja_JP: チャンク分割の最大長
|
||||
pt_BR: Comprimento máximo para divisão
|
||||
zh_Hans: 用于分块的最大长度
|
||||
label:
|
||||
en_US: Maximum Length
|
||||
ja_JP: 最大長
|
||||
pt_BR: Comprimento Máximo
|
||||
zh_Hans: 最大长度
|
||||
llm_description: Maximum length allowed per chunk
|
||||
max: null
|
||||
min: null
|
||||
name: max_length
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: false
|
||||
scope: null
|
||||
template: null
|
||||
type: number
|
||||
- auto_generate: null
|
||||
default: '
|
||||
|
||||
|
||||
'
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: Separator used for chunking
|
||||
ja_JP: チャンク分割に使用する区切り文字
|
||||
pt_BR: Separador usado para divisão
|
||||
zh_Hans: 用于分块的分隔符
|
||||
label:
|
||||
en_US: Chunk Separator
|
||||
ja_JP: チャンク区切り文字
|
||||
pt_BR: Separador de Divisão
|
||||
zh_Hans: 分块分隔符
|
||||
llm_description: The separator used to split chunks
|
||||
max: null
|
||||
min: null
|
||||
name: separator
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: false
|
||||
scope: null
|
||||
template: null
|
||||
type: string
|
||||
- auto_generate: null
|
||||
default: 512
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: Maximum length for subchunking
|
||||
ja_JP: サブチャンク分割の最大長
|
||||
pt_BR: Comprimento máximo para subdivisão
|
||||
zh_Hans: 用于子分块的最大长度
|
||||
label:
|
||||
en_US: Subchunk Maximum Length
|
||||
ja_JP: サブチャンク最大長
|
||||
pt_BR: Comprimento Máximo de Subdivisão
|
||||
zh_Hans: 子分块最大长度
|
||||
llm_description: Maximum length allowed per subchunk
|
||||
max: null
|
||||
min: null
|
||||
name: subchunk_max_length
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: false
|
||||
scope: null
|
||||
template: null
|
||||
type: number
|
||||
- auto_generate: null
|
||||
default: '. '
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: Separator used for subchunking
|
||||
ja_JP: サブチャンク分割に使用する区切り文字
|
||||
pt_BR: Separador usado para subdivisão
|
||||
zh_Hans: 用于子分块的分隔符
|
||||
label:
|
||||
en_US: Subchunk Separator
|
||||
ja_JP: サブチャンキング用セパレーター
|
||||
pt_BR: Separador de Subdivisão
|
||||
zh_Hans: 子分块分隔符
|
||||
llm_description: The separator used to split subchunks
|
||||
max: null
|
||||
min: null
|
||||
name: subchunk_separator
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: false
|
||||
scope: null
|
||||
template: null
|
||||
type: string
|
||||
- auto_generate: null
|
||||
default: paragraph
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: Split text into paragraphs based on separator and maximum chunk
|
||||
length, using split text as parent block or entire document as parent
|
||||
block and directly retrieve.
|
||||
ja_JP: セパレーターと最大チャンク長に基づいてテキストを段落に分割し、分割されたテキスト
|
||||
を親ブロックとして使用するか、文書全体を親ブロックとして使用して直接取得します。
|
||||
pt_BR: Dividir texto em parágrafos com base no separador e no comprimento
|
||||
máximo do bloco, usando o texto dividido como bloco pai ou documento
|
||||
completo como bloco pai e diretamente recuperá-lo.
|
||||
zh_Hans: 根据分隔符和最大块长度将文本拆分为段落,使用拆分文本作为检索的父块或整个文档用作父块并直接检索。
|
||||
label:
|
||||
en_US: Parent Mode
|
||||
ja_JP: 親子モード
|
||||
pt_BR: Modo Pai
|
||||
zh_Hans: 父块模式
|
||||
llm_description: Split text into paragraphs based on separator and maximum
|
||||
chunk length, using split text as parent block or entire document as parent
|
||||
block and directly retrieve.
|
||||
max: null
|
||||
min: null
|
||||
name: parent_mode
|
||||
options:
|
||||
- icon: ''
|
||||
label:
|
||||
en_US: Paragraph
|
||||
ja_JP: 段落
|
||||
pt_BR: Parágrafo
|
||||
zh_Hans: 段落
|
||||
value: paragraph
|
||||
- icon: ''
|
||||
label:
|
||||
en_US: Full Document
|
||||
ja_JP: 全文
|
||||
pt_BR: Documento Completo
|
||||
zh_Hans: 全文
|
||||
value: full_doc
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: true
|
||||
scope: null
|
||||
template: null
|
||||
type: select
|
||||
- auto_generate: null
|
||||
default: 0
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: Whether to remove extra spaces in the text
|
||||
ja_JP: テキスト内の余分なスペースを削除するかどうか
|
||||
pt_BR: Se deve remover espaços extras no texto
|
||||
zh_Hans: 是否移除文本中的多余空格
|
||||
label:
|
||||
en_US: Remove Extra Spaces
|
||||
ja_JP: 余分なスペースを削除
|
||||
pt_BR: Remover Espaços Extras
|
||||
zh_Hans: 移除多余空格
|
||||
llm_description: Whether to remove extra spaces in the text
|
||||
max: null
|
||||
min: null
|
||||
name: remove_extra_spaces
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: false
|
||||
scope: null
|
||||
template: null
|
||||
type: boolean
|
||||
- auto_generate: null
|
||||
default: 0
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: Whether to remove URLs and emails in the text
|
||||
ja_JP: テキスト内のURLやメールアドレスを削除するかどうか
|
||||
pt_BR: Se deve remover URLs e e-mails no texto
|
||||
zh_Hans: 是否移除文本中的URL和电子邮件地址
|
||||
label:
|
||||
en_US: Remove URLs and Emails
|
||||
ja_JP: URLとメールアドレスを削除
|
||||
pt_BR: Remover URLs e E-mails
|
||||
zh_Hans: 移除URL和电子邮件地址
|
||||
llm_description: Whether to remove URLs and emails in the text
|
||||
max: null
|
||||
min: null
|
||||
name: remove_urls_emails
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: false
|
||||
scope: null
|
||||
template: null
|
||||
type: boolean
|
||||
params:
|
||||
input_text: ''
|
||||
max_length: ''
|
||||
parent_mode: ''
|
||||
remove_extra_spaces: ''
|
||||
remove_urls_emails: ''
|
||||
separator: ''
|
||||
subchunk_max_length: ''
|
||||
subchunk_separator: ''
|
||||
provider_id: langgenius/parentchild_chunker/parentchild_chunker
|
||||
provider_name: langgenius/parentchild_chunker/parentchild_chunker
|
||||
provider_type: builtin
|
||||
selected: true
|
||||
title: Parent-child Chunker
|
||||
tool_configurations: {}
|
||||
tool_description: Parent-child Chunk Structure
|
||||
tool_label: Parent-child Chunker
|
||||
tool_name: parentchild_chunker
|
||||
tool_parameters:
|
||||
input_text:
|
||||
type: mixed
|
||||
value: '{{#1752489759475.content#}}'
|
||||
max_length:
|
||||
type: variable
|
||||
value:
|
||||
- rag
|
||||
- shared
|
||||
- max_chunk_length
|
||||
parent_mode:
|
||||
type: variable
|
||||
value:
|
||||
- rag
|
||||
- shared
|
||||
- parent_mode
|
||||
remove_extra_spaces:
|
||||
type: mixed
|
||||
value: '{{#rag.shared.replace_consecutive_spaces#}}'
|
||||
remove_urls_emails:
|
||||
type: mixed
|
||||
value: '{{#rag.shared.delete_urls_email#}}'
|
||||
separator:
|
||||
type: mixed
|
||||
value: '{{#rag.shared.delimiter#}}'
|
||||
subchunk_max_length:
|
||||
type: variable
|
||||
value:
|
||||
- rag
|
||||
- shared
|
||||
- child_max_chunk_length
|
||||
subchunk_separator:
|
||||
type: mixed
|
||||
value: '{{#rag.shared.child_delimiter#}}'
|
||||
type: tool
|
||||
height: 52
|
||||
id: '1752490343805'
|
||||
position:
|
||||
x: 1077.0240183162543
|
||||
y: 281.3910724383104
|
||||
positionAbsolute:
|
||||
x: 1077.0240183162543
|
||||
y: 281.3910724383104
|
||||
selected: true
|
||||
sourcePosition: right
|
||||
targetPosition: left
|
||||
type: custom
|
||||
width: 242
|
||||
viewport:
|
||||
x: -487.2912544090391
|
||||
y: -54.7029301848807
|
||||
zoom: 0.9994011715768695
|
||||
rag_pipeline_variables:
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: \n\n
label: Delimiter
max_length: 100
options: []
placeholder: null
required: true
tooltips: A delimiter is the character used to separate text. \n\n is recommended
for splitting the original document into large parent chunks. You can also use
special delimiters defined by yourself.
type: text-input
unit: null
variable: delimiter
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: 1024
label: Maximum chunk length
max_length: 48
options: []
placeholder: null
required: true
tooltips: null
type: number
unit: characters
variable: max_chunk_length
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: \n
label: Child delimiter
max_length: 199
options: []
placeholder: null
required: true
tooltips: A delimiter is the character used to separate text. \n\n is recommended
for splitting the original document into large parent chunks. You can also use
special delimiters defined by yourself.
type: text-input
unit: null
variable: child_delimiter
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: 512
label: Child max chunk length
max_length: 48
options: []
placeholder: null
required: true
tooltips: null
type: number
unit: characters
variable: child_max_chunk_length
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: paragraph
label: Parent mode
max_length: 48
options:
- full_doc
- paragraph
placeholder: null
required: true
tooltips: null
type: select
unit: null
variable: parent_mode
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Replace consecutive spaces, newlines and tabs
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: replace_consecutive_spaces
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Delete all URLs and email addresses
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: delete_urls_email
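The shared variables above (Delimiter \n\n, Maximum chunk length 1024, Child delimiter \n, Child max chunk length 512) are wired into the Parent-child Chunker's separator/max_length and subchunk_separator/subchunk_max_length parameters. The sketch below is illustrative only and is not the langgenius/parentchild_chunker implementation; it shows one plausible two-level split under those defaults (oversized pieces are kept whole rather than hard-split).

# Illustrative sketch only -- not the langgenius/parentchild_chunker plugin code.
def split_by_length(text: str, separator: str, max_length: int) -> list[str]:
    """Split on the separator, then greedily pack pieces up to max_length characters."""
    pieces = [p for p in text.split(separator) if p.strip()]
    chunks: list[str] = []
    current = ""
    for piece in pieces:
        candidate = f"{current}{separator}{piece}" if current else piece
        if len(candidate) <= max_length:
            current = candidate
        else:
            if current:
                chunks.append(current)
            current = piece  # an oversized piece is kept whole in this sketch
    if current:
        chunks.append(current)
    return chunks


def parent_child_chunks(text: str) -> list[dict]:
    # Defaults taken from the shared rag_pipeline_variables above.
    parents = split_by_length(text, separator="\n\n", max_length=1024)
    return [
        {"parent": parent, "children": split_by_length(parent, separator="\n", max_length=512)}
        for parent in parents
    ]

In paragraph mode each parent chunk is indexed through its child chunks and returned whole at retrieval time; in full_doc mode the entire document plays the parent role, which matches the parent_mode select options declared above.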
@@ -0,0 +1,674 @@
|
||||
dependencies:
|
||||
- current_identifier: null
|
||||
type: marketplace
|
||||
value:
|
||||
plugin_unique_identifier: langgenius/general_chunker:0.0.1@e3da408b7277866404c3f884d599261f9d0b9003ea4ef7eb3b64489bdf39d18b
|
||||
- current_identifier: null
|
||||
type: marketplace
|
||||
value:
|
||||
plugin_unique_identifier: langgenius/firecrawl_datasource:0.0.1@f7aed0a26df0e5f4b9555371b5c9fa6db3c7dcf6a46dd1583245697bd90a539a
|
||||
- current_identifier: null
|
||||
type: marketplace
|
||||
value:
|
||||
plugin_unique_identifier: langgenius/jina_datasource:0.0.1@cf23afb2c3eeccc5a187763a1947f583f0bb10aa56461e512ac4141bf930d608
|
||||
kind: rag_pipeline
|
||||
rag_pipeline:
|
||||
description: ''
|
||||
icon: 📙
|
||||
icon_background: ''
|
||||
icon_type: emoji
|
||||
name: website-crawl-general-economy
|
||||
version: 0.1.0
|
||||
workflow:
|
||||
conversation_variables: []
|
||||
environment_variables: []
|
||||
features: {}
|
||||
graph:
|
||||
edges:
|
||||
- data:
|
||||
isInIteration: false
|
||||
isInLoop: false
|
||||
sourceType: datasource
|
||||
targetType: variable-aggregator
|
||||
id: 1752491761974-source-1752565435219-target
|
||||
source: '1752491761974'
|
||||
sourceHandle: source
|
||||
target: '1752565435219'
|
||||
targetHandle: target
|
||||
type: custom
|
||||
zIndex: 0
|
||||
- data:
|
||||
isInLoop: false
|
||||
sourceType: datasource
|
||||
targetType: variable-aggregator
|
||||
id: 1752565402678-source-1752565435219-target
|
||||
source: '1752565402678'
|
||||
sourceHandle: source
|
||||
target: '1752565435219'
|
||||
targetHandle: target
|
||||
type: custom
|
||||
zIndex: 0
|
||||
- data:
|
||||
isInIteration: false
|
||||
isInLoop: false
|
||||
sourceType: variable-aggregator
|
||||
targetType: tool
|
||||
id: 1752565435219-source-1752569675978-target
|
||||
source: '1752565435219'
|
||||
sourceHandle: source
|
||||
target: '1752569675978'
|
||||
targetHandle: target
|
||||
type: custom
|
||||
zIndex: 0
|
||||
- data:
|
||||
isInLoop: false
|
||||
sourceType: tool
|
||||
targetType: knowledge-index
|
||||
id: 1752569675978-source-1752477924228-target
|
||||
source: '1752569675978'
|
||||
sourceHandle: source
|
||||
target: '1752477924228'
|
||||
targetHandle: target
|
||||
type: custom
|
||||
zIndex: 0
|
||||
nodes:
|
||||
- data:
|
||||
chunk_structure: text_model
|
||||
embedding_model: text-embedding-ada-002
|
||||
embedding_model_provider: langgenius/openai/openai
|
||||
index_chunk_variable_selector:
|
||||
- '1752569675978'
|
||||
- result
|
||||
indexing_technique: economy
|
||||
keyword_number: 10
|
||||
retrieval_model:
|
||||
score_threshold: 0.5
|
||||
score_threshold_enabled: false
|
||||
search_method: keyword_search
|
||||
top_k: 3
|
||||
vector_setting:
|
||||
embedding_model_name: text-embedding-ada-002
|
||||
embedding_provider_name: langgenius/openai/openai
|
||||
selected: true
|
||||
title: Knowledge Base
|
||||
type: knowledge-index
|
||||
height: 114
|
||||
id: '1752477924228'
|
||||
position:
|
||||
x: 2140.4053851189346
|
||||
y: 281.3910724383104
|
||||
positionAbsolute:
|
||||
x: 2140.4053851189346
|
||||
y: 281.3910724383104
|
||||
selected: true
|
||||
sourcePosition: right
|
||||
targetPosition: left
|
||||
type: custom
|
||||
width: 242
|
||||
- data:
|
||||
datasource_configurations: {}
|
||||
datasource_label: Jina Reader
|
||||
datasource_name: jina_reader
|
||||
datasource_parameters:
|
||||
crawl_sub_pages:
|
||||
type: mixed
|
||||
value: '{{#rag.1752491761974.jina_crawl_sub_pages#}}'
|
||||
limit:
|
||||
type: variable
|
||||
value:
|
||||
- rag
|
||||
- '1752491761974'
|
||||
- jina_limit
|
||||
url:
|
||||
type: mixed
|
||||
value: '{{#rag.1752491761974.jina_url#}}'
|
||||
use_sitemap:
|
||||
type: mixed
|
||||
value: '{{#rag.1752491761974.jina_use_sitemap#}}'
|
||||
plugin_id: langgenius/jina_datasource
|
||||
provider_name: jinareader
|
||||
provider_type: website_crawl
|
||||
selected: false
|
||||
title: Jina Reader
|
||||
type: datasource
|
||||
height: 52
|
||||
id: '1752491761974'
|
||||
position:
|
||||
x: 1067.7526055798794
|
||||
y: 281.3910724383104
|
||||
positionAbsolute:
|
||||
x: 1067.7526055798794
|
||||
y: 281.3910724383104
|
||||
selected: false
|
||||
sourcePosition: right
|
||||
targetPosition: left
|
||||
type: custom
|
||||
width: 242
|
||||
- data:
|
||||
datasource_configurations: {}
|
||||
datasource_label: Firecrawl
|
||||
datasource_name: crawl
|
||||
datasource_parameters:
|
||||
crawl_subpages:
|
||||
type: mixed
|
||||
value: '{{#rag.1752565402678.firecrawl_crawl_sub_pages#}}'
|
||||
exclude_paths:
|
||||
type: mixed
|
||||
value: '{{#rag.1752565402678.firecrawl_exclude_paths#}}'
|
||||
include_paths:
|
||||
type: mixed
|
||||
value: '{{#rag.1752565402678.firecrawl_include_only_paths#}}'
|
||||
limit:
|
||||
type: variable
|
||||
value:
|
||||
- rag
|
||||
- '1752565402678'
|
||||
- firecrawl_limit
|
||||
max_depth:
|
||||
type: variable
|
||||
value:
|
||||
- rag
|
||||
- '1752565402678'
|
||||
- firecrawl_max_depth
|
||||
only_main_content:
|
||||
type: mixed
|
||||
value: '{{#rag.1752565402678.firecrawl_extract_main_content#}}'
|
||||
url:
|
||||
type: mixed
|
||||
value: '{{#rag.1752565402678.firecrawl_url#}}'
|
||||
plugin_id: langgenius/firecrawl_datasource
|
||||
provider_name: firecrawl
|
||||
provider_type: website_crawl
|
||||
selected: false
|
||||
title: Firecrawl
|
||||
type: datasource
|
||||
height: 52
|
||||
id: '1752565402678'
|
||||
position:
|
||||
x: 1067.7526055798794
|
||||
y: 417.32608398342404
|
||||
positionAbsolute:
|
||||
x: 1067.7526055798794
|
||||
y: 417.32608398342404
|
||||
selected: false
|
||||
sourcePosition: right
|
||||
targetPosition: left
|
||||
type: custom
|
||||
width: 242
|
||||
- data:
|
||||
output_type: string
|
||||
selected: false
|
||||
title: Variable Aggregator
|
||||
type: variable-aggregator
|
||||
variables:
|
||||
- - '1752491761974'
|
||||
- content
|
||||
- - '1752565402678'
|
||||
- content
|
||||
height: 129
|
||||
id: '1752565435219'
|
||||
position:
|
||||
x: 1505.4306671642219
|
||||
y: 281.3910724383104
|
||||
positionAbsolute:
|
||||
x: 1505.4306671642219
|
||||
y: 281.3910724383104
|
||||
selected: false
|
||||
sourcePosition: right
|
||||
targetPosition: left
|
||||
type: custom
|
||||
width: 242
|
||||
- data:
|
||||
is_team_authorization: true
|
||||
output_schema:
|
||||
properties:
|
||||
result:
|
||||
description: The result of the general chunk tool.
|
||||
properties:
|
||||
general_chunks:
|
||||
items:
|
||||
description: The chunk of the text.
|
||||
type: string
|
||||
type: array
|
||||
type: object
|
||||
type: object
|
||||
paramSchemas:
|
||||
- auto_generate: null
|
||||
default: null
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: The text you want to chunk.
|
||||
ja_JP: チャンク化したいテキスト。
|
||||
pt_BR: O texto que você deseja dividir.
|
||||
zh_Hans: 你想要分块的文本。
|
||||
label:
|
||||
en_US: Input Variable
|
||||
ja_JP: 入力変数
|
||||
pt_BR: Variável de entrada
|
||||
zh_Hans: 输入变量
|
||||
llm_description: The text you want to chunk.
|
||||
max: null
|
||||
min: null
|
||||
name: input_variable
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: true
|
||||
scope: null
|
||||
template: null
|
||||
type: string
|
||||
- auto_generate: null
|
||||
default: null
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: The delimiter of the chunks.
|
||||
ja_JP: チャンクの区切り記号。
|
||||
pt_BR: O delimitador dos pedaços.
|
||||
zh_Hans: 块的分隔符。
|
||||
label:
|
||||
en_US: Delimiter
|
||||
ja_JP: 区切り記号
|
||||
pt_BR: Delimitador
|
||||
zh_Hans: 分隔符
|
||||
llm_description: The delimiter of the chunks, the format of the delimiter
|
||||
must be a string.
|
||||
max: null
|
||||
min: null
|
||||
name: delimiter
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: true
|
||||
scope: null
|
||||
template: null
|
||||
type: string
|
||||
- auto_generate: null
|
||||
default: null
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: The maximum chunk length.
|
||||
ja_JP: チャンクの最大長。
|
||||
pt_BR: O comprimento máximo do bloco
|
||||
zh_Hans: 最大块的长度。
|
||||
label:
|
||||
en_US: Maximum Chunk Length
|
||||
ja_JP: チャンク最大長
|
||||
pt_BR: O comprimento máximo do bloco
|
||||
zh_Hans: 最大块的长度
|
||||
llm_description: The maximum chunk length, the format of the chunk size
|
||||
must be an integer.
|
||||
max: null
|
||||
min: null
|
||||
name: max_chunk_length
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: true
|
||||
scope: null
|
||||
template: null
|
||||
type: number
|
||||
- auto_generate: null
|
||||
default: null
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: The chunk overlap length.
|
||||
ja_JP: チャンクの重複長
|
||||
pt_BR: The chunk overlap length.
|
||||
zh_Hans: 块的重叠长度。
|
||||
label:
|
||||
en_US: Chunk Overlap Length
|
||||
ja_JP: チャンク重複長
|
||||
pt_BR: Chunk Overlap Length
|
||||
zh_Hans: 块的重叠长度
|
||||
llm_description: The chunk overlap length, the format of the chunk overlap
|
||||
length must be an integer.
|
||||
max: null
|
||||
min: null
|
||||
name: chunk_overlap_length
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: false
|
||||
scope: null
|
||||
template: null
|
||||
type: number
|
||||
- auto_generate: null
|
||||
default: null
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: Replace consecutive spaces, newlines and tabs
|
||||
ja_JP: 連続のスペース、改行、またはタブを置換する
|
||||
pt_BR: Replace consecutive spaces, newlines and tabs
|
||||
zh_Hans: 替换连续的空格、换行符和制表符
|
||||
label:
|
||||
en_US: Replace Consecutive Spaces, Newlines and Tabs
|
||||
ja_JP: 連続のスペース、改行、またはタブを置換する
|
||||
pt_BR: Replace Consecutive Spaces, Newlines and Tabs
|
||||
zh_Hans: 替换连续的空格、换行符和制表符
|
||||
llm_description: Replace consecutive spaces, newlines and tabs, the format
|
||||
of the replace must be a boolean.
|
||||
max: null
|
||||
min: null
|
||||
name: replace_consecutive_spaces_newlines_tabs
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: false
|
||||
scope: null
|
||||
template: null
|
||||
type: boolean
|
||||
- auto_generate: null
|
||||
default: null
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: Delete all URLs and email addresses
|
||||
ja_JP: すべてのURLとメールアドレスを削除する
|
||||
pt_BR: Delete all URLs and email addresses
|
||||
zh_Hans: 删除所有URL和电子邮件地址
|
||||
label:
|
||||
en_US: Delete All URLs and Email Addresses
|
||||
ja_JP: すべてのURLとメールアドレスを削除する
|
||||
pt_BR: Delete All URLs and Email Addresses
|
||||
zh_Hans: 删除所有URL和电子邮件地址
|
||||
llm_description: Delete all URLs and email addresses, the format of the
|
||||
delete must be a boolean.
|
||||
max: null
|
||||
min: null
|
||||
name: delete_all_urls_and_email_addresses
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: false
|
||||
scope: null
|
||||
template: null
|
||||
type: boolean
|
||||
params:
|
||||
chunk_overlap_length: ''
|
||||
delete_all_urls_and_email_addresses: ''
|
||||
delimiter: ''
|
||||
input_variable: ''
|
||||
max_chunk_length: ''
|
||||
replace_consecutive_spaces_newlines_tabs: ''
|
||||
provider_id: langgenius/general_chunker/general_chunker
|
||||
provider_name: langgenius/general_chunker/general_chunker
|
||||
provider_type: builtin
|
||||
selected: false
|
||||
title: General Chunker
|
||||
tool_configurations: {}
|
||||
tool_description: A tool for general text chunking mode, the chunks retrieved and recalled are the same.
|
||||
tool_label: General Chunker
|
||||
tool_name: general_chunker
|
||||
tool_parameters:
|
||||
chunk_overlap_length:
|
||||
type: variable
|
||||
value:
|
||||
- rag
|
||||
- shared
|
||||
- chunk_overlap
|
||||
delete_all_urls_and_email_addresses:
|
||||
type: mixed
|
||||
value: '{{#rag.shared.delete_urls_email#}}'
|
||||
delimiter:
|
||||
type: mixed
|
||||
value: '{{#rag.shared.delimiter#}}'
|
||||
input_variable:
|
||||
type: mixed
|
||||
value: '{{#1752565435219.output#}}'
|
||||
max_chunk_length:
|
||||
type: variable
|
||||
value:
|
||||
- rag
|
||||
- shared
|
||||
- max_chunk_length
|
||||
replace_consecutive_spaces_newlines_tabs:
|
||||
type: mixed
|
||||
value: '{{#rag.shared.replace_consecutive_spaces#}}'
|
||||
type: tool
|
||||
height: 52
|
||||
id: '1752569675978'
|
||||
position:
|
||||
x: 1807.4306671642219
|
||||
y: 281.3910724383104
|
||||
positionAbsolute:
|
||||
x: 1807.4306671642219
|
||||
y: 281.3910724383104
|
||||
sourcePosition: right
|
||||
targetPosition: left
|
||||
type: custom
|
||||
width: 242
|
||||
viewport:
|
||||
x: -707.721097109337
|
||||
y: -93.07807382100896
|
||||
zoom: 0.9350632198875476
|
||||
rag_pipeline_variables:
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: '1752491761974'
|
||||
default_value: null
|
||||
label: URL
|
||||
max_length: 256
|
||||
options: []
|
||||
placeholder: https://docs.dify.ai/en/
|
||||
required: true
|
||||
tooltips: null
|
||||
type: text-input
|
||||
unit: null
|
||||
variable: jina_url
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: '1752491761974'
|
||||
default_value: 10
|
||||
label: Limit
|
||||
max_length: 48
|
||||
options: []
|
||||
placeholder: null
|
||||
required: true
|
||||
tooltips: null
|
||||
type: number
|
||||
unit: null
|
||||
variable: jina_limit
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: '1752491761974'
|
||||
default_value: null
|
||||
label: Crawl sub-pages
|
||||
max_length: 48
|
||||
options: []
|
||||
placeholder: null
|
||||
required: false
|
||||
tooltips: null
|
||||
type: checkbox
|
||||
unit: null
|
||||
variable: jina_crawl_sub_pages
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: '1752491761974'
|
||||
default_value: null
|
||||
label: Use sitemap
|
||||
max_length: 48
|
||||
options: []
|
||||
placeholder: null
|
||||
required: false
|
||||
tooltips: Follow the sitemap to crawl the site. If not, Jina Reader will crawl
|
||||
iteratively based on page relevance, yielding fewer but higher-quality pages.
|
||||
type: checkbox
|
||||
unit: null
|
||||
variable: jina_use_sitemap
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: '1752565402678'
|
||||
default_value: null
|
||||
label: URL
|
||||
max_length: 256
|
||||
options: []
|
||||
placeholder: https://docs.dify.ai/en/
|
||||
required: true
|
||||
tooltips: null
|
||||
type: text-input
|
||||
unit: null
|
||||
variable: firecrawl_url
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: '1752565402678'
|
||||
default_value: true
|
||||
label: Crawl sub-pages
|
||||
max_length: 48
|
||||
options: []
|
||||
placeholder: null
|
||||
required: false
|
||||
tooltips: null
|
||||
type: checkbox
|
||||
unit: null
|
||||
variable: firecrawl_crawl_sub_pages
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: '1752565402678'
|
||||
default_value: 10
|
||||
label: Limit
|
||||
max_length: 48
|
||||
options: []
|
||||
placeholder: null
|
||||
required: true
|
||||
tooltips: null
|
||||
type: number
|
||||
unit: null
|
||||
variable: firecrawl_limit
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: '1752565402678'
|
||||
default_value: null
|
||||
label: Max depth
|
||||
max_length: 48
|
||||
options: []
|
||||
placeholder: ''
|
||||
required: false
|
||||
tooltips: Maximum depth to crawl relative to the entered URL. Depth 0 just scrapes
the page of the entered URL, depth 1 scrapes the entered URL and everything one
path level below it, and so on.
type: number
unit: null
variable: firecrawl_max_depth
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: '1752565402678'
|
||||
default_value: null
|
||||
label: Exclude paths
|
||||
max_length: 256
|
||||
options: []
|
||||
placeholder: blog/*, /about/*
|
||||
required: false
|
||||
tooltips: null
|
||||
type: text-input
|
||||
unit: null
|
||||
variable: firecrawl_exclude_paths
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: '1752565402678'
|
||||
default_value: null
|
||||
label: Include only paths
|
||||
max_length: 256
|
||||
options: []
|
||||
placeholder: articles/*
|
||||
required: false
|
||||
tooltips: null
|
||||
type: text-input
|
||||
unit: null
|
||||
variable: firecrawl_include_only_paths
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: '1752565402678'
|
||||
default_value: null
|
||||
label: firecrawl_extract_main_content
|
||||
max_length: 48
|
||||
options: []
|
||||
placeholder: null
|
||||
required: false
|
||||
tooltips: null
|
||||
type: checkbox
|
||||
unit: null
|
||||
variable: firecrawl_extract_main_content
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: shared
|
||||
default_value: \n\n
|
||||
label: Delimiter
|
||||
max_length: 100
|
||||
options: []
|
||||
placeholder: null
|
||||
required: true
|
||||
tooltips: A delimiter is the character used to separate text. \n\n is recommended
|
||||
for splitting the original document into large parent chunks. You can also use
|
||||
special delimiters defined by yourself.
|
||||
type: text-input
|
||||
unit: null
|
||||
variable: delimiter
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: shared
|
||||
default_value: 1024
|
||||
label: Maximum chunk length
|
||||
max_length: 48
|
||||
options: []
|
||||
placeholder: null
|
||||
required: true
|
||||
tooltips: null
|
||||
type: number
|
||||
unit: characters
|
||||
variable: max_chunk_length
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: shared
|
||||
default_value: 50
|
||||
label: chunk_overlap
|
||||
max_length: 48
|
||||
options: []
|
||||
placeholder: null
|
||||
required: false
|
||||
tooltips: Setting a chunk overlap keeps adjacent chunks semantically related and
improves retrieval quality. A value of 10%–25% of the maximum chunk length is
recommended.
type: number
unit: characters
variable: chunk_overlap
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: shared
|
||||
default_value: null
|
||||
label: replace_consecutive_spaces
|
||||
max_length: 48
|
||||
options: []
|
||||
placeholder: null
|
||||
required: false
|
||||
tooltips: null
|
||||
type: checkbox
|
||||
unit: null
|
||||
variable: replace_consecutive_spaces
|
||||
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Delete all URLs and email addresses
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: delete_urls_email
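In this template the General Chunker receives the aggregated crawl text plus the shared delimiter, maximum chunk length, and chunk overlap variables. The sketch below is illustrative only (not the langgenius/general_chunker code); it shows how a simple character-based splitter could combine those three parameters, carrying an overlap tail from one chunk into the next.

# Illustrative sketch only -- not the langgenius/general_chunker plugin code.
def general_chunks(text: str, delimiter: str = "\n\n",
                   max_chunk_length: int = 1024, chunk_overlap: int = 50) -> list[str]:
    pieces = [p for p in text.split(delimiter) if p.strip()]
    chunks: list[str] = []
    current = ""
    for piece in pieces:
        candidate = f"{current}{delimiter}{piece}" if current else piece
        if len(candidate) <= max_chunk_length:
            current = candidate
            continue
        if current:
            chunks.append(current)
        # Carry the tail of the finished chunk forward to keep adjacent chunks related.
        tail = current[-chunk_overlap:] if (current and chunk_overlap) else ""
        current = f"{tail}{delimiter}{piece}" if tail else piece
    if current:
        chunks.append(current)
    return chunks

With the defaults declared above (1024-character chunks, 50-character overlap, blank-line delimiter) the same chunks are indexed and returned at retrieval time, which is what the tool description of this general text chunking mode states.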
@@ -0,0 +1,674 @@
|
||||
dependencies:
|
||||
- current_identifier: null
|
||||
type: marketplace
|
||||
value:
|
||||
plugin_unique_identifier: langgenius/general_chunker:0.0.1@e3da408b7277866404c3f884d599261f9d0b9003ea4ef7eb3b64489bdf39d18b
|
||||
- current_identifier: null
|
||||
type: marketplace
|
||||
value:
|
||||
plugin_unique_identifier: langgenius/firecrawl_datasource:0.0.1@f7aed0a26df0e5f4b9555371b5c9fa6db3c7dcf6a46dd1583245697bd90a539a
|
||||
- current_identifier: null
|
||||
type: marketplace
|
||||
value:
|
||||
plugin_unique_identifier: langgenius/jina_datasource:0.0.1@cf23afb2c3eeccc5a187763a1947f583f0bb10aa56461e512ac4141bf930d608
|
||||
kind: rag_pipeline
|
||||
rag_pipeline:
|
||||
description: ''
|
||||
icon: 📙
|
||||
icon_background: '#FFF4ED'
|
||||
icon_type: emoji
|
||||
name: website-crawl-general-high-quality
|
||||
version: 0.1.0
|
||||
workflow:
|
||||
conversation_variables: []
|
||||
environment_variables: []
|
||||
features: {}
|
||||
graph:
|
||||
edges:
|
||||
- data:
|
||||
isInIteration: false
|
||||
isInLoop: false
|
||||
sourceType: datasource
|
||||
targetType: variable-aggregator
|
||||
id: 1752491761974-source-1752565435219-target
|
||||
source: '1752491761974'
|
||||
sourceHandle: source
|
||||
target: '1752565435219'
|
||||
targetHandle: target
|
||||
type: custom
|
||||
zIndex: 0
|
||||
- data:
|
||||
isInLoop: false
|
||||
sourceType: datasource
|
||||
targetType: variable-aggregator
|
||||
id: 1752565402678-source-1752565435219-target
|
||||
source: '1752565402678'
|
||||
sourceHandle: source
|
||||
target: '1752565435219'
|
||||
targetHandle: target
|
||||
type: custom
|
||||
zIndex: 0
|
||||
- data:
|
||||
isInIteration: false
|
||||
isInLoop: false
|
||||
sourceType: variable-aggregator
|
||||
targetType: tool
|
||||
id: 1752565435219-source-1752569675978-target
|
||||
source: '1752565435219'
|
||||
sourceHandle: source
|
||||
target: '1752569675978'
|
||||
targetHandle: target
|
||||
type: custom
|
||||
zIndex: 0
|
||||
- data:
|
||||
isInLoop: false
|
||||
sourceType: tool
|
||||
targetType: knowledge-index
|
||||
id: 1752569675978-source-1752477924228-target
|
||||
source: '1752569675978'
|
||||
sourceHandle: source
|
||||
target: '1752477924228'
|
||||
targetHandle: target
|
||||
type: custom
|
||||
zIndex: 0
|
||||
nodes:
|
||||
- data:
|
||||
chunk_structure: text_model
|
||||
embedding_model: text-embedding-ada-002
|
||||
embedding_model_provider: langgenius/openai/openai
|
||||
index_chunk_variable_selector:
|
||||
- '1752569675978'
|
||||
- result
|
||||
indexing_technique: high_quality
|
||||
keyword_number: 10
|
||||
retrieval_model:
|
||||
score_threshold: 0.5
|
||||
score_threshold_enabled: false
|
||||
search_method: semantic_search
|
||||
top_k: 3
|
||||
vector_setting:
|
||||
embedding_model_name: text-embedding-ada-002
|
||||
embedding_provider_name: langgenius/openai/openai
|
||||
selected: false
|
||||
title: Knowledge Base
|
||||
type: knowledge-index
|
||||
height: 114
|
||||
id: '1752477924228'
|
||||
position:
|
||||
x: 2140.4053851189346
|
||||
y: 281.3910724383104
|
||||
positionAbsolute:
|
||||
x: 2140.4053851189346
|
||||
y: 281.3910724383104
|
||||
selected: true
|
||||
sourcePosition: right
|
||||
targetPosition: left
|
||||
type: custom
|
||||
width: 242
|
||||
- data:
|
||||
datasource_configurations: {}
|
||||
datasource_label: Jina Reader
|
||||
datasource_name: jina_reader
|
||||
datasource_parameters:
|
||||
crawl_sub_pages:
|
||||
type: mixed
|
||||
value: '{{#rag.1752491761974.jina_crawl_sub_pages#}}'
|
||||
limit:
|
||||
type: variable
|
||||
value:
|
||||
- rag
|
||||
- '1752491761974'
|
||||
- jina_limit
|
||||
url:
|
||||
type: mixed
|
||||
value: '{{#rag.1752491761974.jina_url#}}'
|
||||
use_sitemap:
|
||||
type: mixed
|
||||
value: '{{#rag.1752491761974.jina_use_sitemap#}}'
|
||||
plugin_id: langgenius/jina_datasource
|
||||
provider_name: jinareader
|
||||
provider_type: website_crawl
|
||||
selected: false
|
||||
title: Jina Reader
|
||||
type: datasource
|
||||
height: 52
|
||||
id: '1752491761974'
|
||||
position:
|
||||
x: 1067.7526055798794
|
||||
y: 281.3910724383104
|
||||
positionAbsolute:
|
||||
x: 1067.7526055798794
|
||||
y: 281.3910724383104
|
||||
selected: false
|
||||
sourcePosition: right
|
||||
targetPosition: left
|
||||
type: custom
|
||||
width: 242
|
||||
- data:
|
||||
datasource_configurations: {}
|
||||
datasource_label: Firecrawl
|
||||
datasource_name: crawl
|
||||
datasource_parameters:
|
||||
crawl_subpages:
|
||||
type: mixed
|
||||
value: '{{#rag.1752565402678.firecrawl_crawl_sub_pages#}}'
|
||||
exclude_paths:
|
||||
type: mixed
|
||||
value: '{{#rag.1752565402678.firecrawl_exclude_paths#}}'
|
||||
include_paths:
|
||||
type: mixed
|
||||
value: '{{#rag.1752565402678.firecrawl_include_only_paths#}}'
|
||||
limit:
|
||||
type: variable
|
||||
value:
|
||||
- rag
|
||||
- '1752565402678'
|
||||
- firecrawl_limit
|
||||
max_depth:
|
||||
type: variable
|
||||
value:
|
||||
- rag
|
||||
- '1752565402678'
|
||||
- firecrawl_max_depth
|
||||
only_main_content:
|
||||
type: mixed
|
||||
value: '{{#rag.1752565402678.firecrawl_extract_main_content#}}'
|
||||
url:
|
||||
type: mixed
|
||||
value: '{{#rag.1752565402678.firecrawl_url#}}'
|
||||
plugin_id: langgenius/firecrawl_datasource
|
||||
provider_name: firecrawl
|
||||
provider_type: website_crawl
|
||||
selected: false
|
||||
title: Firecrawl
|
||||
type: datasource
|
||||
height: 52
|
||||
id: '1752565402678'
|
||||
position:
|
||||
x: 1067.7526055798794
|
||||
y: 417.32608398342404
|
||||
positionAbsolute:
|
||||
x: 1067.7526055798794
|
||||
y: 417.32608398342404
|
||||
selected: false
|
||||
sourcePosition: right
|
||||
targetPosition: left
|
||||
type: custom
|
||||
width: 242
|
||||
- data:
|
||||
output_type: string
|
||||
selected: false
|
||||
title: Variable Aggregator
|
||||
type: variable-aggregator
|
||||
variables:
|
||||
- - '1752491761974'
|
||||
- content
|
||||
- - '1752565402678'
|
||||
- content
|
||||
height: 129
|
||||
id: '1752565435219'
|
||||
position:
|
||||
x: 1505.4306671642219
|
||||
y: 281.3910724383104
|
||||
positionAbsolute:
|
||||
x: 1505.4306671642219
|
||||
y: 281.3910724383104
|
||||
selected: false
|
||||
sourcePosition: right
|
||||
targetPosition: left
|
||||
type: custom
|
||||
width: 242
|
||||
- data:
|
||||
is_team_authorization: true
|
||||
output_schema:
|
||||
properties:
|
||||
result:
|
||||
description: The result of the general chunk tool.
|
||||
properties:
|
||||
general_chunks:
|
||||
items:
|
||||
description: The chunk of the text.
|
||||
type: string
|
||||
type: array
|
||||
type: object
|
||||
type: object
|
||||
paramSchemas:
|
||||
- auto_generate: null
|
||||
default: null
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: The text you want to chunk.
|
||||
ja_JP: チャンク化したいテキスト。
|
||||
pt_BR: O texto que você deseja dividir.
|
||||
zh_Hans: 你想要分块的文本。
|
||||
label:
|
||||
en_US: Input Variable
|
||||
ja_JP: 入力変数
|
||||
pt_BR: Variável de entrada
|
||||
zh_Hans: 输入变量
|
||||
llm_description: The text you want to chunk.
|
||||
max: null
|
||||
min: null
|
||||
name: input_variable
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: true
|
||||
scope: null
|
||||
template: null
|
||||
type: string
|
||||
- auto_generate: null
|
||||
default: null
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: The delimiter of the chunks.
|
||||
ja_JP: チャンクの区切り記号。
|
||||
pt_BR: O delimitador dos pedaços.
|
||||
zh_Hans: 块的分隔符。
|
||||
label:
|
||||
en_US: Delimiter
|
||||
ja_JP: 区切り記号
|
||||
pt_BR: Delimitador
|
||||
zh_Hans: 分隔符
|
||||
llm_description: The delimiter of the chunks, the format of the delimiter
|
||||
must be a string.
|
||||
max: null
|
||||
min: null
|
||||
name: delimiter
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: true
|
||||
scope: null
|
||||
template: null
|
||||
type: string
|
||||
- auto_generate: null
|
||||
default: null
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: The maximum chunk length.
|
||||
ja_JP: チャンクの最大長。
|
||||
pt_BR: O comprimento máximo do bloco
|
||||
zh_Hans: 最大块的长度。
|
||||
label:
|
||||
en_US: Maximum Chunk Length
|
||||
ja_JP: チャンク最大長
|
||||
pt_BR: O comprimento máximo do bloco
|
||||
zh_Hans: 最大块的长度
|
||||
llm_description: The maximum chunk length, the format of the chunk size
|
||||
must be an integer.
|
||||
max: null
|
||||
min: null
|
||||
name: max_chunk_length
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: true
|
||||
scope: null
|
||||
template: null
|
||||
type: number
|
||||
- auto_generate: null
|
||||
default: null
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: The chunk overlap length.
|
||||
ja_JP: チャンクの重複長。
|
||||
pt_BR: The chunk overlap length.
|
||||
zh_Hans: 块的重叠长度。
|
||||
label:
|
||||
en_US: Chunk Overlap Length
|
||||
ja_JP: チャンク重複長
|
||||
pt_BR: Chunk Overlap Length
|
||||
zh_Hans: 块的重叠长度
|
||||
llm_description: The chunk overlap length, the format of the chunk overlap
|
||||
length must be an integer.
|
||||
max: null
|
||||
min: null
|
||||
name: chunk_overlap_length
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: false
|
||||
scope: null
|
||||
template: null
|
||||
type: number
|
||||
- auto_generate: null
|
||||
default: null
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: Replace consecutive spaces, newlines and tabs
|
||||
ja_JP: 連続のスペース、改行、またはタブを置換する
|
||||
pt_BR: Replace consecutive spaces, newlines and tabs
|
||||
zh_Hans: 替换连续的空格、换行符和制表符
|
||||
label:
|
||||
en_US: Replace Consecutive Spaces, Newlines and Tabs
|
||||
ja_JP: 連続のスペース、改行、またはタブを置換する
|
||||
pt_BR: Replace Consecutive Spaces, Newlines and Tabs
|
||||
zh_Hans: 替换连续的空格、换行符和制表符
|
||||
llm_description: Replace consecutive spaces, newlines and tabs, the format
|
||||
of the replace must be a boolean.
|
||||
max: null
|
||||
min: null
|
||||
name: replace_consecutive_spaces_newlines_tabs
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: false
|
||||
scope: null
|
||||
template: null
|
||||
type: boolean
|
||||
- auto_generate: null
|
||||
default: null
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: Delete all URLs and email addresses
|
||||
ja_JP: すべてのURLとメールアドレスを削除する
|
||||
pt_BR: Delete all URLs and email addresses
|
||||
zh_Hans: 删除所有URL和电子邮件地址
|
||||
label:
|
||||
en_US: Delete All URLs and Email Addresses
|
||||
ja_JP: すべてのURLとメールアドレスを削除する
|
||||
pt_BR: Delete All URLs and Email Addresses
|
||||
zh_Hans: 删除所有URL和电子邮件地址
|
||||
llm_description: Delete all URLs and email addresses, the format of the
|
||||
delete must be a boolean.
|
||||
max: null
|
||||
min: null
|
||||
name: delete_all_urls_and_email_addresses
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: false
|
||||
scope: null
|
||||
template: null
|
||||
type: boolean
|
||||
params:
|
||||
chunk_overlap_length: ''
|
||||
delete_all_urls_and_email_addresses: ''
|
||||
delimiter: ''
|
||||
input_variable: ''
|
||||
max_chunk_length: ''
|
||||
replace_consecutive_spaces_newlines_tabs: ''
|
||||
provider_id: langgenius/general_chunker/general_chunker
|
||||
provider_name: langgenius/general_chunker/general_chunker
|
||||
provider_type: builtin
|
||||
selected: false
|
||||
title: General Chunker
|
||||
tool_configurations: {}
|
||||
tool_description: A tool for general text chunking mode, the chunks retrieved and recalled are the same.
|
||||
tool_label: General Chunker
|
||||
tool_name: general_chunker
|
||||
tool_parameters:
|
||||
chunk_overlap_length:
|
||||
type: variable
|
||||
value:
|
||||
- rag
|
||||
- shared
|
||||
- chunk_overlap
|
||||
delete_all_urls_and_email_addresses:
|
||||
type: mixed
|
||||
value: '{{#rag.shared.delete_urls_email#}}'
|
||||
delimiter:
|
||||
type: mixed
|
||||
value: '{{#rag.shared.delimiter#}}'
|
||||
input_variable:
|
||||
type: mixed
|
||||
value: '{{#1752565435219.output#}}'
|
||||
max_chunk_length:
|
||||
type: variable
|
||||
value:
|
||||
- rag
|
||||
- shared
|
||||
- max_chunk_length
|
||||
replace_consecutive_spaces_newlines_tabs:
|
||||
type: mixed
|
||||
value: '{{#rag.shared.replace_consecutive_spaces#}}'
|
||||
type: tool
|
||||
height: 52
|
||||
id: '1752569675978'
|
||||
position:
|
||||
x: 1807.4306671642219
|
||||
y: 281.3910724383104
|
||||
positionAbsolute:
|
||||
x: 1807.4306671642219
|
||||
y: 281.3910724383104
|
||||
sourcePosition: right
|
||||
targetPosition: left
|
||||
type: custom
|
||||
width: 242
|
||||
viewport:
|
||||
x: -707.721097109337
|
||||
y: -93.07807382100896
|
||||
zoom: 0.9350632198875476
|
||||
rag_pipeline_variables:
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: '1752491761974'
|
||||
default_value: null
|
||||
label: URL
|
||||
max_length: 256
|
||||
options: []
|
||||
placeholder: https://docs.dify.ai/en/
|
||||
required: true
|
||||
tooltips: null
|
||||
type: text-input
|
||||
unit: null
|
||||
variable: jina_url
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: '1752491761974'
|
||||
default_value: 10
|
||||
label: Limit
|
||||
max_length: 48
|
||||
options: []
|
||||
placeholder: null
|
||||
required: true
|
||||
tooltips: null
|
||||
type: number
|
||||
unit: null
|
||||
variable: jina_limit
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: '1752491761974'
|
||||
default_value: null
|
||||
label: Crawl sub-pages
|
||||
max_length: 48
|
||||
options: []
|
||||
placeholder: null
|
||||
required: false
|
||||
tooltips: null
|
||||
type: checkbox
|
||||
unit: null
|
||||
variable: jina_crawl_sub_pages
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: '1752491761974'
|
||||
default_value: null
|
||||
label: Use sitemap
|
||||
max_length: 48
|
||||
options: []
|
||||
placeholder: null
|
||||
required: false
|
||||
tooltips: Follow the sitemap to crawl the site. If not, Jina Reader will crawl
|
||||
iteratively based on page relevance, yielding fewer but higher-quality pages.
|
||||
type: checkbox
|
||||
unit: null
|
||||
variable: jina_use_sitemap
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: '1752565402678'
|
||||
default_value: null
|
||||
label: URL
|
||||
max_length: 256
|
||||
options: []
|
||||
placeholder: https://docs.dify.ai/en/
|
||||
required: true
|
||||
tooltips: null
|
||||
type: text-input
|
||||
unit: null
|
||||
variable: firecrawl_url
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: '1752565402678'
|
||||
default_value: true
|
||||
label: Crawl sub-pages
|
||||
max_length: 48
|
||||
options: []
|
||||
placeholder: null
|
||||
required: false
|
||||
tooltips: null
|
||||
type: checkbox
|
||||
unit: null
|
||||
variable: firecrawl_crawl_sub_pages
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: '1752565402678'
|
||||
default_value: 10
|
||||
label: Limit
|
||||
max_length: 48
|
||||
options: []
|
||||
placeholder: null
|
||||
required: true
|
||||
tooltips: null
|
||||
type: number
|
||||
unit: null
|
||||
variable: firecrawl_limit
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: '1752565402678'
|
||||
default_value: null
|
||||
label: Max depth
|
||||
max_length: 48
|
||||
options: []
|
||||
placeholder: ''
|
||||
required: false
|
||||
tooltips: Maximum depth to crawl relative to the entered URL. Depth 0 just scrapes
the page of the entered URL, depth 1 scrapes the entered URL and everything one
path level below it, and so on.
type: number
unit: null
variable: firecrawl_max_depth
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: '1752565402678'
|
||||
default_value: null
|
||||
label: Exclude paths
|
||||
max_length: 256
|
||||
options: []
|
||||
placeholder: blog/*, /about/*
|
||||
required: false
|
||||
tooltips: null
|
||||
type: text-input
|
||||
unit: null
|
||||
variable: firecrawl_exclude_paths
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: '1752565402678'
|
||||
default_value: null
|
||||
label: Include only paths
|
||||
max_length: 256
|
||||
options: []
|
||||
placeholder: articles/*
|
||||
required: false
|
||||
tooltips: null
|
||||
type: text-input
|
||||
unit: null
|
||||
variable: firecrawl_include_only_paths
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: '1752565402678'
|
||||
default_value: null
|
||||
label: firecrawl_extract_main_content
|
||||
max_length: 48
|
||||
options: []
|
||||
placeholder: null
|
||||
required: false
|
||||
tooltips: null
|
||||
type: checkbox
|
||||
unit: null
|
||||
variable: firecrawl_extract_main_content
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: shared
|
||||
default_value: \n\n
|
||||
label: Delimiter
|
||||
max_length: 100
|
||||
options: []
|
||||
placeholder: null
|
||||
required: true
|
||||
tooltips: A delimiter is the character used to separate text. \n\n is recommended
|
||||
for splitting the original document into large parent chunks. You can also use
|
||||
special delimiters defined by yourself.
|
||||
type: text-input
|
||||
unit: null
|
||||
variable: delimiter
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: shared
|
||||
default_value: 1024
|
||||
label: Maximum chunk length
|
||||
max_length: 48
|
||||
options: []
|
||||
placeholder: null
|
||||
required: true
|
||||
tooltips: null
|
||||
type: number
|
||||
unit: characters
|
||||
variable: max_chunk_length
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: shared
|
||||
default_value: 50
|
||||
label: chunk_overlap
|
||||
max_length: 48
|
||||
options: []
|
||||
placeholder: null
|
||||
required: false
|
||||
tooltips: Setting a chunk overlap keeps adjacent chunks semantically related and
improves retrieval quality. A value of 10%–25% of the maximum chunk length is
recommended.
type: number
unit: characters
variable: chunk_overlap
|
||||
- allow_file_extension: null
|
||||
allow_file_upload_methods: null
|
||||
allowed_file_types: null
|
||||
belong_to_node_id: shared
|
||||
default_value: null
|
||||
label: replace_consecutive_spaces
|
||||
max_length: 48
|
||||
options: []
|
||||
placeholder: null
|
||||
required: false
|
||||
tooltips: null
|
||||
type: checkbox
|
||||
unit: null
|
||||
variable: replace_consecutive_spaces
|
||||
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Delete all URLs and email addresses
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: delete_urls_email
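Across these templates, graph nodes reference pipeline inputs either as {{#rag.<node_id>.<variable>#}} / {{#rag.shared.<variable>#}} template strings or as [rag, <node_id>, <variable>] selectors, and each referenced variable is declared in rag_pipeline_variables with a matching belong_to_node_id. The sketch below is a minimal consistency check under stated assumptions: PyYAML is installed, rag_pipeline_variables sits under workflow as in the listing above, the file name is hypothetical, and only the templated {{#...#}} form is checked.

# Minimal sketch: cross-check {{#rag.*#}} references against rag_pipeline_variables.
# Assumes PyYAML; "website-crawl-general-high-quality.yml" is a hypothetical file name.
import re
import yaml

REF = re.compile(r"\{\{#rag\.([^.#]+)\.([^#}]+)#\}\}")

with open("website-crawl-general-high-quality.yml", encoding="utf-8") as f:
    template = yaml.safe_load(f)

workflow = template["workflow"]
declared = {
    (v["belong_to_node_id"], v["variable"])
    for v in workflow["rag_pipeline_variables"]
}
# Dump the graph back to text and scan it for templated references.
used = set(REF.findall(yaml.dump(workflow["graph"])))

missing = used - declared
print("undeclared references:", sorted(missing) if missing else "none")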
@@ -0,0 +1,779 @@
|
||||
dependencies:
|
||||
- current_identifier: null
|
||||
type: marketplace
|
||||
value:
|
||||
plugin_unique_identifier: langgenius/parentchild_chunker:0.0.1@b1a28a27e33fec442ce494da2a7814edd7eb9d646c81f38bccfcf1133d486e40
|
||||
- current_identifier: null
|
||||
type: marketplace
|
||||
value:
|
||||
plugin_unique_identifier: langgenius/firecrawl_datasource:0.0.1@f7aed0a26df0e5f4b9555371b5c9fa6db3c7dcf6a46dd1583245697bd90a539a
|
||||
- current_identifier: null
|
||||
type: marketplace
|
||||
value:
|
||||
plugin_unique_identifier: langgenius/jina_datasource:0.0.1@cf23afb2c3eeccc5a187763a1947f583f0bb10aa56461e512ac4141bf930d608
|
||||
kind: rag_pipeline
|
||||
rag_pipeline:
|
||||
description: ''
|
||||
icon: 📙
|
||||
icon_background: ''
|
||||
icon_type: emoji
|
||||
name: website-crawl-parentchild
|
||||
version: 0.1.0
|
||||
workflow:
|
||||
conversation_variables: []
|
||||
environment_variables: []
|
||||
features: {}
|
||||
graph:
|
||||
edges:
|
||||
- data:
|
||||
isInLoop: false
|
||||
sourceType: tool
|
||||
targetType: knowledge-index
|
||||
id: 1752490343805-source-1752477924228-target
|
||||
source: '1752490343805'
|
||||
sourceHandle: source
|
||||
target: '1752477924228'
|
||||
targetHandle: target
|
||||
type: custom
|
||||
zIndex: 0
|
||||
- data:
|
||||
isInIteration: false
|
||||
isInLoop: false
|
||||
sourceType: datasource
|
||||
targetType: variable-aggregator
|
||||
id: 1752491761974-source-1752565435219-target
|
||||
source: '1752491761974'
|
||||
sourceHandle: source
|
||||
target: '1752565435219'
|
||||
targetHandle: target
|
||||
type: custom
|
||||
zIndex: 0
|
||||
- data:
|
||||
isInIteration: false
|
||||
isInLoop: false
|
||||
sourceType: variable-aggregator
|
||||
targetType: tool
|
||||
id: 1752565435219-source-1752490343805-target
|
||||
source: '1752565435219'
|
||||
sourceHandle: source
|
||||
target: '1752490343805'
|
||||
targetHandle: target
|
||||
type: custom
|
||||
zIndex: 0
|
||||
- data:
|
||||
isInLoop: false
|
||||
sourceType: datasource
|
||||
targetType: variable-aggregator
|
||||
id: 1752565402678-source-1752565435219-target
|
||||
source: '1752565402678'
|
||||
sourceHandle: source
|
||||
target: '1752565435219'
|
||||
targetHandle: target
|
||||
type: custom
|
||||
zIndex: 0
|
||||
nodes:
|
||||
- data:
|
||||
chunk_structure: hierarchical_model
|
||||
embedding_model: text-embedding-ada-002
|
||||
embedding_model_provider: langgenius/openai/openai
|
||||
index_chunk_variable_selector:
|
||||
- '1752490343805'
|
||||
- result
|
||||
indexing_technique: high_quality
|
||||
keyword_number: 10
|
||||
retrieval_model:
|
||||
score_threshold: 0.5
|
||||
score_threshold_enabled: false
|
||||
search_method: semantic_search
|
||||
top_k: 3
|
||||
vector_setting:
|
||||
embedding_model_name: text-embedding-ada-002
|
||||
embedding_provider_name: langgenius/openai/openai
|
||||
selected: false
|
||||
title: Knowledge Base
|
||||
type: knowledge-index
|
||||
height: 114
|
||||
id: '1752477924228'
|
||||
position:
|
||||
x: 2215.5544306817387
|
||||
y: 281.3910724383104
|
||||
positionAbsolute:
|
||||
x: 2215.5544306817387
|
||||
y: 281.3910724383104
|
||||
selected: false
|
||||
sourcePosition: right
|
||||
targetPosition: left
|
||||
type: custom
|
||||
width: 242
|
||||
- data:
|
||||
is_team_authorization: true
|
||||
output_schema:
|
||||
properties:
|
||||
result:
|
||||
description: Parent child chunks result
|
||||
items:
|
||||
type: object
|
||||
type: array
|
||||
type: object
|
||||
paramSchemas:
|
||||
- auto_generate: null
|
||||
default: null
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: The text you want to chunk.
|
||||
ja_JP: チャンク化したいテキスト。
|
||||
pt_BR: O texto que você deseja dividir.
|
||||
zh_Hans: 你想要分块的文本。
|
||||
label:
|
||||
en_US: Input text
|
||||
ja_JP: 入力テキスト
|
||||
pt_BR: Texto de entrada
|
||||
zh_Hans: 输入文本
|
||||
llm_description: The text you want to chunk.
|
||||
max: null
|
||||
min: null
|
||||
name: input_text
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: true
|
||||
scope: null
|
||||
template: null
|
||||
type: string
|
||||
- auto_generate: null
|
||||
default: 1024
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: Maximum length for chunking
|
||||
ja_JP: チャンク分割の最大長
|
||||
pt_BR: Comprimento máximo para divisão
|
||||
zh_Hans: 用于分块的最大长度
|
||||
label:
|
||||
en_US: Maximum Length
|
||||
ja_JP: 最大長
|
||||
pt_BR: Comprimento Máximo
|
||||
zh_Hans: 最大长度
|
||||
llm_description: Maximum length allowed per chunk
|
||||
max: null
|
||||
min: null
|
||||
name: max_length
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: false
|
||||
scope: null
|
||||
template: null
|
||||
type: number
- auto_generate: null
default: "\n\n"
form: llm
human_description:
en_US: Separator used for chunking
ja_JP: チャンク分割に使用する区切り文字
pt_BR: Separador usado para divisão
zh_Hans: 用于分块的分隔符
label:
en_US: Chunk Separator
ja_JP: チャンク区切り文字
pt_BR: Separador de Divisão
zh_Hans: 分块分隔符
llm_description: The separator used to split chunks
max: null
min: null
name: separator
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: string
|
||||
- auto_generate: null
|
||||
default: 512
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: Maximum length for subchunking
|
||||
ja_JP: サブチャンク分割の最大長
|
||||
pt_BR: Comprimento máximo para subdivisão
|
||||
zh_Hans: 用于子分块的最大长度
|
||||
label:
|
||||
en_US: Subchunk Maximum Length
|
||||
ja_JP: サブチャンク最大長
|
||||
pt_BR: Comprimento Máximo de Subdivisão
|
||||
zh_Hans: 子分块最大长度
|
||||
llm_description: Maximum length allowed per subchunk
|
||||
max: null
|
||||
min: null
|
||||
name: subchunk_max_length
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: false
|
||||
scope: null
|
||||
template: null
|
||||
type: number
|
||||
- auto_generate: null
|
||||
default: '. '
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: Separator used for subchunking
|
||||
ja_JP: サブチャンク分割に使用する区切り文字
|
||||
pt_BR: Separador usado para subdivisão
|
||||
zh_Hans: 用于子分块的分隔符
|
||||
label:
|
||||
en_US: Subchunk Separator
|
||||
ja_JP: サブチャンキング用セパレーター
|
||||
pt_BR: Separador de Subdivisão
|
||||
zh_Hans: 子分块分隔符
|
||||
llm_description: The separator used to split subchunks
|
||||
max: null
|
||||
min: null
|
||||
name: subchunk_separator
|
||||
options: []
|
||||
placeholder: null
|
||||
precision: null
|
||||
required: false
|
||||
scope: null
|
||||
template: null
|
||||
type: string
|
||||
- auto_generate: null
|
||||
default: paragraph
|
||||
form: llm
|
||||
human_description:
|
||||
en_US: Split text into paragraphs based on separator and maximum chunk
|
||||
length, using split text as parent block or entire document as parent
|
||||
block and directly retrieve.
|
||||
ja_JP: セパレーターと最大チャンク長に基づいてテキストを段落に分割し、分割されたテキスト
|
||||
を親ブロックとして使用するか、文書全体を親ブロックとして使用して直接取得します。
|
||||
pt_BR: Dividir texto em parágrafos com base no separador e no comprimento
|
||||
máximo do bloco, usando o texto dividido como bloco pai ou documento
|
||||
completo como bloco pai e diretamente recuperá-lo.
|
||||
zh_Hans: 根据分隔符和最大块长度将文本拆分为段落,使用拆分文本作为检索的父块或整个文档用作父块并直接检索。
|
||||
label:
|
||||
en_US: Parent Mode
|
||||
ja_JP: 親子モード
|
||||
pt_BR: Modo Pai
|
||||
zh_Hans: 父块模式
|
||||
          llm_description: Split text into paragraphs based on separator and maximum
            chunk length, using split text as parent block or entire document as parent
            block and directly retrieve.
          max: null
          min: null
          name: parent_mode
          options:
          - icon: ''
            label:
              en_US: Paragraph
              ja_JP: 段落
              pt_BR: Parágrafo
              zh_Hans: 段落
            value: paragraph
          - icon: ''
            label:
              en_US: Full Document
              ja_JP: 全文
              pt_BR: Documento Completo
              zh_Hans: 全文
            value: full_doc
          placeholder: null
          precision: null
          required: true
          scope: null
          template: null
          type: select
        - auto_generate: null
          default: 0
          form: llm
          human_description:
            en_US: Whether to remove extra spaces in the text
            ja_JP: テキスト内の余分なスペースを削除するかどうか
            pt_BR: Se deve remover espaços extras no texto
            zh_Hans: 是否移除文本中的多余空格
          label:
            en_US: Remove Extra Spaces
            ja_JP: 余分なスペースを削除
            pt_BR: Remover Espaços Extras
            zh_Hans: 移除多余空格
          llm_description: Whether to remove extra spaces in the text
          max: null
          min: null
          name: remove_extra_spaces
          options: []
          placeholder: null
          precision: null
          required: false
          scope: null
          template: null
          type: boolean
        - auto_generate: null
          default: 0
          form: llm
          human_description:
            en_US: Whether to remove URLs and emails in the text
            ja_JP: テキスト内のURLやメールアドレスを削除するかどうか
            pt_BR: Se deve remover URLs e e-mails no texto
            zh_Hans: 是否移除文本中的URL和电子邮件地址
          label:
            en_US: Remove URLs and Emails
            ja_JP: URLとメールアドレスを削除
            pt_BR: Remover URLs e E-mails
            zh_Hans: 移除URL和电子邮件地址
          llm_description: Whether to remove URLs and emails in the text
          max: null
          min: null
          name: remove_urls_emails
          options: []
          placeholder: null
          precision: null
          required: false
          scope: null
          template: null
          type: boolean
        params:
          input_text: ''
          max_length: ''
          parent_mode: ''
          remove_extra_spaces: ''
          remove_urls_emails: ''
          separator: ''
          subchunk_max_length: ''
          subchunk_separator: ''
        provider_id: langgenius/parentchild_chunker/parentchild_chunker
        provider_name: langgenius/parentchild_chunker/parentchild_chunker
        provider_type: builtin
        selected: true
        title: Parent-child Chunker
        tool_configurations: {}
        tool_description: Parent-child Chunk Structure
        tool_label: Parent-child Chunker
        tool_name: parentchild_chunker
        tool_parameters:
          input_text:
            type: mixed
            value: '{{#1752565435219.output#}}'
          max_length:
            type: variable
            value:
            - rag
            - shared
            - max_chunk_length
          parent_mode:
            type: variable
            value:
            - rag
            - shared
            - parent_mode
          remove_extra_spaces:
            type: mixed
            value: '{{#rag.shared.replace_consecutive_spaces#}}'
          remove_urls_emails:
            type: mixed
            value: '{{#rag.shared.delete_urls_email#}}'
          separator:
            type: mixed
            value: '{{#rag.shared.delimiter#}}'
          subchunk_max_length:
            type: variable
            value:
            - rag
            - shared
            - child_max_chunk_length
          subchunk_separator:
            type: mixed
            value: '{{#rag.shared.child_delimiter#}}'
        type: tool
      height: 52
      id: '1752490343805'
      position:
        x: 1853.5260563244174
        y: 281.3910724383104
      positionAbsolute:
        x: 1853.5260563244174
        y: 281.3910724383104
      selected: true
      sourcePosition: right
      targetPosition: left
      type: custom
      width: 242
    - data:
        datasource_configurations: {}
        datasource_label: Jina Reader
        datasource_name: jina_reader
        datasource_parameters:
          crawl_sub_pages:
            type: mixed
            value: '{{#rag.1752491761974.jina_crawl_sub_pages#}}'
          limit:
            type: variable
            value:
            - rag
            - '1752491761974'
            - jina_limit
          url:
            type: mixed
            value: '{{#rag.1752491761974.jina_url#}}'
          use_sitemap:
            type: mixed
            value: '{{#rag.1752491761974.jina_use_sitemap#}}'
        plugin_id: langgenius/jina_datasource
        provider_name: jinareader
        provider_type: website_crawl
        selected: false
        title: Jina Reader
        type: datasource
      height: 52
      id: '1752491761974'
      position:
        x: 1067.7526055798794
        y: 281.3910724383104
      positionAbsolute:
        x: 1067.7526055798794
        y: 281.3910724383104
      selected: false
      sourcePosition: right
      targetPosition: left
      type: custom
      width: 242
    - data:
        datasource_configurations: {}
        datasource_label: Firecrawl
        datasource_name: crawl
        datasource_parameters:
          crawl_subpages:
            type: mixed
            value: '{{#rag.1752565402678.firecrawl_crawl_sub_pages#}}'
          exclude_paths:
            type: mixed
            value: '{{#rag.1752565402678.firecrawl_exclude_paths#}}'
          include_paths:
            type: mixed
            value: '{{#rag.1752565402678.firecrawl_include_only_paths#}}'
          limit:
            type: variable
            value:
            - rag
            - '1752565402678'
            - firecrawl_limit
          max_depth:
            type: variable
            value:
            - rag
            - '1752565402678'
            - firecrawl_max_depth
          only_main_content:
            type: mixed
            value: '{{#rag.1752565402678.firecrawl_extract_main_content#}}'
          url:
            type: mixed
            value: '{{#rag.1752565402678.firecrawl_url#}}'
        plugin_id: langgenius/firecrawl_datasource
        provider_name: firecrawl
        provider_type: website_crawl
        selected: false
        title: Firecrawl
        type: datasource
      height: 52
      id: '1752565402678'
      position:
        x: 1067.7526055798794
        y: 417.32608398342404
      positionAbsolute:
        x: 1067.7526055798794
        y: 417.32608398342404
      selected: false
      sourcePosition: right
      targetPosition: left
      type: custom
      width: 242
    - data:
        output_type: string
        selected: false
        title: Variable Aggregator
        type: variable-aggregator
        variables:
        - - '1752491761974'
          - content
        - - '1752565402678'
          - content
      height: 129
      id: '1752565435219'
      position:
        x: 1505.4306671642219
        y: 281.3910724383104
      positionAbsolute:
        x: 1505.4306671642219
        y: 281.3910724383104
      selected: false
      sourcePosition: right
      targetPosition: left
      type: custom
      width: 242
    viewport:
      x: -826.1791044466438
      y: -71.91725474841303
      zoom: 0.9980166672552107
  rag_pipeline_variables:
  - allow_file_extension: null
    allow_file_upload_methods: null
    allowed_file_types: null
    belong_to_node_id: '1752491761974'
    default_value: null
    label: URL
    max_length: 256
    options: []
    placeholder: https://docs.dify.ai/en/
    required: true
    tooltips: null
    type: text-input
    unit: null
    variable: jina_url
  - allow_file_extension: null
    allow_file_upload_methods: null
    allowed_file_types: null
    belong_to_node_id: '1752491761974'
    default_value: 10
    label: Limit
    max_length: 48
    options: []
    placeholder: null
    required: true
    tooltips: null
    type: number
    unit: null
    variable: jina_limit
  - allow_file_extension: null
    allow_file_upload_methods: null
    allowed_file_types: null
    belong_to_node_id: '1752491761974'
    default_value: null
    label: Crawl sub-pages
    max_length: 48
    options: []
    placeholder: null
    required: false
    tooltips: null
    type: checkbox
    unit: null
    variable: jina_crawl_sub_pages
  - allow_file_extension: null
    allow_file_upload_methods: null
    allowed_file_types: null
    belong_to_node_id: '1752491761974'
    default_value: null
    label: Use sitemap
    max_length: 48
    options: []
    placeholder: null
    required: false
    tooltips: Follow the sitemap to crawl the site. If not, Jina Reader will crawl
      iteratively based on page relevance, yielding fewer but higher-quality pages.
    type: checkbox
    unit: null
    variable: jina_use_sitemap
  - allow_file_extension: null
    allow_file_upload_methods: null
    allowed_file_types: null
    belong_to_node_id: '1752565402678'
    default_value: null
    label: URL
    max_length: 256
    options: []
    placeholder: https://docs.dify.ai/en/
    required: true
    tooltips: null
    type: text-input
    unit: null
    variable: firecrawl_url
  - allow_file_extension: null
    allow_file_upload_methods: null
    allowed_file_types: null
    belong_to_node_id: '1752565402678'
    default_value: true
    label: Crawl sub-pages
    max_length: 48
    options: []
    placeholder: null
    required: false
    tooltips: null
    type: checkbox
    unit: null
    variable: firecrawl_crawl_sub_pages
  - allow_file_extension: null
    allow_file_upload_methods: null
    allowed_file_types: null
    belong_to_node_id: '1752565402678'
    default_value: 10
    label: Limit
    max_length: 48
    options: []
    placeholder: null
    required: true
    tooltips: null
    type: number
    unit: null
    variable: firecrawl_limit
  - allow_file_extension: null
    allow_file_upload_methods: null
    allowed_file_types: null
    belong_to_node_id: '1752565402678'
    default_value: null
    label: Max depth
    max_length: 48
    options: []
    placeholder: ''
    required: false
    tooltips: Maximum depth to crawl relative to the entered URL. Depth 0 just scrapes
      the page of the entered url, depth 1 scrapes the url and everything after enteredURL
      + one /, and so on.
    type: number
    unit: null
    variable: firecrawl_max_depth
  - allow_file_extension: null
    allow_file_upload_methods: null
    allowed_file_types: null
    belong_to_node_id: '1752565402678'
    default_value: null
    label: Exclude paths
    max_length: 256
    options: []
    placeholder: blog/*, /about/*
    required: false
    tooltips: null
    type: text-input
    unit: null
    variable: firecrawl_exclude_paths
  - allow_file_extension: null
    allow_file_upload_methods: null
    allowed_file_types: null
    belong_to_node_id: '1752565402678'
    default_value: null
    label: Include only paths
    max_length: 256
    options: []
    placeholder: articles/*
    required: false
    tooltips: null
    type: text-input
    unit: null
    variable: firecrawl_include_only_paths
  - allow_file_extension: null
    allow_file_upload_methods: null
    allowed_file_types: null
    belong_to_node_id: '1752565402678'
    default_value: null
    label: firecrawl_extract_main_content
    max_length: 48
    options: []
    placeholder: null
    required: false
    tooltips: null
    type: checkbox
    unit: null
    variable: firecrawl_extract_main_content
  - allow_file_extension: null
    allow_file_upload_methods: null
    allowed_file_types: null
    belong_to_node_id: shared
    default_value: \n\n
    label: delimiter
    max_length: 100
    options: []
    placeholder: null
    required: true
    tooltips: A delimiter is the character used to separate text. \n\n is recommended
      for splitting the original document into large parent chunks. You can also use
      special delimiters defined by yourself.
    type: text-input
    unit: null
    variable: delimiter
  - allow_file_extension: null
    allow_file_upload_methods: null
    allowed_file_types: null
    belong_to_node_id: shared
    default_value: 1024
    label: Maximum chunk length
    max_length: 48
    options: []
    placeholder: null
    required: true
    tooltips: null
    type: number
    unit: characters
    variable: max_chunk_length
  - allow_file_extension: null
    allow_file_upload_methods: null
    allowed_file_types: null
    belong_to_node_id: shared
    default_value: \n
    label: Child delimiter
    max_length: 199
    options: []
    placeholder: null
    required: true
    tooltips: A delimiter is the character used to separate text. \n\n is recommended
      for splitting the original document into large parent chunks. You can also use
      special delimiters defined by yourself.
    type: text-input
    unit: null
    variable: child_delimiter
  - allow_file_extension: null
    allow_file_upload_methods: null
    allowed_file_types: null
    belong_to_node_id: shared
    default_value: 512
    label: Child max chunk length
    max_length: 48
    options: []
    placeholder: null
    required: true
    tooltips: null
    type: number
    unit: characters
    variable: child_max_chunk_length
  - allow_file_extension: null
    allow_file_upload_methods: null
    allowed_file_types: null
    belong_to_node_id: shared
    default_value: paragraph
    label: Parent mode
    max_length: 48
    options:
    - full_doc
    - paragraph
    placeholder: null
    required: true
    tooltips: null
    type: select
    unit: null
    variable: parent_mode
  - allow_file_extension: null
    allow_file_upload_methods: null
    allowed_file_types: null
    belong_to_node_id: shared
    default_value: null
    label: Replace consecutive spaces, newlines and tabs
    max_length: 48
    options: []
    placeholder: null
    required: false
    tooltips: null
    type: checkbox
    unit: null
    variable: replace_consecutive_spaces
  - allow_file_extension: null
    allow_file_upload_methods: null
    allowed_file_types: null
    belong_to_node_id: shared
    default_value: null
    label: Delete all URLs and email addresses
    max_length: 48
    options: []
    placeholder: null
    required: false
    tooltips: null
    type: checkbox
    unit: null
    variable: delete_urls_email