2025-12-01 17:21:38 +08:00
parent 32fee2b8ab
commit fab8c13cb3
7511 changed files with 996300 additions and 0 deletions

core/repositories/__init__.py View File

@@ -0,0 +1,19 @@
"""
Repository implementations for data access.
This package contains concrete implementations of the repository interfaces
defined in the core.workflow.repositories package.
"""
from core.repositories.celery_workflow_execution_repository import CeleryWorkflowExecutionRepository
from core.repositories.celery_workflow_node_execution_repository import CeleryWorkflowNodeExecutionRepository
from core.repositories.factory import DifyCoreRepositoryFactory, RepositoryImportError
from core.repositories.sqlalchemy_workflow_node_execution_repository import SQLAlchemyWorkflowNodeExecutionRepository
__all__ = [
"CeleryWorkflowExecutionRepository",
"CeleryWorkflowNodeExecutionRepository",
"DifyCoreRepositoryFactory",
"RepositoryImportError",
"SQLAlchemyWorkflowNodeExecutionRepository",
]
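
For orientation, a minimal import sketch; it uses nothing beyond the names this __init__ re-exports:

from core.repositories import (
    CeleryWorkflowExecutionRepository,
    CeleryWorkflowNodeExecutionRepository,
    DifyCoreRepositoryFactory,
    RepositoryImportError,
    SQLAlchemyWorkflowNodeExecutionRepository,
)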

core/repositories/celery_workflow_execution_repository.py View File

@@ -0,0 +1,126 @@
"""
Celery-based implementation of the WorkflowExecutionRepository.
This implementation uses Celery tasks for asynchronous storage operations,
providing improved performance by offloading database operations to background workers.
"""
import logging
from typing import Union
from sqlalchemy.engine import Engine
from sqlalchemy.orm import sessionmaker
from core.workflow.entities.workflow_execution import WorkflowExecution
from core.workflow.repositories.workflow_execution_repository import WorkflowExecutionRepository
from libs.helper import extract_tenant_id
from models import Account, CreatorUserRole, EndUser
from models.enums import WorkflowRunTriggeredFrom
from tasks.workflow_execution_tasks import (
save_workflow_execution_task,
)
logger = logging.getLogger(__name__)
class CeleryWorkflowExecutionRepository(WorkflowExecutionRepository):
"""
Celery-based implementation of the WorkflowExecutionRepository interface.
This implementation provides asynchronous storage capabilities by using Celery tasks
to handle database operations in background workers. This improves performance by
reducing the blocking time for workflow execution storage operations.
Key features:
- Asynchronous save operations using Celery tasks
- Support for multi-tenancy through tenant/app filtering
- Automatic retry and error handling through Celery
"""
_session_factory: sessionmaker
_tenant_id: str
_app_id: str | None
_triggered_from: WorkflowRunTriggeredFrom | None
_creator_user_id: str
_creator_user_role: CreatorUserRole
def __init__(
self,
session_factory: sessionmaker | Engine,
user: Union[Account, EndUser],
app_id: str | None,
triggered_from: WorkflowRunTriggeredFrom | None,
):
"""
Initialize the repository with Celery task configuration and context information.
Args:
session_factory: SQLAlchemy sessionmaker or engine for fallback operations
user: Account or EndUser object containing tenant_id, user ID, and role information
app_id: App ID for filtering by application (can be None)
triggered_from: Source of the execution trigger (DEBUGGING or APP_RUN)
"""
# Store session factory for fallback operations
if isinstance(session_factory, Engine):
self._session_factory = sessionmaker(bind=session_factory, expire_on_commit=False)
elif isinstance(session_factory, sessionmaker):
self._session_factory = session_factory
else:
raise ValueError(
f"Invalid session_factory type {type(session_factory).__name__}; expected sessionmaker or Engine"
)
# Extract tenant_id from user
tenant_id = extract_tenant_id(user)
if not tenant_id:
raise ValueError("User must have a tenant_id or current_tenant_id")
self._tenant_id = tenant_id
# Store app context
self._app_id = app_id
# Extract user context
self._triggered_from = triggered_from
self._creator_user_id = user.id
# Determine user role based on user type
self._creator_user_role = CreatorUserRole.ACCOUNT if isinstance(user, Account) else CreatorUserRole.END_USER
logger.info(
"Initialized CeleryWorkflowExecutionRepository for tenant %s, app %s, triggered_from %s",
self._tenant_id,
self._app_id,
self._triggered_from,
)
def save(self, execution: WorkflowExecution):
"""
Save or update a WorkflowExecution instance asynchronously using Celery.
This method queues the save operation as a Celery task and returns immediately,
providing improved performance for high-throughput scenarios.
Args:
execution: The WorkflowExecution instance to save or update
"""
try:
# Serialize execution for Celery task
execution_data = execution.model_dump()
# Queue the save operation as a Celery task (fire and forget)
save_workflow_execution_task.delay( # type: ignore
execution_data=execution_data,
tenant_id=self._tenant_id,
app_id=self._app_id or "",
triggered_from=self._triggered_from.value if self._triggered_from else "",
creator_user_id=self._creator_user_id,
creator_user_role=self._creator_user_role.value,
)
logger.debug("Queued async save for workflow execution: %s", execution.id_)
except Exception:
logger.exception("Failed to queue save operation for execution %s", execution.id_)
# In case of Celery failure, we could implement a fallback to synchronous save
# For now, we'll re-raise the exception
raise
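
A hedged wiring sketch for this class; the engine URL, account object, app ID, and execution entity below are illustrative assumptions, not part of this diff:

# Hypothetical setup: a real deployment reuses the application's engine and the
# authenticated Account/EndUser; `execution` is a WorkflowExecution entity.
from sqlalchemy import create_engine

engine = create_engine("postgresql+psycopg2://user:pass@localhost/dify")  # assumed DSN
repo = CeleryWorkflowExecutionRepository(
    session_factory=engine,  # an Engine is wrapped into a sessionmaker internally
    user=account,            # assumed Account or EndUser with a tenant selected
    app_id="app-123",        # assumed app ID; may be None
    triggered_from=WorkflowRunTriggeredFrom.APP_RUN,
)
repo.save(execution)  # queues save_workflow_execution_task and returns immediately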

core/repositories/celery_workflow_node_execution_repository.py View File

@@ -0,0 +1,190 @@
"""
Celery-based implementation of the WorkflowNodeExecutionRepository.
This implementation uses Celery tasks for asynchronous storage operations,
providing improved performance by offloading database operations to background workers.
"""
import logging
from collections.abc import Sequence
from typing import Union
from sqlalchemy.engine import Engine
from sqlalchemy.orm import sessionmaker
from core.workflow.entities.workflow_node_execution import WorkflowNodeExecution
from core.workflow.repositories.workflow_node_execution_repository import (
OrderConfig,
WorkflowNodeExecutionRepository,
)
from libs.helper import extract_tenant_id
from models import Account, CreatorUserRole, EndUser
from models.workflow import WorkflowNodeExecutionTriggeredFrom
from tasks.workflow_node_execution_tasks import (
save_workflow_node_execution_task,
)
logger = logging.getLogger(__name__)
class CeleryWorkflowNodeExecutionRepository(WorkflowNodeExecutionRepository):
"""
Celery-based implementation of the WorkflowNodeExecutionRepository interface.
This implementation provides asynchronous storage capabilities by using Celery tasks
to handle database operations in background workers. This improves performance by
reducing the blocking time for workflow node execution storage operations.
Key features:
- Asynchronous save operations using Celery tasks
- In-memory cache for immediate reads
- Support for multi-tenancy through tenant/app filtering
- Automatic retry and error handling through Celery
"""
_session_factory: sessionmaker
_tenant_id: str
_app_id: str | None
_triggered_from: WorkflowNodeExecutionTriggeredFrom | None
_creator_user_id: str
_creator_user_role: CreatorUserRole
_execution_cache: dict[str, WorkflowNodeExecution]
_workflow_execution_mapping: dict[str, list[str]]
def __init__(
self,
session_factory: sessionmaker | Engine,
user: Union[Account, EndUser],
app_id: str | None,
triggered_from: WorkflowNodeExecutionTriggeredFrom | None,
):
"""
Initialize the repository with Celery task configuration and context information.
Args:
session_factory: SQLAlchemy sessionmaker or engine for fallback operations
user: Account or EndUser object containing tenant_id, user ID, and role information
app_id: App ID for filtering by application (can be None)
triggered_from: Source of the execution trigger (SINGLE_STEP or WORKFLOW_RUN)
"""
# Store session factory for fallback operations
if isinstance(session_factory, Engine):
self._session_factory = sessionmaker(bind=session_factory, expire_on_commit=False)
elif isinstance(session_factory, sessionmaker):
self._session_factory = session_factory
else:
raise ValueError(
f"Invalid session_factory type {type(session_factory).__name__}; expected sessionmaker or Engine"
)
# Extract tenant_id from user
tenant_id = extract_tenant_id(user)
if not tenant_id:
raise ValueError("User must have a tenant_id or current_tenant_id")
self._tenant_id = tenant_id
# Store app context
self._app_id = app_id
# Extract user context
self._triggered_from = triggered_from
self._creator_user_id = user.id
# Determine user role based on user type
self._creator_user_role = CreatorUserRole.ACCOUNT if isinstance(user, Account) else CreatorUserRole.END_USER
# In-memory cache for workflow node executions
self._execution_cache = {}
# Cache for mapping workflow_execution_ids to execution IDs for efficient retrieval
self._workflow_execution_mapping = {}
logger.info(
"Initialized CeleryWorkflowNodeExecutionRepository for tenant %s, app %s, triggered_from %s",
self._tenant_id,
self._app_id,
self._triggered_from,
)
def save(self, execution: WorkflowNodeExecution):
"""
Save or update a WorkflowNodeExecution instance to cache and asynchronously to database.
This method stores the execution in cache immediately for fast reads and queues
the save operation as a Celery task without tracking the task status.
Args:
execution: The WorkflowNodeExecution instance to save or update
"""
try:
# Store in cache immediately for fast reads
self._execution_cache[execution.id] = execution
# Update workflow execution mapping for efficient retrieval
if execution.workflow_execution_id:
if execution.workflow_execution_id not in self._workflow_execution_mapping:
self._workflow_execution_mapping[execution.workflow_execution_id] = []
if execution.id not in self._workflow_execution_mapping[execution.workflow_execution_id]:
self._workflow_execution_mapping[execution.workflow_execution_id].append(execution.id)
# Serialize execution for Celery task
execution_data = execution.model_dump()
# Queue the save operation as a Celery task (fire and forget)
save_workflow_node_execution_task.delay(
execution_data=execution_data,
tenant_id=self._tenant_id,
app_id=self._app_id or "",
triggered_from=self._triggered_from.value if self._triggered_from else "",
creator_user_id=self._creator_user_id,
creator_user_role=self._creator_user_role.value,
)
logger.debug("Cached and queued async save for workflow node execution: %s", execution.id)
except Exception:
logger.exception("Failed to cache or queue save operation for node execution %s", execution.id)
# In case of Celery failure, we could implement a fallback to synchronous save
# For now, we'll re-raise the exception
raise
def get_by_workflow_run(
self,
workflow_run_id: str,
order_config: OrderConfig | None = None,
) -> Sequence[WorkflowNodeExecution]:
"""
Retrieve all WorkflowNodeExecution instances for a specific workflow run from cache.
Args:
workflow_run_id: The workflow run ID
order_config: Optional configuration for ordering results
Returns:
A sequence of WorkflowNodeExecution instances
"""
try:
# Get execution IDs for this workflow run from cache
execution_ids = self._workflow_execution_mapping.get(workflow_run_id, [])
# Retrieve executions from cache
result = []
for execution_id in execution_ids:
if execution_id in self._execution_cache:
result.append(self._execution_cache[execution_id])
# Apply ordering if specified
if order_config and result:
# Sort based on the order configuration
reverse = order_config.order_direction == "desc"
# Sort by multiple fields if specified
for field_name in reversed(order_config.order_by):
result.sort(key=lambda x: getattr(x, field_name, 0), reverse=reverse)
logger.debug("Retrieved %d workflow node executions for run %s from cache", len(result), workflow_run_id)
return result
except Exception:
logger.exception("Failed to get workflow node executions for run %s from cache", workflow_run_id)
return []
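
The ordering branch above leans on the fact that Python's list.sort is stable: sorting once per field, from the last field to the first, is equivalent to a single multi-key sort. A self-contained illustration of that idiom:

# Demonstrates the reversed-fields stable-sort trick used in get_by_workflow_run.
from dataclasses import dataclass

@dataclass
class Row:
    index: int
    created_at: float

rows = [Row(2, 1.0), Row(1, 2.0), Row(1, 1.0)]
order_by, reverse = ["index", "created_at"], False  # mirrors OrderConfig fields
for field_name in reversed(order_by):
    rows.sort(key=lambda x: getattr(x, field_name, 0), reverse=reverse)
# Primary key: index; tie-break: created_at
assert [(r.index, r.created_at) for r in rows] == [(1, 1.0), (1, 2.0), (2, 1.0)]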

core/repositories/factory.py View File

@@ -0,0 +1,108 @@
"""
Repository factory for dynamically creating repository instances based on configuration.
This module provides a Django-like settings system for repository implementations,
allowing users to configure different repository backends through string paths.
"""
from typing import Union
from sqlalchemy.engine import Engine
from sqlalchemy.orm import sessionmaker
from configs import dify_config
from core.workflow.repositories.workflow_execution_repository import WorkflowExecutionRepository
from core.workflow.repositories.workflow_node_execution_repository import WorkflowNodeExecutionRepository
from libs.module_loading import import_string
from models import Account, EndUser
from models.enums import WorkflowRunTriggeredFrom
from models.workflow import WorkflowNodeExecutionTriggeredFrom
class RepositoryImportError(Exception):
"""Raised when a repository implementation cannot be imported or instantiated."""
class DifyCoreRepositoryFactory:
"""
Factory for creating repository instances based on configuration.
This factory supports Django-like settings where repository implementations
are specified as module paths (e.g., 'module.submodule.ClassName').
"""
@classmethod
def create_workflow_execution_repository(
cls,
session_factory: Union[sessionmaker, Engine],
user: Union[Account, EndUser],
app_id: str,
triggered_from: WorkflowRunTriggeredFrom,
) -> WorkflowExecutionRepository:
"""
Create a WorkflowExecutionRepository instance based on configuration.
Args:
session_factory: SQLAlchemy sessionmaker or engine
user: Account or EndUser object
app_id: Application ID
triggered_from: Source of the execution trigger
Returns:
Configured WorkflowExecutionRepository instance
Raises:
RepositoryImportError: If the configured repository cannot be created
"""
class_path = dify_config.CORE_WORKFLOW_EXECUTION_REPOSITORY
try:
repository_class = import_string(class_path)
return repository_class(
session_factory=session_factory,
user=user,
app_id=app_id,
triggered_from=triggered_from,
)
        except Exception as e:
            raise RepositoryImportError(f"Failed to create WorkflowExecutionRepository from '{class_path}': {e}") from e
@classmethod
def create_workflow_node_execution_repository(
cls,
session_factory: Union[sessionmaker, Engine],
user: Union[Account, EndUser],
app_id: str,
triggered_from: WorkflowNodeExecutionTriggeredFrom,
) -> WorkflowNodeExecutionRepository:
"""
Create a WorkflowNodeExecutionRepository instance based on configuration.
Args:
session_factory: SQLAlchemy sessionmaker or engine
user: Account or EndUser object
app_id: Application ID
triggered_from: Source of the execution trigger
Returns:
Configured WorkflowNodeExecutionRepository instance
Raises:
RepositoryImportError: If the configured repository cannot be created
"""
class_path = dify_config.CORE_WORKFLOW_NODE_EXECUTION_REPOSITORY
try:
repository_class = import_string(class_path)
return repository_class(
session_factory=session_factory,
user=user,
app_id=app_id,
triggered_from=triggered_from,
)
        except Exception as e:
raise RepositoryImportError(
f"Failed to create WorkflowNodeExecutionRepository from '{class_path}': {e}"
) from e
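
The factory delegates resolution to libs.module_loading.import_string. The following stand-in (a sketch, not the actual helper) illustrates the Django-style mechanics it relies on:

from importlib import import_module

def import_string_sketch(dotted_path: str):
    # Resolve "package.module.ClassName" to the class object (illustrative only).
    module_path, _, class_name = dotted_path.rpartition(".")
    if not module_path:
        raise ImportError(f"{dotted_path!r} doesn't look like a module path")
    module = import_module(module_path)
    try:
        return getattr(module, class_name)
    except AttributeError as e:
        raise ImportError(f"Module {module_path!r} has no attribute {class_name!r}") from e

# With CORE_WORKFLOW_EXECUTION_REPOSITORY set to, for example,
# "core.repositories.celery_workflow_execution_repository.CeleryWorkflowExecutionRepository",
# the factory resolves that class and instantiates it with the four standard arguments.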

core/repositories/sqlalchemy_workflow_execution_repository.py View File

@@ -0,0 +1,203 @@
"""
SQLAlchemy implementation of the WorkflowExecutionRepository.
"""
import json
import logging
from typing import Union
from sqlalchemy.engine import Engine
from sqlalchemy.orm import sessionmaker
from core.workflow.entities import WorkflowExecution
from core.workflow.enums import WorkflowExecutionStatus, WorkflowType
from core.workflow.repositories.workflow_execution_repository import WorkflowExecutionRepository
from core.workflow.workflow_type_encoder import WorkflowRuntimeTypeConverter
from libs.helper import extract_tenant_id
from models import (
Account,
CreatorUserRole,
EndUser,
WorkflowRun,
)
from models.enums import WorkflowRunTriggeredFrom
logger = logging.getLogger(__name__)
class SQLAlchemyWorkflowExecutionRepository(WorkflowExecutionRepository):
"""
SQLAlchemy implementation of the WorkflowExecutionRepository interface.
This implementation supports multi-tenancy by filtering operations based on tenant_id.
Each method creates its own session, handles the transaction, and commits changes
to the database. This prevents long-running connections in the workflow core.
This implementation also includes an in-memory cache for workflow executions to improve
performance by reducing database queries.
"""
def __init__(
self,
session_factory: sessionmaker | Engine,
user: Union[Account, EndUser],
app_id: str | None,
triggered_from: WorkflowRunTriggeredFrom | None,
):
"""
Initialize the repository with a SQLAlchemy sessionmaker or engine and context information.
Args:
session_factory: SQLAlchemy sessionmaker or engine for creating sessions
user: Account or EndUser object containing tenant_id, user ID, and role information
app_id: App ID for filtering by application (can be None)
triggered_from: Source of the execution trigger (DEBUGGING or APP_RUN)
"""
# If an engine is provided, create a sessionmaker from it
if isinstance(session_factory, Engine):
self._session_factory = sessionmaker(bind=session_factory, expire_on_commit=False)
elif isinstance(session_factory, sessionmaker):
self._session_factory = session_factory
else:
raise ValueError(
f"Invalid session_factory type {type(session_factory).__name__}; expected sessionmaker or Engine"
)
# Extract tenant_id from user
tenant_id = extract_tenant_id(user)
if not tenant_id:
raise ValueError("User must have a tenant_id or current_tenant_id")
self._tenant_id = tenant_id
# Store app context
self._app_id = app_id
# Extract user context
self._triggered_from = triggered_from
self._creator_user_id = user.id
# Determine user role based on user type
self._creator_user_role = CreatorUserRole.ACCOUNT if isinstance(user, Account) else CreatorUserRole.END_USER
# Initialize in-memory cache for workflow executions
# Key: execution_id, Value: WorkflowRun (DB model)
self._execution_cache: dict[str, WorkflowRun] = {}
def _to_domain_model(self, db_model: WorkflowRun) -> WorkflowExecution:
"""
Convert a database model to a domain model.
Args:
db_model: The database model to convert
Returns:
The domain model
"""
# Parse JSON fields
inputs = db_model.inputs_dict
outputs = db_model.outputs_dict
graph = db_model.graph_dict
# Convert status to domain enum
status = WorkflowExecutionStatus(db_model.status)
return WorkflowExecution(
id_=db_model.id,
workflow_id=db_model.workflow_id,
workflow_type=WorkflowType(db_model.type),
workflow_version=db_model.version,
graph=graph,
inputs=inputs,
outputs=outputs,
status=status,
error_message=db_model.error or "",
total_tokens=db_model.total_tokens,
total_steps=db_model.total_steps,
exceptions_count=db_model.exceptions_count,
started_at=db_model.created_at,
finished_at=db_model.finished_at,
)
def _to_db_model(self, domain_model: WorkflowExecution) -> WorkflowRun:
"""
Convert a domain model to a database model.
Args:
domain_model: The domain model to convert
Returns:
The database model
"""
        # Validate required context values provided via the constructor
if not self._triggered_from:
raise ValueError("triggered_from is required in repository constructor")
if not self._creator_user_id:
raise ValueError("created_by is required in repository constructor")
if not self._creator_user_role:
raise ValueError("created_by_role is required in repository constructor")
db_model = WorkflowRun()
db_model.id = domain_model.id_
db_model.tenant_id = self._tenant_id
if self._app_id is not None:
db_model.app_id = self._app_id
db_model.workflow_id = domain_model.workflow_id
db_model.triggered_from = self._triggered_from
# No sequence number generation needed anymore
db_model.type = domain_model.workflow_type
db_model.version = domain_model.workflow_version
db_model.graph = json.dumps(domain_model.graph) if domain_model.graph else None
db_model.inputs = json.dumps(domain_model.inputs) if domain_model.inputs else None
db_model.outputs = (
json.dumps(WorkflowRuntimeTypeConverter().to_json_encodable(domain_model.outputs))
if domain_model.outputs
else None
)
db_model.status = domain_model.status
db_model.error = domain_model.error_message or None
db_model.total_tokens = domain_model.total_tokens
db_model.total_steps = domain_model.total_steps
db_model.exceptions_count = domain_model.exceptions_count
db_model.created_by_role = self._creator_user_role
db_model.created_by = self._creator_user_id
db_model.created_at = domain_model.started_at
db_model.finished_at = domain_model.finished_at
# Calculate elapsed time if finished_at is available
if domain_model.finished_at:
db_model.elapsed_time = (domain_model.finished_at - domain_model.started_at).total_seconds()
else:
db_model.elapsed_time = 0
return db_model
def save(self, execution: WorkflowExecution):
"""
Save or update a WorkflowExecution domain entity to the database.
This method serves as a domain-to-database adapter that:
1. Converts the domain entity to its database representation
2. Persists the database model using SQLAlchemy's merge operation
3. Maintains proper multi-tenancy by including tenant context during conversion
4. Updates the in-memory cache for faster subsequent lookups
The method handles both creating new records and updating existing ones through
SQLAlchemy's merge operation.
Args:
execution: The WorkflowExecution domain entity to persist
"""
# Convert domain model to database model using tenant context and other attributes
db_model = self._to_db_model(execution)
# Create a new database session
with self._session_factory() as session:
# SQLAlchemy merge intelligently handles both insert and update operations
# based on the presence of the primary key
session.merge(db_model)
session.commit()
# Update the in-memory cache for faster subsequent lookups
self._execution_cache[db_model.id] = db_model
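
The save path relies on Session.merge acting as an upsert keyed on the primary key. A self-contained demonstration with a toy model (the Item class and in-memory SQLite URL are illustrative assumptions, not part of this diff):

from sqlalchemy import String, create_engine
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, sessionmaker

class Base(DeclarativeBase):
    pass

class Item(Base):
    __tablename__ = "items"
    id: Mapped[str] = mapped_column(primary_key=True)
    name: Mapped[str] = mapped_column(String(50))

engine = create_engine("sqlite://")  # in-memory database for the demo
Base.metadata.create_all(engine)
factory = sessionmaker(bind=engine, expire_on_commit=False)

with factory() as session:
    session.merge(Item(id="1", name="first"))    # no row with this pk yet -> INSERT
    session.merge(Item(id="1", name="renamed"))  # pk already present -> UPDATE
    session.commit()
    assert session.get(Item, "1").name == "renamed"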

core/repositories/sqlalchemy_workflow_node_execution_repository.py View File

@@ -0,0 +1,586 @@
"""
SQLAlchemy implementation of the WorkflowNodeExecutionRepository.
"""
import dataclasses
import json
import logging
from collections.abc import Callable, Mapping, Sequence
from concurrent.futures import ThreadPoolExecutor
from typing import Any, TypeVar, Union
import psycopg2.errors
from sqlalchemy import UnaryExpression, asc, desc, select
from sqlalchemy.engine import Engine
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import sessionmaker
from tenacity import before_sleep_log, retry, retry_if_exception, stop_after_attempt
from configs import dify_config
from core.model_runtime.utils.encoders import jsonable_encoder
from core.workflow.entities import WorkflowNodeExecution
from core.workflow.enums import NodeType, WorkflowNodeExecutionMetadataKey, WorkflowNodeExecutionStatus
from core.workflow.repositories.workflow_node_execution_repository import OrderConfig, WorkflowNodeExecutionRepository
from core.workflow.workflow_type_encoder import WorkflowRuntimeTypeConverter
from extensions.ext_storage import storage
from libs.helper import extract_tenant_id
from libs.uuid_utils import uuidv7
from models import (
Account,
CreatorUserRole,
EndUser,
WorkflowNodeExecutionModel,
WorkflowNodeExecutionTriggeredFrom,
)
from models.enums import ExecutionOffLoadType
from models.model import UploadFile
from models.workflow import WorkflowNodeExecutionOffload
from services.file_service import FileService
from services.variable_truncator import VariableTruncator
logger = logging.getLogger(__name__)
@dataclasses.dataclass(frozen=True)
class _InputsOutputsTruncationResult:
truncated_value: Mapping[str, Any]
file: UploadFile
offload: WorkflowNodeExecutionOffload
class SQLAlchemyWorkflowNodeExecutionRepository(WorkflowNodeExecutionRepository):
"""
SQLAlchemy implementation of the WorkflowNodeExecutionRepository interface.
This implementation supports multi-tenancy by filtering operations based on tenant_id.
Each method creates its own session, handles the transaction, and commits changes
to the database. This prevents long-running connections in the workflow core.
This implementation also includes an in-memory cache for node executions to improve
performance by reducing database queries.
"""
def __init__(
self,
session_factory: sessionmaker | Engine,
user: Union[Account, EndUser],
app_id: str | None,
triggered_from: WorkflowNodeExecutionTriggeredFrom | None,
):
"""
Initialize the repository with a SQLAlchemy sessionmaker or engine and context information.
Args:
session_factory: SQLAlchemy sessionmaker or engine for creating sessions
user: Account or EndUser object containing tenant_id, user ID, and role information
app_id: App ID for filtering by application (can be None)
triggered_from: Source of the execution trigger (SINGLE_STEP or WORKFLOW_RUN)
"""
# If an engine is provided, create a sessionmaker from it
if isinstance(session_factory, Engine):
self._session_factory = sessionmaker(bind=session_factory, expire_on_commit=False)
elif isinstance(session_factory, sessionmaker):
self._session_factory = session_factory
else:
raise ValueError(
f"Invalid session_factory type {type(session_factory).__name__}; expected sessionmaker or Engine"
)
# Extract tenant_id from user
tenant_id = extract_tenant_id(user)
if not tenant_id:
raise ValueError("User must have a tenant_id or current_tenant_id")
self._tenant_id = tenant_id
# Store app context
self._app_id = app_id
# Extract user context
self._triggered_from = triggered_from
self._creator_user_id = user.id
self._user = user # Store the user object directly
# Determine user role based on user type
self._creator_user_role = CreatorUserRole.ACCOUNT if isinstance(user, Account) else CreatorUserRole.END_USER
# Initialize in-memory cache for node executions
self._node_execution_cache: dict[str, WorkflowNodeExecutionModel] = {}
# Initialize FileService for handling offloaded data
self._file_service = FileService(session_factory)
def _create_truncator(self) -> VariableTruncator:
return VariableTruncator(
max_size_bytes=dify_config.WORKFLOW_VARIABLE_TRUNCATION_MAX_SIZE,
array_element_limit=dify_config.WORKFLOW_VARIABLE_TRUNCATION_ARRAY_LENGTH,
string_length_limit=dify_config.WORKFLOW_VARIABLE_TRUNCATION_STRING_LENGTH,
)
def _to_domain_model(self, db_model: WorkflowNodeExecutionModel) -> WorkflowNodeExecution:
"""
Convert a database model to a domain model.
This requires that offload_data and the corresponding inputs_file and outputs_file are preloaded.
Args:
db_model: The database model to convert. It must have `offload_data`
and the corresponding `inputs_file` and `outputs_file` preloaded.
Returns:
The domain model
"""
# Parse JSON fields - these might be truncated versions
inputs = db_model.inputs_dict
process_data = db_model.process_data_dict
outputs = db_model.outputs_dict
metadata = {WorkflowNodeExecutionMetadataKey(k): v for k, v in db_model.execution_metadata_dict.items()}
# Convert status to domain enum
status = WorkflowNodeExecutionStatus(db_model.status)
domain_model = WorkflowNodeExecution(
id=db_model.id,
node_execution_id=db_model.node_execution_id,
workflow_id=db_model.workflow_id,
workflow_execution_id=db_model.workflow_run_id,
index=db_model.index,
predecessor_node_id=db_model.predecessor_node_id,
node_id=db_model.node_id,
node_type=NodeType(db_model.node_type),
title=db_model.title,
inputs=inputs,
process_data=process_data,
outputs=outputs,
status=status,
error=db_model.error,
elapsed_time=db_model.elapsed_time,
metadata=metadata,
created_at=db_model.created_at,
finished_at=db_model.finished_at,
)
if not db_model.offload_data:
return domain_model
offload_data = db_model.offload_data
# Store truncated versions for API responses
# TODO: consider loading content concurrently.
input_offload = _find_first(offload_data, _filter_by_offload_type(ExecutionOffLoadType.INPUTS))
if input_offload is not None:
assert input_offload.file is not None
domain_model.inputs = self._load_file(input_offload.file)
domain_model.set_truncated_inputs(inputs)
outputs_offload = _find_first(offload_data, _filter_by_offload_type(ExecutionOffLoadType.OUTPUTS))
if outputs_offload is not None:
assert outputs_offload.file is not None
domain_model.outputs = self._load_file(outputs_offload.file)
domain_model.set_truncated_outputs(outputs)
process_data_offload = _find_first(offload_data, _filter_by_offload_type(ExecutionOffLoadType.PROCESS_DATA))
if process_data_offload is not None:
assert process_data_offload.file is not None
domain_model.process_data = self._load_file(process_data_offload.file)
domain_model.set_truncated_process_data(process_data)
return domain_model
def _load_file(self, file: UploadFile) -> Mapping[str, Any]:
content = storage.load(file.key)
return json.loads(content)
@staticmethod
def _json_encode(values: Mapping[str, Any]) -> str:
json_converter = WorkflowRuntimeTypeConverter()
return json.dumps(json_converter.to_json_encodable(values))
def _to_db_model(self, domain_model: WorkflowNodeExecution) -> WorkflowNodeExecutionModel:
"""
Convert a domain model to a database model. This copies the inputs /
process_data / outputs from domain model directly without applying truncation.
Args:
domain_model: The domain model to convert
Returns:
The database model, with inputs, process_data and outputs copied verbatim (no truncation applied).
"""
        # Validate required context values provided via the constructor
if not self._triggered_from:
raise ValueError("triggered_from is required in repository constructor")
if not self._creator_user_id:
raise ValueError("created_by is required in repository constructor")
if not self._creator_user_role:
raise ValueError("created_by_role is required in repository constructor")
converter = WorkflowRuntimeTypeConverter()
db_model = WorkflowNodeExecutionModel()
db_model.id = domain_model.id
db_model.tenant_id = self._tenant_id
if self._app_id is not None:
db_model.app_id = self._app_id
db_model.workflow_id = domain_model.workflow_id
db_model.triggered_from = self._triggered_from
db_model.workflow_run_id = domain_model.workflow_execution_id
db_model.index = domain_model.index
db_model.predecessor_node_id = domain_model.predecessor_node_id
db_model.node_execution_id = domain_model.node_execution_id
db_model.node_id = domain_model.node_id
db_model.node_type = domain_model.node_type
db_model.title = domain_model.title
db_model.inputs = (
_deterministic_json_dump(converter.to_json_encodable(domain_model.inputs))
if domain_model.inputs is not None
else None
)
db_model.process_data = (
_deterministic_json_dump(converter.to_json_encodable(domain_model.process_data))
if domain_model.process_data is not None
else None
)
db_model.outputs = (
_deterministic_json_dump(converter.to_json_encodable(domain_model.outputs))
if domain_model.outputs is not None
else None
)
# inputs, process_data and outputs are handled below
db_model.status = domain_model.status
db_model.error = domain_model.error
db_model.elapsed_time = domain_model.elapsed_time
db_model.execution_metadata = (
json.dumps(jsonable_encoder(domain_model.metadata)) if domain_model.metadata else None
)
db_model.created_at = domain_model.created_at
db_model.created_by_role = self._creator_user_role
db_model.created_by = self._creator_user_id
db_model.finished_at = domain_model.finished_at
return db_model
def _is_duplicate_key_error(self, exception: BaseException) -> bool:
"""Check if the exception is a duplicate key constraint violation."""
return isinstance(exception, IntegrityError) and isinstance(exception.orig, psycopg2.errors.UniqueViolation)
def _regenerate_id_on_duplicate(self, execution: WorkflowNodeExecution, db_model: WorkflowNodeExecutionModel):
"""Regenerate UUID v7 for both domain and database models when duplicate key detected."""
new_id = str(uuidv7())
logger.warning(
"Duplicate key conflict for workflow node execution ID %s, generating new UUID v7: %s", db_model.id, new_id
)
db_model.id = new_id
execution.id = new_id
def _truncate_and_upload(
self,
values: Mapping[str, Any] | None,
execution_id: str,
type_: ExecutionOffLoadType,
) -> _InputsOutputsTruncationResult | None:
if values is None:
return None
converter = WorkflowRuntimeTypeConverter()
json_encodable_value = converter.to_json_encodable(values)
truncator = self._create_truncator()
truncated_values, truncated = truncator.truncate_variable_mapping(json_encodable_value)
if not truncated:
return None
value_json = _deterministic_json_dump(json_encodable_value)
        assert value_json is not None, "value_json should not be None here."
suffix = type_.value
upload_file = self._file_service.upload_file(
filename=f"node_execution_{execution_id}_{suffix}.json",
content=value_json.encode("utf-8"),
mimetype="application/json",
user=self._user,
)
offload = WorkflowNodeExecutionOffload(
id=uuidv7(),
tenant_id=self._tenant_id,
app_id=self._app_id,
node_execution_id=execution_id,
type_=type_,
file_id=upload_file.id,
)
return _InputsOutputsTruncationResult(
truncated_value=truncated_values,
file=upload_file,
offload=offload,
)
def save(self, execution: WorkflowNodeExecution) -> None:
"""
Save or update a NodeExecution domain entity to the database.
This method serves as a domain-to-database adapter that:
1. Converts the domain entity to its database representation
2. Checks for existing records and updates or inserts accordingly
3. Handles truncation and offloading of large inputs/outputs
4. Persists the database model using SQLAlchemy's merge operation
5. Maintains proper multi-tenancy by including tenant context during conversion
6. Updates the in-memory cache for faster subsequent lookups
The method handles both creating new records and updating existing ones through
SQLAlchemy's merge operation.
Args:
execution: The NodeExecution domain entity to persist
"""
# NOTE: The workflow engine triggers `save` multiple times for a single node execution:
# when the node starts, any time it retries, and once more when it reaches a terminal state.
# Only the final call contains the complete inputs and outputs payloads, so earlier invocations
# must tolerate missing data without attempting to offload variables.
# Convert domain model to database model using tenant context and other attributes
db_model = self._to_db_model(execution)
# Use tenacity for retry logic with duplicate key handling
@retry(
stop=stop_after_attempt(3),
retry=retry_if_exception(self._is_duplicate_key_error),
before_sleep=before_sleep_log(logger, logging.WARNING),
reraise=True,
)
def _save_with_retry():
try:
self._persist_to_database(db_model)
except IntegrityError as e:
if self._is_duplicate_key_error(e):
# Generate new UUID and retry
self._regenerate_id_on_duplicate(execution, db_model)
raise # Let tenacity handle the retry
else:
# Different integrity error, don't retry
logger.exception("Non-duplicate key integrity error while saving workflow node execution")
raise
try:
_save_with_retry()
# Update the in-memory cache after successful save
if db_model.node_execution_id:
self._node_execution_cache[db_model.node_execution_id] = db_model
except Exception:
logger.exception("Failed to save workflow node execution after all retries")
raise
def _persist_to_database(self, db_model: WorkflowNodeExecutionModel):
"""
Persist the database model to the database.
Checks if a record with the same ID exists and either updates it or creates a new one.
Args:
db_model: The database model to persist
"""
with self._session_factory() as session:
# Check if record already exists
existing = session.get(WorkflowNodeExecutionModel, db_model.id)
if existing:
# Update existing record by copying all non-private attributes
for key, value in db_model.__dict__.items():
if not key.startswith("_"):
setattr(existing, key, value)
else:
# Add new record
session.add(db_model)
session.commit()
# Update the in-memory cache for faster subsequent lookups
# Only cache if we have a node_execution_id to use as the cache key
if db_model.node_execution_id:
self._node_execution_cache[db_model.node_execution_id] = db_model
def save_execution_data(self, execution: WorkflowNodeExecution):
domain_model = execution
with self._session_factory(expire_on_commit=False) as session:
query = WorkflowNodeExecutionModel.preload_offload_data(select(WorkflowNodeExecutionModel)).where(
WorkflowNodeExecutionModel.id == domain_model.id
)
db_model: WorkflowNodeExecutionModel | None = session.execute(query).scalars().first()
if db_model is not None:
offload_data = db_model.offload_data
else:
db_model = self._to_db_model(domain_model)
offload_data = db_model.offload_data
if domain_model.inputs is not None:
result = self._truncate_and_upload(
domain_model.inputs,
domain_model.id,
ExecutionOffLoadType.INPUTS,
)
if result is not None:
db_model.inputs = self._json_encode(result.truncated_value)
domain_model.set_truncated_inputs(result.truncated_value)
offload_data = _replace_or_append_offload(offload_data, result.offload)
else:
db_model.inputs = self._json_encode(domain_model.inputs)
if domain_model.outputs is not None:
result = self._truncate_and_upload(
domain_model.outputs,
domain_model.id,
ExecutionOffLoadType.OUTPUTS,
)
if result is not None:
db_model.outputs = self._json_encode(result.truncated_value)
domain_model.set_truncated_outputs(result.truncated_value)
offload_data = _replace_or_append_offload(offload_data, result.offload)
else:
db_model.outputs = self._json_encode(domain_model.outputs)
if domain_model.process_data is not None:
result = self._truncate_and_upload(
domain_model.process_data,
domain_model.id,
ExecutionOffLoadType.PROCESS_DATA,
)
if result is not None:
db_model.process_data = self._json_encode(result.truncated_value)
domain_model.set_truncated_process_data(result.truncated_value)
offload_data = _replace_or_append_offload(offload_data, result.offload)
else:
db_model.process_data = self._json_encode(domain_model.process_data)
db_model.offload_data = offload_data
with self._session_factory() as session, session.begin():
session.merge(db_model)
session.flush()
def get_db_models_by_workflow_run(
self,
workflow_run_id: str,
order_config: OrderConfig | None = None,
triggered_from: WorkflowNodeExecutionTriggeredFrom = WorkflowNodeExecutionTriggeredFrom.WORKFLOW_RUN,
) -> Sequence[WorkflowNodeExecutionModel]:
"""
Retrieve all WorkflowNodeExecution database models for a specific workflow run.
The returned models have `offload_data` preloaded, along with the associated
`inputs_file` and `outputs_file` data.
This method directly returns database models without converting to domain models,
which is useful when you need to access database-specific fields like triggered_from.
It also updates the in-memory cache with the retrieved models.
Args:
workflow_run_id: The workflow run ID
order_config: Optional configuration for ordering results
order_config.order_by: List of fields to order by (e.g., ["index", "created_at"])
order_config.order_direction: Direction to order ("asc" or "desc")
Returns:
A list of WorkflowNodeExecution database models
"""
with self._session_factory() as session:
stmt = WorkflowNodeExecutionModel.preload_offload_data_and_files(select(WorkflowNodeExecutionModel))
stmt = stmt.where(
WorkflowNodeExecutionModel.workflow_run_id == workflow_run_id,
WorkflowNodeExecutionModel.tenant_id == self._tenant_id,
WorkflowNodeExecutionModel.triggered_from == triggered_from,
)
if self._app_id:
stmt = stmt.where(WorkflowNodeExecutionModel.app_id == self._app_id)
# Apply ordering if provided
if order_config and order_config.order_by:
order_columns: list[UnaryExpression] = []
for field in order_config.order_by:
column = getattr(WorkflowNodeExecutionModel, field, None)
if not column:
continue
if order_config.order_direction == "desc":
order_columns.append(desc(column))
else:
order_columns.append(asc(column))
if order_columns:
stmt = stmt.order_by(*order_columns)
db_models = session.scalars(stmt).all()
# Update the cache with the retrieved DB models
for model in db_models:
if model.node_execution_id:
self._node_execution_cache[model.node_execution_id] = model
return db_models
def get_by_workflow_run(
self,
workflow_run_id: str,
order_config: OrderConfig | None = None,
triggered_from: WorkflowNodeExecutionTriggeredFrom = WorkflowNodeExecutionTriggeredFrom.WORKFLOW_RUN,
) -> Sequence[WorkflowNodeExecution]:
"""
Retrieve all NodeExecution instances for a specific workflow run.
This method always queries the database to ensure complete and ordered results,
but updates the cache with any retrieved executions.
Args:
workflow_run_id: The workflow run ID
order_config: Optional configuration for ordering results
order_config.order_by: List of fields to order by (e.g., ["index", "created_at"])
order_config.order_direction: Direction to order ("asc" or "desc")
Returns:
A list of NodeExecution instances
"""
# Get the database models using the new method
db_models = self.get_db_models_by_workflow_run(workflow_run_id, order_config, triggered_from)
with ThreadPoolExecutor(max_workers=10) as executor:
domain_models = executor.map(self._to_domain_model, db_models, timeout=30)
return list(domain_models)
def _deterministic_json_dump(value: Mapping[str, Any]) -> str:
return json.dumps(value, sort_keys=True)
_T = TypeVar("_T")
def _find_first(seq: Sequence[_T], pred: Callable[[_T], bool]) -> _T | None:
filtered = [i for i in seq if pred(i)]
if filtered:
return filtered[0]
return None
def _filter_by_offload_type(offload_type: ExecutionOffLoadType) -> Callable[[WorkflowNodeExecutionOffload], bool]:
def f(offload: WorkflowNodeExecutionOffload) -> bool:
return offload.type_ == offload_type
return f
def _replace_or_append_offload(
seq: list[WorkflowNodeExecutionOffload], elem: WorkflowNodeExecutionOffload
) -> list[WorkflowNodeExecutionOffload]:
"""Replace all elements in `seq` that satisfy the equality condition defined by `eq_func` with `elem`.
Args:
seq: The sequence of elements to process.
elem: The new element to insert.
eq_func: A function that determines equality between elements.
Returns:
A new sequence with the specified elements replaced or appended.
"""
ls = [i for i in seq if i.type_ != elem.type_]
ls.append(elem)
return ls
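
For reference, the duplicate-key handling in save above reduces to the following self-contained sketch of the tenacity pattern (DuplicateKey and the in-memory store stand in for IntegrityError/UniqueViolation and the database; the ID suffix stands in for uuidv7 regeneration):

import logging
from tenacity import before_sleep_log, retry, retry_if_exception, stop_after_attempt

logger = logging.getLogger(__name__)

class DuplicateKey(Exception):
    pass

seen: set[str] = set()

def _persist(record: dict) -> None:
    if record["id"] in seen:
        raise DuplicateKey(record["id"])
    seen.add(record["id"])

def save(record: dict) -> None:
    @retry(
        stop=stop_after_attempt(3),
        retry=retry_if_exception(lambda e: isinstance(e, DuplicateKey)),
        before_sleep=before_sleep_log(logger, logging.WARNING),
        reraise=True,
    )
    def _save_with_retry() -> None:
        try:
            _persist(record)
        except DuplicateKey:
            record["id"] += "-regenerated"  # stands in for uuidv7 regeneration
            raise  # re-raise so tenacity retries with the new ID

    _save_with_retry()

seen.add("a")
save({"id": "a"})  # first attempt conflicts; the retry succeeds with the new ID
assert "a-regenerated" in seen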