2025-12-01 17:21:38 +08:00
parent 32fee2b8ab
commit fab8c13cb3
7511 changed files with 996300 additions and 0 deletions


@@ -0,0 +1,5 @@
# Schema management package
from .resolver import resolve_dify_schema_refs
__all__ = ["resolve_dify_schema_refs"]


@@ -0,0 +1,43 @@
{
  "$id": "https://dify.ai/schemas/v1/file.json",
  "$schema": "http://json-schema.org/draft-07/schema#",
  "version": "1.0.0",
  "type": "object",
  "title": "File",
  "description": "Schema for file objects (v1)",
  "properties": {
    "name": {
      "type": "string",
      "description": "file name"
    },
    "size": {
      "type": "number",
      "description": "file size"
    },
    "extension": {
      "type": "string",
      "description": "file extension"
    },
    "type": {
      "type": "string",
      "description": "file type"
    },
    "mime_type": {
      "type": "string",
      "description": "file mime type"
    },
    "transfer_method": {
      "type": "string",
      "description": "file transfer method"
    },
    "url": {
      "type": "string",
      "description": "file url"
    },
    "related_id": {
      "type": "string",
      "description": "file related id"
    }
  },
  "required": ["name"]
}
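To make the intent of this definition concrete, here is a minimal sketch (Python, values invented for illustration) of a consumer schema that pulls in the file definition by $ref, plus an object that would satisfy it once the reference is resolved; only "name" is required:

# Hypothetical consumer schema: an output field that reuses the builtin file schema via $ref.
consumer_schema = {
    "type": "object",
    "properties": {
        "attachment": {"$ref": "https://dify.ai/schemas/v1/file.json"},
    },
}

# An instance that would validate against the resolved definition; "name" is the only required key.
example_file = {
    "name": "report.pdf",
    "size": 10240,
    "extension": ".pdf",
    "mime_type": "application/pdf",
}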


@@ -0,0 +1,11 @@
{
  "$id": "https://dify.ai/schemas/v1/general_structure.json",
  "$schema": "http://json-schema.org/draft-07/schema#",
  "version": "1.0.0",
  "type": "array",
  "title": "General Structure",
  "description": "Schema for general structure (v1) - array of strings",
  "items": {
    "type": "string"
  }
}
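For completeness, an instance of this schema is just a list of strings; a minimal illustrative example:

# Illustrative instance of general_structure: a plain list of text chunks.
general_structure_example = ["first chunk of text", "second chunk of text"]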


@@ -0,0 +1,36 @@
{
  "$id": "https://dify.ai/schemas/v1/parent_child_structure.json",
  "$schema": "http://json-schema.org/draft-07/schema#",
  "version": "1.0.0",
  "type": "object",
  "title": "Parent-Child Structure",
  "description": "Schema for parent-child structure (v1)",
  "properties": {
    "parent_mode": {
      "type": "string",
      "description": "The mode of parent-child relationship"
    },
    "parent_child_chunks": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "parent_content": {
            "type": "string",
            "description": "The parent content"
          },
          "child_contents": {
            "type": "array",
            "items": {
              "type": "string"
            },
            "description": "List of child contents"
          }
        },
        "required": ["parent_content", "child_contents"]
      },
      "description": "List of parent-child chunk pairs"
    }
  },
  "required": ["parent_mode", "parent_child_chunks"]
}
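The shape this schema describes is easier to see from an instance; the following sketch uses made-up content and an assumed parent_mode value, since the schema itself does not constrain it:

# Illustrative instance of parent_child_structure (contents and parent_mode are invented).
parent_child_example = {
    "parent_mode": "paragraph",
    "parent_child_chunks": [
        {
            "parent_content": "Introduction section of the document.",
            "child_contents": [
                "First sentence of the introduction.",
                "Second sentence of the introduction.",
            ],
        },
    ],
}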


@@ -0,0 +1,29 @@
{
  "$id": "https://dify.ai/schemas/v1/qa_structure.json",
  "$schema": "http://json-schema.org/draft-07/schema#",
  "version": "1.0.0",
  "type": "object",
  "title": "Q&A Structure",
  "description": "Schema for question-answer structure (v1)",
  "properties": {
    "qa_chunks": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "question": {
            "type": "string",
            "description": "The question"
          },
          "answer": {
            "type": "string",
            "description": "The answer"
          }
        },
        "required": ["question", "answer"]
      },
      "description": "List of question-answer pairs"
    }
  },
  "required": ["qa_chunks"]
}
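Likewise, a minimal instance of the Q&A schema looks like this (question and answer text invented for illustration):

# Illustrative instance of qa_structure.
qa_example = {
    "qa_chunks": [
        {
            "question": "Which field is required on a file object?",
            "answer": "Only the name field is required; all other fields are optional.",
        },
    ],
}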


@@ -0,0 +1,129 @@
import json
import logging
import threading
from collections.abc import Mapping, MutableMapping
from pathlib import Path
from typing import Any, ClassVar, Optional


class SchemaRegistry:
    """Schema registry manages JSON schemas with version support"""

    logger: ClassVar[logging.Logger] = logging.getLogger(__name__)

    _default_instance: ClassVar[Optional["SchemaRegistry"]] = None
    _lock: ClassVar[threading.Lock] = threading.Lock()

    def __init__(self, base_dir: str):
        self.base_dir = Path(base_dir)
        self.versions: MutableMapping[str, MutableMapping[str, Any]] = {}
        self.metadata: MutableMapping[str, MutableMapping[str, Any]] = {}

    @classmethod
    def default_registry(cls) -> "SchemaRegistry":
        """Returns the default schema registry for builtin schemas (thread-safe singleton)"""
        if cls._default_instance is None:
            with cls._lock:
                # Double-checked locking pattern
                if cls._default_instance is None:
                    current_dir = Path(__file__).parent
                    schema_dir = current_dir / "builtin" / "schemas"
                    registry = cls(str(schema_dir))
                    registry.load_all_versions()
                    cls._default_instance = registry
        return cls._default_instance

    def load_all_versions(self) -> None:
        """Scans the schema directory and loads all versions"""
        if not self.base_dir.exists():
            return

        for entry in self.base_dir.iterdir():
            if not entry.is_dir():
                continue
            version = entry.name
            if not version.startswith("v"):
                continue
            self._load_version_dir(version, entry)

    def _load_version_dir(self, version: str, version_dir: Path) -> None:
        """Loads all schemas in a version directory"""
        if not version_dir.exists():
            return

        if version not in self.versions:
            self.versions[version] = {}

        for entry in version_dir.iterdir():
            if entry.suffix != ".json":
                continue
            schema_name = entry.stem
            self._load_schema(version, schema_name, entry)

    def _load_schema(self, version: str, schema_name: str, schema_path: Path) -> None:
        """Loads a single schema file"""
        try:
            with open(schema_path, encoding="utf-8") as f:
                schema = json.load(f)

            # Store the schema
            self.versions[version][schema_name] = schema

            # Extract and store metadata
            uri = f"https://dify.ai/schemas/{version}/{schema_name}.json"
            metadata = {
                "version": version,
                "title": schema.get("title", ""),
                "description": schema.get("description", ""),
                "deprecated": schema.get("deprecated", False),
            }
            self.metadata[uri] = metadata
        except (OSError, json.JSONDecodeError) as e:
            self.logger.warning("Failed to load schema %s/%s: %s", version, schema_name, e)

    def get_schema(self, uri: str) -> Any | None:
        """Retrieves a schema by URI with version support"""
        version, schema_name = self._parse_uri(uri)
        if not version or not schema_name:
            return None

        version_schemas = self.versions.get(version)
        if not version_schemas:
            return None
        return version_schemas.get(schema_name)

    def _parse_uri(self, uri: str) -> tuple[str, str]:
        """Parses a schema URI to extract version and schema name"""
        from core.schemas.resolver import parse_dify_schema_uri

        return parse_dify_schema_uri(uri)

    def list_versions(self) -> list[str]:
        """Returns all available versions"""
        return sorted(self.versions.keys())

    def list_schemas(self, version: str) -> list[str]:
        """Returns all schemas in a specific version"""
        version_schemas = self.versions.get(version)
        if not version_schemas:
            return []
        return sorted(version_schemas.keys())

    def get_all_schemas_for_version(self, version: str = "v1") -> list[Mapping[str, Any]]:
        """Returns all schemas for a version in the API format"""
        version_schemas = self.versions.get(version, {})
        result: list[Mapping[str, Any]] = []
        for schema_name, schema in version_schemas.items():
            result.append({"name": schema_name, "label": schema.get("title", schema_name), "schema": schema})
        return result
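A short usage sketch for the registry follows. It assumes the builtin v1 schema files added in this commit are on disk next to the registry module (the default_registry path above); the printed values are examples, not guaranteed output:

from core.schemas.registry import SchemaRegistry

registry = SchemaRegistry.default_registry()   # thread-safe singleton over builtin/schemas
print(registry.list_versions())                # e.g. ["v1"]
print(registry.list_schemas("v1"))             # e.g. ["file", "general_structure", "parent_child_structure", "qa_structure"]

schema = registry.get_schema("https://dify.ai/schemas/v1/file.json")
if schema is not None:
    print(schema["title"])                     # "File"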


@@ -0,0 +1,397 @@
import logging
import re
import threading
from collections import deque
from dataclasses import dataclass
from typing import Any, Union

from core.schemas.registry import SchemaRegistry

logger = logging.getLogger(__name__)

# Type aliases for better clarity
SchemaType = Union[dict[str, Any], list[Any], str, int, float, bool, None]
SchemaDict = dict[str, Any]

# Pre-compiled pattern for better performance
_DIFY_SCHEMA_PATTERN = re.compile(r"^https://dify\.ai/schemas/(v\d+)/(.+)\.json$")


class SchemaResolutionError(Exception):
    """Base exception for schema resolution errors"""

    pass


class CircularReferenceError(SchemaResolutionError):
    """Raised when a circular reference is detected"""

    def __init__(self, ref_uri: str, ref_path: list[str]):
        self.ref_uri = ref_uri
        self.ref_path = ref_path
        super().__init__(f"Circular reference detected: {ref_uri} in path {' -> '.join(ref_path)}")


class MaxDepthExceededError(SchemaResolutionError):
    """Raised when maximum resolution depth is exceeded"""

    def __init__(self, max_depth: int):
        self.max_depth = max_depth
        super().__init__(f"Maximum resolution depth ({max_depth}) exceeded")


class SchemaNotFoundError(SchemaResolutionError):
    """Raised when a referenced schema cannot be found"""

    def __init__(self, ref_uri: str):
        self.ref_uri = ref_uri
        super().__init__(f"Schema not found: {ref_uri}")


@dataclass
class QueueItem:
    """Represents an item in the BFS queue"""

    current: Any
    parent: Any | None
    key: str | int | None
    depth: int
    ref_path: set[str]


class SchemaResolver:
    """Resolver for Dify schema references with caching and optimizations"""

    _cache: dict[str, SchemaDict] = {}
    _cache_lock = threading.Lock()

    def __init__(self, registry: SchemaRegistry | None = None, max_depth: int = 10):
        """
        Initialize the schema resolver

        Args:
            registry: Schema registry to use (defaults to default registry)
            max_depth: Maximum depth for reference resolution
        """
        self.registry = registry or SchemaRegistry.default_registry()
        self.max_depth = max_depth

    @classmethod
    def clear_cache(cls) -> None:
        """Clear the global schema cache"""
        with cls._cache_lock:
            cls._cache.clear()

    def resolve(self, schema: SchemaType) -> SchemaType:
        """
        Resolve all $ref references in the schema

        Performance optimization: quickly checks for $ref presence before processing.

        Args:
            schema: Schema to resolve

        Returns:
            Resolved schema with all references expanded. Circular references are
            marked with a "$circular_ref" flag and unresolvable references are left
            in place; both cases are logged as warnings rather than raised.

        Raises:
            MaxDepthExceededError: If max depth exceeded
        """
        if not isinstance(schema, (dict, list)):
            return schema

        # Fast path: if no Dify refs found, return original schema unchanged
        # This avoids expensive deepcopy and BFS traversal for schemas without refs
        if not _has_dify_refs(schema):
            return schema

        # Slow path: schema contains refs, perform full resolution
        import copy

        result = copy.deepcopy(schema)

        # Initialize BFS queue
        queue = deque([QueueItem(current=result, parent=None, key=None, depth=0, ref_path=set())])

        while queue:
            item = queue.popleft()
            # Process the current item
            self._process_queue_item(queue, item)

        return result

    def _process_queue_item(self, queue: deque, item: QueueItem) -> None:
        """Process a single queue item"""
        if isinstance(item.current, dict):
            self._process_dict(queue, item)
        elif isinstance(item.current, list):
            self._process_list(queue, item)

    def _process_dict(self, queue: deque, item: QueueItem) -> None:
        """Process a dictionary item"""
        ref_uri = item.current.get("$ref")
        if ref_uri and _is_dify_schema_ref(ref_uri):
            # Handle $ref resolution
            self._resolve_ref(queue, item, ref_uri)
        else:
            # Process nested items
            for key, value in item.current.items():
                if isinstance(value, (dict, list)):
                    next_depth = item.depth + 1
                    if next_depth >= self.max_depth:
                        raise MaxDepthExceededError(self.max_depth)
                    queue.append(
                        QueueItem(current=value, parent=item.current, key=key, depth=next_depth, ref_path=item.ref_path)
                    )

    def _process_list(self, queue: deque, item: QueueItem) -> None:
        """Process a list item"""
        for idx, value in enumerate(item.current):
            if isinstance(value, (dict, list)):
                next_depth = item.depth + 1
                if next_depth >= self.max_depth:
                    raise MaxDepthExceededError(self.max_depth)
                queue.append(
                    QueueItem(current=value, parent=item.current, key=idx, depth=next_depth, ref_path=item.ref_path)
                )

    def _resolve_ref(self, queue: deque, item: QueueItem, ref_uri: str) -> None:
        """Resolve a $ref reference"""
        # Check for circular reference
        if ref_uri in item.ref_path:
            # Mark as circular and skip
            item.current["$circular_ref"] = True
            logger.warning("Circular reference detected: %s", ref_uri)
            return

        # Get resolved schema (from cache or registry)
        resolved_schema = self._get_resolved_schema(ref_uri)
        if not resolved_schema:
            logger.warning("Schema not found: %s", ref_uri)
            return

        # Update ref path
        new_ref_path = item.ref_path | {ref_uri}

        # Replace the reference with resolved schema
        next_depth = item.depth + 1
        if next_depth >= self.max_depth:
            raise MaxDepthExceededError(self.max_depth)

        if item.parent is None:
            # Root level replacement
            item.current.clear()
            item.current.update(resolved_schema)
            queue.append(
                QueueItem(current=item.current, parent=None, key=None, depth=next_depth, ref_path=new_ref_path)
            )
        else:
            # Update parent container
            item.parent[item.key] = resolved_schema.copy()
            queue.append(
                QueueItem(
                    current=item.parent[item.key],
                    parent=item.parent,
                    key=item.key,
                    depth=next_depth,
                    ref_path=new_ref_path,
                )
            )

    def _get_resolved_schema(self, ref_uri: str) -> SchemaDict | None:
        """Get resolved schema from cache or registry"""
        # Check cache first
        with self._cache_lock:
            if ref_uri in self._cache:
                return self._cache[ref_uri].copy()

        # Fetch from registry
        schema = self.registry.get_schema(ref_uri)
        if not schema:
            return None

        # Clean and cache
        cleaned = _remove_metadata_fields(schema)
        with self._cache_lock:
            self._cache[ref_uri] = cleaned
        return cleaned.copy()


def resolve_dify_schema_refs(
    schema: SchemaType, registry: SchemaRegistry | None = None, max_depth: int = 30
) -> SchemaType:
    """
    Resolve $ref references in Dify schema to actual schema content

    This is a convenience function that creates a resolver and resolves the schema.
    Performance optimization: quickly checks for $ref presence before processing.

    Args:
        schema: Schema object that may contain $ref references
        registry: Optional schema registry, defaults to default registry
        max_depth: Maximum depth to prevent infinite loops (default: 30)

    Returns:
        Schema with all $ref references resolved to actual content. Circular or
        unresolvable references are logged and left marked in the output instead
        of raising.

    Raises:
        MaxDepthExceededError: If maximum depth exceeded
    """
    # Fast path: if no Dify refs found, return original schema unchanged
    # This avoids expensive deepcopy and BFS traversal for schemas without refs
    if not _has_dify_refs(schema):
        return schema

    # Slow path: schema contains refs, perform full resolution
    resolver = SchemaResolver(registry, max_depth)
    return resolver.resolve(schema)


def _remove_metadata_fields(schema: dict) -> dict:
    """
    Remove metadata fields from schema that shouldn't be included in resolved output

    Args:
        schema: Schema dictionary

    Returns:
        Cleaned schema without metadata fields
    """
    # Create a copy and remove metadata fields
    cleaned = schema.copy()
    metadata_fields = ["$id", "$schema", "version"]
    for field in metadata_fields:
        cleaned.pop(field, None)
    return cleaned


def _is_dify_schema_ref(ref_uri: Any) -> bool:
    """
    Check if the reference URI is a Dify schema reference

    Args:
        ref_uri: URI to check

    Returns:
        True if it's a Dify schema reference
    """
    if not isinstance(ref_uri, str):
        return False
    # Use pre-compiled pattern for better performance
    return bool(_DIFY_SCHEMA_PATTERN.match(ref_uri))


def _has_dify_refs_recursive(schema: SchemaType) -> bool:
    """
    Recursively check if a schema contains any Dify $ref references

    This is the fallback method when string-based detection is not possible.

    Args:
        schema: Schema to check for references

    Returns:
        True if any Dify $ref is found, False otherwise
    """
    if isinstance(schema, dict):
        # Check if this dict has a $ref field
        ref_uri = schema.get("$ref")
        if ref_uri and _is_dify_schema_ref(ref_uri):
            return True
        # Check nested values
        for value in schema.values():
            if _has_dify_refs_recursive(value):
                return True
    elif isinstance(schema, list):
        # Check each item in the list
        for item in schema:
            if _has_dify_refs_recursive(item):
                return True
    # Primitive types don't contain refs
    return False


def _has_dify_refs_hybrid(schema: SchemaType) -> bool:
    """
    Hybrid detection: fast string scan followed by precise recursive check

    Performance optimization using two-phase detection:
    1. Fast string scan to quickly eliminate schemas without $ref
    2. Precise recursive validation only for potential candidates

    Args:
        schema: Schema to check for references

    Returns:
        True if any Dify $ref is found, False otherwise
    """
    # Phase 1: Fast string-based pre-filtering
    try:
        import json

        schema_str = json.dumps(schema, separators=(",", ":"))

        # Quick elimination: no $ref at all
        if '"$ref"' not in schema_str:
            return False

        # Quick elimination: no Dify schema URLs
        if "https://dify.ai/schemas/" not in schema_str:
            return False
    except (TypeError, ValueError, OverflowError):
        # JSON serialization failed (e.g., circular references, non-serializable objects)
        # Fall back to recursive detection
        logger.debug("JSON serialization failed for schema, using recursive detection")
        return _has_dify_refs_recursive(schema)

    # Phase 2: Precise recursive validation
    # Only executed for schemas that passed string pre-filtering
    return _has_dify_refs_recursive(schema)


def _has_dify_refs(schema: SchemaType) -> bool:
    """
    Check if a schema contains any Dify $ref references

    Uses hybrid detection for optimal performance:
    - Fast string scan for quick elimination
    - Precise recursive check for validation

    Args:
        schema: Schema to check for references

    Returns:
        True if any Dify $ref is found, False otherwise
    """
    return _has_dify_refs_hybrid(schema)


def parse_dify_schema_uri(uri: str) -> tuple[str, str]:
    """
    Parse a Dify schema URI to extract version and schema name

    Args:
        uri: Schema URI to parse

    Returns:
        Tuple of (version, schema_name) or ("", "") if invalid
    """
    match = _DIFY_SCHEMA_PATTERN.match(uri)
    if not match:
        return "", ""
    return match.group(1), match.group(2)
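To show how the resolver is meant to be used end to end, here is a sketch built around the resolve_dify_schema_refs entry point exported from the package __init__; the input schema is hypothetical and the builtin v1 file schema is assumed to be registered:

from core.schemas import resolve_dify_schema_refs

# Hypothetical input: a schema embedding the builtin file schema by reference.
schema_with_ref = {
    "type": "object",
    "properties": {
        "file": {"$ref": "https://dify.ai/schemas/v1/file.json"},
    },
}

resolved = resolve_dify_schema_refs(schema_with_ref)
# The $ref node is replaced by the file schema with $id, $schema and version stripped.
print(resolved["properties"]["file"]["properties"]["name"])  # {'type': 'string', 'description': 'file name'}

# Schemas without Dify refs take the fast path and come back as the same object, untouched.
plain = {"type": "string"}
assert resolve_dify_schema_refs(plain) is plain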


@@ -0,0 +1,62 @@
from collections.abc import Mapping
from typing import Any

from core.schemas.registry import SchemaRegistry


class SchemaManager:
    """Schema manager provides high-level schema operations"""

    def __init__(self, registry: SchemaRegistry | None = None):
        self.registry = registry or SchemaRegistry.default_registry()

    def get_all_schema_definitions(self, version: str = "v1") -> list[Mapping[str, Any]]:
        """
        Get all JSON Schema definitions for a specific version

        Args:
            version: Schema version, defaults to v1

        Returns:
            List of schema definitions; each element contains name, label, and schema fields
        """
        return self.registry.get_all_schemas_for_version(version)

    def get_schema_by_name(self, schema_name: str, version: str = "v1") -> Mapping[str, Any] | None:
        """
        Get a specific schema by name

        Args:
            schema_name: Schema name
            version: Schema version, defaults to v1

        Returns:
            Dictionary containing name and schema, returns None if not found
        """
        uri = f"https://dify.ai/schemas/{version}/{schema_name}.json"
        schema = self.registry.get_schema(uri)
        if schema:
            return {"name": schema_name, "schema": schema}
        return None

    def list_available_schemas(self, version: str = "v1") -> list[str]:
        """
        List all available schema names for a specific version

        Args:
            version: Schema version, defaults to v1

        Returns:
            List of schema names
        """
        return self.registry.list_schemas(version)

    def list_available_versions(self) -> list[str]:
        """
        List all available schema versions

        Returns:
            List of versions
        """
        return self.registry.list_versions()
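Finally, a usage sketch for the manager; the module path in the import is an assumption (the file name is not shown in this view), and the printed values are illustrative:

from core.schemas.schema_manager import SchemaManager  # module path assumed, not shown in this diff

manager = SchemaManager()                        # wraps the default registry
print(manager.list_available_versions())         # e.g. ["v1"]
print(manager.list_available_schemas("v1"))      # e.g. ["file", "general_structure", ...]

qa = manager.get_schema_by_name("qa_structure")
if qa is not None:
    print(qa["schema"]["title"])                 # "Q&A Structure"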