2025-12-01 17:21:38 +08:00
parent 32fee2b8ab
commit fab8c13cb3
7511 changed files with 996300 additions and 0 deletions


@@ -0,0 +1,5 @@
# Schema management package
from .resolver import resolve_dify_schema_refs
__all__ = ["resolve_dify_schema_refs"]


@@ -0,0 +1,43 @@
{
  "$id": "https://dify.ai/schemas/v1/file.json",
  "$schema": "http://json-schema.org/draft-07/schema#",
  "version": "1.0.0",
  "type": "object",
  "title": "File",
  "description": "Schema for file objects (v1)",
  "properties": {
    "name": {
      "type": "string",
      "description": "file name"
    },
    "size": {
      "type": "number",
      "description": "file size"
    },
    "extension": {
      "type": "string",
      "description": "file extension"
    },
    "type": {
      "type": "string",
      "description": "file type"
    },
    "mime_type": {
      "type": "string",
      "description": "file mime type"
    },
    "transfer_method": {
      "type": "string",
      "description": "file transfer method"
    },
    "url": {
      "type": "string",
      "description": "file url"
    },
    "related_id": {
      "type": "string",
      "description": "file related id"
    }
  },
  "required": ["name"]
}
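To make the intent of this definition concrete, here is a minimal sketch (Python, values invented for illustration) of a consumer schema that pulls in the file definition by $ref, plus an object that would satisfy it once the reference is resolved; only "name" is required:

# Hypothetical consumer schema: an output field that reuses the builtin file schema via $ref.
consumer_schema = {
    "type": "object",
    "properties": {
        "attachment": {"$ref": "https://dify.ai/schemas/v1/file.json"},
    },
}

# An instance that would validate against the resolved definition; "name" is the only required key.
example_file = {
    "name": "report.pdf",
    "size": 10240,
    "extension": ".pdf",
    "mime_type": "application/pdf",
}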


@@ -0,0 +1,11 @@
{
  "$id": "https://dify.ai/schemas/v1/general_structure.json",
  "$schema": "http://json-schema.org/draft-07/schema#",
  "version": "1.0.0",
  "type": "array",
  "title": "General Structure",
  "description": "Schema for general structure (v1) - array of strings",
  "items": {
    "type": "string"
  }
}
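For completeness, an instance of this schema is just a list of strings; a minimal illustrative example:

# Illustrative instance of general_structure: a plain list of text chunks.
general_structure_example = ["first chunk of text", "second chunk of text"]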


@@ -0,0 +1,36 @@
{
  "$id": "https://dify.ai/schemas/v1/parent_child_structure.json",
  "$schema": "http://json-schema.org/draft-07/schema#",
  "version": "1.0.0",
  "type": "object",
  "title": "Parent-Child Structure",
  "description": "Schema for parent-child structure (v1)",
  "properties": {
    "parent_mode": {
      "type": "string",
      "description": "The mode of parent-child relationship"
    },
    "parent_child_chunks": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "parent_content": {
            "type": "string",
            "description": "The parent content"
          },
          "child_contents": {
            "type": "array",
            "items": {
              "type": "string"
            },
            "description": "List of child contents"
          }
        },
        "required": ["parent_content", "child_contents"]
      },
      "description": "List of parent-child chunk pairs"
    }
  },
  "required": ["parent_mode", "parent_child_chunks"]
}
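The shape this schema describes is easier to see from an instance; the following sketch uses made-up content and an assumed parent_mode value, since the schema itself does not constrain it:

# Illustrative instance of parent_child_structure (contents and parent_mode are invented).
parent_child_example = {
    "parent_mode": "paragraph",
    "parent_child_chunks": [
        {
            "parent_content": "Introduction section of the document.",
            "child_contents": [
                "First sentence of the introduction.",
                "Second sentence of the introduction.",
            ],
        },
    ],
}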


@@ -0,0 +1,29 @@
{
  "$id": "https://dify.ai/schemas/v1/qa_structure.json",
  "$schema": "http://json-schema.org/draft-07/schema#",
  "version": "1.0.0",
  "type": "object",
  "title": "Q&A Structure",
  "description": "Schema for question-answer structure (v1)",
  "properties": {
    "qa_chunks": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "question": {
            "type": "string",
            "description": "The question"
          },
          "answer": {
            "type": "string",
            "description": "The answer"
          }
        },
        "required": ["question", "answer"]
      },
      "description": "List of question-answer pairs"
    }
  },
  "required": ["qa_chunks"]
}
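Likewise, a minimal instance of the Q&A schema looks like this (question and answer text invented for illustration):

# Illustrative instance of qa_structure.
qa_example = {
    "qa_chunks": [
        {
            "question": "Which field is required on a file object?",
            "answer": "Only the name field is required; all other fields are optional.",
        },
    ],
}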


@@ -0,0 +1,129 @@
import json
import logging
import threading
from collections.abc import Mapping, MutableMapping
from pathlib import Path
from typing import Any, ClassVar, Optional


class SchemaRegistry:
    """Schema registry manages JSON schemas with version support"""

    logger: ClassVar[logging.Logger] = logging.getLogger(__name__)

    _default_instance: ClassVar[Optional["SchemaRegistry"]] = None
    _lock: ClassVar[threading.Lock] = threading.Lock()

    def __init__(self, base_dir: str):
        self.base_dir = Path(base_dir)
        self.versions: MutableMapping[str, MutableMapping[str, Any]] = {}
        self.metadata: MutableMapping[str, MutableMapping[str, Any]] = {}

    @classmethod
    def default_registry(cls) -> "SchemaRegistry":
        """Returns the default schema registry for builtin schemas (thread-safe singleton)"""
        if cls._default_instance is None:
            with cls._lock:
                # Double-checked locking pattern
                if cls._default_instance is None:
                    current_dir = Path(__file__).parent
                    schema_dir = current_dir / "builtin" / "schemas"
                    registry = cls(str(schema_dir))
                    registry.load_all_versions()
                    cls._default_instance = registry
        return cls._default_instance

    def load_all_versions(self) -> None:
        """Scans the schema directory and loads all versions"""
        if not self.base_dir.exists():
            return

        for entry in self.base_dir.iterdir():
            if not entry.is_dir():
                continue
            version = entry.name
            if not version.startswith("v"):
                continue
            self._load_version_dir(version, entry)

    def _load_version_dir(self, version: str, version_dir: Path) -> None:
        """Loads all schemas in a version directory"""
        if not version_dir.exists():
            return

        if version not in self.versions:
            self.versions[version] = {}

        for entry in version_dir.iterdir():
            if entry.suffix != ".json":
                continue
            schema_name = entry.stem
            self._load_schema(version, schema_name, entry)

    def _load_schema(self, version: str, schema_name: str, schema_path: Path) -> None:
        """Loads a single schema file"""
        try:
            with open(schema_path, encoding="utf-8") as f:
                schema = json.load(f)

            # Store the schema
            self.versions[version][schema_name] = schema

            # Extract and store metadata
            uri = f"https://dify.ai/schemas/{version}/{schema_name}.json"
            metadata = {
                "version": version,
                "title": schema.get("title", ""),
                "description": schema.get("description", ""),
                "deprecated": schema.get("deprecated", False),
            }
            self.metadata[uri] = metadata
        except (OSError, json.JSONDecodeError) as e:
            self.logger.warning("Failed to load schema %s/%s: %s", version, schema_name, e)

    def get_schema(self, uri: str) -> Any | None:
        """Retrieves a schema by URI with version support"""
        version, schema_name = self._parse_uri(uri)
        if not version or not schema_name:
            return None

        version_schemas = self.versions.get(version)
        if not version_schemas:
            return None
        return version_schemas.get(schema_name)

    def _parse_uri(self, uri: str) -> tuple[str, str]:
        """Parses a schema URI to extract version and schema name"""
        from core.schemas.resolver import parse_dify_schema_uri

        return parse_dify_schema_uri(uri)

    def list_versions(self) -> list[str]:
        """Returns all available versions"""
        return sorted(self.versions.keys())

    def list_schemas(self, version: str) -> list[str]:
        """Returns all schemas in a specific version"""
        version_schemas = self.versions.get(version)
        if not version_schemas:
            return []
        return sorted(version_schemas.keys())

    def get_all_schemas_for_version(self, version: str = "v1") -> list[Mapping[str, Any]]:
        """Returns all schemas for a version in the API format"""
        version_schemas = self.versions.get(version, {})
        result: list[Mapping[str, Any]] = []
        for schema_name, schema in version_schemas.items():
            result.append({"name": schema_name, "label": schema.get("title", schema_name), "schema": schema})
        return result
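A short usage sketch for the registry follows. It assumes the builtin v1 schema files added in this commit are on disk next to the registry module (the default_registry path above); the printed values are examples, not guaranteed output:

from core.schemas.registry import SchemaRegistry

registry = SchemaRegistry.default_registry()   # thread-safe singleton over builtin/schemas
print(registry.list_versions())                # e.g. ["v1"]
print(registry.list_schemas("v1"))             # e.g. ["file", "general_structure", "parent_child_structure", "qa_structure"]

schema = registry.get_schema("https://dify.ai/schemas/v1/file.json")
if schema is not None:
    print(schema["title"])                     # "File"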


@@ -0,0 +1,397 @@
import logging
import re
import threading
from collections import deque
from dataclasses import dataclass
from typing import Any, Union

from core.schemas.registry import SchemaRegistry

logger = logging.getLogger(__name__)

# Type aliases for better clarity
SchemaType = Union[dict[str, Any], list[Any], str, int, float, bool, None]
SchemaDict = dict[str, Any]

# Pre-compiled pattern for better performance
_DIFY_SCHEMA_PATTERN = re.compile(r"^https://dify\.ai/schemas/(v\d+)/(.+)\.json$")


class SchemaResolutionError(Exception):
    """Base exception for schema resolution errors"""

    pass


class CircularReferenceError(SchemaResolutionError):
    """Raised when a circular reference is detected"""

    def __init__(self, ref_uri: str, ref_path: list[str]):
        self.ref_uri = ref_uri
        self.ref_path = ref_path
        super().__init__(f"Circular reference detected: {ref_uri} in path {' -> '.join(ref_path)}")


class MaxDepthExceededError(SchemaResolutionError):
    """Raised when maximum resolution depth is exceeded"""

    def __init__(self, max_depth: int):
        self.max_depth = max_depth
        super().__init__(f"Maximum resolution depth ({max_depth}) exceeded")


class SchemaNotFoundError(SchemaResolutionError):
    """Raised when a referenced schema cannot be found"""

    def __init__(self, ref_uri: str):
        self.ref_uri = ref_uri
        super().__init__(f"Schema not found: {ref_uri}")


@dataclass
class QueueItem:
    """Represents an item in the BFS queue"""

    current: Any
    parent: Any | None
    key: str | int | None
    depth: int
    ref_path: set[str]


class SchemaResolver:
    """Resolver for Dify schema references with caching and optimizations"""

    _cache: dict[str, SchemaDict] = {}
    _cache_lock = threading.Lock()

    def __init__(self, registry: SchemaRegistry | None = None, max_depth: int = 10):
        """
        Initialize the schema resolver

        Args:
            registry: Schema registry to use (defaults to default registry)
            max_depth: Maximum depth for reference resolution
        """
        self.registry = registry or SchemaRegistry.default_registry()
        self.max_depth = max_depth

    @classmethod
    def clear_cache(cls) -> None:
        """Clear the global schema cache"""
        with cls._cache_lock:
            cls._cache.clear()

    def resolve(self, schema: SchemaType) -> SchemaType:
        """
        Resolve all $ref references in the schema

        Performance optimization: quickly checks for $ref presence before processing.

        Args:
            schema: Schema to resolve

        Returns:
            Resolved schema with all references expanded. Circular references are
            marked with a "$circular_ref" flag and unresolvable references are left
            in place; both cases are logged as warnings rather than raised.

        Raises:
            MaxDepthExceededError: If max depth exceeded
        """
        if not isinstance(schema, (dict, list)):
            return schema

        # Fast path: if no Dify refs found, return original schema unchanged
        # This avoids expensive deepcopy and BFS traversal for schemas without refs
        if not _has_dify_refs(schema):
            return schema

        # Slow path: schema contains refs, perform full resolution
        import copy

        result = copy.deepcopy(schema)

        # Initialize BFS queue
        queue = deque([QueueItem(current=result, parent=None, key=None, depth=0, ref_path=set())])

        while queue:
            item = queue.popleft()
            # Process the current item
            self._process_queue_item(queue, item)

        return result

    def _process_queue_item(self, queue: deque, item: QueueItem) -> None:
        """Process a single queue item"""
        if isinstance(item.current, dict):
            self._process_dict(queue, item)
        elif isinstance(item.current, list):
            self._process_list(queue, item)

    def _process_dict(self, queue: deque, item: QueueItem) -> None:
        """Process a dictionary item"""
        ref_uri = item.current.get("$ref")
        if ref_uri and _is_dify_schema_ref(ref_uri):
            # Handle $ref resolution
            self._resolve_ref(queue, item, ref_uri)
        else:
            # Process nested items
            for key, value in item.current.items():
                if isinstance(value, (dict, list)):
                    next_depth = item.depth + 1
                    if next_depth >= self.max_depth:
                        raise MaxDepthExceededError(self.max_depth)
                    queue.append(
                        QueueItem(current=value, parent=item.current, key=key, depth=next_depth, ref_path=item.ref_path)
                    )

    def _process_list(self, queue: deque, item: QueueItem) -> None:
        """Process a list item"""
        for idx, value in enumerate(item.current):
            if isinstance(value, (dict, list)):
                next_depth = item.depth + 1
                if next_depth >= self.max_depth:
                    raise MaxDepthExceededError(self.max_depth)
                queue.append(
                    QueueItem(current=value, parent=item.current, key=idx, depth=next_depth, ref_path=item.ref_path)
                )

    def _resolve_ref(self, queue: deque, item: QueueItem, ref_uri: str) -> None:
        """Resolve a $ref reference"""
        # Check for circular reference
        if ref_uri in item.ref_path:
            # Mark as circular and skip
            item.current["$circular_ref"] = True
            logger.warning("Circular reference detected: %s", ref_uri)
            return

        # Get resolved schema (from cache or registry)
        resolved_schema = self._get_resolved_schema(ref_uri)
        if not resolved_schema:
            logger.warning("Schema not found: %s", ref_uri)
            return

        # Update ref path
        new_ref_path = item.ref_path | {ref_uri}

        # Replace the reference with resolved schema
        next_depth = item.depth + 1
        if next_depth >= self.max_depth:
            raise MaxDepthExceededError(self.max_depth)

        if item.parent is None:
            # Root level replacement
            item.current.clear()
            item.current.update(resolved_schema)
            queue.append(
                QueueItem(current=item.current, parent=None, key=None, depth=next_depth, ref_path=new_ref_path)
            )
        else:
            # Update parent container
            item.parent[item.key] = resolved_schema.copy()
            queue.append(
                QueueItem(
                    current=item.parent[item.key],
                    parent=item.parent,
                    key=item.key,
                    depth=next_depth,
                    ref_path=new_ref_path,
                )
            )

    def _get_resolved_schema(self, ref_uri: str) -> SchemaDict | None:
        """Get resolved schema from cache or registry"""
        # Check cache first
        with self._cache_lock:
            if ref_uri in self._cache:
                return self._cache[ref_uri].copy()

        # Fetch from registry
        schema = self.registry.get_schema(ref_uri)
        if not schema:
            return None

        # Clean and cache
        cleaned = _remove_metadata_fields(schema)
        with self._cache_lock:
            self._cache[ref_uri] = cleaned
        return cleaned.copy()


def resolve_dify_schema_refs(
    schema: SchemaType, registry: SchemaRegistry | None = None, max_depth: int = 30
) -> SchemaType:
    """
    Resolve $ref references in Dify schema to actual schema content

    This is a convenience function that creates a resolver and resolves the schema.
    Performance optimization: quickly checks for $ref presence before processing.

    Args:
        schema: Schema object that may contain $ref references
        registry: Optional schema registry, defaults to default registry
        max_depth: Maximum depth to prevent infinite loops (default: 30)

    Returns:
        Schema with all $ref references resolved to actual content. Circular or
        unresolvable references are logged and left marked in the output instead
        of raising.

    Raises:
        MaxDepthExceededError: If maximum depth exceeded
    """
    # Fast path: if no Dify refs found, return original schema unchanged
    # This avoids expensive deepcopy and BFS traversal for schemas without refs
    if not _has_dify_refs(schema):
        return schema

    # Slow path: schema contains refs, perform full resolution
    resolver = SchemaResolver(registry, max_depth)
    return resolver.resolve(schema)


def _remove_metadata_fields(schema: dict) -> dict:
    """
    Remove metadata fields from schema that shouldn't be included in resolved output

    Args:
        schema: Schema dictionary

    Returns:
        Cleaned schema without metadata fields
    """
    # Create a copy and remove metadata fields
    cleaned = schema.copy()
    metadata_fields = ["$id", "$schema", "version"]
    for field in metadata_fields:
        cleaned.pop(field, None)
    return cleaned


def _is_dify_schema_ref(ref_uri: Any) -> bool:
    """
    Check if the reference URI is a Dify schema reference

    Args:
        ref_uri: URI to check

    Returns:
        True if it's a Dify schema reference
    """
    if not isinstance(ref_uri, str):
        return False
    # Use pre-compiled pattern for better performance
    return bool(_DIFY_SCHEMA_PATTERN.match(ref_uri))


def _has_dify_refs_recursive(schema: SchemaType) -> bool:
    """
    Recursively check if a schema contains any Dify $ref references

    This is the fallback method when string-based detection is not possible.

    Args:
        schema: Schema to check for references

    Returns:
        True if any Dify $ref is found, False otherwise
    """
    if isinstance(schema, dict):
        # Check if this dict has a $ref field
        ref_uri = schema.get("$ref")
        if ref_uri and _is_dify_schema_ref(ref_uri):
            return True
        # Check nested values
        for value in schema.values():
            if _has_dify_refs_recursive(value):
                return True
    elif isinstance(schema, list):
        # Check each item in the list
        for item in schema:
            if _has_dify_refs_recursive(item):
                return True
    # Primitive types don't contain refs
    return False


def _has_dify_refs_hybrid(schema: SchemaType) -> bool:
    """
    Hybrid detection: fast string scan followed by precise recursive check

    Performance optimization using two-phase detection:
    1. Fast string scan to quickly eliminate schemas without $ref
    2. Precise recursive validation only for potential candidates

    Args:
        schema: Schema to check for references

    Returns:
        True if any Dify $ref is found, False otherwise
    """
    # Phase 1: Fast string-based pre-filtering
    try:
        import json

        schema_str = json.dumps(schema, separators=(",", ":"))

        # Quick elimination: no $ref at all
        if '"$ref"' not in schema_str:
            return False

        # Quick elimination: no Dify schema URLs
        if "https://dify.ai/schemas/" not in schema_str:
            return False
    except (TypeError, ValueError, OverflowError):
        # JSON serialization failed (e.g., circular references, non-serializable objects)
        # Fall back to recursive detection
        logger.debug("JSON serialization failed for schema, using recursive detection")
        return _has_dify_refs_recursive(schema)

    # Phase 2: Precise recursive validation
    # Only executed for schemas that passed string pre-filtering
    return _has_dify_refs_recursive(schema)


def _has_dify_refs(schema: SchemaType) -> bool:
    """
    Check if a schema contains any Dify $ref references

    Uses hybrid detection for optimal performance:
    - Fast string scan for quick elimination
    - Precise recursive check for validation

    Args:
        schema: Schema to check for references

    Returns:
        True if any Dify $ref is found, False otherwise
    """
    return _has_dify_refs_hybrid(schema)


def parse_dify_schema_uri(uri: str) -> tuple[str, str]:
    """
    Parse a Dify schema URI to extract version and schema name

    Args:
        uri: Schema URI to parse

    Returns:
        Tuple of (version, schema_name) or ("", "") if invalid
    """
    match = _DIFY_SCHEMA_PATTERN.match(uri)
    if not match:
        return "", ""
    return match.group(1), match.group(2)
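To show how the resolver is meant to be used end to end, here is a sketch built around the resolve_dify_schema_refs entry point exported from the package __init__; the input schema is hypothetical and the builtin v1 file schema is assumed to be registered:

from core.schemas import resolve_dify_schema_refs

# Hypothetical input: a schema embedding the builtin file schema by reference.
schema_with_ref = {
    "type": "object",
    "properties": {
        "file": {"$ref": "https://dify.ai/schemas/v1/file.json"},
    },
}

resolved = resolve_dify_schema_refs(schema_with_ref)
# The $ref node is replaced by the file schema with $id, $schema and version stripped.
print(resolved["properties"]["file"]["properties"]["name"])  # {'type': 'string', 'description': 'file name'}

# Schemas without Dify refs take the fast path and come back as the same object, untouched.
plain = {"type": "string"}
assert resolve_dify_schema_refs(plain) is plain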


@@ -0,0 +1,62 @@
from collections.abc import Mapping
from typing import Any

from core.schemas.registry import SchemaRegistry


class SchemaManager:
    """Schema manager provides high-level schema operations"""

    def __init__(self, registry: SchemaRegistry | None = None):
        self.registry = registry or SchemaRegistry.default_registry()

    def get_all_schema_definitions(self, version: str = "v1") -> list[Mapping[str, Any]]:
        """
        Get all JSON Schema definitions for a specific version

        Args:
            version: Schema version, defaults to v1

        Returns:
            List of schema definitions; each element contains name, label, and schema fields
        """
        return self.registry.get_all_schemas_for_version(version)

    def get_schema_by_name(self, schema_name: str, version: str = "v1") -> Mapping[str, Any] | None:
        """
        Get a specific schema by name

        Args:
            schema_name: Schema name
            version: Schema version, defaults to v1

        Returns:
            Dictionary containing name and schema, returns None if not found
        """
        uri = f"https://dify.ai/schemas/{version}/{schema_name}.json"
        schema = self.registry.get_schema(uri)
        if schema:
            return {"name": schema_name, "schema": schema}
        return None

    def list_available_schemas(self, version: str = "v1") -> list[str]:
        """
        List all available schema names for a specific version

        Args:
            version: Schema version, defaults to v1

        Returns:
            List of schema names
        """
        return self.registry.list_schemas(version)

    def list_available_versions(self) -> list[str]:
        """
        List all available schema versions

        Returns:
            List of versions
        """
        return self.registry.list_versions()
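Finally, a usage sketch for the manager; the module path in the import is an assumption (the file name is not shown in this view), and the printed values are illustrative:

from core.schemas.schema_manager import SchemaManager  # module path assumed, not shown in this diff

manager = SchemaManager()                        # wraps the default registry
print(manager.list_available_versions())         # e.g. ["v1"]
print(manager.list_available_schemas("v1"))      # e.g. ["file", "general_structure", ...]

qa = manager.get_schema_by_name("qa_structure")
if qa is not None:
    print(qa["schema"]["title"])                 # "Q&A Structure"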