2025-12-01 17:21:38 +08:00
parent 32fee2b8ab
commit fab8c13cb3
7511 changed files with 996300 additions and 0 deletions

View File: core/ops/tencent_trace/client.py

@@ -0,0 +1,565 @@
"""
Tencent APM Trace Client - handles network operations, metrics, and API communication
"""
from __future__ import annotations
import importlib
import json
import logging
import os
import socket
from typing import TYPE_CHECKING
from urllib.parse import urlparse
try:
from importlib.metadata import version
except ImportError:
from importlib_metadata import version # type: ignore[import-not-found]
if TYPE_CHECKING:
    from opentelemetry.metrics import Histogram, Meter
    from opentelemetry.sdk.metrics import MeterProvider
    from opentelemetry.sdk.metrics.export import MetricReader
from opentelemetry import trace as trace_api
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.semconv.resource import ResourceAttributes
from opentelemetry.trace import SpanKind
from opentelemetry.util.types import AttributeValue
from configs import dify_config
from .entities.semconv import (
GEN_AI_SERVER_TIME_TO_FIRST_TOKEN,
GEN_AI_STREAMING_TIME_TO_GENERATE,
GEN_AI_TOKEN_USAGE,
GEN_AI_TRACE_DURATION,
LLM_OPERATION_DURATION,
)
from .entities.tencent_trace_entity import SpanData
logger = logging.getLogger(__name__)
def _get_opentelemetry_sdk_version() -> str:
"""Get OpenTelemetry SDK version dynamically."""
try:
return version("opentelemetry-sdk")
except Exception:
logger.debug("Failed to get opentelemetry-sdk version, using default")
return "1.27.0" # fallback version
class TencentTraceClient:
"""Tencent APM trace client using OpenTelemetry OTLP exporter"""
def __init__(
self,
service_name: str,
endpoint: str,
token: str,
max_queue_size: int = 1000,
schedule_delay_sec: int = 5,
max_export_batch_size: int = 50,
metrics_export_interval_sec: int = 10,
):
self.endpoint = endpoint
self.token = token
self.service_name = service_name
self.metrics_export_interval_sec = metrics_export_interval_sec
self.resource = Resource(
attributes={
ResourceAttributes.SERVICE_NAME: service_name,
ResourceAttributes.SERVICE_VERSION: f"dify-{dify_config.project.version}-{dify_config.COMMIT_SHA}",
ResourceAttributes.DEPLOYMENT_ENVIRONMENT: f"{dify_config.DEPLOY_ENV}-{dify_config.EDITION}",
ResourceAttributes.HOST_NAME: socket.gethostname(),
ResourceAttributes.TELEMETRY_SDK_LANGUAGE: "python",
ResourceAttributes.TELEMETRY_SDK_NAME: "opentelemetry",
ResourceAttributes.TELEMETRY_SDK_VERSION: _get_opentelemetry_sdk_version(),
}
)
# Prepare gRPC endpoint/metadata
grpc_endpoint, insecure, _, _ = self._resolve_grpc_target(endpoint)
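        # gRPC metadata keys must be lowercase; the token is attached as a bearer header on every export RPC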
headers = (("authorization", f"Bearer {token}"),)
self.exporter = OTLPSpanExporter(
endpoint=grpc_endpoint,
headers=headers,
insecure=insecure,
timeout=30,
)
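        # BatchSpanProcessor buffers spans in memory and exports them from a background thread;
        # schedule_delay_millis sets the flush cadence and max_export_batch_size caps each batch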
self.tracer_provider = TracerProvider(resource=self.resource)
self.span_processor = BatchSpanProcessor(
span_exporter=self.exporter,
max_queue_size=max_queue_size,
schedule_delay_millis=schedule_delay_sec * 1000,
max_export_batch_size=max_export_batch_size,
)
self.tracer_provider.add_span_processor(self.span_processor)
        # Use the Dify API version as the tracer version
self.tracer = self.tracer_provider.get_tracer("dify-sdk", dify_config.project.version)
# Store span contexts for parent-child relationships
self.span_contexts: dict[int, trace_api.SpanContext] = {}
self.meter: Meter | None = None
self.meter_provider: MeterProvider | None = None
self.hist_llm_duration: Histogram | None = None
self.hist_token_usage: Histogram | None = None
self.hist_time_to_first_token: Histogram | None = None
self.hist_time_to_generate: Histogram | None = None
self.hist_trace_duration: Histogram | None = None
self.metric_reader: MetricReader | None = None
# Metrics exporter and instruments
try:
from opentelemetry.sdk.metrics import Histogram, MeterProvider
from opentelemetry.sdk.metrics.export import AggregationTemporality, PeriodicExportingMetricReader
protocol = os.getenv("OTEL_EXPORTER_OTLP_PROTOCOL", "").strip().lower()
use_http_protobuf = protocol in {"http/protobuf", "http-protobuf"}
use_http_json = protocol in {"http/json", "http-json"}
# Tencent APM works best with delta aggregation temporality
preferred_temporality: dict[type, AggregationTemporality] = {Histogram: AggregationTemporality.DELTA}
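            # With delta temporality each export carries only the change since the previous export,
            # rather than a cumulative total since process start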
def _create_metric_exporter(exporter_cls, **kwargs):
"""Create metric exporter with preferred_temporality support"""
try:
return exporter_cls(**kwargs, preferred_temporality=preferred_temporality)
except Exception:
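                    # Older exporter versions do not accept preferred_temporality; fall back to defaults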
return exporter_cls(**kwargs)
metric_reader = None
if use_http_json:
exporter_cls = None
for mod_path in (
"opentelemetry.exporter.otlp.http.json.metric_exporter",
"opentelemetry.exporter.otlp.json.metric_exporter",
):
try:
mod = importlib.import_module(mod_path)
exporter_cls = getattr(mod, "OTLPMetricExporter", None)
if exporter_cls:
break
except Exception:
continue
if exporter_cls is not None:
metric_exporter = _create_metric_exporter(
exporter_cls,
endpoint=endpoint,
headers={"authorization": f"Bearer {token}"},
)
else:
from opentelemetry.exporter.otlp.proto.http.metric_exporter import (
OTLPMetricExporter as HttpMetricExporter,
)
metric_exporter = _create_metric_exporter(
HttpMetricExporter,
endpoint=endpoint,
headers={"authorization": f"Bearer {token}"},
)
metric_reader = PeriodicExportingMetricReader(
metric_exporter, export_interval_millis=self.metrics_export_interval_sec * 1000
)
elif use_http_protobuf:
from opentelemetry.exporter.otlp.proto.http.metric_exporter import (
OTLPMetricExporter as HttpMetricExporter,
)
metric_exporter = _create_metric_exporter(
HttpMetricExporter,
endpoint=endpoint,
headers={"authorization": f"Bearer {token}"},
)
metric_reader = PeriodicExportingMetricReader(
metric_exporter, export_interval_millis=self.metrics_export_interval_sec * 1000
)
else:
from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import (
OTLPMetricExporter as GrpcMetricExporter,
)
m_grpc_endpoint, m_insecure, _, _ = self._resolve_grpc_target(endpoint)
metric_exporter = _create_metric_exporter(
GrpcMetricExporter,
endpoint=m_grpc_endpoint,
headers={"authorization": f"Bearer {token}"},
insecure=m_insecure,
)
metric_reader = PeriodicExportingMetricReader(
metric_exporter, export_interval_millis=self.metrics_export_interval_sec * 1000
)
if metric_reader is not None:
# Use instance-level MeterProvider instead of global to support config changes
# without worker restart. Each TencentTraceClient manages its own MeterProvider.
provider = MeterProvider(resource=self.resource, metric_readers=[metric_reader])
self.meter_provider = provider
self.meter = provider.get_meter("dify-sdk", dify_config.project.version)
# LLM operation duration histogram
self.hist_llm_duration = self.meter.create_histogram(
name=LLM_OPERATION_DURATION,
unit="s",
description="LLM operation duration (seconds)",
)
# Token usage histogram with exponential buckets
self.hist_token_usage = self.meter.create_histogram(
name=GEN_AI_TOKEN_USAGE,
unit="token",
description="Number of tokens used in prompt and completions",
)
# Time to first token histogram
self.hist_time_to_first_token = self.meter.create_histogram(
name=GEN_AI_SERVER_TIME_TO_FIRST_TOKEN,
unit="s",
description="Time to first token for streaming LLM responses (seconds)",
)
# Time to generate histogram
self.hist_time_to_generate = self.meter.create_histogram(
name=GEN_AI_STREAMING_TIME_TO_GENERATE,
unit="s",
description="Total time to generate streaming LLM responses (seconds)",
)
# Trace duration histogram
self.hist_trace_duration = self.meter.create_histogram(
name=GEN_AI_TRACE_DURATION,
unit="s",
description="End-to-end GenAI trace duration (seconds)",
)
self.metric_reader = metric_reader
else:
self.meter = None
self.meter_provider = None
self.hist_llm_duration = None
self.hist_token_usage = None
self.hist_time_to_first_token = None
self.hist_time_to_generate = None
self.hist_trace_duration = None
self.metric_reader = None
except Exception:
logger.exception("[Tencent APM] Metrics initialization failed; metrics disabled")
self.meter = None
self.meter_provider = None
self.hist_llm_duration = None
self.hist_token_usage = None
self.hist_time_to_first_token = None
self.hist_time_to_generate = None
self.hist_trace_duration = None
self.metric_reader = None
def add_span(self, span_data: SpanData) -> None:
"""Create and export span using OpenTelemetry Tracer API"""
try:
self._create_and_export_span(span_data)
logger.debug("[Tencent APM] Created span: %s", span_data.name)
except Exception:
logger.exception("[Tencent APM] Failed to create span: %s", span_data.name)
# Metrics recording API
def record_llm_duration(self, latency_seconds: float, attributes: dict[str, str] | None = None) -> None:
"""Record LLM operation duration histogram in seconds."""
try:
if not hasattr(self, "hist_llm_duration") or self.hist_llm_duration is None:
return
attrs: dict[str, str] = {}
if attributes:
for k, v in attributes.items():
attrs[k] = str(v) if not isinstance(v, (str, int, float, bool)) else v # type: ignore[assignment]
logger.info(
"[Tencent Metrics] Metric: %s | Value: %.4f | Attributes: %s",
LLM_OPERATION_DURATION,
latency_seconds,
json.dumps(attrs, ensure_ascii=False),
)
self.hist_llm_duration.record(latency_seconds, attrs) # type: ignore[attr-defined]
except Exception:
logger.debug("[Tencent APM] Failed to record LLM duration", exc_info=True)
def record_token_usage(
self,
token_count: int,
token_type: str,
operation_name: str,
request_model: str,
response_model: str,
server_address: str,
provider: str,
) -> None:
"""Record token usage histogram.
Args:
token_count: Number of tokens used
token_type: "input" or "output"
operation_name: Operation name (e.g., "chat")
request_model: Model used in request
response_model: Model used in response
server_address: Server address
provider: Model provider name
"""
try:
if not hasattr(self, "hist_token_usage") or self.hist_token_usage is None:
return
attributes = {
"gen_ai.operation.name": operation_name,
"gen_ai.request.model": request_model,
"gen_ai.response.model": response_model,
"gen_ai.system": provider,
"gen_ai.token.type": token_type,
"server.address": server_address,
}
logger.info(
"[Tencent Metrics] Metric: %s | Value: %d | Attributes: %s",
GEN_AI_TOKEN_USAGE,
token_count,
json.dumps(attributes, ensure_ascii=False),
)
self.hist_token_usage.record(token_count, attributes) # type: ignore[attr-defined]
except Exception:
logger.debug("[Tencent APM] Failed to record token usage", exc_info=True)
def record_time_to_first_token(
self, ttft_seconds: float, provider: str, model: str, operation_name: str = "chat"
) -> None:
"""Record time to first token histogram.
Args:
ttft_seconds: Time to first token in seconds
provider: Model provider name
model: Model name
operation_name: Operation name (default: "chat")
"""
try:
if not hasattr(self, "hist_time_to_first_token") or self.hist_time_to_first_token is None:
return
attributes = {
"gen_ai.operation.name": operation_name,
"gen_ai.system": provider,
"gen_ai.request.model": model,
"gen_ai.response.model": model,
"stream": "true",
}
logger.info(
"[Tencent Metrics] Metric: %s | Value: %.4f | Attributes: %s",
GEN_AI_SERVER_TIME_TO_FIRST_TOKEN,
ttft_seconds,
json.dumps(attributes, ensure_ascii=False),
)
self.hist_time_to_first_token.record(ttft_seconds, attributes) # type: ignore[attr-defined]
except Exception:
logger.debug("[Tencent APM] Failed to record time to first token", exc_info=True)
def record_time_to_generate(
self, ttg_seconds: float, provider: str, model: str, operation_name: str = "chat"
) -> None:
"""Record time to generate histogram.
Args:
ttg_seconds: Time to generate in seconds
provider: Model provider name
model: Model name
operation_name: Operation name (default: "chat")
"""
try:
if not hasattr(self, "hist_time_to_generate") or self.hist_time_to_generate is None:
return
attributes = {
"gen_ai.operation.name": operation_name,
"gen_ai.system": provider,
"gen_ai.request.model": model,
"gen_ai.response.model": model,
"stream": "true",
}
logger.info(
"[Tencent Metrics] Metric: %s | Value: %.4f | Attributes: %s",
GEN_AI_STREAMING_TIME_TO_GENERATE,
ttg_seconds,
json.dumps(attributes, ensure_ascii=False),
)
self.hist_time_to_generate.record(ttg_seconds, attributes) # type: ignore[attr-defined]
except Exception:
logger.debug("[Tencent APM] Failed to record time to generate", exc_info=True)
def record_trace_duration(self, duration_seconds: float, attributes: dict[str, str] | None = None) -> None:
"""Record end-to-end trace duration histogram in seconds.
Args:
duration_seconds: Trace duration in seconds
attributes: Optional attributes (e.g., conversation_mode, app_id)
"""
try:
if not hasattr(self, "hist_trace_duration") or self.hist_trace_duration is None:
return
attrs: dict[str, str] = {}
if attributes:
for k, v in attributes.items():
attrs[k] = str(v) if not isinstance(v, (str, int, float, bool)) else v # type: ignore[assignment]
logger.info(
"[Tencent Metrics] Metric: %s | Value: %.4f | Attributes: %s",
GEN_AI_TRACE_DURATION,
duration_seconds,
json.dumps(attrs, ensure_ascii=False),
)
self.hist_trace_duration.record(duration_seconds, attrs) # type: ignore[attr-defined]
except Exception:
logger.debug("[Tencent APM] Failed to record trace duration", exc_info=True)
def _create_and_export_span(self, span_data: SpanData) -> None:
"""Create span using OpenTelemetry Tracer API"""
try:
parent_context = None
if span_data.parent_span_id and span_data.parent_span_id in self.span_contexts:
parent_context = trace_api.set_span_in_context(
trace_api.NonRecordingSpan(self.span_contexts[span_data.parent_span_id])
)
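            # NonRecordingSpan wraps the stored SpanContext so the child can parent to it
            # without keeping the original span object alive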
span = self.tracer.start_span(
name=span_data.name,
context=parent_context,
kind=SpanKind.INTERNAL,
start_time=span_data.start_time,
)
self.span_contexts[span_data.span_id] = span.get_span_context()
if span_data.attributes:
attributes: dict[str, AttributeValue] = {}
for key, value in span_data.attributes.items():
if isinstance(value, (int, float, bool)):
attributes[key] = value
else:
attributes[key] = str(value)
span.set_attributes(attributes)
if span_data.events:
for event in span_data.events:
span.add_event(event.name, event.attributes, event.timestamp)
if span_data.status:
span.set_status(span_data.status)
# Manually end span; do not use context manager to avoid double-end warnings
span.end(end_time=span_data.end_time)
except Exception:
logger.exception("[Tencent APM] Error creating span: %s", span_data.name)
def api_check(self) -> bool:
"""Check API connectivity using socket connection test for gRPC endpoints"""
try:
# Resolve gRPC target consistently with exporters
_, _, host, port = self._resolve_grpc_target(self.endpoint)
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.settimeout(5)
result = sock.connect_ex((host, port))
sock.close()
if result == 0:
logger.info("[Tencent APM] Endpoint %s:%s is accessible", host, port)
return True
else:
logger.warning("[Tencent APM] Endpoint %s:%s is not accessible", host, port)
if host in ["127.0.0.1", "localhost"]:
logger.info("[Tencent APM] Development environment detected, allowing config save")
return True
return False
except Exception:
logger.exception("[Tencent APM] API check failed")
if "127.0.0.1" in self.endpoint or "localhost" in self.endpoint:
return True
return False
def get_project_url(self) -> str:
"""Get project console URL"""
return "https://console.cloud.tencent.com/apm"
def shutdown(self) -> None:
"""Shutdown the client and export remaining spans"""
try:
if self.span_processor:
logger.info("[Tencent APM] Flushing remaining spans before shutdown")
_ = self.span_processor.force_flush()
self.span_processor.shutdown()
if self.tracer_provider:
self.tracer_provider.shutdown()
# Shutdown instance-level meter provider
if self.meter_provider is not None:
try:
self.meter_provider.shutdown() # type: ignore[attr-defined]
except Exception:
logger.debug("[Tencent APM] Error shutting down meter provider", exc_info=True)
if self.metric_reader is not None:
try:
self.metric_reader.shutdown() # type: ignore[attr-defined]
except Exception:
logger.debug("[Tencent APM] Error shutting down metric reader", exc_info=True)
except Exception:
logger.exception("[Tencent APM] Error during client shutdown")
@staticmethod
def _resolve_grpc_target(endpoint: str, default_port: int = 4317) -> tuple[str, bool, str, int]:
"""Normalize endpoint to gRPC target and security flag.
Returns:
(grpc_endpoint, insecure, host, port)
"""
try:
if endpoint.startswith(("http://", "https://")):
parsed = urlparse(endpoint)
host = parsed.hostname or "localhost"
port = parsed.port or default_port
insecure = parsed.scheme == "http"
return f"{host}:{port}", insecure, host, port
host = endpoint
port = default_port
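            # Bare IPv6 literals (e.g. "::1") would be mis-split by the rsplit below;
            # this helper assumes host:port or URL-style endpoints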
if ":" in endpoint:
parts = endpoint.rsplit(":", 1)
host = parts[0] or "localhost"
try:
port = int(parts[1])
except Exception:
port = default_port
insecure = ("localhost" in host) or ("127.0.0.1" in host)
return f"{host}:{port}", insecure, host, port
except Exception:
host, port = "localhost", default_port
return f"{host}:{port}", True, host, port

View File: core/ops/tencent_trace/entities/__init__.py

@@ -0,0 +1 @@
# Tencent trace entities module

View File: core/ops/tencent_trace/entities/semconv.py

@@ -0,0 +1,89 @@
from enum import Enum
# public
GEN_AI_SESSION_ID = "gen_ai.session.id"
GEN_AI_USER_ID = "gen_ai.user.id"
GEN_AI_USER_NAME = "gen_ai.user.name"
GEN_AI_SPAN_KIND = "gen_ai.span.kind"
GEN_AI_FRAMEWORK = "gen_ai.framework"
GEN_AI_IS_ENTRY = "gen_ai.is_entry"  # marks entry spans so LLM-related traces can be counted
# Chain
INPUT_VALUE = "gen_ai.entity.input"
OUTPUT_VALUE = "gen_ai.entity.output"
# Retriever
RETRIEVAL_QUERY = "retrieval.query"
RETRIEVAL_DOCUMENT = "retrieval.document"
# GENERATION
GEN_AI_MODEL_NAME = "gen_ai.response.model"
GEN_AI_PROVIDER = "gen_ai.provider.name"
GEN_AI_USAGE_INPUT_TOKENS = "gen_ai.usage.input_tokens"
GEN_AI_USAGE_OUTPUT_TOKENS = "gen_ai.usage.output_tokens"
GEN_AI_USAGE_TOTAL_TOKENS = "gen_ai.usage.total_tokens"
GEN_AI_PROMPT_TEMPLATE_TEMPLATE = "gen_ai.prompt_template.template"
GEN_AI_PROMPT_TEMPLATE_VARIABLE = "gen_ai.prompt_template.variable"
GEN_AI_PROMPT = "gen_ai.prompt"
GEN_AI_COMPLETION = "gen_ai.completion"
GEN_AI_RESPONSE_FINISH_REASON = "gen_ai.response.finish_reason"
# Streaming Span Attributes
GEN_AI_IS_STREAMING_REQUEST = "llm.is_streaming" # Same as OpenLLMetry semconv
# Tool
TOOL_NAME = "tool.name"
TOOL_DESCRIPTION = "tool.description"
TOOL_PARAMETERS = "tool.parameters"
# Instrumentation Library
INSTRUMENTATION_NAME = "dify-sdk"
INSTRUMENTATION_VERSION = "0.1.0"
INSTRUMENTATION_LANGUAGE = "python"
# Metrics
LLM_OPERATION_DURATION = "gen_ai.client.operation.duration"
GEN_AI_TOKEN_USAGE = "gen_ai.client.token.usage"
GEN_AI_SERVER_TIME_TO_FIRST_TOKEN = "gen_ai.server.time_to_first_token"
GEN_AI_STREAMING_TIME_TO_GENERATE = "gen_ai.streaming.time_to_generate"
# End-to-end LLM trace duration (metric specific to Tencent APM)
GEN_AI_TRACE_DURATION = "gen_ai.trace.duration"
# Token Usage Attributes
GEN_AI_OPERATION_NAME = "gen_ai.operation.name"
GEN_AI_REQUEST_MODEL = "gen_ai.request.model"
GEN_AI_RESPONSE_MODEL = "gen_ai.response.model"
GEN_AI_SYSTEM = "gen_ai.system"
GEN_AI_TOKEN_TYPE = "gen_ai.token.type"
SERVER_ADDRESS = "server.address"
class GenAISpanKind(Enum):
WORKFLOW = "WORKFLOW" # OpenLLMetry
RETRIEVER = "RETRIEVER" # RAG
GENERATION = "GENERATION" # Langfuse
TOOL = "TOOL" # OpenLLMetry
AGENT = "AGENT" # OpenLLMetry
TASK = "TASK" # OpenLLMetry

View File: core/ops/tencent_trace/entities/tencent_trace_entity.py

@@ -0,0 +1,21 @@
from collections.abc import Sequence
from opentelemetry import trace as trace_api
from opentelemetry.sdk.trace import Event
from opentelemetry.trace import Status, StatusCode
from pydantic import BaseModel, Field
class SpanData(BaseModel):
model_config = {"arbitrary_types_allowed": True}
trace_id: int = Field(..., description="The unique identifier for the trace.")
parent_span_id: int | None = Field(None, description="The ID of the parent span, if any.")
span_id: int = Field(..., description="The unique identifier for this span.")
name: str = Field(..., description="The name of the span.")
attributes: dict[str, str] = Field(default_factory=dict, description="Attributes associated with the span.")
events: Sequence[Event] = Field(default_factory=list, description="Events recorded in the span.")
links: Sequence[trace_api.Link] = Field(default_factory=list, description="Links to other spans.")
status: Status = Field(default=Status(StatusCode.UNSET), description="The status of the span.")
start_time: int = Field(..., description="The start time of the span in nanoseconds.")
end_time: int = Field(..., description="The end time of the span in nanoseconds.")
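# A minimal construction sketch (hypothetical ids and timestamps):
#   SpanData(trace_id=0x1F2E3D, span_id=0x2A, name="message",
#            start_time=1_700_000_000_000_000_000, end_time=1_700_000_001_000_000_000)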

View File: core/ops/tencent_trace/span_builder.py

@@ -0,0 +1,383 @@
"""
Tencent APM Span Builder - handles all span construction logic
"""
import json
import logging
from datetime import datetime
from opentelemetry.trace import Status, StatusCode
from core.ops.entities.trace_entity import (
DatasetRetrievalTraceInfo,
MessageTraceInfo,
ToolTraceInfo,
WorkflowTraceInfo,
)
from core.ops.tencent_trace.entities.semconv import (
GEN_AI_COMPLETION,
GEN_AI_FRAMEWORK,
GEN_AI_IS_ENTRY,
GEN_AI_IS_STREAMING_REQUEST,
GEN_AI_MODEL_NAME,
GEN_AI_PROMPT,
GEN_AI_PROVIDER,
GEN_AI_RESPONSE_FINISH_REASON,
GEN_AI_SESSION_ID,
GEN_AI_SPAN_KIND,
GEN_AI_USAGE_INPUT_TOKENS,
GEN_AI_USAGE_OUTPUT_TOKENS,
GEN_AI_USAGE_TOTAL_TOKENS,
GEN_AI_USER_ID,
INPUT_VALUE,
OUTPUT_VALUE,
RETRIEVAL_DOCUMENT,
RETRIEVAL_QUERY,
TOOL_DESCRIPTION,
TOOL_NAME,
TOOL_PARAMETERS,
GenAISpanKind,
)
from core.ops.tencent_trace.entities.tencent_trace_entity import SpanData
from core.ops.tencent_trace.utils import TencentTraceUtils
from core.rag.models.document import Document
from core.workflow.entities.workflow_node_execution import (
WorkflowNodeExecution,
WorkflowNodeExecutionMetadataKey,
WorkflowNodeExecutionStatus,
)
logger = logging.getLogger(__name__)
class TencentSpanBuilder:
"""Builder class for constructing different types of spans"""
@staticmethod
def _get_time_nanoseconds(time_value: datetime | None) -> int:
"""Convert datetime to nanoseconds for span creation."""
return TencentTraceUtils.convert_datetime_to_nanoseconds(time_value)
@staticmethod
def build_workflow_spans(
trace_info: WorkflowTraceInfo, trace_id: int, user_id: str, links: list | None = None
) -> list[SpanData]:
"""Build workflow-related spans"""
spans = []
links = links or []
message_span_id = None
workflow_span_id = TencentTraceUtils.convert_to_span_id(trace_info.workflow_run_id, "workflow")
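        # Chatflow runs (those carrying a conversation_id) also get a synthetic "message" root span,
        # so chat-level and workflow-level views share one trace; plain workflows skip it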
if hasattr(trace_info, "metadata") and trace_info.metadata.get("conversation_id"):
message_span_id = TencentTraceUtils.convert_to_span_id(trace_info.workflow_run_id, "message")
status = Status(StatusCode.OK)
if trace_info.error:
status = Status(StatusCode.ERROR, trace_info.error)
if message_span_id:
message_span = TencentSpanBuilder._build_message_span(
trace_info, trace_id, message_span_id, user_id, status, links
)
spans.append(message_span)
workflow_span = TencentSpanBuilder._build_workflow_span(
trace_info, trace_id, workflow_span_id, message_span_id, user_id, status, links
)
spans.append(workflow_span)
return spans
@staticmethod
def _build_message_span(
trace_info: WorkflowTraceInfo, trace_id: int, message_span_id: int, user_id: str, status: Status, links: list
) -> SpanData:
"""Build message span for chatflow"""
return SpanData(
trace_id=trace_id,
parent_span_id=None,
span_id=message_span_id,
name="message",
start_time=TencentSpanBuilder._get_time_nanoseconds(trace_info.start_time),
end_time=TencentSpanBuilder._get_time_nanoseconds(trace_info.end_time),
attributes={
GEN_AI_SESSION_ID: trace_info.metadata.get("conversation_id", ""),
GEN_AI_USER_ID: str(user_id),
GEN_AI_SPAN_KIND: GenAISpanKind.WORKFLOW.value,
GEN_AI_FRAMEWORK: "dify",
GEN_AI_IS_ENTRY: "true",
INPUT_VALUE: trace_info.workflow_run_inputs.get("sys.query", ""),
OUTPUT_VALUE: json.dumps(trace_info.workflow_run_outputs, ensure_ascii=False),
},
status=status,
links=links,
)
@staticmethod
def _build_workflow_span(
trace_info: WorkflowTraceInfo,
trace_id: int,
workflow_span_id: int,
message_span_id: int | None,
user_id: str,
status: Status,
links: list,
) -> SpanData:
"""Build workflow span"""
attributes = {
GEN_AI_USER_ID: str(user_id),
GEN_AI_SPAN_KIND: GenAISpanKind.WORKFLOW.value,
GEN_AI_FRAMEWORK: "dify",
INPUT_VALUE: json.dumps(trace_info.workflow_run_inputs, ensure_ascii=False),
OUTPUT_VALUE: json.dumps(trace_info.workflow_run_outputs, ensure_ascii=False),
}
if message_span_id is None:
attributes[GEN_AI_IS_ENTRY] = "true"
return SpanData(
trace_id=trace_id,
parent_span_id=message_span_id,
span_id=workflow_span_id,
name="workflow",
start_time=TencentSpanBuilder._get_time_nanoseconds(trace_info.start_time),
end_time=TencentSpanBuilder._get_time_nanoseconds(trace_info.end_time),
attributes=attributes,
status=status,
links=links,
)
@staticmethod
def build_workflow_llm_span(
trace_id: int, workflow_span_id: int, trace_info: WorkflowTraceInfo, node_execution: WorkflowNodeExecution
) -> SpanData:
"""Build LLM span for workflow nodes."""
process_data = node_execution.process_data or {}
outputs = node_execution.outputs or {}
usage_data = process_data.get("usage", {}) if "usage" in process_data else outputs.get("usage", {})
attributes = {
GEN_AI_SESSION_ID: trace_info.metadata.get("conversation_id", ""),
GEN_AI_SPAN_KIND: GenAISpanKind.GENERATION.value,
GEN_AI_FRAMEWORK: "dify",
GEN_AI_MODEL_NAME: process_data.get("model_name", ""),
GEN_AI_PROVIDER: process_data.get("model_provider", ""),
GEN_AI_USAGE_INPUT_TOKENS: str(usage_data.get("prompt_tokens", 0)),
GEN_AI_USAGE_OUTPUT_TOKENS: str(usage_data.get("completion_tokens", 0)),
GEN_AI_USAGE_TOTAL_TOKENS: str(usage_data.get("total_tokens", 0)),
GEN_AI_PROMPT: json.dumps(process_data.get("prompts", []), ensure_ascii=False),
GEN_AI_COMPLETION: str(outputs.get("text", "")),
GEN_AI_RESPONSE_FINISH_REASON: outputs.get("finish_reason", ""),
INPUT_VALUE: json.dumps(process_data.get("prompts", []), ensure_ascii=False),
OUTPUT_VALUE: str(outputs.get("text", "")),
}
if usage_data.get("time_to_first_token") is not None:
attributes[GEN_AI_IS_STREAMING_REQUEST] = "true"
return SpanData(
trace_id=trace_id,
parent_span_id=workflow_span_id,
span_id=TencentTraceUtils.convert_to_span_id(node_execution.id, "node"),
name="GENERATION",
start_time=TencentSpanBuilder._get_time_nanoseconds(node_execution.created_at),
end_time=TencentSpanBuilder._get_time_nanoseconds(node_execution.finished_at),
attributes=attributes,
status=TencentSpanBuilder._get_workflow_node_status(node_execution),
)
@staticmethod
def build_message_span(
trace_info: MessageTraceInfo, trace_id: int, user_id: str, links: list | None = None
) -> SpanData:
"""Build message span."""
links = links or []
status = Status(StatusCode.OK)
if trace_info.error:
status = Status(StatusCode.ERROR, trace_info.error)
attributes = {
GEN_AI_SESSION_ID: trace_info.metadata.get("conversation_id", ""),
GEN_AI_USER_ID: str(user_id),
GEN_AI_SPAN_KIND: GenAISpanKind.WORKFLOW.value,
GEN_AI_FRAMEWORK: "dify",
GEN_AI_IS_ENTRY: "true",
INPUT_VALUE: str(trace_info.inputs or ""),
OUTPUT_VALUE: str(trace_info.outputs or ""),
}
if trace_info.is_streaming_request:
attributes[GEN_AI_IS_STREAMING_REQUEST] = "true"
return SpanData(
trace_id=trace_id,
parent_span_id=None,
span_id=TencentTraceUtils.convert_to_span_id(trace_info.message_id, "message"),
name="message",
start_time=TencentSpanBuilder._get_time_nanoseconds(trace_info.start_time),
end_time=TencentSpanBuilder._get_time_nanoseconds(trace_info.end_time),
attributes=attributes,
status=status,
links=links,
)
@staticmethod
def build_tool_span(trace_info: ToolTraceInfo, trace_id: int, parent_span_id: int) -> SpanData:
"""Build tool span."""
status = Status(StatusCode.OK)
if trace_info.error:
status = Status(StatusCode.ERROR, trace_info.error)
return SpanData(
trace_id=trace_id,
parent_span_id=parent_span_id,
span_id=TencentTraceUtils.convert_to_span_id(trace_info.message_id, "tool"),
name=trace_info.tool_name,
start_time=TencentSpanBuilder._get_time_nanoseconds(trace_info.start_time),
end_time=TencentSpanBuilder._get_time_nanoseconds(trace_info.end_time),
attributes={
GEN_AI_SPAN_KIND: GenAISpanKind.TOOL.value,
GEN_AI_FRAMEWORK: "dify",
TOOL_NAME: trace_info.tool_name,
TOOL_DESCRIPTION: "",
TOOL_PARAMETERS: json.dumps(trace_info.tool_parameters, ensure_ascii=False),
INPUT_VALUE: json.dumps(trace_info.tool_inputs, ensure_ascii=False),
OUTPUT_VALUE: str(trace_info.tool_outputs),
},
status=status,
)
@staticmethod
def build_retrieval_span(trace_info: DatasetRetrievalTraceInfo, trace_id: int, parent_span_id: int) -> SpanData:
"""Build dataset retrieval span."""
status = Status(StatusCode.OK)
if getattr(trace_info, "error", None):
status = Status(StatusCode.ERROR, trace_info.error) # type: ignore[arg-type]
documents_data = TencentSpanBuilder._extract_retrieval_documents(trace_info.documents)
return SpanData(
trace_id=trace_id,
parent_span_id=parent_span_id,
span_id=TencentTraceUtils.convert_to_span_id(trace_info.message_id, "retrieval"),
name="retrieval",
start_time=TencentSpanBuilder._get_time_nanoseconds(trace_info.start_time),
end_time=TencentSpanBuilder._get_time_nanoseconds(trace_info.end_time),
attributes={
GEN_AI_SPAN_KIND: GenAISpanKind.RETRIEVER.value,
GEN_AI_FRAMEWORK: "dify",
RETRIEVAL_QUERY: str(trace_info.inputs or ""),
RETRIEVAL_DOCUMENT: json.dumps(documents_data, ensure_ascii=False),
INPUT_VALUE: str(trace_info.inputs or ""),
OUTPUT_VALUE: json.dumps(documents_data, ensure_ascii=False),
},
status=status,
)
@staticmethod
def _get_workflow_node_status(node_execution: WorkflowNodeExecution) -> Status:
"""Get workflow node execution status."""
if node_execution.status == WorkflowNodeExecutionStatus.SUCCEEDED:
return Status(StatusCode.OK)
elif node_execution.status in [WorkflowNodeExecutionStatus.FAILED, WorkflowNodeExecutionStatus.EXCEPTION]:
return Status(StatusCode.ERROR, str(node_execution.error))
return Status(StatusCode.UNSET)
@staticmethod
def build_workflow_retrieval_span(
trace_id: int, workflow_span_id: int, trace_info: WorkflowTraceInfo, node_execution: WorkflowNodeExecution
) -> SpanData:
"""Build knowledge retrieval span for workflow nodes."""
input_value = ""
if node_execution.inputs:
input_value = str(node_execution.inputs.get("query", ""))
output_value = ""
if node_execution.outputs:
output_value = json.dumps(node_execution.outputs.get("result", []), ensure_ascii=False)
return SpanData(
trace_id=trace_id,
parent_span_id=workflow_span_id,
span_id=TencentTraceUtils.convert_to_span_id(node_execution.id, "node"),
name=node_execution.title,
start_time=TencentSpanBuilder._get_time_nanoseconds(node_execution.created_at),
end_time=TencentSpanBuilder._get_time_nanoseconds(node_execution.finished_at),
attributes={
GEN_AI_SESSION_ID: trace_info.metadata.get("conversation_id", ""),
GEN_AI_SPAN_KIND: GenAISpanKind.RETRIEVER.value,
GEN_AI_FRAMEWORK: "dify",
RETRIEVAL_QUERY: input_value,
RETRIEVAL_DOCUMENT: output_value,
INPUT_VALUE: input_value,
OUTPUT_VALUE: output_value,
},
status=TencentSpanBuilder._get_workflow_node_status(node_execution),
)
@staticmethod
def build_workflow_tool_span(
trace_id: int, workflow_span_id: int, trace_info: WorkflowTraceInfo, node_execution: WorkflowNodeExecution
) -> SpanData:
"""Build tool span for workflow nodes."""
        tool_info = {}
        if node_execution.metadata:
            tool_info = node_execution.metadata.get(WorkflowNodeExecutionMetadataKey.TOOL_INFO, {})
return SpanData(
trace_id=trace_id,
parent_span_id=workflow_span_id,
span_id=TencentTraceUtils.convert_to_span_id(node_execution.id, "node"),
name=node_execution.title,
start_time=TencentSpanBuilder._get_time_nanoseconds(node_execution.created_at),
end_time=TencentSpanBuilder._get_time_nanoseconds(node_execution.finished_at),
attributes={
GEN_AI_SPAN_KIND: GenAISpanKind.TOOL.value,
GEN_AI_FRAMEWORK: "dify",
TOOL_NAME: node_execution.title,
                TOOL_DESCRIPTION: json.dumps(tool_info, ensure_ascii=False),
TOOL_PARAMETERS: json.dumps(node_execution.inputs or {}, ensure_ascii=False),
INPUT_VALUE: json.dumps(node_execution.inputs or {}, ensure_ascii=False),
                OUTPUT_VALUE: json.dumps(node_execution.outputs or {}, ensure_ascii=False),
},
status=TencentSpanBuilder._get_workflow_node_status(node_execution),
)
@staticmethod
def build_workflow_task_span(
trace_id: int, workflow_span_id: int, trace_info: WorkflowTraceInfo, node_execution: WorkflowNodeExecution
) -> SpanData:
"""Build generic task span for workflow nodes."""
return SpanData(
trace_id=trace_id,
parent_span_id=workflow_span_id,
span_id=TencentTraceUtils.convert_to_span_id(node_execution.id, "node"),
name=node_execution.title,
start_time=TencentSpanBuilder._get_time_nanoseconds(node_execution.created_at),
end_time=TencentSpanBuilder._get_time_nanoseconds(node_execution.finished_at),
attributes={
GEN_AI_SESSION_ID: trace_info.metadata.get("conversation_id", ""),
GEN_AI_SPAN_KIND: GenAISpanKind.TASK.value,
GEN_AI_FRAMEWORK: "dify",
                INPUT_VALUE: json.dumps(node_execution.inputs or {}, ensure_ascii=False),
                OUTPUT_VALUE: json.dumps(node_execution.outputs or {}, ensure_ascii=False),
},
status=TencentSpanBuilder._get_workflow_node_status(node_execution),
)
@staticmethod
def _extract_retrieval_documents(documents: list[Document]):
"""Extract documents data for retrieval tracing."""
documents_data = []
        for document in documents:
            metadata = document.metadata or {}
            document_data = {
                "content": document.page_content,
                "metadata": {
                    "dataset_id": metadata.get("dataset_id"),
                    "doc_id": metadata.get("doc_id"),
                    "document_id": metadata.get("document_id"),
                },
                "score": metadata.get("score"),
            }
            documents_data.append(document_data)
return documents_data

View File: core/ops/tencent_trace/tencent_trace.py

@@ -0,0 +1,520 @@
"""
Tencent APM tracing implementation with separated concerns
"""
import logging
from sqlalchemy import select
from sqlalchemy.orm import Session, sessionmaker
from core.ops.base_trace_instance import BaseTraceInstance
from core.ops.entities.config_entity import TencentConfig
from core.ops.entities.trace_entity import (
BaseTraceInfo,
DatasetRetrievalTraceInfo,
GenerateNameTraceInfo,
MessageTraceInfo,
ModerationTraceInfo,
SuggestedQuestionTraceInfo,
ToolTraceInfo,
WorkflowTraceInfo,
)
from core.ops.tencent_trace.client import TencentTraceClient
from core.ops.tencent_trace.entities.tencent_trace_entity import SpanData
from core.ops.tencent_trace.span_builder import TencentSpanBuilder
from core.ops.tencent_trace.utils import TencentTraceUtils
from core.repositories import SQLAlchemyWorkflowNodeExecutionRepository
from core.workflow.entities.workflow_node_execution import (
WorkflowNodeExecution,
)
from core.workflow.nodes import NodeType
from extensions.ext_database import db
from models import Account, App, TenantAccountJoin, WorkflowNodeExecutionTriggeredFrom
logger = logging.getLogger(__name__)
class TencentDataTrace(BaseTraceInstance):
"""
    Tencent APM trace implementation following the single-responsibility principle.
Acts as a coordinator that delegates specific tasks to specialized classes.
"""
def __init__(self, tencent_config: TencentConfig):
super().__init__(tencent_config)
self.trace_client = TencentTraceClient(
service_name=tencent_config.service_name,
endpoint=tencent_config.endpoint,
token=tencent_config.token,
metrics_export_interval_sec=5,
)
def trace(self, trace_info: BaseTraceInfo) -> None:
"""Main tracing entry point - coordinates different trace types."""
if isinstance(trace_info, WorkflowTraceInfo):
self.workflow_trace(trace_info)
elif isinstance(trace_info, MessageTraceInfo):
self.message_trace(trace_info)
elif isinstance(trace_info, ModerationTraceInfo):
pass
elif isinstance(trace_info, SuggestedQuestionTraceInfo):
self.suggested_question_trace(trace_info)
elif isinstance(trace_info, DatasetRetrievalTraceInfo):
self.dataset_retrieval_trace(trace_info)
elif isinstance(trace_info, ToolTraceInfo):
self.tool_trace(trace_info)
elif isinstance(trace_info, GenerateNameTraceInfo):
pass
def api_check(self) -> bool:
return self.trace_client.api_check()
def get_project_url(self) -> str:
return self.trace_client.get_project_url()
def workflow_trace(self, trace_info: WorkflowTraceInfo) -> None:
"""Handle workflow tracing by coordinating data retrieval and span construction."""
try:
trace_id = TencentTraceUtils.convert_to_trace_id(trace_info.workflow_run_id)
links = []
if trace_info.trace_id:
links.append(TencentTraceUtils.create_link(trace_info.trace_id))
user_id = self._get_user_id(trace_info)
workflow_spans = TencentSpanBuilder.build_workflow_spans(trace_info, trace_id, str(user_id), links)
for span in workflow_spans:
self.trace_client.add_span(span)
self._process_workflow_nodes(trace_info, trace_id)
# Record trace duration for entry span
self._record_workflow_trace_duration(trace_info)
except Exception:
logger.exception("[Tencent APM] Failed to process workflow trace")
def message_trace(self, trace_info: MessageTraceInfo) -> None:
"""Handle message tracing."""
try:
trace_id = TencentTraceUtils.convert_to_trace_id(trace_info.message_id)
user_id = self._get_user_id(trace_info)
links = []
if trace_info.trace_id:
links.append(TencentTraceUtils.create_link(trace_info.trace_id))
message_span = TencentSpanBuilder.build_message_span(trace_info, trace_id, str(user_id), links)
self.trace_client.add_span(message_span)
self._record_message_llm_metrics(trace_info)
# Record trace duration for entry span
self._record_message_trace_duration(trace_info)
except Exception:
logger.exception("[Tencent APM] Failed to process message trace")
def tool_trace(self, trace_info: ToolTraceInfo) -> None:
"""Handle tool tracing."""
try:
parent_span_id = None
trace_root_id = None
if trace_info.message_id:
parent_span_id = TencentTraceUtils.convert_to_span_id(trace_info.message_id, "message")
trace_root_id = trace_info.message_id
if parent_span_id and trace_root_id:
trace_id = TencentTraceUtils.convert_to_trace_id(trace_root_id)
tool_span = TencentSpanBuilder.build_tool_span(trace_info, trace_id, parent_span_id)
self.trace_client.add_span(tool_span)
except Exception:
logger.exception("[Tencent APM] Failed to process tool trace")
def dataset_retrieval_trace(self, trace_info: DatasetRetrievalTraceInfo) -> None:
"""Handle dataset retrieval tracing."""
try:
parent_span_id = None
trace_root_id = None
if trace_info.message_id:
parent_span_id = TencentTraceUtils.convert_to_span_id(trace_info.message_id, "message")
trace_root_id = trace_info.message_id
if parent_span_id and trace_root_id:
trace_id = TencentTraceUtils.convert_to_trace_id(trace_root_id)
retrieval_span = TencentSpanBuilder.build_retrieval_span(trace_info, trace_id, parent_span_id)
self.trace_client.add_span(retrieval_span)
except Exception:
logger.exception("[Tencent APM] Failed to process dataset retrieval trace")
def suggested_question_trace(self, trace_info: SuggestedQuestionTraceInfo) -> None:
"""Handle suggested question tracing"""
try:
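            # Placeholder: suggested-question spans are not built or exported yet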
logger.info("[Tencent APM] Processing suggested question trace")
except Exception:
logger.exception("[Tencent APM] Failed to process suggested question trace")
def _process_workflow_nodes(self, trace_info: WorkflowTraceInfo, trace_id: int) -> None:
"""Process workflow node executions."""
try:
workflow_span_id = TencentTraceUtils.convert_to_span_id(trace_info.workflow_run_id, "workflow")
node_executions = self._get_workflow_node_executions(trace_info)
for node_execution in node_executions:
try:
node_span = self._build_workflow_node_span(node_execution, trace_id, trace_info, workflow_span_id)
if node_span:
self.trace_client.add_span(node_span)
if node_execution.node_type == NodeType.LLM:
self._record_llm_metrics(node_execution)
except Exception:
logger.exception("[Tencent APM] Failed to process node execution: %s", node_execution.id)
except Exception:
logger.exception("[Tencent APM] Failed to process workflow nodes")
def _build_workflow_node_span(
self, node_execution: WorkflowNodeExecution, trace_id: int, trace_info: WorkflowTraceInfo, workflow_span_id: int
) -> SpanData | None:
"""Build span for different node types"""
try:
if node_execution.node_type == NodeType.LLM:
return TencentSpanBuilder.build_workflow_llm_span(
trace_id, workflow_span_id, trace_info, node_execution
)
elif node_execution.node_type == NodeType.KNOWLEDGE_RETRIEVAL:
return TencentSpanBuilder.build_workflow_retrieval_span(
trace_id, workflow_span_id, trace_info, node_execution
)
elif node_execution.node_type == NodeType.TOOL:
return TencentSpanBuilder.build_workflow_tool_span(
trace_id, workflow_span_id, trace_info, node_execution
)
else:
# Handle all other node types as generic tasks
return TencentSpanBuilder.build_workflow_task_span(
trace_id, workflow_span_id, trace_info, node_execution
)
except Exception:
logger.debug(
"[Tencent APM] Error building span for node %s: %s",
node_execution.id,
node_execution.node_type,
exc_info=True,
)
return None
def _get_workflow_node_executions(self, trace_info: WorkflowTraceInfo) -> list[WorkflowNodeExecution]:
"""Retrieve workflow node executions from database."""
try:
session_maker = sessionmaker(bind=db.engine)
with Session(db.engine, expire_on_commit=False) as session:
app_id = trace_info.metadata.get("app_id")
if not app_id:
raise ValueError("No app_id found in trace_info metadata")
app_stmt = select(App).where(App.id == app_id)
app = session.scalar(app_stmt)
if not app:
raise ValueError(f"App with id {app_id} not found")
if not app.created_by:
raise ValueError(f"App with id {app_id} has no creator")
account_stmt = select(Account).where(Account.id == app.created_by)
service_account = session.scalar(account_stmt)
if not service_account:
raise ValueError(f"Creator account not found for app {app_id}")
current_tenant = (
session.query(TenantAccountJoin).filter_by(account_id=service_account.id, current=True).first()
)
if not current_tenant:
raise ValueError(f"Current tenant not found for account {service_account.id}")
service_account.set_tenant_id(current_tenant.tenant_id)
repository = SQLAlchemyWorkflowNodeExecutionRepository(
session_factory=session_maker,
user=service_account,
app_id=trace_info.metadata.get("app_id"),
triggered_from=WorkflowNodeExecutionTriggeredFrom.WORKFLOW_RUN,
)
executions = repository.get_by_workflow_run(workflow_run_id=trace_info.workflow_run_id)
return list(executions)
except Exception:
logger.exception("[Tencent APM] Failed to get workflow node executions")
return []
def _get_user_id(self, trace_info: BaseTraceInfo) -> str:
"""Get user ID from trace info."""
try:
tenant_id = None
user_id = None
if isinstance(trace_info, (WorkflowTraceInfo, GenerateNameTraceInfo)):
tenant_id = trace_info.tenant_id
if hasattr(trace_info, "metadata") and trace_info.metadata:
user_id = trace_info.metadata.get("user_id")
if user_id and tenant_id:
stmt = (
select(Account.name)
.join(TenantAccountJoin, Account.id == TenantAccountJoin.account_id)
.where(Account.id == user_id, TenantAccountJoin.tenant_id == tenant_id)
)
session_maker = sessionmaker(bind=db.engine)
with session_maker() as session:
account_name = session.scalar(stmt)
return account_name or str(user_id)
elif user_id:
return str(user_id)
return "anonymous"
except Exception:
logger.exception("[Tencent APM] Failed to get user ID")
return "unknown"
def _record_llm_metrics(self, node_execution: WorkflowNodeExecution) -> None:
"""Record LLM performance metrics"""
try:
process_data = node_execution.process_data or {}
outputs = node_execution.outputs or {}
usage = process_data.get("usage", {}) if "usage" in process_data else outputs.get("usage", {})
model_provider = process_data.get("model_provider", "unknown")
model_name = process_data.get("model_name", "unknown")
model_mode = process_data.get("model_mode", "chat")
# Record LLM duration
if hasattr(self.trace_client, "record_llm_duration"):
latency_s = float(usage.get("latency", 0.0))
if latency_s > 0:
# Determine if streaming from usage metrics
is_streaming = usage.get("time_to_first_token") is not None
attributes = {
"gen_ai.system": model_provider,
"gen_ai.response.model": model_name,
"gen_ai.operation.name": model_mode,
"stream": "true" if is_streaming else "false",
}
self.trace_client.record_llm_duration(latency_s, attributes)
# Record streaming metrics from usage
time_to_first_token = usage.get("time_to_first_token")
if time_to_first_token is not None and hasattr(self.trace_client, "record_time_to_first_token"):
ttft_seconds = float(time_to_first_token)
if ttft_seconds > 0:
self.trace_client.record_time_to_first_token(
ttft_seconds=ttft_seconds, provider=model_provider, model=model_name, operation_name=model_mode
)
time_to_generate = usage.get("time_to_generate")
if time_to_generate is not None and hasattr(self.trace_client, "record_time_to_generate"):
ttg_seconds = float(time_to_generate)
if ttg_seconds > 0:
self.trace_client.record_time_to_generate(
ttg_seconds=ttg_seconds, provider=model_provider, model=model_name, operation_name=model_mode
)
# Record token usage
if hasattr(self.trace_client, "record_token_usage"):
# Extract token counts
input_tokens = int(usage.get("prompt_tokens", 0))
output_tokens = int(usage.get("completion_tokens", 0))
if input_tokens > 0 or output_tokens > 0:
                    server_address = model_provider
# Record input tokens
if input_tokens > 0:
self.trace_client.record_token_usage(
token_count=input_tokens,
token_type="input",
operation_name=model_mode,
request_model=model_name,
response_model=model_name,
server_address=server_address,
provider=model_provider,
)
# Record output tokens
if output_tokens > 0:
self.trace_client.record_token_usage(
token_count=output_tokens,
token_type="output",
operation_name=model_mode,
request_model=model_name,
response_model=model_name,
server_address=server_address,
provider=model_provider,
)
except Exception:
logger.debug("[Tencent APM] Failed to record LLM metrics")
def _record_message_llm_metrics(self, trace_info: MessageTraceInfo) -> None:
"""Record LLM metrics for message traces"""
try:
trace_metadata = trace_info.metadata or {}
message_data = trace_info.message_data or {}
provider_latency = 0.0
if isinstance(message_data, dict):
provider_latency = float(message_data.get("provider_response_latency", 0.0) or 0.0)
else:
provider_latency = float(getattr(message_data, "provider_response_latency", 0.0) or 0.0)
model_provider = trace_metadata.get("ls_provider") or (
message_data.get("model_provider", "") if isinstance(message_data, dict) else ""
)
model_name = trace_metadata.get("ls_model_name") or (
message_data.get("model_id", "") if isinstance(message_data, dict) else ""
)
# Record LLM duration
if provider_latency > 0 and hasattr(self.trace_client, "record_llm_duration"):
is_streaming = trace_info.is_streaming_request
duration_attributes = {
"gen_ai.system": model_provider,
"gen_ai.response.model": model_name,
"gen_ai.operation.name": "chat", # Message traces are always chat
"stream": "true" if is_streaming else "false",
}
self.trace_client.record_llm_duration(provider_latency, duration_attributes)
# Record streaming metrics for message traces
if trace_info.is_streaming_request:
# Record time to first token
if trace_info.gen_ai_server_time_to_first_token is not None and hasattr(
self.trace_client, "record_time_to_first_token"
):
ttft_seconds = float(trace_info.gen_ai_server_time_to_first_token)
if ttft_seconds > 0:
self.trace_client.record_time_to_first_token(
ttft_seconds=ttft_seconds, provider=str(model_provider or ""), model=str(model_name or "")
)
# Record time to generate
if trace_info.llm_streaming_time_to_generate is not None and hasattr(
self.trace_client, "record_time_to_generate"
):
ttg_seconds = float(trace_info.llm_streaming_time_to_generate)
if ttg_seconds > 0:
self.trace_client.record_time_to_generate(
ttg_seconds=ttg_seconds, provider=str(model_provider or ""), model=str(model_name or "")
)
# Record token usage
if hasattr(self.trace_client, "record_token_usage"):
input_tokens = int(trace_info.message_tokens or 0)
output_tokens = int(trace_info.answer_tokens or 0)
if input_tokens > 0:
self.trace_client.record_token_usage(
token_count=input_tokens,
token_type="input",
operation_name="chat",
request_model=str(model_name or ""),
response_model=str(model_name or ""),
server_address=str(model_provider or ""),
provider=str(model_provider or ""),
)
if output_tokens > 0:
self.trace_client.record_token_usage(
token_count=output_tokens,
token_type="output",
operation_name="chat",
request_model=str(model_name or ""),
response_model=str(model_name or ""),
server_address=str(model_provider or ""),
provider=str(model_provider or ""),
)
except Exception:
logger.debug("[Tencent APM] Failed to record message LLM metrics")
def _record_workflow_trace_duration(self, trace_info: WorkflowTraceInfo) -> None:
"""Record end-to-end workflow trace duration."""
try:
if not hasattr(self.trace_client, "record_trace_duration"):
return
# Calculate duration from start_time and end_time to match span duration
if trace_info.start_time and trace_info.end_time:
duration_s = (trace_info.end_time - trace_info.start_time).total_seconds()
else:
# Fallback to workflow_run_elapsed_time if timestamps not available
duration_s = float(trace_info.workflow_run_elapsed_time)
if duration_s > 0:
attributes = {
"conversation_mode": "workflow",
"workflow_status": trace_info.workflow_run_status,
}
# Add conversation_id if available
if trace_info.conversation_id:
attributes["has_conversation"] = "true"
else:
attributes["has_conversation"] = "false"
self.trace_client.record_trace_duration(duration_s, attributes)
except Exception:
logger.debug("[Tencent APM] Failed to record workflow trace duration")
def _record_message_trace_duration(self, trace_info: MessageTraceInfo) -> None:
"""Record end-to-end message trace duration."""
try:
if not hasattr(self.trace_client, "record_trace_duration"):
return
# Calculate duration from start_time and end_time
if trace_info.start_time and trace_info.end_time:
duration = (trace_info.end_time - trace_info.start_time).total_seconds()
if duration > 0:
attributes = {
"conversation_mode": trace_info.conversation_mode,
}
# Add streaming flag if available
if hasattr(trace_info, "is_streaming_request"):
attributes["stream"] = "true" if trace_info.is_streaming_request else "false"
self.trace_client.record_trace_duration(duration, attributes)
except Exception:
logger.debug("[Tencent APM] Failed to record message trace duration")
def __del__(self):
"""Ensure proper cleanup on garbage collection."""
try:
if hasattr(self, "trace_client"):
self.trace_client.shutdown()
except Exception:
pass

View File: core/ops/tencent_trace/utils.py

@@ -0,0 +1,65 @@
"""
Utility functions for Tencent APM tracing
"""
import hashlib
import random
import uuid
from datetime import datetime
from opentelemetry.trace import Link, SpanContext, TraceFlags
class TencentTraceUtils:
"""Utility class for common tracing operations."""
INVALID_SPAN_ID = 0x0000000000000000
INVALID_TRACE_ID = 0x00000000000000000000000000000000
@staticmethod
def convert_to_trace_id(uuid_v4: str | None) -> int:
        try:
            uuid_obj = uuid.UUID(uuid_v4) if uuid_v4 else uuid.uuid4()
        except Exception as e:
            raise ValueError(f"Invalid UUID input: {e}") from e
        return uuid_obj.int
@staticmethod
def convert_to_span_id(uuid_v4: str | None, span_type: str) -> int:
        try:
            uuid_obj = uuid.UUID(uuid_v4) if uuid_v4 else uuid.uuid4()
        except Exception as e:
            raise ValueError(f"Invalid UUID input: {e}") from e
combined_key = f"{uuid_obj.hex}-{span_type}"
hash_bytes = hashlib.sha256(combined_key.encode("utf-8")).digest()
return int.from_bytes(hash_bytes[:8], byteorder="big", signed=False)
@staticmethod
def generate_span_id() -> int:
span_id = random.getrandbits(64)
while span_id == TencentTraceUtils.INVALID_SPAN_ID:
span_id = random.getrandbits(64)
return span_id
@staticmethod
def convert_datetime_to_nanoseconds(start_time: datetime | None) -> int:
if start_time is None:
start_time = datetime.now()
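        # Note: naive datetimes are interpreted in the process-local timezone by timestamp()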
timestamp_in_seconds = start_time.timestamp()
return int(timestamp_in_seconds * 1e9)
@staticmethod
def create_link(trace_id_str: str) -> Link:
        try:
            trace_id = int(trace_id_str, 16) if len(trace_id_str) == 32 else uuid.UUID(trace_id_str).int
        except (ValueError, TypeError):
            trace_id = uuid.uuid4().int
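        # The link carries only a trace id; span_id stays INVALID so the link references
        # the external trace as a whole rather than a specific span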
span_context = SpanContext(
trace_id=trace_id,
span_id=TencentTraceUtils.INVALID_SPAN_ID,
is_remote=False,
trace_flags=TraceFlags(TraceFlags.SAMPLED),
)
return Link(span_context)