Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
b3006e6
✨ Feat: Update monitoring configuration to use OpenTelemetry OTLP pro…
hhhhsc701 Apr 27, 2026
4351d67
Refine OpenTelemetry monitoring and multi-platform config
Apr 29, 2026
b4f1d67
Merge branch 'develop' into dev/opentelemetry
Apr 29, 2026
b71d7af
Add local Phoenix and Langfuse monitoring deployment support
May 6, 2026
e4e37ec
Merge branch 'refs/heads/develop' into dev/opentelemetry
hhhhsc701 May 6, 2026
d6035af
✨ Feat: Enhance monitoring capabilities with FastAPI instrumentation …
May 6, 2026
6290b32
✨ Feat: Add Grafana and Tempo support for enhanced monitoring capabil…
May 8, 2026
6bc16e7
✨ Feat: Update monitoring configurations with specific versioning for…
May 8, 2026
b7d8d64
✨ Feat: Refactor monitoring configuration to use environment variable…
May 8, 2026
d734591
✨ Feat: Expand OpenTelemetry design documentation to include observab…
May 9, 2026
7a752e7
✨ Feat: Add support for Apache SkyWalking as a monitoring provider, i…
May 11, 2026
db73c07
支持zipkin查看
hhhhsc701 May 11, 2026
6b08eb5
✨ Feat: Add support for LangSmith as a monitoring provider, including…
May 12, 2026
5e1c1b4
Merge remote-tracking branch 'origin/dev/opentelemetry' into dev/open…
May 12, 2026
6e4e531
clean code
May 12, 2026
8ffe8b6
Merge branch 'develop' into dev/opentelemetry
May 12, 2026
59b9e18
clean code
May 13, 2026
224c291
✨ Feat: Add OpenTelemetry monitoring stack with configurable provider…
May 13, 2026
d614401
Normalize monitoring env variables
May 13, 2026
2dda002
✨ Feat: Add OpenTelemetry monitoring stack with configurable provider…
May 13, 2026
95bfe0f
Merge branch 'refs/heads/develop' into dev/opentelemetry
hhhhsc701 May 14, 2026
c567866
支持白名单黑名单配置
hhhhsc701 May 15, 2026
7ddcae3
补充
May 15, 2026
165aa67
优化代码实现
May 18, 2026
b0cf078
优化代码实现
May 18, 2026
2588253
Enhance monitoring trace payload summaries
May 18, 2026
c31482c
fix test
May 18, 2026
d74f102
Merge branch 'refs/heads/develop' into dev/opentelemetry
hhhhsc701 May 19, 2026
718c166
clean code
May 19, 2026
f33dc37
Merge remote-tracking branch 'origin/dev/opentelemetry' into dev/open…
May 19, 2026
ba53386
Enhance monitoring configuration by adding LangSmith API key, project…
hhhhsc701 May 19, 2026
80f23da
Merge remote-tracking branch 'origin/dev/opentelemetry' into dev/open…
hhhhsc701 May 19, 2026
5f4a026
Add monitoring configuration options for trace content and limits
hhhhsc701 May 19, 2026
7dfb394
Remove deprecated fastapi flag and normalize trace payloads
May 19, 2026
4d8ed77
清楚langfuse适配字段
May 19, 2026
2b8421c
Update service configurations to support NodePort for Grafana, Langfu…
hhhhsc701 May 20, 2026
18e7758
网络配置优化
May 21, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ docker/uploads
docker/openssh-server
docker/volumes/db/data
docker/.env
docker/monitoring/monitoring.env
docker/.run
docker/deploy.options
k8s/helm/.deploy.options
Expand Down
5 changes: 0 additions & 5 deletions backend/apps/agent_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,17 +38,13 @@
)
from utils.auth_utils import get_current_user_info, get_current_user_id

# Import monitoring utilities
from utils.monitoring import monitoring_manager

agent_runtime_router = APIRouter(prefix="/agent")
agent_config_router = APIRouter(prefix="/agent")
logger = logging.getLogger("agent_app")


# Define API route
@agent_runtime_router.post("/run")
@monitoring_manager.monitor_endpoint("agent.run", exclude_params=["authorization"])
async def agent_run_api(agent_request: AgentRequest, http_request: Request, authorization: str = Header(None)):
"""
Agent execution API endpoint
Expand Down Expand Up @@ -555,4 +551,3 @@ async def list_published_agents_api(
raise HTTPException(
status_code=HTTPStatus.INTERNAL_SERVER_ERROR, detail="Published agents list error."
)

44 changes: 39 additions & 5 deletions backend/apps/monitoring_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,16 @@

import logging
from http import HTTPStatus
from typing import Annotated, Optional
from typing import Annotated, Any

from fastapi import APIRouter, Header, HTTPException, Query
from sqlalchemy import text

from consts.const import (
ENABLE_TELEMETRY,
MONITORING_DASHBOARD_URL,
MONITORING_PROVIDER,
)
from consts.model import ConversationResponse
from database.client import get_monitoring_db_session
from utils.auth_utils import get_current_user_id
Expand All @@ -21,19 +26,38 @@
router = APIRouter(prefix="/monitoring")


def _normalize_monitoring_provider(value: str | None) -> str:
return str(value or "otlp").strip().lower()


def get_monitoring_status() -> dict[str, Any]:
    """Build the monitoring status payload consumed by the frontend.

    Returns:
        A dict holding the telemetry flag, the normalized provider name, the
        dashboard URL (``None`` when the configured value is blank), and
        ``dashboard_port`` / ``dashboard_path`` entries that are always
        ``None``.
    """
    status: dict[str, Any] = {"telemetry_enabled": ENABLE_TELEMETRY}
    status["provider"] = _normalize_monitoring_provider(MONITORING_PROVIDER)

    # A blank or whitespace-only URL is reported as None.
    url = MONITORING_DASHBOARD_URL.strip()
    status["dashboard_url"] = url if url else None

    status["dashboard_port"] = None
    status["dashboard_path"] = None
    return status


def _compute_time_range_filter(time_range: str) -> str:
"""Convert time_range parameter to SQL timestamp condition."""
hours = {"24h": 24, "7d": 168, "30d": 720}.get(time_range, 24)
return f"m.create_time >= NOW() - INTERVAL '{hours} hours'"


def _query_model_metrics_from_db(
time_range: str, tenant_id: Optional[str] = None
) -> list[dict]:
time_range: str, tenant_id: str | None = None
) -> list[dict[str, Any]]:
time_filter = _compute_time_range_filter(time_range)

tenant_filter = ""
params = {}
params: dict[str, str] = {}
if tenant_id:
tenant_filter = "AND m.tenant_id = :tenant_id"
params["tenant_id"] = tenant_id
Expand Down Expand Up @@ -96,7 +120,7 @@ async def list_models_endpoint(
page: Annotated[int, Query(ge=1, description="Page number")] = 1,
page_size: Annotated[int, Query(
ge=1, le=100, description="Items per page")] = 20,
authorization: Annotated[Optional[str], Header()] = None,
authorization: Annotated[str | None, Header()] = None,
):
"""List all models with aggregated monitoring metrics from database."""
try:
Expand All @@ -113,3 +137,13 @@ async def list_models_endpoint(
logger.error(f"Failed to list monitoring models: {str(e)}")
raise HTTPException(
status_code=HTTPStatus.INTERNAL_SERVER_ERROR, detail=str(e))


@router.get("/status", response_model=ConversationResponse)
async def get_monitoring_status_endpoint():
    """Serve the monitoring status payload so the frontend can decide whether to show the monitoring UI."""
    status_payload = get_monitoring_status()
    response = ConversationResponse(code=0, message="success", data=status_payload)
    return response
73 changes: 60 additions & 13 deletions backend/consts/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,19 +336,66 @@ class VectorDatabaseType(str, Enum):
THINK_END_PATTERN = "</think>"


# Telemetry and Monitoring Configuration
ENABLE_TELEMETRY = os.getenv("ENABLE_TELEMETRY", "false").lower() == "true"
SERVICE_NAME = os.getenv("SERVICE_NAME", "nexent-backend")
JAEGER_ENDPOINT = os.getenv(
"JAEGER_ENDPOINT", "http://localhost:14268/api/traces")
PROMETHEUS_PORT = int(os.getenv("PROMETHEUS_PORT", "8000"))
TELEMETRY_SAMPLE_RATE = float(os.getenv("TELEMETRY_SAMPLE_RATE", "1.0"))

# Performance monitoring thresholds
LLM_SLOW_REQUEST_THRESHOLD_SECONDS = float(
os.getenv("LLM_SLOW_REQUEST_THRESHOLD_SECONDS", "5.0"))
LLM_SLOW_TOKEN_RATE_THRESHOLD = float(
os.getenv("LLM_SLOW_TOKEN_RATE_THRESHOLD", "10.0")) # tokens per second
# Telemetry and Monitoring Configuration (OTLP Protocol)
# Naming convention used below: a *_RAW name holds the unmodified os.getenv()
# result (None when the variable is unset), and the matching name without the
# suffix applies the default. Defaults are applied with `or`, so an empty
# string is treated the same as an unset variable.
MONITORING_PROVIDER = os.getenv("MONITORING_PROVIDER", "")
ENABLE_TELEMETRY_RAW = os.getenv("ENABLE_TELEMETRY")
# Only the string "true" (compared case-insensitively) enables telemetry.
ENABLE_TELEMETRY = (ENABLE_TELEMETRY_RAW or "false").lower() == "true"
OTEL_SERVICE_NAME_RAW = os.getenv("OTEL_SERVICE_NAME")
OTEL_SERVICE_NAME = OTEL_SERVICE_NAME_RAW or "nexent-backend"
OTEL_EXPORTER_OTLP_ENDPOINT_RAW = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT")
OTEL_EXPORTER_OTLP_ENDPOINT = OTEL_EXPORTER_OTLP_ENDPOINT_RAW or "http://localhost:4318"
# Signal-specific endpoints default to an empty string.
OTEL_EXPORTER_OTLP_TRACES_ENDPOINT = os.getenv("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT", "")
OTEL_EXPORTER_OTLP_METRICS_ENDPOINT = os.getenv("OTEL_EXPORTER_OTLP_METRICS_ENDPOINT", "")
OTEL_EXPORTER_OTLP_PROTOCOL_RAW = os.getenv("OTEL_EXPORTER_OTLP_PROTOCOL")
OTEL_EXPORTER_OTLP_PROTOCOL = OTEL_EXPORTER_OTLP_PROTOCOL_RAW or "http"
OTEL_EXPORTER_OTLP_HEADERS_RAW = os.getenv("OTEL_EXPORTER_OTLP_HEADERS")
OTEL_EXPORTER_OTLP_HEADERS = OTEL_EXPORTER_OTLP_HEADERS_RAW or ""
# Individual header values that can be supplied through their own variables.
OTEL_EXPORTER_OTLP_AUTHORIZATION = os.getenv("OTEL_EXPORTER_OTLP_AUTHORIZATION", "")
OTEL_EXPORTER_OTLP_X_API_KEY = os.getenv("OTEL_EXPORTER_OTLP_X_API_KEY", "")
OTEL_EXPORTER_OTLP_LANGFUSE_INGESTION_VERSION = os.getenv(
"OTEL_EXPORTER_OTLP_LANGFUSE_INGESTION_VERSION", "")
LANGSMITH_API_KEY = os.getenv("LANGSMITH_API_KEY", "")
LANGSMITH_PROJECT = os.getenv("LANGSMITH_PROJECT", "")
OTEL_EXPORTER_OTLP_METRICS_ENABLED_RAW = os.getenv("OTEL_EXPORTER_OTLP_METRICS_ENABLED")
# Metrics export is on unless the variable is set to something other than "true".
OTEL_EXPORTER_OTLP_METRICS_ENABLED = (
OTEL_EXPORTER_OTLP_METRICS_ENABLED_RAW or "true").lower() == "true"
MONITORING_INSTRUMENT_REQUESTS_RAW = os.getenv("MONITORING_INSTRUMENT_REQUESTS")
# Off unless the variable is explicitly set to "true".
MONITORING_INSTRUMENT_REQUESTS = (
MONITORING_INSTRUMENT_REQUESTS_RAW or "false").lower() == "true"
MONITORING_FASTAPI_INCLUDED_URLS = os.getenv("MONITORING_FASTAPI_INCLUDED_URLS", "")
MONITORING_FASTAPI_EXCLUDED_URLS = os.getenv("MONITORING_FASTAPI_EXCLUDED_URLS", "")
MONITORING_FASTAPI_EXCLUDE_SPANS = os.getenv("MONITORING_FASTAPI_EXCLUDE_SPANS", "receive,send")
MONITORING_PROJECT_NAME = os.getenv("MONITORING_PROJECT_NAME", "")
MONITORING_DASHBOARD_URL = os.getenv("MONITORING_DASHBOARD_URL", "")
MONITORING_TRACE_CONTENT_MODE = os.getenv("MONITORING_TRACE_CONTENT_MODE", "summary")
# NOTE: the two limits below are kept as strings here; no int conversion
# happens in this module.
MONITORING_TRACE_MAX_CHARS = os.getenv("MONITORING_TRACE_MAX_CHARS", "4000")
MONITORING_TRACE_MAX_ITEMS = os.getenv("MONITORING_TRACE_MAX_ITEMS", "20")
TELEMETRY_SAMPLE_RATE_RAW = os.getenv("TELEMETRY_SAMPLE_RATE")
# float() runs at import time, so a non-numeric value raises ValueError when
# this module is loaded.
TELEMETRY_SAMPLE_RATE = float(TELEMETRY_SAMPLE_RATE_RAW or "1.0")

# Parse OTLP headers into dict format
def _parse_otlp_headers(headers_str: str) -> dict:
"""Parse OTLP headers string into dict. Format: 'key1=value1,key2=value2'"""
if not headers_str:
return {}
headers = {}
for pair in headers_str.split(","):
if "=" in pair:
key, value = pair.split("=", 1)
headers[key.strip()] = value.strip()
return headers

# Final OTLP header mapping: start from the parsed OTEL_EXPORTER_OTLP_HEADERS
# string, then layer the dedicated variables on top. A header is only written
# when its variable is non-empty, and it overwrites an identically spelled key
# that came from the parsed string.
OTLP_HEADERS = _parse_otlp_headers(OTEL_EXPORTER_OTLP_HEADERS)
if OTEL_EXPORTER_OTLP_AUTHORIZATION:
OTLP_HEADERS["Authorization"] = OTEL_EXPORTER_OTLP_AUTHORIZATION
# For x-api-key, OTEL_EXPORTER_OTLP_X_API_KEY takes precedence and
# LANGSMITH_API_KEY is only used as the fallback.
if OTEL_EXPORTER_OTLP_X_API_KEY:
OTLP_HEADERS["x-api-key"] = OTEL_EXPORTER_OTLP_X_API_KEY
elif LANGSMITH_API_KEY:
OTLP_HEADERS["x-api-key"] = LANGSMITH_API_KEY
if LANGSMITH_PROJECT:
OTLP_HEADERS["Langsmith-Project"] = LANGSMITH_PROJECT
if OTEL_EXPORTER_OTLP_LANGFUSE_INGESTION_VERSION:
OTLP_HEADERS["x-langfuse-ingestion-version"] = OTEL_EXPORTER_OTLP_LANGFUSE_INGESTION_VERSION


DEFAULT_ZH_TITLE = "新对话"
Expand Down
Loading
Loading