BodySense 可观测性设计
BodySense 项目的可观测性设计:结构化日志、Prometheus 指标、Grafana 仪表盘、告警规则、分布式追踪。
#type / concept
#status / growing
#tech / ops
#tech / dev / backend
[!info] related notes
- 知识地图: BodySense 项目 MOC
- 部署: Docker Compose
- 健康检查: 健康检查端点
BodySense 可观测性设计
可观测性三支柱
┌─────────────────────────────────────────────────────────┐
│ 可观测性 │
│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
│ │ 日志 │ │ 指标 │ │ 追踪 │ │
│ │ Logs │ │ Metrics │ │ Traces │ │
│ └─────────────┘ └─────────────┘ └─────────────┘ │
│ │ │ │ │
│ ▼ ▼ ▼ │
│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
│ │ Loki │ │ Prometheus │ │ Jaeger │ │
│ └─────────────┘ └─────────────┘ └─────────────┘ │
│ │ │ │ │
│ └───────────────┼───────────────┘ │
│ ▼ │
│ ┌─────────────┐ │
│ │ Grafana │ │
│ └─────────────┘ │
└─────────────────────────────────────────────────────────┘
1. 结构化日志
Go 后端日志
// logger/logger.go
package logger
import (
"log/slog"
"os"
)
// Logger wraps *slog.Logger so that request-scoped helpers can be added while
// every slog method remains available through embedding.
type Logger struct {
	*slog.Logger
}

// New returns a Logger that writes JSON records to stdout, dropping records
// below the given level.
//
// level is parsed with slog.Level.UnmarshalText, so "debug", "info", "warn"
// and "error" are accepted in any letter case (for example "INFO" or "Warn").
// Any value that cannot be parsed, including the empty string, falls back to
// the info level instead of failing.
func New(level string) *Logger {
	logLevel := slog.LevelInfo
	var parsed slog.Level
	if err := parsed.UnmarshalText([]byte(level)); err == nil {
		logLevel = parsed
	}
	handler := slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{
		Level: logLevel,
	})
	return &Logger{Logger: slog.New(handler)}
}

// with returns a child Logger that adds one string attribute to every record.
func (l *Logger) with(key, value string) *Logger {
	return &Logger{Logger: l.Logger.With(slog.String(key, value))}
}

// WithRequestID returns a child Logger whose records carry request_id.
func (l *Logger) WithRequestID(requestID string) *Logger {
	return l.with("request_id", requestID)
}

// WithUserID returns a child Logger whose records carry user_id.
func (l *Logger) WithUserID(userID string) *Logger {
	return l.with("user_id", userID)
}

// WithSessionID returns a child Logger whose records carry session_id.
func (l *Logger) WithSessionID(sessionID string) *Logger {
	return l.with("session_id", sessionID)
}
日志级别使用
// Debug: development-time diagnostics.
// NOTE(review): message is logged verbatim; confirm that raw user text is
// acceptable in debug logs.
logger.Debug("processing message",
	slog.String("session_id", sessionID),
	slog.String("message", message),
)
// Info: normal operation.
logger.Info("user logged in",
	slog.String("user_id", userID),
	slog.String("email", email),
)
// Warn: something is approaching a limit but still working.
logger.Warn("rate limit approaching",
	slog.String("client_ip", clientIP),
	slog.Int64("current", current),
	slog.Int("limit", limit),
)
// Error: the operation failed.
// The error must be wrapped in an attribute: slog treats a bare non-string
// argument as a value without a key and emits it under "!BADKEY".
logger.Error("failed to create session",
	slog.Any("error", err),
	slog.String("user_id", userID),
)
日志格式示例
{
"time": "2026-06-25T10:00:00Z",
"level": "INFO",
"msg": "user logged in",
"request_id": "req-123",
"user_id": "user-456",
"email": "test@example.com",
"method": "POST",
"path": "/api/v1/auth/login",
"status": 200,
"duration_ms": 150
}
Python AI 服务日志
# logger/logger.py
import structlog
import logging
def setup_logging(level: str = "INFO"):
    """Configure structlog to render JSON log lines filtered at ``level``.

    Args:
        level: Name of a stdlib ``logging`` level ("DEBUG", "INFO", ...),
            matched case-insensitively. A name that does not resolve to an
            integer level falls back to ``logging.INFO`` instead of raising,
            so a mistyped setting cannot abort service start-up.
    """
    threshold = getattr(logging, level.upper(), None)
    if not isinstance(threshold, int):
        # Unknown name, or a non-level attribute of the logging module.
        threshold = logging.INFO

    structlog.configure(
        processors=[
            structlog.contextvars.merge_contextvars,
            structlog.processors.add_log_level,
            structlog.processors.TimeStamper(fmt="iso"),
            structlog.processors.JSONRenderer(),
        ],
        wrapper_class=structlog.make_filtering_bound_logger(threshold),
    )
def get_logger(**kwargs):
    """Return a structlog logger, forwarding ``kwargs`` to ``structlog.get_logger``."""
    bound_logger = structlog.get_logger(**kwargs)
    return bound_logger
# Usage: session_id is handed to get_logger once, then a message is logged
# with an extra key/value pair.
logger = get_logger(session_id="session-123")
logger.info("processing message", message="我肩膀疼")
2. Prometheus 指标
指标类型
| 类型 | 用途 | 示例 |
|---|---|---|
| Counter | 单调递增 | 请求总数、错误总数 |
| Gauge | 可增可减 | 当前连接数、内存使用 |
| Histogram | 分布统计 | 请求延迟分布 |
| Summary | 分位数 | P99 延迟 |
Go 后端指标
// metrics/metrics.go
package metrics
import (
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
)
// Package-level Prometheus collectors, all created through promauto.
// Labelled collectors list their label names in the trailing string slice.
var (
	// HTTP request metrics.

	// httpRequestsTotal counts HTTP requests by method, path and status.
	httpRequestsTotal = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Name: "http_requests_total",
			Help: "Total number of HTTP requests",
		},
		[]string{"method", "path", "status"},
	)
	// httpRequestDuration observes request duration in seconds by method and
	// path, with buckets from 10ms to 5s.
	httpRequestDuration = promauto.NewHistogramVec(
		prometheus.HistogramOpts{
			Name: "http_request_duration_seconds",
			Help: "HTTP request duration in seconds",
			Buckets: []float64{0.01, 0.05, 0.1, 0.5, 1, 5},
		},
		[]string{"method", "path"},
	)
	// AI service metrics.

	// llmRequestsTotal counts LLM requests by model and status.
	llmRequestsTotal = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Name: "llm_requests_total",
			Help: "Total number of LLM requests",
		},
		[]string{"model", "status"},
	)
	// llmRequestDuration observes LLM request duration in seconds by model,
	// with buckets from 0.5s to 30s.
	llmRequestDuration = promauto.NewHistogramVec(
		prometheus.HistogramOpts{
			Name: "llm_request_duration_seconds",
			Help: "LLM request duration in seconds",
			Buckets: []float64{0.5, 1, 2, 5, 10, 30},
		},
		[]string{"model"},
	)
	// llmTokensUsed counts tokens by model and token type.
	llmTokensUsed = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Name: "llm_tokens_used_total",
			Help: "Total number of tokens used",
		},
		[]string{"model", "type"}, // type: prompt/completion
	)
	// Business metrics.

	// activeSessions is an unlabelled gauge of active consultation sessions.
	activeSessions = promauto.NewGauge(
		prometheus.GaugeOpts{
			Name: "active_sessions",
			Help: "Number of active consultation sessions",
		},
	)
	// redFlagsDetected is an unlabelled counter of detected red flags.
	redFlagsDetected = promauto.NewCounter(
		prometheus.CounterOpts{
			Name: "red_flags_detected_total",
			Help: "Total number of red flags detected",
		},
	)
	// RAG metrics.

	// ragSearchDuration observes RAG search duration in seconds by search
	// type, with buckets from 10ms to 1s.
	ragSearchDuration = promauto.NewHistogramVec(
		prometheus.HistogramOpts{
			Name: "rag_search_duration_seconds",
			Help: "RAG search duration in seconds",
			Buckets: []float64{0.01, 0.05, 0.1, 0.5, 1},
		},
		[]string{"type"}, // type: vector/keyword
	)
	// ragSearchResults observes how many results a RAG search returned.
	ragSearchResults = promauto.NewHistogram(
		prometheus.HistogramOpts{
			Name: "rag_search_results",
			Help: "Number of RAG search results",
			Buckets: []float64{0, 1, 5, 10, 20},
		},
	)
)
指标记录
// middleware/metrics.go

// MetricsMiddleware returns a gin handler that records, for every request,
// one increment of httpRequestsTotal and one observation of
// httpRequestDuration.
//
// The path label is the matched route template from c.FullPath(), not the
// raw URL, which keeps label cardinality bounded. gin returns an empty
// template for requests that matched no route; those are recorded under the
// fixed label "unmatched" instead of an empty path.
func MetricsMiddleware() gin.HandlerFunc {
	return func(c *gin.Context) {
		start := time.Now()
		c.Next()
		duration := time.Since(start).Seconds()

		route := c.FullPath()
		if route == "" {
			route = "unmatched"
		}
		method := c.Request.Method
		// The status is read after c.Next() so it reflects what the handlers wrote.
		status := strconv.Itoa(c.Writer.Status())

		httpRequestsTotal.WithLabelValues(method, route, status).Inc()
		httpRequestDuration.WithLabelValues(method, route).Observe(duration)
	}
}
// AI service metrics.

// callLLM sends messages to the given model through p.client and records
// Prometheus metrics for the call: the duration is always observed, the
// request counter is incremented with status "error" or "success", and on
// success the prompt and completion token counts are added to llmTokensUsed.
// It returns the response content, or an empty string and the client error.
func (p *Provider) callLLM(ctx context.Context, model string, messages []Message) (string, error) {
	began := time.Now()
	resp, err := p.client.ChatCompletion(ctx, model, messages)
	elapsed := time.Since(began).Seconds()
	llmRequestDuration.WithLabelValues(model).Observe(elapsed)

	outcome := "success"
	if err != nil {
		outcome = "error"
	}
	llmRequestsTotal.WithLabelValues(model, outcome).Inc()
	if err != nil {
		return "", err
	}

	promptTokens := float64(resp.Usage.PromptTokens)
	completionTokens := float64(resp.Usage.CompletionTokens)
	llmTokensUsed.WithLabelValues(model, "prompt").Add(promptTokens)
	llmTokensUsed.WithLabelValues(model, "completion").Add(completionTokens)
	return resp.Content, nil
}
Prometheus 配置
# prometheus.yml
global:
  scrape_interval: 15s
  evaluation_interval: 15s

# Load the alerting rules. Without rule_files Prometheus never evaluates
# them, even when the rules directory is mounted into the container.
rule_files:
  - /etc/prometheus/alerts/*.yml

# Where firing alerts are sent. Requires an Alertmanager reachable at this
# address on the same network.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']

scrape_configs:
  - job_name: 'bodysense-api'
    static_configs:
      - targets: ['api:8080']
    metrics_path: '/metrics'
  - job_name: 'bodysense-ai'
    static_configs:
      - targets: ['ai-service:8100']
    metrics_path: '/metrics'
  - job_name: 'postgres'
    static_configs:
      - targets: ['postgres-exporter:9187']
  - job_name: 'redis'
    static_configs:
      - targets: ['redis-exporter:9121']
3. Grafana 仪表盘
API 仪表盘
{
"dashboard": {
"title": "BodySense API",
"panels": [
{
"title": "Request Rate",
"type": "graph",
"targets": [
{
"expr": "rate(http_requests_total[5m])",
"legendFormat": "{{method}} {{path}} {{status}}"
}
]
},
{
"title": "Request Duration P99",
"type": "graph",
"targets": [
{
"expr": "histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m]))",
"legendFormat": "{{method}} {{path}}"
}
]
},
{
"title": "Error Rate",
"type": "graph",
"targets": [
{
"expr": "sum(rate(http_requests_total{status=~\"5..\"}[5m])) / sum(rate(http_requests_total[5m]))",
"legendFormat": "Error Rate"
}
]
}
]
}
}
AI 服务仪表盘
{
"dashboard": {
"title": "BodySense AI Service",
"panels": [
{
"title": "LLM Request Rate",
"type": "graph",
"targets": [
{
"expr": "rate(llm_requests_total[5m])",
"legendFormat": "{{model}} {{status}}"
}
]
},
{
"title": "LLM Token Usage",
"type": "graph",
"targets": [
{
"expr": "rate(llm_tokens_used_total[5m])",
"legendFormat": "{{model}} {{type}}"
}
]
},
{
"title": "LLM Latency P95",
"type": "graph",
"targets": [
{
"expr": "histogram_quantile(0.95, rate(llm_request_duration_seconds_bucket[5m]))",
"legendFormat": "{{model}}"
}
]
},
{
"title": "Red Flags Detected",
"type": "stat",
"targets": [
{
"expr": "increase(red_flags_detected_total[24h])",
"legendFormat": "24h"
}
]
}
]
}
}
4. 告警规则
告警配置
# alerts/rules.yml
groups:
- name: api_alerts
  rules:
    # Both sides are summed before dividing. Dividing the raw per-series
    # rates only pairs series with identical labels, so each 5xx series would
    # be divided by itself and the ratio would always be 1.
    - alert: HighErrorRate
      expr: sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) > 0.05
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "High error rate detected"
        description: "Error rate is {{ $value | humanizePercentage }}"
    - alert: HighLatency
      expr: histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m])) > 2
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "High latency detected"
        description: "P99 latency is {{ $value }}s"
    - alert: ServiceDown
      expr: up{job="bodysense-api"} == 0
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: "API service is down"
- name: ai_alerts
  rules:
    # Aggregated per model on both sides. Without aggregation only the
    # status="error" series match each other and the ratio is always 1.
    - alert: LLMHighErrorRate
      expr: sum by (model) (rate(llm_requests_total{status="error"}[5m])) / sum by (model) (rate(llm_requests_total[5m])) > 0.1
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "LLM high error rate"
    - alert: LLMHighLatency
      expr: histogram_quantile(0.95, rate(llm_request_duration_seconds_bucket[5m])) > 10
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "LLM high latency"
    # increase() gives the count over the window (threshold: 10 per hour).
    # rate() is per second, so "rate(...) > 10" would mean 10 red flags
    # every second.
    - alert: HighRedFlagRate
      expr: sum(increase(red_flags_detected_total[1h])) > 10
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: "High red flag detection rate"
- name: infrastructure_alerts
  rules:
    # process_cpu_seconds_total is a cumulative counter, so comparing it
    # directly fires permanently once the process has used 0.8 CPU-seconds.
    # rate() turns it into CPU-seconds per second, i.e. cores in use.
    - alert: HighCPU
      expr: rate(process_cpu_seconds_total[5m]) > 0.8
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "High CPU usage"
    # Resident memory in MiB.
    - alert: HighMemory
      expr: process_resident_memory_bytes / 1024 / 1024 > 512
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "High memory usage"
    # Fraction of the filesystem still available.
    - alert: DiskSpaceLow
      expr: node_filesystem_avail_bytes / node_filesystem_size_bytes < 0.1
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: "Low disk space"
Alertmanager 配置
# alertmanager.yml
global:
  slack_api_url: 'https://hooks.slack.com/services/xxx'
route:
  group_by: ['alertname', 'severity']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  # Default receiver for everything not matched by a child route.
  receiver: 'slack'
  routes:
    # Critical alerts go to the pager. Without this child route the 'pager'
    # receiver below is defined but can never be selected.
    - matchers:
        - severity="critical"
      receiver: 'pager'
receivers:
  - name: 'slack'
    slack_configs:
      - channel: '#alerts'
        title: '{{ .GroupLabels.alertname }}'
        # The line break is produced inside the template. A bare \n in a
        # single-quoted YAML string is sent as a literal backslash and "n".
        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ "\n" }}{{ end }}'
  - name: 'pager'
    pagerduty_configs:
      - service_key: 'xxx'
5. 分布式追踪
OpenTelemetry 集成
// tracing/tracing.go
package tracing
import (
"context"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/exporters/jaeger"
"go.opentelemetry.io/otel/sdk/resource"
sdktrace "go.opentelemetry.io/otel/sdk/trace"
semconv "go.opentelemetry.io/otel/semconv/v1.21.0"
)
// InitTracer builds a tracer provider that batches spans to the Jaeger
// collector endpoint at jaegerURL, tags them with serviceName as the service
// name resource attribute, and installs the provider as the global
// OpenTelemetry tracer provider. It returns the provider, or the error from
// creating the exporter.
//
// NOTE(review): the go.opentelemetry.io/otel/exporters/jaeger module appears
// to be deprecated upstream in favour of OTLP; confirm before adopting.
func InitTracer(serviceName, jaegerURL string) (*sdktrace.TracerProvider, error) {
	endpoint := jaeger.WithCollectorEndpoint(jaeger.WithEndpoint(jaegerURL))
	exp, err := jaeger.New(endpoint)
	if err != nil {
		return nil, err
	}

	res := resource.NewWithAttributes(
		semconv.SchemaURL,
		semconv.ServiceNameKey.String(serviceName),
	)
	provider := sdktrace.NewTracerProvider(
		sdktrace.WithBatcher(exp),
		sdktrace.WithResource(res),
	)
	otel.SetTracerProvider(provider)
	return provider, nil
}
追踪中间件
// middleware/tracing.go

// TracingMiddleware returns a gin handler that wraps each request in a span
// created from the "http" tracer and stores the span's context on the
// request, so downstream handlers can start child spans from it.
//
// The span is named after the matched route template. Requests that matched
// no route have an empty template and are named "unmatched" instead. After
// the handlers have run, the HTTP method and the response status code are
// attached; the status code is recorded as an integer attribute.
func TracingMiddleware() gin.HandlerFunc {
	return func(c *gin.Context) {
		name := c.FullPath()
		if name == "" {
			name = "unmatched"
		}

		tracer := otel.Tracer("http")
		ctx, span := tracer.Start(c.Request.Context(), name)
		defer span.End()

		c.Request = c.Request.WithContext(ctx)
		c.Next()

		// Set after c.Next() so the status reflects what the handlers wrote.
		span.SetAttributes(
			attribute.String("http.method", c.Request.Method),
			attribute.Int("http.status_code", c.Writer.Status()),
		)
	}
}
跨服务追踪
// Propagating the trace into the LLM call made by the AI service.

// callLLM sends messages to the given model inside an "llm.call" span that
// is a child of whatever span ctx carries. The model name is attached as the
// llm.model attribute. On failure the error is recorded on the span, the
// span status is set to Error, and an empty string is returned with the
// error.
func (p *Provider) callLLM(ctx context.Context, model string, messages []Message) (string, error) {
	tracer := otel.Tracer("ai-service")
	ctx, span := tracer.Start(ctx, "llm.call")
	defer span.End()

	span.SetAttributes(
		attribute.String("llm.model", model),
	)

	// Call the LLM with the span's context so the client can continue the trace.
	response, err := p.client.ChatCompletion(ctx, model, messages)
	if err != nil {
		span.RecordError(err)
		span.SetStatus(codes.Error, err.Error())
		// Return early: response is not guaranteed to be usable after a
		// failed call, so its fields must not be read here.
		return "", err
	}
	return response.Content, nil
}
6. 日志聚合
Loki 配置
# loki.yml
# Loki configuration using local filesystem storage under /loki.
auth_enabled: false
server:
  http_listen_port: 3100
common:
  path_prefix: /loki
  storage:
    filesystem:
      chunks_directory: /loki/chunks
      rules_directory: /loki/rules
  # One copy of the data, with the ring state kept in memory.
  replication_factor: 1
  ring:
    kvstore:
      store: inmemory
schema_config:
  configs:
    # TSDB index (schema v13) on the filesystem, one index period per 24h.
    - from: 2020-10-24
      store: tsdb
      object_store: filesystem
      schema: v13
      index:
        prefix: index_
        period: 24h
Promtail 配置
# promtail.yml
server:
  http_listen_port: 9080
# File in which Promtail records how far it has read.
# NOTE(review): /tmp is usually not persisted across container restarts; confirm.
positions:
  filename: /tmp/positions.yaml
# Loki push endpoint.
clients:
  - url: http://loki:3100/loki/api/v1/push
scrape_configs:
  # Discover containers through the Docker socket, refreshing every 5s.
  - job_name: containers
    docker_sd_configs:
      - host: unix:///var/run/docker.sock
        refresh_interval: 5s
    relabel_configs:
      # Docker reports names as "/name"; the capture group keeps the part
      # after the leading slash as the "container" label.
      - source_labels: ['__meta_docker_container_name']
        regex: '/(.*)'
        target_label: 'container'
7. 关键指标
业务指标
| 指标 | 说明 | 告警阈值 |
|---|---|---|
| active_sessions | 当前活跃会话数 | > 1000 |
| red_flags_detected_total | Red Flag 检测总数 | > 10/hour |
| consultation_duration_seconds | 咨询时长 | > 30min |
| user_retention_rate | 用户留存率 | < 50% |
技术指标
| 指标 | 说明 | 告警阈值 |
|---|---|---|
| http_requests_total | HTTP 请求总数 | - |
| http_request_duration_seconds | 请求延迟 | P99 > 2s |
| http_requests_total{status=~"5.."} | 5xx 错误率(占全部请求的比例) | > 5% |
| llm_request_duration_seconds | LLM 请求延迟 | P95 > 10s |
| llm_tokens_used_total | Token 使用量 | - |
基础设施指标
| 指标 | 说明 | 告警阈值 |
|---|---|---|
| up | 服务存活 | == 0 |
| rate(process_cpu_seconds_total[5m]) | CPU 使用率(该指标是累计计数器,需取 rate) | > 0.8 核(80%) |
| process_resident_memory_bytes | 内存使用 | > 512MB |
| node_filesystem_avail_bytes | 磁盘剩余 | < 10% |
8. Docker Compose 集成
# docker-compose.monitoring.yml
version: '3.8'
services:
  prometheus:
    image: prom/prometheus:latest
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      - ./alerts:/etc/prometheus/alerts
      - prometheus-data:/prometheus
    ports:
      - '9090:9090'
  # Receives alerts from Prometheus and delivers notifications according to
  # alertmanager.yml.
  alertmanager:
    image: prom/alertmanager:latest
    volumes:
      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
    ports:
      - '9093:9093'
  grafana:
    image: grafana/grafana:latest
    volumes:
      - grafana-data:/var/lib/grafana
      - ./grafana/dashboards:/etc/grafana/provisioning/dashboards
      - ./grafana/datasources:/etc/grafana/provisioning/datasources
    ports:
      - '3000:3000'
    environment:
      # Set GRAFANA_ADMIN_PASSWORD in the environment or .env file. The
      # default "admin" is only suitable for local development.
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin}
  loki:
    image: grafana/loki:latest
    volumes:
      - loki-data:/loki
    ports:
      - '3100:3100'
  promtail:
    image: grafana/promtail:latest
    volumes:
      - ./promtail.yml:/etc/promtail/config.yml
      - /var/log:/var/log
      # Lets Promtail query the Docker daemon to discover containers.
      - /var/run/docker.sock:/var/run/docker.sock
  jaeger:
    image: jaegertracing/all-in-one:latest
    ports:
      - '16686:16686'
      - '6831:6831/udp'
volumes:
  prometheus-data:
  grafana-data:
  loki-data:
常见面试问题
Q: 你的可观测性方案是什么?
A:
- 日志:结构化 JSON 日志,Loki 聚合,Grafana 查询
- 指标:Prometheus 采集,Grafana 仪表盘,Alertmanager 告警
- 追踪:OpenTelemetry + Jaeger,跨服务追踪
Q: 怎么监控 AI 服务质量?
A:
- 延迟监控:LLM 请求延迟 P95/P99
- 错误率:LLM 请求失败率
- Token 用量:成本核算
- Red Flag 率:安全指标
- Faithfulness 率:幻觉指标
Q: 线上出问题怎么排查?
A:
- Grafana 仪表盘:查看错误率、延迟变化
- 日志查询:Loki 搜索错误日志
- 链路追踪:Jaeger 查看请求链路
- 告警通知:Slack/PagerDuty 接收告警
常见错误
日志不结构化
// ❌ Plain-text log: the values are baked into one formatted string.
log.Printf("user %s logged in from %s", userID, ip)
// ✅ Structured log: each value is emitted as its own named attribute.
logger.Info("user logged in",
	slog.String("user_id", userID),
	slog.String("client_ip", ip),
)
指标标签过多
// ❌ Label-value explosion: the raw URL path contains dynamic segments, so
// every distinct ID creates a new time series.
httpRequestsTotal.WithLabelValues(
	c.Request.Method,
	c.Request.URL.Path, // concrete path, e.g. /api/v1/users/123
	status,
).Inc()
// ✅ Use the route pattern: one series per registered route.
httpRequestsTotal.WithLabelValues(
	c.Request.Method,
	c.FullPath(), // matched route template, e.g. /api/v1/users/:id
	status,
).Inc()
告警风暴
# ❌ No grouping or de-duplication: only a rule is defined, with nothing
# controlling how its notifications are batched or repeated.
groups:
  - name: alerts
    rules:
      - alert: HighErrorRate
        expr: rate(http_requests_total{status="500"}[5m]) > 0.1
# ✅ Grouping and de-duplication: alerts sharing alertname and severity are
# batched together, and a notification is repeated at most once per hour.
route:
  group_by: ['alertname', 'severity']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h