LLM可观测性：构建生产级大模型监控体系

LLM 应用在进入生产环境后，面临着一系列独特的可观测性挑战：Token 消耗波动、推理延迟异常、输出质量退化、语义漂移等问题。传统的 APM 工具有效但不够深入，需要针对 LLM 的特性专门设计监控体系。

一、为什么 LLM 监控与众不同？

1.1 传统监控 vs LLM 监控

维度	传统 Web 服务	LLM 服务	差异原因
延迟特征	相对固定	与输入/输出长度正相关	自回归生成的成本随长度变化
失败模式	HTTP 5xx / 超时	质量退化 / 幻觉 / 内容安全	输出可能在语法上正确但语义错误
成本特征	固定 CPU/MEM 开销	Token 计费，波动巨大	不同 prompt 差异可达 1000 倍
质量评估	结构化数据验证	需要 LLM 评估 LLM	缺乏 ground truth 参照

1.2 三大监控支柱

┌─────────────────────────────────────────────┐
│             LLM 可观测性体系                   │
├─────────────────┬───────────────┬─────────────┤
│  Metrics (指标)  │  Tracing (链路)│  Logging (日志)│
├─────────────────┼───────────────┼─────────────┤
│ Token 消耗率    │ 完整请求链路   │ 输入/输出样本  │
│ 推理延迟分布     │ 各步骤耗时     │ 异常请求记录   │
│ 缓存命中率      │ 模型路由决策   │ 质量评估结果   │
│ 错误率          │ 重试与降级      │ 告警事件日志   │
│ 成本趋势        │ 服务依赖关系   │ 成本审计记录   │
└─────────────────┴───────────────┴─────────────┘

二、Metrics：核心指标

2.1 延迟指标

import time
from prometheus_client import Histogram, Counter, Gauge
from typing import List, Dict

# Prometheus metric definitions used by the snippets in this article.

# Latency histogram, labelled by model and by the kind of latency measured.
LLM_LATENCY = Histogram(
    'llm_request_latency_seconds',
    'LLM 请求延迟分布（秒）',
    # operation values: ttt, ttft, itl
    # NOTE(review): "ttt" presumably stands for total time -- confirm.
    ['model', 'operation'],
    # Per-token latencies recorded under operation="itl" are expected to be
    # tens of milliseconds (target < 50 ms). With 0.1 s as the smallest bound
    # every such sample falls into one bucket and percentiles become
    # meaningless, so sub-100 ms bounds are included. All original bounds are
    # kept so existing `le` queries continue to match.
    buckets=(
        0.01, 0.025, 0.05, 0.1, 0.25, 0.5,
        1.0, 2.0, 5.0, 10.0, 30.0, 60.0,
    )
)

# Running total of tokens, split by direction.
LLM_TOKEN_USAGE = Counter(
    'llm_token_usage_total',
    'Token 消耗统计',
    ['model', 'token_type']  # token_type: input, output
)

# Running total of spend, in dollars (per the metric name).
LLM_COST = Counter(
    'llm_cost_dollars_total',
    '累计成本',
    ['model', 'provider']
)

# Running total of failed requests by category.
LLM_ERRORS = Counter(
    'llm_errors_total',
    '错误计数',
    ['model', 'error_type']  # error_type: timeout, rate_limit, content_filter, etc.
)

# 核心延迟指标说明
"""
TTFT (Time to First Token)    首次 Token 延迟
  定义: 请求发出到收到第一个 token 的时间
  关注: 用户体验感知延迟
  目标: < 500ms (流式), < 2000ms (非流式)

ITL (Inter-Token Latency)     Token 间延迟
  定义: 相邻两个输出 token 的间隔时间
  关注: 流式输出的流畅度
  目标: < 50ms (感觉实时)

TPOT (Time Per Output Token)  每输出 Token 时间
  定义: 总生成时间 / 输出 token 数
  关注: 吞吐效率
  目标: < 30ms / token

Total (端到端延迟)
  定义: 请求到完整响应的时间
  关注: 整体服务性能
  目标: < 10s (95% 分位)
"""

2.2 成本指标

class CostTracker:
    """Track token spend per model and keep a running per-day cost total.

    ``PRICING`` maps a model name to its input/output price per one million
    tokens (the cost formula divides token counts by 1_000_000). Each recorded
    call is also pushed to the ``LLM_COST`` and ``LLM_TOKEN_USAGE`` counters.

    Only per-model and per-day aggregation is implemented here; ``user_id``
    is accepted by ``record_usage`` but not used yet.
    """

    PRICING = {
        'gpt-4o': {'input': 2.50, 'output': 10.00},
        'gpt-4o-mini': {'input': 0.15, 'output': 0.60},
        'claude-3-sonnet': {'input': 3.00, 'output': 15.00},
        'claude-3-haiku': {'input': 0.25, 'output': 1.25},
        'deepseek-v3': {'input': 0.27, 'output': 1.10},
    }

    def __init__(self, daily_budget: float = None):
        """Create a tracker.

        Args:
            daily_budget: Spend limit per day, in the same currency unit as
                ``PRICING``. ``None`` (the default) means no budget is
                configured, in which case the daily report carries
                ``budget_usage_pct = None``.
        """
        # Previously this attribute was never assigned, so get_daily_report()
        # always failed with AttributeError.
        self.daily_budget = daily_budget
        self.daily_cost = {}  # 'YYYY-MM-DD' -> accumulated cost

    def record_usage(self, model: str, input_tokens: int,
                     output_tokens: int, user_id: str = None):
        """Record the cost of one API call and return it.

        Args:
            model: Model name; looked up in ``PRICING``. Unknown models are
                priced at 0, so their tokens are counted but add no cost.
            input_tokens: Number of prompt tokens.
            output_tokens: Number of generated tokens.
            user_id: Currently unused.

        Returns:
            The cost of this call.
        """
        pricing = self.PRICING.get(model, {'input': 0, 'output': 0})

        cost = (
            input_tokens / 1_000_000 * pricing['input'] +
            output_tokens / 1_000_000 * pricing['output']
        )

        # The provider label is the first dash-separated segment of the model
        # name (e.g. 'gpt' for 'gpt-4o').
        LLM_COST.labels(model=model, provider=model.split('-')[0]).inc(cost)
        LLM_TOKEN_USAGE.labels(model=model, token_type='input').inc(input_tokens)
        LLM_TOKEN_USAGE.labels(model=model, token_type='output').inc(output_tokens)

        # Days are keyed by local time, as produced by time.strftime.
        today = time.strftime('%Y-%m-%d')
        self.daily_cost[today] = self.daily_cost.get(today, 0) + cost

        return cost

    def get_daily_report(self) -> dict:
        """Build today's cost report.

        Returns:
            A dict with ``date``, ``total_cost`` and ``budget_usage_pct``.
            ``budget_usage_pct`` is ``None`` when no (or a zero) budget is
            configured, which avoids a division by zero.
        """
        today = time.strftime('%Y-%m-%d')
        spent = self.daily_cost.get(today, 0)
        budget = self.daily_budget
        usage_pct = spent / budget * 100 if budget else None
        return {
            'date': today,
            'total_cost': spent,
            'budget_usage_pct': usage_pct,
        }

2.3 质量指标

class QualityMetrics:
    """Cheap heuristic quality scores for an LLM answer.

    ``evaluate`` produces up to four entries:
      * ``relevance``          keyword overlap between question and answer
      * ``length_efficiency``  penalty for answers that are very short/long
      * ``format_compliance``  only when ``expected_format`` is given
      * ``safety_pass``        keyword blocklist check

    These are baseline heuristics; factual-accuracy checking is not
    implemented in this class.
    """

    # Substrings that make an answer fail the safety check (matched
    # case-insensitively). Empty by default, so every answer passes until a
    # deployment populates it (on the class, a subclass, or an instance).
    BLOCKED_KEYWORDS = ()

    def evaluate(self, question: str, answer: str,
                 expected_format: str = None) -> dict:
        """Score ``answer`` against ``question`` on several dimensions.

        Args:
            question: The user question.
            answer: The model output to score.
            expected_format: Optional format name; when truthy, a
                ``format_compliance`` entry is added. Only ``"json"`` is
                supported.

        Returns:
            Dict of dimension name -> score (floats in 0-1 or booleans).

        Raises:
            ValueError: If ``expected_format`` is given but not supported.
        """
        score = {}

        score['relevance'] = self._assess_relevance(question, answer)

        score['length_efficiency'] = self._assess_length(
            question, answer
        )

        if expected_format:
            score['format_compliance'] = self._check_format(
                answer, expected_format
            )

        # Keyword filtering is only a fast first-pass check.
        score['safety_pass'] = self._safety_check(answer)

        return score

    def _assess_relevance(self, q: str, a: str) -> float:
        """Return a 0-1 relevance score from whitespace-token overlap.

        The fraction of question tokens that also appear in the answer is
        doubled and capped at 1.0. An empty question scores 1.0. Tokens come
        from ``str.split``, so text without whitespace between words is
        treated as a single token.
        """
        q_words = set(q.lower().split())
        a_words = set(a.lower().split())
        if not q_words:
            return 1.0
        overlap = len(q_words & a_words) / len(q_words)
        return min(overlap * 2, 1.0)

    def _assess_length(self, q: str, a: str) -> float:
        """Return a 0-1 score for how reasonable the answer length is.

        The reference length is five times the question length, with a floor
        of 50 characters. Answers under half the reference score 0.5, answers
        over 20 times the reference score 0.3, everything else scores 1.0.
        """
        ratio = len(a) / max(len(q) * 5, 50)
        if ratio < 0.5:
            return 0.5  # too short
        elif ratio > 20:
            return 0.3  # too long (padding)
        return 1.0

    def _check_format(self, answer: str, expected_format: str) -> bool:
        """Return True when ``answer`` conforms to ``expected_format``.

        Only ``"json"`` (case-insensitive) is supported: the answer must
        parse with ``json.loads``.

        Raises:
            ValueError: For any other ``expected_format``.
        """
        import json  # local import keeps this snippet self-contained

        if expected_format.strip().lower() != 'json':
            raise ValueError(
                f"unsupported expected_format: {expected_format!r}"
            )
        try:
            json.loads(answer)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            return False
        return True

    def _safety_check(self, answer: str) -> bool:
        """Return False when ``answer`` contains any blocked keyword."""
        lowered = answer.lower()
        return not any(
            keyword.lower() in lowered for keyword in self.BLOCKED_KEYWORDS
        )

三、Tracing：全链路追踪

3.1 请求链路

from opentelemetry import trace
from opentelemetry.trace import SpanKind
import uuid

class LLMTracer:
    """Wrap one LLM request in a small OpenTelemetry span tree.

    Spans emitted by ``trace_request``:
        llm_request (root, SpanKind.SERVER)
          ├── preprocess      placeholder; no work happens inside it yet
          ├── llm_call        wraps ``self._call_llm``
          └── quality_check   wraps ``self.quality_check``

    NOTE(review): ``_call_llm`` and ``quality_check`` are not defined on this
    class; presumably a subclass or mixin supplies them -- confirm.
    """

    def __init__(self):
        self.tracer = trace.get_tracer(__name__)

    def trace_request(self, messages, model):
        """Run one LLM request under tracing and return the LLM response.

        Args:
            messages: Passed through to ``_call_llm`` and ``quality_check``.
            model: Model name; recorded on the root span and passed to
                ``_call_llm``.

        Returns:
            Whatever ``_call_llm`` returned. The object must expose
            ``usage.completion_tokens``, which is recorded on the root span.
        """
        # Short id taken from the first 8 characters of a random UUID.
        request_id = str(uuid.uuid4())[:8]

        with self.tracer.start_as_current_span(
            "llm_request",
            kind=SpanKind.SERVER
        ) as root_span:
            root_span.set_attribute("request.id", request_id)
            root_span.set_attribute("model", model)

            # Durations use the monotonic perf_counter clock: wall-clock
            # time.time() can jump (NTP, manual changes) and yield negative
            # or inflated intervals.
            started = time.perf_counter()
            with self.tracer.start_as_current_span("preprocess"):
                # Length checks and routing decisions would go here.
                pass
            preprocess_s = time.perf_counter() - started
            root_span.set_attribute("preprocess.ms", preprocess_s * 1000)

            started = time.perf_counter()
            with self.tracer.start_as_current_span("llm_call"):
                response = self._call_llm(messages, model)
            llm_s = time.perf_counter() - started
            root_span.set_attribute("llm_call.ms", llm_s * 1000)
            root_span.set_attribute("output.tokens", response.usage.completion_tokens)

            # The quality result is not attached to any span; the call is
            # kept so its duration shows up as a child span.
            with self.tracer.start_as_current_span("quality_check"):
                self.quality_check(messages, response)

            return response

3.2 链路可视化

Trace ID: abc123def456
├── POST /v1/chat/completions    5.2s
│   ├── preprocess               12ms
│   │   ├── route_decision       3ms    → model: gpt-4o-mini
│   │   └── cache_check          8ms    → miss
│   ├── llm_call                 4.8s
│   │   ├── api_request          45ms   → HTTP POST
│   │   ├── ttft_wait            320ms  → TTFT
│   │   ├── streaming_receive    4.2s   → 150 tokens @ 28ms/tok
│   │   └── finish_reason        15ms   → stop
│   └── postprocess              280ms
│       ├── format_check         20ms   → ✅ JSON valid
│       ├── safety_check         45ms   → ✅ Passed
│       └── quality_eval         200ms  → Score: 0.85

四、Logging：智能日志系统

4.1 日志采样策略

import logging
import random

class AdaptiveSampler:
    """Decide which requests get a full log record.

    Policy:
      - failed requests: always sampled
      - slow requests (latency above ``slow_threshold``): always sampled
      - expensive requests (cost above ``cost_threshold``): always sampled
      - everything else: sampled with probability ``base_rate``
    """

    def __init__(self, base_rate=0.01, slow_threshold=5.0,
                 cost_threshold=0.1):
        """Configure the sampler.

        Args:
            base_rate: Sampling probability for ordinary requests.
            slow_threshold: Latency above which a request is always sampled;
                compared directly against the ``latency`` argument.
            cost_threshold: Cost above which a request is always sampled;
                compared directly against the ``cost`` argument.
        """
        self.base_rate = base_rate
        self.slow_threshold = slow_threshold
        self.cost_threshold = cost_threshold

    def should_sample(self, latency, cost, has_error):
        """Return True when this request should be logged.

        Both threshold comparisons are strict, so a value exactly equal to a
        threshold falls back to random sampling.
        """
        if (
            has_error
            or latency > self.slow_threshold
            or cost > self.cost_threshold
        ):
            return True

        return random.random() < self.base_rate

4.2 日志格式

{
    "timestamp": "2026-05-11T10:30:00Z",
    "request_id": "req_abc123",
    "session_id": "sess_xyz789",
    "user_id": "user_001",
    
    "request": {
        "model": "gpt-4o-mini",
        "messages_len": 1024,      # 输入字符数
        "messages_tokens": 312,    # 输入 token 数
        "temperature": 0.7,
        "max_tokens": 1000,
        "route_rule": "default"
    },
    
    "response": {
        "finish_reason": "stop",
        "output_tokens": 156,
        "output_chars": 623,
        "stop_words_hit": false
    },
    
    "performance": {
        "ttft_ms": 320,
        "itl_ms": 35,
        "total_time_ms": 5200,
        "tokens_per_second": 30.0
    },
    
    "cost": {
        "input_cost": 0.000047,
        "output_cost": 0.000094,
        "total_cost": 0.000141
    },
    
    "quality": {
        "relevance_score": 0.92,
        "format_valid": true,
        "safety_passed": true
    }
}

五、告警与仪表盘

5.1 关键告警规则

告警名称	条件	严重度	响应要求
TTFT 超标	P95 TTFT > 2s 持续 5 分钟	Critical	立即排查
Token 成本飙升	日成本 > 预算 150%	Warning	检查是否有异常调用
错误率突增	错误率 > 5% 持续 3 分钟	Critical	检查模型和网络
质量分数下降	平均质量分 < 0.6	Warning	检查模型或 prompt 变更
模型配额耗尽	高级模型配额用尽	Info	评估是否需要扩大配额
缓存命中率暴跌	命中率 < 20%（正常基线 > 50%）	Warning	检查缓存是否失效

5.2 仪表盘设计

┌──────────────────────────────────────────────────────┐
│  LLM 监控仪表盘                 2026-05-11 10:30:00   │
├───────────────┬──────────────────────────────────────┤
│  今日概览      │  实时指标                            │
├───────────────┼──────────────────────────────────────┤
│ 总请求: 12,345│  TTFT P50: 312ms   P95: 1.2s  P99: 3s│
│ 总 Token: 8.5M│  ITL Avg: 42ms     Cache: 32%        │
│ 总成本: $245  │  错误率: 1.2%    可用率: 99.8%        │
│ 预算使用: 65% │                                      │
├───────────────┴──────────────────────────────────────┤
│  成本趋势（近7天）                                     │
│  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━             │
│  $200  ┃░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░              │
│        ┃░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░              │
│  $100  ┃████░░████████░░████░░████░░                │
│        ┃████░░████████░░████░░████░░                │
│    $0  ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━              │
│        Mon  Tue  Wed  Thu  Fri  Sat  Sun              │
├──────────────────────────────────────────────────────┤
│  模型调用分布                                          │
│  ┌─────────────────────────────────────┐              │
│  │ GPT-4o-mini    ████████████████ 65% │              │
│  │ Claude Haiku   ██████████     30%  │              │
│  │ GPT-4o         ██              5%  │              │
│  └─────────────────────────────────────┘              │
└──────────────────────────────────────────────────────┘

六、工具与集成

6.1 主流方案对比

方案	适用规模	部署方式	成本	特点
Grafana + Prometheus	中大型	自建	中	灵活、全面
Datadog LLM Observability	大型	SaaS	高	开箱即用
LangSmith	中小型	SaaS	中	LLM 原生
自建 (Python SDK)	小型	自建	低	灵活定制

6.2 快速集成

# minimal_observability.py
# 一个简约版 LLM 可观测性 SDK

import functools
import json
import time
from datetime import datetime

class LLMObservability:
    """Minimal observability helper: one JSON line per decorated call.

    Each record contains the timestamp, function name, duration, success
    flag, error text, and the ``model`` / ``input_tokens`` /
    ``output_tokens`` keyword arguments of the call.
    """

    def __init__(self, log_file="llm_observations.jsonl", reraise=False):
        """Configure the observer.

        Args:
            log_file: Path of the JSON-lines file that records are appended to.
            reraise: When False (the default, matching the historical
                behaviour) an exception raised by the observed function is
                logged and swallowed, and the wrapper returns ``None``.
                When True the exception is logged and then re-raised.
        """
        self.log_file = log_file
        self.reraise = reraise

    def observe(self, func):
        """Decorator that records every call of ``func`` to the log file.

        ``model``, ``input_tokens`` and ``output_tokens`` are read from the
        call's keyword arguments only; values passed positionally or left to
        the function's own defaults are not seen (``model`` then logs as
        ``"unknown"``).
        """
        # wraps() keeps __name__/__doc__ of the decorated function intact.
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # Monotonic clock: immune to wall-clock adjustments.
            started = time.perf_counter()
            try:
                result = func(*args, **kwargs)
            except Exception as exc:
                self._write(func, started, kwargs, False, str(exc))
                if self.reraise:
                    raise
                return None
            self._write(func, started, kwargs, True, None)
            return result

        return wrapper

    def _write(self, func, started, kwargs, success, error):
        """Append one observation record for a finished call."""
        duration = time.perf_counter() - started
        observation = {
            # NOTE: datetime.utcnow() is deprecated since Python 3.12; it is
            # kept so the naive-UTC timestamp format stays unchanged.
            "timestamp": datetime.utcnow().isoformat(),
            "function": func.__name__,
            "duration_ms": round(duration * 1000, 2),
            "success": success,
            "error": error,
            "model": kwargs.get("model", "unknown"),
            "input_tokens": kwargs.get("input_tokens"),
            "output_tokens": kwargs.get("output_tokens"),
        }

        # ensure_ascii=False emits raw non-ASCII text, so the file encoding
        # must be explicit; the platform default may not be able to encode it.
        with open(self.log_file, "a", encoding="utf-8") as f:
            f.write(json.dumps(observation, ensure_ascii=False) + "\n")

# Usage example
obs = LLMObservability()


class LLMClient:
    """Example client whose ``chat`` calls are recorded through ``obs``."""

    @obs.observe
    def chat(self, messages, model="gpt-4o-mini", **kwargs):
        """Placeholder for the real LLM call; currently returns ``None``."""
        return None

七、实施路线图

阶段	内容	时间	产出
Phase 1	基础日志 + 关键指标	1-2天	接入 LLMObservability 类
Phase 2	延迟 + 成本 + 错误率仪表盘	3天	Grafana 面板
Phase 3	全链路追踪	1周	请求排障能力
Phase 4	质量评估 + 自动告警	2周	质量下降自动发现
Phase 5	成本预算 + 智能路由	3周	成本自动优化

总结

LLM 可观测性是 AI 工程化从“能用”到“好用”的关键跃迁。通过系统化的日志、指标和追踪体系，团队可以：

快速排障：当用户反馈“回复变差了”，几秒钟定位根因
持续优化：基于数据驱动降低延迟和成本，提升质量
成本透明：每个部门、每个功能的 Token 消耗一目了然
质量保障：自动检测质量退化，在影响用户之前发现