From 0c0ec53a50029f0ff8d8fe857f90e8b378996901 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=BE=84=E6=BD=AD?=
Date: Sun, 1 Feb 2026 11:54:52 +0800
Subject: [PATCH] feat(ai-statistics): support token details and builtin keys
 for reasoning_tokens/cached_tokens (#3424)

---
 .../extensions/ai-statistics/README.md        |  60 ++++++++
 .../extensions/ai-statistics/README_EN.md     |  60 ++++++++
 .../wasm-go/extensions/ai-statistics/main.go  | 145 ++++++++++++++++--
 .../extensions/ai-statistics/main_test.go     |  95 ++++++++++++
 4 files changed, 349 insertions(+), 11 deletions(-)

diff --git a/plugins/wasm-go/extensions/ai-statistics/README.md b/plugins/wasm-go/extensions/ai-statistics/README.md
index 6bf3c6663..4523d13c9 100644
--- a/plugins/wasm-go/extensions/ai-statistics/README.md
+++ b/plugins/wasm-go/extensions/ai-statistics/README.md
@@ -60,6 +60,27 @@ Attribute 配置说明:
 - `replace`:多个 chunk 中取最后一个有效 chunk 的值
 - `append`:拼接多个有效 chunk 中的值,可用于获取回答内容
 
+### 内置属性 (Built-in Attributes)
+
+插件提供了一些内置属性键(key),可以直接使用而无需配置 `value_source` 和 `value`。这些内置属性会自动从请求/响应中提取相应的值:
+
+| 内置属性键 | 说明 | 适用场景 |
+|---------|------|---------|
+| `question` | 用户提问内容 | 支持 OpenAI/Claude 消息格式 |
+| `answer` | AI 回答内容 | 支持 OpenAI/Claude 消息格式,流式和非流式 |
+| `tool_calls` | 工具调用信息 | OpenAI/Claude 工具调用 |
+| `reasoning` | 推理过程 | OpenAI o1 等推理模型 |
+| `reasoning_tokens` | 推理 token 数(如 o1 模型) | OpenAI Chat Completions,从 `output_token_details.reasoning_tokens` 提取 |
+| `cached_tokens` | 缓存命中的 token 数 | OpenAI Chat Completions,从 `input_token_details.cached_tokens` 提取 |
+| `input_token_details` | 输入 token 详细信息(完整对象) | OpenAI/Gemini/Anthropic,包含缓存、工具使用等详情 |
+| `output_token_details` | 输出 token 详细信息(完整对象) | OpenAI/Gemini/Anthropic,包含推理 token、生成图片数等详情 |
+
+使用内置属性时,只需设置 `key`、`apply_to_log` 等参数,无需设置 `value_source` 和 `value`。
+
+**注意**:
+- `reasoning_tokens` 和 `cached_tokens` 是从 token details 中提取的便捷字段,适用于 OpenAI Chat Completions API
+- `input_token_details` 和 `output_token_details` 会以 JSON 字符串形式记录完整的 token 详情对象
+
 ## 配置示例
 
 如果希望在网关访问日志中记录 ai-statistic 相关的统计值,需要修改 log_format,在原 log_format 基础上添加一个新字段,示例如下:
 
@@ -272,6 +293,45 @@ attributes:
 
 插件会自动按 `index` 字段识别每个独立的工具调用,拼接分片返回的 `arguments` 字符串,最终输出完整的工具调用列表。
 
+### 记录 Token 详情
+
+使用内置属性记录 OpenAI Chat Completions 的 token 详细信息:
+
+```yaml
+attributes:
+  # 使用便捷的内置属性提取特定字段
+  - key: reasoning_tokens # 推理 token 数(o1 等推理模型)
+    apply_to_log: true
+  - key: cached_tokens # 缓存命中的 token 数
+    apply_to_log: true
+  # 记录完整的 token 详情对象
+  - key: input_token_details
+    apply_to_log: true
+  - key: output_token_details
+    apply_to_log: true
+```
+
+#### 日志示例
+
+对于使用了 prompt caching 和推理模型的请求,日志可能如下:
+
+```json
+{
+  "ai_log": "{\"model\":\"gpt-4o\",\"input_token\":\"100\",\"output_token\":\"50\",\"reasoning_tokens\":\"25\",\"cached_tokens\":\"80\",\"input_token_details\":\"{\\\"cached_tokens\\\":80}\",\"output_token_details\":\"{\\\"reasoning_tokens\\\":25}\",\"llm_service_duration\":\"2000\"}"
+}
+```
+
+其中:
+- `reasoning_tokens`: 25,推理过程产生的 token 数
+- `cached_tokens`: 80,从缓存中读取的 token 数
+- `input_token_details`: 完整的输入 token 详情(JSON 格式)
+- `output_token_details`: 完整的输出 token 详情(JSON 格式)
+
+这些详情可用于:
+1. **成本优化**:了解缓存命中率,优化 prompt caching 策略
+2. **性能分析**:分析推理 token 占比,评估推理模型的实际开销
+3. **使用统计**:细粒度统计各类 token 的使用情况
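+
+#### 将 Token 详情输出为独立日志字段(示意)
+
+以下为一个示意配置:基于插件已有的 `as_separate_log_field` 参数(具体字段名以上文 Attribute 配置说明为准),可以将 token 详情单独输出为一个日志字段,而不是合并进 `ai_log`:
+
+```yaml
+attributes:
+  - key: input_token_details
+    apply_to_log: true
+    as_separate_log_field: true # 需要在 log_format 中添加对应的日志字段
+```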
+
 ## 调试
 
 ### 验证 ai_log 内容
diff --git a/plugins/wasm-go/extensions/ai-statistics/README_EN.md b/plugins/wasm-go/extensions/ai-statistics/README_EN.md
index 32d4c2489..bff24060f 100644
--- a/plugins/wasm-go/extensions/ai-statistics/README_EN.md
+++ b/plugins/wasm-go/extensions/ai-statistics/README_EN.md
@@ -60,6 +60,27 @@ When `value_source` is `response_streaming_body`, `rule` should be configured to
 - `replace`: extract value from the last valid chunk
 - `append`: join value pieces from all valid chunks
 
+### Built-in Attributes
+
+The plugin provides several built-in attribute keys that can be used directly without configuring `value_source` and `value`. These built-in attributes automatically extract the corresponding values from requests/responses:
+
+| Built-in Key | Description | Use Case |
+|--------------|-------------|----------|
+| `question` | User's question content | Supports OpenAI/Claude message formats |
+| `answer` | AI's answer content | Supports OpenAI/Claude message formats, both streaming and non-streaming |
+| `tool_calls` | Tool call information | OpenAI/Claude tool calls |
+| `reasoning` | Reasoning process | OpenAI o1 and other reasoning models |
+| `reasoning_tokens` | Number of reasoning tokens (e.g., o1 model) | OpenAI Chat Completions, extracted from `output_token_details.reasoning_tokens` |
+| `cached_tokens` | Number of cached tokens | OpenAI Chat Completions, extracted from `input_token_details.cached_tokens` |
+| `input_token_details` | Complete input token details (object) | OpenAI/Gemini/Anthropic, includes cache, tool usage, etc. |
+| `output_token_details` | Complete output token details (object) | OpenAI/Gemini/Anthropic, includes reasoning tokens, generated images, etc. |
+
+When using built-in attributes, you only need to set `key`, `apply_to_log`, etc., without setting `value_source` and `value`.
+
+**Notes**:
+- `reasoning_tokens` and `cached_tokens` are convenience fields extracted from the token details, applicable to the OpenAI Chat Completions API
+- `input_token_details` and `output_token_details` record the complete token details object as a JSON string
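+
+For example, the following sketch (illustrative; check the attribute parameters above for the exact fields your version supports) contrasts a built-in key with a regular custom attribute:
+
+```yaml
+attributes:
+  - key: reasoning            # built-in: no value_source/value required
+    apply_to_log: true
+  - key: consumer             # custom: must declare where its value comes from
+    value_source: request_header
+    value: x-mse-consumer
+    apply_to_log: true
+```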
+
 ## Configuration example
 
 If you want to record ai-statistic related statistical values in the gateway access log, you need to modify log_format and add a new field based on the original log_format. The example is as follows:
 
@@ -147,6 +168,45 @@ If the request contains a session ID header, the log will automatically include
 
 When the configuration is empty, no additional attributes will be added to the span.
 
+### Record Token Details
+
+Use built-in attributes to record token details for OpenAI Chat Completions:
+
+```yaml
+attributes:
+  # Use the convenient built-in attributes to extract specific fields
+  - key: reasoning_tokens # reasoning tokens (o1 and other reasoning models)
+    apply_to_log: true
+  - key: cached_tokens # cached tokens from prompt caching
+    apply_to_log: true
+  # Record the complete token details objects
+  - key: input_token_details
+    apply_to_log: true
+  - key: output_token_details
+    apply_to_log: true
+```
+
+#### Log Example
+
+For requests using prompt caching and reasoning models, the log might look like:
+
+```json
+{
+  "ai_log": "{\"model\":\"gpt-4o\",\"input_token\":\"100\",\"output_token\":\"50\",\"reasoning_tokens\":\"25\",\"cached_tokens\":\"80\",\"input_token_details\":\"{\\\"cached_tokens\\\":80}\",\"output_token_details\":\"{\\\"reasoning_tokens\\\":25}\",\"llm_service_duration\":\"2000\"}"
+}
+```
+
+Where:
+- `reasoning_tokens`: 25, the number of tokens generated during reasoning
+- `cached_tokens`: 80, the number of tokens read from the cache
+- `input_token_details`: complete input token details (JSON format)
+- `output_token_details`: complete output token details (JSON format)
+
+These details are useful for:
+1. **Cost optimization**: understand cache hit rates to tune the prompt caching strategy
+2. **Performance analysis**: analyze the reasoning token ratio to evaluate the actual overhead of reasoning models
+3. **Usage statistics**: fine-grained statistics on each token type
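+
+#### Logging Token Details as a Separate Field (sketch)
+
+As a sketch (it relies on the plugin's existing `as_separate_log_field` parameter; check the attribute parameters above if your version differs), token details can also be emitted as a standalone log field instead of being folded into `ai_log`:
+
+```yaml
+attributes:
+  - key: input_token_details
+    apply_to_log: true
+    as_separate_log_field: true # requires adding a matching field to log_format
+```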
+
 ## Debugging
 
 ### Verifying ai_log Content
diff --git a/plugins/wasm-go/extensions/ai-statistics/main.go b/plugins/wasm-go/extensions/ai-statistics/main.go
index 0f4cadbcc..1529381ea 100644
--- a/plugins/wasm-go/extensions/ai-statistics/main.go
+++ b/plugins/wasm-go/extensions/ai-statistics/main.go
@@ -2,6 +2,7 @@ package main
 
 import (
 	"bytes"
+	"encoding/binary"
 	"encoding/json"
 	"errors"
 	"fmt"
@@ -17,6 +18,16 @@ import (
 	"github.com/tidwall/gjson"
 )
 
+const (
+	// Envoy log levels
+	LogLevelTrace = iota
+	LogLevelDebug
+	LogLevelInfo
+	LogLevelWarn
+	LogLevelError
+	LogLevelCritical
+)
+
 func main() {}
 
 func init() {
@@ -90,10 +101,14 @@ const (
 	RuleAppend = "append"
 
 	// Built-in attributes
-	BuiltinQuestionKey  = "question"
-	BuiltinAnswerKey    = "answer"
-	BuiltinToolCallsKey = "tool_calls"
-	BuiltinReasoningKey = "reasoning"
+	BuiltinQuestionKey        = "question"
+	BuiltinAnswerKey          = "answer"
+	BuiltinToolCallsKey       = "tool_calls"
+	BuiltinReasoningKey       = "reasoning"
+	BuiltinReasoningTokens    = "reasoning_tokens"
+	BuiltinCachedTokens       = "cached_tokens"
+	BuiltinInputTokenDetails  = "input_token_details"
+	BuiltinOutputTokenDetails = "output_token_details"
 
 	// Built-in attribute paths
 	// Question paths (from request body)
@@ -578,6 +593,14 @@ func onHttpStreamingBody(ctx wrapper.HttpContext, config AIStatisticsConfig, dat
 			setSpanAttribute(ArmsModelName, usage.Model)
 			setSpanAttribute(ArmsInputToken, usage.InputToken)
 			setSpanAttribute(ArmsOutputToken, usage.OutputToken)
+
+			// Set token details to context for later use in attributes
+			if len(usage.InputTokenDetails) > 0 {
+				ctx.SetContext(tokenusage.CtxKeyInputTokenDetails, usage.InputTokenDetails)
+			}
+			if len(usage.OutputTokenDetails) > 0 {
+				ctx.SetContext(tokenusage.CtxKeyOutputTokenDetails, usage.OutputTokenDetails)
+			}
 		}
 	}
 	// If the end of the stream is reached, record metrics/logs/spans.
@@ -634,6 +657,14 @@ func onHttpResponseBody(ctx wrapper.HttpContext, config AIStatisticsConfig, body
 		setSpanAttribute(ArmsInputToken, usage.InputToken)
 		setSpanAttribute(ArmsOutputToken, usage.OutputToken)
 		setSpanAttribute(ArmsTotalToken, usage.TotalToken)
+
+		// Set token details to context for later use in attributes
+		if len(usage.InputTokenDetails) > 0 {
+			ctx.SetContext(tokenusage.CtxKeyInputTokenDetails, usage.InputTokenDetails)
+		}
+		if len(usage.OutputTokenDetails) > 0 {
+			ctx.SetContext(tokenusage.CtxKeyOutputTokenDetails, usage.OutputTokenDetails)
+		}
 	}
 }
 
@@ -694,18 +725,41 @@ func setAttributeBySource(ctx wrapper.HttpContext, config AIStatisticsConfig, so
 	if (value == nil || value == "") && attribute.DefaultValue != "" {
 		value = attribute.DefaultValue
 	}
-	if len(fmt.Sprint(value)) > config.valueLengthLimit {
-		value = fmt.Sprint(value)[:config.valueLengthLimit/2] + " [truncated] " + fmt.Sprint(value)[len(fmt.Sprint(value))-config.valueLengthLimit/2:]
+
+	// Format value for logging/span
+	var formattedValue interface{}
+	switch v := value.(type) {
+	case map[string]int64:
+		// For token details maps, convert to JSON string
+		jsonBytes, err := json.Marshal(v)
+		if err != nil {
+			log.Warnf("failed to marshal token details: %v", err)
+			formattedValue = fmt.Sprint(v)
+		} else {
+			formattedValue = string(jsonBytes)
+		}
+	default:
+		formattedValue = value
+		if len(fmt.Sprint(value)) > config.valueLengthLimit {
+			formattedValue = fmt.Sprint(value)[:config.valueLengthLimit/2] + " [truncated] " + fmt.Sprint(value)[len(fmt.Sprint(value))-config.valueLengthLimit/2:]
+		}
 	}
-	log.Debugf("[attribute] source type: %s, key: %s, value: %+v", source, key, value)
+
+	log.Debugf("[attribute] source type: %s, key: %s, value: %+v", source, key, formattedValue)
 	if attribute.ApplyToLog {
 		if attribute.AsSeparateLogField {
-			marshalledJsonStr := wrapper.MarshalStr(fmt.Sprint(value))
+			var marshalledJsonStr string
+			if _, ok := value.(map[string]int64); ok {
+				// Already marshaled in formattedValue
+				marshalledJsonStr = fmt.Sprint(formattedValue)
+			} else {
+				marshalledJsonStr = wrapper.MarshalStr(fmt.Sprint(formattedValue))
+			}
 			if err := proxywasm.SetProperty([]string{key}, []byte(marshalledJsonStr)); err != nil {
 				log.Warnf("failed to set %s in filter state, raw is %s, err is %v", key, marshalledJsonStr, err)
 			}
 		} else {
-			ctx.SetUserAttribute(key, value)
+			ctx.SetUserAttribute(key, formattedValue)
 		}
 	}
 	// for metrics
@@ -723,7 +777,9 @@ func setAttributeBySource(ctx wrapper.HttpContext, config AIStatisticsConfig, so
 
 // isBuiltinAttribute checks if the given key is a built-in attribute
 func isBuiltinAttribute(key string) bool {
-	return key == BuiltinQuestionKey || key == BuiltinAnswerKey || key == BuiltinToolCallsKey || key == BuiltinReasoningKey
+	return key == BuiltinQuestionKey || key == BuiltinAnswerKey || key == BuiltinToolCallsKey || key == BuiltinReasoningKey ||
+		key == BuiltinReasoningTokens || key == BuiltinCachedTokens ||
+		key == BuiltinInputTokenDetails || key == BuiltinOutputTokenDetails
 }
 
 // getBuiltinAttributeDefaultSources returns the default value_source(s) for a built-in attribute
@@ -734,6 +790,9 @@ func getBuiltinAttributeDefaultSources(key string) []string {
 		return []string{RequestBody}
 	case BuiltinAnswerKey, BuiltinToolCallsKey, BuiltinReasoningKey:
 		return []string{ResponseStreamingBody, ResponseBody}
+	case BuiltinReasoningTokens, BuiltinCachedTokens, BuiltinInputTokenDetails, BuiltinOutputTokenDetails:
+		// Token details are only available after response is received
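+		// (e.g. a `key: cached_tokens` attribute reads the usage-details map
+		// cached by onHttpResponseBody/onHttpStreamingBody via ctx.SetContext,
+		// so these keys resolve to nothing if the response carried no usage)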
+		return []string{ResponseStreamingBody, ResponseBody}
 	default:
 		return nil
 	}
 }
@@ -816,6 +875,38 @@ func getBuiltinAttributeFallback(ctx wrapper.HttpContext, config AIStatisticsCon
 			return value
 		}
 	}
+	case BuiltinReasoningTokens:
+		// Extract reasoning_tokens from output_token_details (only available after response)
+		if source == ResponseBody || source == ResponseStreamingBody {
+			if outputTokenDetails, ok := ctx.GetContext(tokenusage.CtxKeyOutputTokenDetails).(map[string]int64); ok {
+				if reasoningTokens, exists := outputTokenDetails["reasoning_tokens"]; exists {
+					return reasoningTokens
+				}
+			}
+		}
+	case BuiltinCachedTokens:
+		// Extract cached_tokens from input_token_details (only available after response)
+		if source == ResponseBody || source == ResponseStreamingBody {
+			if inputTokenDetails, ok := ctx.GetContext(tokenusage.CtxKeyInputTokenDetails).(map[string]int64); ok {
+				if cachedTokens, exists := inputTokenDetails["cached_tokens"]; exists {
+					return cachedTokens
+				}
+			}
+		}
+	case BuiltinInputTokenDetails:
+		// Return the entire input_token_details map (only available after response)
+		if source == ResponseBody || source == ResponseStreamingBody {
+			if inputTokenDetails, ok := ctx.GetContext(tokenusage.CtxKeyInputTokenDetails).(map[string]int64); ok {
+				return inputTokenDetails
+			}
+		}
+	case BuiltinOutputTokenDetails:
+		// Return the entire output_token_details map (only available after response)
+		if source == ResponseBody || source == ResponseStreamingBody {
+			if outputTokenDetails, ok := ctx.GetContext(tokenusage.CtxKeyOutputTokenDetails).(map[string]int64); ok {
+				return outputTokenDetails
+			}
+		}
 	}
 	return nil
 }
@@ -854,11 +945,31 @@ func extractStreamingBodyByJsonPath(data []byte, jsonPath string, rule string) i
 	return value
 }
 
+// shouldLogDebug returns true if the log level is debug or trace
+func shouldLogDebug() bool {
+	value, err := proxywasm.CallForeignFunction("get_log_level", nil)
+	if err != nil {
+		// If we can't get log level, default to not logging debug info
+		return false
+	}
+	if len(value) < 4 {
+		// Invalid log level value length
+		return false
+	}
+	envoyLogLevel := binary.LittleEndian.Uint32(value[:4])
+	return envoyLogLevel == LogLevelTrace || envoyLogLevel == LogLevelDebug
+}
+
 // debugLogAiLog logs the current user attributes that will be written to ai_log
 func debugLogAiLog(ctx wrapper.HttpContext) {
+	// Only log in debug/trace mode
+	if !shouldLogDebug() {
+		return
+	}
+
 	// Get all user attributes as a map
 	userAttrs := make(map[string]interface{})
-
+
 	// Try to reconstruct from GetUserAttribute (note: this is best-effort)
 	// The actual attributes are stored internally, we log what we know
 	if question := ctx.GetUserAttribute("question"); question != nil {
 		userAttrs["question"] = question
 	}
@@ -903,6 +1014,18 @@ func debugLogAiLog(ctx wrapper.HttpContext) {
 	if llmServiceDuration := ctx.GetUserAttribute("llm_service_duration"); llmServiceDuration != nil {
 		userAttrs["llm_service_duration"] = llmServiceDuration
 	}
+	if reasoningTokens := ctx.GetUserAttribute("reasoning_tokens"); reasoningTokens != nil {
+		userAttrs["reasoning_tokens"] = reasoningTokens
+	}
+	if cachedTokens := ctx.GetUserAttribute("cached_tokens"); cachedTokens != nil {
+		userAttrs["cached_tokens"] = cachedTokens
+	}
+	if inputTokenDetails := ctx.GetUserAttribute("input_token_details"); inputTokenDetails != nil {
+		userAttrs["input_token_details"] = inputTokenDetails
+	}
+	if outputTokenDetails := ctx.GetUserAttribute("output_token_details"); outputTokenDetails != nil {
+		userAttrs["output_token_details"] = outputTokenDetails
+	}
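+	// Note: input/output_token_details were stored as JSON strings by
+	// setAttributeBySource, so they appear here exactly as they will in ai_log.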
 
 	// Log the attributes as JSON
 	logJson, _ := json.Marshal(userAttrs)
diff --git a/plugins/wasm-go/extensions/ai-statistics/main_test.go b/plugins/wasm-go/extensions/ai-statistics/main_test.go
index b5a021f97..2e6329ae6 100644
--- a/plugins/wasm-go/extensions/ai-statistics/main_test.go
+++ b/plugins/wasm-go/extensions/ai-statistics/main_test.go
@@ -1617,3 +1617,98 @@ func TestSessionIdDebugOutput(t *testing.T) {
 		})
 	})
 }
+
+// Test configuration: token details attributes
+var tokenDetailsConfig = func() json.RawMessage {
+	data, _ := json.Marshal(map[string]interface{}{
+		"attributes": []map[string]interface{}{
+			{
+				"key":          "reasoning_tokens",
+				"apply_to_log": true,
+			},
+			{
+				"key":          "cached_tokens",
+				"apply_to_log": true,
+			},
+			{
+				"key":          "input_token_details",
+				"apply_to_log": true,
+			},
+			{
+				"key":          "output_token_details",
+				"apply_to_log": true,
+			},
+		},
+		"disable_openai_usage": false,
+	})
+	return data
+}()
+
+// TestTokenDetails tests the token details feature
+func TestTokenDetails(t *testing.T) {
+	t.Run("test builtin token details attributes", func(t *testing.T) {
+		host, status := test.NewTestHost(tokenDetailsConfig)
+		defer host.Reset()
+		require.Equal(t, types.OnPluginStartStatusOK, status)
+
+		// Set route and cluster names
+		host.SetRouteName("api-v1")
+		host.SetClusterName("cluster-1")
+
+		// 1. Process request headers
+		action := host.CallOnHttpRequestHeaders([][2]string{
+			{":authority", "example.com"},
+			{":path", "/v1/chat/completions"},
+			{":method", "POST"},
+		})
+		require.Equal(t, types.ActionContinue, action)
+
+		// 2. Process request body
+		requestBody := []byte(`{
+			"model": "gpt-4o",
+			"messages": [
+				{"role": "user", "content": "Test question"}
+			]
+		}`)
+		action = host.CallOnHttpRequestBody(requestBody)
+		require.Equal(t, types.ActionContinue, action)
+
+		// 3. Process response headers
+		action = host.CallOnHttpResponseHeaders([][2]string{
+			{":status", "200"},
+			{"content-type", "application/json"},
+		})
+		require.Equal(t, types.ActionContinue, action)
+
+		// 4. Process response body (contains token details)
+		responseBody := []byte(`{
+			"id": "chatcmpl-123",
+			"object": "chat.completion",
+			"created": 1677652288,
+			"model": "gpt-4o",
+			"usage": {
+				"prompt_tokens": 100,
+				"completion_tokens": 50,
+				"total_tokens": 150,
+				"completion_tokens_details": {
+					"reasoning_tokens": 25
+				},
+				"prompt_tokens_details": {
+					"cached_tokens": 80
+				}
+			},
+			"choices": [{
+				"message": {
+					"role": "assistant",
+					"content": "Test answer"
+				},
+				"finish_reason": "stop"
+			}]
+		}`)
+		action = host.CallOnHttpResponseBody(responseBody)
+		require.Equal(t, types.ActionContinue, action)
+
+		// 5. Complete the request
+		host.CompleteHttp()
+	})
+}
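+
+// TestTokenDetailsAbsent is a minimal additional sketch: it assumes the same
+// test-host API as TestTokenDetails above and only verifies that a usage block
+// without prompt_tokens_details/completion_tokens_details still flows through
+// the plugin cleanly (the token-detail attributes are simply not produced).
+func TestTokenDetailsAbsent(t *testing.T) {
+	t.Run("response without token details", func(t *testing.T) {
+		host, status := test.NewTestHost(tokenDetailsConfig)
+		defer host.Reset()
+		require.Equal(t, types.OnPluginStartStatusOK, status)
+
+		host.SetRouteName("api-v1")
+		host.SetClusterName("cluster-1")
+
+		action := host.CallOnHttpRequestHeaders([][2]string{
+			{":authority", "example.com"},
+			{":path", "/v1/chat/completions"},
+			{":method", "POST"},
+		})
+		require.Equal(t, types.ActionContinue, action)
+
+		action = host.CallOnHttpRequestBody([]byte(`{"model":"gpt-4o","messages":[{"role":"user","content":"hi"}]}`))
+		require.Equal(t, types.ActionContinue, action)
+
+		action = host.CallOnHttpResponseHeaders([][2]string{
+			{":status", "200"},
+			{"content-type", "application/json"},
+		})
+		require.Equal(t, types.ActionContinue, action)
+
+		// usage with plain token counts only; no *_tokens_details objects
+		action = host.CallOnHttpResponseBody([]byte(`{"model":"gpt-4o","usage":{"prompt_tokens":10,"completion_tokens":5,"total_tokens":15},"choices":[{"message":{"role":"assistant","content":"ok"},"finish_reason":"stop"}]}`))
+		require.Equal(t, types.ActionContinue, action)
+
+		host.CompleteHttp()
+	})
+}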