higress/plugins/wasm-go/extensions/ai-context-limit/main.go

// Copyright (c) 2026 Alibaba Group Holding Ltd.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package main 实现 ai-context-limit Higress WASM 插件。
//
// 插件会在 OpenAI / Anthropic 等协议兼容请求到达上游模型之前估算输入 token 数，
// 并对超过配置阈值的请求提前返回错误响应。
package main

import (
	"fmt"
	"strings"
	"time"

	"github.com/higress-group/proxy-wasm-go-sdk/proxywasm"
	"github.com/higress-group/proxy-wasm-go-sdk/proxywasm/types"
	"github.com/higress-group/wasm-go/pkg/log"
	"github.com/higress-group/wasm-go/pkg/wrapper"
)

func main() {}

func init() {
	// 在插件加载时一次性初始化 token 编码器。
	if err := initEncoder(); err != nil {
		// 初始化失败为致命错误：缺少编码器后续无法计算 token，
		// 但 wasm 运行期不能 panic，记录后所有请求走"未启用"兜底路径
		// 实际触发概率极低（embed 词表打包失败才会出现）
		// log 包在 init() 中尚不可用，使用 fmt.Println 兜底
		fmt.Println("[ai-context-limit] init encoder failed:", err)
	}
	wrapper.SetCtx(
		"ai-context-limit",
		wrapper.ParseConfig(parseConfig),
		wrapper.ProcessRequestHeaders(onHttpRequestHeaders),
		wrapper.ProcessRequestBody(onHttpRequestBody),
	)
}

// onHttpRequestHeaders 处理请求头阶段
//
// 关键约束：
//   - envoy 默认 http filter buffer 仅 14.3KB，必须在此阶段调 SetRequestBodyBufferLimit
//   - 必须返回 HeaderStopIteration，否则 envoy 不会等待 body 阶段
//   - 非 JSON 请求直接放行，不读 body
func onHttpRequestHeaders(ctx wrapper.HttpContext, cfg Config) types.Action {
	ctx.DisableReroute()

	if !cfg.IsEnabled() {
		// 配置缺失时降级为不拦截，允许用户通过配置开关此插件
		log.Warnf("max_context_tokens not configured, plugin disabled for this request")
		ctx.DontReadRequestBody()
		return types.ActionContinue
	}

	contentType, _ := proxywasm.GetHttpRequestHeader("content-type")
	if !strings.Contains(strings.ToLower(contentType), "application/json") {
		log.Debugf("non-json content-type=%q, skip body inspection", contentType)
		ctx.DontReadRequestBody()
		return types.ActionContinue
	}
	if !ctx.HasRequestBody() {
		log.Debugf("no request body, skip")
		ctx.DontReadRequestBody()
		return types.ActionContinue
	}

	// 强制调大 envoy http downstream decoder buffer
	// 写入 envoy property: set_decoder_buffer_limit
	ctx.SetRequestBodyBufferLimit(MaxRequestBodyBytes)
	// 移除 content-length，body 处理后由 envoy 重新计算
	_ = proxywasm.RemoveHttpRequestHeader("content-length")
	// 暂停 header 流转，等待 onHttpRequestBody 处理完
	return types.HeaderStopIteration
}

// onHttpRequestBody 处理请求体阶段
//
// 流程：
//  1. 抽取请求体中所有需计 token 的文本（兼容 OpenAI / Anthropic 等协议）
//  2. 命中多模态（image_url/audio）→ 直接放行
//  3. token 计数 → ×buffer_ratio → 与阈值比较
//  4. 超阈值 → 发送 local response，OpenAI 风格错误体
//
// 各阶段统一 info 级耗时日志（[aicl]）方便 grep 与基准对照。
func onHttpRequestBody(ctx wrapper.HttpContext, cfg Config, body []byte) types.Action {
	if !cfg.IsEnabled() {
		return types.ActionContinue
	}
	bodyBytes := len(body)
	log.Infof("[aicl] body_received bytes=%d", bodyBytes)

	if encoder == nil {
		log.Errorf("[aicl] token encoder not initialized, skip token counting")
		return types.ActionContinue
	}

	t0 := time.Now()
	result := extractPromptText(body)
	extractMs := time.Since(t0).Milliseconds()
	log.Infof("[aicl] extract_done bytes=%d text_bytes=%d multimodal=%v elapsed_ms=%d",
		bodyBytes, len(result.Text), result.HasMultimodal, extractMs)

	if result.HasMultimodal {
		log.Debugf("[aicl] multimodal request detected, bypass token counting")
		return types.ActionContinue
	}

	t1 := time.Now()
	rawTokens := CountTokens(result.Text)
	encodeMs := time.Since(t1).Milliseconds()
	estimatedTokens := int(float64(rawTokens) * cfg.BufferRatio)
	log.Infof("[aicl] encode_done bytes=%d text_bytes=%d raw_tokens=%d estimated=%d "+
		"threshold=%d extract_ms=%d encode_ms=%d total_ms=%d",
		bodyBytes, len(result.Text), rawTokens, estimatedTokens,
		cfg.MaxContextTokens, extractMs, encodeMs, extractMs+encodeMs)

	if estimatedTokens > cfg.MaxContextTokens {
		return blockOverLimit(cfg, estimatedTokens)
	}
	return types.ActionContinue
}

// blockOverLimit 发送 OpenAI 风格的超限错误响应
//
// 响应体复刻 OpenAI 官方 context_length_exceeded 格式，
// 使客户端 SDK（openai-python / openai-node）可解析为 BadRequestError
func blockOverLimit(cfg Config, estimatedTokens int) types.Action {
	body := fmt.Sprintf(
		`{"error":{"message":"This model's maximum context length is %d tokens. `+
			`Your request had approximately %d tokens.",`+
			`"type":"invalid_request_error","code":"context_length_exceeded"}}`,
		cfg.MaxContextTokens, estimatedTokens,
	)
	headers := [][2]string{{"content-type", "application/json"}}
	if err := proxywasm.SendHttpResponse(uint32(cfg.ErrorStatusCode), headers, []byte(body), -1); err != nil {
		log.Errorf("send local response failed: %v", err)
		return types.ActionContinue
	}
	log.Infof("blocked: estimated %d > limit %d", estimatedTokens, cfg.MaxContextTokens)
	return types.ActionContinue
}