Vertex structured outputs (#3649)

2026-05-24 04:37:25 +08:00 · 2026-03-30 17:26:55 +08:00
parent 889ea67013
commit cd8ed99db5
4 changed files with 720 additions and 21 deletions
--- a/plugins/wasm-go/extensions/ai-proxy/provider/vertex.go
+++ b/plugins/wasm-go/extensions/ai-proxy/provider/vertex.go
@@ -259,12 +259,12 @@ func (v *vertexProvider) OnRequestBody(ctx wrapper.HttpContext, apiName ApiName,
 	if v.isOpenAICompatibleMode() {
 		ctx.SetContext(contextOpenAICompatibleMarker, true)
 		body, err := v.onOpenAICompatibleRequestBody(ctx, apiName, body, headers)
-		headers.Set("Content-Length", fmt.Sprint(len(body)))
-		util.ReplaceRequestHeaders(headers)
-		_ = proxywasm.ReplaceHttpRequestBody(body)
 		if err != nil {
 			return types.ActionContinue, err
 		}
+		headers.Set("Content-Length", fmt.Sprint(len(body)))
+		util.ReplaceRequestHeaders(headers)
+		_ = proxywasm.ReplaceHttpRequestBody(body)
 		// OpenAI 兼容模式需要 OAuth token
 		cached, err := v.getToken()
 		if cached {
@@ -277,6 +277,9 @@ func (v *vertexProvider) OnRequestBody(ctx wrapper.HttpContext, apiName ApiName,
 	}

 	body, err := v.TransformRequestBodyHeaders(ctx, apiName, body, headers)
+	if err != nil {
+		return types.ActionContinue, err
+	}
 	headers.Set("Content-Length", fmt.Sprint(len(body)))

 	if v.isExpressMode() {
@@ -284,15 +287,12 @@ func (v *vertexProvider) OnRequestBody(ctx wrapper.HttpContext, apiName ApiName,
 		headers.Del("Authorization")
 		util.ReplaceRequestHeaders(headers)
 		_ = proxywasm.ReplaceHttpRequestBody(body)
-		return types.ActionContinue, err
+		return types.ActionContinue, nil
 	}

 	// 标准模式: 需要获取 OAuth token
 	util.ReplaceRequestHeaders(headers)
 	_ = proxywasm.ReplaceHttpRequestBody(body)
-	if err != nil {
-		return types.ActionContinue, err
-	}
 	cached, err := v.getToken()
 	if cached {
 		return types.ActionContinue, nil
@@ -369,7 +369,10 @@ func (v *vertexProvider) onChatCompletionRequestBody(ctx wrapper.HttpContext, bo
 		path := v.getRequestPath(ApiNameChatCompletion, request.Model, request.Stream)
 		util.OverwriteRequestPathHeader(headers, path)

-		vertexRequest := v.buildVertexChatRequest(request)
+		vertexRequest, err := v.buildVertexChatRequest(request)
+		if err != nil {
+			return nil, err
+		}
 		return json.Marshal(vertexRequest)
 	}
 }
@@ -971,7 +974,7 @@ func (v *vertexProvider) getOpenAICompatibleRequestPath() string {
 	return fmt.Sprintf(vertexOpenAICompatiblePathTemplate, v.config.vertexProjectId, v.config.vertexRegion)
 }

-func (v *vertexProvider) buildVertexChatRequest(request *chatCompletionRequest) *vertexChatRequest {
+func (v *vertexProvider) buildVertexChatRequest(request *chatCompletionRequest) (*vertexChatRequest, error) {
 	safetySettings := make([]vertexChatSafetySetting, 0)
 	for category, threshold := range v.config.geminiSafetySetting {
 		safetySettings = append(safetySettings, vertexChatSafetySetting{
@@ -1006,6 +1009,9 @@ func (v *vertexProvider) buildVertexChatRequest(request *chatCompletionRequest)
 		}
 		vertexRequest.GenerationConfig.ThinkingConfig = thinkingConfig
 	}
+	if err := v.applyResponseFormatToGenerationConfig(request.ResponseFormat, &vertexRequest.GenerationConfig, request.Model); err != nil {
+		return nil, err
+	}
 	if request.Tools != nil {
 		functions := make([]function, 0, len(request.Tools))
 		for _, tool := range request.Tools {
@@ -1091,7 +1097,130 @@ func (v *vertexProvider) buildVertexChatRequest(request *chatCompletionRequest)
 		}
 	}

-	return &vertexRequest
+	return &vertexRequest, nil
+}
+
+// applyResponseFormatToGenerationConfig translates an OpenAI-style response_format into the
+// ResponseMimeType / ResponseSchema fields of generationConfig.
+//
+// It returns an error only when type=json_schema carries no usable schema object. A nil
+// generationConfig, an empty responseFormat, a gemini-2.0-* model, type=text and any
+// unrecognized shape all leave generationConfig untouched and yield a nil error.
+func (v *vertexProvider) applyResponseFormatToGenerationConfig(responseFormat map[string]interface{}, generationConfig *vertexChatGenerationConfig, model string) error {
+	// Gemini 2.0 structured output requires propertyOrdering, which is intentionally not
+	// synthesized here (gemini-2.0-* is legacy and rarely used); for those models
+	// response_format is ignored and the request stays non-structured.
+	if generationConfig == nil || len(responseFormat) == 0 || requiresPropertyOrderingForModel(model) {
+		return nil
+	}
+
+	// useSchema switches the config to JSON output constrained by schema.
+	useSchema := func(schema map[string]interface{}) {
+		generationConfig.ResponseMimeType = util.MimeTypeApplicationJson
+		generationConfig.ResponseSchema = schema
+	}
+
+	rawType, _ := responseFormat["type"].(string)
+	switch formatType := strings.ToLower(rawType); formatType {
+	case "json_object":
+		generationConfig.ResponseMimeType = util.MimeTypeApplicationJson
+	case "json_schema":
+		schema := extractOpenAIJSONSchema(responseFormat)
+		if len(schema) == 0 {
+			return fmt.Errorf("invalid response_format.json_schema: missing schema object")
+		}
+		useSchema(schema)
+	case "text":
+		// Vertex defaults to text output when no response mime/schema is provided.
+	case "":
+		// Tolerate non-standard clients that pass the schema directly as response_format.
+		if isJSONSchemaMap(responseFormat) {
+			useSchema(responseFormat)
+		}
+	default:
+		// Tolerate non-standard usage where response_format itself is a JSON schema.
+		if isJSONSchemaType(formatType) && isJSONSchemaMap(responseFormat) {
+			useSchema(responseFormat)
+		}
+	}
+	return nil
+}
+
+// extractOpenAIJSONSchema pulls the JSON schema out of an OpenAI response_format whose
+// type is json_schema. It returns nil when no schema can be found.
+//
+// Two layouts are recognized, tried in this order:
+//
+//  1. Canonical OpenAI form, with the schema nested under json_schema.schema:
+//     {"type":"json_schema","json_schema":{"name":"...","strict":true,"schema":{...}}}
+//  2. Non-standard form, where the json_schema value is itself the schema.
+//
+// The returned map aliases the input; it is not copied.
+func extractOpenAIJSONSchema(responseFormat map[string]interface{}) map[string]interface{} {
+	// A missing key and a non-object value both fail this assertion.
+	wrapper, ok := responseFormat["json_schema"].(map[string]interface{})
+	if !ok {
+		return nil
+	}
+
+	// Canonical layout: json_schema.schema holds an object.
+	if nested, ok := wrapper["schema"].(map[string]interface{}); ok {
+		return nested
+	}
+
+	// Fallback layout: accept the wrapper only if it looks like a schema.
+	if !isJSONSchemaMap(wrapper) {
+		return nil
+	}
+	return wrapper
+}
+
+// isJSONSchemaType reports whether value names a JSON Schema type, ignoring letter case.
+func isJSONSchemaType(value string) bool {
+	switch strings.ToLower(value) {
+	case "object", "array", "string", "number", "integer", "boolean", "null":
+		return true
+	}
+	return false
+}
+
+// isJSONSchemaMap is a heuristic that reports whether schema looks like a JSON schema object.
+// A nil or empty map never qualifies. Otherwise the map qualifies when its "type" is a string
+// accepted by isJSONSchemaType, or (since a schema may omit "type") when any well-known schema
+// keyword is present as a key; the values under those keys are not inspected.
+func isJSONSchemaMap(schema map[string]interface{}) bool {
+	if len(schema) == 0 {
+		return false
+	}
+	if declared, isString := schema["type"].(string); isString && isJSONSchemaType(declared) {
+		return true
+	}
+
+	// No usable "type": look for any key that is a schema keyword.
+	for key := range schema {
+		switch key {
+		case "anyOf",
+			"enum",
+			"format",
+			"items",
+			"maximum",
+			"maxItems",
+			"minimum",
+			"minItems",
+			"nullable",
+			"properties",
+			"description",
+			"propertyOrdering",
+			"required":
+			return true
+		}
+	}
+	return false
+}
+
+// requiresPropertyOrderingForModel reports whether model starts with "gemini-2.0-", ignoring letter case.
+func requiresPropertyOrderingForModel(model string) bool {
+	return strings.HasPrefix(strings.ToLower(model), "gemini-2.0-")
 }

 func (v *vertexProvider) buildEmbeddingRequest(request *embeddingsRequest) *vertexEmbeddingRequest {
@@ -1170,14 +1299,16 @@ type vertexChatSafetySetting struct {
 }

 type vertexChatGenerationConfig struct {
-	Temperature        float64              `json:"temperature,omitempty"`
-	TopP               float64              `json:"topP,omitempty"`
-	TopK               int                  `json:"topK,omitempty"`
-	CandidateCount     int                  `json:"candidateCount,omitempty"`
-	MaxOutputTokens    int                  `json:"maxOutputTokens,omitempty"`
-	ThinkingConfig     vertexThinkingConfig `json:"thinkingConfig,omitempty"`
-	ResponseModalities []string             `json:"responseModalities,omitempty"`
-	ImageConfig        *vertexImageConfig   `json:"imageConfig,omitempty"`
+	Temperature        float64                `json:"temperature,omitempty"`
+	TopP               float64                `json:"topP,omitempty"`
+	TopK               int                    `json:"topK,omitempty"`
+	CandidateCount     int                    `json:"candidateCount,omitempty"`
+	MaxOutputTokens    int                    `json:"maxOutputTokens,omitempty"`
+	ThinkingConfig     vertexThinkingConfig   `json:"thinkingConfig,omitempty"`
+	ResponseMimeType   string                 `json:"responseMimeType,omitempty"` // set to util.MimeTypeApplicationJson when response_format requests JSON output
+	ResponseSchema     map[string]interface{} `json:"responseSchema,omitempty"`   // schema taken from response_format; left nil for plain json_object/text
+	ResponseModalities []string               `json:"responseModalities,omitempty"`
+	ImageConfig        *vertexImageConfig     `json:"imageConfig,omitempty"`
 }

 type vertexImageConfig struct {