[feat] load balancing across different clusters and endpoints based on metrics (#3063)

2026-05-27 22:27:29 +08:00 · 2025-11-25 10:32:34 +08:00
parent 7a504fd67d
commit 42334f21df
12 changed files with 764 additions and 126 deletions
--- a/plugins/wasm-go/extensions/ai-load-balancer/endpoint_metrics/backend/types.go
+++ b/plugins/wasm-go/extensions/ai-load-balancer/endpoint_metrics/backend/types.go
@@ -0,0 +1,78 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package backend
+
+import "fmt"
+
+type PodSet map[Pod]bool // PodSet maps a Pod to a bool; presumably used as a set of pods (its usage is not shown here).
+
+type Pod struct { // Pod identifies a backend pod; both fields are strings, so a Pod is comparable and can key a PodSet.
+	Name    string // Name is the part printed before the colon by String.
+	Address string // Address is the part printed after the colon by String; presumably host or host:port - TODO confirm.
+}
+
+// String implements fmt.Stringer. It returns the pod's Name and Address joined
+// by a single colon, Name first.
+func (p Pod) String() string { return p.Name + ":" + p.Address }
+
+type Metrics struct { // Metrics is the default set of per-pod load signals; the vllm scraper in this change fills most of them.
+	// ActiveModels is a set of models (including LoRA adapters) that are currently cached to GPU.
+	ActiveModels map[string]int // keyed by model/adapter name; the vllm scraper stores 0 for every key.
+	// MaxActiveModels is the maximum number of models that can be loaded to GPU.
+	MaxActiveModels         int
+	RunningQueueSize        int     // read from the "vllm:num_requests_running" gauge, truncated to int.
+	WaitingQueueSize        int     // read from the "vllm:num_requests_waiting" gauge, truncated to int.
+	KVCacheUsagePercent     float64 // read from the "vllm:gpu_cache_usage_perc" gauge; its scale (0-1 vs 0-100) is not shown here.
+	KvCacheMaxTokenCapacity int     // not populated by the vllm scraper yet (its assignment is commented out as a TODO).
+}
+
+type UserSelectedMetric struct { // UserSelectedMetric is one caller-chosen metric; when MetricName is non-empty the vllm scraper reads only it.
+	MetricName  string  // Prometheus metric family name to look up; empty means the default metrics are used instead.
+	MetricValue float64 // gauge value of the series last read for MetricName.
+}
+
+type PodMetrics struct { // PodMetrics pairs a pod's identity with its metrics; all three types are embedded, so their fields are promoted.
+	Pod
+	Metrics
+	UserSelectedMetric
+}
+
+func (pm *PodMetrics) String() string { // String formats the three embedded values with %+v; the Pod part is rendered through Pod.String.
+	return fmt.Sprintf("Pod: %+v; Metrics: %+v, UserSelectedMetric: %+v", pm.Pod, pm.Metrics, pm.UserSelectedMetric)
+}
+
+// Clone returns a deep copy of pm that the caller may modify without
+// affecting pm. pm must not be nil.
+//
+// The struct is copied as a whole rather than field by field, so no scalar
+// field can be left behind: a field-by-field copy here previously omitted
+// MaxActiveModels and handed back clones that always reported 0 for it.
+//
+// ActiveModels is the only reference-typed field across the embedded Pod,
+// Metrics and UserSelectedMetric, so it alone needs rebuilding. The copy is
+// always a non-nil map, even when pm.ActiveModels is nil, which lets callers
+// insert into it directly.
+func (pm *PodMetrics) Clone() *PodMetrics {
+	// Detach the map so writes to the clone never reach pm.
+	models := make(map[string]int, len(pm.ActiveModels))
+	for name, v := range pm.ActiveModels {
+		models[name] = v
+	}
+	clone := *pm // copies every field, including MaxActiveModels
+	clone.ActiveModels = models
+	return &clone
+}
--- a/plugins/wasm-go/extensions/ai-load-balancer/endpoint_metrics/backend/vllm/metrics.go
+++ b/plugins/wasm-go/extensions/ai-load-balancer/endpoint_metrics/backend/vllm/metrics.go
@@ -0,0 +1,160 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package vllm provides the vLLM-specific pod metrics implementation.
+package vllm
+
+import (
+	"fmt"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/alibaba/higress/plugins/wasm-go/extensions/ai-load-balancer/endpoint_metrics/backend"
+
+	dto "github.com/prometheus/client_model/go"
+	"go.uber.org/multierr"
+)
+
+const ( // Names of the vLLM Prometheus metric families and labels that PromToPodMetrics reads.
+	LoraRequestInfoMetricName                = "vllm:lora_requests_info" // gauge family that carries the two LoRA labels below.
+	LoraRequestInfoRunningAdaptersMetricName = "running_lora_adapters"   // label name (not a family); its value is split on commas into adapter names.
+	LoraRequestInfoMaxAdaptersMetricName     = "max_lora"                // label name (not a family); its value is parsed as an integer.
+	// TODO: Replace these with the num_tokens_running/waiting below once we add those to the fork.
+	RunningQueueSizeMetricName = "vllm:num_requests_running" // gauge family feeding RunningQueueSize.
+	WaitingQueueSizeMetricName = "vllm:num_requests_waiting" // gauge family feeding WaitingQueueSize.
+	/* TODO: Uncomment this once the following are added to the fork.
+	RunningQueueSizeMetricName        = "vllm:num_tokens_running"
+	WaitingQueueSizeMetricName        = "vllm:num_tokens_waiting"
+	*/
+	KVCacheUsagePercentMetricName     = "vllm:gpu_cache_usage_perc"         // gauge family feeding KVCacheUsagePercent.
+	KvCacheMaxTokenCapacityMetricName = "vllm:gpu_cache_max_token_capacity" // not read yet; its only use is in a commented-out TODO.
+)
+
+// PromToPodMetrics returns a copy of existing updated from the scraped
+// Prometheus metric families; existing is not modified and must not be nil.
+// If existing.MetricName is set, only that family is read (as a gauge) into
+// MetricValue. Otherwise the queue sizes, KV-cache usage and LoRA info are
+// refreshed; a metric that cannot be read leaves its field as Clone produced
+// it and is reported in the combined error. The result is always non-nil, so
+// it can replace the entry in the pod metrics map in one step.
+func PromToPodMetrics(
+	metricFamilies map[string]*dto.MetricFamily,
+	existing *backend.PodMetrics,
+) (*backend.PodMetrics, error) {
+	updated := existing.Clone()
+	// User selected metric: it replaces the default set entirely.
+	if updated.MetricName != "" {
+		selected, err := getLatestMetric(metricFamilies, updated.MetricName)
+		if err != nil {
+			return updated, err
+		}
+		updated.MetricValue = selected.GetGauge().GetValue()
+		return updated, nil
+	}
+	// Default metrics: each is read independently, so one failure does not stop the rest.
+	var errs error
+	running, err := getLatestMetric(metricFamilies, RunningQueueSizeMetricName)
+	errs = multierr.Append(errs, err)
+	if err == nil {
+		updated.RunningQueueSize = int(running.GetGauge().GetValue())
+	}
+	waiting, err := getLatestMetric(metricFamilies, WaitingQueueSizeMetricName)
+	errs = multierr.Append(errs, err)
+	if err == nil {
+		updated.WaitingQueueSize = int(waiting.GetGauge().GetValue())
+	}
+	cacheUsage, err := getLatestMetric(metricFamilies, KVCacheUsagePercentMetricName)
+	errs = multierr.Append(errs, err)
+	if err == nil {
+		updated.KVCacheUsagePercent = cacheUsage.GetGauge().GetValue()
+	}
+	loraMetrics, _, err := getLatestLoraMetric(metricFamilies)
+	errs = multierr.Append(errs, err)
+	/* TODO: uncomment once this is available in vllm.
+	kvCap, _, err := getGaugeLatestValue(metricFamilies, KvCacheMaxTokenCapacityMetricName)
+	errs = multierr.Append(errs, err)
+	if err == nil {
+		updated.KvCacheMaxTokenCapacity = int(kvCap)
+	}
+	*/
+	if loraMetrics == nil {
+		return updated, errs
+	}
+	updated.ActiveModels = make(map[string]int) // the scraped label replaces the cloned set
+	for _, label := range loraMetrics.GetLabel() {
+		value := label.GetValue()
+		switch {
+		case value == "": // empty label value: nothing to record
+		case label.GetName() == LoraRequestInfoRunningAdaptersMetricName:
+			for _, adapter := range strings.Split(value, ",") {
+				updated.ActiveModels[adapter] = 0
+			}
+		case label.GetName() == LoraRequestInfoMaxAdaptersMetricName:
+			// Assign only on success: Atoi yields 0 on failure, which would wipe the limit.
+			if limit, err := strconv.Atoi(value); err != nil {
+				errs = multierr.Append(errs, fmt.Errorf("label %q: %w", LoraRequestInfoMaxAdaptersMetricName, err))
+			} else {
+				updated.MaxActiveModels = limit
+			}
+		}
+	}
+	return updated, errs
+}
+
+// getLatestLoraMetric returns the newest series of the gauge family
+// `vllm:lora_requests_info` plus its timestamp. Every permutation of label
+// values creates a new series; a series' value is its creation time in Unix
+// seconds, so the newest one has the largest value. The metric is nil (with a
+// nil error) when no series has a positive value.
+func getLatestLoraMetric(metricFamilies map[string]*dto.MetricFamily) (*dto.Metric, time.Time, error) {
+	loraRequests, ok := metricFamilies[LoraRequestInfoMetricName]
+	if !ok {
+		return nil, time.Time{}, fmt.Errorf("metric family %q not found", LoraRequestInfoMetricName)
+	}
+	var latestTs float64
+	var latest *dto.Metric
+	for _, m := range loraRequests.GetMetric() {
+		if ts := m.GetGauge().GetValue(); ts > latestTs {
+			latestTs, latest = ts, m
+		}
+	}
+	// latestTs is in seconds and time.Unix takes nanoseconds, hence time.Second (not 1000).
+	return latest, time.Unix(0, int64(latestTs*float64(time.Second))), nil
+}
+
+// getLatestMetric returns the series of family metricName with the largest
+// timestamp; it is intended for gauge families. Ties go to the later series,
+// so when no series carries a timestamp (vllm does not set one) the last
+// series wins. It errors when the family is absent or has no series.
+func getLatestMetric(metricFamilies map[string]*dto.MetricFamily, metricName string) (*dto.Metric, error) {
+	family, found := metricFamilies[metricName]
+	if !found {
+		return nil, fmt.Errorf("metric family %q not found", metricName)
+	}
+	series := family.GetMetric()
+	if len(series) == 0 {
+		return nil, fmt.Errorf("no metrics available for %q", metricName)
+	}
+	var newest *dto.Metric
+	var newestTs int64
+	for _, candidate := range series {
+		if ts := candidate.GetTimestampMs(); ts >= newestTs {
+			newest, newestTs = candidate, ts
+		}
+	}
+	return newest, nil
+}