mirror of
https://github.com/alibaba/higress.git
synced 2026-05-28 06:37:26 +08:00
[feat] load balancing across different clusters and endpoints based on metrics (#3063)
This commit is contained in:
@@ -0,0 +1,203 @@
|
||||
/*
|
||||
Copyright 2025 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package scheduling
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"math"
|
||||
|
||||
"github.com/alibaba/higress/plugins/wasm-go/extensions/ai-load-balancer/endpoint_metrics/backend"
|
||||
|
||||
"github.com/higress-group/proxy-wasm-go-sdk/proxywasm"
|
||||
)
|
||||
|
||||
type Filter interface {
|
||||
Name() string
|
||||
Filter(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error)
|
||||
}
|
||||
|
||||
// filter applies current filterFunc, and then recursively applies next filters depending success or
|
||||
// failure of the current filterFunc.
|
||||
// It can be used to construct a flow chart algorithm.
|
||||
type filter struct {
|
||||
name string
|
||||
filter filterFunc
|
||||
// nextOnSuccess filter will be applied after successfully applying the current filter.
|
||||
// The filtered results will be passed to the next filter.
|
||||
nextOnSuccess *filter
|
||||
// nextOnFailure filter will be applied if current filter fails.
|
||||
// The original input will be passed to the next filter.
|
||||
nextOnFailure *filter
|
||||
// nextOnSuccessOrFailure is a convenience field to configure the next filter regardless of the
|
||||
// success or failure of the current filter.
|
||||
// NOTE: When using nextOnSuccessOrFailure, both nextOnSuccess and nextOnFailure SHOULD be nil.
|
||||
// However if that's not the case, nextOnSuccess and nextOnFailure will be used, instead of
|
||||
// nextOnSuccessOrFailure, in the success and failure scenarios, respectively.
|
||||
nextOnSuccessOrFailure *filter
|
||||
|
||||
// callbacks api.FilterCallbackHandler
|
||||
}
|
||||
|
||||
func (f *filter) Name() string {
|
||||
if f == nil {
|
||||
return "nil"
|
||||
}
|
||||
return f.name
|
||||
}
|
||||
|
||||
func (f *filter) Filter(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) {
|
||||
proxywasm.LogDebugf("Running filter %q on request %v with %v pods", f.name, req, len(pods))
|
||||
filtered, err := f.filter(req, pods)
|
||||
|
||||
next := f.nextOnSuccessOrFailure
|
||||
if err == nil && len(filtered) > 0 {
|
||||
if f.nextOnSuccess == nil && f.nextOnSuccessOrFailure == nil {
|
||||
// No succeeding filters to run, return.
|
||||
return filtered, err
|
||||
}
|
||||
if f.nextOnSuccess != nil {
|
||||
next = f.nextOnSuccess
|
||||
}
|
||||
// On success, pass the filtered result to the next filter.
|
||||
return next.Filter(req, filtered)
|
||||
} else {
|
||||
if f.nextOnFailure == nil && f.nextOnSuccessOrFailure == nil {
|
||||
// No succeeding filters to run, return.
|
||||
return filtered, err
|
||||
}
|
||||
if f.nextOnFailure != nil {
|
||||
next = f.nextOnFailure
|
||||
}
|
||||
// On failure, pass the initial set of pods to the next filter.
|
||||
return next.Filter(req, pods)
|
||||
}
|
||||
}
|
||||
|
||||
// filterFunc filters a set of input pods to a subset.
|
||||
type filterFunc func(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error)
|
||||
|
||||
// toFilterFunc is a helper function to convert a per pod filter func to the FilterFunc.
|
||||
func toFilterFunc(pp podPredicate) filterFunc {
|
||||
return func(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) {
|
||||
filtered := []*backend.PodMetrics{}
|
||||
for _, pod := range pods {
|
||||
pass := pp(req, pod)
|
||||
if pass {
|
||||
filtered = append(filtered, pod)
|
||||
}
|
||||
}
|
||||
if len(filtered) == 0 {
|
||||
return nil, errors.New("no pods left")
|
||||
}
|
||||
return filtered, nil
|
||||
}
|
||||
}
|
||||
|
||||
// leastQueuingFilterFunc finds the max and min queue size of all pods, divides the whole range
|
||||
// (max-min) by the number of pods, and finds the pods that fall into the first range.
|
||||
// The intuition is that if there are multiple pods that share similar queue size in the low range,
|
||||
// we should consider them all instead of the absolute minimum one. This worked better than picking
|
||||
// the least one as it gives more choices for the next filter, which on aggregate gave better
|
||||
// results.
|
||||
// TODO: Compare this strategy with other strategies such as top K.
|
||||
func leastQueuingFilterFunc(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) {
|
||||
min := math.MaxInt
|
||||
max := 0
|
||||
filtered := []*backend.PodMetrics{}
|
||||
|
||||
for _, pod := range pods {
|
||||
if pod.WaitingQueueSize <= min {
|
||||
min = pod.WaitingQueueSize
|
||||
}
|
||||
if pod.WaitingQueueSize >= max {
|
||||
max = pod.WaitingQueueSize
|
||||
}
|
||||
}
|
||||
|
||||
for _, pod := range pods {
|
||||
if pod.WaitingQueueSize >= min && pod.WaitingQueueSize <= min+(max-min)/len(pods) {
|
||||
filtered = append(filtered, pod)
|
||||
}
|
||||
}
|
||||
return filtered, nil
|
||||
}
|
||||
|
||||
func lowQueueingPodPredicate(_ *LLMRequest, pod *backend.PodMetrics) bool {
|
||||
return pod.WaitingQueueSize < queueingThresholdLoRA
|
||||
}
|
||||
|
||||
// leastKVCacheFilterFunc finds the max and min KV cache of all pods, divides the whole range
|
||||
// (max-min) by the number of pods, and finds the pods that fall into the first range.
|
||||
// The intuition is that if there are multiple pods that share similar KV cache in the low range, we
|
||||
// should consider them all instead of the absolute minimum one. This worked better than picking the
|
||||
// least one as it gives more choices for the next filter, which on aggregate gave better results.
|
||||
// TODO: Compare this strategy with other strategies such as top K.
|
||||
func leastKVCacheFilterFunc(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) {
|
||||
min := math.MaxFloat64
|
||||
var max float64 = 0
|
||||
filtered := []*backend.PodMetrics{}
|
||||
|
||||
for _, pod := range pods {
|
||||
if pod.KVCacheUsagePercent <= min {
|
||||
min = pod.KVCacheUsagePercent
|
||||
}
|
||||
if pod.KVCacheUsagePercent >= max {
|
||||
max = pod.KVCacheUsagePercent
|
||||
}
|
||||
}
|
||||
|
||||
for _, pod := range pods {
|
||||
if pod.KVCacheUsagePercent >= min && pod.KVCacheUsagePercent <= min+(max-min)/float64(len(pods)) {
|
||||
filtered = append(filtered, pod)
|
||||
}
|
||||
}
|
||||
return filtered, nil
|
||||
}
|
||||
|
||||
// podPredicate is a filter function to check whether a pod is desired.
|
||||
type podPredicate func(req *LLMRequest, pod *backend.PodMetrics) bool
|
||||
|
||||
// We consider serving an adapter low cost it the adapter is active in the model server, or the
|
||||
// model server has room to load the adapter. The lowLoRACostPredicate ensures weak affinity by
|
||||
// spreading the load of a LoRA adapter across multiple pods, avoiding "pinning" all requests to
|
||||
// a single pod. This gave good performance in our initial benchmarking results in the scenario
|
||||
// where # of lora slots > # of lora adapters.
|
||||
func lowLoRACostPredicate(req *LLMRequest, pod *backend.PodMetrics) bool {
|
||||
_, ok := pod.ActiveModels[req.Model]
|
||||
return ok || len(pod.ActiveModels) < pod.MaxActiveModels
|
||||
}
|
||||
|
||||
// loRAAffinityPredicate is a filter function to check whether a pod has affinity to the lora requested.
|
||||
func loRAAffinityPredicate(req *LLMRequest, pod *backend.PodMetrics) bool {
|
||||
_, ok := pod.ActiveModels[req.Model]
|
||||
return ok
|
||||
}
|
||||
|
||||
// canAcceptNewLoraPredicate is a filter function to check whether a pod has room to load the adapter.
|
||||
func canAcceptNewLoraPredicate(req *LLMRequest, pod *backend.PodMetrics) bool {
|
||||
return len(pod.ActiveModels) < pod.MaxActiveModels
|
||||
}
|
||||
|
||||
func criticalRequestPredicate(req *LLMRequest, pod *backend.PodMetrics) bool {
|
||||
return req.Critical
|
||||
}
|
||||
|
||||
func noQueueAndLessThanKVCacheThresholdPredicate(queueThreshold int, kvCacheThreshold float64) podPredicate {
|
||||
return func(req *LLMRequest, pod *backend.PodMetrics) bool {
|
||||
return pod.WaitingQueueSize <= queueThreshold && pod.KVCacheUsagePercent <= kvCacheThreshold
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,223 @@
|
||||
/*
|
||||
Copyright 2025 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
// Package scheduling implements request scheduling algorithms.
|
||||
package scheduling
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"math"
|
||||
"math/rand"
|
||||
"strings"
|
||||
|
||||
"github.com/alibaba/higress/plugins/wasm-go/extensions/ai-load-balancer/endpoint_metrics/backend"
|
||||
"github.com/alibaba/higress/plugins/wasm-go/extensions/ai-load-balancer/endpoint_metrics/backend/vllm"
|
||||
|
||||
"github.com/prometheus/common/expfmt"
|
||||
)
|
||||
|
||||
// Metric policy names compared against the metricPolicy argument of
// GetScheduler.
const (
	// MetricPolicyDefault is the policy name "default".
	MetricPolicyDefault = "default"
	// MetricPolicyLeast selects the filter built around the lowest values of
	// the user-selected metric.
	MetricPolicyLeast = "least"
	// MetricPolicyMost selects the filter built around the highest values of
	// the user-selected metric.
	MetricPolicyMost = "most"
)
|
||||
|
||||
// Thresholds used by the built-in filter tree.
const (
	// kvCacheThreshold is the KV cache usage limit applied when deciding
	// whether a pod has capacity for a sheddable request.
	// TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16) Make this configurable.
	kvCacheThreshold = 0.8
	// queueThresholdCritical is the waiting-queue limit applied when deciding
	// whether a pod has capacity for a sheddable request.
	// TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16) Make this configurable.
	queueThresholdCritical = 5
	// queueingThresholdLoRA is the number of queued requests below which
	// queueing is considered low enough to prioritize LoRA affinity.
	// The value of 50 was arrived at heuristically based on experiments.
	// TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16) Make this configurable.
	queueingThresholdLoRA = 50
)
|
||||
|
||||
var (
|
||||
defaultFilter = &filter{
|
||||
name: "critical request",
|
||||
filter: toFilterFunc(criticalRequestPredicate),
|
||||
nextOnSuccess: lowLatencyFilter,
|
||||
nextOnFailure: sheddableRequestFilter,
|
||||
}
|
||||
|
||||
// queueLoRAAndKVCacheFilter applied least queue -> low cost lora -> least KV Cache filter
|
||||
queueLoRAAndKVCacheFilter = &filter{
|
||||
name: "least queuing",
|
||||
filter: leastQueuingFilterFunc,
|
||||
nextOnSuccessOrFailure: &filter{
|
||||
name: "low cost LoRA",
|
||||
filter: toFilterFunc(lowLoRACostPredicate),
|
||||
nextOnSuccessOrFailure: &filter{
|
||||
name: "least KV cache percent",
|
||||
filter: leastKVCacheFilterFunc,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
// queueAndKVCacheFilter applies least queue followed by least KV Cache filter
|
||||
queueAndKVCacheFilter = &filter{
|
||||
name: "least queuing",
|
||||
filter: leastQueuingFilterFunc,
|
||||
nextOnSuccessOrFailure: &filter{
|
||||
name: "least KV cache percent",
|
||||
filter: leastKVCacheFilterFunc,
|
||||
},
|
||||
}
|
||||
|
||||
lowLatencyFilter = &filter{
|
||||
name: "low queueing filter",
|
||||
filter: toFilterFunc((lowQueueingPodPredicate)),
|
||||
nextOnSuccess: &filter{
|
||||
name: "affinity LoRA",
|
||||
filter: toFilterFunc(loRAAffinityPredicate),
|
||||
nextOnSuccess: queueAndKVCacheFilter,
|
||||
nextOnFailure: &filter{
|
||||
name: "can accept LoRA Adapter",
|
||||
filter: toFilterFunc(canAcceptNewLoraPredicate),
|
||||
nextOnSuccessOrFailure: queueAndKVCacheFilter,
|
||||
},
|
||||
},
|
||||
nextOnFailure: queueLoRAAndKVCacheFilter,
|
||||
}
|
||||
|
||||
sheddableRequestFilter = &filter{
|
||||
// When there is at least one model server that's not queuing requests, and still has KV
|
||||
// cache below a certain threshold, we consider this model server has capacity to handle
|
||||
// a sheddable request without impacting critical requests.
|
||||
name: "has capacity for sheddable requests",
|
||||
filter: toFilterFunc(noQueueAndLessThanKVCacheThresholdPredicate(queueThresholdCritical, kvCacheThreshold)),
|
||||
nextOnSuccess: queueLoRAAndKVCacheFilter,
|
||||
// If all pods are queuing or running above the KVCache threshold, we drop the sheddable
|
||||
// request to make room for critical requests.
|
||||
nextOnFailure: &filter{
|
||||
name: "drop request",
|
||||
filter: func(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) {
|
||||
// api.LogDebugf("Dropping request %v", req)
|
||||
return []*backend.PodMetrics{}, errors.New("dropping request due to limited backend resources")
|
||||
},
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
func NewScheduler(pm []*backend.PodMetrics, filter Filter) *Scheduler {
|
||||
|
||||
return &Scheduler{
|
||||
podMetrics: pm,
|
||||
filter: filter,
|
||||
}
|
||||
}
|
||||
|
||||
type Scheduler struct {
|
||||
podMetrics []*backend.PodMetrics
|
||||
filter Filter
|
||||
}
|
||||
|
||||
// Schedule finds the target pod based on metrics and the requested lora adapter.
|
||||
func (s *Scheduler) Schedule(req *LLMRequest) (targetPod backend.Pod, err error) {
|
||||
pods, err := s.filter.Filter(req, s.podMetrics)
|
||||
if err != nil || len(pods) == 0 {
|
||||
return backend.Pod{}, fmt.Errorf("failed to apply filter, resulted %v pods: %w", len(pods), err)
|
||||
}
|
||||
i := rand.Intn(len(pods))
|
||||
return pods[i].Pod, nil
|
||||
}
|
||||
|
||||
func GetScheduler(hostMetrics map[string]string, metricPolicy string, targetMetric string) (*Scheduler, error) {
|
||||
if len(hostMetrics) == 0 {
|
||||
return nil, errors.New("backend is not support llm scheduling")
|
||||
}
|
||||
var pms []*backend.PodMetrics
|
||||
for addr, metric := range hostMetrics {
|
||||
parser := expfmt.TextParser{}
|
||||
metricFamilies, err := parser.TextToMetricFamilies(strings.NewReader(metric))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
pm := &backend.PodMetrics{
|
||||
Pod: backend.Pod{
|
||||
Name: addr,
|
||||
Address: addr,
|
||||
},
|
||||
Metrics: backend.Metrics{},
|
||||
UserSelectedMetric: backend.UserSelectedMetric{
|
||||
MetricName: targetMetric,
|
||||
},
|
||||
}
|
||||
pm, err = vllm.PromToPodMetrics(metricFamilies, pm)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
pms = append(pms, pm)
|
||||
}
|
||||
if metricPolicy == MetricPolicyLeast {
|
||||
filterFunc := func(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) {
|
||||
min := math.MaxFloat64
|
||||
max := 0.0
|
||||
filtered := []*backend.PodMetrics{}
|
||||
|
||||
for _, pod := range pods {
|
||||
if pod.MetricValue <= min {
|
||||
min = pod.MetricValue
|
||||
}
|
||||
if pod.MetricValue >= max {
|
||||
max = pod.MetricValue
|
||||
}
|
||||
}
|
||||
|
||||
for _, pod := range pods {
|
||||
if pod.MetricValue >= min && pod.MetricValue <= min+(max-min)/float64(len(pods)) {
|
||||
filtered = append(filtered, pod)
|
||||
}
|
||||
}
|
||||
return filtered, nil
|
||||
}
|
||||
filter := filter{
|
||||
name: "least user selected metric",
|
||||
filter: filterFunc,
|
||||
}
|
||||
return NewScheduler(pms, &filter), nil
|
||||
} else if metricPolicy == MetricPolicyMost {
|
||||
filterFunc := func(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) {
|
||||
min := math.MaxFloat64
|
||||
max := 0.0
|
||||
filtered := []*backend.PodMetrics{}
|
||||
|
||||
for _, pod := range pods {
|
||||
if pod.MetricValue <= min {
|
||||
min = pod.MetricValue
|
||||
}
|
||||
if pod.MetricValue >= max {
|
||||
max = pod.MetricValue
|
||||
}
|
||||
}
|
||||
|
||||
for _, pod := range pods {
|
||||
if pod.MetricValue <= max && pod.MetricValue >= max-(max-min)/float64(len(pods)) {
|
||||
filtered = append(filtered, pod)
|
||||
}
|
||||
}
|
||||
return filtered, nil
|
||||
}
|
||||
filter := filter{
|
||||
name: "most user selected metric",
|
||||
filter: filterFunc,
|
||||
}
|
||||
return NewScheduler(pms, &filter), nil
|
||||
}
|
||||
return NewScheduler(pms, defaultFilter), nil
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
package scheduling
|
||||
|
||||
// LLMRequest holds the request fields that the scheduling filters consult.
type LLMRequest struct {
	// Model is the model name looked up in a pod's ActiveModels by the LoRA
	// predicates.
	Model string
	// Critical marks the request as critical; the default filter tree routes
	// critical and non-critical requests down different branches.
	Critical bool
}
|
||||
Reference in New Issue
Block a user