vllm-project · zhangjyr · Feb 13, 2025 · Feb 26, 2025 · Mar 6, 2025 · Mar 7, 2025
diff --git a/cmd/plugins/main.go b/cmd/plugins/main.go
@@ -34,6 +34,7 @@ import (
 	extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3"
 	"github.com/vllm-project/aibrix/pkg/cache"
 	"github.com/vllm-project/aibrix/pkg/plugins/gateway"
+	routing "github.com/vllm-project/aibrix/pkg/plugins/gateway/algorithms"
 	"github.com/vllm-project/aibrix/pkg/utils"
 	"google.golang.org/grpc/health"
 	healthPb "google.golang.org/grpc/health/grpc_health_v1"
@@ -77,7 +78,7 @@ func main() {
 		panic(err)
 	}
 
-	cache.InitForGateway(config, stopCh, redisClient)
+	cache.InitForGateway(config, stopCh, redisClient, routing.ModelRouterFactory)
 
 	// Connect to K8s cluster
 	k8sClient, err := kubernetes.NewForConfig(config)

diff --git a/config/gateway/gateway-plugin/gateway-plugin.yaml b/config/gateway/gateway-plugin/gateway-plugin.yaml
@@ -2,7 +2,7 @@ apiVersion: v1
 kind: Service
 metadata:
   name: gateway-plugins
-  namespace: aibrix-system
+  namespace: system
 spec:
   selector:
     app: gateway-plugins
@@ -20,7 +20,7 @@ apiVersion: apps/v1
 kind: Deployment
 metadata:
   name: gateway-plugins
-  namespace: aibrix-system
+  namespace: system
 spec:
   strategy:
     type: RollingUpdate

diff --git a/config/overlays/dev/gateway-plugin/kustomization.yaml b/config/overlays/dev/gateway-plugin/kustomization.yaml
@@ -0,0 +1,38 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+namespace: aibrix-system
+
+namePrefix: aibrix-
+
+resources:
+- ../../../gateway/gateway-plugin
+
+images:
+- name: gateway-plugins
+  newName: aibrix/gateway-plugins
+  newTag: nightly
+
+patches:
+- patch: |-  # Use the '|' and '-' for inline patching
+    apiVersion: apps/v1
+    kind: Deployment
+    metadata:
+      name: gateway-plugins
+    spec:
+      template:
+        spec:
+          containers:
+            - name: gateway-plugin
+              args:
+                - -v=5
+              env:
+                - name: AIBRIX_POD_METRIC_REFRESH_INTERVAL_MS
+                  value: "60000"
+                - name: AIBRIX_GPU_OPTIMIZER_TRACING_FLAG
+                  value: "true"
+  target:
+    kind: Deployment
+    name: gateway-plugins
+    namespace: system
+    version: v1
diff --git a/config/overlays/dev/manager/kustomization.yaml b/config/overlays/dev/manager/kustomization.yaml
@@ -28,6 +28,4 @@ patches:
     kind: Deployment
     name: controller-manager
     namespace: system
-    version: v1
-
-apiVersion: kustomize.config.k8s.io/v1beta1
+    version: v1
diff --git a/config/overlays/vke-dev/gateway-plugin/gateway_plugins_patch.yaml b/config/overlays/vke-dev/gateway-plugin/gateway_plugins_patch.yaml
@@ -0,0 +1,39 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: gateway-plugins
+  namespace: aibrix-system
+spec:
+  replicas: 1
+  template:
+    spec:
+      affinity:
+        nodeAffinity: # prevent the gateway pod from being placed on a GPU node.
+          preferredDuringSchedulingIgnoredDuringExecution:
+          - weight: 100
+            preference:
+              matchExpressions:
+                - key: vke.node.gpu.schedule
+                  operator: NotIn
+                  values:
+                    - nvidia
+      containers:
+        - name: gateway-plugin
+          resources:
+            limits:
+              cpu: "2"
+              memory: 8Gi
+            requests:
+              cpu: "2"
+              memory: 8Gi
+          env:
+            - name: AIBRIX_PREFIX_CACHE_TOKENIZER_TYPE
+              value: "character"
+            - name: AIBRIX_PREFIX_CACHE_BLOCK_SIZE
+              value: "128"
+            - name: AIBRIX_PREFIX_CACHE_BLOCK_NUMBER
+              value: "200000"
+            - name: AIBRIX_PREFIX_CACHE_POD_RUNNING_REQUEST_IMBALANCE_ABS_COUNT
+              value: "16"
+            - name: AIBRIX_PREFIX_CACHE_STANDARD_DEVIATION_FACTOR
+              value: "2"
diff --git a/config/overlays/vke-dev/gateway-plugin/kustomization.yaml b/config/overlays/vke-dev/gateway-plugin/kustomization.yaml
@@ -1,17 +1,16 @@
 apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 
-namespace: aibrix-system
-
-namePrefix: aibrix-
-
 resources:
-- ../../../gateway/gateway-plugin
+- ../../dev/gateway-plugin
+
+patches:
+  - path: gateway_plugins_patch.yaml
 
 images:
 - name: busybox
   newName: aibrix-container-registry-cn-beijing.cr.volces.com/aibrix/busybox
   newTag: stable
-- name: gateway-plugins
+- name: aibrix/gateway-plugins
   newName: aibrix-container-registry-cn-beijing.cr.volces.com/aibrix/gateway-plugins
   newTag: nightly
diff --git a/development/app/Makefile b/development/app/Makefile
@@ -113,5 +113,17 @@ test-gateway2:
             "max_tokens": 512 \
         }'
 
+test-router:
+	curl -v http://localhost:8888/v1/chat/completions \
+		-H "model: llama2-7b" \
+		-H "Content-Type: application/json" \
+		-H "Authorization: Bearer any_key" \
+		-H "routing-strategy: slo" \
+		-d '{ \
+			"model": "llama2-7b", \
+			"messages": [{"role": "user", "content": "Say this is a test!"}], \
+			"temperature": 0.7 \
+		}'
+
 metrics:
 	curl http://localhost:8000/metrics
diff --git a/pkg/cache/cache_api.go b/pkg/cache/cache_api.go
@@ -28,6 +28,9 @@ type Cache interface {
 	ModelCache
 	MetricCache
 	RequestTracker
+	ProfileCache
+	types.OutputPredictorProvider
+	types.RouterProvider
 }
 
 // PodCache defines operations for pod information caching
@@ -106,7 +109,10 @@ type MetricCache interface {
 
 // RequestTracker defines operations for track workload statistics
 type RequestTracker interface {
-	// AddRequestCount starts tracking request count
+	// AddRequestCount tracks the start of a request after routing.
+	// To support realtime statistics update and access, AddRequestCount can be called multiple times for a request.
+	// As a result, implementations should ensure thread-safe access to the counter and idempotency.
+	//
 	// Parameters:
 	//   ctx: Routing context
 	//   requestID: Unique request identifier
@@ -115,14 +121,18 @@ type RequestTracker interface {
 	//   int64: Trace term identifier
 	AddRequestCount(ctx *types.RoutingContext, requestID string, modelName string) (traceTerm int64)
 
-	// DoneRequestCount completes request count tracking, only one DoneRequestXXX should be called for a request
+	// DoneRequestCount tracks the completion of a request without usage information like inputTokens and outputTokens.
+	// Only one DoneRequestXXX should be called for a request. Idempotency is not required.
+	//
 	// Parameters:
 	//   requestID: Unique request identifier
 	//   modelName: Name of the model
 	//   traceTerm: Trace term identifier
 	DoneRequestCount(ctx *types.RoutingContext, requestID string, modelName string, traceTerm int64)
 
-	// DoneRequestTrace completes request tracing, only one DoneRequestXXX should be called for a request
+	// DoneRequestTrace tracks the completion of a request with usage information like inputTokens and outputTokens.
+	// Only one DoneRequestXXX should be called for a request. Idempotency is not required.
+	//
 	// Parameters:
 	//   ctx: Routing context
 	//   requestID: Unique request identifier
@@ -132,3 +142,18 @@ type RequestTracker interface {
 	//   traceTerm: Trace term identifier
 	DoneRequestTrace(ctx *types.RoutingContext, requestID string, modelName string, inputTokens, outputTokens, traceTerm int64)
 }
+
+// ProfileCache defines operations for model profiles
+type ProfileCache interface {
+	// GetModelProfileByPod gets model profile for a pod
+	// Parameters:
+	//   pod: Pod object
+	//   modelName: Name of the model
+	GetModelProfileByPod(pod *v1.Pod, modelName string) (*ModelGPUProfile, error)
+
+	// GetModelProfileByDeploymentName gets model profile for a deployment
+	// Parameters:
+	//   deploymentName: Name of the deployment
+	//   modelName: Name of the model
+	GetModelProfileByDeploymentName(deploymentName string, modelName string) (*ModelGPUProfile, error)
+}