SGLang Model Gateway

Repository: sgl-project/sglang on GitHub (stars / forks / language / last tag / last commit badges omitted).

cat << 'EOF' > sglang-pd-stack.yaml
# Namespace holding all prefill/decode/gateway resources for this demo.
apiVersion: v1
kind: Namespace
metadata:
  name: llm-pd-demo
---
# Shared HTTP(S) proxy settings, injected into every pod via envFrom.
apiVersion: v1
kind: ConfigMap
metadata:
  name: pod-proxy-env
  namespace: llm-pd-demo
data:
  http_proxy: "http://218.16.121.13:1080"
  https_proxy: "http://218.16.121.13:1080"
  HTTP_PROXY: "http://218.16.121.13:1080"
  HTTPS_PROXY: "http://218.16.121.13:1080"
  # FIX: also bypass the proxy for in-cluster IP ranges. The gateway's
  # service discovery dials worker *pod IPs* directly; those match neither
  # ".svc" nor ".cluster.local", so without the CIDR entries that traffic
  # would be sent to the external proxy and fail. RFC1918 ranges cover the
  # default pod/service CIDRs of k3d/k3s and most clusters.
  # NOTE(review): CIDR matching in NO_PROXY is honored by most modern HTTP
  # clients but not all — verify against the clients in this image.
  NO_PROXY: "127.0.0.1,localhost,.svc,.cluster.local,10.0.0.0/8,172.16.0.0/12,192.168.0.0/16"
  no_proxy: "127.0.0.1,localhost,.svc,.cluster.local,10.0.0.0/8,172.16.0.0/12,192.168.0.0/16"
  # Hugging Face mirror used for model downloads.
  HF_ENDPOINT: "https://hf-mirror.com"
---
# Identity for the gateway pod; used by the router's Kubernetes service
# discovery (--service-discovery flag in the gateway Deployment).
apiVersion: v1
kind: ServiceAccount
metadata:
  name: sglang-model-gateway
  namespace: llm-pd-demo
---
# Minimal read-only pod access in this namespace — enough for the router
# to watch pods matching the prefill/decode label selectors.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: sglang-model-gateway
  namespace: llm-pd-demo
rules:
  - apiGroups: [""]
    resources: ["pods"]
    verbs: ["get", "list", "watch"]
---
# Bind the Role above to the gateway's ServiceAccount.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: sglang-model-gateway
  namespace: llm-pd-demo
subjects:
  - kind: ServiceAccount
    name: sglang-model-gateway
    namespace: llm-pd-demo
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: sglang-model-gateway
---
# Prefill worker: sglang.launch_server in disaggregated "prefill" mode.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: sglang-prefill
  namespace: llm-pd-demo
spec:
  replicas: 1
  selector:
    matchLabels:
      app: sglang-worker
      component: prefill
  template:
    metadata:
      labels:
        # Matched by the gateway's --prefill-selector (app + component).
        app: sglang-worker
        component: prefill
        model: qwen25-05b
      annotations:
        # Advertises the KV-transfer bootstrap port to the router; must match
        # --disaggregation-bootstrap-port in the args below.
        sglang.ai/bootstrap-port: "8998"
    spec:
      runtimeClassName: nvidia
      nodeSelector:
        accelerator: nvidia
      # Give in-flight requests time to drain on shutdown.
      terminationGracePeriodSeconds: 120
      affinity:
        # One sglang worker per node — prefill and decode land on
        # different GPU nodes.
        podAntiAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            - labelSelector:
                matchExpressions:
                  - key: app
                    operator: In
                    values: ["sglang-worker"]
              topologyKey: kubernetes.io/hostname
      containers:
        - name: sglang-prefill
          image: lmsysorg/sglang:v0.5.9-cu129-amd64
          imagePullPolicy: IfNotPresent
          envFrom:
            - configMapRef:
                name: pod-proxy-env
          env:
            - name: PYTHONUNBUFFERED
              value: "1"
          command: ["python3", "-m", "sglang.launch_server"]
          args:
            - "--model-path"
            - "Qwen/Qwen2.5-0.5B-Instruct"
            - "--host"
            - "0.0.0.0"
            - "--port"
            - "30000"
            - "--tp-size"
            - "1"
            # Prefill side of PD disaggregation. The "fake" transfer backend
            # is for demo/testing — presumably no real KV-cache transfer
            # happens; verify against the sglang version in the image.
            - "--disaggregation-mode"
            - "prefill"
            - "--disaggregation-transfer-backend"
            - "fake"
            - "--disaggregation-bootstrap-port"
            - "8998"
            - "--mem-fraction-static"
            - "0.65"
          ports:
            - name: http
              containerPort: 30000
            - name: bootstrap
              containerPort: 8998
          resources:
            requests:
              cpu: "2"
              memory: "8Gi"
            limits:
              nvidia.com/gpu: "1"
          volumeMounts:
            - name: hf-cache
              mountPath: /root/.cache/huggingface
            - name: shm
              mountPath: /dev/shm
          # Model download + load can be slow: allow up to 30 min
          # (180 x 10s) before startup is considered failed.
          startupProbe:
            tcpSocket:
              port: http
            periodSeconds: 10
            failureThreshold: 180
          readinessProbe:
            tcpSocket:
              port: http
            periodSeconds: 5
            failureThreshold: 6
          livenessProbe:
            tcpSocket:
              port: http
            periodSeconds: 10
            failureThreshold: 6
      volumes:
        # Ephemeral HF model cache — re-downloaded on pod restart.
        - name: hf-cache
          emptyDir: {}
        # RAM-backed /dev/shm (2Gi cap) — presumably for PyTorch shared
        # memory; the container default of 64Mi is usually too small.
        - name: shm
          emptyDir:
            medium: Memory
            sizeLimit: 2Gi
---
# Decode worker: sglang.launch_server in disaggregated "decode" mode.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: sglang-decode
  namespace: llm-pd-demo
spec:
  replicas: 1
  selector:
    matchLabels:
      app: sglang-worker
      component: decode
  template:
    metadata:
      labels:
        # Matched by the gateway's --decode-selector (app + component).
        app: sglang-worker
        component: decode
        model: qwen25-05b
    spec:
      runtimeClassName: nvidia
      nodeSelector:
        accelerator: nvidia
      terminationGracePeriodSeconds: 120
      affinity:
        # Same anti-affinity as the prefill deployment: one sglang worker
        # per node.
        podAntiAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            - labelSelector:
                matchExpressions:
                  - key: app
                    operator: In
                    values: ["sglang-worker"]
              topologyKey: kubernetes.io/hostname
      containers:
        - name: sglang-decode
          image: lmsysorg/sglang:v0.5.9-cu129-amd64
          imagePullPolicy: IfNotPresent
          envFrom:
            - configMapRef:
                name: pod-proxy-env
          env:
            - name: PYTHONUNBUFFERED
              value: "1"
          command: ["python3", "-m", "sglang.launch_server"]
          args:
            - "--model-path"
            - "Qwen/Qwen2.5-0.5B-Instruct"
            - "--host"
            - "0.0.0.0"
            - "--port"
            - "30000"
            - "--tp-size"
            - "1"
            # Decode side of PD disaggregation; "fake" transfer backend
            # matches the prefill worker (demo only).
            - "--disaggregation-mode"
            - "decode"
            - "--disaggregation-transfer-backend"
            - "fake"
            - "--mem-fraction-static"
            - "0.65"
          ports:
            - name: http
              containerPort: 30000
          resources:
            requests:
              cpu: "2"
              memory: "8Gi"
            limits:
              nvidia.com/gpu: "1"
          volumeMounts:
            - name: hf-cache
              mountPath: /root/.cache/huggingface
            - name: shm
              mountPath: /dev/shm
          # Up to 30 min (180 x 10s) for model download + load.
          startupProbe:
            tcpSocket:
              port: http
            periodSeconds: 10
            failureThreshold: 180
          readinessProbe:
            tcpSocket:
              port: http
            periodSeconds: 5
            failureThreshold: 6
          livenessProbe:
            tcpSocket:
              port: http
            periodSeconds: 10
            failureThreshold: 6
      volumes:
        # Ephemeral HF model cache — re-downloaded on pod restart.
        - name: hf-cache
          emptyDir: {}
        # RAM-backed /dev/shm (2Gi cap) — presumably for PyTorch shared
        # memory; container default is usually too small.
        - name: shm
          emptyDir:
            medium: Memory
            sizeLimit: 2Gi
---
# Gateway/router: discovers prefill and decode workers through the Kubernetes
# API and routes requests across them in PD-disaggregated mode.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: sglang-model-gateway
  namespace: llm-pd-demo
spec:
  replicas: 1
  selector:
    matchLabels:
      app: sglang-model-gateway
  template:
    metadata:
      labels:
        app: sglang-model-gateway
    spec:
      # ServiceAccount bound to the pods get/list/watch Role — required for
      # --service-discovery below.
      serviceAccountName: sglang-model-gateway
      terminationGracePeriodSeconds: 60
      containers:
        - name: gateway
          image: lmsysorg/sglang:v0.5.9-cu129-amd64
          imagePullPolicy: IfNotPresent
          envFrom:
            - configMapRef:
                name: pod-proxy-env
          env:
            - name: PYTHONUNBUFFERED
              value: "1"
          command: ["python3", "-m", "sglang_router.launch_router"]
          args:
            - "--host"
            - "0.0.0.0"
            - "--port"
            - "30000"
            - "--log-level"
            - "debug"
            # Watch pods in this namespace instead of a static worker list.
            - "--service-discovery"
            - "--service-discovery-namespace"
            - "llm-pd-demo"
            - "--service-discovery-port"
            - "30000"
            # Prefill/decode disaggregation: separate label selectors and
            # routing policies for the two worker pools.
            - "--pd-disaggregation"
            - "--prefill-selector"
            - "app=sglang-worker"
            - "component=prefill"
            - "--decode-selector"
            - "app=sglang-worker"
            - "component=decode"
            - "--prefill-policy"
            - "cache_aware"
            - "--decode-policy"
            - "power_of_two"
            - "--max-concurrent-requests"
            - "32"
            # Generous timeout: workers may spend a long time downloading
            # the model before they answer.
            - "--worker-startup-timeout-secs"
            - "1800"
            # FIX: actually serve Prometheus metrics on the "metrics"
            # containerPort declared below. Without these flags the router
            # does not listen on 29000 and the Service's metrics port is
            # dead. NOTE(review): flag names per sglang-router docs —
            # verify against the router version shipped in this image.
            - "--prometheus-host"
            - "0.0.0.0"
            - "--prometheus-port"
            - "29000"
          ports:
            - name: http
              containerPort: 30000
            - name: metrics
              containerPort: 29000
          resources:
            requests:
              cpu: "500m"
              memory: "512Mi"
          readinessProbe:
            tcpSocket:
              port: http
            periodSeconds: 5
          livenessProbe:
            tcpSocket:
              port: http
            periodSeconds: 10
---
# ClusterIP Service fronting the gateway: port 30000 is the router's HTTP
# API; port 29000 is declared for metrics (see the gateway Deployment).
apiVersion: v1
kind: Service
metadata:
  name: sglang-model-gateway
  namespace: llm-pd-demo
spec:
  selector:
    app: sglang-model-gateway
  ports:
    - name: http
      port: 30000
      targetPort: 30000
    - name: metrics
      port: 29000
      targetPort: 29000
  type: ClusterIP
EOF

# Apply the whole stack and wait for each Deployment to become Available.
kubectl apply -f sglang-pd-stack.yaml

kubectl -n llm-pd-demo wait --for=condition=Available deployment/sglang-prefill --timeout=30m
kubectl -n llm-pd-demo wait --for=condition=Available deployment/sglang-decode --timeout=30m
kubectl -n llm-pd-demo wait --for=condition=Available deployment/sglang-model-gateway --timeout=10m

# Import the image into the whole cluster (server + agents).
k3d image import -c cluster-1 lmsysorg/sglang:v0.5.9-cu129-amd64

# Import only into specific nodes (--nodes may be repeated).
k3d image import -c cluster-1 --nodes k3d-cluster-1-agent-0 lmsysorg/sglang:v0.5.9-cu129-amd64

# Import several images at once — list each image once.
# FIX: the original example repeated the same sglang image twice, which
# demonstrates nothing; a second, distinct image is shown instead.
k3d image import -c cluster-1 lmsysorg/sglang:v0.5.9-cu129-amd64 registry.k8s.io/pause:3.9