SGLang Model Gateway
sgl-project/sglang

# Write the whole prefill/decode demo stack into one multi-document manifest.
cat << 'EOF' > sglang-pd-stack.yaml
# Namespace that every other resource in this manifest is created in.
apiVersion: v1
kind: Namespace
metadata:
  name: llm-pd-demo
---
# Proxy and Hugging Face mirror settings; the Deployments in this manifest
# load every key as an environment variable through `envFrom`.
apiVersion: v1
kind: ConfigMap
metadata:
  name: pod-proxy-env
  namespace: llm-pd-demo
data:
  # Lower- and upper-case spellings are both provided.
  http_proxy: "http://218.16.121.13:1080"
  https_proxy: "http://218.16.121.13:1080"
  HTTP_PROXY: "http://218.16.121.13:1080"
  HTTPS_PROXY: "http://218.16.121.13:1080"
  # In-cluster traffic must bypass the external proxy. The DNS suffixes only
  # cover name-based access, so the pod and service IP ranges are listed too;
  # otherwise requests addressed to a pod IP would be sent to the proxy.
  # 10.42.0.0/16 and 10.43.0.0/16 are the k3s/k3d default cluster and service
  # CIDRs; adjust them if the cluster was created with different ranges.
  NO_PROXY: "127.0.0.1,localhost,.svc,.cluster.local,10.42.0.0/16,10.43.0.0/16"
  no_proxy: "127.0.0.1,localhost,.svc,.cluster.local,10.42.0.0/16,10.43.0.0/16"
  # Download models from a Hugging Face mirror instead of the default endpoint.
  HF_ENDPOINT: "https://hf-mirror.com"
---
# Identity used by the gateway pod (referenced via `serviceAccountName` in the
# sglang-model-gateway Deployment).
apiVersion: v1
kind: ServiceAccount
metadata:
  name: sglang-model-gateway
  namespace: llm-pd-demo
---
# Namespaced, read-only access to pods in llm-pd-demo.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: sglang-model-gateway
  namespace: llm-pd-demo
rules:
  # "" is the core API group, which is where pods live.
  - apiGroups: [""]
    resources: ["pods"]
    verbs: ["get", "list", "watch"]
---
# Grants the sglang-model-gateway Role to the ServiceAccount of the same name.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: sglang-model-gateway
  namespace: llm-pd-demo
subjects:
  - kind: ServiceAccount
    name: sglang-model-gateway
    namespace: llm-pd-demo
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: sglang-model-gateway
---
# Prefill worker: runs sglang.launch_server with --disaggregation-mode prefill.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: sglang-prefill
  namespace: llm-pd-demo
spec:
  replicas: 1
  selector:
    matchLabels:
      app: sglang-worker
      component: prefill
  template:
    metadata:
      labels:
        app: sglang-worker
        component: prefill
        model: qwen25-05b
      annotations:
        # Same value as --disaggregation-bootstrap-port and the `bootstrap`
        # containerPort below.
        sglang.ai/bootstrap-port: "8998"
    spec:
      runtimeClassName: nvidia
      nodeSelector:
        accelerator: nvidia
      terminationGracePeriodSeconds: 120
      affinity:
        podAntiAffinity:
          # Hard rule: never share a node with another pod labelled
          # app=sglang-worker. Both the prefill and the decode pods carry that
          # label, so each worker needs its own `accelerator: nvidia` node.
          # NOTE(review): with the default rolling update a replacement pod
          # also needs a free node; confirm spare capacity exists.
          requiredDuringSchedulingIgnoredDuringExecution:
            - labelSelector:
                matchExpressions:
                  - key: app
                    operator: In
                    values: ["sglang-worker"]
              topologyKey: kubernetes.io/hostname
      containers:
        - name: sglang-prefill
          image: lmsysorg/sglang:v0.5.9-cu129-amd64
          imagePullPolicy: IfNotPresent
          envFrom:
            # Proxy / mirror variables from the pod-proxy-env ConfigMap.
            - configMapRef:
                name: pod-proxy-env
          env:
            - name: PYTHONUNBUFFERED
              value: "1"
          command: ["python3", "-m", "sglang.launch_server"]
          args:
            - "--model-path"
            - "Qwen/Qwen2.5-0.5B-Instruct"
            - "--host"
            - "0.0.0.0"
            - "--port"
            - "30000"
            - "--tp-size"
            - "1"
            - "--disaggregation-mode"
            - "prefill"
            - "--disaggregation-transfer-backend"
            - "fake"
            - "--disaggregation-bootstrap-port"
            - "8998"
            - "--mem-fraction-static"
            - "0.65"
          ports:
            - name: http
              containerPort: 30000
            - name: bootstrap
              containerPort: 8998
          resources:
            requests:
              cpu: "2"
              memory: "8Gi"
            limits:
              nvidia.com/gpu: "1"
          volumeMounts:
            - name: hf-cache
              mountPath: /root/.cache/huggingface
            - name: shm
              mountPath: /dev/shm
          # All probes only check that the `http` port accepts TCP connections.
          # Startup budget: 180 failures x 10 s = 30 minutes.
          startupProbe:
            tcpSocket:
              port: http
            periodSeconds: 10
            failureThreshold: 180
          readinessProbe:
            tcpSocket:
              port: http
            periodSeconds: 5
            failureThreshold: 6
          livenessProbe:
            tcpSocket:
              port: http
            periodSeconds: 10
            failureThreshold: 6
      volumes:
        # emptyDir: the Hugging Face cache does not survive pod deletion.
        - name: hf-cache
          emptyDir: {}
        # Memory-backed /dev/shm, capped at 2Gi.
        - name: shm
          emptyDir:
            medium: Memory
            sizeLimit: 2Gi
---
# Decode worker: runs sglang.launch_server with --disaggregation-mode decode.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: sglang-decode
  namespace: llm-pd-demo
spec:
  replicas: 1
  selector:
    matchLabels:
      app: sglang-worker
      component: decode
  template:
    metadata:
      labels:
        app: sglang-worker
        component: decode
        model: qwen25-05b
    spec:
      runtimeClassName: nvidia
      nodeSelector:
        accelerator: nvidia
      terminationGracePeriodSeconds: 120
      affinity:
        podAntiAffinity:
          # Hard rule: never share a node with another pod labelled
          # app=sglang-worker. Both the prefill and the decode pods carry that
          # label, so each worker needs its own `accelerator: nvidia` node.
          # NOTE(review): with the default rolling update a replacement pod
          # also needs a free node; confirm spare capacity exists.
          requiredDuringSchedulingIgnoredDuringExecution:
            - labelSelector:
                matchExpressions:
                  - key: app
                    operator: In
                    values: ["sglang-worker"]
              topologyKey: kubernetes.io/hostname
      containers:
        - name: sglang-decode
          image: lmsysorg/sglang:v0.5.9-cu129-amd64
          imagePullPolicy: IfNotPresent
          envFrom:
            # Proxy / mirror variables from the pod-proxy-env ConfigMap.
            - configMapRef:
                name: pod-proxy-env
          env:
            - name: PYTHONUNBUFFERED
              value: "1"
          command: ["python3", "-m", "sglang.launch_server"]
          args:
            - "--model-path"
            - "Qwen/Qwen2.5-0.5B-Instruct"
            - "--host"
            - "0.0.0.0"
            - "--port"
            - "30000"
            - "--tp-size"
            - "1"
            - "--disaggregation-mode"
            - "decode"
            - "--disaggregation-transfer-backend"
            - "fake"
            - "--mem-fraction-static"
            - "0.65"
          ports:
            - name: http
              containerPort: 30000
          resources:
            requests:
              cpu: "2"
              memory: "8Gi"
            limits:
              nvidia.com/gpu: "1"
          volumeMounts:
            - name: hf-cache
              mountPath: /root/.cache/huggingface
            - name: shm
              mountPath: /dev/shm
          # All probes only check that the `http` port accepts TCP connections.
          # Startup budget: 180 failures x 10 s = 30 minutes.
          startupProbe:
            tcpSocket:
              port: http
            periodSeconds: 10
            failureThreshold: 180
          readinessProbe:
            tcpSocket:
              port: http
            periodSeconds: 5
            failureThreshold: 6
          livenessProbe:
            tcpSocket:
              port: http
            periodSeconds: 10
            failureThreshold: 6
      volumes:
        # emptyDir: the Hugging Face cache does not survive pod deletion.
        - name: hf-cache
          emptyDir: {}
        # Memory-backed /dev/shm, capped at 2Gi.
        - name: shm
          emptyDir:
            medium: Memory
            sizeLimit: 2Gi
---
# Gateway: runs sglang_router.launch_router with pod-based service discovery
# and prefill/decode disaggregation enabled.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: sglang-model-gateway
  namespace: llm-pd-demo
spec:
  replicas: 1
  selector:
    matchLabels:
      app: sglang-model-gateway
  template:
    metadata:
      labels:
        app: sglang-model-gateway
    spec:
      # ServiceAccount bound to the pod get/list/watch Role in this namespace.
      serviceAccountName: sglang-model-gateway
      terminationGracePeriodSeconds: 60
      containers:
        - name: gateway
          image: lmsysorg/sglang:v0.5.9-cu129-amd64
          imagePullPolicy: IfNotPresent
          envFrom:
            # Proxy / mirror variables from the pod-proxy-env ConfigMap.
            - configMapRef:
                name: pod-proxy-env
          env:
            - name: PYTHONUNBUFFERED
              value: "1"
          command: ["python3", "-m", "sglang_router.launch_router"]
          args:
            - "--host"
            - "0.0.0.0"
            - "--port"
            - "30000"
            - "--log-level"
            - "debug"
            - "--service-discovery"
            - "--service-discovery-namespace"
            - "llm-pd-demo"
            # Same port number as the workers' `http` containerPort.
            - "--service-discovery-port"
            - "30000"
            - "--pd-disaggregation"
            # Each selector flag is followed by one key=value argument per
            # label; the pairs match the labels on the worker pod templates.
            - "--prefill-selector"
            - "app=sglang-worker"
            - "component=prefill"
            - "--decode-selector"
            - "app=sglang-worker"
            - "component=decode"
            - "--prefill-policy"
            - "cache_aware"
            - "--decode-policy"
            - "power_of_two"
            - "--max-concurrent-requests"
            - "32"
            # 1800 s equals the workers' 30-minute startup probe budget.
            - "--worker-startup-timeout-secs"
            - "1800"
          ports:
            - name: http
              containerPort: 30000
            # NOTE(review): no argument above configures a metrics listener;
            # presumably 29000 is the router's default metrics port. Confirm
            # it is bound to an address reachable from outside the pod.
            - name: metrics
              containerPort: 29000
          resources:
            requests:
              cpu: "500m"
              memory: "512Mi"
          # Probes only check that the `http` port accepts TCP connections.
          readinessProbe:
            tcpSocket:
              port: http
            periodSeconds: 5
          livenessProbe:
            tcpSocket:
              port: http
            periodSeconds: 10
---
# Cluster-internal Service in front of the gateway pods.
apiVersion: v1
kind: Service
metadata:
  name: sglang-model-gateway
  namespace: llm-pd-demo
spec:
  selector:
    app: sglang-model-gateway
  ports:
    - name: http
      port: 30000
      targetPort: 30000
    - name: metrics
      port: 29000
      targetPort: 29000
  type: ClusterIP
EOF
# Create or update every resource described in the generated manifest.
kubectl apply --filename sglang-pd-stack.yaml
# Block until each Deployment reports the Available condition. The workers get
# 30 minutes each; the gateway gets 10 minutes.
kubectl wait --namespace llm-pd-demo --for=condition=Available --timeout=30m deployment/sglang-prefill
kubectl wait --namespace llm-pd-demo --for=condition=Available --timeout=30m deployment/sglang-decode
kubectl wait --namespace llm-pd-demo --for=condition=Available --timeout=10m deployment/sglang-model-gateway
# Import the image into the whole cluster (server + agents).
# NOTE(review): these imports appear after `kubectl apply`; presumably they
# should run first when the nodes cannot pull the image themselves - confirm.
k3d image import -c cluster-1 lmsysorg/sglang:v0.5.9-cu129-amd64
# Import only into the specified node (--nodes may be given multiple times).
k3d image import -c cluster-1 --nodes k3d-cluster-1-agent-0 lmsysorg/sglang:v0.5.9-cu129-amd64
# Import several images in one call.
# NOTE(review): this example passes the same image twice; replace the second
# argument with the other image that should be imported.
k3d image import -c cluster-1 lmsysorg/sglang:v0.5.9-cu129-amd64 lmsysorg/sglang:v0.5.9-cu129-amd64
叶王 © 2013-2026 版权所有。如果本文档对你有所帮助,可以请作者喝饮料。