Added llamacpp for CUDA

2026-06-17 12:53:43 +01:00
parent 47adf8e718
commit 6b5a0fc31f
4 changed files with 102 additions and 0 deletions
@@ -0,0 +1,12 @@
+apiVersion: v1
+kind: ConfigMap  # env vars for the llamacpp-cuda container (loaded via envFrom)
+metadata:
+  name: llamacpp-cuda-config
+data:
+  LLAMA_ARG_HF_REPO: "unsloth/Qwen3.6-35B-A3B-MTP-GGUF:UD-Q6_K"
+  LLAMA_CACHE: "/models"  # same path as the Deployment's models volumeMount
+  LLAMA_ARG_HOST: "0.0.0.0"
+  LLAMA_ARG_PORT: "8080"  # matches containerPort 8080 in the Deployment
+  LLAMA_ARG_CTX_SIZE: "32768"
+  LLAMA_ARG_FLASH_ATTN: "auto"
+  LLAMA_ARG_FIT: "on"  # quoted so it stays the string "on", not a YAML 1.1 boolean
@@ -0,0 +1,72 @@
+apiVersion: apps/v1
+kind: Deployment  # single-replica llama.cpp CUDA server pinned to one node
+metadata:
+  name: llamacpp-cuda
+  labels:
+    app: llamacpp-cuda
+  annotations:
+    reloader.stakater.com/auto: "true"  # quoted: annotation values must be strings
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: llamacpp-cuda
+  strategy:
+    type: Recreate  # old pod is stopped before its replacement starts
+  template:
+    metadata:
+      labels:
+        app: llamacpp-cuda  # must match spec.selector.matchLabels
+    spec:
+      runtimeClassName: nvidia
+      dnsPolicy: Default  # pod inherits the node's DNS resolution
+      nodeSelector:
+        kubernetes.io/hostname: uk-desktop.tail2fe2d.ts.net
+      tolerations:
+        - key: workload
+          value: desktop
+          operator: Equal
+          effect: NoSchedule
+      volumes:
+        - name: models
+          hostPath:
+            type: DirectoryOrCreate
+            path: /data/llama.cpp/models
+      containers:
+        - name: llamacpp
+          image: ghcr.io/ggml-org/llama.cpp:server-cuda-b9501
+          imagePullPolicy: IfNotPresent
+          ports:
+            - name: http
+              protocol: TCP
+              containerPort: 8080
+          env:
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: llamacpp-hf-token
+                  key: token
+                  optional: true  # pod still starts when the secret is absent
+          envFrom:
+            - configMapRef:
+                name: llamacpp-cuda-config
+          volumeMounts:
+            - name: models
+              mountPath: /models
+          resources:
+            limits:
+              nvidia.com/gpu: 1
+          startupProbe:  # 180 failures x 10 s period = up to 30 min to come up
+            periodSeconds: 10
+            timeoutSeconds: 5
+            failureThreshold: 180
+            httpGet:
+              path: /health
+              port: http
+          readinessProbe:
+            periodSeconds: 10
+            timeoutSeconds: 5
+            failureThreshold: 3
+            httpGet:
+              path: /health
+              port: http  # resolves to the named container port above
@@ -3,6 +3,9 @@ kind: Kustomization

 resources:
  - app.yaml
+  - configmap-cuda.yaml
  - configmap.yaml
+  - deployment-cuda.yaml
  - deployment.yaml
+  - service-cuda.yaml
  - service.yaml
@@ -0,0 +1,15 @@
+apiVersion: v1
+kind: Service  # cluster-internal endpoint for the llamacpp-cuda pods
+metadata:
+  name: llamacpp-cuda
+  labels:
+    app: llamacpp-cuda
+spec:
+  type: ClusterIP
+  ports:
+    - name: http
+      protocol: TCP
+      port: 8080
+      targetPort: http  # named container port declared by the Deployment
+  selector:
+    app: llamacpp-cuda  # same label the Deployment's pod template carries