diff --git a/k8s/apps/llamacpp/configmap-cuda.yaml b/k8s/apps/llamacpp/configmap-cuda.yaml
new file mode 100644
index 0000000..5475c2d
--- /dev/null
+++ b/k8s/apps/llamacpp/configmap-cuda.yaml
@@ -0,0 +1,12 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: llamacpp-cuda-config
+data:
+  LLAMA_CACHE: /models
+  LLAMA_ARG_HOST: 0.0.0.0
+  LLAMA_ARG_PORT: "8080"
+  LLAMA_ARG_HF_REPO: "unsloth/Qwen3.6-35B-A3B-MTP-GGUF:UD-Q6_K"
+  LLAMA_ARG_CTX_SIZE: "32768"
+  LLAMA_ARG_FLASH_ATTN: auto
+  LLAMA_ARG_FIT: "on"
diff --git a/k8s/apps/llamacpp/deployment-cuda.yaml b/k8s/apps/llamacpp/deployment-cuda.yaml
new file mode 100644
index 0000000..a28c409
--- /dev/null
+++ b/k8s/apps/llamacpp/deployment-cuda.yaml
@@ -0,0 +1,72 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: llamacpp-cuda
+  annotations:
+    reloader.stakater.com/auto: "true"
+  labels:
+    app: llamacpp-cuda
+spec:
+  replicas: 1
+  strategy:
+    type: Recreate
+  selector:
+    matchLabels:
+      app: llamacpp-cuda
+  template:
+    metadata:
+      labels:
+        app: llamacpp-cuda
+    spec:
+      dnsPolicy: Default
+      runtimeClassName: nvidia
+      nodeSelector:
+        kubernetes.io/hostname: uk-desktop.tail2fe2d.ts.net
+      tolerations:
+        - key: workload
+          operator: Equal
+          value: desktop
+          effect: NoSchedule
+      containers:
+        - name: llamacpp
+          image: ghcr.io/ggml-org/llama.cpp:server-cuda-b9501
+          imagePullPolicy: IfNotPresent
+          envFrom:
+            - configMapRef:
+                name: llamacpp-cuda-config
+          env:
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: llamacpp-hf-token
+                  key: token
+                  optional: true
+          ports:
+            - name: http
+              containerPort: 8080
+              protocol: TCP
+          resources:
+            limits:
+              nvidia.com/gpu: 1
+          startupProbe:
+            httpGet:
+              path: /health
+              port: http
+            failureThreshold: 180
+            periodSeconds: 10
+            timeoutSeconds: 5
+          readinessProbe:
+            httpGet:
+              path: /health
+              port: http
+            failureThreshold: 3
+            periodSeconds: 10
+            timeoutSeconds: 5
+          volumeMounts:
+            - name: models
+              mountPath: /models
+      volumes:
+        - name: models
+          hostPath:
+            path: /data/llama.cpp/models
+            type: DirectoryOrCreate
diff --git a/k8s/apps/llamacpp/kustomization.yaml b/k8s/apps/llamacpp/kustomization.yaml
index 15d9dc2..b8bbd83 100644
--- a/k8s/apps/llamacpp/kustomization.yaml
+++ b/k8s/apps/llamacpp/kustomization.yaml
@@ -3,6 +3,9 @@ kind: Kustomization
 
 resources:
   - app.yaml
+  - configmap-cuda.yaml
   - configmap.yaml
+  - deployment-cuda.yaml
   - deployment.yaml
+  - service-cuda.yaml
   - service.yaml
diff --git a/k8s/apps/llamacpp/service-cuda.yaml b/k8s/apps/llamacpp/service-cuda.yaml
new file mode 100644
index 0000000..54d0580
--- /dev/null
+++ b/k8s/apps/llamacpp/service-cuda.yaml
@@ -0,0 +1,15 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: llamacpp-cuda
+  labels:
+    app: llamacpp-cuda
+spec:
+  type: ClusterIP
+  selector:
+    app: llamacpp-cuda
+  ports:
+    - name: http
+      port: 8080
+      targetPort: http
+      protocol: TCP