# homelab/k8s/apps/llamacpp/deployment-cuda.yaml

---
# Deployment running a single llama.cpp server pod from the CUDA image,
# scheduled onto one specific GPU node.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llamacpp-cuda
  labels:
    app: llamacpp-cuda
  annotations:
    # Stakater Reloader opt-in: restarts the workload when a ConfigMap or
    # Secret it references changes (requires Reloader in the cluster).
    reloader.stakater.com/auto: 'true'
spec:
  replicas: 1
  selector:
    matchLabels:
      app: llamacpp-cuda
  # Recreate terminates the existing pod before creating its replacement.
  # NOTE(review): presumably chosen because only one GPU is available to
  # this workload, so old and new pods cannot overlap - confirm.
  strategy:
    type: Recreate
  template:
    metadata:
      labels:
        app: llamacpp-cuda
    spec:
      # Run the pod under the RuntimeClass named "nvidia".
      runtimeClassName: nvidia
      # "Default" makes the pod inherit the node's DNS configuration instead
      # of using cluster DNS.
      # NOTE(review): cluster-internal service names will not resolve with
      # this policy - confirm this is intended.
      dnsPolicy: Default
      # Tolerate the workload=desktop:NoSchedule taint.
      tolerations:
        - key: workload
          operator: Equal
          value: desktop
          effect: NoSchedule
      # Pin scheduling to a single host by its hostname label.
      nodeSelector:
        kubernetes.io/hostname: uk-desktop.tail2fe2d.ts.net
      volumes:
        # Node-local directory backing /models; DirectoryOrCreate creates it
        # on the node if it does not exist yet.
        - name: models
          hostPath:
            type: DirectoryOrCreate
            path: /data/llama.cpp/models
      containers:
        - name: llamacpp
          image: ghcr.io/ggml-org/llama.cpp:server-cuda-b9501
          imagePullPolicy: IfNotPresent
          ports:
            # Named port; the probes below refer to it as "http".
            - name: http
              protocol: TCP
              containerPort: 8080
          # Every key of this ConfigMap is injected as an environment variable.
          envFrom:
            - configMapRef:
                name: llamacpp-cuda-config
          env:
            # optional: true lets the container start even when the Secret
            # (or its "token" key) is absent; HF_TOKEN is then simply unset.
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  key: token
                  name: llamacpp-hf-token
                  optional: true
          volumeMounts:
            - mountPath: /models
              name: models
          resources:
            limits:
              # One GPU; for extended resources the request defaults to the
              # limit.
              nvidia.com/gpu: 1
          # Startup budget: 180 failures x 10 s period = up to 30 minutes for
          # /health to succeed before the container is restarted.
          # NOTE(review): presumably sized for model download/load - confirm.
          startupProbe:
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 180
            httpGet:
              port: http
              path: /health
          # After startup, 3 consecutive failed /health checks (10 s apart)
          # mark the pod as not ready.
          readinessProbe:
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 3
            httpGet:
              port: http
              path: /health