---
# Deployment that runs a single llama.cpp server pod from the
# ghcr.io/ggml-org/llama.cpp:server-rocm image on one pinned node.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llamacpp
  annotations:
    # Stakater Reloader opt-in annotation. NOTE(review): presumably restarts
    # the pod when the referenced ConfigMap/Secret changes; this only has an
    # effect if Reloader is installed in the cluster -- confirm.
    reloader.stakater.com/auto: "true"
  labels:
    app: llamacpp
spec:
  replicas: 1
  strategy:
    # Recreate terminates the old pod before starting the new one.
    # NOTE(review): presumably chosen because the single GPU / hostPath on
    # the pinned node cannot be shared by two pods during a rollout.
    type: Recreate
  selector:
    matchLabels:
      app: llamacpp
  template:
    metadata:
      labels:
        # Must match spec.selector.matchLabels above.
        app: llamacpp
    spec:
      # Pin the pod to one specific node by hostname.
      nodeSelector:
        kubernetes.io/hostname: ai.tail2fe2d.ts.net
      # Allow scheduling onto a node tainted workload=ai:NoSchedule.
      tolerations:
        - key: workload
          operator: Equal
          value: ai
          effect: NoSchedule
      containers:
        - name: llamacpp
          image: ghcr.io/ggml-org/llama.cpp:server-rocm
          # The tag is mutable, so re-pull on every pod start.
          imagePullPolicy: Always
          # Import every key of the llamacpp-config ConfigMap as an
          # environment variable.
          envFrom:
            - configMapRef:
                name: llamacpp-config
          env:
            # Token read from the llamacpp-hf-token Secret. It is marked
            # optional, so the container still starts when the Secret or
            # key is absent.
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: llamacpp-hf-token
                  key: token
                  optional: true
          ports:
            - name: http
              containerPort: 8080
              protocol: TCP
          resources:
            limits:
              # One unit of the amd.com/gpu extended resource.
              # NOTE(review): assumes an AMD GPU device plugin advertises
              # this resource on the node -- confirm.
              amd.com/gpu: 1
          # Startup window: up to 180 failures x 10 s period = 30 minutes
          # before the container is considered failed.
          # NOTE(review): presumably sized for model download/load time.
          startupProbe:
            httpGet:
              path: /health
              port: http
            failureThreshold: 180
            periodSeconds: 10
            timeoutSeconds: 5
          # After startup, 3 consecutive failed /health checks (10 s apart)
          # mark the pod as not ready.
          readinessProbe:
            httpGet:
              path: /health
              port: http
            failureThreshold: 3
            periodSeconds: 10
            timeoutSeconds: 5
          volumeMounts:
            - name: models
              mountPath: /models
      volumes:
        # Node-local directory backing /models; created on the host if it
        # does not exist yet.
        - name: models
          hostPath:
            path: /k8s/llamacpp/models
            type: DirectoryOrCreate