# Patch summary (free text before the first "diff --git" header; patch tools
# skip it):
#
#   1. Creates k8s/core/gpu/amd-gpu-values.yaml, a Helm values file that
#      turns off the `nfd` and `labeller` components, pins the `dp` image to
#      docker.io/rocm/k8s-device-plugin:1.31.0.9 with a RollingUpdate
#      strategy (maxUnavailable: 1), drops all container capabilities,
#      tolerates the `workload=ai:NoSchedule` taint, and sets a node selector
#      for amd64 on host ai.tail2fe2d.ts.net.
#   2. Appends an `amd-gpu` entry (chart version 0.21.0, release
#      `amd-gpu-device-plugin`, namespace `gpu-system`) to `helmCharts` in
#      k8s/core/gpu/kustomization.yaml and adds a JSON patch that replaces
#      /spec/template/spec/nodeSelector on the DaemonSet
#      `amd-gpu-device-plugin-daemonset` with the same two selector labels.
#
# NOTE(review): line breaks and YAML indentation were reconstructed from a
# whitespace-collapsed copy of this patch. The 2-space, indented-sequence
# layout below is an assumption -- confirm the context lines of the second
# hunk against the real kustomization.yaml before applying.
# NOTE(review): the JSON patch uses `op: replace`, which presumably relies on
# the rendered DaemonSet already having a nodeSelector -- verify against the
# chart output.
diff --git a/k8s/core/gpu/amd-gpu-values.yaml b/k8s/core/gpu/amd-gpu-values.yaml
new file mode 100644
index 0000000..5c0ce5a
--- /dev/null
+++ b/k8s/core/gpu/amd-gpu-values.yaml
@@ -0,0 +1,31 @@
+nfd:
+  enabled: false
+
+labeller:
+  enabled: false
+
+dp:
+  image:
+    repository: docker.io/rocm/k8s-device-plugin
+    tag: "1.31.0.9"
+  updateStrategy:
+    type: RollingUpdate
+    rollingUpdate:
+      maxUnavailable: 1
+
+securityContext:
+  allowPrivilegeEscalation: false
+  capabilities:
+    drop:
+      - ALL
+
+tolerations:
+  - key: workload
+    operator: Equal
+    value: ai
+    effect: NoSchedule
+
+node_selector_enabled: true
+node_selector:
+  kubernetes.io/arch: amd64
+  kubernetes.io/hostname: ai.tail2fe2d.ts.net
diff --git a/k8s/core/gpu/kustomization.yaml b/k8s/core/gpu/kustomization.yaml
index 45a573e..d447210 100644
--- a/k8s/core/gpu/kustomization.yaml
+++ b/k8s/core/gpu/kustomization.yaml
@@ -13,3 +13,24 @@ helmCharts:
     namespace: gpu-system
     valuesFile: values.yaml
     includeCRDs: true
+  - name: amd-gpu
+    repo: https://rocm.github.io/k8s-device-plugin/
+    version: 0.21.0
+    releaseName: amd-gpu-device-plugin
+    namespace: gpu-system
+    valuesFile: amd-gpu-values.yaml
+    includeCRDs: true
+
+patches:
+  - target:
+      group: apps
+      version: v1
+      kind: DaemonSet
+      name: amd-gpu-device-plugin-daemonset
+      namespace: gpu-system
+    patch: |-
+      - op: replace
+        path: /spec/template/spec/nodeSelector
+        value:
+          kubernetes.io/arch: amd64
+          kubernetes.io/hostname: ai.tail2fe2d.ts.net