Validate the node-level NCCL test with 8 GPUs
Create a YAML file named nccl-local.yaml with the following content:
apiVersion: v1
kind: Pod
metadata:
  name: nccl-local-test
spec:
  restartPolicy: Never
  containers:
    - name: nvidia-smi
      image: docker.io/deepops/nccl-tests:2312
      command: ["/bin/bash", "-c", "nvidia-smi && tail -f /dev/null"]
      resources:
        limits:
          nvidia.com/gpu: 8 # Request 8 GPUs
  nodeSelector:
    nvidia.com/gpu.present: "true" # Ensure it runs on a node with a GPU
Create the pod, which starts the container, with:
kubectl apply -f nccl-local.yaml
Verify that the pod is running:
k8suser@bcm10-headnode1:~$ kubectl get pods
NAME READY STATUS RESTARTS AGE
nccl-local-test 1/1 Running 0 17s
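Log in to the container (for example, with kubectl exec -it nccl-local-test -- /bin/bash) and run the NCCL broadcast performance test (broadcast_perf) with 8 GPUs: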
Reference: NCCL primitives
Reference: NCCL Tests
root@nccl-local-test:/workspace# broadcast_perf -b 64k -e 2G -f 2 -g 8
# nThread 1 nGpus 8 minBytes 65536 maxBytes 2147483648 step: 2(factor) warmup iters: 5 iters: 20 agg iters: 1 validation: 1 graph: 0
#
# Using devices
# Rank 0 Group 0 Pid 323 on nccl-local-test device 0 [0x1b] NVIDIA H100 80GB HBM3
# Rank 1 Group 0 Pid 323 on nccl-local-test device 1 [0x43] NVIDIA H100 80GB HBM3
# Rank 2 Group 0 Pid 323 on nccl-local-test device 2 [0x52] NVIDIA H100 80GB HBM3
# Rank 3 Group 0 Pid 323 on nccl-local-test device 3 [0x61] NVIDIA H100 80GB HBM3
# Rank 4 Group 0 Pid 323 on nccl-local-test device 4 [0x9d] NVIDIA H100 80GB HBM3
# Rank 5 Group 0 Pid 323 on nccl-local-test device 5 [0xc3] NVIDIA H100 80GB HBM3
# Rank 6 Group 0 Pid 323 on nccl-local-test device 6 [0xd1] NVIDIA H100 80GB HBM3
# Rank 7 Group 0 Pid 323 on nccl-local-test device 7 [0xdf] NVIDIA H100 80GB HBM3
#
# out-of-place in-place
# size count type redop root time algbw busbw #wrong time algbw busbw #wrong
# (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s)
65536 16384 float none 0 28.77 2.28 2.28 0 27.55 2.38 2.38 0
131072 32768 float none 0 30.42 4.31 4.31 0 30.28 4.33 4.33 0
262144 65536 float none 0 35.31 7.42 7.42 0 33.20 7.90 7.90 0
524288 131072 float none 0 42.05 12.47 12.47 0 43.85 11.96 11.96 0
1048576 262144 float none 0 53.98 19.42 19.42 0 52.38 20.02 20.02 0
2097152 524288 float none 0 54.29 38.63 38.63 0 54.07 38.79 38.79 0
4194304 1048576 float none 0 56.67 74.01 74.01 0 55.61 75.43 75.43 0
8388608 2097152 float none 0 60.71 138.17 138.17 0 62.26 134.74 134.74 0
16777216 4194304 float none 0 82.29 203.88 203.88 0 82.32 203.80 203.80 0
33554432 8388608 float none 0 130.4 257.30 257.30 0 130.1 257.98 257.98 0
67108864 16777216 float none 0 225.7 297.33 297.33 0 226.5 296.35 296.35 0
134217728 33554432 float none 0 411.5 326.17 326.17 0 412.5 325.41 325.41 0
268435456 67108864 float none 0 780.8 343.80 343.80 0 783.4 342.64 342.64 0
536870912 134217728 float none 0 1508.3 355.95 355.95 0 1513.4 354.74 354.74 0
1073741824 268435456 float none 0 2975.8 360.82 360.82 0 2977.4 360.64 360.64 0
2147483648 536870912 float none 0 5896.4 364.20 364.20 0 5911.1 363.29 363.29 0
# Out of bounds values : 0 OK
# Avg bus bandwidth : 175.205
#
root@nccl-local-test:/workspace# exit
exit