Validate the node-level NCCL test with 8 GPUs

Create a YAML file named nccl-local.yaml with the following content:

apiVersion: v1
kind: Pod
metadata:
  name: nccl-local-test
spec:
  restartPolicy: Never
  containers:
    - name: nvidia-smi
      image: docker.io/deepops/nccl-tests:2312
      command: ["/bin/bash", "-c", "nvidia-smi && tail -f /dev/null"]
      resources:
        limits:
          nvidia.com/gpu: 8  # Request 8 GPUs
  nodeSelector:
    nvidia.com/gpu.present: "true"  # Ensure it runs on a node with a GPU

Create the pod, which starts the container, with:

kubectl apply -f nccl-local.yaml

Verify that the pod is running (READY should show 1/1 and STATUS should show Running):

k8suser@bcm10-headnode1:~$ kubectl get pods
NAME              READY   STATUS    RESTARTS   AGE
nccl-local-test   1/1     Running   0          17s

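Log in to the container (for example, with `kubectl exec -it nccl-local-test -- /bin/bash`) and run the NCCL broadcast performance test (broadcast_perf) on 8 GPUs: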

Reference: NCCL primitives

Reference: NCCL Tests

root@nccl-local-test:/workspace# broadcast_perf -b 64k -e 2G  -f 2  -g 8
# nThread 1 nGpus 8 minBytes 65536 maxBytes 2147483648 step: 2(factor) warmup iters: 5 iters: 20 agg iters: 1 validation: 1 graph: 0
#
# Using devices
#  Rank  0 Group  0 Pid    323 on nccl-local-test device  0 [0x1b] NVIDIA H100 80GB HBM3
#  Rank  1 Group  0 Pid    323 on nccl-local-test device  1 [0x43] NVIDIA H100 80GB HBM3
#  Rank  2 Group  0 Pid    323 on nccl-local-test device  2 [0x52] NVIDIA H100 80GB HBM3
#  Rank  3 Group  0 Pid    323 on nccl-local-test device  3 [0x61] NVIDIA H100 80GB HBM3
#  Rank  4 Group  0 Pid    323 on nccl-local-test device  4 [0x9d] NVIDIA H100 80GB HBM3
#  Rank  5 Group  0 Pid    323 on nccl-local-test device  5 [0xc3] NVIDIA H100 80GB HBM3
#  Rank  6 Group  0 Pid    323 on nccl-local-test device  6 [0xd1] NVIDIA H100 80GB HBM3
#  Rank  7 Group  0 Pid    323 on nccl-local-test device  7 [0xdf] NVIDIA H100 80GB HBM3
#
#                                                              out-of-place                       in-place
#       size         count      type   redop    root     time   algbw   busbw #wrong     time   algbw   busbw #wrong
#        (B)    (elements)                               (us)  (GB/s)  (GB/s)            (us)  (GB/s)  (GB/s)
       65536         16384     float    none       0    28.77    2.28    2.28      0    27.55    2.38    2.38      0
      131072         32768     float    none       0    30.42    4.31    4.31      0    30.28    4.33    4.33      0
      262144         65536     float    none       0    35.31    7.42    7.42      0    33.20    7.90    7.90      0
      524288        131072     float    none       0    42.05   12.47   12.47      0    43.85   11.96   11.96      0
     1048576        262144     float    none       0    53.98   19.42   19.42      0    52.38   20.02   20.02      0
     2097152        524288     float    none       0    54.29   38.63   38.63      0    54.07   38.79   38.79      0
     4194304       1048576     float    none       0    56.67   74.01   74.01      0    55.61   75.43   75.43      0
     8388608       2097152     float    none       0    60.71  138.17  138.17      0    62.26  134.74  134.74      0
    16777216       4194304     float    none       0    82.29  203.88  203.88      0    82.32  203.80  203.80      0
    33554432       8388608     float    none       0    130.4  257.30  257.30      0    130.1  257.98  257.98      0
    67108864      16777216     float    none       0    225.7  297.33  297.33      0    226.5  296.35  296.35      0
   134217728      33554432     float    none       0    411.5  326.17  326.17      0    412.5  325.41  325.41      0
   268435456      67108864     float    none       0    780.8  343.80  343.80      0    783.4  342.64  342.64      0
   536870912     134217728     float    none       0   1508.3  355.95  355.95      0   1513.4  354.74  354.74      0
  1073741824     268435456     float    none       0   2975.8  360.82  360.82      0   2977.4  360.64  360.64      0
  2147483648     536870912     float    none       0   5896.4  364.20  364.20      0   5911.1  363.29  363.29      0
# Out of bounds values : 0 OK
# Avg bus bandwidth    : 175.205
#
root@nccl-local-test:/workspace# exit
exit