# Build the NCCL performance tests with MPI support, then run the all-reduce
# benchmark on the hosts listed in ./hostfile.
#
# The two steps are chained with `&&` so that a failed build never falls
# through to benchmarking a stale or missing ./build/all_reduce_perf.
#
# Build settings (site-specific install prefixes; adjust for your cluster):
#   MPI=1       compile the tests with MPI support
#   MPI_HOME    Open MPI install prefix
#   CUDA_HOME   CUDA toolkit install prefix
#   NCCL_HOME   NCCL install prefix (here: the pip-installed nvidia/nccl package)
# NOTE(review): a bare `-j` places no limit on parallel make jobs; consider
# bounding it (e.g. -j"$(nproc)") if the build host is shared.
#
# Launch settings:
#   -x VAR=value            export VAR to every launched rank
#   NCCL_IB_HCA=mlx5_1:1    NCCL uses port 1 of HCA mlx5_1
#   NCCL_IB_DISABLE=0       keep the IB/RoCE transport enabled
#   NCCL_SOCKET_IFNAME=eth1 interface for NCCL socket traffic
#   NCCL_IB_GID_INDEX=3     GID index (presumably the RoCE v2 entry on these
#                           hosts -- TODO confirm with show_gids)
#   NCCL_IB_TIMEOUT=23, NCCL_IB_RETRY_CNT=7
#                           IB timeout / retry count passed through to NCCL
#   -n 16 -N 8              16 ranks in total, 8 per node (i.e. 2 nodes)
#   --mca btl tcp,self      MPI's own traffic goes over TCP (plus loopback-to-self)
#   --mca btl_tcp_if_exclude lo,mlx5_0
#                           interfaces the TCP BTL must not use
#
# Benchmark arguments:
#   -b 8 -e 1G -f 2         message sizes from 8 bytes to 1G, doubling each step
#   -g 1                    one GPU per rank
#   -c 1                    enable result checking
#   -n 100                  100 iterations per size
make -j MPI=1 \
  MPI_HOME=/data/support/soft/openmpi/4.1.4 \
  CUDA_HOME=/usr/local/cuda-11.8 \
  NCCL_HOME=/data/support/soft/anaconda3/envs/megatron/lib/python3.9/site-packages/nvidia/nccl &&
mpirun \
  -x NCCL_IB_HCA=mlx5_1:1 \
  -x NCCL_IB_DISABLE=0 \
  -x NCCL_SOCKET_IFNAME=eth1 \
  -x NCCL_IB_GID_INDEX=3 \
  -x NCCL_IB_TIMEOUT=23 \
  -x NCCL_IB_RETRY_CNT=7 \
  -hostfile hostfile \
  -n 16 -N 8 \
  --allow-run-as-root \
  --mca btl tcp,self \
  --mca btl_tcp_if_exclude lo,mlx5_0 \
  ./build/all_reduce_perf -b 8 -e 1G -f 2 -g 1 -c 1 -n 100