Skip to content

Commit

Permalink
Add L3 short test for RN50 training (NVIDIA#4614)
Browse files Browse the repository at this point in the history
* Add L3 short test for RN50 training

Signed-off-by: Janusz Lisiecki <[email protected]>
  • Loading branch information
JanuszL authored Jul 14, 2023
1 parent f905301 commit 270c141
Show file tree
Hide file tree
Showing 5 changed files with 233 additions and 7 deletions.
12 changes: 5 additions & 7 deletions qa/TL3_RN50_convergence/test_mxnet.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,13 @@ python /opt/mxnet/example/image-classification/train_imagenet_runner \
--data-root=/data/imagenet/train-val-recordio-passthrough/ -b 144 \
-n $NUM_GPUS --seed 42 2>&1 | tee dali.log

cat dali.log | grep -o "Validation-accuracy=0\.[0-9]*" > tmp2.log
cat dali.log | grep -o "Speed: [0-9]*\.[0-9]*" > tmp3.log
cat tmp2.log | grep -o "0\.[0-9]*" > dali.log
cat tmp3.log | grep -o "[0-9]*\.[0-9]*" > tmp2.log
cat dali.log | grep -o "Validation-accuracy=0\.[0-9]*" | grep -o "0\.[0-9]*" > acc.log
cat dali.log | grep -o "Speed: [0-9]*\.[0-9]*" | grep -o "[0-9]*\.[0-9]*" > speed.log

best=`awk 'BEGIN { max = -inf } { if ($1 > max) { max = $1 } } END { print max }' dali.log`
mean=`awk 'BEGIN { sum = 0; n = 0 } { sum += $1; n += 1 } END { print sum / n }' tmp2.log`
best=`awk 'BEGIN { max = -inf } { if ($1 > max) { max = $1 } } END { print max }' acc.log`
mean=`awk 'BEGIN { sum = 0; n = 0 } { sum += $1; n += 1 } END { print sum / n }' speed.log`

rm tmp2.log tmp3.log
rm -rf acc.log speed.log

if [[ `echo "$best $threshold" | awk '{ print ($1 >= $2) ? "1" : "0" }'` -eq "0" ]]; then
echo "acc = $best; TEST FAILED"
Expand Down
30 changes: 30 additions & 0 deletions qa/TL3_RN50_short/test_mxnet.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/bin/bash -e
# Short ResNet-50 training test for the MXNet example.
#
# Runs train_imagenet_runner for 5 epochs, then checks the best
# "Validation-accuracy" value and the mean "Speed:" value found in its output
# against the thresholds below.
#
# Exit status:
#   0   - both checks passed
#   2   - the training command itself failed
#   255 - a metric is missing from the log or below its threshold
#         (255 is the status bash produced for the former "exit -1")

threshold=0.25   # minimum accepted best validation accuracy
min_perf=4000    # minimum accepted mean speed

NUM_GPUS=$(nvidia-smi -L | wc -l)

python /opt/mxnet/example/image-classification/train_imagenet_runner \
  --data-root=/data/imagenet/train-val-recordio-passthrough/ -b 408 \
  -n "$NUM_GPUS" -e 5 --seed 42 --dali-threads 8 2>&1 | tee dali.log

# The pipeline's own status is tee's, so the trainer's status has to be read
# from PIPESTATUS right away; otherwise a crashed run is only noticed
# indirectly (or not at all) through the metric checks.
train_status=${PIPESTATUS[0]}
if [[ $train_status -ne 0 ]]; then
  echo "Error in training script."
  exit 2
fi

# awk is the last stage of each pipeline, so a grep that matches nothing does
# not abort the script under -e; it just yields an empty result that is
# reported below. "if (NR)" also avoids dividing by zero on empty input.
best=$(grep -o "Validation-accuracy=0\.[0-9]*" dali.log \
  | grep -o "0\.[0-9]*" \
  | awk 'NR == 1 || $1 + 0 > max + 0 { max = $1 } END { if (NR) print max }')
mean=$(grep -o "Speed: [0-9]*\.[0-9]*" dali.log \
  | grep -o "[0-9]*\.[0-9]*" \
  | awk '{ sum += $1 } END { if (NR) print sum / NR }')

if [[ -z "$best" ]]; then
  echo "No validation accuracy found in dali.log; TEST FAILED"
  exit 255
fi

if [[ -z "$mean" ]]; then
  echo "No speed measurements found in dali.log; TEST FAILED"
  exit 255
fi

# Floating-point comparisons are delegated to awk; it exits 0 when the
# measured value reaches the required minimum.
if ! awk -v val="$best" -v min="$threshold" 'BEGIN { exit !(val + 0 >= min + 0) }'; then
  echo "acc = $best; TEST FAILED"
  exit 255
fi

if ! awk -v val="$mean" -v min="$min_perf" 'BEGIN { exit !(val + 0 >= min + 0) }'; then
  echo "perf = $mean; TEST FAILED"
  exit 255
fi

echo "DONE! best accuracy = $best; mean speed = $mean samples/sec"
66 changes: 66 additions & 0 deletions qa/TL3_RN50_short/test_paddle.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#!/bin/bash -e

# Strict mode: fail on unset variables (-u), on any failing command (-e), and
# report a pipeline as failed when any of its stages fails (pipefail).
set -euo pipefail

# Terminate the script with the exit status passed as the first argument.
CLEAN_AND_EXIT() {
  exit $1
}

# Install paddlepaddle-gpu, train the ResNet50 example under DALI's docs, and
# check the "##Top-1", "##Top-5" and "##Perf" values it logs against minimums.
#
# Exit status (via CLEAN_AND_EXIT):
#   0 - all three checks passed
#   2 - the training command failed
#   3 - a metric line is missing from the log
#   4 - at least one metric is below its minimum

# The sed expression reduces nvcc's "release X.Y" to the digits "XY".
# The inner $(...) is deliberately unquoted: echo then flattens nvcc's
# multi-line output into a single line for sed.
USE_CUDA_VERSION=$(echo $(nvcc --version) | sed 's/.*\(release \)\([0-9]\+\)\.\([0-9]\+\).*/\2\3/')
export USE_CUDA_VERSION
# Unquoted on purpose: the helper's output is split into pip's arguments.
pip install $(python /opt/dali/qa/setup_packages.py -i 0 -u paddlepaddle-gpu --cuda "${USE_CUDA_VERSION}")

cd /opt/dali/docs/examples/use_cases/paddle/resnet50

# One GPU index per line, taken from the "GPU <n>: ..." lines of nvidia-smi.
GPUS=$(nvidia-smi -L | sed "s/GPU \([0-9]*\):.*/\1/g")
# Unquoted $GPUS collapses the newlines to spaces, which tr turns into commas.
GPU_LIST=$(echo $GPUS | tr ' ' ',')

# Link the dataset directories into the example directory if not yet present.
[[ -d "val" ]] || ln -sf /data/imagenet/val-jpeg/ val
[[ -d "train" ]] || ln -sf /data/imagenet/train-jpeg/ train

LOG=dali.log

SECONDS=0
EPOCHS=25 # limiting to 25 epochs to save time
export FLAGS_fraction_of_gpu_memory_to_use=.80

# This script enables errexit and pipefail, so a failing trainer would end the
# script on the pipeline itself and the status check below would never run.
# Suspend errexit just long enough to capture the trainer's own status.
set +o errexit
python -m paddle.distributed.launch --selected_gpus "$GPU_LIST" \
  main.py -b 96 -j 4 --lr=0.3 --epochs "${EPOCHS}" ./ 2>&1 | tee "$LOG"
RET=${PIPESTATUS[0]}
set -o errexit

echo "Training ran in $SECONDS seconds"
if [[ $RET -ne 0 ]]; then
  echo "Error in training script."
  CLEAN_AND_EXIT 2
fi

MIN_TOP1=45.0 # would be 75.0 if we run 90 epochs
MIN_TOP5=70.0 # would be 92.0 if we run 90 epochs
MIN_PERF=2000

# A single awk does the match and the field extraction. Unlike "grep | awk"
# under pipefail, it exits 0 when nothing matches, so a missing metric reaches
# the "Incomplete output." check instead of aborting the script here.
TOP1=$(awk '/^##Top-1/ {print $2}' "$LOG")
TOP5=$(awk '/^##Top-5/ {print $2}' "$LOG")
PERF=$(awk '/^##Perf/ {print $2}' "$LOG")

# PERF is checked as well: an empty value would otherwise compare as "OK" in
# awk and then break the printf below.
if [[ -z "$TOP1" || -z "$TOP5" || -z "$PERF" ]]; then
  echo "Incomplete output."
  CLEAN_AND_EXIT 3
fi

# Floating-point comparisons are delegated to awk.
TOP1_RESULT=$(echo "$TOP1 $MIN_TOP1" | awk '{if ($1>=$2) {print "OK"} else { print "FAIL" }}')
TOP5_RESULT=$(echo "$TOP5 $MIN_TOP5" | awk '{if ($1>=$2) {print "OK"} else { print "FAIL" }}')
PERF_RESULT=$(echo "$PERF $MIN_PERF" | awk '{if ($1>=$2) {print "OK"} else { print "FAIL" }}')

echo
printf "TOP-1 Accuracy: %.2f%% (expect at least %f%%) %s\n" "$TOP1" "$MIN_TOP1" "$TOP1_RESULT"
printf "TOP-5 Accuracy: %.2f%% (expect at least %f%%) %s\n" "$TOP5" "$MIN_TOP5" "$TOP5_RESULT"
printf "Average perf: %.2f (expect at least %f) samples/sec %s\n" "$PERF" "$MIN_PERF" "$PERF_RESULT"

if [[ "$TOP1_RESULT" == "OK" && "$TOP5_RESULT" == "OK" && "$PERF_RESULT" == "OK" ]]; then
  CLEAN_AND_EXIT 0
fi

CLEAN_AND_EXIT 4
60 changes: 60 additions & 0 deletions qa/TL3_RN50_short/test_pytorch.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#!/bin/bash -e

# Strict mode: fail on unset variables (-u), on any failing command (-e), and
# report a pipeline as failed when any of its stages fails (pipefail).
set -euo pipefail

# Terminate the script with the exit status passed as the first argument.
CLEAN_AND_EXIT() {
  exit $1
}

# Train the ResNet50 example under DALI's docs for 5 epochs with torchrun and
# check the "##Top-1", "##Top-5" and "##Perf" values it logs against minimums.
#
# Exit status (via CLEAN_AND_EXIT):
#   0 - all three checks passed
#   2 - the training command failed
#   3 - a metric line is missing from the log
#   4 - at least one metric is below its minimum

cd /opt/dali/docs/examples/use_cases/pytorch/resnet50

NUM_GPUS=$(nvidia-smi -L | wc -l)

# Link the dataset directories into the example directory if not yet present.
[[ -d "val" ]] || ln -sf /data/imagenet/val-jpeg/ val
[[ -d "train" ]] || ln -sf /data/imagenet/train-jpeg/ train

LOG=dali.log

SECONDS=0
# This script enables errexit and pipefail, so a failing trainer would end the
# script on the pipeline itself and the status check below would never run.
# Suspend errexit just long enough to capture the trainer's own status.
set +o errexit
torchrun --nproc_per_node="${NUM_GPUS}" main.py -a resnet50 --b 256 --loss-scale 128.0 --workers 8 --lr=0.4 --fp16-mode --epochs 5 ./ 2>&1 | tee "$LOG"
RET=${PIPESTATUS[0]}
set -o errexit

echo "Training ran in $SECONDS seconds"
if [[ $RET -ne 0 ]]; then
  echo "Error in training script."
  CLEAN_AND_EXIT 2
fi

MIN_TOP1=20.0
MIN_TOP5=40.0
MIN_PERF=400

# A single awk does the match and the field extraction. Unlike "grep | awk"
# under pipefail, it exits 0 when nothing matches, so a missing metric reaches
# the "Incomplete output." check instead of aborting the script here.
TOP1=$(awk '/^##Top-1/ {print $2}' "$LOG")
TOP5=$(awk '/^##Top-5/ {print $2}' "$LOG")
PERF=$(awk '/^##Perf/ {print $2}' "$LOG")

# PERF is checked as well: an empty value would otherwise compare as "OK" in
# awk and then break the printf below.
if [[ -z "$TOP1" || -z "$TOP5" || -z "$PERF" ]]; then
  echo "Incomplete output."
  CLEAN_AND_EXIT 3
fi

# Floating-point comparisons are delegated to awk.
TOP1_RESULT=$(echo "$TOP1 $MIN_TOP1" | awk '{if ($1>=$2) {print "OK"} else { print "FAIL" }}')
TOP5_RESULT=$(echo "$TOP5 $MIN_TOP5" | awk '{if ($1>=$2) {print "OK"} else { print "FAIL" }}')
PERF_RESULT=$(echo "$PERF $MIN_PERF" | awk '{if ($1>=$2) {print "OK"} else { print "FAIL" }}')

echo
printf "TOP-1 Accuracy: %.2f%% (expect at least %f%%) %s\n" "$TOP1" "$MIN_TOP1" "$TOP1_RESULT"
printf "TOP-5 Accuracy: %.2f%% (expect at least %f%%) %s\n" "$TOP5" "$MIN_TOP5" "$TOP5_RESULT"
printf "Average perf: %.2f (expect at least %f) samples/sec %s\n" "$PERF" "$MIN_PERF" "$PERF_RESULT"

if [[ "$TOP1_RESULT" == "OK" && "$TOP5_RESULT" == "OK" && "$PERF_RESULT" == "OK" ]]; then
  CLEAN_AND_EXIT 0
fi

CLEAN_AND_EXIT 4
72 changes: 72 additions & 0 deletions qa/TL3_RN50_short/test_tensorflow.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
#!/bin/bash -e
# Setup for the short TensorFlow ResNet test: enter the example directory,
# count the GPUs and build one index file under idx-files/ for every
# "*-of-*" file in the TFRecord dataset directory.

cd /opt/dali/docs/examples/use_cases/tensorflow/resnet-n

mkdir -p idx-files/

NUM_GPUS=$(nvidia-smi -L | wc -l)

DATA_SET_DIR=/data/imagenet/train-val-tfrecord
# Glob directly instead of parsing `ls` output, so file names are never
# subjected to word splitting or a second round of globbing.
for path in "$DATA_SET_DIR"/*-of-*; do
  # An unmatched glob is left as the literal pattern; skip it.
  [[ -e "$path" ]] || continue
  file=${path##*/}
  echo "${file}"
  # Each conversion runs in the background; all of them are awaited below.
  python /opt/dali/tools/tfrecord2idx "$DATA_SET_DIR/${file}" \
    "idx-files/${file}.idx" &
done
# Barrier for the background conversions. A bare wait does not report their
# exit statuses, so a failed conversion is not detected here.
wait

# Terminate the script with the exit status passed as the first argument.
CLEAN_AND_EXIT() {
  exit $1
}

# Train resnet.py for 5 epochs under mpiexec (one process per GPU) and check
# the accuracy and speed values parsed from its log against minimums.
#
# Exit status (via CLEAN_AND_EXIT):
#   0 - all three checks passed
#   2 - the training command failed
#   3 - a metric could not be found in the log
#   4 - at least one metric is below its minimum

LOG=dali.log
OUT=${LOG%.log}.dir   # "dali.dir": passed to the trainer as --log_dir
mkdir -p "$OUT"

SECONDS=0
export TF_XLA_FLAGS="--tf_xla_enable_lazy_compilation=false"

mpiexec --allow-run-as-root --bind-to none -np "${NUM_GPUS}" \
  python -u resnet.py \
  --data_dir="$DATA_SET_DIR" --data_idx_dir=idx-files/ \
  --precision=fp16 --num_iter=5 --iter_unit=epoch --display_every=50 \
  --batch=256 --use_xla --log_dir="$OUT" --dali_threads 8 \
  --dali_mode="GPU" 2>&1 | tee "$LOG"

# Must directly follow the pipeline: PIPESTATUS[0] is mpiexec's status, while
# the pipeline's own status is tee's.
RET=${PIPESTATUS[0]}
echo "Training ran in $SECONDS seconds"
if [[ $RET -ne 0 ]]; then
  echo "Error in training script."
  CLEAN_AND_EXIT 2
fi


MIN_TOP1=0.25
MIN_TOP5=0.50
MIN_PERF=4000

# Fields 18 and 21 of the last log line containing "loss:".
TOP1=$(awk '/loss:/ { value = $18 } END { print value }' "$LOG")
TOP5=$(awk '/loss:/ { value = $21 } END { print value }' "$LOG")

# Mean of field 4 over all "global_step:" lines. Nothing is printed when no
# such line exists, which avoids a division by zero that would abort the
# script under -e before the "Incomplete output." check.
PERF=$(awk '/^global_step:/ { sum += $4; count += 1 } END { if (count) print sum / count }' "$LOG")

# PERF is checked as well: an empty value would otherwise compare as "OK" in
# awk and then break the printf below.
if [[ -z "$TOP1" || -z "$TOP5" || -z "$PERF" ]]; then
  echo "Incomplete output."
  CLEAN_AND_EXIT 3
fi

# Floating-point comparisons are delegated to awk.
TOP1_RESULT=$(echo "$TOP1 $MIN_TOP1" | awk '{if ($1>=$2) {print "OK"} else { print "FAIL" }}')
TOP5_RESULT=$(echo "$TOP5 $MIN_TOP5" | awk '{if ($1>=$2) {print "OK"} else { print "FAIL" }}')
PERF_RESULT=$(echo "$PERF $MIN_PERF" | awk '{if ($1>=$2) {print "OK"} else { print "FAIL" }}')

echo
printf "TOP-1 Accuracy: %.2f%% (expect at least %f%%) %s\n" "$TOP1" "$MIN_TOP1" "$TOP1_RESULT"
printf "TOP-5 Accuracy: %.2f%% (expect at least %f%%) %s\n" "$TOP5" "$MIN_TOP5" "$TOP5_RESULT"
printf "mean speed %.2f (expect at least %f) samples/sec %s\n" "$PERF" "$MIN_PERF" "$PERF_RESULT"

if [[ "$TOP1_RESULT" == "OK" && "$TOP5_RESULT" == "OK" && "$PERF_RESULT" == "OK" ]]; then
  CLEAN_AND_EXIT 0
fi

CLEAN_AND_EXIT 4

0 comments on commit 270c141

Please sign in to comment.