lower hlb_cifar acc to 93.3 (#3865)

Ran 30 runs and the lowest accuracy I saw was 93.35%. Lowered the target to 93.3% for now. Maybe re-enable EMA later if it reduces variance.
2026-01-09 06:58:11 -05:00 · 2024-03-21 17:58:53 -04:00
parent e50b7abe4f
commit bc482729d0
1 changed file with 3 additions and 3 deletions
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -115,7 +115,7 @@ jobs:
    - name: Run GPT2 w HALF/BEAM
      run: CUDA=1 JIT=1 HALF=1 BEAM=2 CACHELEVEL=0 CAST_BEFORE_VIEW=0 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
    - name: Run full CIFAR training
-      run: time CUDA=1 HALF=1 LATEWINO=1 STEPS=1000 TARGET_EVAL_ACC_PCT=93.5 python3 examples/hlb_cifar10.py | tee train_cifar_one_gpu.txt
+      run: time CUDA=1 HALF=1 LATEWINO=1 STEPS=1000 TARGET_EVAL_ACC_PCT=93.3 python3 examples/hlb_cifar10.py | tee train_cifar_one_gpu.txt
    - uses: actions/upload-artifact@v4
      with:
        name: Speed (NVIDIA)
@@ -229,9 +229,9 @@ jobs:
    - name: Run 10 CIFAR training steps w HALF
      run: HSA=1 STEPS=10 HALF=1 python3 examples/hlb_cifar10.py | tee train_cifar_half.txt
    - name: Run full CIFAR training w 1 GPU
-      run: time HSA=1 HALF=1 LATEWINO=1 STEPS=1000 TARGET_EVAL_ACC_PCT=93.5 python3 examples/hlb_cifar10.py | tee train_cifar_one_gpu.txt
+      run: time HSA=1 HALF=1 LATEWINO=1 STEPS=1000 TARGET_EVAL_ACC_PCT=93.3 python3 examples/hlb_cifar10.py | tee train_cifar_one_gpu.txt
    - name: Run full CIFAR training steps w 6 GPUS
-      run: time HSA=1 HALF=1 STEPS=350 BS=1536 GPUS=6 TARGET_EVAL_ACC_PCT=93.5 python3 examples/hlb_cifar10.py | tee train_cifar_six_gpu.txt
+      run: time HSA=1 HALF=1 STEPS=350 BS=1536 GPUS=6 TARGET_EVAL_ACC_PCT=93.3 python3 examples/hlb_cifar10.py | tee train_cifar_six_gpu.txt
    - name: Run MLPerf resnet eval on training data
      run: time HSA=1 MODEL=resnet python3 examples/mlperf/model_eval.py
    - name: Run 10 MLPerf ResNet50 training steps (1 gpu)