add mixtral and 6 gpus cifar to tinybox ci (#3676)

* add mixtral and 6 gpus cifar to tinybox ci

* print total ram used at the end of loading
Author: chenyu
Date: 2024-03-10 18:25:31 -04:00
Committed by: GitHub
Parent: 44a67bf783
Commit: bad6adaf8c
2 changed files with 11 additions and 6 deletions

.github/workflows/benchmark.yml

@@ -139,6 +139,7 @@ jobs:
ln -s ~/tinygrad/weights/bpe_simple_vocab_16e6.txt.gz weights/bpe_simple_vocab_16e6.txt.gz
ln -s ~/tinygrad/weights/LLaMA weights/LLaMA
ln -s ~/tinygrad/extra/datasets/cifar-10-python.tar.gz extra/datasets/cifar-10-python.tar.gz
+ln -s /raid/weights/mixtral-8x7b-32kseqlen weights/mixtral-8x7b-32kseqlen
- name: Run model inference benchmark
run: LD_PRELOAD="/opt/rocm/lib/libhsa-runtime64.so" HSA=1 NOCLANG=1 python3 test/external/external_model_benchmark.py
- name: Test speed vs torch
@@ -153,6 +154,8 @@ jobs:
run: |
HSA=1 JIT=0 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt
HSA=1 JIT=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_jitted.txt
+- name: Run Mixtral 8x7B
+run: time HSA=1 python3 examples/mixtral.py --temperature 0 --count 10 --timing | tee mixtral.txt
- name: Run GPT2
run: |
HSA=1 JIT=0 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt
@@ -161,11 +164,10 @@ jobs:
run: HSA=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar.txt
- name: Run 10 CIFAR training steps w HALF
run: HSA=1 STEPS=10 HALF=1 python3 examples/hlb_cifar10.py | tee train_cifar_half.txt
-# # TODO: enable this. it took 3 minutes in CI and made the full training one more than 5 minutes
-# - name: Run 10 CIFAR training steps w 6 GPUS
-#   run: time HALF=1 STEPS=10 BS=1536 GPUS=6 python3 examples/hlb_cifar10.py
-- name: Run full CIFAR training
+- name: Run full CIFAR training w 1 GPU
run: time HSA=1 HALF=1 LATEWINO=1 STEPS=1000 TARGET_EVAL_ACC_PCT=93 python3 examples/hlb_cifar10.py | tee train_cifar_one_gpu.txt
+- name: Run full CIFAR training steps w 6 GPUS
+run: time HSA=1 HALF=1 STEPS=350 BS=1536 GPUS=6 TARGET_EVAL_ACC_PCT=93 python3 examples/hlb_cifar10.py | tee train_cifar_six_gpu.txt
- uses: actions/upload-artifact@v4
with:
name: Speed (AMD)
@@ -176,9 +178,11 @@ jobs:
train_cifar_half.txt
train_cifar_wino.txt
train_cifar_one_gpu.txt
+train_cifar_six_gpu.txt
llama_unjitted.txt
llama_jitted.txt
gpt2_unjitted.txt
gpt2_jitted.txt
matmul.txt
sd.txt
+mixtral.txt
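
Note on the new six-GPU step: GPUS=6 with BS=1536 runs hlb_cifar10.py data-parallel across the tinybox's six GPUs, so each device sees 256 samples per step. Below is a minimal sketch of how a batch can be split that way with tinygrad's Tensor.shard API; the device names, shapes, and the parameter-replication comment are illustrative assumptions, not the hlb_cifar10.py code itself:

from tinygrad import Tensor, Device

# six devices on one box, e.g. "HSA:0" .. "HSA:5" when running with HSA=1 (assumed naming)
GPUS = tuple(f"{Device.DEFAULT}:{i}" for i in range(6))

# a BS=1536 CIFAR batch split along the batch axis -> 256 samples per device
X = Tensor.rand(1536, 3, 32, 32).shard(GPUS, axis=0)
print(X.shape, X.device)

# model parameters would instead be replicated on every device (no split axis), roughly:
# for p in nn.state.get_parameters(model): p.shard_(GPUS, axis=None)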

examples/mixtral.py

@@ -1,7 +1,7 @@
import functools, argparse, pathlib
from tqdm import tqdm
from tinygrad import Tensor, nn, Device, GlobalCounters, Variable
-from tinygrad.helpers import Timing, Profiling
+from tinygrad.helpers import Timing, Profiling, CI
from tinygrad.nn.state import torch_load, get_state_dict
from extra.models.llama import FeedForward, Transformer
@@ -34,7 +34,7 @@ if __name__ == "__main__":
model = Transformer(n_layers=32, dim=4096, hidden_dim=14336, n_heads=32, n_kv_heads=8, norm_eps=1e-5, vocab_size=32000, feed_forward=functools.partial(MixtureFeedForward, 8), jit=False)
model_state_dict = get_state_dict(model)
-for k in (t := tqdm(state)):
+for k in (t := tqdm(state, disable=CI)):
if 'feed_forward.experts.' in k:
expert_no = int(k.split('feed_forward.experts.')[1].split('.')[0])
device = Device.DEFAULT + ":" + str((expert_no//2)+1)
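
The device string computed above packs Mixtral's 8 experts two per GPU: (expert_no//2)+1 sends experts 0-1 to device :1, 2-3 to :2, 4-5 to :3, and 6-7 to :4, while the non-expert weights presumably stay on the default device. A tiny illustration of the mapping, assuming HSA is the default backend on the tinybox:

from tinygrad import Device

for expert_no in range(8):
  # same formula as the diff above; with HSA=1 this prints HSA:1, HSA:1, HSA:2, ... HSA:4
  print(f"expert {expert_no} -> {Device.DEFAULT + ':' + str((expert_no//2)+1)}")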
@@ -43,6 +43,7 @@ if __name__ == "__main__":
t.set_description(f"ram used: {GlobalCounters.mem_used/1e9:5.2f} GB, loading {k} to {device}")
# NOTE: we have to copy through CLANG to avoid the HIP hang bug when copying directly from the DISK
model_state_dict[k].assign(state[k].to("CLANG").contiguous().to(device).half()).realize()
+if CI: print(f"ram used: {GlobalCounters.mem_used/1e9:5.2f} GB")
from sentencepiece import SentencePieceProcessor
spp = SentencePieceProcessor(model_file=args.weights + "/tokenizer.model")
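
The assign line in the hunk above is the workaround the NOTE describes: each weight is staged through the CLANG (CPU) backend and only then copied to its target GPU, rather than copying DISK -> device directly. A generic sketch of that staging pattern plus the ram-used report; the checkpoint filename and the target device are placeholder assumptions, not the exact mixtral.py flow:

from tinygrad import Device, GlobalCounters
from tinygrad.nn.state import torch_load

# placeholder checkpoint path under the weights dir symlinked in the CI change above
state = torch_load("/raid/weights/mixtral-8x7b-32kseqlen/consolidated.00.pth")
k, disk_t = next(iter(state.items()))                       # a lazy, DISK-backed tensor
staged = disk_t.to("CLANG").contiguous()                    # realize on the CPU backend first
moved = staged.to(Device.DEFAULT + ":1").half().realize()   # then copy to a GPU and cast to fp16
print(f"{k}: ram used {GlobalCounters.mem_used/1e9:5.2f} GB")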