mirror of https://github.com/tinygrad/tinygrad.git (synced 2026-01-09 15:08:02 -05:00)

training cifar with BF16 on CUDA (#3905)

* training cifar with BF16 on CUDA: memory usage is between float and half due to numpy calls in dataset preprocessing, which convert to float.
* simpler bf16 functions
* bf16 cifar works for HSA too, just very slow
* simpler bf16 functions, we love cuda
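
The memory observation in the first bullet can be illustrated with a minimal sketch (hypothetical preprocessing, not the actual hlb_cifar10 code): batches staged through numpy stay float32 on the host, so switching the training dtype to bfloat16 only shrinks the device-side tensors.

import numpy as np
from tinygrad import Tensor, dtypes

batch = np.random.rand(512, 3, 32, 32).astype(np.float32)  # numpy preprocessing keeps the host copy in float32
x = Tensor(batch).cast(dtypes.bfloat16)                     # only the device tensor is bfloat16 (2 bytes/elem)
print(batch.nbytes, x.dtype)                                # 4 bytes/elem on the host, bfloat16 on the device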

.github/workflows/benchmark.yml (vendored, 12 additions)
@@ -114,6 +114,12 @@ jobs:
       run: CUDA=1 JIT=1 HALF=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt
     - name: Run GPT2 w HALF/BEAM
       run: CUDA=1 JIT=1 HALF=1 BEAM=2 CACHELEVEL=0 CAST_BEFORE_VIEW=0 JIT_BATCH_SIZE=4 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
+    - name: Run 10 CIFAR training steps
+      run: CUDA=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar.txt
+    - name: Run 10 CIFAR training steps w HALF
+      run: CUDA=1 STEPS=10 HALF=1 python3 examples/hlb_cifar10.py | tee train_cifar_half.txt
+    - name: Run 10 CIFAR training steps w BF16
+      run: CUDA=1 STEPS=10 BF16=1 python3 examples/hlb_cifar10.py | tee train_cifar_bf16.txt
     - name: Run full CIFAR training
       run: time CUDA=1 HALF=1 LATEWINO=1 STEPS=1000 TARGET_EVAL_ACC_PCT=93.3 python3 examples/hlb_cifar10.py | tee train_cifar_one_gpu.txt
     - uses: actions/upload-artifact@v4
@@ -130,6 +136,9 @@ jobs:
           gpt2_jitted.txt
           gpt2_half.txt
           gpt2_half_beam.txt
+          train_cifar.txt
+          train_cifar_half.txt
+          train_cifar_bf16.txt
           train_cifar_one_gpu.txt
 
   testamdbenchmark:
@@ -228,6 +237,8 @@ jobs:
       run: HSA=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar.txt
     - name: Run 10 CIFAR training steps w HALF
       run: HSA=1 STEPS=10 HALF=1 python3 examples/hlb_cifar10.py | tee train_cifar_half.txt
+    - name: Run 10 CIFAR training steps w BF16
+      run: HSA=1 STEPS=10 BF16=1 python3 examples/hlb_cifar10.py | tee train_cifar_bf16.txt
     - name: Run full CIFAR training w 1 GPU
       run: time HSA=1 HALF=1 LATEWINO=1 STEPS=1000 TARGET_EVAL_ACC_PCT=93.3 python3 examples/hlb_cifar10.py | tee train_cifar_one_gpu.txt
     - name: Run full CIFAR training steps w 6 GPUS
@@ -244,6 +255,7 @@ jobs:
         path: |
           train_cifar.txt
           train_cifar_half.txt
+          train_cifar_bf16.txt
           train_cifar_wino.txt
           train_cifar_one_gpu.txt
           train_resnet.txt

examples/hlb_cifar10.py
@@ -22,6 +22,8 @@ assert EVAL_BS % len(GPUS) == 0, f"{EVAL_BS=} is not a multiple of {len(GPUS)=},
 
 if getenv("HALF"):
   dtypes.default_float = dtypes.float16
+elif getenv("BF16"):
+  dtypes.default_float = dtypes.bfloat16
 else:
   dtypes.default_float = dtypes.float32
 
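
A minimal sketch of what the env-var switch above does, assuming the usual tinygrad imports (this snippet is not part of the commit): tensors created without an explicit dtype pick up dtypes.default_float.

from tinygrad import Tensor, dtypes
from tinygrad.helpers import getenv

if getenv("HALF"): dtypes.default_float = dtypes.float16
elif getenv("BF16"): dtypes.default_float = dtypes.bfloat16
else: dtypes.default_float = dtypes.float32

x = Tensor.ones(4, 4)   # no dtype given, so the tensor uses dtypes.default_float
print(x.dtype)          # dtypes.bfloat16 when run with BF16=1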

@@ -200,8 +202,8 @@ def train_cifar():
     BS, _, H, W = shape
     low_x = Tensor.randint(BS, low=0, high=W-mask_size).reshape(BS,1,1,1)
     low_y = Tensor.randint(BS, low=0, high=H-mask_size).reshape(BS,1,1,1)
-    idx_x = Tensor.arange(W).reshape((1,1,1,W))
-    idx_y = Tensor.arange(H).reshape((1,1,H,1))
+    idx_x = Tensor.arange(W, dtype=dtypes.int32).reshape((1,1,1,W))
+    idx_y = Tensor.arange(H, dtype=dtypes.int32).reshape((1,1,H,1))
     return (idx_x >= low_x) * (idx_x < (low_x + mask_size)) * (idx_y >= low_y) * (idx_y < (low_y + mask_size))
 
   def random_crop(X:Tensor, crop_size=32):
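
For reference, a self-contained sketch of what this mask construction computes, mirroring the new lines above (make_mask is just a name for the illustration): for each batch element, a boolean HxW grid that is True inside a random mask_size x mask_size square.

from tinygrad import Tensor, dtypes

def make_mask(shape, mask_size=3):
  BS, _, H, W = shape
  low_x = Tensor.randint(BS, low=0, high=W-mask_size).reshape(BS,1,1,1)
  low_y = Tensor.randint(BS, low=0, high=H-mask_size).reshape(BS,1,1,1)
  idx_x = Tensor.arange(W, dtype=dtypes.int32).reshape((1,1,1,W))
  idx_y = Tensor.arange(H, dtype=dtypes.int32).reshape((1,1,H,1))
  return (idx_x >= low_x) * (idx_x < (low_x + mask_size)) * (idx_y >= low_y) * (idx_y < (low_y + mask_size))

print(make_mask((2, 3, 8, 8)).numpy()[0, 0].astype(int))  # one 3x3 block of ones per sample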

tinygrad/renderer/cstyle.py
@@ -216,13 +216,11 @@ class MetalLanguage(CStyleLanguage):
     return super().render_kernel(function_name, kernel, bufs, uops, prefix)
 MetalRenderer = functools.partial(uops_to_cstyle, MetalLanguage())
 
-code_for_op_half = {
-  BinaryOps.MAX: lambda a,b,dtype: f"max({a},{b})" if dtype != dtypes.half else f"__hmax({a},{b})",
-  UnaryOps.SQRT: lambda x,dtype: f"sqrt({x})" if dtype != dtypes.half else f"hsqrt({x})",
-  UnaryOps.SIN: lambda x,dtype: f"sin({x})" if dtype != dtypes.half else f"hsin({x})",
-  UnaryOps.LOG2: lambda x,dtype: f"log2({x})" if dtype != dtypes.half else f"hlog2({x})",
-  UnaryOps.EXP2: lambda x,dtype: f"exp2({x})" if dtype != dtypes.half else f"hexp2({x})",
-}
+code_for_op_half = {BinaryOps.MAX: lambda a,b,dtype: f"__hmax({a},{b})" if dtype in (dtypes.half, dtypes.bfloat16) else f"max({a},{b})",
+  UnaryOps.SQRT: lambda x,dtype: f"hsqrt({x})" if dtype in (dtypes.half, dtypes.bfloat16) else f"sqrt({x})",
+  UnaryOps.SIN: lambda x,dtype: f"hsin({x})" if dtype in (dtypes.half, dtypes.bfloat16) else f"sin({x})",
+  UnaryOps.LOG2: lambda x,dtype: f"hlog2({x})" if dtype in (dtypes.half, dtypes.bfloat16) else f"log2({x})",
+  UnaryOps.EXP2: lambda x,dtype: f"hexp2({x})" if dtype in (dtypes.half, dtypes.bfloat16) else f"exp2({x})",}
 
 class CUDALanguage(CStyleLanguage):
   kernel_prefix = "extern \"C\" __global__ "
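
A standalone sketch of the dispatch rule the rewritten table encodes, with plain strings standing in for the dtype objects (this is not the tinygrad code itself): half and bfloat16 both get the CUDA half-precision intrinsic, everything else falls back to the plain math function.

half, bfloat16, float32 = "half", "bfloat16", "float32"   # stand-ins for dtypes.*

render_sqrt = lambda x, dtype: f"hsqrt({x})" if dtype in (half, bfloat16) else f"sqrt({x})"
render_max  = lambda a, b, dtype: f"__hmax({a},{b})" if dtype in (half, bfloat16) else f"max({a},{b})"

print(render_sqrt("val0", bfloat16))        # hsqrt(val0)
print(render_max("val0", "val1", float32))  # max(val0,val1)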

tinygrad/tensor.py
@@ -232,7 +232,7 @@ class Tensor:
     if not THREEFRY.value:
       if dtype == dtypes.bfloat16:
         return Tensor.rand(*shape, **kwargs, device=device, dtype=dtypes.float).cast(dtypes.bfloat16)
-      return Tensor._loadop(LoadOps.CUSTOM, argfix(*shape), arg=custom_random, device=device, dtype=dtype, **kwargs)
+      return Tensor._loadop(LoadOps.CUSTOM, argfix(*shape), arg=custom_random, device=device, dtype=dtype or dtypes.float32, **kwargs)
 
     # threefry
     if (num := prod((shape:=argfix(*shape)))) == 0: return Tensor.zeros(shape, device=device, dtype=dtype, **kwargs)
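
A short usage sketch for the branch this hunk touches, assuming THREEFRY is disabled so the pre-threefry path runs: a bfloat16 random tensor is generated as float32 and then cast.

from tinygrad import Tensor, dtypes

x = Tensor.rand(2, 3, dtype=dtypes.bfloat16)  # generated as float32, then cast to bfloat16
print(x.dtype)              # dtypes.bfloat16
print(x.float().numpy())    # cast back up to float32 to inspect the values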