diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 603d44898f..368bd03b5b 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -57,7 +57,7 @@ jobs:
       run: JIT=1 HALF=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt
     # TODO: this is flaky
     # - name: Run GPT2 w HALF/BEAM
-    #   run: JIT=0 HALF=1 BEAM=2 CACHELEVEL=0 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
+    #   run: JIT=0 HALF=1 BEAM=2 CACHELEVEL=0 CAST_BEFORE_VIEW=0 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
     - name: Train MNIST
       run: time PYTHONPATH=. TARGET_EVAL_ACC_PCT=97.3 python3 examples/beautiful_mnist.py | tee beautiful_mnist.txt
     - name: Run 10 CIFAR training steps
@@ -142,7 +142,7 @@ jobs:
     - name: Run GPT2 w HALF
       run: CUDA=1 JIT=1 HALF=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt
     - name: Run GPT2 w HALF/BEAM
-      run: CUDA=1 JIT=1 HALF=1 BEAM=2 CACHELEVEL=0 JIT_BATCH_SIZE=4 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
+      run: CUDA=1 JIT=1 HALF=1 BEAM=2 CACHELEVEL=0 CAST_BEFORE_VIEW=0 JIT_BATCH_SIZE=4 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
     - name: Train MNIST
       run: time PYTHONPATH=. CUDA=1 TARGET_EVAL_ACC_PCT=97.3 python3 examples/beautiful_mnist.py | tee beautiful_mnist.txt
     - name: Run 10 CIFAR training steps
diff --git a/test/test_const_folding.py b/test/test_const_folding.py
index 9e8d9a0693..37682e450f 100644
--- a/test/test_const_folding.py
+++ b/test/test_const_folding.py
@@ -106,12 +106,14 @@ class TestMovedConstFolding(unittest.TestCase):
     _check_ast_count(1, Tensor([1.0, 2, 3, 4]) * Tensor.ones(2).pad(((1, 1),)))
 
   def test_cast_padded(self):
-    _check_ast_count(1, Tensor.ones(4).pad(((1, 1),)).cast(dtypes.int16))
+    # NOTE: this is folded due to CAST_BEFORE_VIEW
+    _check_ast_count(0, Tensor.ones(4).pad(((1, 1),)).cast(dtypes.int16))
     np.testing.assert_equal(Tensor.ones(4).pad(((1, 1),)).cast(dtypes.int16).numpy(), [0, 1, 1, 1, 1, 0])
+    _check_ast_count(0, Tensor.full(4, fill_value=-1).pad(((1, 1),)).cast(dtypes.uint16))
+    np.testing.assert_equal(Tensor.full(4, fill_value=-1).pad(((1, 1),)).cast(dtypes.uint16).numpy(), [0, 65535, 65535, 65535, 65535, 0])
+    # not folded
     _check_ast_count(1, Tensor.ones(4).pad(((1, 1),)).cast(dtypes.int64))
     np.testing.assert_equal(Tensor.ones(4).pad(((1, 1),)).cast(dtypes.int64).numpy(), [0, 1, 1, 1, 1, 0])
-    _check_ast_count(1, Tensor.full(4, fill_value=-1).pad(((1, 1),)).cast(dtypes.uint16))
-    np.testing.assert_equal(Tensor.full(4, fill_value=-1).pad(((1, 1),)).cast(dtypes.uint16).numpy(), [0, 65535, 65535, 65535, 65535, 0])
 
 class TestReduceOpsConstFolding(unittest.TestCase):
   def test_const_sum(self):
diff --git a/tinygrad/lazy.py b/tinygrad/lazy.py
index ce45adf453..a00f0a978e 100644
--- a/tinygrad/lazy.py
+++ b/tinygrad/lazy.py
@@ -96,6 +96,9 @@ class LazyBuffer:
     if self.device.startswith("DISK") and not bitcast: raise RuntimeError("attempted to cast disk buffer (bitcast only)")
     if self.is_unrealized_unmasked_const() and not bitcast:
       return create_lazybuffer(self.device, self.st, dtype, LoadOps.CONST, dtypes.as_const(self.base.arg, dtype))
+    # TODO: applying this makes gpt2 slower
+    if getenv("CAST_BEFORE_VIEW", 1) and dtype.itemsize <= self.dtype.itemsize and self != self.base:
+      return self.base.cast(dtype, bitcast)._view(self.st)
     new_shape = self.shape
     if bitcast and self.dtype.itemsize != dtype.itemsize:
       if not self.device.startswith("DISK"): raise RuntimeError("shape changing bitcast only supported on DISK right now")