diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 7f163a5076..5c8b36bf6b 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -53,7 +53,7 @@ jobs:
     - name: Run GPT2 w HALF
       run: JIT=1 HALF=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt
     - name: Run GPT2 w HALF/BEAM
-      run: JIT=1 HALF=1 BEAM=2 CACHELEVEL=0 CAST_BEFORE_VIEW=0 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
+      run: JIT=1 HALF=1 BEAM=2 CACHELEVEL=0 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
     - name: Train MNIST
       run: time PYTHONPATH=. TARGET_EVAL_ACC_PCT=97.3 python3 examples/beautiful_mnist.py | tee beautiful_mnist.txt
     - name: Run 10 CIFAR training steps
@@ -129,7 +129,7 @@ jobs:
     - name: Run GPT2 w HALF
       run: CUDA=1 JIT=1 HALF=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt
     - name: Run GPT2 w HALF/BEAM
-      run: CUDA=1 JIT=1 HALF=1 BEAM=2 CACHELEVEL=0 CAST_BEFORE_VIEW=0 JIT_BATCH_SIZE=4 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
+      run: CUDA=1 JIT=1 HALF=1 BEAM=2 CACHELEVEL=0 JIT_BATCH_SIZE=4 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
     - name: Train MNIST
       run: time PYTHONPATH=. CUDA=1 TARGET_EVAL_ACC_PCT=97.3 python3 examples/beautiful_mnist.py | tee beautiful_mnist.txt
     - name: Run 10 CIFAR training steps
diff --git a/test/external/verify_kernel.py b/test/external/verify_kernel.py
index 756f701bda..de01478ca9 100644
--- a/test/external/verify_kernel.py
+++ b/test/external/verify_kernel.py
@@ -9,7 +9,7 @@ from tinygrad.features.search import time_linearizer
 
 # Use this with the LOGKERN options to verify that all executed kernels are valid and evaluate to the same ground truth results
 # Example for GPT2:
-# 1) Run the model to log all kernels: `PYTHONPATH=. LOGKERN=/tmp/gpt2_kerns.txt JIT=1 HALF=1 BEAM=2 CACHELEVEL=0 CAST_BEFORE_VIEW=0 python3 examples/gpt2.py --count 10 --temperature 0 --timing` # noqa: E501
+# 1) Run the model to log all kernels: `PYTHONPATH=. LOGKERN=/tmp/gpt2_kerns.txt JIT=1 HALF=1 BEAM=2 CACHELEVEL=0 python3 examples/gpt2.py --count 10 --temperature 0 --timing` # noqa: E501
 # 2) Validate the kernel correctness: `PYTHONPATH=. python3 ./test/external/verify_kernel.py --file /tmp/gpt2_kerns.txt`
 
 if __name__ == "__main__":
diff --git a/test/test_const_folding.py b/test/test_const_folding.py
index 1dcdc5ada9..611ef54728 100644
--- a/test/test_const_folding.py
+++ b/test/test_const_folding.py
@@ -106,14 +106,12 @@ class TestMovedConstFolding(unittest.TestCase):
     _check_ast_count(1, Tensor([1.0, 2, 3, 4]) * Tensor.ones(2).pad(((1, 1),)))
 
   def test_cast_padded(self):
-    # NOTE: this is folded due to CAST_BEFORE_VIEW
-    _check_ast_count(0, Tensor.ones(4).pad(((1, 1),)).cast(dtypes.int16))
+    _check_ast_count(1, Tensor.ones(4).pad(((1, 1),)).cast(dtypes.int16))
     np.testing.assert_equal(Tensor.ones(4).pad(((1, 1),)).cast(dtypes.int16).numpy(), [0, 1, 1, 1, 1, 0])
-    _check_ast_count(0, Tensor.full(4, fill_value=-1).pad(((1, 1),)).cast(dtypes.uint16))
-    np.testing.assert_equal(Tensor.full(4, fill_value=-1).pad(((1, 1),)).cast(dtypes.uint16).numpy(), [0, 65535, 65535, 65535, 65535, 0])
-    # not folded
     _check_ast_count(1, Tensor.ones(4).pad(((1, 1),)).cast(dtypes.int64))
     np.testing.assert_equal(Tensor.ones(4).pad(((1, 1),)).cast(dtypes.int64).numpy(), [0, 1, 1, 1, 1, 0])
+    _check_ast_count(1, Tensor.full(4, fill_value=-1).pad(((1, 1),)).cast(dtypes.uint16))
+    np.testing.assert_equal(Tensor.full(4, fill_value=-1).pad(((1, 1),)).cast(dtypes.uint16).numpy(), [0, 65535, 65535, 65535, 65535, 0])
 
 class TestReduceOpsConstFolding(unittest.TestCase):
   def test_const_sum(self):
diff --git a/tinygrad/lazy.py b/tinygrad/lazy.py
index 4466203348..265751a337 100644
--- a/tinygrad/lazy.py
+++ b/tinygrad/lazy.py
@@ -84,9 +84,6 @@ class LazyBuffer:
     if self.device.startswith("DISK") and not bitcast: raise RuntimeError("attempted to cast disk buffer (bitcast only)")
     if self.is_unrealized_unmasked_const() and not bitcast:
       return create_lazybuffer(self.device, self.st, dtype, LoadOps.CONST, dtypes.as_const(self.base.arg, dtype))
-    # TODO: applying this makes gpt2 slower
-    if getenv("CAST_BEFORE_VIEW", 1) and dtype.itemsize <= self.dtype.itemsize and self != self.base:
-      return self.base.cast(dtype, bitcast)._view(self.st)
     new_shape = self.shape
     if bitcast and self.dtype.itemsize != dtype.itemsize:
       if not self.device.startswith("DISK"): raise RuntimeError("shape changing bitcast only supported on DISK right now")
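
Note (not part of the patch): the deleted CAST_BEFORE_VIEW path in LazyBuffer.cast rewrote cast(view(base)) as view(cast(base)) whenever the target dtype was no wider than the source, which let a cast of a padded constant fold away entirely (0 ASTs); per the removed TODO it also made gpt2 slower, so with it gone a cast of a padded tensor always lowers to one kernel. Below is a minimal sketch of the behavior the updated test asserts, using only calls that appear in the diff; the import paths assume the repo layout at this commit (tinygrad.tensor, tinygrad.dtype) with the checkout on PYTHONPATH.

import numpy as np
from tinygrad.tensor import Tensor
from tinygrad.dtype import dtypes

# cast applied after pad: the zero padding is cast along with the data,
# so the ones stay 1 and the pad stays 0
np.testing.assert_equal(Tensor.ones(4).pad(((1, 1),)).cast(dtypes.int16).numpy(),
                        [0, 1, 1, 1, 1, 0])

# -1 wraps to 65535 under uint16 while the padded elements stay 0; these are
# the same values the old folded path produced, only the AST count moved
# from 0 to 1, which is why _check_ast_count expectations changed above
np.testing.assert_equal(Tensor.full(4, fill_value=-1).pad(((1, 1),)).cast(dtypes.uint16).numpy(),
                        [0, 65535, 65535, 65535, 65535, 0])

Either ordering of cast and view yields the same values here, since pad fills with zero in whichever dtype it runs in; the trade-off the patch settles is purely kernel count versus the measured gpt2 slowdown, hence the removal of the flag from the benchmark workflow and the LOGKERN example command as well.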