second try at block linearize (#7892)

* second try at block linearize

* weeee, works for lil matmul

* it's so beautiful

* test tiny passes

* fix bugs

* combine matching BLOCKENDS

* wrapping

* test lin failures passes

* those failures were fake

* flip sort order

* fix ptx tests

* deal with store better

* dumb ptx fix

* expect less

* reduce lines

* reduce lines

* less lines and cleaner

* no defaultdict

* tighter

* simpler block_parent_count
This commit is contained in:
George Hotz
2024-12-02 13:43:09 +08:00
committed by GitHub
parent 9b0859d717
commit cbcc1c20eb
2 changed files with 151 additions and 79 deletions

View File

@@ -88,7 +88,7 @@ class TestKernelSpeed(unittest.TestCase):
# def test_gemm_1024(self): self._test_matmul(1024, nv_tflops=8, amd_tflops=7)
# def test_gemm_2048(self): self._test_matmul(2048, nv_tflops=50, amd_tflops=30)
def test_gemm_4096(self): self._test_matmul(4096, nv_tflops=95, amd_tflops=70)
def test_gemm_8192(self): self._test_matmul(8192, nv_tflops=130, amd_tflops=70)
def test_gemm_8192(self): self._test_matmul(8192, nv_tflops=125, amd_tflops=70)
def test_gemv_16384_4096(self): self._test_matmul(16384, 4096, 1, nv_gbs=430, amd_gbs=400)
def test_gemv_4096_16384(self): self._test_matmul(4096, 16384, 1, nv_gbs=430, amd_gbs=380) # AMD was flaky at 400