use randn in speed_v_theoretical instead of rand (#7656)

* use randn in speed_v_theoretical instead of rand

Switching to randn made green gemv about 20% faster; the cause is not yet understood.

* update threshold
This commit is contained in:
chenyu
2024-11-12 15:00:32 -05:00
committed by GitHub
parent 397a2e6eb6
commit 962dafb467

View File

@@ -14,8 +14,9 @@ class TestKernelSpeed(unittest.TestCase):
with Context(BEAM=3):
for _ in range(10):
with Context(BEAM=0, DEBUG=0):
a = Tensor.rand(M, K, dtype="half").realize()
b = Tensor.rand(K, N, dtype="half").realize()
# TODO: randn is 20% faster than rand for gemv
a = Tensor.randn(M, K, dtype="half").realize()
b = Tensor.randn(K, N, dtype="half").realize()
Device.default.synchronize()
st = time.perf_counter()
c = f(a, b)
@@ -52,11 +53,11 @@ class TestKernelSpeed(unittest.TestCase):
# TODO: smaller ones have other overhead in synchronize
# def test_gemm_1024(self): self._test_matmul(1024, nv_tflops=8, amd_tflops=7)
# def test_gemm_2048(self): self._test_matmul(2048, nv_tflops=50, amd_tflops=30)
def test_gemm_4096(self): self._test_matmul(4096, nv_tflops=95, amd_tflops=65)
def test_gemm_4096(self): self._test_matmul(4096, nv_tflops=100, amd_tflops=70)
def test_gemm_8192(self): self._test_matmul(8192, nv_tflops=130, amd_tflops=70)
def test_gemv_4096_16384(self): self._test_matmul(4096, 16384, 1, nv_gbs=350, amd_gbs=270)
def test_gemv_16384_4096(self): self._test_matmul(16384, 4096, 1, nv_gbs=320, amd_gbs=270)
def test_gemv_16384_4096(self): self._test_matmul(16384, 4096, 1, nv_gbs=430, amd_gbs=400)
def test_gemv_4096_16384(self): self._test_matmul(4096, 16384, 1, nv_gbs=430, amd_gbs=400)
if __name__ == '__main__':
unittest.main()