[TESTS][HOPPER] Prune hopper tests to speed up CI (#2193)

Co-authored-by: Goostav Zhu <gzhu@nvidia.com>
2026-04-05 03:01:17 -04:00 · 2023-08-28 11:45:23 +08:00
parent 5f448b2f08
commit 1465b573e8
2 changed files with 126 additions and 108 deletions
--- a/python/test/unit/hopper/test_gemm.py
+++ b/python/test/unit/hopper/test_gemm.py
@@ -211,102 +211,113 @@ def matmul_kernel(


@pytest.mark.parametrize('BLOCK_M,BLOCK_N,BLOCK_K,NUM_WARPS,NUM_CTAS,M,N,K,TRANS_A,TRANS_B,TRANS_OUTPUT,epilogue,out_dtype,USE_TMA_STORE,NUM_STAGES,ENABLE_WS',
-                         [(128, 128, 64, 4, 1, *shape_w_c, 'none', out_dtype, use_tma_store, 3, enable_ws)
-                          for shape_w_c in [
-                             # badcase from cublas-important-layers
-                             [4096, 1, 1024, False, False, True],
-                             [2048, 204, 1000, True, False, True],
-                             [4096, 1, 1024, False, False, False],
-                             [2048, 204, 1000, True, False, False],
-                         ]
+                         [
+                             # corner shapes
+                             (128, 128, 64, 4, 1, *shape_w_c, 'none', out_dtype, use_tma_store, 3, enable_ws)
+                             for shape_w_c in [
+                                 [4096, 1, 1024, False, False, True],
+                                 [2048, 204, 1000, True, False, True],
+                                 [4096, 1, 1024, False, False, False],
+                                 [2048, 204, 1000, True, False, False],
+                             ]
                             for out_dtype in ['float16', 'float32']
                             for use_tma_store in [False, True]
                             for enable_ws in [False, True]
-                         ] + [(*shape_w_c, trans_a, trans_b, trans_output, epilogue, out_dtype, use_tma_store, num_stages, enable_ws)
-                              # softmax works for one CTA
-                              for shape_w_c in [
-                             [64, 64, 16, 4, 1, 64, 64, 64],
-                             [128, 128, 64, 4, 1, None, None, None],
-                             [16, 16, 64, 4, 1, 16, 16, 64],
-                             [64, 64, 32, 8, 1, 64, 64, 64],
-                             [128, 128, 64, 4, 1, 128, 128, 128],
-                         ]
+                         ] + [
+                             # softmax epilogue
+                             (*shape_w_c, trans_a, trans_b, trans_output, epilogue, out_dtype, use_tma_store, num_stages, enable_ws)
+                             for shape_w_c in [
+                                 [64, 64, 16, 4, 1, 64, 64, 64],
+                                 [128, 128, 64, 4, 1, None, None, None],
+                                 [16, 16, 64, 4, 1, 16, 16, 64],
+                                 [64, 64, 32, 8, 1, 64, 64, 64],
+                                 [128, 128, 64, 4, 1, 128, 128, 128],
+                             ]
                             for epilogue in ['softmax']
                             for out_dtype in ['float16', 'float32']
                             for use_tma_store in [False, True]
-                             for trans_a in [False, True]
-                             for trans_b in [False, True]
-                             for trans_output in [False, True]
+                             for trans_a in [False,]
+                             for trans_b in [True,]
+                             for trans_output in [False,]
                             for num_stages in [3]
                             for enable_ws in [False, True]
-                         ] + [(*shape_w_c, trans_a, trans_b, trans_output, epilogue, out_dtype, use_tma_store, num_stages, enable_ws)
-                              for shape_w_c in [
-                             [64, 64, 16, 4, 1, 128, 128, 64],
-                             *[[256, 64, 16, num_warps, num_ctas, 256, 256, 64] for num_warps in [4, 8] for num_ctas in [1, 2, 4]],
-                             # for chain-dot
-                             [128, 128, 64, 4, 1, None, None, None],
-                             [64, 64, 16, 4, 1, None, None, None],
-                             # small BLOCK_M and BLOCK_K
-                             [16, 16, 64, 4, 1, 128, 128, 64],
-                             *[[16, 32, 64, num_warps, num_ctas, 256, 256, 256] for num_warps in [4, 8] for num_ctas in [1, 2]],
-                             # repeat
-                             [64, 64, 32, 8, 1, 128, 256, 64],
-                             [64, 64, 16, 8, 2, 128, 128, 64],
-                             # irregular shape
-                             [128, 128, 64, 4, 1, 500, 200, 128],
-                             [128, 128, 64, 4, 2, 513, 193, 192],
-                         ]
+                         ] + [
+                             # loop over epilogues other than softmax
+                             (*shape_w_c, trans_a, trans_b, trans_output, epilogue, out_dtype, use_tma_store, num_stages, enable_ws)
+                             for shape_w_c in [
+                                 [64, 64, 16, 4, 1, 128, 128, 64],
+                                 *[[256, 64, 16, num_warps, num_ctas, 256, 256, 64] for num_warps in [4, 8] for num_ctas in [1, 2, 4]],
+                                 # for chain-dot
+                                 [128, 128, 64, 4, 1, None, None, None],
+                                 [64, 64, 16, 4, 1, None, None, None],
+                                 # small BLOCK_M and BLOCK_K
+                                 [16, 16, 64, 4, 1, 128, 128, 64],
+                                 *[[16, 32, 64, num_warps, num_ctas, 256, 256, 256] for num_warps in [4, 8] for num_ctas in [1, 2]],
+                                 # repeat
+                                 [64, 64, 32, 8, 1, 128, 256, 64],
+                                 [64, 64, 16, 8, 2, 128, 128, 64],
+                                 # irregular shape
+                                 [128, 128, 64, 4, 1, 500, 200, 128],
+                                 [128, 128, 64, 4, 2, 513, 193, 192],
+                             ]
                             for epilogue in ['none', 'add-matrix', 'add-rows', 'add-cols', 'chain-dot']
                             for out_dtype in ['float16', 'float32']
                             for use_tma_store in [False, True]
-                             for trans_a in [False, True]
-                             for trans_b in [False, True]
-                             for trans_output in [False, True]
+                             for trans_a in [False,]
+                             for trans_b in [True,]
+                             for trans_output in [False,]
                             for num_stages in [3]
                             for enable_ws in [False, True]
                             if not (epilogue == 'chain-dot' and (shape_w_c[6] is not None or shape_w_c[1] != shape_w_c[6]))
-                         ] + [(*shape_w_c, trans_a, trans_b, trans_output, 'none', out_dtype, use_tma_store, num_stages, enable_ws)
-                              for shape_w_c in [
-                             [64, 64, 32, 4, 1, 128, 256, 64],
-                             [128, 128, 16, 4, 4, 512, 256, 64],
-                             [128, 256, 32, 4, 8, 256, 256, 192],
-                             [512, 256, 32, 4, 8, 1024, 256, 192],
-                             # BLOCK_K >= 128
-                             [64, 128, 128, 4, 1, 512, 256, 256],
-                             [128, 128, 128, 4, 1, 256, 256, 192],
-                             [128, 128, 128, 4, 2, 256, 256, 192],
-                             # small BLOCK_M and BLOCK_K
-                             [16, 32, 32, 4, 1, 128, 256, 64],
-                             [32, 32, 16, 4, 1, 256, 256, 192],
-                             [16, 32, 64, 4, 4, 512, 256, 64],
-                         ]
-                             for out_dtype in ['float16', 'float32']
-                             for use_tma_store in [False, True]
+                         ] + [
+                             # loop over tile shapes and transpose combinations
+                             (*shape_w_c, trans_a, trans_b, trans_output, 'none', out_dtype, use_tma_store, num_stages, enable_ws)
+                             for shape_w_c in [
+                                 [64, 64, 32, 4, 1, 128, 256, 64],
+                                 [128, 128, 16, 4, 4, 512, 256, 64],
+                                 [128, 256, 32, 4, 8, 256, 256, 192],
+                                 [512, 256, 32, 4, 8, 1024, 256, 192],
+                                 # BLOCK_K >= 128
+                                 [64, 128, 128, 4, 1, 512, 256, 256],
+                                 [128, 128, 128, 4, 1, 256, 256, 192],
+                                 [128, 128, 128, 4, 2, 256, 256, 192],
+                                 # small BLOCK_M and BLOCK_K
+                                 [16, 32, 32, 4, 1, 128, 256, 64],
+                                 [32, 32, 16, 4, 1, 256, 256, 192],
+                                 [16, 32, 64, 4, 4, 512, 256, 64],
+                             ]
+                             for out_dtype in ['float32',]
+                             for use_tma_store in [False,]
                             for trans_a in [False, True]
                             for trans_b in [False, True]
                             for trans_output in [False, True]
                             for num_stages in [3]
                             for enable_ws in [False, True]
-                         ] + [(64, n, 16, 4, 1, 512, 256, 256, False, True, trans_output, 'none', out_dtype, use_tma_store, num_stages, enable_ws)
-                              # loop over instr shapes
-                              for n in [16, 32, 64, 128, 256]
-                              for trans_output in [False, True]
-                              for out_dtype in ['float16', 'float32']
-                              for use_tma_store in [False, True]
-                              for num_stages in [2, 4, 5, 7]
-                              for enable_ws in [False, True]
-                              ] + [(*shape_w_c, *shape, False, True, trans_output, 'none', out_dtype, use_tma_store, num_stages, enable_ws)
-                                   # irregular shapes
-                                   for shape_w_c in [
-                                       [128, 128, 64, 4, 1],
-                                       [256, 128, 64, 4, 2],
-                                       [128, 128, 128, 4, 2],
-                              ]
-                             for shape in list(itertools.product([*range(512, 4096, 360)], [*range(512, 4096, 360)], [512, 1024]))
-                             for trans_output in [False, True]
-                             for out_dtype in ['float16', 'float32']
+                         ] + [
+                             # loop over instr shapes & pipeline stages
+                             (64, n, 16, 4, 1, 512, 256, 256, False, True, trans_output, 'none', out_dtype, use_tma_store, num_stages, enable_ws)
+                             for n in [16, 32, 64, 128, 256]
+                             for trans_output in [False,]
+                             for out_dtype in ['float32',]
+                             for use_tma_store in [False,]
+                             for num_stages in [2, 4, 5, 7]
+                             for enable_ws in [False, True]
+                         ] + [
+                             # irregular shapes
+                             (*shape_w_c, *shape, False, True, trans_output, 'none', out_dtype, use_tma_store, num_stages, enable_ws)
+                             for shape_w_c in [
+                                 [128, 128, 64, 4, 1],
+                                 [256, 128, 64, 4, 2],
+                                 [128, 128, 128, 4, 2],
+                             ]
+                             for shape in [
+                                 [512, 360, 1024],
+                                 [360, 4096, 512],
+                             ]
+                             for trans_output in [False,]
+                             for out_dtype in ['float32',]
                             for use_tma_store in [False, True]
-                             for num_stages in [2, 3, 4]
+                             for num_stages in [3, 4]
                             for enable_ws in [False, True]
                         ])
@pytest.mark.skipif(torch.cuda.get_device_capability()
--- a/python/test/unit/hopper/test_persistent_warp_specialized_gemm.py
+++ b/python/test/unit/hopper/test_persistent_warp_specialized_gemm.py
@@ -696,9 +696,9 @@ def full_static_persistent_matmul_kernel(

@pytest.mark.parametrize('BLOCK_M,BLOCK_N,BLOCK_K,NUM_WARPS,NUM_CTAS,M,N,K,TRANS_A,TRANS_B,epilogue,out_dtype,USE_TMA_STORE,NUM_STAGES,ENABLE_WS',
                         [
+                             # corner shapes
                             (128, 128, 64, 4, 1, *shape_w_c, 'none', out_dtype, use_tma_store, 3, enable_ws)
                             for shape_w_c in [
-                                 # bad from cublas-important-layers
                                 [4096, 1, 1024, False, False],
                                 [2048, 204, 1000, True, False],
                                 [16, 524288, 32, False, True],
@@ -707,6 +707,7 @@ def full_static_persistent_matmul_kernel(
                             for use_tma_store in [False, True]
                             for enable_ws in [True]
                         ] + [
+                             # softmax epilogue
                             (*shape_w_c, trans_a, trans_b, epilogue, out_dtype, use_tma_store, num_stages, enable_ws)
                             # softmax works for one CTA
                             for shape_w_c in [
@@ -720,11 +721,12 @@ def full_static_persistent_matmul_kernel(
                             for epilogue in ['softmax']
                             for out_dtype in ['float16', 'float32']
                             for use_tma_store in [False, True]
-                             for trans_a in [False, True]
-                             for trans_b in [False, True]
+                             for trans_a in [False,]
+                             for trans_b in [True,]
                             for num_stages in [3]
                             for enable_ws in [True]
                         ] + [
+                             # loop over tile shapes and transpose combinations
                             (*shape_w_c, trans_a, trans_b, 'none', out_dtype, use_tma_store, num_stages, enable_ws)
                             for shape_w_c in [
                                 [64, 64, 32, 4, 1, 128, 256, 64],
@@ -740,58 +742,63 @@ def full_static_persistent_matmul_kernel(
                                 [32, 32, 16, 4, 1, 256, 256, 192],
                                 [16, 32, 64, 4, 4, 512, 256, 64],
                             ]
-                             for out_dtype in ['float16', 'float32']
-                             for use_tma_store in [False, True]
+                             for out_dtype in ['float32',]
+                             for use_tma_store in [False,]
                             for trans_a in [False, True]
                             for trans_b in [False, True]
                             for num_stages in [3]
                             for enable_ws in [True]
-                         ] + [(*shape_w_c, trans_a, trans_b, epilogue, out_dtype, use_tma_store, num_stages, enable_ws)
-                              for shape_w_c in [
-                             [64, 64, 16, 4, 1, 128, 128, 64],
-                             *[[256, 64, 16, num_warps, num_ctas, 256, 256, 64] for num_warps in [4] for num_ctas in [1, 2, 4]],
-                             # for chain-dot
-                             [128, 128, 64, 4, 1, None, None, None],
-                             [64, 64, 16, 4, 1, None, None, None],
-                             # small BLOCK_M and BLOCK_K
-                             [16, 16, 64, 4, 1, 128, 128, 64],
-                             *[[16, 32, 64, num_warps, num_ctas, 256, 256, 256] for num_warps in [4] for num_ctas in [1, 2]],
-                             #  # TODO: enable when num_warps != 4 is supported.
-                             #  # repeat
-                             #  # [64, 64, 32, 8, 1, 128, 256, 64],
-                             #  # [64, 64, 16, 8, 2, 128, 128, 64],
-                             # irregular shape
-                             [128, 128, 64, 4, 1, 500, 200, 128],
-                             [128, 128, 64, 4, 1, 513, 193, 192],
-                         ]
+                         ] + [
+                             # loop over epilogues other than softmax
+                             (*shape_w_c, trans_a, trans_b, epilogue, out_dtype, use_tma_store, num_stages, enable_ws)
+                             for shape_w_c in [
+                                 [64, 64, 16, 4, 1, 128, 128, 64],
+                                 *[[256, 64, 16, num_warps, num_ctas, 256, 256, 64] for num_warps in [4] for num_ctas in [1, 2, 4]],
+                                 # for chain-dot
+                                 [128, 128, 64, 4, 1, None, None, None],
+                                 [64, 64, 16, 4, 1, None, None, None],
+                                 # small BLOCK_M and BLOCK_K
+                                 [16, 16, 64, 4, 1, 128, 128, 64],
+                                 *[[16, 32, 64, num_warps, num_ctas, 256, 256, 256] for num_warps in [4] for num_ctas in [1, 2]],
+                                 #  # TODO: enable when num_warps != 4 is supported.
+                                 #  # repeat
+                                 #  # [64, 64, 32, 8, 1, 128, 256, 64],
+                                 #  # [64, 64, 16, 8, 2, 128, 128, 64],
+                                 # irregular shape
+                                 [128, 128, 64, 4, 1, 500, 200, 128],
+                                 [128, 128, 64, 4, 1, 513, 193, 192],
+                             ]
                             for epilogue in ['none', 'add-matrix', 'add-rows', 'add-cols', 'chain-dot']
                             for out_dtype in ['float16', 'float32']
                             for use_tma_store in [False, True]
-                             for trans_a in [False, True]
-                             for trans_b in [False, True]
+                             for trans_a in [False,]
+                             for trans_b in [True,]
                             for num_stages in [3]
                             for enable_ws in [True]
                             if not (epilogue == 'chain-dot' and (shape_w_c[5] is not None or shape_w_c[0] != shape_w_c[1]))
                         ] + [
+                             # loop over instr shapes & pipeline stages
                             (64, n, 16, 4, 1, 512, 256, 256, False, True, 'none', out_dtype, use_tma_store, num_stages, enable_ws)
-                             # loop over instr shapes
                             for n in [16, 32, 64, 128, 256]
-                             for out_dtype in ['float16', 'float32']
-                             for use_tma_store in [False, True]
+                             for out_dtype in ['float32']
+                             for use_tma_store in [False,]
                             for num_stages in [2, 4, 5, 7]
                             for enable_ws in [True]
                         ] + [
-                             (*shape_w_c, *shape, False, True, 'none', out_dtype, use_tma_store, num_stages, enable_ws)
                             # irregular shapes
+                             (*shape_w_c, *shape, False, True, 'none', out_dtype, use_tma_store, num_stages, enable_ws)
                             for shape_w_c in [
                                 [128, 128, 64, 4, 1],
                                 [256, 128, 64, 4, 2],
                                 [128, 128, 128, 4, 2]
                             ]
-                             for shape in list(itertools.product([*range(512, 4096, 360)], [*range(512, 4096, 360)], [512, 1024]))
-                             for out_dtype in ['float16', 'float32']
+                             for shape in [
+                                 [512, 360, 1024],
+                                 [360, 4096, 512],
+                             ]
+                             for out_dtype in ['float32']
                             for use_tma_store in [False, True]
-                             for num_stages in [2, 3, 4]
+                             for num_stages in [3, 4]
                             for enable_ws in [True]
                         ]
                         )