[BACKEND] Add BarrierOp after AllocMBarrierOp when numCTAs == 1 (#2040)

Make sure that other threads within the CTA do not operate on the mbarrier
until it has been initialized by thread 0.

Co-authored-by: Philippe Tillet <phil@openai.com>
This commit is contained in:
Qingyi Liu
2023-08-08 11:11:00 +08:00
committed by GitHub
parent 6a1ac65043
commit 341f5b61be
4 changed files with 9 additions and 35 deletions

View File

@@ -280,7 +280,11 @@ def tma_warp_specialized_matmul_kernel(
])
@pytest.mark.skipif(torch.cuda.get_device_capability()[0] < 9, reason="Requires compute capability >= 9")
def test_non_persistent_warp_specialized_gemm(M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, NUM_CTAS, TRANS_A, TRANS_B):
pytest.skip('hang')
if '-'.join(map(str, [M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, NUM_CTAS, TRANS_A, TRANS_B])) in [
'4096-4096-256-128-256-16-1-False-True',
'4096-4096-256-128-256-64-1-False-True'
]:
pytest.skip('Insufficient register resources')
if (TRANS_A):
a = .1 * torch.randn((K, M), device='cuda', dtype=torch.float16).T