mirror of
https://github.com/ROCm/ROCm.git
synced 2026-04-05 03:01:17 -04:00
[BACKEND] Add BarrierOp after AllocMBarrierOp when numCTAs == 1 (#2040)
Make sure that the other threads within the CTA do not operate on the mbarrier until it has been initialized by thread 0.

Co-authored-by: Philippe Tillet <phil@openai.com>
This commit is contained in:
@@ -280,7 +280,11 @@ def tma_warp_specialized_matmul_kernel(
|
||||
])
|
||||
@pytest.mark.skipif(torch.cuda.get_device_capability()[0] < 9, reason="Requires compute capability >= 9")
|
||||
def test_non_persistent_warp_specialized_gemm(M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, NUM_CTAS, TRANS_A, TRANS_B):
|
||||
pytest.skip('hang')
|
||||
if '-'.join(map(str, [M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, NUM_CTAS, TRANS_A, TRANS_B])) in [
|
||||
'4096-4096-256-128-256-16-1-False-True',
|
||||
'4096-4096-256-128-256-64-1-False-True'
|
||||
]:
|
||||
pytest.skip('Insufficient register resources')
|
||||
|
||||
if (TRANS_A):
|
||||
a = .1 * torch.randn((K, M), device='cuda', dtype=torch.float16).T
|
||||
|
||||
Binary file not shown.
Reference in New Issue
Block a user