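no barrier side effect (#3550)

* no barrier side effect: UOps.BARRIER is dropped from UOPS_W_SIDE_EFFECTS, leaving only UOps.STORE

* finish barrier removal: the explicit UOps.BARRIER emissions in the Linearizer hunks are removed, and a non-empty locals_to_store in the non-tensor-core branch is now rejected with an assert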
This commit is contained in:
George Hotz
2024-02-29 18:10:04 -08:00
committed by GitHub
parent bd9c2ced07
commit 5a6e151844
3 changed files with 4 additions and 8 deletions

View File

@@ -294,6 +294,8 @@ jobs:
run: JIT=2 METAL=1 python -m pytest -n=auto test/ --ignore=test/external --ignore=test/models --durations=20 run: JIT=2 METAL=1 python -m pytest -n=auto test/ --ignore=test/external --ignore=test/models --durations=20
- name: Run ONNX - name: Run ONNX
run: JIT=2 METAL=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20 run: JIT=2 METAL=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
- name: Test tensor core ops (fake)
run: TC=2 METAL=1 DEBUG=3 python test/test_ops.py TestOps.test_gemm
- name: Test tensor core ops (real) - name: Test tensor core ops (real)
run: METAL=1 DEBUG=3 python test/test_ops.py TestOps.test_big_gemm run: METAL=1 DEBUG=3 python test/test_ops.py TestOps.test_big_gemm
- name: Test LLaMA compile speed - name: Test LLaMA compile speed

View File

@@ -283,9 +283,6 @@ class Linearizer(Kernel):
# reduce loop # reduce loop
loop_ctx = render_loop(reduce_idxs) loop_ctx = render_loop(reduce_idxs)
# barrier for fast GEMM
if self.tensor_core: self.uop(UOps.BARRIER, None, (), cachable=False)
# compute local aliases # compute local aliases
locals_to_store = [] locals_to_store = []
for i in self.local_alias: for i in self.local_alias:
@@ -321,10 +318,7 @@ class Linearizer(Kernel):
for z in range(wmma_sz[2]): for z in range(wmma_sz[2]):
acc[offs[2]+z] = self.uop(UOps.PHI, tc.dtype_out, (op3[z], self.uop(UOps.GEP, tc.dtype_out, (ret,), z)) + loop_ctx) acc[offs[2]+z] = self.uop(UOps.PHI, tc.dtype_out, (op3[z], self.uop(UOps.GEP, tc.dtype_out, (ret,), z)) + loop_ctx)
else: else:
if locals_to_store: assert not locals_to_store, "storing locals isn't supported here"
self.uop(UOps.BARRIER, None, (), cachable=False)
for i, idxs, ll in locals_to_store: self.global_store(i, idxs, ll)
self.uop(UOps.BARRIER, None, (), cachable=False)
# load earlybufs # load earlybufs
loaded_buffers.update({b:self.global_load(self.bufs.index(self.local_alias[i]) if i in self.local_alias else i, global_idxs+local_idxs+reduce_idxs+full_upcast_idxs) for i,b in enumerate(self.bufs[1:], start=1) if b in self.earlybufs}) # noqa: E501 loaded_buffers.update({b:self.global_load(self.bufs.index(self.local_alias[i]) if i in self.local_alias else i, global_idxs+local_idxs+reduce_idxs+full_upcast_idxs) for i,b in enumerate(self.bufs[1:], start=1) if b in self.earlybufs}) # noqa: E501

View File

@@ -33,7 +33,7 @@ def get_recursive_children(uops:List[UOp], x:UOp) -> Set[UOp]:
deps.add(u) deps.add(u)
return deps return deps
UOPS_W_SIDE_EFFECTS = {UOps.STORE, UOps.BARRIER} UOPS_W_SIDE_EFFECTS = {UOps.STORE}
def remove_childless_uops(uops:List[UOp]) -> List[UOp]: def remove_childless_uops(uops:List[UOp]) -> List[UOp]:
while 1: while 1:
has_child: Set[UOp] = set() has_child: Set[UOp] = set()