Mirror of https://github.com/tinygrad/tinygrad.git (synced 2026-01-09 23:18:04 -05:00)
no barrier side effect (#3550)
* no barrier side effect
* finish barrier removal
This commit is contained in:
2
.github/workflows/test.yml
vendored
2
.github/workflows/test.yml
vendored
@@ -294,6 +294,8 @@ jobs:
|
|||||||
run: JIT=2 METAL=1 python -m pytest -n=auto test/ --ignore=test/external --ignore=test/models --durations=20
|
run: JIT=2 METAL=1 python -m pytest -n=auto test/ --ignore=test/external --ignore=test/models --durations=20
|
||||||
- name: Run ONNX
|
- name: Run ONNX
|
||||||
run: JIT=2 METAL=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
|
run: JIT=2 METAL=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
|
||||||
|
- name: Test tensor core ops (fake)
|
||||||
|
run: TC=2 METAL=1 DEBUG=3 python test/test_ops.py TestOps.test_gemm
|
||||||
- name: Test tensor core ops (real)
|
- name: Test tensor core ops (real)
|
||||||
run: METAL=1 DEBUG=3 python test/test_ops.py TestOps.test_big_gemm
|
run: METAL=1 DEBUG=3 python test/test_ops.py TestOps.test_big_gemm
|
||||||
- name: Test LLaMA compile speed
|
- name: Test LLaMA compile speed
|
||||||
|
|||||||
@@ -283,9 +283,6 @@ class Linearizer(Kernel):
|
|||||||
# reduce loop
|
# reduce loop
|
||||||
loop_ctx = render_loop(reduce_idxs)
|
loop_ctx = render_loop(reduce_idxs)
|
||||||
|
|
||||||
# barrier for fast GEMM
|
|
||||||
if self.tensor_core: self.uop(UOps.BARRIER, None, (), cachable=False)
|
|
||||||
|
|
||||||
# compute local aliases
|
# compute local aliases
|
||||||
locals_to_store = []
|
locals_to_store = []
|
||||||
for i in self.local_alias:
|
for i in self.local_alias:
|
||||||
@@ -321,10 +318,7 @@ class Linearizer(Kernel):
|
|||||||
for z in range(wmma_sz[2]):
|
for z in range(wmma_sz[2]):
|
||||||
acc[offs[2]+z] = self.uop(UOps.PHI, tc.dtype_out, (op3[z], self.uop(UOps.GEP, tc.dtype_out, (ret,), z)) + loop_ctx)
|
acc[offs[2]+z] = self.uop(UOps.PHI, tc.dtype_out, (op3[z], self.uop(UOps.GEP, tc.dtype_out, (ret,), z)) + loop_ctx)
|
||||||
else:
|
else:
|
||||||
if locals_to_store:
|
assert not locals_to_store, "storing locals isn't supported here"
|
||||||
self.uop(UOps.BARRIER, None, (), cachable=False)
|
|
||||||
for i, idxs, ll in locals_to_store: self.global_store(i, idxs, ll)
|
|
||||||
self.uop(UOps.BARRIER, None, (), cachable=False)
|
|
||||||
|
|
||||||
# load earlybufs
|
# load earlybufs
|
||||||
loaded_buffers.update({b:self.global_load(self.bufs.index(self.local_alias[i]) if i in self.local_alias else i, global_idxs+local_idxs+reduce_idxs+full_upcast_idxs) for i,b in enumerate(self.bufs[1:], start=1) if b in self.earlybufs}) # noqa: E501
|
loaded_buffers.update({b:self.global_load(self.bufs.index(self.local_alias[i]) if i in self.local_alias else i, global_idxs+local_idxs+reduce_idxs+full_upcast_idxs) for i,b in enumerate(self.bufs[1:], start=1) if b in self.earlybufs}) # noqa: E501
|
||||||
|
|||||||
@@ -33,7 +33,7 @@ def get_recursive_children(uops:List[UOp], x:UOp) -> Set[UOp]:
|
|||||||
deps.add(u)
|
deps.add(u)
|
||||||
return deps
|
return deps
|
||||||
|
|
||||||
UOPS_W_SIDE_EFFECTS = {UOps.STORE, UOps.BARRIER}
|
UOPS_W_SIDE_EFFECTS = {UOps.STORE}
|
||||||
def remove_childless_uops(uops:List[UOp]) -> List[UOp]:
|
def remove_childless_uops(uops:List[UOp]) -> List[UOp]:
|
||||||
while 1:
|
while 1:
|
||||||
has_child: Set[UOp] = set()
|
has_child: Set[UOp] = set()
|
||||||
|
|||||||
Reference in New Issue
Block a user