diff --git a/tinygrad/codegen/uops.py b/tinygrad/codegen/uops.py
index 100446d1db..2772455f4a 100644
--- a/tinygrad/codegen/uops.py
+++ b/tinygrad/codegen/uops.py
@@ -384,5 +384,5 @@ class UOpGraph:
         if u.arg.startswith("__metal_wmma"): flops += 2*(8*8*8)//32 * mults
         elif u.arg == "__hip_wmma_f16_f16" or u.arg == "__builtin_amdgcn_wmma_f32_16x16x16_f16_w32": flops += 2*(16*16*16)//32 * mults
         elif u.arg == "__cuda_mma_m16n8k16_f16_f32": flops += 2*(8*16*16)//32 * mults
-        else: raise Exception("not implemented")
+        else: raise NotImplementedError(f"not implemented wmma {u.arg=}")
     return flops, mem
diff --git a/tinygrad/runtime/ops_cuda.py b/tinygrad/runtime/ops_cuda.py
index 93ca20078b..f7f1f4e4d5 100644
--- a/tinygrad/runtime/ops_cuda.py
+++ b/tinygrad/runtime/ops_cuda.py
@@ -126,7 +126,7 @@ class CUDAAllocator(LRUAllocator):
     if options.host: return init_c_var(ctypes.c_void_p(), lambda x: check(cuda.cuMemHostAlloc(ctypes.byref(x), size, 0)))
     else:
-      raise Exception("no options")
+      raise ValueError("no options")
   def _free(self, opaque): check(cuda.cuMemFree_v2(opaque))
   def copyin(self, dest, src:memoryview):
     host_mem = self._alloc_with_options(len(src), BufferOptions(host=True))
diff --git a/tinygrad/runtime/ops_hsa.py b/tinygrad/runtime/ops_hsa.py
index a8f3bdf355..21b7f69b78 100644
--- a/tinygrad/runtime/ops_hsa.py
+++ b/tinygrad/runtime/ops_hsa.py
@@ -115,7 +115,7 @@ class HSAAllocator(LRUAllocator):
       check(hsa.hsa_amd_memory_pool_allocate(HSADevice.cpu_mempool, size, 0, ctypes.byref(mem := ctypes.c_void_p())))
       check(hsa.hsa_amd_agents_allow_access(2, (hsa.hsa_agent_t*2)(HSADevice.cpu_agent, self.device.agent), None, mem))
       return mem.value
-    else: raise Exception("no options")
+    else: raise ValueError("no options")
   def _free(self, opaque:T):
     HSADevice.synchronize_system()
diff --git a/tinygrad/runtime/ops_python.py b/tinygrad/runtime/ops_python.py
index d86707fcb6..0bc6426805 100644
--- a/tinygrad/runtime/ops_python.py
+++ b/tinygrad/runtime/ops_python.py
@@ -177,7 +177,7 @@ class PythonProgram:
             def c_map(lane, elem): return ((elem%2)+(lane%4)*2, (lane//4)+(elem//2)*8) # (i, j), C, D (4 elements on 32 threads)
             ul[i] = wmma_helper(32, 16, 8, 4, 4, a_elem, b_elem, c_map)
           else:
-            raise Exception(f"unimplemented tensor core {arg}")
+            raise NotImplementedError(f"unimplemented tensor core {arg}")
        elif uop is UOps.ALU:
          assert all_same([len(x) for x in inp]), f"{[len(x) for x in inp]} doesn't match on {arg}"
          assert all_same([dtype] + dtp) or arg in {BinaryOps.CMPEQ, BinaryOps.CMPLT, TernaryOps.WHERE}, f"dtype mismatch on {arg}"
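The common thread across these hunks is replacing bare `Exception` with the more specific built-ins `NotImplementedError` (unhandled WMMA / tensor-core variants) and `ValueError` (unsupported `BufferOptions`), so callers can catch the failure by type instead of matching message strings. A minimal sketch of what that enables; only the exception types come from this diff, while the wrapper name, the fallback path, and the import location are illustrative assumptions:

    # Hypothetical caller sketch, not part of this diff.
    from tinygrad.device import BufferOptions  # assumed import location

    def alloc_with_fallback(allocator, size: int, options: BufferOptions):
      try:
        return allocator._alloc_with_options(size, options)
      except ValueError:
        # The backend rejected these BufferOptions (the "no options" path above);
        # fall back to a plain allocation (assumes an _alloc(size) method exists).
        return allocator._alloc(size)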