diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py index 116505e47a..77e1af9149 100644 --- a/tinygrad/runtime/ops_amd.py +++ b/tinygrad/runtime/ops_amd.py @@ -888,7 +888,7 @@ class AMDDevice(HCQCompiled): self.max_cu_id = self.iface.props['simd_count'] // self.iface.props['simd_per_cu'] // self.iface.props.get('num_xcc', 1) - 1 self.max_wave_id = (self.iface.props['max_waves_per_simd'] * self.iface.props['simd_per_cu'] - 1) if self.target >= (10,1,0) else \ (min((self.max_cu_id+1)*40, self.se_cnt * 512) - 1) - self.xccs = self.iface.props.get('num_xcc', 1) if getenv("XCCS", 1) else 1 + self.xccs = self.iface.props.get('num_xcc', 1) # this is what llvm refers to as "architected flat scratch" self.has_scratch_base_registers = self.target >= (11,0,0) or self.target in {(9,4,2), (9,5,0)} diff --git a/tinygrad/runtime/ops_nv.py b/tinygrad/runtime/ops_nv.py index 9f9db6031e..11ccc88993 100644 --- a/tinygrad/runtime/ops_nv.py +++ b/tinygrad/runtime/ops_nv.py @@ -591,7 +591,7 @@ class NVDevice(HCQCompiled[HCQSignal]): self.synchronize() def _ensure_has_local_memory(self, required): - if self.slm_per_thread >= required or ((maxlm:=getenv("NV_MAX_LOCAL_MEMORY_PER_THREAD")) > 0 and required >= maxlm): return + if self.slm_per_thread >= required: return self.slm_per_thread, old_slm_per_thread = round_up(required, 32), self.slm_per_thread bytes_per_tpc = round_up(round_up(self.slm_per_thread * 32, 0x200) * self.max_warps_per_sm * self.num_sm_per_tpc, 0x8000)