diff --git a/extra/sqtt/roc.py b/extra/sqtt/roc.py
index f268272067..f33afee46d 100644
--- a/extra/sqtt/roc.py
+++ b/extra/sqtt/roc.py
@@ -1,5 +1,4 @@
 import ctypes, pathlib, argparse, pickle, re, functools, dataclasses, itertools, threading
-from tabulate import tabulate
 from typing import Generator
 from tinygrad.helpers import temp, unwrap, DEBUG
 from tinygrad.device import ProfileEvent, ProfileDeviceEvent, ProfileProgramEvent
@@ -161,6 +160,7 @@ def decode(profile:list[ProfileEvent]) -> _ROCParseCtx:
 
 def print_pmc(events:list[ProfilePMCEvent]) -> None:
   from tinygrad.viz.serve import unpack_pmc
+  from tabulate import tabulate
   for e in events:
     print("**", e.kern)
     data = unpack_pmc(e)
diff --git a/extra/sqtt/test_pmc.py b/extra/sqtt/test_pmc.py
index ab6f23abcc..0143ff02dc 100644
--- a/extra/sqtt/test_pmc.py
+++ b/extra/sqtt/test_pmc.py
@@ -6,8 +6,11 @@ import unittest
 import functools, contextlib
 import numpy as np
 from tinygrad import Tensor, Context, Device
-from tinygrad.uop.ops import UOp, KernelInfo, AxisType
+from tinygrad.dtype import dtypes, AddrSpace
+from tinygrad.uop.ops import UOp, Ops, KernelInfo, AxisType
 from tinygrad.runtime.ops_amd import ProfilePMCEvent
+from tinygrad.engine.realize import get_runner
+from tinygrad.viz.serve import unpack_pmc
 from extra.sqtt.roc import print_pmc
 
 def copy_kernel(B, A, stride=1):
@@ -19,6 +22,16 @@ def copy_kernel(B, A, stride=1):
   index = (i * stride) % A.size
   return B[index].store(A[index]).sink(arg=KernelInfo(name=f"copy_{A.size}_stride_{stride}", opts_to_apply=()))
 
+def lds_kernel(offset:UOp, size:int, inst:str) -> UOp:  # AST for "test_lds": each LOCAL index runs `inst` on one entry of `offset`
+  tid = UOp.range(offset.size, 0, AxisType.LOCAL)  # one LOCAL-axis index per element of `offset`
+  dst = UOp.placeholder((size,), dtypes.float32, 1, AddrSpace.REG)  # asm output operand: `size` float32 slots in AddrSpace.REG
+  # LDS placeholder left disabled, test_ds_read pins group_segment_size: lds = UOp.placeholder((1024,), dtypes.float32, 2, AddrSpace.LOCAL)
+  u = UOp(Ops.CUSTOM, arg='__builtin_amdgcn_s_waitcnt(0);')  # first CUSTOM op; each later one takes the previous as a source
+  u = UOp(Ops.CUSTOM, arg='__builtin_amdgcn_s_barrier();', src=(u,))
+  u = UOp(Ops.CUSTOM, arg='__builtin_amdgcn_sched_barrier(0);', src=(u,))  # below: only {inst} is f-string formatted, {0}/{1} stay literal
+  u = UOp(Ops.CUSTOM, arg=f'asm volatile("{inst} '+'%0, %1" : "=v"({0}) : "v"({1}));', src=(dst, offset[tid], u))  # {0}/{1}: presumably src[0]/src[1]
+  return UOp.sink(u, arg=KernelInfo(name="test_lds", opts_to_apply=()))  # sink on the asm op; no optimizations are applied
+
 dev = Device[Device.DEFAULT]
 
 @contextlib.contextmanager
@@ -45,5 +58,30 @@ class TestPMC(unittest.TestCase):
 
   def test_copy_uncoalesced(self): return self.test_copy(stride=17)
 
+  # test with two threads issuing ds_reads at different offsets
+  def test_ds_read(self, size=1, inst='ds_read_b32'):  # size: float32 slots in the asm destination; inst: mnemonic passed to lds_kernel
+    test_banks = 256
+    offsets = [Tensor([0, b*4]) for b in range(1, test_banks)]  # one pair [0, b*4] per b in 1..test_banks-1
+    with Context(DEBUG=0): Tensor.realize(*offsets)  # realize the offset tensors before entering the save_pmc block
+    k = Tensor.custom_kernel(offsets[0], fxn=functools.partial(lds_kernel, size=size, inst=inst))[0]  # built from offsets[0], reused for all pairs
+    # sample all kernels launched inside this block
+    with save_pmc() as pmc_events:
+      runner = get_runner(Device.DEFAULT, k.schedule()[0].ast)
+      # TODO: llvm eliminates lds definition from the ELF, is there another way to pin lds size?
+      runner._prg.group_segment_size = 1024
+      for offset in offsets: runner([offset.uop.buffer])  # one launch per offset pair, same compiled kernel
+    # find read offsets that created bank conflicts from the pmc counters
+    found:list[Tensor] = []
+    for i,e in enumerate(pmc_events):  # assumes pmc_events[i] belongs to the launch for offsets[i] (one event per launch, in order) -- TODO confirm
+      pmc = unpack_pmc(e)["rows"]
+      # counter is SQ_* on gfx9 and SQC_* on other targets (see pmc_default in ops_amd.py); next() raises StopIteration if neither is present
+      val = next(total for name,total,_all_instances in pmc if name in {"SQ_LDS_BANK_CONFLICT", "SQC_LDS_BANK_CONFLICT"})
+      if val > 0: found.append(offsets[i])
+    print("Found bank conflicts at offsets:", [s.numpy() for s in found])  # informational only: nothing is asserted
+
+  def test_ds_read_b64(self): self.test_ds_read(2, 'ds_read_b64')
+
+  def test_ds_read_b128(self): self.test_ds_read(4, 'ds_read_b128')
+
 if __name__ == "__main__":
   unittest.main()
diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py
index 791e4a4537..85b8fd5a4b 100644
--- a/tinygrad/runtime/ops_amd.py
+++ b/tinygrad/runtime/ops_amd.py
@@ -944,7 +944,8 @@ class AMDDevice(HCQCompiled):
       self.pmc_counters = import_pmc(self.target)
 
       # validate counters
-      pmc_default = "TCC_HIT,TCC_MISS,SQ_LDS_BANK_CONFLICT" if self.target[0] == 9 else "GL2C_HIT,GL2C_MISS,SQC_LDS_IDX_ACTIVE,SQC_LDS_BANK_CONFLICT"
+      pmc_default = "TCC_HIT,TCC_MISS,SQ_LDS_IDX_ACTIVE,SQ_LDS_BANK_CONFLICT" if self.target[0] == 9 \
+          else "GL2C_HIT,GL2C_MISS,SQC_LDS_IDX_ACTIVE,SQC_LDS_BANK_CONFLICT"
       for k in (PMC_COUNTERS:=getenv("PMC_COUNTERS", pmc_default).split(",")):
         if k not in self.pmc_counters: raise RuntimeError(f"PMC counter {k} is not supported. Available: {','.join(self.pmc_counters.keys())}")
 
diff --git a/tinygrad/viz/serve.py b/tinygrad/viz/serve.py
index f1a24a0802..6018a6a67e 100755
--- a/tinygrad/viz/serve.py
+++ b/tinygrad/viz/serve.py
@@ -398,7 +398,7 @@ def get_render(i:int, j:int, fmt:str) -> dict:
         metadata.append(amd_readelf(compiler.compile(data.src)))
     return {"src":disasm_str, "lang":"amdgpu" if data.device.startswith("AMD") else None, "metadata":metadata}
   if fmt == "sqtt-insts":
-    columns = ["PC", "Instruction", "Hits", "Duration", "Stall", "Type"]
+    columns = ["PC", "Instruction", "Hits", "Cycles", "Stall", "Type"]
     inst_columns = ["N", "Clk", "Idle", "Dur", "Stall"]
     # Idle:     The total time gap between the completion of previous instruction and the beginning of the current instruction.
     #           The idle time can be caused by: