diff --git a/test/null/test_viz.py b/test/null/test_viz.py
index 1597045eeb..bff088052e 100644
--- a/test/null/test_viz.py
+++ b/test/null/test_viz.py
@@ -1,4 +1,4 @@
-import unittest, decimal, json, struct
+import unittest, decimal, json, struct, sys
 from dataclasses import dataclass
 from typing import Generator
 
@@ -6,7 +6,7 @@ from tinygrad.uop.ops import UOp, UPat, Ops, PatternMatcher, TrackedPatternMatch
 from tinygrad.uop.symbolic import sym
 from tinygrad.dtype import dtypes
 from tinygrad.helpers import PROFILE, colored, ansistrip, flatten, TracingKey, ProfileRangeEvent, ProfileEvent, Context, cpu_events, profile_marker
-from tinygrad.helpers import VIZ, cpu_profile
+from tinygrad.helpers import VIZ, cpu_profile, ProfilePointEvent
 from tinygrad.device import Buffer
 
 @track_rewrites(name=True)
@@ -424,6 +424,51 @@ class TestVizProfiler(BaseTestViz):
     self.assertEqual(graph_events[0]['st'], nv_events[0]['st'])
     self.assertEqual(graph_events[0]['st']+graph_events[0]['dur'], sdma_events[0]['st']+sdma_events[0]['dur'])
 
+  def test_block_ordering(self):
+    prof = [ProfileDeviceEvent(device='NV', tdiff=decimal.Decimal(-1000)),
+            ProfileDeviceEvent(device='NV:1', tdiff=decimal.Decimal(-500)),
+            ProfileDeviceEvent(device='NV:SDMA:0', tdiff=decimal.Decimal(-100)),
+            ProfileRangeEvent(device='NV', name='E_2', st=decimal.Decimal(1000), en=decimal.Decimal(1010)),
+            ProfileRangeEvent(device='NV:1', name='E_3', st=decimal.Decimal(1000), en=decimal.Decimal(1010)),
+            ProfileRangeEvent(device='NV:SDMA:0', name='COPY', st=decimal.Decimal(1000), en=decimal.Decimal(1010)),
+            ProfileGraphEvent(ents=[ProfileGraphEntry(device='NV', name='E_2', st_id=0, en_id=1)],
+                              deps=[[]], sigs=[decimal.Decimal(1000), decimal.Decimal(1010)])]
+    j = load_profile(prof)
+    # graph grouped with its device, memory at the end
+    self.assertListEqual(list(j['layout']), ['NV', 'NV Graph', 'NV:SDMA:0', 'NV:1'])
+
+  @unittest.skipIf(sys.platform == 'win32', "TODO: ops_amd import fails on windows")
+  def test_multi_sdma_ordering(self):
+    props = {"gfx_target_version": 0}
+    D, St, En = decimal.Decimal, decimal.Decimal(1000), decimal.Decimal(1010)
+    prof = [# 2 AMD GPUs, 2 SDMA engines each
+            ProfileDeviceEvent(device='AMD', tdiff=D(-1000), props=props),
+            ProfileDeviceEvent(device='AMD:1', tdiff=D(-900), props=props),
+            ProfileDeviceEvent(device='AMD:SDMA:0', tdiff=D(-100), props=props),
+            ProfileDeviceEvent(device='AMD:SDMA:1', tdiff=D(-80), props=props),
+            ProfileDeviceEvent(device='AMD:1:SDMA:0', tdiff=D(-60), props=props),
+            ProfileDeviceEvent(device='AMD:1:SDMA:1', tdiff=D(-40), props=props),
+            # compute + copy events
+            ProfileRangeEvent(device='AMD', name='E_1', st=St, en=En),
+            ProfileRangeEvent(device='AMD:1', name='E_2', st=St, en=En),
+            ProfileRangeEvent(device='AMD:SDMA:0', name='COPY0', st=St, en=En),
+            ProfileRangeEvent(device='AMD:SDMA:1', name='COPY1', st=St, en=En),
+            ProfileRangeEvent(device='AMD:1:SDMA:0', name='COPY2', st=St, en=En),
+            ProfileRangeEvent(device='AMD:1:SDMA:1', name='COPY3', st=St, en=En),
+            # graph spanning compute + copy on GPU 0
+            ProfileGraphEvent(ents=[ProfileGraphEntry(device='AMD', name='E_1', st_id=0, en_id=1),
+                                    ProfileGraphEntry(device='AMD:SDMA:0', name='COPY0', st_id=2, en_id=3)],
+                              deps=[[], [0]], sigs=[St, En, St, En]),
+            # memory alloc on both GPUs
+            ProfilePointEvent(device='AMD', name='alloc', key=0, arg={"sz":1024, "dtype":dtypes.float}, ts=St),
+            ProfilePointEvent(device='AMD:1', name='alloc', key=1, arg={"sz":512, "dtype":dtypes.float}, ts=St)]
+    j = load_profile(prof)
+    # graph grouped with its device, memory at the end
+    self.assertListEqual(list(j['layout']),
+      ['AMD', 'AMD Graph', 'AMD:SDMA:0', 'AMD:SDMA:1',
+       'AMD:1', 'AMD:1:SDMA:0', 'AMD:1:SDMA:1',
+       'AMD Memory', 'AMD:1 Memory'])
+
   def test_bytes_per_kernel(self):
     step = 10
     n_events = 1_000
@@ -463,11 +508,11 @@ class TestVizProfiler(BaseTestViz):
 
   def test_layout_order(self):
     def fn(): return
-    for dname in ["TINY", "USER", "TEST:1 N1", "TEST:2 N1", "TEST:1 N2", "TEST:1:ENGINE:0", "TEST:1"]:
+    for dname in ["TINY", "USER", "TEST:1 N1", "TEST:2 N1", "TEST:1 N2", "TEST:1:ENGINE:0", "TEST:1:ENGINE:0 N1", "TEST:1"]:
       with cpu_profile("fn", dname): fn()
     layout = list(load_profile(cpu_events)["layout"])
     self.assertListEqual(layout[:2], ["USER","TINY"])
-    self.assertListEqual(layout[2:], ["TEST:1", "TEST:1:ENGINE:0", "TEST:1 N1","TEST:1 N2", "TEST:2 N1"])
+    self.assertListEqual(layout[2:], ["TEST:1", "TEST:1 N1", "TEST:1 N2", "TEST:1:ENGINE:0", "TEST:1:ENGINE:0 N1", "TEST:2 N1"])
 
 def _alloc(b:int):
   a = Tensor.empty(b, device="NULL", dtype=dtypes.char)
diff --git a/tinygrad/viz/serve.py b/tinygrad/viz/serve.py
index f10d8692f8..b14a75b67d 100755
--- a/tinygrad/viz/serve.py
+++ b/tinygrad/viz/serve.py
@@ -368,14 +368,12 @@ def unpack_sqtt(key:tuple[str, int], data:list, p:ProfileProgramEvent) -> tuple[
       events.append(ProfileRangeEvent(f"SIMD:{occ.simd}", f"OCC WAVE:{occ.wave_id} N:{next(units[u])}", Decimal(wave_start.pop(u)),Decimal(occ.time)))
   return cu_events, list(units), wave_insts
 
-def device_sort_fn(k:str) -> tuple[int, str, int]:
-  order = {"GC": 0, "USER": 1, "TINY": 2, "DISK": 999}
-  dname, *rest = k.split()
-  dev_rank = next((v for k,v in order.items() if dname.startswith(k)), len(order))
-  if len(parts:=dname.split(":")) < 2 or not parts[1].isdigit(): parts.insert(1, "0")
-  eng_rank = 2 if rest else 1 if len(parts) > 2 else 0
-  # 3 levels of hierarchy: device class, index in multi device, engine within device
-  return (dev_rank, parts[1], eng_rank)
+def device_sort_fn(k:str) -> tuple:
+  special = {"GC": 0, "USER": 1, "TINY": 2, "ALLDEVS":100, "DISK": 999}
+  is_memory = k.endswith(" Memory")
+  p = k.split(" ")[0].split(":")
+  dev_base = p[0] if len(p) < 2 or not p[1].isdigit() else f"{p[0]}:{p[1]}"
+  return (is_memory, special.get(p[0], special['ALLDEVS']), dev_base, k)
 
 def get_profile(profile:list[ProfileEvent], sort_fn:Callable[[str], Any]=device_sort_fn) -> bytes|None:
   # start by getting the time diffs