diff --git a/test/test_profiler.py b/test/test_profiler.py index 71755d2afd..b307c45ac4 100644 --- a/test/test_profiler.py +++ b/test/test_profiler.py @@ -10,15 +10,15 @@ MOCKGPU = getenv("MOCKGPU") @contextlib.contextmanager def helper_collect_profile(*devs): - Compiled.profile_events = [] + for dev in devs: dev.synchronize() + Compiled.profile_events = [x for x in Compiled.profile_events if isinstance(x, ProfileDeviceEvent) and x.device.startswith("METAL")] profile_list = [] with Context(PROFILE=1): - try: yield profile_list - finally: - for dev in devs: dev.synchronize() - for dev in devs: dev._at_profile_finalize() - for x in Compiled.profile_events: profile_list.append(x) + yield profile_list + for dev in devs: dev.synchronize() + for dev in devs: dev._at_profile_finalize() + for x in Compiled.profile_events: profile_list.append(x) def helper_profile_filter_device(profile, device:str): assert any(getattr(x, "device", None) == device and isinstance(x, ProfileDeviceEvent) for x in profile), f"device {device} is not registred" @@ -26,7 +26,7 @@ def helper_profile_filter_device(profile, device:str): assert len(dev_events) == 1, "only one device registration event is expected" return [x for x in profile if getattr(x, "device", None) == device], dev_events[0] -@unittest.skipUnless(issubclass(type(Device[Device.DEFAULT]), HCQCompiled), "HCQ device required to run") +@unittest.skipUnless(issubclass(type(Device[Device.DEFAULT]), HCQCompiled) or Device.DEFAULT in {"METAL"}, "HCQ device required to run") class TestProfiler(unittest.TestCase): @classmethod def setUpClass(self): @@ -39,9 +39,6 @@ class TestProfiler(unittest.TestCase): TestProfiler.runner = get_runner(TestProfiler.d0.device, si.ast) TestProfiler.b.lazydata.buffer.allocate() - TestProfiler.kernargs_ba_ptr = TestProfiler.runner._prg.fill_kernargs([TestProfiler.b.lazydata.buffer._buf, TestProfiler.a.lazydata.buffer._buf]) - TestProfiler.kernargs_ab_ptr = 
TestProfiler.runner._prg.fill_kernargs([TestProfiler.a.lazydata.buffer._buf, TestProfiler.b.lazydata.buffer._buf]) - def test_profile_kernel_run(self): runner_name = TestProfiler.runner._prg.name with helper_collect_profile(TestProfiler.d0) as profile: @@ -71,12 +68,12 @@ class TestProfiler(unittest.TestCase): with helper_collect_profile(TestProfiler.d0) as profile: buf1.copyin(memoryview(bytearray(struct.pack("ff", 0, 1)))) TestProfiler.runner([buf1, TestProfiler.a.lazydata.buffer], var_vals={}) - buf1.as_buffer() + buf1.copyout(memoryview(bytearray(buf1.nbytes))) profile, _ = helper_profile_filter_device(profile, TestProfiler.d0.device) evs = [x for x in profile if isinstance(x, ProfileRangeEvent)] - assert len(evs) == 3, "two kernel runs are expected" + assert len(evs) == 3, "3 kernel runs are expected" assert evs[0].is_copy, "kernel should be copy" assert evs[1].name == runner_name, "kernel name is not correct" assert not evs[1].is_copy, "kernel should not be copy" @@ -102,7 +99,7 @@ class TestProfiler(unittest.TestCase): assert len(evs) == 1, "one kernel runs are expected" assert evs[0].is_copy, "kernel should be copy" - @unittest.skipIf(MOCKGPU and Device.DEFAULT == "AMD", "AMD mockgpu with indirect buffers does not support queue wait interrupts") + @unittest.skipIf(Device.DEFAULT in {"METAL"} or (MOCKGPU and Device.DEFAULT == "AMD"), "METAL or AMD mockgpu does not support queue wait interrupts") def test_profile_graph(self): d1 = Device[f"{Device.DEFAULT}:1"] @@ -124,7 +121,7 @@ class TestProfiler(unittest.TestCase): assert len(graph_evs) == 1, "one graph event is expected" assert len(graph_evs[0].ents) == 2, "two entities are expected" - @unittest.skipIf(CI, "skip CI") + @unittest.skipIf(CI or not issubclass(type(Device[Device.DEFAULT]), HCQCompiled), "skip CI") def test_dev_jitter_matrix(self): dev_cnt = 6 devs = [Device[f"{Device.DEFAULT}:{i}"] for i in range(dev_cnt)] diff --git a/tinygrad/device.py b/tinygrad/device.py index 56d51fccff..99e52831dc 100644 ---
a/tinygrad/device.py +++ b/tinygrad/device.py @@ -1,8 +1,8 @@ from __future__ import annotations from dataclasses import dataclass, replace from collections import defaultdict -from typing import Optional, Dict, Tuple, Any, Iterator, List, Set -import multiprocessing, importlib, inspect, functools, pathlib, os, ctypes, contextlib, sys, re, atexit, pickle, decimal +from typing import Optional, Dict, Tuple, Any, Iterator, List, Set, Generator +import multiprocessing, importlib, inspect, functools, pathlib, os, ctypes, contextlib, sys, re, atexit, pickle, decimal, time from tinygrad.helpers import CI, OSX, getenv, diskcache_get, diskcache_put, DEBUG, GlobalCounters, flat_mv, from_mv, PROFILE, temp from tinygrad.dtype import DType, ImageDType, PtrDType, dtypes from tinygrad.renderer import Renderer @@ -61,6 +61,16 @@ class ProfileGraphEntry: device:str; name:str; st_id:int; en_id:int; is_copy:boo @dataclass(frozen=True) class ProfileGraphEvent(ProfileEvent): ents:List[ProfileGraphEntry]; deps:List[List[int]]; sigs:List[decimal.Decimal] # noqa: E702 +@dataclass +class ProfileResult: st:Optional[int]=None; en:Optional[int]=None # noqa: E702 + +@contextlib.contextmanager +def cpu_profile(name, device="CPU", is_copy=False, display=True) -> Generator[ProfileResult, None, None]: + yield (res:=ProfileResult(st:=time.perf_counter_ns())) + res.en = en = time.perf_counter_ns() + if PROFILE and display: + Compiled.profile_events += [ProfileRangeEvent(device, name, decimal.Decimal(st) / 1000, decimal.Decimal(en) / 1000, is_copy=is_copy)] + # **************** Buffer + Allocators **************** @@ -221,7 +231,7 @@ class Compiler: def disassemble(self, lib:bytes): pass class Compiled: - profile_events:List[ProfileEvent] = [] + profile_events:List[ProfileEvent] = [ProfileDeviceEvent("CPU")] # NOTE: CPU is the default device. 
def __init__(self, device:str, allocator:Allocator, renderer:Optional[Renderer], compiler:Optional[Compiler], runtime, graph=None): self.device, self.allocator, self.compiler, self.runtime, self.graph = device, allocator, compiler or Compiler(), runtime, graph diff --git a/tinygrad/runtime/graph/metal.py b/tinygrad/runtime/graph/metal.py index 84c5f7d1dc..62afcaf439 100644 --- a/tinygrad/runtime/graph/metal.py +++ b/tinygrad/runtime/graph/metal.py @@ -7,7 +7,7 @@ from tinygrad.engine.realize import ExecItem, CompiledRunner from tinygrad.engine.jit import GraphRunner, GraphException from tinygrad.ops import Variable from tinygrad.runtime.ops_metal import wait_check, msg, libobjc, to_struct, objc_instance,\ - MTLResourceOptions, elapsed_time, objc_id + MTLResourceOptions, cmdbuf_st_time, cmdbuf_en_time, objc_id, to_ns_str class MTLIndirectCommandType: MTLIndirectCommandTypeConcurrentDispatch = (1 << 5) @@ -90,11 +90,12 @@ class MetalGraph(GraphRunner): msg(encoder, "executeCommandsInBuffer:withRange:", self.icb, self.range) msg(encoder, "endEncoding") + msg(command_buffer, "setLabel:", to_ns_str(f"batched {len(self.jit_cache)}")) msg(command_buffer, "commit") self.command_buffer = command_buffer + self.dev.mtl_buffers_in_flight.append(command_buffer) if wait: wait_check(command_buffer) - return elapsed_time(command_buffer) - self.dev.mtl_buffers_in_flight.append(command_buffer) + return cmdbuf_en_time(command_buffer) - cmdbuf_st_time(command_buffer) return None diff --git a/tinygrad/runtime/ops_metal.py b/tinygrad/runtime/ops_metal.py index 82b50c929d..69d96a20bd 100644 --- a/tinygrad/runtime/ops_metal.py +++ b/tinygrad/runtime/ops_metal.py @@ -1,8 +1,8 @@ from __future__ import annotations -import os, pathlib, struct, ctypes, tempfile, functools +import os, pathlib, struct, ctypes, tempfile, functools, decimal from typing import List, Any, Union, Tuple, cast -from tinygrad.helpers import prod, to_mv, getenv, round_up, cache_dir, T, init_c_struct_t -from 
tinygrad.device import Compiled, Compiler, CompileError, LRUAllocator +from tinygrad.helpers import prod, to_mv, getenv, round_up, cache_dir, T, init_c_struct_t, PROFILE +from tinygrad.device import Compiled, Compiler, CompileError, LRUAllocator, cpu_profile, ProfileDeviceEvent, ProfileRangeEvent from tinygrad.renderer.cstyle import MetalRenderer class objc_id(ctypes.c_void_p): # This prevents ctypes from converting response to plain int, and dict.fromkeys() can use it to dedup @@ -44,6 +44,7 @@ def msg(ptr: objc_id, selector: str, /, *args: Any, restype: type[T] = objc_id) return sender(ptr, sel(selector), *args) def to_ns_str(s: str): return msg(libobjc.objc_getClass(b"NSString"), "stringWithUTF8String:", s.encode(), restype=objc_instance) +def from_ns_str(s): return bytes(msg(s, "UTF8String", restype=ctypes.c_char_p)).decode() def to_struct(*t: int, _type: type = ctypes.c_ulong): return init_c_struct_t(tuple([(f"field{i}", _type) for i in range(len(t))]))(*t) @@ -51,12 +52,13 @@ def wait_check(cbuf: Any): msg(cbuf, "waitUntilCompleted") error_check(msg(cbuf, "error", restype=objc_instance)) -def elapsed_time(cbuf: objc_id): - return cast(float, msg(cbuf, "GPUEndTime", restype=ctypes.c_double)) - cast(float, msg(cbuf, "GPUStartTime", restype=ctypes.c_double)) +def cmdbuf_label(cbuf: objc_id) -> str: return from_ns_str(msg(cbuf, "label", restype=objc_id)) +def cmdbuf_st_time(cbuf: objc_id) -> float: return cast(float, msg(cbuf, "GPUStartTime", restype=ctypes.c_double)) +def cmdbuf_en_time(cbuf: objc_id) -> float: return cast(float, msg(cbuf, "GPUEndTime", restype=ctypes.c_double)) def error_check(error: objc_instance, error_constructor: type[Exception] = RuntimeError): if error.value is None: return None - raise error_constructor(bytes(msg(msg(error, "localizedDescription", restype=objc_instance), "UTF8String", restype=ctypes.c_char_p)).decode()) + raise error_constructor(from_ns_str(msg(error, "localizedDescription", restype=objc_instance))) def 
metal_src_to_library(device:MetalDevice, src:str) -> objc_instance: options = msg(libobjc.objc_getClass(b"MTLCompileOptions"), "new", restype=objc_instance) @@ -136,11 +138,12 @@ class MetalProgram: for i,a in enumerate(vals, start=len(bufs)): msg(encoder, "setBytes:length:atIndex:", bytes(ctypes.c_int(a)), 4, i) msg(encoder, "dispatchThreadgroups:threadsPerThreadgroup:", to_struct(*global_size), to_struct(*local_size)) msg(encoder, "endEncoding") + msg(command_buffer, "setLabel:", to_ns_str(self.name)) msg(command_buffer, "commit") + self.dev.mtl_buffers_in_flight.append(command_buffer) if wait: wait_check(command_buffer) - return elapsed_time(command_buffer) - self.dev.mtl_buffers_in_flight.append(command_buffer) + return cmdbuf_en_time(command_buffer) - cmdbuf_st_time(command_buffer) class MetalBuffer: def __init__(self, buf:Any, size:int, offset=0): self.buf, self.size, self.offset = buf, size, offset @@ -172,11 +175,13 @@ class MetalAllocator(LRUAllocator): src_dev.timeline_value += 1 msg(src_command_buffer, "commit") src_dev.mtl_buffers_in_flight.append(src_command_buffer) + def _cp_mv(self, dst, src, prof_desc): + with cpu_profile(prof_desc, self.dev.device, is_copy=True): dst[:] = src def _as_buffer(self, src:MetalBuffer) -> memoryview: self.dev.synchronize() return to_mv(cast(int, msg(src.buf, "contents", restype=objc_id).value), src.size + src.offset)[src.offset:] - def _copyin(self, dest:MetalBuffer, src:memoryview): self._as_buffer(dest)[:] = src - def _copyout(self, dest:memoryview, src:MetalBuffer): dest[:] = self._as_buffer(src) + def _copyin(self, dest:MetalBuffer, src:memoryview): self._cp_mv(self._as_buffer(dest), src, "CPU -> METAL") + def _copyout(self, dest:memoryview, src:MetalBuffer): self._cp_mv(dest, self._as_buffer(src), "METAL -> CPU") def _offset(self, buf:MetalBuffer, size:int, offset:int): return MetalBuffer(buf.buf, size, offset) class MetalDevice(Compiled): @@ -185,14 +190,18 @@ class MetalDevice(Compiled): self.mtl_queue = 
msg(self.sysdevice, "newCommandQueueWithMaxCommandBufferCount:", 1024, restype=objc_instance) if self.mtl_queue is None: raise RuntimeError("Cannot allocate a new command queue") self.mtl_buffers_in_flight: List[Any] = [] - self.mv_in_metal: List[memoryview] = [] self.timeline_signal = msg(self.sysdevice, "newSharedEvent", restype=objc_instance) self.timeline_value = 0 + Compiled.profile_events += [ProfileDeviceEvent(device)] + from tinygrad.runtime.graph.metal import MetalGraph super().__init__(device, MetalAllocator(self), MetalRenderer(), MetalCompiler() if getenv("METAL_DIRECT", 1) else Compiler(), functools.partial(MetalProgram, self), MetalGraph) + def synchronize(self): - for cbuf in self.mtl_buffers_in_flight: wait_check(cbuf) - self.mv_in_metal.clear() + for cbuf in self.mtl_buffers_in_flight: + wait_check(cbuf) + st, en = decimal.Decimal(cmdbuf_st_time(cbuf)) * 1000000, decimal.Decimal(cmdbuf_en_time(cbuf)) * 1000000 + if PROFILE: Compiled.profile_events += [ProfileRangeEvent(self.device, cmdbuf_label(cbuf), st, en, is_copy=False)] self.mtl_buffers_in_flight.clear() diff --git a/tinygrad/viz/README b/tinygrad/viz/README index 95f503e6bd..4565529c67 100644 --- a/tinygrad/viz/README +++ b/tinygrad/viz/README @@ -8,16 +8,17 @@ https://tiny-tools-client.vercel.app/ and a viewer for: SAVE_SCHEDULE=1 TRACK_MATCH_STATS=2 -PROFILE=1 (eventually) +PROFILE=1 to use: -1. Run tinygrad with VIZ=1 (this saves the pkls and launches the server (new process please!)) +1. Run tinygrad with VIZ=1 and/or PROFILE=1 (this saves the pkls and launches the server (new process please!)) 2. That's it! This should be able to: -1. See all schedules -2. See all graphs and how they were rewritten -3. See generated code +1. See all schedules (VIZ=1) +2. See all graphs and how they were rewritten (VIZ=1) +3. See generated code (VIZ=1) +4. See profile (PROFILE=1) bunch of dev rules: * everything must be responsive to keyboard smashing! lag should never happen