rename lazydata to uop (#10698)

2026-01-10 07:28:15 -05:00 · 2025-06-08 08:42:22 -07:00
parent 8e3f337075
commit 32e9949052
57 changed files with 485 additions and 486 deletions
--- a/examples/llama.py
+++ b/examples/llama.py
@@ -240,7 +240,6 @@ class LLaMa:
            #elif k.endswith('.weight'): v.shard_(device, axis=-1)
            #elif 'norm.' in k: v.shard_(device, axis=-1)
            else: v.shard_(device, axis=None)
-            #print(k, v.shape, v.lazydata.axis)

        # replace weights in model
        load_state_dict(model, weights, strict=False, consume=True)
@@ -446,7 +445,7 @@ After you are done speaking, output [EOS]. You are not Chad.
  print(f"using LLaMA{LLAMA_SUFFIX}-{args.size} model")
  device = tuple(f"{Device.DEFAULT}:{i}" for i in range(args.shard)) if args.shard > 1 else Device.DEFAULT
  llama = LLaMa.build(MODEL_PATH, TOKENIZER_PATH, model_gen=args.gen, model_size=args.size, quantize=args.quantize, device=device)
-  param_bytes = sum(x.lazydata.size * x.dtype.itemsize for x in get_parameters(llama.model))
+  param_bytes = sum(x.uop.size * x.dtype.itemsize for x in get_parameters(llama.model))

  outputted = pre_prompt if chatbot else args.prompt
  start_pos, toks = 0, [llama.tokenizer.bos_id()] + llama.tokenizer.encode(outputted)
--- a/examples/llama3.py
+++ b/examples/llama3.py
@@ -284,7 +284,7 @@ if __name__ == "__main__":

  device = tuple(f"{Device.DEFAULT}:{i}" for i in range(args.shard)) if args.shard > 1 else Device.DEFAULT
  model = build_transformer(args.model, model_size=args.size, quantize=args.quantize, device=device)
-  param_bytes = sum(x.lazydata.size * x.dtype.itemsize for x in get_parameters(model))
+  param_bytes = sum(x.uop.size * x.dtype.itemsize for x in get_parameters(model))

  if not args.no_api and not args.benchmark:
    from bottle import Bottle, request, response, HTTPResponse, abort, static_file
--- a/examples/llm.c/export.py
+++ b/examples/llm.c/export.py
@@ -16,7 +16,7 @@ if __name__ == "__main__":
  #model.load_pretrained()
  for p in nn.state.get_parameters(model): p.replace(Tensor.empty(p.shape, dtype=p.dtype)) # fake load pretrained

-  #early_sched = create_schedule([x.lazydata for x in nn.state.get_parameters(model)])
+  #early_sched = create_schedule([x.uop for x in nn.state.get_parameters(model)])
  #print(f"built model {len(early_sched)}")

  #B, T = Variable("B", 1, 128).bind(4), 64 #Variable("T", 1, 1024).bind(64)
@@ -56,7 +56,7 @@ if __name__ == "__main__":
  state_dict.update({'X': X, 'Y': Y, 'loss': loss})
  grad_state_dict = {}
  for k,v in state_dict.items():
-    if v.lazydata.base.buffer not in used_buffers: print(f"UNUSED: {k}")
+    if v.uop.base.buffer not in used_buffers: print(f"UNUSED: {k}")
    if v.grad is not None: grad_state_dict['grad_'+k] = v.grad
  state_dict.update(grad_state_dict)
  state_dict.update({'adam_b1_t': optimizer.b1_t, 'adam_b2_t': optimizer.b2_t, 'adam_lr': optimizer.lr})
@@ -65,7 +65,7 @@ if __name__ == "__main__":
    nm = inverse_state_dict[p]
    state_dict["adam_m_"+nm] = m
    state_dict["adam_v_"+nm] = v
-  named_buffers = {v.lazydata.base.buffer:k.replace(".", "_") for k,v in state_dict.items()}
+  named_buffers = {v.uop.base.buffer:k.replace(".", "_") for k,v in state_dict.items()}

  c_code = ["#include <stdlib.h>", "#include <tgmath.h>", "#include <stdbool.h>"]
  if TIMING: c_code += ["#include <stdio.h>", "#include <time.h>"]
--- a/examples/mlperf/dataloader.py
+++ b/examples/mlperf/dataloader.py
@@ -71,7 +71,7 @@ def loader_process(q_in, q_out, X:Tensor, seed):
      #storage_tensor._copyin(img_tensor.numpy())

      # faster
-      X[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = img.tobytes()
+      X[idx].contiguous().realize().uop.base.realized.as_buffer(force_zero_copy=True)[:] = img.tobytes()

      # ideal
      #X[idx].assign(img.tobytes())   # NOTE: this is slow!
@@ -262,8 +262,8 @@ def load_unet3d_data(preprocessed_dataset_dir, seed, queue_in, queue_out, X:Tens
      x = random_brightness_augmentation(x)
      x = gaussian_noise(x)

-    X[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = x.tobytes()
-    Y[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = y.tobytes()
+    X[idx].contiguous().realize().uop.base.realized.as_buffer(force_zero_copy=True)[:] = x.tobytes()
+    Y[idx].contiguous().realize().uop.base.realized.as_buffer(force_zero_copy=True)[:] = y.tobytes()

    queue_out.put(idx)
  queue_out.put(None)
@@ -377,12 +377,12 @@ def load_retinanet_data(base_dir:Path, val:bool, queue_in:Queue, queue_out:Queue
      clipped_match_idxs = np.clip(match_idxs, 0, None)
      clipped_boxes, clipped_labels = tgt["boxes"][clipped_match_idxs], tgt["labels"][clipped_match_idxs]

-      boxes[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = clipped_boxes.tobytes()
-      labels[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = clipped_labels.tobytes()
-      matches[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = match_idxs.tobytes()
-      anchors[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = anchor.tobytes()
+      boxes[idx].contiguous().realize().uop.base.realized.as_buffer(force_zero_copy=True)[:] = clipped_boxes.tobytes()
+      labels[idx].contiguous().realize().uop.base.realized.as_buffer(force_zero_copy=True)[:] = clipped_labels.tobytes()
+      matches[idx].contiguous().realize().uop.base.realized.as_buffer(force_zero_copy=True)[:] = match_idxs.tobytes()
+      anchors[idx].contiguous().realize().uop.base.realized.as_buffer(force_zero_copy=True)[:] = anchor.tobytes()

-    imgs[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = img.tobytes()
+    imgs[idx].contiguous().realize().uop.base.realized.as_buffer(force_zero_copy=True)[:] = img.tobytes()

    queue_out.put(idx)
  queue_out.put(None)
--- a/examples/openpilot/compile4.py
+++ b/examples/openpilot/compile4.py
@@ -19,8 +19,8 @@ if __name__ == "__main__":

  inputs = run_onnx.get_empty_input_data("npy")
  out: Tensor = next(iter(run_onnx({k:v.to(None) for k,v in inputs.items()}).values())).to('cpu')
-  root = out.lazydata
-  targets = [x.lazydata for x in inputs.values()]
+  root = out.uop
+  targets = [x.uop for x in inputs.values()]
  print(targets)

  # TODO: abstract this from gradient?
@@ -42,7 +42,7 @@ if __name__ == "__main__":

  print("**** real ****")
  GlobalCounters.reset()
-  out.lazydata = root.substitute(kernelized).substitute(becomes_map)
+  out.uop = root.substitute(kernelized).substitute(becomes_map)
  out.kernelize()

  # realize
--- a/examples/qwq.py
+++ b/examples/qwq.py
@@ -66,7 +66,7 @@ if __name__ == "__main__":
  model_path = Path(args.weights) if args.weights else download_weights(model_info["total_num_weights"])
  transformer = load_model(model_path, model_info["model_params"])
  tokenizer = AutoTokenizer.from_pretrained(model_info["tokenizer"])
-  param_bytes = sum(x.lazydata.size * x.dtype.itemsize for x in get_parameters(transformer))
+  param_bytes = sum(x.uop.size * x.dtype.itemsize for x in get_parameters(transformer))

  outputted = args.prompt
  start_pos, toks = 0, tokenizer(outputted)["input_ids"]
--- a/examples/tinychat/tinychat-browser/compile.py
+++ b/examples/tinychat/tinychat-browser/compile.py
@@ -13,8 +13,8 @@ def prepare_browser_chunks(model):
  chunk_size = 16 * 1024 * 1024 # small chunks based on iphone browser constraints
  metadata = {}
  # We won't export cache_kv bytes (because we start inference on client at start_pos=0), but we will tell the client how big cache_kv needs to be
-  t_infos = [(v.lazydata.base.realized.nbytes, k, v.dtype) for k,v in state_dict.items() if "cache_kv" not in k]
-  empty_t_infos = [(v.lazydata.base.realized.nbytes, k, v.dtype) for k,v in state_dict.items() if "cache_kv" in k]
+  t_infos = [(v.uop.base.realized.nbytes, k, v.dtype) for k,v in state_dict.items() if "cache_kv" not in k]
+  empty_t_infos = [(v.uop.base.realized.nbytes, k, v.dtype) for k,v in state_dict.items() if "cache_kv" in k]

  split_t_infos = []
  for size, name, dtype in t_infos:
@@ -48,7 +48,7 @@ def prepare_browser_chunks(model):
        weight_metadata = metadata.get(name, default)
        weight_metadata["parts"][part_num] = {"file": i, "file_start_pos": cursor, "size": size}
        metadata[name] = weight_metadata
-        data = bytes(state_dict[name].lazydata.base.realized.as_buffer())
+        data = bytes(state_dict[name].uop.base.realized.as_buffer())
        data = data if not offsets else data[offsets[0]:offsets[1]]
        writer.write(data)
        cursor += size
--- a/examples/webgpu/stable_diffusion/compile.py
+++ b/examples/webgpu/stable_diffusion/compile.py
@@ -114,7 +114,7 @@ if __name__ == "__main__":
    run, special_names = jit_model(step, *step.input)
    functions, statements, bufs, _ = compile_net(run, special_names)
    state = get_state_dict(model)
-    weights = {id(x.lazydata.base.realized): name for name, x in state.items()}
+    weights = {id(x.uop.base.realized): name for name, x in state.items()}
    kernel_code = '\n\n'.join([f"const {key} = `{fixup_code(code, key)}`;" for key, code in functions.items()])
    kernel_names = ', '.join([name for (name, _, _, _) in statements])
    input_names = [name for _,name in special_names.items() if "input" in name]