rename lazydata to uop (#10698)

This commit is contained in:
George Hotz
2025-06-08 08:42:22 -07:00
committed by GitHub
parent 8e3f337075
commit 32e9949052
57 changed files with 485 additions and 486 deletions

View File

@@ -240,7 +240,6 @@ class LLaMa:
#elif k.endswith('.weight'): v.shard_(device, axis=-1)
#elif 'norm.' in k: v.shard_(device, axis=-1)
else: v.shard_(device, axis=None)
-#print(k, v.shape, v.lazydata.axis)
# replace weights in model
load_state_dict(model, weights, strict=False, consume=True)
@@ -446,7 +445,7 @@ After you are done speaking, output [EOS]. You are not Chad.
print(f"using LLaMA{LLAMA_SUFFIX}-{args.size} model")
device = tuple(f"{Device.DEFAULT}:{i}" for i in range(args.shard)) if args.shard > 1 else Device.DEFAULT
llama = LLaMa.build(MODEL_PATH, TOKENIZER_PATH, model_gen=args.gen, model_size=args.size, quantize=args.quantize, device=device)
-param_bytes = sum(x.lazydata.size * x.dtype.itemsize for x in get_parameters(llama.model))
+param_bytes = sum(x.uop.size * x.dtype.itemsize for x in get_parameters(llama.model))
outputted = pre_prompt if chatbot else args.prompt
start_pos, toks = 0, [llama.tokenizer.bos_id()] + llama.tokenizer.encode(outputted)

View File

@@ -284,7 +284,7 @@ if __name__ == "__main__":
device = tuple(f"{Device.DEFAULT}:{i}" for i in range(args.shard)) if args.shard > 1 else Device.DEFAULT
model = build_transformer(args.model, model_size=args.size, quantize=args.quantize, device=device)
-param_bytes = sum(x.lazydata.size * x.dtype.itemsize for x in get_parameters(model))
+param_bytes = sum(x.uop.size * x.dtype.itemsize for x in get_parameters(model))
if not args.no_api and not args.benchmark:
from bottle import Bottle, request, response, HTTPResponse, abort, static_file

View File

@@ -16,7 +16,7 @@ if __name__ == "__main__":
#model.load_pretrained()
for p in nn.state.get_parameters(model): p.replace(Tensor.empty(p.shape, dtype=p.dtype)) # fake load pretrained
-#early_sched = create_schedule([x.lazydata for x in nn.state.get_parameters(model)])
+#early_sched = create_schedule([x.uop for x in nn.state.get_parameters(model)])
#print(f"built model {len(early_sched)}")
#B, T = Variable("B", 1, 128).bind(4), 64 #Variable("T", 1, 1024).bind(64)
@@ -56,7 +56,7 @@ if __name__ == "__main__":
state_dict.update({'X': X, 'Y': Y, 'loss': loss})
grad_state_dict = {}
for k,v in state_dict.items():
-if v.lazydata.base.buffer not in used_buffers: print(f"UNUSED: {k}")
+if v.uop.base.buffer not in used_buffers: print(f"UNUSED: {k}")
if v.grad is not None: grad_state_dict['grad_'+k] = v.grad
state_dict.update(grad_state_dict)
state_dict.update({'adam_b1_t': optimizer.b1_t, 'adam_b2_t': optimizer.b2_t, 'adam_lr': optimizer.lr})
@@ -65,7 +65,7 @@ if __name__ == "__main__":
nm = inverse_state_dict[p]
state_dict["adam_m_"+nm] = m
state_dict["adam_v_"+nm] = v
-named_buffers = {v.lazydata.base.buffer:k.replace(".", "_") for k,v in state_dict.items()}
+named_buffers = {v.uop.base.buffer:k.replace(".", "_") for k,v in state_dict.items()}
c_code = ["#include <stdlib.h>", "#include <tgmath.h>", "#include <stdbool.h>"]
if TIMING: c_code += ["#include <stdio.h>", "#include <time.h>"]

View File

@@ -71,7 +71,7 @@ def loader_process(q_in, q_out, X:Tensor, seed):
#storage_tensor._copyin(img_tensor.numpy())
# faster
-X[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = img.tobytes()
+X[idx].contiguous().realize().uop.base.realized.as_buffer(force_zero_copy=True)[:] = img.tobytes()
# ideal
#X[idx].assign(img.tobytes()) # NOTE: this is slow!
@@ -262,8 +262,8 @@ def load_unet3d_data(preprocessed_dataset_dir, seed, queue_in, queue_out, X:Tens
x = random_brightness_augmentation(x)
x = gaussian_noise(x)
-X[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = x.tobytes()
-Y[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = y.tobytes()
+X[idx].contiguous().realize().uop.base.realized.as_buffer(force_zero_copy=True)[:] = x.tobytes()
+Y[idx].contiguous().realize().uop.base.realized.as_buffer(force_zero_copy=True)[:] = y.tobytes()
queue_out.put(idx)
queue_out.put(None)
@@ -377,12 +377,12 @@ def load_retinanet_data(base_dir:Path, val:bool, queue_in:Queue, queue_out:Queue
clipped_match_idxs = np.clip(match_idxs, 0, None)
clipped_boxes, clipped_labels = tgt["boxes"][clipped_match_idxs], tgt["labels"][clipped_match_idxs]
-boxes[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = clipped_boxes.tobytes()
-labels[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = clipped_labels.tobytes()
-matches[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = match_idxs.tobytes()
-anchors[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = anchor.tobytes()
+boxes[idx].contiguous().realize().uop.base.realized.as_buffer(force_zero_copy=True)[:] = clipped_boxes.tobytes()
+labels[idx].contiguous().realize().uop.base.realized.as_buffer(force_zero_copy=True)[:] = clipped_labels.tobytes()
+matches[idx].contiguous().realize().uop.base.realized.as_buffer(force_zero_copy=True)[:] = match_idxs.tobytes()
+anchors[idx].contiguous().realize().uop.base.realized.as_buffer(force_zero_copy=True)[:] = anchor.tobytes()
-imgs[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = img.tobytes()
+imgs[idx].contiguous().realize().uop.base.realized.as_buffer(force_zero_copy=True)[:] = img.tobytes()
queue_out.put(idx)
queue_out.put(None)

View File

@@ -19,8 +19,8 @@ if __name__ == "__main__":
inputs = run_onnx.get_empty_input_data("npy")
out: Tensor = next(iter(run_onnx({k:v.to(None) for k,v in inputs.items()}).values())).to('cpu')
-root = out.lazydata
-targets = [x.lazydata for x in inputs.values()]
+root = out.uop
+targets = [x.uop for x in inputs.values()]
print(targets)
# TODO: abstract this from gradient?
@@ -42,7 +42,7 @@ if __name__ == "__main__":
print("**** real ****")
GlobalCounters.reset()
-out.lazydata = root.substitute(kernelized).substitute(becomes_map)
+out.uop = root.substitute(kernelized).substitute(becomes_map)
out.kernelize()
# realize

View File

@@ -66,7 +66,7 @@ if __name__ == "__main__":
model_path = Path(args.weights) if args.weights else download_weights(model_info["total_num_weights"])
transformer = load_model(model_path, model_info["model_params"])
tokenizer = AutoTokenizer.from_pretrained(model_info["tokenizer"])
-param_bytes = sum(x.lazydata.size * x.dtype.itemsize for x in get_parameters(transformer))
+param_bytes = sum(x.uop.size * x.dtype.itemsize for x in get_parameters(transformer))
outputted = args.prompt
start_pos, toks = 0, tokenizer(outputted)["input_ids"]

View File

@@ -13,8 +13,8 @@ def prepare_browser_chunks(model):
chunk_size = 16 * 1024 * 1024 # small chunks based on iphone browser constraints
metadata = {}
# We won't export cache_kv bytes (because we start inference on client at start_pos=0), but we will tell the client how big cache_kv needs to be
-t_infos = [(v.lazydata.base.realized.nbytes, k, v.dtype) for k,v in state_dict.items() if "cache_kv" not in k]
-empty_t_infos = [(v.lazydata.base.realized.nbytes, k, v.dtype) for k,v in state_dict.items() if "cache_kv" in k]
+t_infos = [(v.uop.base.realized.nbytes, k, v.dtype) for k,v in state_dict.items() if "cache_kv" not in k]
+empty_t_infos = [(v.uop.base.realized.nbytes, k, v.dtype) for k,v in state_dict.items() if "cache_kv" in k]
split_t_infos = []
for size, name, dtype in t_infos:
@@ -48,7 +48,7 @@ def prepare_browser_chunks(model):
weight_metadata = metadata.get(name, default)
weight_metadata["parts"][part_num] = {"file": i, "file_start_pos": cursor, "size": size}
metadata[name] = weight_metadata
-data = bytes(state_dict[name].lazydata.base.realized.as_buffer())
+data = bytes(state_dict[name].uop.base.realized.as_buffer())
data = data if not offsets else data[offsets[0]:offsets[1]]
writer.write(data)
cursor += size

View File

@@ -114,7 +114,7 @@ if __name__ == "__main__":
run, special_names = jit_model(step, *step.input)
functions, statements, bufs, _ = compile_net(run, special_names)
state = get_state_dict(model)
-weights = {id(x.lazydata.base.realized): name for name, x in state.items()}
+weights = {id(x.uop.base.realized): name for name, x in state.items()}
kernel_code = '\n\n'.join([f"const {key} = `{fixup_code(code, key)}`;" for key, code in functions.items()])
kernel_names = ', '.join([name for (name, _, _, _) in statements])
input_names = [name for _,name in special_names.items() if "input" in name]