From fa57c3e7ce88629f8afe5ab67ea24ec311f29fef Mon Sep 17 00:00:00 2001
From: George Hotz <72895+geohot@users.noreply.github.com>
Date: Thu, 18 Apr 2024 10:57:54 +0400
Subject: [PATCH] continue llm.c (#4190)

* continue llm.c

* export more

* progress on llm.c

* simpler optim, names work
---
 examples/llm.c/export.py   | 53 ++++++++++++++++++++++++++++++++++++++
 tinygrad/engine/realize.py |  3 ++-
 tinygrad/nn/optim.py       |  8 +++---
 3 files changed, 58 insertions(+), 6 deletions(-)
 create mode 100755 examples/llm.c/export.py

diff --git a/examples/llm.c/export.py b/examples/llm.c/export.py
new file mode 100755
index 0000000000..6ef4371e76
--- /dev/null
+++ b/examples/llm.c/export.py
@@ -0,0 +1,53 @@
+#!/usr/bin/env python3
+import os
+#os.environ["NOOPT"] = "1"
+from tinygrad import Device, nn, Tensor, dtypes
+#Device.DEFAULT = "CLANG"
+from train_gpt2 import GPT, GPTConfig
+from tinygrad.helpers import dedup, to_function_name, flatten
+from tinygrad.engine.schedule import create_schedule
+from tinygrad.engine.realize import memory_planner, run_schedule
+from tinygrad.ops import BufferOps, LoadOps
+from tinygrad.runtime.ops_clang import CLANG_PROGRAM_HEADER
+
+if __name__ == "__main__":
+  model = GPT(GPTConfig(n_layer=12, n_head=12, n_embd=768))
+  #model.load_pretrained()
+  seen = set()
+  early_sched = create_schedule([x.lazydata for x in nn.state.get_parameters(model)], seen)
+  print(f"built model {len(early_sched)}")
+
+  optimizer = nn.optim.Adam(nn.state.get_parameters(model), lr=1e-4)
+  for i in range(3): # TODO: why does it take three and not two to stabilize
+    x = Tensor.empty(4, 64, dtype=dtypes.int)
+    y = Tensor.empty(4, 64, dtype=dtypes.int)
+    _, loss = model(x, y)
+    optimizer.zero_grad()
+    loss.backward()
+    tensors = optimizer.schedule_step()
+    sched = create_schedule([loss.lazydata] + [x.lazydata for x in optimizer.params+optimizer.buffers+tensors], seen)
+    print(f"calls {i}:", len(sched))
+    #run_schedule(sched[:])
+  del seen # free the LazyBuffers
+  sched = memory_planner(sched)
+  ast_dedup = dedup([si.ast for si in sched if si.ast[0].op is BufferOps.STORE])
+  srcs = {}
+  for ast in ast_dedup:
+    k = Device["CLANG"].get_linearizer(*ast)
+    k.linearize()
+    src = Device["CLANG"].compiler.render(to_function_name(k.name), k.uops).strip(CLANG_PROGRAM_HEADER)
+    srcs[ast] = (k.name, src)
+  print("functions:", len(srcs))
+  numbered_bufs = {x:i for i,x in enumerate(dedup(flatten([si.outputs+si.inputs for si in sched])))}
+  print("buffers:", len(numbered_bufs))
+
+  # TODO: why don't the buffer names work for X and Y
+  state_dict = nn.state.get_state_dict(model)
+  named_buffers = {v.lazydata.base.buffer:k.replace(".", "_") for k,v in state_dict.items()}
+  named_buffers['X'] = x.lazydata.base.buffer
+  named_buffers['Y'] = y.lazydata.base.buffer
+
+  for si in sched:
+    if si.ast[0].op is not BufferOps.STORE: continue
+    bufs = [named_buffers.get(b, f"b{numbered_bufs[b]}") for b in si.outputs+si.inputs]
+    print(f"{srcs[si.ast][0]}({', '.join(bufs)})")
\ No newline at end of file
diff --git a/tinygrad/engine/realize.py b/tinygrad/engine/realize.py
index 7be3b2dc25..8f3820638f 100644
--- a/tinygrad/engine/realize.py
+++ b/tinygrad/engine/realize.py
@@ -63,7 +63,8 @@ def _internal_memory_planner(buffers:List[Iterable[Buffer]], debug_prefix="") ->
         local_cache[key].append(assigned[buf])
 
   if DEBUG >= 1 and len(ak:=dedup(assigned.keys())) != len(av:=dedup(assigned.values())):
-    print(debug_prefix+f"memory reduced from {sum([x.nbytes for x in ak])/1e6:.2f} MB to {sum([x.nbytes for x in av])/1e6:.2f} MB")
+    print(debug_prefix+f"memory reduced from {sum([x.nbytes for x in ak])/1e6:.2f} MB -> {sum([x.nbytes for x in av])/1e6:.2f} MB,",
+          f"{len(ak)} -> {len(av)} bufs")
   return assigned
 
 def memory_planner(schedule:List[ScheduleItem]) -> List[ScheduleItem]:
diff --git a/tinygrad/nn/optim.py b/tinygrad/nn/optim.py
index 5220d53537..63d522305c 100644
--- a/tinygrad/nn/optim.py
+++ b/tinygrad/nn/optim.py
@@ -1,5 +1,5 @@
 # sorted in order of increasing complexity
-from typing import List, Optional
+from typing import List
 from tinygrad.helpers import dedup, flatten, getenv
 from tinygrad.tensor import Tensor
 
@@ -18,10 +18,8 @@ class Optimizer:
   def zero_grad(self):
     for param in self.params: param.grad = None
 
-  def realize(self, extra=None):
-    Tensor.corealize(extra + self.params + self.buffers if extra is not None else self.params + self.buffers)
-
-  def step(self, extra:Optional[List[Tensor]]=None): self.realize(self._step() + (extra if extra is not None else []))
+  def step(self): Tensor.corealize(self.schedule_step())
+  def schedule_step(self) -> List[Tensor]: return self._step()+self.params+self.buffers
   def _step(self) -> List[Tensor]: raise NotImplementedError
 
 class OptimizerGroup(Optimizer):