diff --git a/examples/mlperf/model_train.py b/examples/mlperf/model_train.py index 0c705edc65..6e1680400f 100644 --- a/examples/mlperf/model_train.py +++ b/examples/mlperf/model_train.py @@ -273,7 +273,7 @@ def train_resnet(): else: it = iter(tqdm(batch_load_resnet(batch_size=EVAL_BS, val=True, shuffle=False, pad_first_batch=True), total=steps_in_val_epoch)) i, proc = 0, data_get(it) - + prev_cookies = [] while proc is not None: GlobalCounters.reset() @@ -692,7 +692,7 @@ def train_unet3d(): loss.backward() optim.step() return loss.realize() - + @Tensor.train(mode=False) @Tensor.test() def eval_step(model, x, y): @@ -701,7 +701,7 @@ def train_unet3d(): loss = dice_ce_loss(y_hat, y) score = dice_score(y_hat, y) return loss.realize(), score.realize() - + if WANDB: wandb.init(config=config, project=PROJ_NAME) step_times, start_epoch = [], 1 @@ -710,7 +710,7 @@ def train_unet3d(): next_eval_at = start_eval_at print(f"Training on {GPUS}") - + if BENCHMARK: print("Benchmarking UNet3D") else: print(f"Start evaluation at epoch {start_eval_at} and every {evaluate_every} epoch(s) afterwards") @@ -821,7 +821,8 @@ def train_rnnt(): def train_step_bert(model, optimizer, scheduler, loss_scaler:float, input_ids:Tensor, segment_ids:Tensor, attention_mask:Tensor, masked_positions:Tensor, masked_lm_ids:Tensor, masked_lm_weights:Tensor, next_sentence_labels:Tensor, GPUS): for t in [input_ids, segment_ids, attention_mask, masked_positions, masked_lm_ids, masked_lm_weights, next_sentence_labels]: - t.shard_(GPUS, axis=0) + if len(GPUS) > 1: t.shard_(GPUS, axis=0) + else: t.to_(GPUS[0]) optimizer.zero_grad() lm_logits, seq_relationship_logits = model(input_ids, attention_mask, masked_positions, segment_ids) @@ -829,7 +830,7 @@ def train_step_bert(model, optimizer, scheduler, loss_scaler:float, input_ids:Te (loss * loss_scaler).backward() global_norm = Tensor([0.0], dtype=dtypes.float32, device=optimizer[0].device).realize() - for p in optimizer.params: + for p in optimizer.params: p.grad = p.grad / loss_scaler global_norm += p.grad.float().square().sum() global_norm = global_norm.sqrt() @@ -843,7 +844,8 @@ def train_step_bert(model, optimizer, scheduler, loss_scaler:float, input_ids:Te def eval_step_bert(model, input_ids:Tensor, segment_ids:Tensor, attention_mask:Tensor, masked_positions:Tensor, masked_lm_ids:Tensor, masked_lm_weights:Tensor, next_sentence_labels:Tensor, GPUS): for t in [input_ids, segment_ids, attention_mask, masked_positions, masked_lm_ids, masked_lm_weights, next_sentence_labels]: - t.shard_(GPUS, axis=0) + if len(GPUS) > 1: t.shard_(GPUS, axis=0) + else: t.to_(GPUS[0]) lm_logits, seq_relationship_logits = model(input_ids, attention_mask, masked_positions, segment_ids) masked_lm_accuracy, seq_relationship_accuracy, masked_lm_loss, next_sentence_loss = \ model.accuracy(lm_logits, seq_relationship_logits, masked_lm_ids, masked_lm_weights, next_sentence_labels) @@ -942,8 +944,9 @@ def train_bert(): p = p.assign(Tensor.zeros_like(p).contiguous()).realize() parameters = get_parameters(model) - for p in parameters: - p.to_(GPUS) + if len(GPUS) > 1: + for p in parameters: + p.to_(GPUS) # ** Log run config ** for key, value in config.items(): print(f'HParam: "{key}": {value}') @@ -1061,7 +1064,7 @@ def train_bert(): if i % eval_step_freq == 0 or (BENCHMARK and i == BENCHMARK) or i == train_steps: if MLLOGGER and RUNMLPERF: MLLOGGER.start(key=mllog_constants.EVAL_START, value=None, metadata={"epoch_num": i*BS, "step_num": i}) - if getenv("RESET_STEP", 0) or INITMLPERF: train_step_bert.reset() + if getenv("RESET_STEP", 0): train_step_bert.reset() else: train_step_bert.captured.free_intermediates() eval_lm_losses = [] eval_clsf_losses = [] diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh index 7de3784389..6f17109784 100755 --- a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh +++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh @@ -8,6 +8,7 @@ export BEAM=4 BEAM_UOPS_MAX=2000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 export IGNORE_JIT_FIRST_BEAM=1 export BASEDIR="/raid/datasets/wiki" +export RESET_STEP=1 export BENCHMARK=10 DEBUG=2 python3 examples/mlperf/model_train.py diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh index 69a8f6b0be..6a77928d89 100755 --- a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh +++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh @@ -17,7 +17,7 @@ DATETIME=$(date "+%m%d%H%M") LOGFILE="bert_green_${DATETIME}_${SEED}.log" # init -BENCHMARK=10 INITMLPERF=1 python3 examples/mlperf/model_train.py | tee $LOGFILE +BENCHMARK=10 INITMLPERF=1 RESET_STEP=1 python3 examples/mlperf/model_train.py | tee $LOGFILE # run PARALLEL=0 RUNMLPERF=1 python3 examples/mlperf/model_train.py | tee -a $LOGFILE diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh index 13472ffb85..f72acd8942 100755 --- a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh +++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh @@ -8,6 +8,7 @@ export BEAM=3 BEAM_UOPS_MAX=3000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 export IGNORE_JIT_FIRST_BEAM=1 export BASEDIR="/raid/datasets/wiki" +export RESET_STEP=1 export BENCHMARK=10 DEBUG=2 python3 examples/mlperf/model_train.py diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh index 2ba5a672c2..ec2554b25f 100755 --- a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh +++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh @@ -17,7 +17,7 @@ DATETIME=$(date "+%m%d%H%M") LOGFILE="bert_red_${DATETIME}_${SEED}.log" # init -BENCHMARK=10 INITMLPERF=1 python3 examples/mlperf/model_train.py | tee $LOGFILE +BENCHMARK=10 INITMLPERF=1 RESET_STEP=1 python3 examples/mlperf/model_train.py | tee $LOGFILE # run PARALLEL=0 RUNMLPERF=1 python3 examples/mlperf/model_train.py | tee -a $LOGFILE diff --git a/test/mockgpu/nv/nvdriver.py b/test/mockgpu/nv/nvdriver.py index ed3e221258..d7c330c600 100644 --- a/test/mockgpu/nv/nvdriver.py +++ b/test/mockgpu/nv/nvdriver.py @@ -59,7 +59,7 @@ class NVDriver(VirtDriver): self.root_handle = None self.gpus = {} - self.next_fd = (1 << 30) + self.next_fd = (1 << 29) self.next_handle = 1 self.object_by_handle = {} diff --git a/test/test_hcq.py b/test/test_hcq.py index 13ef71bd87..42a432000c 100644 --- a/test/test_hcq.py +++ b/test/test_hcq.py @@ -499,5 +499,17 @@ class TestHCQ(unittest.TestCase): assert "0xDEADBEE1" in str(ctx.exception) os.environ.pop("MOCKGPU_EMU_FAULTADDR") + def test_multidevice(self): + try: amd_dev = Device["AMD"] + except Exception: self.skipTest("no AMD device, test skipped") + + try: nv_dev = Device["NV"] + except Exception: self.skipTest("no NV device, test skipped") + + x = amd_dev.signal_t() + y = nv_dev.signal_t() + assert type(x) is amd_dev.signal_t + assert type(y) is nv_dev.signal_t + if __name__ == "__main__": unittest.main() diff --git a/test/test_schedule.py b/test/test_schedule.py index 767d18ef93..80268a5c1f 100644 --- a/test/test_schedule.py +++ b/test/test_schedule.py @@ -72,6 +72,13 @@ def _test_conv2d(allowed:int, dtype:DType=dtypes.float, **kwargs): def schedule_graph_rewrite(big_sink:UOp): return graph_rewrite(big_sink, remove_movement_ops+sym, {}) class TestSchedule(unittest.TestCase): + @unittest.skipIf(Device.DEFAULT == "CPU", "devices must mismatch") + def test_error_on_device_mismatch(self): + a = Tensor.empty(10) + b = Tensor.empty(10, device="CPU") + c = a+b + with self.assertRaises(RuntimeError): check_schedule(c, 1) + def test_basic_binop_fusion(self): a = Tensor.empty(10) b = Tensor.empty(10) diff --git a/test/unit/test_graph_rewrite.py b/test/unit/test_graph_rewrite.py index 40538daa7c..cc6d78ebec 100644 --- a/test/unit/test_graph_rewrite.py +++ b/test/unit/test_graph_rewrite.py @@ -2,7 +2,7 @@ import unittest, math from tinygrad import dtypes from tinygrad.helpers import all_same from tinygrad.ops import GroupOp, UOp, Ops, exec_alu -from tinygrad.codegen.devectorizer import full_graph_rewrite, mulacc_unrolled +from tinygrad.codegen.devectorizer import full_graph_rewrite # Helper function to apply the graph rewrite def apply_rewrite(expr): @@ -275,41 +275,6 @@ class TestSubstitute(unittest.TestCase): ret = substitute(ret, {a.sin():a.sqrt(), n1.sin():n1.sqrt()}) self.assertIs(ret, a.sqrt().sqrt()) -class TestMulaccUnrolledAcc(unittest.TestCase): - def test_unrolled2(self): - acc_range = (UOp.const(dtypes.int, 0), UOp.const(dtypes.int, 1)) - acc = UOp(Ops.DEFINE_ACC, dtypes.int, (UOp.const(dtypes.int, 0),) + acc_range, (0,)) - a = UOp.variable('a', 0, 10) - b = UOp.variable('b', 0, 10) - expr = acc.assign(acc + (a*2 + b*3)) - expr_with_mulacc = graph_rewrite(expr, mulacc_unrolled) - self.assertIs(expr_with_mulacc, acc.assign(acc + a*2 + b*3)) - - def test_unrolled4_float(self): - acc_range = (UOp.const(dtypes.int, 0), UOp.const(dtypes.int, 3)) - acc = UOp(Ops.DEFINE_ACC, dtypes.float32, (UOp.const(dtypes.int, 0),)+acc_range, (0,)) - - a = [UOp.variable(f'a{i}', float("-inf"), float("inf"), dtype=dtypes.float32) for i in range(4)] - b = [UOp.variable(f'b{i}', float("-inf"), float("inf"), dtype=dtypes.float32) for i in range(4)] - - expr = acc.assign(acc + (a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3])) - expr_with_mulacc = graph_rewrite(expr, mulacc_unrolled) - - # Verify it unrolls into individual multiply-accumulate operations - expected = acc.assign(acc + a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3]) - self.assertIs(expr_with_mulacc, expected) - - def test_unrolled4_float_const(self): - acc_range = (UOp.const(dtypes.int, 0), UOp.const(dtypes.int, 3)) - acc = UOp(Ops.DEFINE_ACC, dtypes.float32, (UOp.const(dtypes.int, 0),)+acc_range, (0,)) - - a = [UOp.variable(f'a{i}', float("-inf"), float("inf"), dtype=dtypes.float32) for i in range(4)] - expr = acc.assign(acc + (a[0]*3.0 + a[1]*4.0 + a[2]*5.0 + a[3]*6.0)) - expr_with_mulacc = graph_rewrite(expr, mulacc_unrolled) - - # Verify it unrolls into individual multiply-accumulate operations - expected = acc.assign(acc + a[0]*3.0 + a[1]*4.0 + a[2]*5.0 + a[3]*6.0) - self.assertIs(expr_with_mulacc, expected) if __name__ == '__main__': unittest.main() diff --git a/tinygrad/codegen/devectorizer.py b/tinygrad/codegen/devectorizer.py index 46aab32361..10550358ba 100644 --- a/tinygrad/codegen/devectorizer.py +++ b/tinygrad/codegen/devectorizer.py @@ -4,7 +4,7 @@ from collections import defaultdict from tinygrad.dtype import dtypes, ImageDType, PtrDType from tinygrad.ops import UOp, Ops, UPat, PatternMatcher, resolve from tinygrad.ops import graph_rewrite, GroupOp -from tinygrad.codegen.symbolic import symbolic_simple, split_uop, uop_given_valid, parse_valid, simplify_valid, sym, mulacc_unrolled +from tinygrad.codegen.symbolic import symbolic_simple, split_uop, uop_given_valid, parse_valid, simplify_valid, sym from tinygrad.helpers import getenv, flatten, dedup, TRANSCENDENTAL, AMX, prod, DEVECTORIZE from tinygrad.codegen.transcendental import xexp2, xlog2, xsin, xpow, TRANSCENDENTAL_SUPPORTED_DTYPES from tinygrad.renderer import Renderer @@ -232,12 +232,11 @@ def full_graph_rewrite(sink:UOp, opts:Optional[Renderer]=None) -> UOp: extra_matcher = opts.extra_matcher if opts is not None and opts.extra_matcher is not None else PatternMatcher([]) if DEVECTORIZE: - # devectorize + load_store_indexing + mulacc_unrolled, mulacc_unrolled must be last because it can break loop_collapse - sink = graph_rewrite(sink, sym+(devectorize+float4_folding if opts is not None and opts.supports_float4 else devectorize)+load_store_indexing+ - mulacc_unrolled) + # devectorize + load_store_indexing + sink = graph_rewrite(sink, sym+(devectorize+float4_folding if opts is not None and opts.supports_float4 else devectorize)+load_store_indexing) else: # new devectorize only for load/store - sink = graph_rewrite(sink, sym+devectorize_load_store+mulacc_unrolled) + sink = graph_rewrite(sink, sym+devectorize_load_store) # optional pre matcher if opts is not None and opts.pre_matcher is not None: sink = graph_rewrite(sink, opts.pre_matcher) diff --git a/tinygrad/codegen/lowerer.py b/tinygrad/codegen/lowerer.py index 7b8249289c..90861e3a9b 100644 --- a/tinygrad/codegen/lowerer.py +++ b/tinygrad/codegen/lowerer.py @@ -115,14 +115,17 @@ def lower_reduce_axis(ctx: IndexContext, x: UOp): assert all(x.op is Ops.UNROLL for x in reduce_expand), f"not all UNROLLS in {reduce_expand} for {x.axis_arg}" alu_op: Ops = x.arg[0] ret = x.src[0] - if len(contract_axis:=flatten(x.arg for x in reduce_expand)): - ret = UOp(Ops.CONTRACT, x.dtype.vec(prod(x[1] for x in contract_axis)), (ret,), tuple(contract_axis)) - ret = functools.reduce(lambda x,y: x.alu(alu_op, y), [ret.gep(i) for i in range(ret.dtype.count)]) - if not len(reduce_range): return ret - # create ACC and assign + # create acc acc = UOp(Ops.DEFINE_ACC, x.dtype, (x.const_like(identity_element(alu_op, x.dtype.scalar())),) + tuple(reduce_range), (ctx.acc_num,)) ctx.acc_num += 1 - return acc.assign(acc.alu(alu_op, ret)) + if len(contract_axis:=flatten(x.arg for x in reduce_expand)): + ret = UOp(Ops.CONTRACT, x.dtype.vec(prod(x[1] for x in contract_axis)), (ret,), tuple(contract_axis)) + ret = functools.reduce(lambda x,y: x.alu(alu_op, y), [acc]+[ret.gep(i) for i in range(ret.dtype.count)]) + else: + ret = acc.alu(alu_op, ret) + if not len(reduce_range): return ret + # create ACC and assign + return acc.assign(ret) def lower_load_store(ctx: IndexContext, x: UOp): idx, valid = x.st_arg.to_indexed_uops(ctx.ridxs if x.op is Ops.LOAD and x.src[0].op is Ops.DEFINE_LOCAL else ctx.idxs) diff --git a/tinygrad/codegen/symbolic.py b/tinygrad/codegen/symbolic.py index 7c8dd0e7d4..9cd5ca7623 100644 --- a/tinygrad/codegen/symbolic.py +++ b/tinygrad/codegen/symbolic.py @@ -318,9 +318,10 @@ def threefry2x32(x: UOp, key: UOp): # ******** phase 3 is the complete symbolic, and deals with very complex things like loop rewriting and threefry transform ******** -def loop_collapse(compval, multconst, rng:UOp, acc:UOp, idx2=None,idx3=None,extra=None,vec=None,ne=None, +def loop_collapse(compval, multconst, rng:UOp, acc:UOp, extra:UOp, idx2=None,idx3=None,vec=None,ne=None, add=UOp.const(dtypes.int, 0), mul:UOp=UOp.const(dtypes.int, 1)): if getenv("DISABLE_LOOP_COLLAPSE") or rng not in acc.src: return None # must be the right REDUCE + if acc not in split_uop(extra, Ops.ADD): return None loop_start, loop_end = rng.src if loop_start.arg != 0: # TODO: support and test this with other mul and loop_starts @@ -344,7 +345,7 @@ def loop_collapse(compval, multconst, rng:UOp, acc:UOp, idx2=None,idx3=None,extr # TODO: what does it mean to have the same numbered DEFINE_ACC with different ranges? new_acc = acc.replace(src=acc.src[0:1]+tuple(x for x in acc.src[1:] if x is not rng)) ret = new_acc.assign(new_acc+new_reduce_op) - if extra is not None: ret = ret + acc.assign(acc+extra) + if extra is not acc: ret = ret + acc.assign(extra) return ret def index_collapse(idx:UOp,rng:UOp,buf:UOp,ld:UOp,acc:UOp,add=UOp.const(dtypes.int, 0),mul=UOp.const(dtypes.int, 1)): @@ -383,9 +384,6 @@ index_load = UPat.var("buf").index(rng_aug).load(name="ld") arange_augrng = UPat.any(rng_aug, rng_aug+UPat.var("idx2"), rng_aug+UPat.var("idx2")+UPat.var("idx3"), UPat(Ops.VECTORIZE, name="vec", src=rng_aug)) arange_m = ((arange_augrng v.st.real_size()) else None), @@ -252,6 +256,9 @@ def append_to_kernel(ctx:KernelContext, x:UOp): create_kernels = merge_views+PatternMatcher([ (UPat(GroupOp.All-{Ops.KERNEL, Ops.BUFFER}, name="x"), create_kernel), (UPat(Ops.KERNEL, name="x"), append_to_kernel), + # remove CONST/BIND from the kernel graph + (UPat(Ops.SINK, name="x"), lambda x: x.replace(src=new_src) + if (new_src:=tuple(dedup(s.base for s in x.src if s.op not in {Ops.CONST,Ops.BIND}))) != x.src else None), ]) # **** fix kernel AST @@ -377,12 +384,13 @@ def schedule_uop(sink:UOp, var_vals:dict[Variable, int]) -> ScheduleItem: ast = sink.src[1].arg.ast.substitute({s.src[1].arg.ast:s.src[0] for s in sink.src[1].src if s.op is Ops.ASSIGN}).sink() # add buffer ops ast = graph_rewrite(ast, add_buffer_ops, bufs:=[sink.buf_uop], bottom_up=True) + if ast.op is Ops.SINK and not all_same(dev:=[x.device for x in bufs]): raise RuntimeError(f"all buffers must be on the same device: {dev}") # unbind_vars + push views to edges ast = graph_rewrite(graph_rewrite(ast, unbind_vars+view_left, ctx=var_vals), view_right) # fix_kernel_ops ast = graph_rewrite(ast, fix_kernel_ops, var_vals) # create subbuffer - if ast.op is Ops.BUFFER_VIEW: buffers[bufs[0]] = bufs[1].buffer.view(ast.size, ast.dtype, (x:=ast.src[0]).st_arg.views[0].offset*x.dtype.itemsize) + if ast.op is Ops.BUFFER_VIEW: buffers[bufs[0]] = (base:=bufs[1].buffer).view(ast.size, ast.dtype, ast.arg[1]*base.dtype.itemsize) return ScheduleItem(ast, tuple(dedup([x.buffer for x in bufs])), sink.src[1].arg.metadata) PROCESS_REPLAY_CAPTURE:dict[str, bytes] = {} diff --git a/tinygrad/ops.py b/tinygrad/ops.py index 9745482e18..af2a0cd2a6 100644 --- a/tinygrad/ops.py +++ b/tinygrad/ops.py @@ -289,7 +289,7 @@ class UOp(MathTrait, metaclass=UOpMetaClass): if self.op is Ops.MULTI: return ShapeTracker.from_shape( tuple(sum(y.shape[a] for y in self.real_lbs) if a == self.axis else s for a,s in enumerate(self.real_lbs[0].shape))) - if self.op is Ops.BUFFER: return ShapeTracker.from_shape((self.size,)) + if self.op in {Ops.BUFFER, Ops.BUFFER_VIEW}: return ShapeTracker.from_shape((self.size,)) if self.op is Ops.KERNEL: return ShapeTracker.from_shape(self.arg.ast.shape) # these ops define a ShapeTracker from the arg if self.op is Ops.VIEW: return self.arg @@ -298,7 +298,7 @@ class UOp(MathTrait, metaclass=UOpMetaClass): if self.op in GroupOp.Buffer: return vsrc[0] if len(vsrc:=[x.st for x in self.src if x.op is Ops.VIEW]) != 0 else None if not (src_sts := [x.st for x in self.src if x.st is not None]): return None assert all_same([x.shape for x in src_sts]), f"UOp sources must have the same shape {self} {[x.shape for x in src_sts]}" - if self.op in {Ops.BITCAST, Ops.BUFFER_VIEW}: + if self.op is Ops.BITCAST: shape = src_sts[0].shape if self.dtype.itemsize != (input_sz:=self.src[0].dtype.itemsize): shape = shape[:-1]+((shape[-1]*input_sz) // self.dtype.itemsize,) # only reduce ops are allowed to change shape, everything else derives shape from sources @@ -316,7 +316,7 @@ class UOp(MathTrait, metaclass=UOpMetaClass): @property def shape(self) -> tuple[sint, ...]: return unwrap(self.st).shape @property - def size(self) -> int: return self.arg if self.op is Ops.BUFFER else unwrap(self.st).size + def size(self) -> int: return self.arg[0] if self.op is Ops.BUFFER_VIEW else self.arg if self.op is Ops.BUFFER else unwrap(self.st).size # *** uop evaluation *** diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py index ba21fa7603..2818e4206c 100644 --- a/tinygrad/runtime/ops_amd.py +++ b/tinygrad/runtime/ops_amd.py @@ -1,5 +1,5 @@ from __future__ import annotations -from typing import Any, cast +from typing import Any, cast, ClassVar import os, ctypes, ctypes.util, functools, mmap, errno, array, contextlib, sys, select assert sys.platform != 'win32' from dataclasses import dataclass @@ -27,10 +27,7 @@ def nbioreg(reg): return reg + 0x00000d20 # NBIO_BASE__INST0_SEG2 class AMDSignal(HCQSignal): def __init__(self, base_addr:int|None=None, **kwargs): - super().__init__(AMDDevice.signals_pool.pop() if base_addr is None else base_addr, **kwargs, timestamp_divider=100) - - def __del__(self): - if isinstance(self.base_addr, int): AMDDevice.signals_pool.append(self.base_addr) + super().__init__(base_addr, **kwargs, timestamp_divider=100, dev_t=AMDDevice) def _sleep(self, time_spent_waiting_ms:int): # Resonable to sleep for long workloads (which take more than 2s) and only timeline signals. @@ -562,9 +559,11 @@ class PCIIface: def device_fini(self): self.adev.fini() class AMDDevice(HCQCompiled): + devices: ClassVar[list[HCQCompiled]] = [] + signal_pages: ClassVar[list[Any]] = [] + signal_pool: ClassVar[list[int]] = [] + driverless:bool = not HWInterface.exists('/sys/module/amdgpu') or bool(getenv("AMD_DRIVERLESS", 0)) - signals_page:Any = None - signals_pool:list[int] = [] def __init__(self, device:str=""): self.device_id = int(device.split(":")[1]) if ":" in device else 0 @@ -573,11 +572,6 @@ class AMDDevice(HCQCompiled): self.arch = "gfx%d%x%x" % (self.target // 10000, (self.target // 100) % 100, self.target % 100) if self.target < 100300 or self.target >= 120000: raise RuntimeError(f"Unsupported arch: {self.arch}") - if AMDDevice.signals_page is None: - AMDDevice.signals_page = self.dev_iface.alloc(16 * 65536, host=True, uncached=True, cpu_access=True) - AMDDevice.signals_pool = [AMDDevice.signals_page.va_addr + off for off in range(0, AMDDevice.signals_page.size, 16)] - else: self.dev_iface.map(AMDDevice.signals_page) - self.max_cu_id = self.dev_iface.props['simd_count'] // self.dev_iface.props['simd_per_cu'] - 1 self.max_wave_id = self.dev_iface.props['max_waves_per_simd'] * self.dev_iface.props['simd_per_cu'] - 1 self.has_scratch_base_registers = self.target >= 110000 diff --git a/tinygrad/runtime/ops_dsp.py b/tinygrad/runtime/ops_dsp.py index 6a786b39b1..1cd7705ae4 100644 --- a/tinygrad/runtime/ops_dsp.py +++ b/tinygrad/runtime/ops_dsp.py @@ -169,7 +169,8 @@ class DSPDevice(Compiled): except (OSError, PermissionError): # DSP might ask for a connection reset or just fail with operation not permitted, try to reset connection. self.init_dsp() - _exec_lib() + try: _exec_lib() + except (OSError, PermissionError) as e: raise RuntimeError(e) def init_dsp(self): if hasattr(self, 'rpc_fd'): diff --git a/tinygrad/runtime/ops_nv.py b/tinygrad/runtime/ops_nv.py index d42b6b3ab3..9eed9a1c04 100644 --- a/tinygrad/runtime/ops_nv.py +++ b/tinygrad/runtime/ops_nv.py @@ -1,7 +1,7 @@ from __future__ import annotations import os, ctypes, contextlib, re, functools, mmap, struct, array, sys assert sys.platform != 'win32' -from typing import Any, cast, Union, Type +from typing import Any, cast, Union, Type, ClassVar from dataclasses import dataclass from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQProgram, HCQSignal, BumpAllocator from tinygrad.runtime.support.hcq import HWInterface, MOCKGPU @@ -73,10 +73,7 @@ assert ctypes.sizeof(qmd_struct_t) == 0x40 * 4 class NVSignal(HCQSignal): def __init__(self, base_addr:int|None=None, **kwargs): - super().__init__(NVDevice.signals_pool.pop() if base_addr is None else base_addr, **kwargs, timestamp_divider=1000, value_off=0, timestamp_off=8) - - def __del__(self): - if isinstance(self.base_addr, int): NVDevice.signals_pool.append(self.base_addr) + super().__init__(base_addr, **kwargs, timestamp_divider=1000, dev_t=NVDevice) class NVCommandQueue(HWQueue[NVSignal, 'NVDevice', 'NVProgram', 'NVArgsState']): def __init__(self): @@ -285,12 +282,14 @@ class GPFifo: MAP_FIXED, MAP_NORESERVE = 0x10, 0x400 class NVDevice(HCQCompiled[NVSignal]): + devices: ClassVar[list[HCQCompiled]] = [] + signal_pages: ClassVar[list[Any]] = [] + signal_pool: ClassVar[list[int]] = [] + root = None fd_ctl: HWInterface fd_uvm: HWInterface gpus_info: Union[list, ctypes.Array] = [] - signals_page: Any = None - signals_pool: list[int] = [] # TODO: Need a proper allocator for va addresses # 0x1000000000 - 0x2000000000, reserved for system/cpu mappings @@ -433,11 +432,6 @@ class NVDevice(HCQCompiled[NVSignal]): try: uvm.enable_peer_access(self.fd_uvm, gpuUuidA=self.gpu_uuid, gpuUuidB=dev.gpu_uuid) except RuntimeError as e: raise RuntimeError(str(e) + f". Make sure GPUs #{self.gpu_minor} & #{dev.gpu_minor} have P2P enabled between.") from e - if NVDevice.signals_page is None: - NVDevice.signals_page = self._gpu_alloc(16 * 65536, cpu_access=True, uncached=True) - NVDevice.signals_pool = [self.signals_page.va_addr + off for off in range(0, NVDevice.signals_page.size, 16)] - else: self._gpu_map(NVDevice.signals_page) - channel_params = nv_gpu.NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS(engineType=nv_gpu.NV2080_ENGINE_TYPE_GRAPHICS) channel_group = rm_alloc(self.fd_ctl, nv_gpu.KEPLER_CHANNEL_GROUP_A, self.root, self.nvdevice, channel_params).hObjectNew diff --git a/tinygrad/runtime/ops_qcom.py b/tinygrad/runtime/ops_qcom.py index e0ce3bdb7b..90982eb71c 100644 --- a/tinygrad/runtime/ops_qcom.py +++ b/tinygrad/runtime/ops_qcom.py @@ -2,7 +2,7 @@ from __future__ import annotations import os, ctypes, functools, mmap, struct, array, math, sys assert sys.platform != 'win32' from types import SimpleNamespace -from typing import Any, cast +from typing import Any, cast, ClassVar from tinygrad.device import BufferSpec from tinygrad.runtime.support.hcq import HCQBuffer, HWQueue, HCQProgram, HCQCompiled, HCQAllocatorBase, HCQSignal, HCQArgsState, BumpAllocator from tinygrad.runtime.support.hcq import HWInterface @@ -38,10 +38,7 @@ class QCOMCompiler(CLCompiler): class QCOMSignal(HCQSignal): def __init__(self, base_addr:int|None=None, **kwargs): - super().__init__(QCOMDevice.signals_pool.pop() if base_addr is None else base_addr, **kwargs, timestamp_divider=19.2) - - def __del__(self): - if isinstance(self.base_addr, int): QCOMDevice.signals_pool.append(self.base_addr) + super().__init__(base_addr, **kwargs, timestamp_divider=19.2, dev_t=QCOMDevice) def _sleep(self, time_spent_waiting_ms:int): # Sleep only for only timeline signals. Do it immediately to free cpu. @@ -320,16 +317,16 @@ class QCOMAllocator(HCQAllocatorBase): self.dev._gpu_free(opaque) class QCOMDevice(HCQCompiled): - signals_page: Any = None - signals_pool: list[int] = [] + devices: ClassVar[list[HCQCompiled]] = [] + signal_pages: ClassVar[list[Any]] = [] + signal_pool: ClassVar[list[int]] = [] + gpu_id: int = 0 dummy_addr: int = 0 def __init__(self, device:str=""): self.fd = HWInterface('/dev/kgsl-3d0', os.O_RDWR) QCOMDevice.dummy_addr = cast(int, self._gpu_alloc(0x1000).va_addr) - QCOMDevice.signals_page = self._gpu_alloc(16 * 65536, uncached=True) - QCOMDevice.signals_pool = [self.signals_page.va_addr + off for off in range(0, self.signals_page.size, 16)] flags = kgsl.KGSL_CONTEXT_PREAMBLE | kgsl.KGSL_CONTEXT_PWR_CONSTRAINT | kgsl.KGSL_CONTEXT_NO_FAULT_TOLERANCE | kgsl.KGSL_CONTEXT_NO_GMEM_ALLOC \ | kgsl.KGSL_CONTEXT_PRIORITY(8) | kgsl.KGSL_CONTEXT_PREEMPT_STYLE(kgsl.KGSL_CONTEXT_PREEMPT_STYLE_FINEGRAIN) diff --git a/tinygrad/runtime/support/hcq.py b/tinygrad/runtime/support/hcq.py index 9c41f6fdc7..9b2134e3d6 100644 --- a/tinygrad/runtime/support/hcq.py +++ b/tinygrad/runtime/support/hcq.py @@ -1,5 +1,5 @@ from __future__ import annotations -from typing import cast, Type, TypeVar, Generic, Any +from typing import cast, Type, TypeVar, Generic, Any, ClassVar import contextlib, decimal, statistics, time, ctypes, array, os, fcntl from tinygrad.helpers import PROFILE, from_mv, getenv, to_mv, round_up from tinygrad.renderer import Renderer @@ -203,15 +203,20 @@ class HWQueue(Generic[SignalType, DeviceType, ProgramType, ArgsStateType]): def _submit(self, dev:DeviceType): raise NotImplementedError("need _submit") class HCQSignal(Generic[DeviceType]): - def __init__(self, base_addr:sint=0, value:int=0, timeline_for_device:DeviceType|None=None, timestamp_divider=1, value_off=0, timestamp_off=8): - self.base_addr, self.value_addr, self.timestamp_addr = base_addr, base_addr+value_off, base_addr+timestamp_off + def __init__(self, base_addr:sint|None=None, value:int=0, dev_t:Type[DeviceType]|None=None, timeline_for_device:DeviceType|None=None, + timestamp_divider=1, value_off=0, timestamp_off=8): + self.base_addr = dev_t._alloc_signal_addr() if dev_t is not None and base_addr is None else base_addr + self.value_addr, self.timestamp_addr, self.dev_t = self.base_addr+value_off, self.base_addr+timestamp_off, dev_t self.timestamp_divider:decimal.Decimal = decimal.Decimal(timestamp_divider) self.timeline_for_device:DeviceType|None = timeline_for_device - if isinstance(base_addr, int): + if isinstance(self.base_addr, int): self.value_mv, self.timestamp_mv = to_mv(self.value_addr, 8).cast('Q'), to_mv(self.timestamp_addr, 8).cast('Q') self.value_mv[0] = value + def __del__(self): + if isinstance(self.base_addr, int) and self.dev_t is not None: self.dev_t.signal_pool.append(self.base_addr) + @property def value(self) -> int: return self.value_mv[0] @@ -332,23 +337,29 @@ class HCQCompiled(Compiled, Generic[SignalType]): """ A base class for devices compatible with the HCQ (Hardware Command Queue) API. """ - devices: list[HCQCompiled] = [] + devices: ClassVar[list[HCQCompiled]] = [] + signal_pages: ClassVar[list[Any]] = [] + signal_pool: ClassVar[list[int]] = [] def __init__(self, device:str, allocator:HCQAllocatorBase, renderer:Renderer, compiler:Compiler, runtime, signal_t:Type[SignalType], comp_queue_t:Type[HWQueue], copy_queue_t:Type[HWQueue]|None): self.device_id:int = int(device.split(":")[1]) if ":" in device else 0 + + from tinygrad.runtime.graph.hcq import HCQGraph + super().__init__(device, allocator, renderer, compiler, runtime, HCQGraph) + + # Map signals if any + for sig_page in self.signal_pages: cast(HCQAllocator, self.allocator).map(sig_page) + self.devices.append(self) + self.signal_t, self.hw_compute_queue_t, self.hw_copy_queue_t = signal_t, comp_queue_t, copy_queue_t self.timeline_value:int = 1 self.timeline_signal:SignalType = self.signal_t(value=0, timeline_for_device=self) self._shadow_timeline_signal:SignalType = self.signal_t(value=0, timeline_for_device=self) self.sig_prof_records:list[tuple[HCQSignal, HCQSignal, str, bool]] = [] - from tinygrad.runtime.graph.hcq import HCQGraph - super().__init__(device, allocator, renderer, compiler, runtime, HCQGraph) - self.kernargs_page:HCQBuffer = self.allocator.alloc(16 << 20, BufferSpec(cpu_access=True)) self.kernargs_allocator:BumpAllocator = BumpAllocator(self.kernargs_page.size, base=cast(int, self.kernargs_page.va_addr), wrap=True) - self.devices.append(self) def synchronize(self): try: self.timeline_signal.wait(self.timeline_value - 1) @@ -361,6 +372,14 @@ class HCQCompiled(Compiled, Generic[SignalType]): Compiled.profile_events += [ProfileRangeEvent(self.device, name, st.timestamp, en.timestamp, cp) for st,en,name,cp in self.sig_prof_records] self.sig_prof_records = [] + @classmethod + def _alloc_signal_addr(cls) -> int: + if not cls.signal_pool: + cls.signal_pages.append(alc:=cls.devices[0].allocator.alloc(0x1000, BufferSpec(host=True, uncached=True, cpu_access=True))) + cls.signal_pool += [alc.va_addr + off for off in range(0, alc.size, 16)] + for dev in cls.devices: cast(HCQAllocator, dev.allocator).map(alc) + return cls.signal_pool.pop() + def _at_profile_finalize(self): def _sync(d:HCQCompiled, q_t:Type[HWQueue]): q_t().timestamp(d.timeline_signal).signal(d.timeline_signal, d.timeline_value).submit(d) diff --git a/tinygrad/spec.py b/tinygrad/spec.py index 56105ab6c4..3dfd29c9e2 100644 --- a/tinygrad/spec.py +++ b/tinygrad/spec.py @@ -1,13 +1,15 @@ from typing import cast from tinygrad.ops import PatternMatcher, UPat, GroupOp, Ops, UOp, print_uops from tinygrad.dtype import DType, ImageDType, dtypes, PtrDType -from tinygrad.helpers import all_same, dedup, prod +from tinygrad.helpers import all_same, all_int, dedup, prod buffer_spec = PatternMatcher([ (UPat(Ops.UNIQUE, dtypes.void, ()), lambda: True), (UPat(Ops.DEVICE, dtypes.void, (), name="device"), lambda device: isinstance(device.arg, str)), (UPat(Ops.BUFFER, src=(UPat(Ops.DEVICE), UPat(Ops.UNIQUE)), name="buf"), lambda buf: isinstance(buf.arg, int) and isinstance(buf.dtype, (DType, ImageDType))), + (UPat(Ops.BUFFER_VIEW, src=(UPat(Ops.BUFFER),), name="buf_view"), + lambda buf_view: isinstance(buf_view.arg, tuple) and len(buf_view.arg) == 2 and all_int(buf_view.arg)), ]) # *** this is the spec of a Tensor in UOp *** @@ -126,9 +128,7 @@ kernel_spec = buffer_spec+PatternMatcher([ (UPat(Ops.KERNEL, src=UPat((Ops.BUFFER, Ops.ASSIGN))), lambda: True), # assign has a buffer view and kernel source, it can optionally depend on other assigns (UPat(Ops.ASSIGN, src=UPat((Ops.BUFFER, Ops.VIEW, Ops.KERNEL, Ops.ASSIGN))), lambda: True), - # view/sink/const/bind/var can also exist in the kernel graph - (UPat((Ops.VIEW, Ops.SINK, Ops.CONST, Ops.BIND, Ops.DEFINE_VAR)), lambda: True), - (UPat(GroupOp.All), lambda: False), + (UPat(GroupOp.All-{Ops.SINK, Ops.VIEW}), lambda: False), ]) # *** this is the UOp shape spec *** diff --git a/tinygrad/viz/serve.py b/tinygrad/viz/serve.py index ecdcc8a8b4..a57391807c 100755 --- a/tinygrad/viz/serve.py +++ b/tinygrad/viz/serve.py @@ -13,7 +13,7 @@ uops_colors = {Ops.LOAD: "#ffc0c0", Ops.STORE: "#87CEEB", Ops.CONST: "#e0e0e0", Ops.DEFINE_GLOBAL: "#ffe0b0", Ops.DEFINE_LOCAL: "#ffe0d0", Ops.DEFINE_ACC: "#f0ffe0", Ops.REDUCE_AXIS: "#FF6B6B", Ops.RANGE: "#c8a0e0", Ops.ASSIGN: "#e0ffc0", Ops.BARRIER: "#ff8080", Ops.IF: "#c8b0c0", Ops.SPECIAL: "#c0c0ff", Ops.INDEX: "#e8ffa0", Ops.WMMA: "#efefc0", Ops.VIEW: "#C8F9D4", Ops.MULTI: "#f6ccff", Ops.KERNEL: "#3e7f55", - **{x:"#D8F9E4" for x in GroupOp.Movement}, **{x:"#ffffc0" for x in GroupOp.ALU}, Ops.THREEFRY:"#ffff80", + **{x:"#D8F9E4" for x in GroupOp.Movement}, **{x:"#ffffc0" for x in GroupOp.ALU}, Ops.THREEFRY:"#ffff80", Ops.BUFFER_VIEW: "#E5EAFF", Ops.BLOCK: "#C4A484", Ops.BLOCKEND: "#C4A4A4", Ops.BUFFER: "#B0BDFF", Ops.COPY: "#a040a0", Ops.NAME:"#808080"} # VIZ API