Mirror of https://github.com/tinygrad/tinygrad.git
NOOP means contiguous
@@ -1,5 +1,8 @@
import os
os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"
os.environ["OMP_NUM_THREADS"] = "1"
import unittest
import torch
torch.set_num_threads(1)
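The block above pins the math libraries to a single thread (and disables TF32 matmuls via NVIDIA_TF32_OVERRIDE) before torch is imported, so the speed comparison between torch and tinygrad is not skewed by multithreading. A minimal standalone sketch of the same idea; the env var names come from the hunk above, the timing loop is illustrative only:

import os
# thread caps must be set before the libraries that read them are imported
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
import time
import numpy as np
a = np.random.rand(1024, 1024).astype(np.float32)
b = np.random.rand(1024, 1024).astype(np.float32)
st = time.perf_counter()
a @ b
print(f"single-threaded matmul took {(time.perf_counter() - st) * 1e3:.2f} ms")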
@@ -10,7 +13,7 @@ from functools import partial
from tinygrad.ops import GlobalCounters
from tinygrad.tensor import Tensor
from tinygrad.nn import Conv2d
-from tinygrad.helpers import colored, getenv, DEBUG
+from tinygrad.helpers import colored, getenv, DEBUG, Timing
from tinygrad.jit import TinyJit

IN_CHANS = [int(x) for x in getenv("IN_CHANS", "4,16,64").split(",")]
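IN_CHANS selects which input-channel counts the conv benchmarks sweep; tinygrad's getenv helper reads it from the environment with "4,16,64" as the default. A rough standalone equivalent of that parsing, using plain os.getenv rather than tinygrad.helpers.getenv (assumption: getenv falls back to the default string when the variable is unset):

import os
# e.g. run the test with IN_CHANS=8,32 to benchmark only 8- and 32-channel convs
IN_CHANS = [int(x) for x in os.getenv("IN_CHANS", "4,16,64").split(",")]
print(IN_CHANS)  # [4, 16, 64] when IN_CHANS is unset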
@@ -39,6 +42,9 @@ def helper_test_speed(f1, *args):
  # force syncing
  [x.numpy() if isinstance(x, Tensor) or str(torch_device) == "cpu" else x.cpu().numpy() for x in args if x is not None]

  # L2 defeat (64 MB)
  np.zeros((4096, 4096), dtype=np.float32) - 1

  GlobalCounters.global_ops = 0
  GlobalCounters.global_mem = 0
  if DEBUG >= 4: print("benchmark start")
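The "L2 defeat" line writes a 4096x4096 float32 buffer (64 MiB, well beyond typical L2/L3 capacity) between runs, so each timed call starts from a cold cache instead of re-reading data the previous run left resident. A hedged sketch of the effect; the working-set size and the timed kernel are illustrative, not taken from the test:

import time
import numpy as np

x = np.random.rand(1 << 20).astype(np.float32)  # 4 MB working set

def timed_sum():
  st = time.perf_counter()
  x.sum()
  return (time.perf_counter() - st) * 1e6

timed_sum()                                   # warm the cache
hot = timed_sum()                             # data likely still cached
np.zeros((4096, 4096), dtype=np.float32) - 1  # 64 MiB of traffic evicts it
cold = timed_sum()                            # closer to a DRAM-bound timing
print(f"hot {hot:.0f} us vs cold {cold:.0f} us")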
@@ -5,7 +5,7 @@ from tinygrad.ops import UnaryOps, BinaryOps, MovementOps, ReduceOps, FusedOps,
from tinygrad.helpers import shape_to_axis

base_fxn_for_op : Dict[Op, Callable] = {
-  UnaryOps.NOOP: lambda x: x[:], UnaryOps.NEG: lambda x: -x, UnaryOps.NOT: lambda x: (1.0 - x),
+  UnaryOps.NEG: lambda x: -x, UnaryOps.NOT: lambda x: (1.0 - x),
  BinaryOps.ADD: operator.add, BinaryOps.SUB: operator.sub, BinaryOps.MUL: operator.mul, BinaryOps.DIV: operator.truediv, BinaryOps.POW: operator.pow,
  ReduceOps.SUM: lambda x, new_shape: x.sum(shape_to_axis(x.shape, new_shape), keepdims=True) if tuple(x.shape) != tuple(new_shape) else x[:],
  ReduceOps.MAX: lambda x, new_shape: (x.amax if hasattr(x, 'amax') else x.max)(shape_to_axis(x.shape, new_shape), keepdims=True) if tuple(x.shape) != tuple(new_shape) else x[:],
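This hunk (apparently the shared CPU op table, tinygrad/runtime/ops_cpu.py, since the torch hunk below imports base_fxn_for_op from there) removes UnaryOps.NOOP's generic slice-copy `x[:]`. Per the commit title, NOOP now has to mean "make the buffer contiguous", which `x[:]` does not guarantee for numpy views, so each backend supplies its own implementation in the hunks that follow. A small illustration of why the slice is not enough (plain numpy, outside tinygrad):

import numpy as np

x = np.arange(16, dtype=np.float32).reshape(4, 4).T  # transposed view, not C-contiguous
print(x.flags['C_CONTIGUOUS'])                        # False
print(x[:].flags['C_CONTIGUOUS'])                     # False: slicing keeps the strides
print(np.ascontiguousarray(x).flags['C_CONTIGUOUS'])  # True: what NOOP now does on the numpy backend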
@@ -23,7 +23,7 @@ def einsum_mulacc(einsum, get_strides, expand):
  return mulacc

numpy_fxn_for_op : Dict[Op, Callable] = {**base_fxn_for_op, **{
-  UnaryOps.EXP: lambda x: np.exp(x), UnaryOps.LOG: lambda x: np.log(x),
+  UnaryOps.NOOP: lambda x: np.ascontiguousarray(x), UnaryOps.EXP: lambda x: np.exp(x), UnaryOps.LOG: lambda x: np.log(x),
  BinaryOps.MAX: np.maximum, BinaryOps.CMPEQ: lambda x,y: (x==y).astype(np.float32),
  MovementOps.FLIP: lambda x, axis: np.flip(x, axis), MovementOps.PERMUTE: lambda x, order: x.transpose(order),
  MovementOps.PAD: lambda x, padding: np.pad(x, padding), MovementOps.EXPAND: lambda x, new_shape: np.broadcast_to(x, new_shape),
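In the numpy op table, NOOP becomes np.ascontiguousarray, which returns the input untouched when it is already C-contiguous and otherwise materializes a contiguous copy, exactly the "NOOP means contiguous" reading of the commit title. A quick check of that behavior (plain numpy, not tinygrad code):

import numpy as np

a = np.ones((3, 3), dtype=np.float32)
print(np.ascontiguousarray(a) is a)      # True: already contiguous, no copy is made
v = a[:, ::2]                            # strided view, not contiguous
c = np.ascontiguousarray(v)
print(v.flags['C_CONTIGUOUS'], c.flags['C_CONTIGUOUS'])  # False True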
@@ -5,7 +5,7 @@ from tinygrad.helpers import getenv
from tinygrad.runtime.ops_cpu import base_fxn_for_op, einsum_mulacc

torch_fxn_for_op : Dict[Op, Callable] = {**base_fxn_for_op, **{
-  UnaryOps.EXP: lambda x: x.exp(), UnaryOps.LOG: lambda x: x.log(),
+  UnaryOps.NOOP: lambda x: x.contiguous(), UnaryOps.EXP: lambda x: x.exp(), UnaryOps.LOG: lambda x: x.log(),
  BinaryOps.MAX: torch.maximum, BinaryOps.CMPEQ: lambda x,y: (x==y).float(),
  MovementOps.PAD: lambda x, padding: torch.nn.functional.pad(x, [item for sublist in padding[::-1] for item in sublist]),
  FusedOps.MULACC: einsum_mulacc(torch.einsum, lambda x: x.stride(), lambda x,s: x.expand(s))
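The torch backend mirrors the numpy change: NOOP maps to Tensor.contiguous(), which is likewise a no-op for tensors already laid out contiguously and otherwise copies into a contiguous layout. A minimal illustration (plain PyTorch, not tinygrad code):

import torch

a = torch.ones(3, 3)
print(a.contiguous() is a)              # True: already contiguous, returned as-is
t = a.t()                               # transposed view shares storage, non-contiguous
print(t.is_contiguous())                # False
print(t.contiguous().is_contiguous())   # True: a contiguous copy is made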