From a388d2cb1a1927481040f18e5f4e93ca08ffab7f Mon Sep 17 00:00:00 2001 From: qazal <77887910+Qazalin@users.noreply.github.com> Date: Mon, 15 Sep 2025 12:51:50 +0300 Subject: [PATCH] remove PROFILE=1 option, it's just VIZ=1 [pr] (#12176) * remove PROFILE=1 option, it's just VIZ=1 [pr] * sqtt * sqtt 2 * return last * rename --- .github/workflows/test.yml | 2 +- docs/env_vars.md | 1 - extra/sqtt/README.md | 2 +- test/test_profiler.py | 2 +- test/unit/test_viz.py | 2 +- tinygrad/device.py | 5 ++--- tinygrad/helpers.py | 3 ++- tinygrad/uop/ops.py | 12 +++++------- tinygrad/viz/README | 7 +++---- 9 files changed, 16 insertions(+), 20 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 1a0490da87..61f1605fb3 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -656,7 +656,7 @@ jobs: run: TRANSCENDENTAL=2 python -m pytest -n=auto test/test_ops.py::TestOps::test_sin test/test_ops.py::TestOps::test_cos test/test_ops.py::TestOps::test_tan test/test_ops.py::TestOps::test_exp test/test_ops.py::TestOps::test_log --durations=20 - name: Run TestOps.test_add with SQTT run: | - PROFILE=1 SQTT=1 DEBUG=5 python3 test/test_ops.py TestOps.test_add + VIZ=1 SQTT=1 DEBUG=5 python3 test/test_ops.py TestOps.test_add extra/sqtt/rgptool.py create "/tmp/profile.pkl.$USER" -o /tmp/gpu0.rgp - name: Run process replay tests uses: ./.github/actions/process-replay diff --git a/docs/env_vars.md b/docs/env_vars.md index 44be042bfa..e4129bc169 100644 --- a/docs/env_vars.md +++ b/docs/env_vars.md @@ -42,7 +42,6 @@ DEFAULT_FLOAT | [HALF, ...]| specify the default float dtype (FLOAT32, HAL IMAGE | [1-2] | enable 2d specific optimizations FLOAT16 | [1] | use float16 for images instead of float32 PTX | [1] | enable the specialized [PTX](https://docs.nvidia.com/cuda/parallel-thread-execution/) assembler for Nvidia GPUs. If not set, defaults to generic CUDA codegen backend. -PROFILE | [1] | enable profiling. This feature is supported in NV, AMD, QCOM and METAL backends. VISIBLE_DEVICES | [list[int]]| restricts the NV/AMD devices that are available. The format is a comma-separated list of identifiers (indexing starts with 0). JIT | [0-2] | 0=disabled, 1=[jit enabled](quickstart.md#jit) (default), 2=jit enabled, but graphs are disabled VIZ | [1] | 0=disabled, 1=[viz enabled](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/viz) diff --git a/extra/sqtt/README.md b/extra/sqtt/README.md index 6d739ceb68..1d19ae8f32 100644 --- a/extra/sqtt/README.md +++ b/extra/sqtt/README.md @@ -4,7 +4,7 @@ Only supported on 7900XTX, requires either AM (`rmmod amdgpu`) or disabling power gating on AMD (`ppfeaturemask=0xffff3fff`, don't forget to rebuild initramfs) -SQTT is implemented on top of normal tinygrad PROFILE=1, `PROFILE=1 SQTT=1` to get profile pickle with sqtt data embedded in it. +SQTT is implemented on top of normal tinygrad profiling, `VIZ=1 SQTT=1` to get profile pickle with sqtt data embedded in it. `SQTT_BUFFER_SIZE=X` to change size of SQTT buffer (per shader engine, 6 SEs on 7900xtx) in megabytes, default 256. diff --git a/test/test_profiler.py b/test/test_profiler.py index 15cf8647fb..6143086ca0 100644 --- a/test/test_profiler.py +++ b/test/test_profiler.py @@ -17,7 +17,7 @@ def helper_collect_profile(*devs): cpu_events.clear() profile_list = [] - with Context(PROFILE=1): + with Context(VIZ=1): yield profile_list for dev in devs: dev.synchronize() for dev in devs: dev._at_profile_finalize() diff --git a/test/unit/test_viz.py b/test/unit/test_viz.py index 20dab1f7f4..7ecdbe4172 100644 --- a/test/unit/test_viz.py +++ b/test/unit/test_viz.py @@ -408,7 +408,7 @@ class TestVizProfiler(unittest.TestCase): get_profile(prof) def test_python_marker(self): - with Context(PROFILE=1): + with Context(VIZ=1): a = Tensor.empty(1, device="NULL") b = Tensor.empty(1, device="NULL") (a+b).realize() diff --git a/tinygrad/device.py b/tinygrad/device.py index bc0f6eb64c..6d08da0fd2 100644 --- a/tinygrad/device.py +++ b/tinygrad/device.py @@ -354,9 +354,8 @@ if PROFILE: with open(fn:=temp("profile.pkl", append_user=True), "wb") as f: pickle.dump(cpu_events+Compiled.profile_events+Buffer.profile_events, f) - if not getenv("SQTT", 0): - from tinygrad.uop.ops import launch_viz - launch_viz(PROFILE, fn) + from tinygrad.uop.ops import launch_viz + launch_viz("PROFILE", fn) if __name__ == "__main__": from tinygrad import Tensor, Device diff --git a/tinygrad/helpers.py b/tinygrad/helpers.py index 76ddbc525e..82fe093f6e 100644 --- a/tinygrad/helpers.py +++ b/tinygrad/helpers.py @@ -135,7 +135,7 @@ USE_TC, TC_SELECT, TC_OPT, AMX = ContextVar("TC", 1), ContextVar("TC_SELECT", -1 TRANSCENDENTAL, NOLOCALS = ContextVar("TRANSCENDENTAL", 1), ContextVar("NOLOCALS", 0) FUSE_ARANGE, FUSE_CONV_BW = ContextVar("FUSE_ARANGE", 1), ContextVar("FUSE_CONV_BW", 0) SPLIT_REDUCEOP, NO_MEMORY_PLANNER, RING = ContextVar("SPLIT_REDUCEOP", 1), ContextVar("NO_MEMORY_PLANNER", 0), ContextVar("RING", 1) -PICKLE_BUFFERS, PROFILE, LRU = ContextVar("PICKLE_BUFFERS", 1), ContextVar("PROFILE", getenv("VIZ")), ContextVar("LRU", 1) +PICKLE_BUFFERS, LRU = ContextVar("PICKLE_BUFFERS", 1), ContextVar("LRU", 1) CACHELEVEL, IGNORE_BEAM_CACHE, DEVECTORIZE = ContextVar("CACHELEVEL", 2), ContextVar("IGNORE_BEAM_CACHE", 0), ContextVar("DEVECTORIZE", 1) DISABLE_COMPILER_CACHE, BLOCK_REORDER = ContextVar("DISABLE_COMPILER_CACHE", 0), ContextVar("BLOCK_REORDER", 1) DONT_REALIZE_EXPAND, DONT_GROUP_REDUCES = ContextVar("DONT_REALIZE_EXPAND", 0), ContextVar("DONT_GROUP_REDUCES", 0) @@ -146,6 +146,7 @@ RANGEIFY, FUSE_ATTENTION = ContextVar("RANGEIFY", 0), ContextVar("FUSE_ATTENTION EMULATE = ContextVar("EMULATE", "") CPU_COUNT = ContextVar("CPU_COUNT", max(1, (os.cpu_count() or 1) // (4 if ARCH_X86 else 2))) # take 1/2 of the cores, accounting HT CPU_LLVM, AMD_LLVM = ContextVar("CPU_LLVM", 0), ContextVar("AMD_LLVM", 1) +VIZ = PROFILE = ContextVar("VIZ", 0) @dataclass(frozen=True) class Metadata: diff --git a/tinygrad/uop/ops.py b/tinygrad/uop/ops.py index f070e26f81..46c441d798 100644 --- a/tinygrad/uop/ops.py +++ b/tinygrad/uop/ops.py @@ -7,7 +7,7 @@ from tinygrad.uop import Ops, GroupOp from tinygrad.uop.mathtraits import MathTrait from tinygrad.dtype import ConstType, ImageDType, dtypes, DType, truncate, PtrDType, least_upper_dtype, Invalid, InvalidType from tinygrad.helpers import ContextVar, all_int, prod, getenv, all_same, Context, partition, temp, unwrap, T, argfix, Metadata, flatten, TRACEMETA -from tinygrad.helpers import PICKLE_BUFFERS, PROFILE, dedup, cdiv, cmod, diskcache_put, to_function_name, cpu_profile, TracingKey, RANGEIFY +from tinygrad.helpers import PICKLE_BUFFERS, PROFILE, dedup, cdiv, cmod, diskcache_put, to_function_name, cpu_profile, TracingKey, RANGEIFY, VIZ if TYPE_CHECKING: from tinygrad.shape.shapetracker import ShapeTracker from tinygrad.device import Buffer, MultiBuffer @@ -835,7 +835,6 @@ def track_uop(u:UOp): # *** tracking pattern matcher *** -VIZ = ContextVar("VIZ", 0) TRACK_MATCH_STATS = ContextVar("TRACK_MATCH_STATS", 2 if VIZ else 0) match_stats:dict[UPat, list[int|float]] = dict() @@ -938,7 +937,7 @@ if TRACK_MATCH_STATS or PROFILE: with open(fn:=temp("rewrites.pkl", append_user=True), "wb") as f: print(f"rewrote {len(tracked_ctxs)} graphs and matched {sum(len(r.matches) for x in tracked_ctxs for r in x)} times, saved to {fn}") pickle.dump([(tracked_keys, tracked_ctxs, uop_fields)], f) - if VIZ: launch_viz(VIZ, temp("rewrites.pkl", append_user=True)) + if VIZ: return launch_viz("VIZ", temp("rewrites.pkl", append_user=True)) if getenv("PRINT_MATCH_STATS", TRACK_MATCH_STATS.value): ret = [0,0,0.0,0.0] for k,v in sorted(list(match_stats.items()), key=lambda x: x[1][2]+x[1][3]): @@ -948,11 +947,10 @@ if TRACK_MATCH_STATS or PROFILE: print(f"{ret[0]:6d} / {ret[1]:7d} -- {ret[3]*1000.:9.2f} / {(ret[2]+ret[3])*1000.:9.2f} ms -- TOTAL") print(f"{len(match_stats)} rules, {sum(v[0] > 0 for v in match_stats.values())} matched once") - def launch_viz(var:ContextVar, data:str): - os.environ[(env_str:=var.key)] = "0" + def launch_viz(env_str:str, data:str): + os.environ[env_str] = "0" os.environ[f"{env_str}_DATA"] = data - os.environ[f"{env_str}_VALUE"] = str(var.value) - if not int(os.getenv("VIZ", "0")) and not int(os.getenv("PROFILE", "0")): + if not int(os.getenv("VIZ", "0")) and not int(os.getenv("PROFILE", "0")) and not int(os.getenv("SQTT", "0")): args = ['--kernels', getenv("VIZ_DATA", "")] if getenv("VIZ_DATA", "") else [] args += ['--profile', getenv("PROFILE_DATA", "")] if getenv("PROFILE_DATA", "") else [] os.execv(sys.executable, [sys.executable] + [os.path.join(os.path.dirname(__file__), "../", "viz", "serve.py")] + args) diff --git a/tinygrad/viz/README b/tinygrad/viz/README index ce46d461cf..bdd038e44c 100644 --- a/tinygrad/viz/README +++ b/tinygrad/viz/README @@ -6,19 +6,18 @@ most uses of DEBUG >= 3 tiny-tools and a viewer for: -SAVE_SCHEDULE=1 TRACK_MATCH_STATS=2 -PROFILE=1 +ProfileEvents to use: -1. Run tinygrad with VIZ=1 and/or PROFILE=1 (this saves the pkls and launches the server (new process please!)) +1. Run tinygrad with VIZ=1 (this saves the pkls and launches the server (new process please!)) 2. That's it! This should be able to: 1. See all schedules (VIZ=1) 2. See all graphs and how they were rewritten (VIZ=1) 3. See generated code (VIZ=1) -4. See profile (PROFILE=1) +4. See profile (click on 'profiler') bunch of dev rules: * everything must be responsive to keyboard smashing! lag should never happen