remove PROFILE=1 option, it's just VIZ=1 [pr] (#12176)

* remove PROFILE=1 option, it's just VIZ=1 [pr]

* sqtt

* sqtt 2

* return last

* rename
This commit is contained in:
qazal
2025-09-15 12:51:50 +03:00
committed by GitHub
parent 65397bfdeb
commit a388d2cb1a
9 changed files with 16 additions and 20 deletions

View File

@@ -656,7 +656,7 @@ jobs:
run: TRANSCENDENTAL=2 python -m pytest -n=auto test/test_ops.py::TestOps::test_sin test/test_ops.py::TestOps::test_cos test/test_ops.py::TestOps::test_tan test/test_ops.py::TestOps::test_exp test/test_ops.py::TestOps::test_log --durations=20
- name: Run TestOps.test_add with SQTT
run: |
PROFILE=1 SQTT=1 DEBUG=5 python3 test/test_ops.py TestOps.test_add
VIZ=1 SQTT=1 DEBUG=5 python3 test/test_ops.py TestOps.test_add
extra/sqtt/rgptool.py create "/tmp/profile.pkl.$USER" -o /tmp/gpu0.rgp
- name: Run process replay tests
uses: ./.github/actions/process-replay

View File

@@ -42,7 +42,6 @@ DEFAULT_FLOAT | [HALF, ...]| specify the default float dtype (FLOAT32, HAL
IMAGE | [1-2] | enable 2d specific optimizations
FLOAT16 | [1] | use float16 for images instead of float32
PTX | [1] | enable the specialized [PTX](https://docs.nvidia.com/cuda/parallel-thread-execution/) assembler for Nvidia GPUs. If not set, defaults to generic CUDA codegen backend.
PROFILE | [1] | enable profiling. This feature is supported in NV, AMD, QCOM and METAL backends.
VISIBLE_DEVICES | [list[int]]| restricts the NV/AMD devices that are available. The format is a comma-separated list of identifiers (indexing starts with 0).
JIT | [0-2] | 0=disabled, 1=[jit enabled](quickstart.md#jit) (default), 2=jit enabled, but graphs are disabled
VIZ | [1] | 0=disabled, 1=[viz enabled](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/viz)

View File

@@ -4,7 +4,7 @@
Only supported on 7900XTX, requires either AM (`rmmod amdgpu`) or disabling power gating on AMD (`ppfeaturemask=0xffff3fff`, don't forget to rebuild initramfs)
SQTT is implemented on top of normal tinygrad PROFILE=1, `PROFILE=1 SQTT=1` to get profile pickle with sqtt data embedded in it.
SQTT is implemented on top of normal tinygrad profiling; run with `VIZ=1 SQTT=1` to get a profile pickle with SQTT data embedded in it.
Set `SQTT_BUFFER_SIZE=X` to change the size of the SQTT buffer (per shader engine, 6 SEs on 7900XTX) in megabytes; the default is 256.

View File

@@ -17,7 +17,7 @@ def helper_collect_profile(*devs):
cpu_events.clear()
profile_list = []
with Context(PROFILE=1):
with Context(VIZ=1):
yield profile_list
for dev in devs: dev.synchronize()
for dev in devs: dev._at_profile_finalize()

View File

@@ -408,7 +408,7 @@ class TestVizProfiler(unittest.TestCase):
get_profile(prof)
def test_python_marker(self):
with Context(PROFILE=1):
with Context(VIZ=1):
a = Tensor.empty(1, device="NULL")
b = Tensor.empty(1, device="NULL")
(a+b).realize()

View File

@@ -354,9 +354,8 @@ if PROFILE:
with open(fn:=temp("profile.pkl", append_user=True), "wb") as f: pickle.dump(cpu_events+Compiled.profile_events+Buffer.profile_events, f)
if not getenv("SQTT", 0):
from tinygrad.uop.ops import launch_viz
launch_viz(PROFILE, fn)
from tinygrad.uop.ops import launch_viz
launch_viz("PROFILE", fn)
if __name__ == "__main__":
from tinygrad import Tensor, Device

View File

@@ -135,7 +135,7 @@ USE_TC, TC_SELECT, TC_OPT, AMX = ContextVar("TC", 1), ContextVar("TC_SELECT", -1
TRANSCENDENTAL, NOLOCALS = ContextVar("TRANSCENDENTAL", 1), ContextVar("NOLOCALS", 0)
FUSE_ARANGE, FUSE_CONV_BW = ContextVar("FUSE_ARANGE", 1), ContextVar("FUSE_CONV_BW", 0)
SPLIT_REDUCEOP, NO_MEMORY_PLANNER, RING = ContextVar("SPLIT_REDUCEOP", 1), ContextVar("NO_MEMORY_PLANNER", 0), ContextVar("RING", 1)
PICKLE_BUFFERS, PROFILE, LRU = ContextVar("PICKLE_BUFFERS", 1), ContextVar("PROFILE", getenv("VIZ")), ContextVar("LRU", 1)
PICKLE_BUFFERS, LRU = ContextVar("PICKLE_BUFFERS", 1), ContextVar("LRU", 1)
CACHELEVEL, IGNORE_BEAM_CACHE, DEVECTORIZE = ContextVar("CACHELEVEL", 2), ContextVar("IGNORE_BEAM_CACHE", 0), ContextVar("DEVECTORIZE", 1)
DISABLE_COMPILER_CACHE, BLOCK_REORDER = ContextVar("DISABLE_COMPILER_CACHE", 0), ContextVar("BLOCK_REORDER", 1)
DONT_REALIZE_EXPAND, DONT_GROUP_REDUCES = ContextVar("DONT_REALIZE_EXPAND", 0), ContextVar("DONT_GROUP_REDUCES", 0)
@@ -146,6 +146,7 @@ RANGEIFY, FUSE_ATTENTION = ContextVar("RANGEIFY", 0), ContextVar("FUSE_ATTENTION
EMULATE = ContextVar("EMULATE", "")
CPU_COUNT = ContextVar("CPU_COUNT", max(1, (os.cpu_count() or 1) // (4 if ARCH_X86 else 2))) # take 1/2 of the cores, accounting HT
CPU_LLVM, AMD_LLVM = ContextVar("CPU_LLVM", 0), ContextVar("AMD_LLVM", 1)
VIZ = PROFILE = ContextVar("VIZ", 0)
@dataclass(frozen=True)
class Metadata:

View File

@@ -7,7 +7,7 @@ from tinygrad.uop import Ops, GroupOp
from tinygrad.uop.mathtraits import MathTrait
from tinygrad.dtype import ConstType, ImageDType, dtypes, DType, truncate, PtrDType, least_upper_dtype, Invalid, InvalidType
from tinygrad.helpers import ContextVar, all_int, prod, getenv, all_same, Context, partition, temp, unwrap, T, argfix, Metadata, flatten, TRACEMETA
from tinygrad.helpers import PICKLE_BUFFERS, PROFILE, dedup, cdiv, cmod, diskcache_put, to_function_name, cpu_profile, TracingKey, RANGEIFY
from tinygrad.helpers import PICKLE_BUFFERS, PROFILE, dedup, cdiv, cmod, diskcache_put, to_function_name, cpu_profile, TracingKey, RANGEIFY, VIZ
if TYPE_CHECKING:
from tinygrad.shape.shapetracker import ShapeTracker
from tinygrad.device import Buffer, MultiBuffer
@@ -835,7 +835,6 @@ def track_uop(u:UOp):
# *** tracking pattern matcher ***
VIZ = ContextVar("VIZ", 0)
TRACK_MATCH_STATS = ContextVar("TRACK_MATCH_STATS", 2 if VIZ else 0)
match_stats:dict[UPat, list[int|float]] = dict()
@@ -938,7 +937,7 @@ if TRACK_MATCH_STATS or PROFILE:
with open(fn:=temp("rewrites.pkl", append_user=True), "wb") as f:
print(f"rewrote {len(tracked_ctxs)} graphs and matched {sum(len(r.matches) for x in tracked_ctxs for r in x)} times, saved to {fn}")
pickle.dump([(tracked_keys, tracked_ctxs, uop_fields)], f)
if VIZ: launch_viz(VIZ, temp("rewrites.pkl", append_user=True))
if VIZ: return launch_viz("VIZ", temp("rewrites.pkl", append_user=True))
if getenv("PRINT_MATCH_STATS", TRACK_MATCH_STATS.value):
ret = [0,0,0.0,0.0]
for k,v in sorted(list(match_stats.items()), key=lambda x: x[1][2]+x[1][3]):
@@ -948,11 +947,10 @@ if TRACK_MATCH_STATS or PROFILE:
print(f"{ret[0]:6d} / {ret[1]:7d} -- {ret[3]*1000.:9.2f} / {(ret[2]+ret[3])*1000.:9.2f} ms -- TOTAL")
print(f"{len(match_stats)} rules, {sum(v[0] > 0 for v in match_stats.values())} matched once")
def launch_viz(var:ContextVar, data:str):
os.environ[(env_str:=var.key)] = "0"
def launch_viz(env_str:str, data:str):
os.environ[env_str] = "0"
os.environ[f"{env_str}_DATA"] = data
os.environ[f"{env_str}_VALUE"] = str(var.value)
if not int(os.getenv("VIZ", "0")) and not int(os.getenv("PROFILE", "0")):
if not int(os.getenv("VIZ", "0")) and not int(os.getenv("PROFILE", "0")) and not int(os.getenv("SQTT", "0")):
args = ['--kernels', getenv("VIZ_DATA", "")] if getenv("VIZ_DATA", "") else []
args += ['--profile', getenv("PROFILE_DATA", "")] if getenv("PROFILE_DATA", "") else []
os.execv(sys.executable, [sys.executable] + [os.path.join(os.path.dirname(__file__), "../", "viz", "serve.py")] + args)

View File

@@ -6,19 +6,18 @@ most uses of DEBUG >= 3
tiny-tools
and a viewer for:
SAVE_SCHEDULE=1
TRACK_MATCH_STATS=2
PROFILE=1
ProfileEvents
to use:
1. Run tinygrad with VIZ=1 and/or PROFILE=1 (this saves the pkls and launches the server (new process please!))
1. Run tinygrad with VIZ=1 (this saves the pkls and launches the server (new process please!))
2. That's it!
This should be able to:
1. See all schedules (VIZ=1)
2. See all graphs and how they were rewritten (VIZ=1)
3. See generated code (VIZ=1)
4. See profile (PROFILE=1)
4. See profile (click on 'profiler')
bunch of dev rules:
* everything must be responsive to keyboard smashing! lag should never happen