Merge branch 'master' into asm_ucode

This commit is contained in:
George Hotz
2026-01-06 00:16:03 -08:00
committed by GitHub
7 changed files with 92 additions and 143 deletions

View File

@@ -14,10 +14,12 @@ on:
paths:
- 'tinygrad/runtime/autogen/**/*'
- 'tinygrad/runtime/support/autogen.py'
- '.github/workflows/autogen.yml'
workflow_dispatch:
paths:
- 'tinygrad/runtime/autogen/**/*'
- 'tinygrad/runtime/support/autogen.py'
- '.github/workflows/autogen.yml'
jobs:
autogen:
@@ -39,125 +41,45 @@ jobs:
pydeps: 'pyyaml mako'
- name: Install autogen support packages
run: sudo apt-get install -y --no-install-recommends libclang-20-dev llvm-20-dev hip-dev libusb-1.0-0-dev
- name: Verify OpenCL autogen
continue-on-error: true
- name: Regenerate autogen files
run: |
mv tinygrad/runtime/autogen/opencl.py /tmp/opencl.py.bak
rm tinygrad/runtime/autogen/opencl.py
python3 -c "from tinygrad.runtime.autogen import opencl"
diff /tmp/opencl.py.bak tinygrad/runtime/autogen/opencl.py
- name: Verify CUDA autogen
continue-on-error: true
run: |
mv tinygrad/runtime/autogen/cuda.py /tmp/cuda.py.bak
mv tinygrad/runtime/autogen/nvrtc.py /tmp/nvrtc.py.bak
mv tinygrad/runtime/autogen/nvjitlink.py /tmp/nvjitlink.py.bak
mv tinygrad/runtime/autogen/nv_570.py /tmp/nv_570.py.bak
mv tinygrad/runtime/autogen/nv.py /tmp/nv.py.bak
rm tinygrad/runtime/autogen/{cuda,nvrtc,nvjitlink,nv_570,nv}.py
python3 -c "from tinygrad.runtime.autogen import cuda, nvrtc, nvjitlink, nv_570, nv"
diff /tmp/cuda.py.bak tinygrad/runtime/autogen/cuda.py
diff /tmp/nvrtc.py.bak tinygrad/runtime/autogen/nvrtc.py
diff /tmp/nvjitlink.py.bak tinygrad/runtime/autogen/nvjitlink.py
diff /tmp/nv_570.py.bak tinygrad/runtime/autogen/nv_570.py
diff /tmp/nv.py.bak tinygrad/runtime/autogen/nv.py
- name: Verify AMD autogen
continue-on-error: true
run: |
mv tinygrad/runtime/autogen/comgr.py /tmp/comgr.py.bak
mv tinygrad/runtime/autogen/hsa.py /tmp/hsa.py.bak
mv tinygrad/runtime/autogen/hip.py /tmp/hip.py.bak
mv tinygrad/runtime/autogen/amd_gpu.py /tmp/amd_gpu.py.bak
mv tinygrad/runtime/autogen/sqtt.py /tmp/sqtt.py.bak
mv tinygrad/runtime/autogen/rocprof.py /tmp/rocprof.py.bak
mv tinygrad/runtime/autogen/am/am.py /tmp/am_am.py.bak
mv tinygrad/runtime/autogen/am/pm4_soc15.py /tmp/am_pm4_soc15.py.bak
mv tinygrad/runtime/autogen/am/pm4_nv.py /tmp/am_pm4_nv.py.bak
mv tinygrad/runtime/autogen/am/sdma_4_0_0.py /tmp/am_sdma_4_0_0.py.bak
mv tinygrad/runtime/autogen/am/sdma_5_0_0.py /tmp/am_sdma_5_0_0.py.bak
mv tinygrad/runtime/autogen/am/sdma_6_0_0.py /tmp/am_sdma_6_0_0.py.bak
mv tinygrad/runtime/autogen/am/smu_v13_0_0.py /tmp/am_smu_v13_0_0.py.bak
mv tinygrad/runtime/autogen/am/smu_v14_0_2.py /tmp/am_smu_v14_0_2.py.bak
python3 -c "from tinygrad.runtime.autogen import comgr, hsa, hip, amd_gpu, sqtt, rocprof; from tinygrad.runtime.autogen.am import am, pm4_soc15, pm4_nv, sdma_4_0_0, sdma_5_0_0, sdma_6_0_0, smu_v13_0_0, smu_v14_0_2"
diff /tmp/comgr.py.bak tinygrad/runtime/autogen/comgr.py
diff /tmp/hsa.py.bak tinygrad/runtime/autogen/hsa.py
diff /tmp/hip.py.bak tinygrad/runtime/autogen/hip.py
diff /tmp/amd_gpu.py.bak tinygrad/runtime/autogen/amd_gpu.py
diff /tmp/sqtt.py.bak tinygrad/runtime/autogen/sqtt.py
diff /tmp/rocprof.py.bak tinygrad/runtime/autogen/rocprof.py
diff /tmp/am_am.py.bak tinygrad/runtime/autogen/am/am.py
diff /tmp/am_pm4_soc15.py.bak tinygrad/runtime/autogen/am/pm4_soc15.py
diff /tmp/am_pm4_nv.py.bak tinygrad/runtime/autogen/am/pm4_nv.py
diff /tmp/am_sdma_4_0_0.py.bak tinygrad/runtime/autogen/am/sdma_4_0_0.py
diff /tmp/am_sdma_5_0_0.py.bak tinygrad/runtime/autogen/am/sdma_5_0_0.py
diff /tmp/am_sdma_6_0_0.py.bak tinygrad/runtime/autogen/am/sdma_6_0_0.py
diff /tmp/am_smu_v13_0_0.py.bak tinygrad/runtime/autogen/am/smu_v13_0_0.py
diff /tmp/am_smu_v14_0_2.py.bak tinygrad/runtime/autogen/am/smu_v14_0_2.py
- name: Verify Linux autogen
continue-on-error: true
run: |
mv tinygrad/runtime/autogen/libc.py /tmp/libc.py.bak
mv tinygrad/runtime/autogen/kfd.py /tmp/kfd.py.bak
mv tinygrad/runtime/autogen/io_uring.py /tmp/io_uring.py.bak
mv tinygrad/runtime/autogen/ib.py /tmp/ib.py.bak
mv tinygrad/runtime/autogen/pci.py /tmp/pci.py.bak
mv tinygrad/runtime/autogen/vfio.py /tmp/vfio.py.bak
rm tinygrad/runtime/autogen/{comgr,hsa,hip,amd_gpu,sqtt,rocprof}.py
python3 -c "from tinygrad.runtime.autogen import comgr, hsa, hip, amd_gpu, sqtt, rocprof"
rm tinygrad/runtime/autogen/am/{am,pm4_soc15,pm4_nv,sdma_4_0_0,sdma_5_0_0,sdma_6_0_0,smu_v13_0_0,smu_v14_0_2}.py
python3 -c "from tinygrad.runtime.autogen.am import am, pm4_soc15, pm4_nv, sdma_4_0_0, sdma_5_0_0, sdma_6_0_0, smu_v13_0_0, smu_v14_0_2"
rm tinygrad/runtime/autogen/{libc,kfd,io_uring,ib,pci,vfio}.py
python3 -c "from tinygrad.runtime.autogen import libc, kfd, io_uring, ib, pci, vfio"
diff /tmp/libc.py.bak tinygrad/runtime/autogen/libc.py
diff /tmp/kfd.py.bak tinygrad/runtime/autogen/kfd.py
diff /tmp/io_uring.py.bak tinygrad/runtime/autogen/io_uring.py
diff /tmp/ib.py.bak tinygrad/runtime/autogen/ib.py
diff /tmp/pci.py.bak tinygrad/runtime/autogen/pci.py
diff /tmp/vfio.py.bak tinygrad/runtime/autogen/vfio.py
- name: Verify LLVM autogen
continue-on-error: true
run: |
mv tinygrad/runtime/autogen/llvm.py /tmp/llvm.py.bak
rm tinygrad/runtime/autogen/llvm.py
python3 -c "from tinygrad.runtime.autogen import llvm"
diff /tmp/llvm.py.bak tinygrad/runtime/autogen/llvm.py
- name: Verify WebGPU autogen
continue-on-error: true
run: |
mv tinygrad/runtime/autogen/webgpu.py /tmp/webgpu.py.bak
rm tinygrad/runtime/autogen/webgpu.py
python3 -c "from tinygrad.runtime.autogen import webgpu"
diff /tmp/webgpu.py.bak tinygrad/runtime/autogen/webgpu.py
- name: Verify Qualcomm autogen
continue-on-error: true
run: |
mv tinygrad/runtime/autogen/kgsl.py /tmp/kgsl.py.bak
mv tinygrad/runtime/autogen/qcom_dsp.py /tmp/qcom_dsp.py.bak
rm tinygrad/runtime/autogen/{kgsl,qcom_dsp}.py
python3 -c "from tinygrad.runtime.autogen import kgsl, qcom_dsp"
diff /tmp/kgsl.py.bak tinygrad/runtime/autogen/kgsl.py
diff /tmp/qcom_dsp.py.bak tinygrad/runtime/autogen/qcom_dsp.py
- name: Verify libusb autogen
continue-on-error: true
run: |
mv tinygrad/runtime/autogen/libusb.py /tmp/libusb.py.bak
rm tinygrad/runtime/autogen/libusb.py
python3 -c "from tinygrad.runtime.autogen import libusb"
diff /tmp/libusb.py.bak tinygrad/runtime/autogen/libusb.py
- name: Verify mesa autogen
continue-on-error: true
run: |
mv tinygrad/runtime/autogen/mesa.py /tmp/mesa.py.bak
rm tinygrad/runtime/autogen/mesa.py
python3 -c "from tinygrad.runtime.autogen import mesa"
diff /tmp/mesa.py.bak tinygrad/runtime/autogen/mesa.py
- name: Verify libclang autogen
continue-on-error: true
run: |
cp tinygrad/runtime/autogen/libclang.py /tmp/libclang.py.bak
rm tinygrad/runtime/autogen/avcodec.py
python3 -c "from tinygrad.runtime.autogen import avcodec"
REGEN=1 python3 -c "from tinygrad.runtime.autogen import libclang"
diff /tmp/libclang.py.bak tinygrad/runtime/autogen/libclang.py
- name: Generate patch for differences
- name: Check for differences
run: |
if ! git diff --quiet; then
git diff > autogen-ubuntu.patch
echo "Autogen files out of date. Apply patch from: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}#artifacts"
exit 1
fi
- name: Upload patch artifact
if: failure()
uses: actions/upload-artifact@v4
with:
name: autogen-ubuntu-patch
path: autogen-ubuntu.patch
if-no-files-found: ignore
- name: Fail if differences found
run: git diff --quiet
autogen-mac:
name: In-tree Autogen (macos)
runs-on: macos-14
@@ -169,25 +91,24 @@ jobs:
uses: ./.github/actions/setup-tinygrad
with:
llvm: 'true'
- name: Verify macos autogen
continue-on-error: true
- name: Regenerate autogen files
run: |
mv tinygrad/runtime/autogen/metal.py /tmp/metal.py.bak
rm tinygrad/runtime/autogen/metal.py
LIBCLANG_PATH=/opt/homebrew/opt/llvm@20/lib/libclang.dylib python3 -c "from tinygrad.runtime.autogen import metal"
diff /tmp/metal.py.bak tinygrad/runtime/autogen/metal.py
- name: Generate patch for differences
- name: Check for differences
run: |
if ! git diff --quiet; then
git diff > autogen-macos.patch
echo "Autogen files out of date. Apply patch from: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}#artifacts"
exit 1
fi
- name: Upload patch artifact
if: failure()
uses: actions/upload-artifact@v4
with:
name: autogen-macos-patch
path: autogen-macos.patch
if-no-files-found: ignore
- name: Fail if differences found
run: git diff --quiet
autogen-comgr-3:
name: In-tree Autogen (comgr 3)
runs-on: ubuntu-24.04
@@ -206,22 +127,20 @@ jobs:
echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600
sudo apt -qq update || true
sudo apt-get install -y --no-install-recommends libclang-20-dev comgr
- name: Verify comgr (3) autogen
continue-on-error: true
- name: Regenerate autogen files
run: |
mv tinygrad/runtime/autogen/comgr_3.py /tmp/comgr_3.py.bak
rm tinygrad/runtime/autogen/comgr_3.py
python3 -c "from tinygrad.runtime.autogen import comgr_3"
diff /tmp/comgr_3.py.bak tinygrad/runtime/autogen/comgr_3.py
- name: Generate patch for differences
- name: Check for differences
run: |
if ! git diff --quiet; then
git diff > autogen-comgr3.patch
echo "Autogen files out of date. Apply patch from: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}#artifacts"
exit 1
fi
- name: Upload patch artifact
if: failure()
uses: actions/upload-artifact@v4
with:
name: autogen-comgr3-patch
path: autogen-comgr3.patch
if-no-files-found: ignore
- name: Fail if differences found
run: git diff --quiet

View File

@@ -312,28 +312,31 @@ class Group:
idxs = tuple(idx * st.cols if i == 3 else idx for i, idx in enumerate(idxs))
src_i = ((idxs[0] * src.shape[-3] + idxs[1]) * src.shape[-2] + idxs[2]) * src.shape[-1] + idxs[3]
for height in self.ker.range(dst.shape[-4], track=False):
for width in self.ker.range(dst.shape[-3], track=False):
elements_per_thread = st.base_shape.elements_per_thread
memcpy_per_row = st.base_shape.cols // elements_per_thread
total_calls = st.base_shape.num_elements // (self.group_threads * elements_per_thread)
elements_per_thread = st.base_shape.elements_per_thread
memcpy_per_row = st.cols // elements_per_thread
total_calls = (dst.shape[-4] * dst.shape[-3] * st.base_shape.num_elements) // (self.group_threads * elements_per_thread)
for outer in self.ker.range(total_calls, track=False):
for inner in self.ker.range(elements_per_thread, axis_type=AxisType.UPCAST, track=False):
load_idx = outer * self.group_threads + self.laneid
row = load_idx // memcpy_per_row
col = (load_idx * elements_per_thread) % st.base_shape.cols + inner
for outer in self.ker.range(total_calls, track=False):
for inner in self.ker.range(elements_per_thread, axis_type=AxisType.UPCAST, track=False):
load_idx = outer * self.group_threads + self.laneid
row = load_idx // memcpy_per_row
col = (load_idx * elements_per_thread) % st.cols + inner
height = row // st.base_shape.rows
width = col // st.base_shape.cols
srow, scol = cast(ST, dst).swizzle(row, col)
row = row % st.base_shape.rows
col = col % st.base_shape.cols
src_i += height * st.base_shape.rows * row_stride + width * st.base_shape.cols
src_i += row * row_stride + col
srow, scol = cast(ST, dst).swizzle(row, col)
src_load = srcf[src_i]
if src.dtype.base != dst.dtype.base:
src_load = src_load.cast(dst.dtype.base)
dst_store = dst[*dst_idxs, height, width, srow, scol].store(src_load)
dst_store = dst_store.end(height, width, outer, inner).barrier()
src_i += height * st.base_shape.rows * row_stride + width * st.base_shape.cols
src_i += row * row_stride + col
src_load = srcf[src_i]
if src.dtype.base != dst.dtype.base:
src_load = src_load.cast(dst.dtype.base)
dst_store = dst[*dst_idxs, height, width, srow, scol].store(src_load)
dst_store = dst_store.end(height, width, outer, inner).barrier()
elif dst_dtype.addrspace == AddrSpace.REG and src_dtype.addrspace == AddrSpace.GLOBAL and isinstance(dst, RT):
srcf = src.flatten()
row_stride = prod(src.shape[axis+1:])

View File

@@ -178,5 +178,21 @@ class TestCfg(unittest.TestCase):
s_endpgm(),
])
def test_colored_blocks(self):
  """Assemble a chain of ten small countdown loops and hand it to run_asm.

  The program starts at `entry`, which branches to `init0`. For each index i
  in 0..9 two labelled sections are emitted:

  * `init{i}`: moves i + 1 into s[1] and branches to `loop{i}`.
  * `loop{i}`: an s_nop with argument i & 7, an add of -1 to s[1], a compare
    of s[1] against 0, a conditional branch (s_cbranch_scc0) back to
    `loop{i}`, then an unconditional branch to `init{i+1}`, or to `end`
    for the last index.

  The final `end` label holds s_endpgm.
  """
  num_blocks = 10
  program = ["entry:", s_branch("init0")]
  for idx in range(num_blocks):
    loop_label = f"loop{idx}"
    # the last loop leaves the chain instead of entering another init block
    next_label = f"init{idx+1}" if idx + 1 < num_blocks else "end"
    program.extend([f"init{idx}:", s_mov_b32(s[1], idx + 1), s_branch(loop_label)])
    program.extend([
      f"{loop_label}:",
      s_nop(idx & 7),
      s_add_u32(s[1], s[1], -1),
      s_cmp_eq_i32(s[1], 0),
      s_cbranch_scc0(loop_label),
      s_branch(next_label),
    ])
  program.extend(["end:", s_endpgm()])
  run_asm("test_colored_blocks", program)
if __name__ == "__main__":
unittest.main()

View File

@@ -1,7 +1,6 @@
# mypy: ignore-errors
import ctypes
from tinygrad.helpers import unwrap
from tinygrad.runtime.support.c import Struct, CEnum, _IO, _IOW, _IOR, _IOWR
from tinygrad.runtime.support.c import DLL, Struct, CEnum, _IO, _IOW, _IOR, _IOWR
enum_HEVCNALUnitType = CEnum(ctypes.c_uint32)
HEVC_NAL_TRAIL_N = enum_HEVCNALUnitType.define('HEVC_NAL_TRAIL_N', 0)
HEVC_NAL_TRAIL_R = enum_HEVCNALUnitType.define('HEVC_NAL_TRAIL_R', 1)

View File

@@ -55,8 +55,11 @@ function addTags(root) {
root.selectAll("text").data(d => [d]).join("text").text(d => d).attr("dy", "0.35em");
}
const colorScale = d3.scaleSequential(t => t > 0 ? d3.interpolateLab(colorScheme.ACTIVE[1], colorScheme.ACTIVE[2])(t) : colorScheme.ACTIVE[0]).clamp(true);
const drawGraph = (data) => {
const g = dagre.graphlib.json.read(data);
if (data.value.colorDomain != null) colorScale.domain(data.value.colorDomain);
// draw nodes
d3.select("#graph-svg").on("click", () => d3.selectAll(".highlight").classed("highlight", false));
const nodes = d3.select("#nodes").selectAll("g").data(g.nodes().map(id => g.node(id)), d => d).join("g").attr("class", d => d.className ?? "node")
@@ -88,7 +91,7 @@ const drawGraph = (data) => {
}
return [ret];
}).join("text").selectAll("tspan").data(d => d).join("tspan").attr("x", "0").attr("dy", 14).selectAll("tspan").data(d => d).join("tspan")
.attr("fill", d => d.color).text(d => d.st).attr("xml:space", "preserve").style("font-family", g.graph().font);
.attr("fill", d => typeof d.color === "string" ? d.color : colorScale(d.color)).text(d => d.st).attr("xml:space", "preserve").style("font-family", g.graph().font);
addTags(nodes.selectAll("g.tag").data(d => d.tag != null ? [d] : []).join("g").attr("class", "tag")
.attr("transform", d => `translate(${-d.width/2+8}, ${-d.height/2+8})`).datum(e => e.tag));
// draw edges
@@ -154,7 +157,8 @@ const formatUnit = (d, unit="") => d3.format(".3~s")(d)+unit;
const colorScheme = {TINY:new Map([["Schedule","#1b5745"],["get_program","#1d2e62"],["compile","#63b0cd"],["DEFAULT","#354f52"]]),
DEFAULT:["#2b2e39", "#2c2f3a", "#31343f", "#323544", "#2d303a", "#2e313c", "#343746", "#353847", "#3c4050", "#404459", "#444862", "#4a4e65"],
BUFFER:["#342483", "#3E2E94", "#4938A4", "#5442B4", "#5E4CC2", "#674FCA"], SE:new Map([["OCC", "#101725"], ["INST", "#0A2042"]]),}
BUFFER:["#342483", "#3E2E94", "#4938A4", "#5442B4", "#5E4CC2", "#674FCA"], SE:new Map([["OCC", "#101725"], ["INST", "#0A2042"]]),
ACTIVE:["#565f89", "#c8d3f5", "#7aa2f7"]}
const cycleColors = (lst, i) => lst[i%lst.length];
const rescaleTrack = (source, tid, k) => {
@@ -811,6 +815,7 @@ async function main() {
}
return table;
}
if (ret.data != null) renderDag(ret, { recenter:true });
if (ret.cols != null) renderTable(root, ret);
else if (ret.src != null) root.append(() => codeBlock(ret.src, ret.lang));
return document.querySelector("#custom").replaceChildren(root.node());

View File

@@ -13,21 +13,26 @@ onmessage = (e) => {
self.close();
}
const layoutCfg = (g, { blocks, paths, pc_table, colors }) => {
const layoutCfg = (g, { blocks, paths, pc_table, counters, colors }) => {
g.setGraph({ rankdir:"TD", font:"monospace" });
ctx.font = `350 ${LINE_HEIGHT}px ${g.graph().font}`;
// basic blocks render the assembly in nodes
let maxColor = 0;
for (const [lead, members] of Object.entries(blocks)) {
let [width, height, label] = [0, 0, []];
for (const m of members) {
const text = pc_table[m][0];
if (counters != null) {
const num = counters[m]?.hit_count || 0;
if (num > maxColor) maxColor = num;
label.push([{st:text, color:num}]);
} else { const [inst, ...operands] = text.split(" "); label.push([{st:inst+" ", color:"#7aa2f7"}, {st:operands.join(" "), color:"#9aa5ce"}]); }
width = Math.max(width, ctx.measureText(text).width);
height += LINE_HEIGHT;
const [inst, ...operands] = text.split(" ");
label.push([{st:inst+" ", color:"#7aa2f7"}, {st:operands.join(" "), color:"#9aa5ce"}]);
}
g.setNode(lead, { ...rectDims(width, height), label, id:lead, color:"#1a1b26" });
}
g.graph().colorDomain = [0, maxColor];
// paths become edges between basic blocks
for (const [lead, value] of Object.entries(paths)) {
for (const [id, color] of Object.entries(value)) g.setEdge(lead, id, {label:{type:"port", text:""}, color:colors[color]});

View File

@@ -285,7 +285,7 @@ def unpack_sqtt(key:tuple[str, int], data:list, p:ProfileProgramEvent) -> tuple[
n = next(inst_units[u])
if (events:=cu_events.get(w.cu_loc)) is None: cu_events[w.cu_loc] = events = []
events.append(ProfileRangeEvent(w.simd_loc, loc:=f"INST WAVE:{w.wave_id} N:{n}", Decimal(w.begin_time), Decimal(w.end_time)))
wave_insts.setdefault(w.cu_loc, {})[f"{u} N:{n}"] = {"wave":w, "disasm":disasm, "run_number":n, "loc":loc}
wave_insts.setdefault(w.cu_loc, {})[f"{u} N:{n}"] = {"wave":w, "disasm":disasm, "prg":p, "run_number":n, "loc":loc}
# * OCC waves
units:dict[str, itertools.count] = {}
wave_start:dict[str, int] = {}
@@ -490,7 +490,9 @@ def get_render(query:str) -> dict:
prev_instr = max(prev_instr, e.time + e.dur)
summary = [{"label":"Total Cycles", "value":w.end_time-w.begin_time}, {"label":"SE", "value":w.se}, {"label":"CU", "value":w.cu},
{"label":"SIMD", "value":w.simd}, {"label":"Wave ID", "value":w.wave_id}, {"label":"Run number", "value":data["run_number"]}]
return {"rows":[tuple(v.values()) for v in rows.values()], "cols":columns, "metadata":[summary]}
cfg = amdgpu_cfg((p:=data["prg"]).lib, device_props[p.device]["gfx_target_version"])["data"]
cfg["counters"] = {pc-p.base:v for pc,v in rows.items()}
return {"rows":[tuple(v.values()) for v in rows.values()], "cols":columns, "metadata":[summary], "data":cfg}
return data
# ** HTTP server