From 6ea358610189bfa4397a94346af14f3cd30774b7 Mon Sep 17 00:00:00 2001 From: George Hotz Date: Fri, 2 Jan 2026 16:45:34 -0800 Subject: [PATCH] short --- extra/assembly/amd/emu.py | 86 ++++++--------------------------------- 1 file changed, 13 insertions(+), 73 deletions(-) diff --git a/extra/assembly/amd/emu.py b/extra/assembly/amd/emu.py index 906722c469..8aa2638799 100644 --- a/extra/assembly/amd/emu.py +++ b/extra/assembly/amd/emu.py @@ -423,65 +423,17 @@ def exec_wmma(st: WaveState, inst, op: VOP3POp) -> None: # SQTT TRACING # ═══════════════════════════════════════════════════════════════════════════════ -WAVESTART_TO_INST_CYCLES = 32 # cycles from WAVESTART to first instruction - -# Issue intervals (fixed, independent of lane count) -VALU_ISSUE_CYCLES = 1 -TRANS_ISSUE_CYCLES = 4 -DP_ISSUE_CYCLES = 32 -SALU_ISSUE_CYCLES = 1 - -# ALU latencies (cycles from dispatch to result ready / ALUEXEC) +WAVESTART_TO_INST_CYCLES = 32 VALU_LATENCY = 6 -SALU_LATENCY = 2 -TRANS_LATENCY = 9 -DP_LATENCY = 38 - -# Pipeline delay from last ALU dispatch to first s_nop IMMEDIATE SNOP_PIPELINE_DELAY = 3 - -# s_nop(N) delays the next instruction's issue by: -# issue_delay = N + 1 + SNOP_ISSUE_OVERHEAD + bypass_penalty + extra_stall -# -# where: -# - SNOP_ISSUE_OVERHEAD = 3 (pipeline overhead) -# - bypass_penalty = 4 if N >= 4 and pending ALUEXEC (register cache bypass timeout) -# - extra_stall = 4 if N in 11-22 and pending ALUEXEC (additional pipeline hazard) -# -# For s_nop IMMEDIATE packet timing (without pending ALUEXEC): -# - N in 7-18 has +4 extra delay -SNOP_ISSUE_OVERHEAD = 3 -SNOP_EXTRA_DELAY_MIN = 7 -SNOP_EXTRA_DELAY_MAX = 18 -SNOP_EXTRA_DELAY_MIN_PENDING = 11 -SNOP_EXTRA_DELAY_MAX_PENDING = 22 +SNOP_EXTRA_DELAY_MIN, SNOP_EXTRA_DELAY_MAX = 7, 18 # extra +4 delay range (no pending) +SNOP_EXTRA_DELAY_MIN_PENDING, SNOP_EXTRA_DELAY_MAX_PENDING = 11, 22 # extra +4 delay range (pending) SNOP_EXTRA_DELAY_CYCLES = 4 - -# Forwarding latencies (cycles until result available for dependent instruction) -VALU_FORWARD_LATENCY = 5 # result available 5 cycles after dispatch (writeback at 6) -TRANS_FORWARD_LATENCY = 13 # result available 13 cycles after dispatch -SALU_FORWARD_LATENCY = 1 # result available 1 cycle after dispatch (writeback at 2) - -# Forwarding depth limit: after this many dependent ops in a chain, latency increases -FORWARD_DEPTH_LIMIT = 4 -FORWARD_DEEP_LATENCY = 9 # latency for deep dependency chains (beyond depth limit) - -# Register cache bypass timeout: s_nop(N) with N >= 4 causes VALU results to go through -# the full register file instead of bypass path, adding +4 cycles to ALUEXEC -REGCACHE_BYPASS_TIMEOUT = 4 +FORWARD_DEPTH_LIMIT = 4 # chain depth where forwarding exhaustion starts +FORWARD_DEEP_LATENCY = 9 # latency for exhausted forwarding +REGCACHE_BYPASS_TIMEOUT = 4 # s_nop(N>=4) triggers bypass penalty REGCACHE_BYPASS_PENALTY = 4 -# Transcendental ops (use TRANS unit) -_TRANS_OPS = {'V_RCP_F32', 'V_RCP_F64', 'V_RSQ_F32', 'V_RSQ_F64', 'V_SQRT_F32', 'V_SQRT_F64', - 'V_LOG_F32', 'V_EXP_F32', 'V_SIN_F32', 'V_COS_F32', 'V_RCP_F16', 'V_RSQ_F16', 'V_SQRT_F16'} - -# Double precision ops (use DP unit) -_DP_OPS = {'V_ADD_F64', 'V_MUL_F64', 'V_FMA_F64', 'V_DIV_F64', 'V_MIN_F64', 'V_MAX_F64', - 'V_LDEXP_F64', 'V_FREXP_MANT_F64', 'V_FREXP_EXP_I32_F64', 'V_FRACT_F64', - 'V_TRUNC_F64', 'V_CEIL_F64', 'V_RNDNE_F64', 'V_FLOOR_F64', 'V_DIV_SCALE_F64', - 'V_DIV_FMAS_F64', 'V_DIV_FIXUP_F64', 'V_CVT_F64_I32', 'V_CVT_F64_U32', - 'V_CVT_I32_F64', 'V_CVT_U32_F64', 'V_CVT_F32_F64', 'V_CVT_F64_F32'} - from extra.assembly.amd.sqtt import WAVESTART, WAVEEND, IMMEDIATE, VALUINST, ALUEXEC, AluSrc class SQTTState: @@ -542,24 +494,12 @@ class SQTTState: if dst_reg is not None: self.vgpr[dst_reg] = (completion_cycle, self.vgpr.get(dst_reg, (0, 0))[1]) - def _get_valu_src_regs(self, inst: Inst) -> list[int]: + def _get_src_vgprs(self, inst: Inst) -> list[int]: """Extract source VGPR indices from a VALU instruction.""" - src_vgprs = [] - if isinstance(inst, VOP1): - if inst.src0 >= 256: src_vgprs.append(inst.src0 - 256) - elif isinstance(inst, VOP2): - if inst.src0 >= 256: src_vgprs.append(inst.src0 - 256) - src_vgprs.append(inst.vsrc1) # vsrc1 is always a VGPR index - elif isinstance(inst, VOP3): - for src in [inst.src0, inst.src1, getattr(inst, 'src2', None)]: - if src is not None and src >= 256: src_vgprs.append(src - 256) - return src_vgprs - - def _get_valu_dst_reg(self, inst: Inst) -> int | None: - """Extract destination VGPR index from a VALU instruction.""" - if isinstance(inst, (VOP1, VOP2, VOP3)): - return inst.vdst - return None + if isinstance(inst, VOP1): return [inst.src0 - 256] if inst.src0 >= 256 else [] + if isinstance(inst, VOP2): return ([inst.src0 - 256] if inst.src0 >= 256 else []) + [inst.vsrc1] + if isinstance(inst, VOP3): return [s - 256 for s in [inst.src0, inst.src1, getattr(inst, 'src2', None)] if s is not None and s >= 256] + return [] def process_instruction(self, inst: Inst): if inst.op == SOPPOp.S_NOP: self._process_snop(inst.simm16) @@ -589,10 +529,10 @@ class SQTTState: def _process_valu(self, inst: Inst): """Process VALU instruction - emit VALUINST and schedule ALUEXEC.""" - dispatch, dst = self.cycle, self._get_valu_dst_reg(inst) + dispatch, dst = self.cycle, inst.vdst # Find critical dependency: source VGPR with latest ready time - deps = [(r, self.vgpr[r]) for r in self._get_valu_src_regs(inst) if r in self.vgpr] + deps = [(r, self.vgpr[r]) for r in self._get_src_vgprs(inst) if r in self.vgpr] src_vgpr, (source_ready, src_depth) = max(deps, key=lambda x: x[1][0]) if deps else (None, (0, 0)) depth = src_depth + 1 if deps else 0