assembly/amd: make pdf.py code shine (#14029)

* assembly/amd: make pdf.py code shine * no merge * pdf2 is the future * something * regen enums * test * work * remove junk * write * pcode extraction * pdf2 passes all tests * simplify * simpler pdf * late filter * remove hacks * simplify pdf2.py * field type * remove defaults * don't export srcenum * simple pdf.py * simpler * cleaner * less hack in PDF
2026-01-09 15:08:02 -05:00 · 2026-01-05 18:49:40 -08:00
parent ea7b149ca5
commit 20653d2996
18 changed files with 4115 additions and 4035 deletions
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -670,6 +670,10 @@ jobs:
          deps: testing_minimal
          amd: 'true'
          python-version: '3.13'
+      - name: Verify AMD autogen is up to date
+        run: |
+          python -m extra.assembly.amd.pdf
+          git diff --exit-code extra/assembly/amd/autogen/
      - name: Install LLVM 21
        run: |
          wget -qO- https://apt.llvm.org/llvm-snapshot.gpg.key | sudo tee /etc/apt/trusted.gpg.d/apt.llvm.org.asc
@@ -690,23 +694,6 @@ jobs:
      - name: Run RDNA3 ops tests
        run: SKIP_SLOW_TEST=1 AMD=1 PYTHON_REMU=1 MOCKGPU=1 AMD_LLVM=0 pytest -n=auto test/test_ops.py -k "test_sparse_categorical_crossentropy or test_tril"

-  testamdautogen:
-    name: AMD autogen
-    runs-on: ubuntu-24.04
-    timeout-minutes: 10
-    steps:
-      - name: Checkout Code
-        uses: actions/checkout@v4
-      - name: Setup Environment
-        uses: ./.github/actions/setup-tinygrad
-        with:
-          key: rdna3-autogen
-          pydeps: "pdfplumber"
-      - name: Verify AMD autogen is up to date
-        run: |
-          python -m extra.assembly.amd.pdf --arch all
-          git diff --exit-code extra/assembly/amd/autogen/
-
  testnvidia:
    strategy:
      fail-fast: false
--- a/extra/assembly/amd/autogen/cdna/enum.py
+++ b/extra/assembly/amd/autogen/cdna/enum.py
@@ -1,46 +1,6 @@
-# autogenerated from AMD CDNA3+CDNA4 ISA PDF by pdf.py - do not edit
+# autogenerated from AMD ISA PDF by pdf.py - do not edit
 from enum import IntEnum

-class SrcEnum(IntEnum):
-  S_ADD_U32 = 0
-  S_SUB_U32 = 1
-  S_ADD_I32 = 2
-  S_SUB_I32 = 3
-  S_ADDC_U32 = 4
-  S_SUBB_U32 = 5
-  S_MIN_I32 = 6
-  FLAT_SCRATCH_LO = 102
-  FLAT_SCRATCH_HI = 103
-  XNACK_MASK_LO = 104
-  XNACK_MASK_HI = 105
-  VCC_LO = 106
-  VCC_HI = 107
-  M0 = 124
-  EXEC_LO = 126
-  EXEC_HI = 127
-  ZERO = 128
-  DPP8 = 233
-  DPP8FI = 234
-  SHARED_BASE = 235
-  SHARED_LIMIT = 236
-  PRIVATE_BASE = 237
-  PRIVATE_LIMIT = 238
-  RESERVED = 239
-  POS_HALF = 240
-  NEG_HALF = 241
-  POS_ONE = 242
-  NEG_ONE = 243
-  POS_TWO = 244
-  NEG_TWO = 245
-  POS_FOUR = 246
-  NEG_FOUR = 247
-  INV_2PI = 248
-  DPP16 = 250
-  VCCZ = 251
-  EXECZ = 252
-  SCC = 253
-  LDS_DIRECT = 254
-
 class DSOp(IntEnum):
  DS_ADD_U32 = 0
  DS_SUB_U32 = 1
@@ -155,12 +115,6 @@ class DSOp(IntEnum):
  DS_READ2ST64_B64 = 120
  DS_ADD_RTN_F64 = 124
  DS_CONDXCHG32_RTN_B64 = 126
-  DS_GWS_SEMA_RELEASE_ALL = 152
-  DS_GWS_INIT = 153
-  DS_GWS_SEMA_V = 154
-  DS_GWS_SEMA_BR = 155
-  DS_GWS_SEMA_P = 156
-  DS_GWS_BARRIER = 157
  DS_READ_ADDTID_B32 = 182
  DS_PK_ADD_RTN_F16 = 183
  DS_PK_ADD_RTN_BF16 = 184
@@ -174,7 +128,6 @@ class DSOp(IntEnum):
  DS_READ_B64_TR_B16 = 227
  DS_READ_B96 = 254
  DS_READ_B128 = 255
-  CDNA4 = 600

 class FLATOp(IntEnum):
  FLAT_LOAD_UBYTE = 16
@@ -231,7 +184,6 @@ class FLATOp(IntEnum):
  FLAT_ATOMIC_XOR_X2 = 106
  FLAT_ATOMIC_INC_X2 = 107
  FLAT_ATOMIC_DEC_X2 = 108
-  CDNA4 = 600

 class GLOBALOp(IntEnum):
  GLOBAL_LOAD_UBYTE = 16
@@ -295,7 +247,6 @@ class GLOBALOp(IntEnum):
  GLOBAL_ATOMIC_DEC_X2 = 108
  GLOBAL_LOAD_LDS_DWORDX4 = 125
  GLOBAL_LOAD_LDS_DWORDX3 = 126
-  CDNA4 = 600

 class MTBUFOp(IntEnum):
  TBUFFER_LOAD_FORMAT_X = 0
@@ -390,7 +341,6 @@ class MUBUFOp(IntEnum):
  BUFFER_ATOMIC_XOR_X2 = 106
  BUFFER_ATOMIC_INC_X2 = 107
  BUFFER_ATOMIC_DEC_X2 = 108
-  CDNA4 = 600

 class SCRATCHOp(IntEnum):
  SCRATCH_LOAD_UBYTE = 16
@@ -504,7 +454,6 @@ class SMEMOp(IntEnum):
  S_ATOMIC_XOR_X2 = 170
  S_ATOMIC_INC_X2 = 171
  S_ATOMIC_DEC_X2 = 172
-  CDNA4 = 600

 class SOP1Op(IntEnum):
  S_MOV_B32 = 0
@@ -561,7 +510,6 @@ class SOP1Op(IntEnum):
  S_ANDN1_WREXEC_B64 = 53
  S_ANDN2_WREXEC_B64 = 54
  S_BITREPLICATE_B64_B32 = 55
-  CDNA4 = 600

 class SOP2Op(IntEnum):
  S_ADD_U32 = 0
@@ -616,7 +564,6 @@ class SOP2Op(IntEnum):
  S_PACK_LL_B32_B16 = 50
  S_PACK_LH_B32_B16 = 51
  S_PACK_HH_B32_B16 = 52
-  CDNA4 = 600

 class SOPCOp(IntEnum):
  S_CMP_EQ_I32 = 0
@@ -639,7 +586,6 @@ class SOPCOp(IntEnum):
  S_SET_GPR_IDX_ON = 17
  S_CMP_EQ_U64 = 18
  S_CMP_LG_U64 = 19
-  CDNA4 = 600

 class SOPKOp(IntEnum):
  S_MOVK_I32 = 0
@@ -695,7 +641,6 @@ class SOPPOp(IntEnum):
  S_ENDPGM_SAVED = 27
  S_SET_GPR_IDX_OFF = 28
  S_SET_GPR_IDX_MODE = 29
-  CDNA4 = 600

 class VOP1Op(IntEnum):
  V_NOP = 0
@@ -783,7 +728,6 @@ class VOP1Op(IntEnum):
  V_PERMLANE16_SWAP_B32 = 89
  V_PERMLANE32_SWAP_B32 = 90
  V_CVT_F32_BF16 = 91
-  CDNA4 = 600

 class VOP2Op(IntEnum):
  V_CNDMASK_B32 = 0
@@ -848,7 +792,6 @@ class VOP2Op(IntEnum):
  V_FMAC_F32 = 59
  V_PK_FMAC_F16 = 60
  V_XNOR_B32 = 61
-  CDNA4 = 600

 class VOP3AOp(IntEnum):
  V_CMP_CLASS_F32 = 16
@@ -1268,7 +1211,7 @@ class VOP3AOp(IntEnum):
  V_CVT_SCALEF32_SR_PK32_BF6_F32 = 597
  V_CVT_SCALEF32_PK32_F32_FP6 = 598
  V_CVT_SCALEF32_PK32_F32_BF6 = 599
-  CDNA4 = 600
+  V_CVT_SCALEF32_PK32_FP6_F16 = 600
  V_CVT_SCALEF32_PK32_FP6_BF16 = 601
  V_CVT_SCALEF32_PK32_BF6_F16 = 602
  V_CVT_SCALEF32_PK32_BF6_BF16 = 603
@@ -1338,7 +1281,6 @@ class VOP3BOp(IntEnum):
  V_DIV_SCALE_F64 = 481
  V_MAD_U64_U32 = 488
  V_MAD_I64_I32 = 489
-  CDNA4 = 600

 class VOP3POp(IntEnum):
  V_PK_MAD_I16 = 0
@@ -1388,8 +1330,6 @@ class VOP3POp(IntEnum):
  V_SMFMAC_F32_16X16X128_BF8_BF8 = 59
  V_SMFMAC_F32_16X16X128_BF8_FP8 = 60
  V_SMFMAC_F32_16X16X128_FP8_BF8 = 61
-  V_MFMA_F32_16X16X8_XF32 = 62
-  V_MFMA_F32_32X32X4_XF32 = 63
  V_MFMA_F32_32X32X1_2B_F32 = 64
  V_MFMA_F32_16X16X1_4B_F32 = 65
  V_MFMA_F32_4X4X1_16B_F32 = 66
@@ -1447,7 +1387,6 @@ class VOP3POp(IntEnum):
  V_SMFMAC_F32_32X32X32_BF8_FP8 = 125
  V_SMFMAC_F32_32X32X32_FP8_BF8 = 126
  V_SMFMAC_F32_32X32X32_FP8_FP8 = 127
-  CDNA4 = 600

 class VOPCOp(IntEnum):
  V_CMP_CLASS_F32 = 16
@@ -1648,4 +1587,3 @@ class VOPCOp(IntEnum):
  V_CMPX_NE_U64 = 253
  V_CMPX_GE_U64 = 254
  V_CMPX_T_U64 = 255
-  CDNA4 = 600
--- a/extra/assembly/amd/autogen/cdna/ins.py
+++ b/extra/assembly/amd/autogen/cdna/ins.py
@@ -1,19 +1,18 @@
-# autogenerated from AMD CDNA3+CDNA4 ISA PDF by pdf.py - do not edit
+# autogenerated from AMD ISA PDF by pdf.py - do not edit
 # ruff: noqa: F401,F403
 from typing import Annotated
-from extra.assembly.amd.dsl import bits, BitField, Inst32, Inst64, Inst96, SGPR, VGPR, TTMP as TTMP, s as s, v as v, ttmp as ttmp, SSrc, Src, SImm, Imm, VDSTYEnc, SGPRField, VGPRField
+from extra.assembly.amd.dsl import *
 from extra.assembly.amd.autogen.cdna.enum import *
 import functools

-# instruction formats
-class DPP(Inst64):
+class DPP(Inst):
  encoding = bits[8:0] == 0b11111010
-  vop_op = bits[16:9]
  vdst:VGPRField = bits[24:17]
-  vop2_op = bits[31:25]
  src0:Src = bits[39:32]
+  vop_op = bits[16:9]
+  vop2_op = bits[31:25]
  dpp_ctrl = bits[48:40]
-  bound_ctrl = bits[51]
+  bc = bits[51]
  src0_neg = bits[52]
  src0_abs = bits[53]
  src1_neg = bits[54]
@@ -21,7 +20,7 @@ class DPP(Inst64):
  bank_mask = bits[59:56]
  row_mask = bits[63:60]

-class DS(Inst64):
+class DS(Inst):
  encoding = bits[31:26] == 0b110110
  op:Annotated[BitField, DSOp] = bits[24:17]
  vdst:VGPRField = bits[63:56]
@@ -33,7 +32,7 @@ class DS(Inst64):
  gds = bits[16]
  acc = bits[25]

-class FLAT(Inst64):
+class FLAT(Inst):
  encoding = bits[31:26] == 0b110111
  op:Annotated[BitField, FLATOp] = bits[24:18]
  vdst:VGPRField = bits[63:56]
@@ -48,7 +47,7 @@ class FLAT(Inst64):
  sc1 = bits[25]
  acc = bits[55]

-class MTBUF(Inst64):
+class MTBUF(Inst):
  encoding = bits[31:26] == 0b111010
  op:Annotated[BitField, MTBUFOp] = bits[18:15]
  vdata:VGPRField = bits[47:40]
@@ -56,15 +55,16 @@ class MTBUF(Inst64):
  srsrc:SGPRField = bits[52:48]
  soffset:SSrc = bits[63:56]
  offset:Imm = bits[11:0]
-  format = bits[25:19]
  offen = bits[12]
  idxen = bits[13]
+  sc0 = bits[14]
+  dfmt = bits[22:19]
+  nfmt = bits[25:23]
  sc1 = bits[53]
  nt = bits[54]
  acc = bits[55]
-  sc0 = bits[14]

-class MUBUF(Inst64):
+class MUBUF(Inst):
  encoding = bits[31:26] == 0b111000
  op:Annotated[BitField, MUBUFOp] = bits[24:18]
  vdata:VGPRField = bits[47:40]
@@ -80,16 +80,16 @@ class MUBUF(Inst64):
  nt = bits[17]
  acc = bits[55]

-class SDWA(Inst64):
+class SDWA(Inst):
  encoding = bits[8:0] == 0b11111001
-  vop_op = bits[16:9]
  vdst:VGPRField = bits[24:17]
+  src0:Src = bits[39:32]
+  omod = bits[47:46]
+  clmp = bits[45]
+  vop_op = bits[16:9]
  vop2_op = bits[31:25]
-  src0:Src = bits[39:32]
  dst_sel = bits[42:40]
  dst_u = bits[44:43]
-  clmp = bits[45]
-  omod = bits[47:46]
  src0_sel = bits[50:48]
  src0_sext = bits[51]
  src0_neg = bits[52]
@@ -101,12 +101,10 @@ class SDWA(Inst64):
  src1_abs = bits[61]
  s1 = bits[63]

-class SDWAB(Inst64):
+class SDWAB(Inst):
+  sdst:SGPRField = bits[46:40]
  src0:Src = bits[39:32]
-  dst_sel = bits[42:40]
-  dst_u = bits[44:43]
-  clmp = bits[45]
-  omod = bits[47:46]
+  sd = bits[47]
  src0_sel = bits[50:48]
  src0_sext = bits[51]
  src0_neg = bits[52]
@@ -118,7 +116,7 @@ class SDWAB(Inst64):
  src1_abs = bits[61]
  s1 = bits[63]

-class SMEM(Inst64):
+class SMEM(Inst):
  encoding = bits[31:26] == 0b110000
  op:Annotated[BitField, SMEMOp] = bits[25:18]
  sdata:SGPRField = bits[12:6]
@@ -128,79 +126,78 @@ class SMEM(Inst64):
  glc = bits[16]
  soe = bits[14]
  nv = bits[15]
-  imm = bits[17]
+  imm:Imm = bits[17]

-class SOP1(Inst32):
+class SOP1(Inst):
  encoding = bits[31:23] == 0b101111101
  op:Annotated[BitField, SOP1Op] = bits[15:8]
  sdst:SGPRField = bits[22:16]
  ssrc0:SSrc = bits[7:0]

-class SOP2(Inst32):
+class SOP2(Inst):
  encoding = bits[31:30] == 0b10
  op:Annotated[BitField, SOP2Op] = bits[29:23]
  sdst:SGPRField = bits[22:16]
  ssrc0:SSrc = bits[7:0]
  ssrc1:SSrc = bits[15:8]

-class SOPC(Inst32):
+class SOPC(Inst):
  encoding = bits[31:23] == 0b101111110
  op:Annotated[BitField, SOPCOp] = bits[22:16]
  ssrc0:SSrc = bits[7:0]
  ssrc1:SSrc = bits[15:8]

-class SOPK(Inst32):
+class SOPK(Inst):
  encoding = bits[31:28] == 0b1011
  op:Annotated[BitField, SOPKOp] = bits[27:23]
  sdst:SGPRField = bits[22:16]
  simm16:SImm = bits[15:0]

-class SOPP(Inst32):
+class SOPP(Inst):
  encoding = bits[31:23] == 0b101111111
  op:Annotated[BitField, SOPPOp] = bits[22:16]
  simm16:SImm = bits[15:0]

-class VOP1(Inst32):
-  encoding = bits[31:25] == 0b111111
+class VOP1(Inst):
+  encoding = bits[31:25] == 0b0111111
  op:Annotated[BitField, VOP1Op] = bits[16:9]
  vdst:VGPRField = bits[24:17]
  src0:Src = bits[8:0]

-class VOP2(Inst32):
-  encoding = bits[31] == 0
+class VOP2(Inst):
+  encoding = bits[31] == 0b0
  op:Annotated[BitField, VOP2Op] = bits[30:25]
  vdst:VGPRField = bits[24:17]
  src0:Src = bits[8:0]
  vsrc1:VGPRField = bits[16:9]

-class VOP3A(Inst64):
+class VOP3A(Inst):
  encoding = bits[31:26] == 0b110100
-  vdst:VGPRField = bits[7:0]
-  abs = bits[10:8]
-  opsel = bits[14:11]
-  clmp = bits[15]
  op:Annotated[BitField, VOP3AOp] = bits[25:16]
+  vdst:VGPRField = bits[7:0]
  src0:Src = bits[40:32]
  src1:Src = bits[49:41]
  src2:Src = bits[58:50]
  omod = bits[60:59]
  neg = bits[63:61]
+  abs = bits[10:8]
+  clmp = bits[15]
+  opsel = bits[14:11]

-class VOP3B(Inst64):
+class VOP3B(Inst):
  encoding = bits[31:26] == 0b110100
+  op:Annotated[BitField, VOP3BOp] = bits[25:16]
  vdst:VGPRField = bits[7:0]
  sdst:SGPRField = bits[14:8]
-  clmp = bits[15]
-  op:Annotated[BitField, VOP3BOp] = bits[25:16]
  src0:Src = bits[40:32]
  src1:Src = bits[49:41]
  src2:Src = bits[58:50]
  omod = bits[60:59]
  neg = bits[63:61]
+  clmp = bits[15]

-class VOP3P(Inst64):
+class VOP3P(Inst):
  encoding = bits[31:23] == 0b110100111
-  _defaults = {'opsel_hi': 3, 'opsel_hi2': 1}
  op:Annotated[BitField, VOP3POp] = bits[22:16]
  vdst:VGPRField = bits[7:0]
  src0:Src = bits[40:32]
@@ -208,13 +205,13 @@ class VOP3P(Inst64):
  src2:Src = bits[58:50]
  neg = bits[63:61]
  neg_hi = bits[10:8]
+  clmp = bits[15]
  opsel = bits[13:11]
  opsel_hi = bits[60:59]
-  clmp = bits[15]
  opsel_hi2 = bits[14]

-class VOPC(Inst32):
-  encoding = bits[31:25] == 0b111110
+class VOPC(Inst):
+  encoding = bits[31:25] == 0b0111110
  op:Annotated[BitField, VOPCOp] = bits[24:17]
  src0:Src = bits[8:0]
  vsrc1:VGPRField = bits[16:9]
@@ -333,12 +330,6 @@ ds_read2_b64 = functools.partial(DS, DSOp.DS_READ2_B64)
 ds_read2st64_b64 = functools.partial(DS, DSOp.DS_READ2ST64_B64)
 ds_add_rtn_f64 = functools.partial(DS, DSOp.DS_ADD_RTN_F64)
 ds_condxchg32_rtn_b64 = functools.partial(DS, DSOp.DS_CONDXCHG32_RTN_B64)
-ds_gws_sema_release_all = functools.partial(DS, DSOp.DS_GWS_SEMA_RELEASE_ALL)
-ds_gws_init = functools.partial(DS, DSOp.DS_GWS_INIT)
-ds_gws_sema_v = functools.partial(DS, DSOp.DS_GWS_SEMA_V)
-ds_gws_sema_br = functools.partial(DS, DSOp.DS_GWS_SEMA_BR)
-ds_gws_sema_p = functools.partial(DS, DSOp.DS_GWS_SEMA_P)
-ds_gws_barrier = functools.partial(DS, DSOp.DS_GWS_BARRIER)
 ds_read_addtid_b32 = functools.partial(DS, DSOp.DS_READ_ADDTID_B32)
 ds_pk_add_rtn_f16 = functools.partial(DS, DSOp.DS_PK_ADD_RTN_F16)
 ds_pk_add_rtn_bf16 = functools.partial(DS, DSOp.DS_PK_ADD_RTN_BF16)
@@ -352,7 +343,6 @@ ds_read_b64_tr_b8 = functools.partial(DS, DSOp.DS_READ_B64_TR_B8)
 ds_read_b64_tr_b16 = functools.partial(DS, DSOp.DS_READ_B64_TR_B16)
 ds_read_b96 = functools.partial(DS, DSOp.DS_READ_B96)
 ds_read_b128 = functools.partial(DS, DSOp.DS_READ_B128)
-cdna4 = functools.partial(DS, DSOp.CDNA4)
 flat_load_ubyte = functools.partial(FLAT, FLATOp.FLAT_LOAD_UBYTE)
 flat_load_sbyte = functools.partial(FLAT, FLATOp.FLAT_LOAD_SBYTE)
 flat_load_ushort = functools.partial(FLAT, FLATOp.FLAT_LOAD_USHORT)
@@ -407,7 +397,6 @@ flat_atomic_or_x2 = functools.partial(FLAT, FLATOp.FLAT_ATOMIC_OR_X2)
 flat_atomic_xor_x2 = functools.partial(FLAT, FLATOp.FLAT_ATOMIC_XOR_X2)
 flat_atomic_inc_x2 = functools.partial(FLAT, FLATOp.FLAT_ATOMIC_INC_X2)
 flat_atomic_dec_x2 = functools.partial(FLAT, FLATOp.FLAT_ATOMIC_DEC_X2)
-cdna4 = functools.partial(FLAT, FLATOp.CDNA4)
 global_load_ubyte = functools.partial(FLAT, GLOBALOp.GLOBAL_LOAD_UBYTE, seg=2)
 global_load_sbyte = functools.partial(FLAT, GLOBALOp.GLOBAL_LOAD_SBYTE, seg=2)
 global_load_ushort = functools.partial(FLAT, GLOBALOp.GLOBAL_LOAD_USHORT, seg=2)
@@ -469,7 +458,6 @@ global_atomic_inc_x2 = functools.partial(FLAT, GLOBALOp.GLOBAL_ATOMIC_INC_X2, se
 global_atomic_dec_x2 = functools.partial(FLAT, GLOBALOp.GLOBAL_ATOMIC_DEC_X2, seg=2)
 global_load_lds_dwordx4 = functools.partial(FLAT, GLOBALOp.GLOBAL_LOAD_LDS_DWORDX4, seg=2)
 global_load_lds_dwordx3 = functools.partial(FLAT, GLOBALOp.GLOBAL_LOAD_LDS_DWORDX3, seg=2)
-cdna4 = functools.partial(FLAT, GLOBALOp.CDNA4, seg=2)
 tbuffer_load_format_x = functools.partial(MTBUF, MTBUFOp.TBUFFER_LOAD_FORMAT_X)
 tbuffer_load_format_xy = functools.partial(MTBUF, MTBUFOp.TBUFFER_LOAD_FORMAT_XY)
 tbuffer_load_format_xyz = functools.partial(MTBUF, MTBUFOp.TBUFFER_LOAD_FORMAT_XYZ)
@@ -560,7 +548,6 @@ buffer_atomic_or_x2 = functools.partial(MUBUF, MUBUFOp.BUFFER_ATOMIC_OR_X2)
 buffer_atomic_xor_x2 = functools.partial(MUBUF, MUBUFOp.BUFFER_ATOMIC_XOR_X2)
 buffer_atomic_inc_x2 = functools.partial(MUBUF, MUBUFOp.BUFFER_ATOMIC_INC_X2)
 buffer_atomic_dec_x2 = functools.partial(MUBUF, MUBUFOp.BUFFER_ATOMIC_DEC_X2)
-cdna4 = functools.partial(MUBUF, MUBUFOp.CDNA4)
 scratch_load_ubyte = functools.partial(FLAT, SCRATCHOp.SCRATCH_LOAD_UBYTE, seg=1)
 scratch_load_sbyte = functools.partial(FLAT, SCRATCHOp.SCRATCH_LOAD_SBYTE, seg=1)
 scratch_load_ushort = functools.partial(FLAT, SCRATCHOp.SCRATCH_LOAD_USHORT, seg=1)
@@ -670,7 +657,6 @@ s_atomic_or_x2 = functools.partial(SMEM, SMEMOp.S_ATOMIC_OR_X2)
 s_atomic_xor_x2 = functools.partial(SMEM, SMEMOp.S_ATOMIC_XOR_X2)
 s_atomic_inc_x2 = functools.partial(SMEM, SMEMOp.S_ATOMIC_INC_X2)
 s_atomic_dec_x2 = functools.partial(SMEM, SMEMOp.S_ATOMIC_DEC_X2)
-cdna4 = functools.partial(SMEM, SMEMOp.CDNA4)
 s_mov_b32 = functools.partial(SOP1, SOP1Op.S_MOV_B32)
 s_mov_b64 = functools.partial(SOP1, SOP1Op.S_MOV_B64)
 s_cmov_b32 = functools.partial(SOP1, SOP1Op.S_CMOV_B32)
@@ -725,7 +711,6 @@ s_orn1_saveexec_b64 = functools.partial(SOP1, SOP1Op.S_ORN1_SAVEEXEC_B64)
 s_andn1_wrexec_b64 = functools.partial(SOP1, SOP1Op.S_ANDN1_WREXEC_B64)
 s_andn2_wrexec_b64 = functools.partial(SOP1, SOP1Op.S_ANDN2_WREXEC_B64)
 s_bitreplicate_b64_b32 = functools.partial(SOP1, SOP1Op.S_BITREPLICATE_B64_B32)
-cdna4 = functools.partial(SOP1, SOP1Op.CDNA4)
 s_add_u32 = functools.partial(SOP2, SOP2Op.S_ADD_U32)
 s_sub_u32 = functools.partial(SOP2, SOP2Op.S_SUB_U32)
 s_add_i32 = functools.partial(SOP2, SOP2Op.S_ADD_I32)
@@ -778,7 +763,6 @@ s_lshl4_add_u32 = functools.partial(SOP2, SOP2Op.S_LSHL4_ADD_U32)
 s_pack_ll_b32_b16 = functools.partial(SOP2, SOP2Op.S_PACK_LL_B32_B16)
 s_pack_lh_b32_b16 = functools.partial(SOP2, SOP2Op.S_PACK_LH_B32_B16)
 s_pack_hh_b32_b16 = functools.partial(SOP2, SOP2Op.S_PACK_HH_B32_B16)
-cdna4 = functools.partial(SOP2, SOP2Op.CDNA4)
 s_cmp_eq_i32 = functools.partial(SOPC, SOPCOp.S_CMP_EQ_I32)
 s_cmp_lg_i32 = functools.partial(SOPC, SOPCOp.S_CMP_LG_I32)
 s_cmp_gt_i32 = functools.partial(SOPC, SOPCOp.S_CMP_GT_I32)
@@ -799,7 +783,6 @@ s_setvskip = functools.partial(SOPC, SOPCOp.S_SETVSKIP)
 s_set_gpr_idx_on = functools.partial(SOPC, SOPCOp.S_SET_GPR_IDX_ON)
 s_cmp_eq_u64 = functools.partial(SOPC, SOPCOp.S_CMP_EQ_U64)
 s_cmp_lg_u64 = functools.partial(SOPC, SOPCOp.S_CMP_LG_U64)
-cdna4 = functools.partial(SOPC, SOPCOp.CDNA4)
 s_movk_i32 = functools.partial(SOPK, SOPKOp.S_MOVK_I32)
 s_cmovk_i32 = functools.partial(SOPK, SOPKOp.S_CMOVK_I32)
 s_cmpk_eq_i32 = functools.partial(SOPK, SOPKOp.S_CMPK_EQ_I32)
@@ -851,7 +834,6 @@ s_cbranch_cdbgsys_and_user = functools.partial(SOPP, SOPPOp.S_CBRANCH_CDBGSYS_AN
 s_endpgm_saved = functools.partial(SOPP, SOPPOp.S_ENDPGM_SAVED)
 s_set_gpr_idx_off = functools.partial(SOPP, SOPPOp.S_SET_GPR_IDX_OFF)
 s_set_gpr_idx_mode = functools.partial(SOPP, SOPPOp.S_SET_GPR_IDX_MODE)
-cdna4 = functools.partial(SOPP, SOPPOp.CDNA4)
 v_nop_e32 = functools.partial(VOP1, VOP1Op.V_NOP)
 v_mov_b32_e32 = functools.partial(VOP1, VOP1Op.V_MOV_B32)
 v_readfirstlane_b32_e32 = functools.partial(VOP1, VOP1Op.V_READFIRSTLANE_B32)
@@ -937,7 +919,6 @@ v_prng_b32_e32 = functools.partial(VOP1, VOP1Op.V_PRNG_B32)
 v_permlane16_swap_b32_e32 = functools.partial(VOP1, VOP1Op.V_PERMLANE16_SWAP_B32)
 v_permlane32_swap_b32_e32 = functools.partial(VOP1, VOP1Op.V_PERMLANE32_SWAP_B32)
 v_cvt_f32_bf16_e32 = functools.partial(VOP1, VOP1Op.V_CVT_F32_BF16)
-cdna4_e32 = functools.partial(VOP1, VOP1Op.CDNA4)
 v_cndmask_b32_e32 = functools.partial(VOP2, VOP2Op.V_CNDMASK_B32)
 v_add_f32_e32 = functools.partial(VOP2, VOP2Op.V_ADD_F32)
 v_sub_f32_e32 = functools.partial(VOP2, VOP2Op.V_SUB_F32)
@@ -961,8 +942,8 @@ v_and_b32_e32 = functools.partial(VOP2, VOP2Op.V_AND_B32)
 v_or_b32_e32 = functools.partial(VOP2, VOP2Op.V_OR_B32)
 v_xor_b32_e32 = functools.partial(VOP2, VOP2Op.V_XOR_B32)
 v_dot2c_f32_bf16_e32 = functools.partial(VOP2, VOP2Op.V_DOT2C_F32_BF16)
-def v_fmamk_f32_e32(vdst, src0, K, vsrc1): return VOP2(VOP2Op.V_FMAMK_F32, vdst, src0, vsrc1, literal=K)
-def v_fmaak_f32_e32(vdst, src0, vsrc1, K): return VOP2(VOP2Op.V_FMAAK_F32, vdst, src0, vsrc1, literal=K)
+v_fmamk_f32_e32 = functools.partial(VOP2, VOP2Op.V_FMAMK_F32)
+v_fmaak_f32_e32 = functools.partial(VOP2, VOP2Op.V_FMAAK_F32)
 v_add_co_u32_e32 = functools.partial(VOP2, VOP2Op.V_ADD_CO_U32)
 v_sub_co_u32_e32 = functools.partial(VOP2, VOP2Op.V_SUB_CO_U32)
 v_subrev_co_u32_e32 = functools.partial(VOP2, VOP2Op.V_SUBREV_CO_U32)
@@ -1000,7 +981,6 @@ v_dot8c_i32_i4_e32 = functools.partial(VOP2, VOP2Op.V_DOT8C_I32_I4)
 v_fmac_f32_e32 = functools.partial(VOP2, VOP2Op.V_FMAC_F32)
 v_pk_fmac_f16_e32 = functools.partial(VOP2, VOP2Op.V_PK_FMAC_F16)
 v_xnor_b32_e32 = functools.partial(VOP2, VOP2Op.V_XNOR_B32)
-cdna4_e32 = functools.partial(VOP2, VOP2Op.CDNA4)
 v_cmp_class_f32 = functools.partial(VOP3A, VOP3AOp.V_CMP_CLASS_F32)
 v_cmpx_class_f32 = functools.partial(VOP3A, VOP3AOp.V_CMPX_CLASS_F32)
 v_cmp_class_f64 = functools.partial(VOP3A, VOP3AOp.V_CMP_CLASS_F64)
@@ -1418,7 +1398,7 @@ v_cvt_scalef32_sr_pk32_fp6_f32 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32
 v_cvt_scalef32_sr_pk32_bf6_f32 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_SR_PK32_BF6_F32)
 v_cvt_scalef32_pk32_f32_fp6 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_PK32_F32_FP6)
 v_cvt_scalef32_pk32_f32_bf6 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_PK32_F32_BF6)
-cdna4 = functools.partial(VOP3A, VOP3AOp.CDNA4)
+v_cvt_scalef32_pk32_fp6_f16 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_PK32_FP6_F16)
 v_cvt_scalef32_pk32_fp6_bf16 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_PK32_FP6_BF16)
 v_cvt_scalef32_pk32_bf6_f16 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_PK32_BF6_F16)
 v_cvt_scalef32_pk32_bf6_bf16 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_PK32_BF6_BF16)
@@ -1486,7 +1466,6 @@ v_div_scale_f32 = functools.partial(VOP3B, VOP3BOp.V_DIV_SCALE_F32)
 v_div_scale_f64 = functools.partial(VOP3B, VOP3BOp.V_DIV_SCALE_F64)
 v_mad_u64_u32 = functools.partial(VOP3B, VOP3BOp.V_MAD_U64_U32)
 v_mad_i64_i32 = functools.partial(VOP3B, VOP3BOp.V_MAD_I64_I32)
-cdna4 = functools.partial(VOP3B, VOP3BOp.CDNA4)
 v_pk_mad_i16 = functools.partial(VOP3P, VOP3POp.V_PK_MAD_I16)
 v_pk_mul_lo_u16 = functools.partial(VOP3P, VOP3POp.V_PK_MUL_LO_U16)
 v_pk_add_i16 = functools.partial(VOP3P, VOP3POp.V_PK_ADD_I16)
@@ -1534,8 +1513,6 @@ v_smfmac_i32_16x16x128_i8 = functools.partial(VOP3P, VOP3POp.V_SMFMAC_I32_16X16X
 v_smfmac_f32_16x16x128_bf8_bf8 = functools.partial(VOP3P, VOP3POp.V_SMFMAC_F32_16X16X128_BF8_BF8)
 v_smfmac_f32_16x16x128_bf8_fp8 = functools.partial(VOP3P, VOP3POp.V_SMFMAC_F32_16X16X128_BF8_FP8)
 v_smfmac_f32_16x16x128_fp8_bf8 = functools.partial(VOP3P, VOP3POp.V_SMFMAC_F32_16X16X128_FP8_BF8)
-v_mfma_f32_16x16x8_xf32 = functools.partial(VOP3P, VOP3POp.V_MFMA_F32_16X16X8_XF32)
-v_mfma_f32_32x32x4_xf32 = functools.partial(VOP3P, VOP3POp.V_MFMA_F32_32X32X4_XF32)
 v_mfma_f32_32x32x1_2b_f32 = functools.partial(VOP3P, VOP3POp.V_MFMA_F32_32X32X1_2B_F32)
 v_mfma_f32_16x16x1_4b_f32 = functools.partial(VOP3P, VOP3POp.V_MFMA_F32_16X16X1_4B_F32)
 v_mfma_f32_4x4x1_16b_f32 = functools.partial(VOP3P, VOP3POp.V_MFMA_F32_4X4X1_16B_F32)
@@ -1593,7 +1570,6 @@ v_smfmac_f32_32x32x32_bf8_bf8 = functools.partial(VOP3P, VOP3POp.V_SMFMAC_F32_32
 v_smfmac_f32_32x32x32_bf8_fp8 = functools.partial(VOP3P, VOP3POp.V_SMFMAC_F32_32X32X32_BF8_FP8)
 v_smfmac_f32_32x32x32_fp8_bf8 = functools.partial(VOP3P, VOP3POp.V_SMFMAC_F32_32X32X32_FP8_BF8)
 v_smfmac_f32_32x32x32_fp8_fp8 = functools.partial(VOP3P, VOP3POp.V_SMFMAC_F32_32X32X32_FP8_FP8)
-cdna4 = functools.partial(VOP3P, VOP3POp.CDNA4)
 v_cmp_class_f32_e32 = functools.partial(VOPC, VOPCOp.V_CMP_CLASS_F32)
 v_cmpx_class_f32_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_CLASS_F32)
 v_cmp_class_f64_e32 = functools.partial(VOPC, VOPCOp.V_CMP_CLASS_F64)
@@ -1791,42 +1767,4 @@ v_cmpx_le_u64_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_LE_U64)
 v_cmpx_gt_u64_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_GT_U64)
 v_cmpx_ne_u64_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_NE_U64)
 v_cmpx_ge_u64_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_GE_U64)
-v_cmpx_t_u64_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_T_U64)
-cdna4_e32 = functools.partial(VOPC, VOPCOp.CDNA4)
-
-S_ADD_U32 = SrcEnum.S_ADD_U32
-S_SUB_U32 = SrcEnum.S_SUB_U32
-S_ADD_I32 = SrcEnum.S_ADD_I32
-S_SUB_I32 = SrcEnum.S_SUB_I32
-S_ADDC_U32 = SrcEnum.S_ADDC_U32
-S_SUBB_U32 = SrcEnum.S_SUBB_U32
-S_MIN_I32 = SrcEnum.S_MIN_I32
-FLAT_SCRATCH_LO = SrcEnum.FLAT_SCRATCH_LO
-FLAT_SCRATCH_HI = SrcEnum.FLAT_SCRATCH_HI
-XNACK_MASK_LO = SrcEnum.XNACK_MASK_LO
-XNACK_MASK_HI = SrcEnum.XNACK_MASK_HI
-VCC_LO = SrcEnum.VCC_LO
-VCC_HI = SrcEnum.VCC_HI
-M0 = SrcEnum.M0
-EXEC_LO = SrcEnum.EXEC_LO
-EXEC_HI = SrcEnum.EXEC_HI
-ZERO = SrcEnum.ZERO
-DPP8FI = SrcEnum.DPP8FI
-SHARED_BASE = SrcEnum.SHARED_BASE
-SHARED_LIMIT = SrcEnum.SHARED_LIMIT
-PRIVATE_BASE = SrcEnum.PRIVATE_BASE
-PRIVATE_LIMIT = SrcEnum.PRIVATE_LIMIT
-RESERVED = SrcEnum.RESERVED
-POS_HALF = SrcEnum.POS_HALF
-NEG_HALF = SrcEnum.NEG_HALF
-POS_ONE = SrcEnum.POS_ONE
-NEG_ONE = SrcEnum.NEG_ONE
-POS_TWO = SrcEnum.POS_TWO
-NEG_TWO = SrcEnum.NEG_TWO
-POS_FOUR = SrcEnum.POS_FOUR
-NEG_FOUR = SrcEnum.NEG_FOUR
-INV_2PI = SrcEnum.INV_2PI
-VCCZ = SrcEnum.VCCZ
-EXECZ = SrcEnum.EXECZ
-SCC = SrcEnum.SCC
-LDS_DIRECT = SrcEnum.LDS_DIRECT
+v_cmpx_t_u64_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_T_U64)
--- a/extra/assembly/amd/autogen/cdna/str_pcode.py
+++ b/extra/assembly/amd/autogen/cdna/str_pcode.py
--- a/extra/assembly/amd/autogen/rdna3/enum.py
+++ b/extra/assembly/amd/autogen/rdna3/enum.py
@@ -1,34 +1,97 @@
-# autogenerated from AMD RDNA3.5 ISA PDF by pdf.py - do not edit
+# autogenerated from AMD ISA PDF by pdf.py - do not edit
 from enum import IntEnum

-class SrcEnum(IntEnum):
-  VCC_LO = 106
-  VCC_HI = 107
-  NULL = 124
-  M0 = 125
-  EXEC_LO = 126
-  EXEC_HI = 127
-  ZERO = 128
-  DPP8 = 233
-  DPP8FI = 234
-  SHARED_BASE = 235
-  SHARED_LIMIT = 236
-  PRIVATE_BASE = 237
-  PRIVATE_LIMIT = 238
-  POS_HALF = 240
-  NEG_HALF = 241
-  POS_ONE = 242
-  NEG_ONE = 243
-  POS_TWO = 244
-  NEG_TWO = 245
-  POS_FOUR = 246
-  NEG_FOUR = 247
-  INV_2PI = 248
-  DPP16 = 250
-  VCCZ = 251
-  EXECZ = 252
-  SCC = 253
-  LDS_DIRECT = 254
+class BufFmt(IntEnum):
+  BUF_FMT_8_UNORM = 1
+  BUF_FMT_8_SNORM = 2
+  BUF_FMT_8_USCALED = 3
+  BUF_FMT_8_SSCALED = 4
+  BUF_FMT_8_UINT = 5
+  BUF_FMT_8_SINT = 6
+  BUF_FMT_16_UNORM = 7
+  BUF_FMT_16_SNORM = 8
+  BUF_FMT_16_USCALED = 9
+  BUF_FMT_16_SSCALED = 10
+  BUF_FMT_16_UINT = 11
+  BUF_FMT_16_SINT = 12
+  BUF_FMT_16_FLOAT = 13
+  BUF_FMT_8_8_UNORM = 14
+  BUF_FMT_8_8_SNORM = 15
+  BUF_FMT_8_8_USCALED = 16
+  BUF_FMT_8_8_SSCALED = 17
+  BUF_FMT_8_8_UINT = 18
+  BUF_FMT_8_8_SINT = 19
+  BUF_FMT_32_UINT = 20
+  BUF_FMT_32_SINT = 21
+  BUF_FMT_32_FLOAT = 22
+  BUF_FMT_16_16_UNORM = 23
+  BUF_FMT_16_16_SNORM = 24
+  BUF_FMT_16_16_USCALED = 25
+  BUF_FMT_16_16_SSCALED = 26
+  BUF_FMT_16_16_UINT = 27
+  BUF_FMT_16_16_SINT = 28
+  BUF_FMT_16_16_FLOAT = 29
+  BUF_FMT_10_11_11_FLOAT = 30
+  BUF_FMT_11_11_10_FLOAT = 31
+  BUF_FMT_10_10_10_2_UNORM = 32
+  BUF_FMT_10_10_10_2_SNORM = 33
+  BUF_FMT_10_10_10_2_UINT = 34
+  BUF_FMT_10_10_10_2_SINT = 35
+  BUF_FMT_2_10_10_10_UNORM = 36
+  BUF_FMT_2_10_10_10_SNORM = 37
+  BUF_FMT_2_10_10_10_USCALED = 38
+  BUF_FMT_2_10_10_10_SSCALED = 39
+  BUF_FMT_2_10_10_10_UINT = 40
+  BUF_FMT_2_10_10_10_SINT = 41
+  BUF_FMT_8_8_8_8_UNORM = 42
+  BUF_FMT_8_8_8_8_SNORM = 43
+  BUF_FMT_8_8_8_8_USCALED = 44
+  BUF_FMT_8_8_8_8_SSCALED = 45
+  BUF_FMT_8_8_8_8_UINT = 46
+  BUF_FMT_8_8_8_8_SINT = 47
+  BUF_FMT_32_32_UINT = 48
+  BUF_FMT_32_32_SINT = 49
+  BUF_FMT_32_32_FLOAT = 50
+  BUF_FMT_16_16_16_16_UNORM = 51
+  BUF_FMT_16_16_16_16_SNORM = 52
+  BUF_FMT_16_16_16_16_USCALED = 53
+  BUF_FMT_16_16_16_16_SSCALED = 54
+  BUF_FMT_16_16_16_16_UINT = 55
+  BUF_FMT_16_16_16_16_SINT = 56
+  BUF_FMT_16_16_16_16_FLOAT = 57
+  BUF_FMT_32_32_32_UINT = 58
+  BUF_FMT_32_32_32_SINT = 59
+  BUF_FMT_32_32_32_FLOAT = 60
+  BUF_FMT_32_32_32_32_UINT = 61
+  BUF_FMT_8_SRGB = 64
+  BUF_FMT_8_8_SRGB = 65
+  BUF_FMT_8_8_8_8_SRGB = 66
+  BUF_FMT_5_9_9_9_FLOAT = 67
+  BUF_FMT_5_6_5_UNORM = 68
+  BUF_FMT_1_5_5_5_UNORM = 69
+  BUF_FMT_5_5_5_1_UNORM = 70
+  BUF_FMT_4_4_4_4_UNORM = 71
+  BUF_FMT_4_4_UNORM = 72
+  BUF_FMT_1_UNORM = 73
+  BUF_FMT_1_REVERSED_UNORM = 74
+  BUF_FMT_32_FLOAT_CLAMP = 75
+  BUF_FMT_8_24_UNORM = 76
+  BUF_FMT_8_24_UINT = 77
+  BUF_FMT_24_8_UNORM = 78
+  BUF_FMT_24_8_UINT = 79
+  BUF_FMT_X24_8_32_UINT = 80
+  BUF_FMT_X24_8_32_FLOAT = 81
+  BUF_FMT_GB_GR_UNORM = 82
+  BUF_FMT_GB_GR_SNORM = 83
+  BUF_FMT_GB_GR_UINT = 84
+  BUF_FMT_GB_GR_SRGB = 85
+  BUF_FMT_BG_RG_UNORM = 86
+  BUF_FMT_BG_RG_SNORM = 87
+  BUF_FMT_BG_RG_UINT = 88
+  BUF_FMT_BG_RG_SRGB = 89
+  BUF_FMT_BC1_UNORM = 109
+  BUF_FMT_BC1_SRGB = 110
+  BUF_FMT_BC2_UNORM = 111

 class DSOp(IntEnum):
  DS_ADD_U32 = 0
@@ -1372,7 +1435,6 @@ class VOP3POp(IntEnum):
  V_WMMA_I32_16X16X16_IU4 = 69

 class VOP3SDOp(IntEnum):
-  DWORD = 1
  V_ADD_CO_CI_U32 = 288
  V_SUB_CO_CI_U32 = 289
  V_SUBREV_CO_CI_U32 = 290
@@ -1594,68 +1656,3 @@ class VOPDOp(IntEnum):
  V_DUAL_ADD_NC_U32 = 16
  V_DUAL_LSHLREV_B32 = 17
  V_DUAL_AND_B32 = 18
-
-class BufFmt(IntEnum):
-  BUF_FMT_8_UNORM = 1
-  BUF_FMT_8_SNORM = 2
-  BUF_FMT_8_USCALED = 3
-  BUF_FMT_8_SSCALED = 4
-  BUF_FMT_8_UINT = 5
-  BUF_FMT_8_SINT = 6
-  BUF_FMT_16_UNORM = 7
-  BUF_FMT_16_SNORM = 8
-  BUF_FMT_16_USCALED = 9
-  BUF_FMT_16_SSCALED = 10
-  BUF_FMT_16_UINT = 11
-  BUF_FMT_16_SINT = 12
-  BUF_FMT_16_FLOAT = 13
-  BUF_FMT_8_8_UNORM = 14
-  BUF_FMT_8_8_SNORM = 15
-  BUF_FMT_8_8_USCALED = 16
-  BUF_FMT_8_8_SSCALED = 17
-  BUF_FMT_8_8_UINT = 18
-  BUF_FMT_8_8_SINT = 19
-  BUF_FMT_32_UINT = 20
-  BUF_FMT_32_SINT = 21
-  BUF_FMT_32_FLOAT = 22
-  BUF_FMT_16_16_UNORM = 23
-  BUF_FMT_16_16_SNORM = 24
-  BUF_FMT_16_16_USCALED = 25
-  BUF_FMT_16_16_SSCALED = 26
-  BUF_FMT_16_16_UINT = 27
-  BUF_FMT_16_16_SINT = 28
-  BUF_FMT_16_16_FLOAT = 29
-  BUF_FMT_10_11_11_FLOAT = 30
-  BUF_FMT_11_11_10_FLOAT = 31
-  BUF_FMT_10_10_10_2_UNORM = 32
-  BUF_FMT_10_10_10_2_SNORM = 33
-  BUF_FMT_10_10_10_2_UINT = 34
-  BUF_FMT_10_10_10_2_SINT = 35
-  BUF_FMT_2_10_10_10_UNORM = 36
-  BUF_FMT_2_10_10_10_SNORM = 37
-  BUF_FMT_2_10_10_10_USCALED = 38
-  BUF_FMT_2_10_10_10_SSCALED = 39
-  BUF_FMT_2_10_10_10_UINT = 40
-  BUF_FMT_2_10_10_10_SINT = 41
-  BUF_FMT_8_8_8_8_UNORM = 42
-  BUF_FMT_8_8_8_8_SNORM = 43
-  BUF_FMT_8_8_8_8_USCALED = 44
-  BUF_FMT_8_8_8_8_SSCALED = 45
-  BUF_FMT_8_8_8_8_UINT = 46
-  BUF_FMT_8_8_8_8_SINT = 47
-  BUF_FMT_32_32_UINT = 48
-  BUF_FMT_32_32_SINT = 49
-  BUF_FMT_32_32_FLOAT = 50
-  BUF_FMT_16_16_16_16_UNORM = 51
-  BUF_FMT_16_16_16_16_SNORM = 52
-  BUF_FMT_16_16_16_16_USCALED = 53
-  BUF_FMT_16_16_16_16_SSCALED = 54
-  BUF_FMT_16_16_16_16_UINT = 55
-  BUF_FMT_16_16_16_16_SINT = 56
-  BUF_FMT_16_16_16_16_FLOAT = 57
-  BUF_FMT_32_32_32_UINT = 58
-  BUF_FMT_32_32_32_SINT = 59
-  BUF_FMT_32_32_32_FLOAT = 60
-  BUF_FMT_32_32_32_32_UINT = 61
-  BUF_FMT_32_32_32_32_SINT = 62
-  BUF_FMT_32_32_32_32_FLOAT = 63
--- a/extra/assembly/amd/autogen/rdna3/ins.py
+++ b/extra/assembly/amd/autogen/rdna3/ins.py
@@ -1,12 +1,11 @@
-# autogenerated from AMD RDNA3.5 ISA PDF by pdf.py - do not edit
+# autogenerated from AMD ISA PDF by pdf.py - do not edit
 # ruff: noqa: F401,F403
 from typing import Annotated
-from extra.assembly.amd.dsl import bits, BitField, Inst32, Inst64, Inst96, SGPR, VGPR, TTMP as TTMP, s as s, v as v, ttmp as ttmp, SSrc, Src, SImm, Imm, VDSTYEnc, SGPRField, VGPRField
+from extra.assembly.amd.dsl import *
 from extra.assembly.amd.autogen.rdna3.enum import *
 import functools

-# instruction formats
-class DPP16(Inst64):
+class DPP16(Inst):
  src0:Src = bits[39:32]
  dpp_ctrl = bits[48:40]
  fi = bits[50]
@@ -18,7 +17,7 @@ class DPP16(Inst64):
  bank_mask = bits[59:56]
  row_mask = bits[63:60]

-class DPP8(Inst64):
+class DPP8(Inst):
  src0:Src = bits[39:32]
  lane_sel0 = bits[42:40]
  lane_sel1 = bits[45:43]
@@ -29,7 +28,7 @@ class DPP8(Inst64):
  lane_sel6 = bits[60:58]
  lane_sel7 = bits[63:61]

-class DS(Inst64):
+class DS(Inst):
  encoding = bits[31:26] == 0b110110
  op:Annotated[BitField, DSOp] = bits[25:18]
  vdst:VGPRField = bits[63:56]
@@ -40,18 +39,18 @@ class DS(Inst64):
  offset1 = bits[15:8]
  gds = bits[17]

-class EXP(Inst64):
+class EXP(Inst):
  encoding = bits[31:26] == 0b111110
+  vsrc0:VGPRField = bits[39:32]
+  vsrc1:VGPRField = bits[47:40]
+  vsrc2:VGPRField = bits[55:48]
+  vsrc3:VGPRField = bits[63:56]
  en = bits[3:0]
  target = bits[9:4]
-  vsrc0 = bits[39:32]
-  vsrc1:VGPRField = bits[47:40]
-  vsrc2 = bits[55:48]
-  vsrc3 = bits[63:56]
  done = bits[11]
  row = bits[13]

-class FLAT(Inst64):
+class FLAT(Inst):
  encoding = bits[31:26] == 0b110111
  op:Annotated[BitField, FLATOp] = bits[24:18]
  vdst:VGPRField = bits[63:56]
@@ -60,12 +59,12 @@ class FLAT(Inst64):
  saddr:SSrc = bits[54:48]
  offset:Imm = bits[12:0]
  seg = bits[17:16]
-  dlc = bits[13]
  glc = bits[14]
+  dlc = bits[13]
  slc = bits[15]
  sve = bits[55]

-class LDSDIR(Inst32):
+class LDSDIR(Inst):
  encoding = bits[31:24] == 0b11001110
  op = bits[21:20]
  vdst:VGPRField = bits[7:0]
@@ -73,29 +72,29 @@ class LDSDIR(Inst32):
  attr_chan = bits[9:8]
  wait_va = bits[19:16]

-class MIMG(Inst64):
+class MIMG(Inst):
  encoding = bits[31:26] == 0b111100
  op:Annotated[BitField, MIMGOp] = bits[25:18]
  vdata:VGPRField = bits[47:40]
  vaddr:VGPRField = bits[39:32]
  srsrc:SGPRField = bits[52:48]
-  ssamp = bits[62:58]
+  ssamp:SGPRField = bits[62:58]
  dmask = bits[11:8]
  dim = bits[4:2]
-  unrm = bits[7]
-  dlc = bits[13]
  glc = bits[14]
+  dlc = bits[13]
  slc = bits[12]
+  tfe = bits[53]
+  unrm = bits[7]
  nsa = bits[0]
  r128 = bits[15]
  a16 = bits[16]
  d16 = bits[17]
-  tfe = bits[53]
  lwe = bits[54]
  addr1 = bits[71:64]
  addr2 = bits[79:72]

-class MTBUF(Inst64):
+class MTBUF(Inst):
  encoding = bits[31:26] == 0b111010
  op:Annotated[BitField, MTBUFOp] = bits[18:15]
  vdata:VGPRField = bits[47:40]
@@ -111,7 +110,7 @@ class MTBUF(Inst64):
  slc = bits[12]
  tfe = bits[53]

-class MUBUF(Inst64):
+class MUBUF(Inst):
  encoding = bits[31:26] == 0b111000
  op:Annotated[BitField, MUBUFOp] = bits[25:18]
  vdata:VGPRField = bits[47:40]
@@ -126,7 +125,7 @@ class MUBUF(Inst64):
  slc = bits[12]
  tfe = bits[53]

-class SMEM(Inst64):
+class SMEM(Inst):
  encoding = bits[31:26] == 0b111101
  op:Annotated[BitField, SMEMOp] = bits[25:18]
  sdata:SGPRField = bits[12:6]
@@ -136,62 +135,63 @@ class SMEM(Inst64):
  glc = bits[14]
  dlc = bits[13]

-class SOP1(Inst32):
+class SOP1(Inst):
  encoding = bits[31:23] == 0b101111101
  op:Annotated[BitField, SOP1Op] = bits[15:8]
  sdst:SGPRField = bits[22:16]
  ssrc0:SSrc = bits[7:0]

-class SOP2(Inst32):
+class SOP2(Inst):
  encoding = bits[31:30] == 0b10
  op:Annotated[BitField, SOP2Op] = bits[29:23]
  sdst:SGPRField = bits[22:16]
  ssrc0:SSrc = bits[7:0]
  ssrc1:SSrc = bits[15:8]

-class SOPC(Inst32):
+class SOPC(Inst):
  encoding = bits[31:23] == 0b101111110
  op:Annotated[BitField, SOPCOp] = bits[22:16]
  ssrc0:SSrc = bits[7:0]
  ssrc1:SSrc = bits[15:8]

-class SOPK(Inst32):
+class SOPK(Inst):
  encoding = bits[31:28] == 0b1011
  op:Annotated[BitField, SOPKOp] = bits[27:23]
  sdst:SGPRField = bits[22:16]
  simm16:SImm = bits[15:0]

-class SOPP(Inst32):
+class SOPP(Inst):
  encoding = bits[31:23] == 0b101111111
  op:Annotated[BitField, SOPPOp] = bits[22:16]
  simm16:SImm = bits[15:0]

-class VINTERP(Inst64):
+class VINTERP(Inst):
  encoding = bits[31:24] == 0b11001101
  op:Annotated[BitField, VINTERPOp] = bits[22:16]
  vdst:VGPRField = bits[7:0]
  src0:Src = bits[40:32]
+  src0:Src = bits[40:32]
  src1:Src = bits[49:41]
  src2:Src = bits[58:50]
-  waitexp = bits[10:8]
+  neg = bits[63:61]
  clmp = bits[15]
  opsel = bits[14:11]
-  neg = bits[63:61]
+  waitexp = bits[10:8]

-class VOP1(Inst32):
-  encoding = bits[31:25] == 0b111111
+class VOP1(Inst):
+  encoding = bits[31:25] == 0b0111111
  op:Annotated[BitField, VOP1Op] = bits[16:9]
  vdst:VGPRField = bits[24:17]
  src0:Src = bits[8:0]

-class VOP2(Inst32):
-  encoding = bits[31] == 0
+class VOP2(Inst):
+  encoding = bits[31] == 0b0
  op:Annotated[BitField, VOP2Op] = bits[30:25]
  vdst:VGPRField = bits[24:17]
  src0:Src = bits[8:0]
  vsrc1:VGPRField = bits[16:9]

-class VOP3(Inst64):
+class VOP3(Inst):
  encoding = bits[31:26] == 0b110101
  op:Annotated[BitField, VOP3Op] = bits[25:16]
  vdst:VGPRField = bits[7:0]
@@ -204,9 +204,8 @@ class VOP3(Inst64):
  clmp = bits[15]
  opsel = bits[14:11]

-class VOP3P(Inst64):
+class VOP3P(Inst):
  encoding = bits[31:24] == 0b11001100
-  _defaults = {'opsel_hi': 3, 'opsel_hi2': 1}
  op:Annotated[BitField, VOP3POp] = bits[22:16]
  vdst:VGPRField = bits[7:0]
  src0:Src = bits[40:32]
@@ -214,12 +213,12 @@ class VOP3P(Inst64):
  src2:Src = bits[58:50]
  neg = bits[63:61]
  neg_hi = bits[10:8]
+  clmp = bits[15]
  opsel = bits[13:11]
  opsel_hi = bits[60:59]
-  clmp = bits[15]
  opsel_hi2 = bits[14]

-class VOP3SD(Inst64):
+class VOP3SD(Inst):
  encoding = bits[31:26] == 0b110101
  op:Annotated[BitField, VOP3SDOp] = bits[25:16]
  vdst:VGPRField = bits[7:0]
@@ -227,26 +226,26 @@ class VOP3SD(Inst64):
  src0:Src = bits[40:32]
  src1:Src = bits[49:41]
  src2:Src = bits[58:50]
-  clmp = bits[15]
  omod = bits[60:59]
  neg = bits[63:61]
+  clmp = bits[15]

-class VOPC(Inst32):
-  encoding = bits[31:25] == 0b111110
+class VOPC(Inst):
+  encoding = bits[31:25] == 0b0111110
  op:Annotated[BitField, VOPCOp] = bits[24:17]
  src0:Src = bits[8:0]
  vsrc1:VGPRField = bits[16:9]

-class VOPD(Inst64):
+class VOPD(Inst):
  encoding = bits[31:26] == 0b110010
  opx:Annotated[BitField, VOPDOp] = bits[25:22]
  opy:Annotated[BitField, VOPDOp] = bits[21:17]
-  vdstx:VGPRField = bits[63:56]
+  vdstx = bits[63:56]
  vdsty:VDSTYEnc = bits[55:49]
  srcx0:Src = bits[8:0]
-  vsrcx1:VGPRField = bits[16:9]
  srcy0:Src = bits[40:32]
-  vsrcy1:VGPRField = bits[48:41]
+  vsrcx1 = bits[16:9]
+  vsrcy1 = bits[48:41]

 # instruction helpers
 ds_add_u32 = functools.partial(DS, DSOp.DS_ADD_U32)
@@ -1077,16 +1076,16 @@ v_add_nc_u32_e32 = functools.partial(VOP2, VOP2Op.V_ADD_NC_U32)
 v_sub_nc_u32_e32 = functools.partial(VOP2, VOP2Op.V_SUB_NC_U32)
 v_subrev_nc_u32_e32 = functools.partial(VOP2, VOP2Op.V_SUBREV_NC_U32)
 v_fmac_f32_e32 = functools.partial(VOP2, VOP2Op.V_FMAC_F32)
-def v_fmamk_f32_e32(vdst, src0, K, vsrc1): return VOP2(VOP2Op.V_FMAMK_F32, vdst, src0, vsrc1, literal=K)
-def v_fmaak_f32_e32(vdst, src0, vsrc1, K): return VOP2(VOP2Op.V_FMAAK_F32, vdst, src0, vsrc1, literal=K)
+v_fmamk_f32_e32 = functools.partial(VOP2, VOP2Op.V_FMAMK_F32)
+v_fmaak_f32_e32 = functools.partial(VOP2, VOP2Op.V_FMAAK_F32)
 v_cvt_pk_rtz_f16_f32_e32 = functools.partial(VOP2, VOP2Op.V_CVT_PK_RTZ_F16_F32)
 v_add_f16_e32 = functools.partial(VOP2, VOP2Op.V_ADD_F16)
 v_sub_f16_e32 = functools.partial(VOP2, VOP2Op.V_SUB_F16)
 v_subrev_f16_e32 = functools.partial(VOP2, VOP2Op.V_SUBREV_F16)
 v_mul_f16_e32 = functools.partial(VOP2, VOP2Op.V_MUL_F16)
 v_fmac_f16_e32 = functools.partial(VOP2, VOP2Op.V_FMAC_F16)
-def v_fmamk_f16_e32(vdst, src0, K, vsrc1): return VOP2(VOP2Op.V_FMAMK_F16, vdst, src0, vsrc1, literal=K)
-def v_fmaak_f16_e32(vdst, src0, vsrc1, K): return VOP2(VOP2Op.V_FMAAK_F16, vdst, src0, vsrc1, literal=K)
+v_fmamk_f16_e32 = functools.partial(VOP2, VOP2Op.V_FMAMK_F16)
+v_fmaak_f16_e32 = functools.partial(VOP2, VOP2Op.V_FMAAK_F16)
 v_max_f16_e32 = functools.partial(VOP2, VOP2Op.V_MAX_F16)
 v_min_f16_e32 = functools.partial(VOP2, VOP2Op.V_MIN_F16)
 v_ldexp_f16_e32 = functools.partial(VOP2, VOP2Op.V_LDEXP_F16)
@@ -1554,7 +1553,6 @@ v_wmma_f16_16x16x16_f16 = functools.partial(VOP3P, VOP3POp.V_WMMA_F16_16X16X16_F
 v_wmma_bf16_16x16x16_bf16 = functools.partial(VOP3P, VOP3POp.V_WMMA_BF16_16X16X16_BF16)
 v_wmma_i32_16x16x16_iu8 = functools.partial(VOP3P, VOP3POp.V_WMMA_I32_16X16X16_IU8)
 v_wmma_i32_16x16x16_iu4 = functools.partial(VOP3P, VOP3POp.V_WMMA_I32_16X16X16_IU4)
-dword = functools.partial(VOP3SD, VOP3SDOp.DWORD)
 v_add_co_ci_u32 = functools.partial(VOP3SD, VOP3SDOp.V_ADD_CO_CI_U32)
 v_sub_co_ci_u32 = functools.partial(VOP3SD, VOP3SDOp.V_SUB_CO_CI_U32)
 v_subrev_co_ci_u32 = functools.partial(VOP3SD, VOP3SDOp.V_SUBREV_CO_CI_U32)
@@ -1771,31 +1769,4 @@ v_dual_dot2acc_f32_f16 = functools.partial(VOPD, VOPDOp.V_DUAL_DOT2ACC_F32_F16)
 v_dual_dot2acc_f32_bf16 = functools.partial(VOPD, VOPDOp.V_DUAL_DOT2ACC_F32_BF16)
 v_dual_add_nc_u32 = functools.partial(VOPD, VOPDOp.V_DUAL_ADD_NC_U32)
 v_dual_lshlrev_b32 = functools.partial(VOPD, VOPDOp.V_DUAL_LSHLREV_B32)
-v_dual_and_b32 = functools.partial(VOPD, VOPDOp.V_DUAL_AND_B32)
-
-VCC_LO = SrcEnum.VCC_LO
-VCC_HI = SrcEnum.VCC_HI
-NULL = SrcEnum.NULL
-M0 = SrcEnum.M0
-EXEC_LO = SrcEnum.EXEC_LO
-EXEC_HI = SrcEnum.EXEC_HI
-ZERO = SrcEnum.ZERO
-DPP8FI = SrcEnum.DPP8FI
-SHARED_BASE = SrcEnum.SHARED_BASE
-SHARED_LIMIT = SrcEnum.SHARED_LIMIT
-PRIVATE_BASE = SrcEnum.PRIVATE_BASE
-PRIVATE_LIMIT = SrcEnum.PRIVATE_LIMIT
-POS_HALF = SrcEnum.POS_HALF
-NEG_HALF = SrcEnum.NEG_HALF
-POS_ONE = SrcEnum.POS_ONE
-NEG_ONE = SrcEnum.NEG_ONE
-POS_TWO = SrcEnum.POS_TWO
-NEG_TWO = SrcEnum.NEG_TWO
-POS_FOUR = SrcEnum.POS_FOUR
-NEG_FOUR = SrcEnum.NEG_FOUR
-INV_2PI = SrcEnum.INV_2PI
-VCCZ = SrcEnum.VCCZ
-EXECZ = SrcEnum.EXECZ
-SCC = SrcEnum.SCC
-LDS_DIRECT = SrcEnum.LDS_DIRECT
-OFF = NULL
+v_dual_and_b32 = functools.partial(VOPD, VOPDOp.V_DUAL_AND_B32)
--- a/extra/assembly/amd/autogen/rdna3/str_pcode.py
+++ b/extra/assembly/amd/autogen/rdna3/str_pcode.py
--- a/extra/assembly/amd/autogen/rdna4/enum.py
+++ b/extra/assembly/amd/autogen/rdna4/enum.py
@@ -1,34 +1,100 @@
-# autogenerated from AMD RDNA4 ISA PDF by pdf.py - do not edit
+# autogenerated from AMD ISA PDF by pdf.py - do not edit
 from enum import IntEnum

-class SrcEnum(IntEnum):
-  VCC_LO = 106
-  VCC_HI = 107
-  NULL = 124
-  M0 = 125
-  EXEC_LO = 126
-  EXEC_HI = 127
-  ZERO = 128
-  DPP8 = 233
-  DPP8FI = 234
-  SHARED_BASE = 235
-  SHARED_LIMIT = 236
-  PRIVATE_BASE = 237
-  PRIVATE_LIMIT = 238
-  POS_HALF = 240
-  NEG_HALF = 241
-  POS_ONE = 242
-  NEG_ONE = 243
-  POS_TWO = 244
-  NEG_TWO = 245
-  POS_FOUR = 246
-  NEG_FOUR = 247
-  INV_2PI = 248
-  DPP16 = 250
-  VCCZ = 251
-  EXECZ = 252
-  SCC = 253
-  LDS_DIRECT = 254
+class BufFmt(IntEnum):
+  BUF_FMT_8_UNORM = 1
+  BUF_FMT_8_SNORM = 2
+  BUF_FMT_8_USCALED = 3
+  BUF_FMT_8_SSCALED = 4
+  BUF_FMT_8_UINT = 5
+  BUF_FMT_8_SINT = 6
+  BUF_FMT_16_UNORM = 7
+  BUF_FMT_16_SNORM = 8
+  BUF_FMT_16_USCALED = 9
+  BUF_FMT_16_SSCALED = 10
+  BUF_FMT_16_UINT = 11
+  BUF_FMT_16_SINT = 12
+  BUF_FMT_16_FLOAT = 13
+  BUF_FMT_8_8_UNORM = 14
+  BUF_FMT_8_8_SNORM = 15
+  BUF_FMT_8_8_USCALED = 16
+  BUF_FMT_8_8_SSCALED = 17
+  BUF_FMT_8_8_UINT = 18
+  BUF_FMT_8_8_SINT = 19
+  BUF_FMT_32_UINT = 20
+  BUF_FMT_32_SINT = 21
+  BUF_FMT_32_FLOAT = 22
+  BUF_FMT_16_16_UNORM = 23
+  BUF_FMT_16_16_SNORM = 24
+  BUF_FMT_16_16_USCALED = 25
+  BUF_FMT_16_16_SSCALED = 26
+  BUF_FMT_16_16_UINT = 27
+  BUF_FMT_16_16_SINT = 28
+  BUF_FMT_16_16_FLOAT = 29
+  BUF_FMT_10_11_11_FLOAT = 30
+  BUF_FMT_11_11_10_FLOAT = 31
+  BUF_FMT_10_10_10_2_UNORM = 32
+  BUF_FMT_10_10_10_2_SNORM = 33
+  BUF_FMT_10_10_10_2_UINT = 34
+  BUF_FMT_10_10_10_2_SINT = 35
+  BUF_FMT_2_10_10_10_UNORM = 36
+  BUF_FMT_2_10_10_10_SNORM = 37
+  BUF_FMT_2_10_10_10_USCALED = 38
+  BUF_FMT_2_10_10_10_SSCALED = 39
+  BUF_FMT_2_10_10_10_UINT = 40
+  BUF_FMT_2_10_10_10_SINT = 41
+  BUF_FMT_8_8_8_8_UNORM = 42
+  BUF_FMT_8_8_8_8_SNORM = 43
+  BUF_FMT_8_8_8_8_USCALED = 44
+  BUF_FMT_8_8_8_8_SSCALED = 45
+  BUF_FMT_8_8_8_8_UINT = 46
+  BUF_FMT_8_8_8_8_SINT = 47
+  BUF_FMT_32_32_UINT = 48
+  BUF_FMT_32_32_SINT = 49
+  BUF_FMT_32_32_FLOAT = 50
+  BUF_FMT_16_16_16_16_UNORM = 51
+  BUF_FMT_16_16_16_16_SNORM = 52
+  BUF_FMT_16_16_16_16_USCALED = 53
+  BUF_FMT_16_16_16_16_SSCALED = 54
+  BUF_FMT_16_16_16_16_UINT = 55
+  BUF_FMT_16_16_16_16_SINT = 56
+  BUF_FMT_16_16_16_16_FLOAT = 57
+  BUF_FMT_32_32_32_UINT = 58
+  BUF_FMT_32_32_32_SINT = 59
+  BUF_FMT_32_32_32_FLOAT = 60
+  BUF_FMT_32_32_32_32_UINT = 61
+  BUF_FMT_32_32_32_32_SINT = 62
+  BUF_FMT_32_32_32_32_FLOAT = 63
+  BUF_FMT_8_SRGB = 64
+  BUF_FMT_8_8_SRGB = 65
+  BUF_FMT_8_8_8_8_SRGB = 66
+  BUF_FMT_5_9_9_9_FLOAT = 67
+  BUF_FMT_5_6_5_UNORM = 68
+  BUF_FMT_1_5_5_5_UNORM = 69
+  BUF_FMT_5_5_5_1_UNORM = 70
+  BUF_FMT_4_4_4_4_UNORM = 71
+  BUF_FMT_4_4_UNORM = 72
+  BUF_FMT_1_UNORM = 73
+  BUF_FMT_1_REVERSED_UNORM = 74
+  BUF_FMT_32_FLOAT_CLAMP = 75
+  BUF_FMT_8_24_UNORM = 76
+  BUF_FMT_8_24_UINT = 77
+  BUF_FMT_24_8_UNORM = 78
+  BUF_FMT_24_8_UINT = 79
+  BUF_FMT_X24_8_32_UINT = 80
+  BUF_FMT_X24_8_32_FLOAT = 81
+  BUF_FMT_GB_GR_UNORM = 82
+  BUF_FMT_GB_GR_SNORM = 83
+  BUF_FMT_GB_GR_UINT = 84
+  BUF_FMT_GB_GR_SRGB = 85
+  BUF_FMT_BG_RG_UNORM = 86
+  BUF_FMT_BG_RG_SNORM = 87
+  BUF_FMT_BG_RG_UINT = 88
+  BUF_FMT_BG_RG_SRGB = 89
+  BUF_FMT_BC1_UNORM = 109
+  BUF_FMT_BC1_SRGB = 110
+  BUF_FMT_BC2_UNORM = 111
+  BUF_FMT_BC2_SRGB = 112

 class DSOp(IntEnum):
  DS_ADD_U32 = 0
@@ -1347,7 +1413,6 @@ class VOP3POp(IntEnum):
  V_SWMMAC_F32_16X16X32_BF8_BF8 = 90

 class VOP3SDOp(IntEnum):
-  DWORD = 1
  V_ADD_CO_CI_U32 = 288
  V_SUB_CO_CI_U32 = 289
  V_SUBREV_CO_CI_U32 = 290
@@ -1627,52 +1692,3 @@ class VSCRATCHOp(IntEnum):
  SCRATCH_STORE_D16_HI_B16 = 37
  SCRATCH_LOAD_BLOCK = 83
  SCRATCH_STORE_BLOCK = 84
-
-class BufFmt(IntEnum):
-  BUF_FMT_8_UNORM = 1
-  BUF_FMT_8_SNORM = 2
-  BUF_FMT_8_USCALED = 3
-  BUF_FMT_8_SSCALED = 4
-  BUF_FMT_8_UINT = 5
-  BUF_FMT_8_SINT = 6
-  BUF_FMT_16_UNORM = 7
-  BUF_FMT_16_SNORM = 8
-  BUF_FMT_16_USCALED = 9
-  BUF_FMT_16_SSCALED = 10
-  BUF_FMT_16_UINT = 11
-  BUF_FMT_16_SINT = 12
-  BUF_FMT_16_FLOAT = 13
-  BUF_FMT_8_8_UNORM = 14
-  BUF_FMT_8_8_SNORM = 15
-  BUF_FMT_8_8_USCALED = 16
-  BUF_FMT_8_8_SSCALED = 17
-  BUF_FMT_8_8_UINT = 18
-  BUF_FMT_8_8_SINT = 19
-  BUF_FMT_32_UINT = 20
-  BUF_FMT_32_SINT = 21
-  BUF_FMT_32_FLOAT = 22
-  BUF_FMT_16_16_UNORM = 23
-  BUF_FMT_10_10_10_2_UNORM = 32
-  BUF_FMT_10_10_10_2_SNORM = 33
-  BUF_FMT_10_10_10_2_UINT = 34
-  BUF_FMT_10_10_10_2_SINT = 35
-  BUF_FMT_2_10_10_10_UNORM = 36
-  BUF_FMT_2_10_10_10_SNORM = 37
-  BUF_FMT_2_10_10_10_USCALED = 38
-  BUF_FMT_2_10_10_10_SSCALED = 39
-  BUF_FMT_2_10_10_10_UINT = 40
-  BUF_FMT_2_10_10_10_SINT = 41
-  BUF_FMT_8_8_8_8_UNORM = 42
-  BUF_FMT_8_8_8_8_SNORM = 43
-  BUF_FMT_8_8_8_8_USCALED = 44
-  BUF_FMT_8_8_8_8_SSCALED = 45
-  BUF_FMT_8_8_8_8_UINT = 46
-  BUF_FMT_8_8_8_8_SINT = 47
-  BUF_FMT_32_32_UINT = 48
-  BUF_FMT_32_32_SINT = 49
-  BUF_FMT_32_32_FLOAT = 50
-  BUF_FMT_16_16_16_16_UNORM = 51
-  BUF_FMT_16_16_16_16_SNORM = 52
-  BUF_FMT_16_16_16_16_USCALED = 53
-  BUF_FMT_16_16_16_16_SSCALED = 54
-  BUF_FMT_16_16_16_16_UINT = 55
--- a/extra/assembly/amd/autogen/rdna4/ins.py
+++ b/extra/assembly/amd/autogen/rdna4/ins.py
@@ -1,12 +1,11 @@
-# autogenerated from AMD RDNA4 ISA PDF by pdf.py - do not edit
+# autogenerated from AMD ISA PDF by pdf.py - do not edit
 # ruff: noqa: F401,F403
 from typing import Annotated
-from extra.assembly.amd.dsl import bits, BitField, Inst32, Inst64, Inst96, SGPR, VGPR, TTMP as TTMP, s as s, v as v, ttmp as ttmp, SSrc, Src, SImm, Imm, VDSTYEnc, SGPRField, VGPRField
+from extra.assembly.amd.dsl import *
 from extra.assembly.amd.autogen.rdna4.enum import *
 import functools

-# instruction formats
-class DPP16(Inst64):
+class DPP16(Inst):
  src0:Src = bits[39:32]
  dpp_ctrl = bits[48:40]
  fi = bits[50]
@@ -18,7 +17,7 @@ class DPP16(Inst64):
  bank_mask = bits[59:56]
  row_mask = bits[63:60]

-class DPP8(Inst64):
+class DPP8(Inst):
  src0:Src = bits[39:32]
  lane_sel0 = bits[42:40]
  lane_sel1 = bits[45:43]
@@ -29,7 +28,17 @@ class DPP8(Inst64):
  lane_sel6 = bits[60:58]
  lane_sel7 = bits[63:61]

-class SMEM(Inst64):
+class DS(Inst):
+  encoding = bits[31:26] == 0b110110
+  op:Annotated[BitField, DSOp] = bits[25:18]
+  vdst:VGPRField = bits[63:56]
+  addr:VGPRField = bits[39:32]
+  data0:VGPRField = bits[47:40]
+  data1:VGPRField = bits[55:48]
+  offset0 = bits[7:0]
+  offset1 = bits[15:8]
+
+class SMEM(Inst):
  encoding = bits[31:26] == 0b111101
  op:Annotated[BitField, SMEMOp] = bits[18:13]
  sdata:SGPRField = bits[12:6]
@@ -39,153 +48,116 @@ class SMEM(Inst64):
  th = bits[24:23]
  ioffset = bits[55:32]

-class SOP1(Inst32):
+class SOP1(Inst):
  encoding = bits[31:23] == 0b101111101
  op:Annotated[BitField, SOP1Op] = bits[15:8]
  sdst:SGPRField = bits[22:16]
  ssrc0:SSrc = bits[7:0]

-class SOP2(Inst32):
+class SOP2(Inst):
  encoding = bits[31:30] == 0b10
  op:Annotated[BitField, SOP2Op] = bits[29:23]
  sdst:SGPRField = bits[22:16]
  ssrc0:SSrc = bits[7:0]
  ssrc1:SSrc = bits[15:8]

-class SOPC(Inst32):
+class SOPC(Inst):
  encoding = bits[31:23] == 0b101111110
  op:Annotated[BitField, SOPCOp] = bits[22:16]
  ssrc0:SSrc = bits[7:0]
  ssrc1:SSrc = bits[15:8]

-class SOPK(Inst32):
+class SOPK(Inst):
  encoding = bits[31:28] == 0b1011
  op:Annotated[BitField, SOPKOp] = bits[27:23]
  sdst:SGPRField = bits[22:16]
  simm16:SImm = bits[15:0]

-class SOPP(Inst32):
+class SOPP(Inst):
  encoding = bits[31:23] == 0b101111111
  op:Annotated[BitField, SOPPOp] = bits[22:16]
  simm16:SImm = bits[15:0]

-class VBUFFER(Inst96):
+class VBUFFER(Inst):
  encoding = bits[31:26] == 0b110001
-  soffset:SSrc = bits[6:0]
  op:Annotated[BitField, VBUFFEROp] = bits[21:14]
-  tfe = bits[22]
  vdata:VGPRField = bits[39:32]
-  rsrc = bits[49:41]
-  scope = bits[51:50]
-  th = bits[54:52]
+  vaddr:VGPRField = bits[71:64]
+  soffset:SSrc = bits[6:0]
  format = bits[61:55]
  offen = bits[62]
  idxen = bits[63]
-  vaddr:VGPRField = bits[71:64]
+  tfe = bits[22]
+  rsrc = bits[49:41]
+  scope = bits[51:50]
+  th = bits[54:52]
  ioffset = bits[95:72]

-class VDS(Inst64):
-  encoding = bits[31:26] == 0b110110
-  offset0 = bits[7:0]
-  offset1 = bits[15:8]
-  op = bits[25:18]
-  addr:VGPRField = bits[39:32]
-  data0:VGPRField = bits[47:40]
-  data1:VGPRField = bits[55:48]
-  vdst:VGPRField = bits[63:56]
-
-class VDSDIR(Inst64):
-  encoding = bits[31:24] == 0b11001101
+class VDSDIR(Inst):
+  encoding = bits[31:24] == 0b11001110
+  op:Annotated[BitField, VDSDIROp] = bits[21:20]
  vdst:VGPRField = bits[7:0]
-  waitexp = bits[10:8]
-  opsel = bits[14:11]
-  cm = bits[15]
-  op:Annotated[BitField, VDSDIROp] = bits[20:16]
-  src0:Src = bits[40:32]
-  src1:Src = bits[49:41]
-  src2:Src = bits[58:50]
-  neg = bits[63:61]
+  attr = bits[15:10]
+  attr_chan = bits[9:8]
+  wait_va = bits[19:16]
+  wait_vmvsrc = bits[23]

-class VEXPORT(Inst64):
+class VEXPORT(Inst):
  encoding = bits[31:26] == 0b111110
+  vsrc0:VGPRField = bits[39:32]
+  vsrc1:VGPRField = bits[47:40]
+  vsrc2:VGPRField = bits[55:48]
+  vsrc3:VGPRField = bits[63:56]
  en = bits[3:0]
  target = bits[9:4]
  done = bits[11]
  row = bits[13]
-  vsrc0 = bits[39:32]
-  vsrc1:VGPRField = bits[47:40]
-  vsrc2 = bits[55:48]
-  vsrc3 = bits[63:56]

-class VFLAT(Inst96):
-  encoding = bits[31:24] == 0b11101100
-  saddr:SSrc = bits[6:0]
-  op:Annotated[BitField, VFLATOp] = bits[20:14]
-  vdst:VGPRField = bits[39:32]
-  sve = bits[49]
-  scope = bits[51:50]
-  th = bits[54:52]
-  vsrc = bits[62:55]
-  vaddr:VGPRField = bits[71:64]
-  ioffset = bits[95:72]
-
-class VGLOBAL(Inst96):
-  encoding = bits[31:24] == 0b11101110
-  saddr:SSrc = bits[6:0]
-  op:Annotated[BitField, VGLOBALOp] = bits[20:14]
-  vdst:VGPRField = bits[39:32]
-  sve = bits[49]
-  scope = bits[51:50]
-  th = bits[54:52]
-  vsrc = bits[62:55]
-  vaddr:VGPRField = bits[71:64]
-  ioffset = bits[95:72]
-
-class VIMAGE(Inst96):
+class VIMAGE(Inst):
  encoding = bits[31:26] == 0b110100
+  op:Annotated[BitField, VIMAGEOp] = bits[21:14]
+  vdata:VGPRField = bits[39:32]
+  dmask = bits[25:22]
  dim = bits[2:0]
+  tfe = bits[55]
  r128 = bits[4]
  d16 = bits[5]
  a16 = bits[6]
-  op:Annotated[BitField, VIMAGEOp] = bits[21:14]
-  dmask = bits[25:22]
-  vdata:VGPRField = bits[39:32]
  rsrc = bits[49:41]
  scope = bits[51:50]
  th = bits[54:52]
-  tfe = bits[55]
  vaddr4 = bits[56:63]
  vaddr0 = bits[71:64]
  vaddr1 = bits[79:72]
  vaddr2 = bits[87:80]
  vaddr3 = bits[95:88]

-class VINTERP(Inst64):
+class VINTERP(Inst):
  encoding = bits[31:24] == 0b11001101
  op:Annotated[BitField, VINTERPOp] = bits[20:16]
  vdst:VGPRField = bits[7:0]
  src0:Src = bits[40:32]
  src1:Src = bits[49:41]
  src2:Src = bits[58:50]
-  waitexp = bits[10:8]
-  opsel = bits[14:11]
  neg = bits[63:61]
+  opsel = bits[14:11]
+  waitexp = bits[10:8]
  cm = bits[15]

-class VOP1(Inst32):
-  encoding = bits[31:25] == 0b111111
+class VOP1(Inst):
+  encoding = bits[31:25] == 0b0111111
  op:Annotated[BitField, VOP1Op] = bits[15:9]
  vdst:VGPRField = bits[24:17]
  src0:Src = bits[8:0]

-class VOP2(Inst32):
-  encoding = bits[31] == 0
+class VOP2(Inst):
+  encoding = bits[31] == 0b0
  op:Annotated[BitField, VOP2Op] = bits[30:25]
  vdst:VGPRField = bits[24:17]
  src0:Src = bits[8:0]
  vsrc1:VGPRField = bits[16:9]

-class VOP3(Inst64):
+class VOP3(Inst):
  encoding = bits[31:26] == 0b110101
  op:Annotated[BitField, VOP3Op] = bits[25:16]
  vdst:VGPRField = bits[7:0]
@@ -198,9 +170,8 @@ class VOP3(Inst64):
  opsel = bits[14:11]
  cm = bits[15]

-class VOP3P(Inst64):
+class VOP3P(Inst):
  encoding = bits[31:24] == 0b11001100
-  _defaults = {'opsel_hi': 3, 'opsel_hi2': 1}
  op:Annotated[BitField, VOP3POp] = bits[22:16]
  vdst:VGPRField = bits[7:0]
  src0:Src = bits[40:32]
@@ -213,7 +184,7 @@ class VOP3P(Inst64):
  opsel_hi2 = bits[14]
  cm = bits[15]

-class VOP3SD(Inst64):
+class VOP3SD(Inst):
  encoding = bits[31:26] == 0b110101
  op:Annotated[BitField, VOP3SDOp] = bits[25:16]
  vdst:VGPRField = bits[7:0]
@@ -221,38 +192,38 @@ class VOP3SD(Inst64):
  src0:Src = bits[40:32]
  src1:Src = bits[49:41]
  src2:Src = bits[58:50]
-  cm = bits[15]
  omod = bits[60:59]
  neg = bits[63:61]
+  cm = bits[15]

-class VOPC(Inst32):
-  encoding = bits[31:25] == 0b111110
+class VOPC(Inst):
+  encoding = bits[31:25] == 0b0111110
  op:Annotated[BitField, VOPCOp] = bits[24:17]
  src0:Src = bits[8:0]
  vsrc1:VGPRField = bits[16:9]

-class VOPD(Inst64):
+class VOPD(Inst):
  encoding = bits[31:26] == 0b110010
  opx:Annotated[BitField, VOPDOp] = bits[25:22]
  opy:Annotated[BitField, VOPDOp] = bits[21:17]
-  vdstx:VGPRField = bits[63:56]
+  vdstx = bits[63:56]
  vdsty:VDSTYEnc = bits[55:49]
  srcx0:Src = bits[8:0]
-  vsrcx1:VGPRField = bits[16:9]
  srcy0:Src = bits[40:32]
-  vsrcy1:VGPRField = bits[48:41]
+  vsrcx1 = bits[16:9]
+  vsrcy1 = bits[48:41]

-class VSAMPLE(Inst96):
+class VSAMPLE(Inst):
  encoding = bits[31:26] == 0b111001
+  op:Annotated[BitField, VSAMPLEOp] = bits[21:14]
+  vdata:VGPRField = bits[39:32]
+  dmask = bits[25:22]
  dim = bits[2:0]
  tfe = bits[3]
+  unrm = bits[13]
  r128 = bits[4]
  d16 = bits[5]
  a16 = bits[6]
-  unrm = bits[13]
-  op:Annotated[BitField, VSAMPLEOp] = bits[21:14]
-  dmask = bits[25:22]
-  vdata:VGPRField = bits[39:32]
  lwe = bits[40]
  rsrc = bits[49:41]
  scope = bits[51:50]
@@ -263,19 +234,130 @@ class VSAMPLE(Inst96):
  vaddr2 = bits[87:80]
  vaddr3 = bits[95:88]

-class VSCRATCH(Inst96):
-  encoding = bits[31:24] == 0b11101101
-  saddr:SSrc = bits[6:0]
-  op:Annotated[BitField, VSCRATCHOp] = bits[20:14]
-  vdst:VGPRField = bits[39:32]
-  sve = bits[49]
-  scope = bits[51:50]
-  th = bits[54:52]
-  vsrc = bits[62:55]
-  vaddr:VGPRField = bits[71:64]
-  ioffset = bits[95:72]
-
 # instruction helpers
+ds_add_u32 = functools.partial(DS, DSOp.DS_ADD_U32)
+ds_sub_u32 = functools.partial(DS, DSOp.DS_SUB_U32)
+ds_rsub_u32 = functools.partial(DS, DSOp.DS_RSUB_U32)
+ds_inc_u32 = functools.partial(DS, DSOp.DS_INC_U32)
+ds_dec_u32 = functools.partial(DS, DSOp.DS_DEC_U32)
+ds_min_i32 = functools.partial(DS, DSOp.DS_MIN_I32)
+ds_max_i32 = functools.partial(DS, DSOp.DS_MAX_I32)
+ds_min_u32 = functools.partial(DS, DSOp.DS_MIN_U32)
+ds_max_u32 = functools.partial(DS, DSOp.DS_MAX_U32)
+ds_and_b32 = functools.partial(DS, DSOp.DS_AND_B32)
+ds_or_b32 = functools.partial(DS, DSOp.DS_OR_B32)
+ds_xor_b32 = functools.partial(DS, DSOp.DS_XOR_B32)
+ds_mskor_b32 = functools.partial(DS, DSOp.DS_MSKOR_B32)
+ds_store_b32 = functools.partial(DS, DSOp.DS_STORE_B32)
+ds_store_2addr_b32 = functools.partial(DS, DSOp.DS_STORE_2ADDR_B32)
+ds_store_2addr_stride64_b32 = functools.partial(DS, DSOp.DS_STORE_2ADDR_STRIDE64_B32)
+ds_cmpstore_b32 = functools.partial(DS, DSOp.DS_CMPSTORE_B32)
+ds_min_num_f32 = functools.partial(DS, DSOp.DS_MIN_NUM_F32)
+ds_max_num_f32 = functools.partial(DS, DSOp.DS_MAX_NUM_F32)
+ds_nop = functools.partial(DS, DSOp.DS_NOP)
+ds_add_f32 = functools.partial(DS, DSOp.DS_ADD_F32)
+ds_store_b8 = functools.partial(DS, DSOp.DS_STORE_B8)
+ds_store_b16 = functools.partial(DS, DSOp.DS_STORE_B16)
+ds_add_rtn_u32 = functools.partial(DS, DSOp.DS_ADD_RTN_U32)
+ds_sub_rtn_u32 = functools.partial(DS, DSOp.DS_SUB_RTN_U32)
+ds_rsub_rtn_u32 = functools.partial(DS, DSOp.DS_RSUB_RTN_U32)
+ds_inc_rtn_u32 = functools.partial(DS, DSOp.DS_INC_RTN_U32)
+ds_dec_rtn_u32 = functools.partial(DS, DSOp.DS_DEC_RTN_U32)
+ds_min_rtn_i32 = functools.partial(DS, DSOp.DS_MIN_RTN_I32)
+ds_max_rtn_i32 = functools.partial(DS, DSOp.DS_MAX_RTN_I32)
+ds_min_rtn_u32 = functools.partial(DS, DSOp.DS_MIN_RTN_U32)
+ds_max_rtn_u32 = functools.partial(DS, DSOp.DS_MAX_RTN_U32)
+ds_and_rtn_b32 = functools.partial(DS, DSOp.DS_AND_RTN_B32)
+ds_or_rtn_b32 = functools.partial(DS, DSOp.DS_OR_RTN_B32)
+ds_xor_rtn_b32 = functools.partial(DS, DSOp.DS_XOR_RTN_B32)
+ds_mskor_rtn_b32 = functools.partial(DS, DSOp.DS_MSKOR_RTN_B32)
+ds_storexchg_rtn_b32 = functools.partial(DS, DSOp.DS_STOREXCHG_RTN_B32)
+ds_storexchg_2addr_rtn_b32 = functools.partial(DS, DSOp.DS_STOREXCHG_2ADDR_RTN_B32)
+ds_storexchg_2addr_stride64_rtn_b32 = functools.partial(DS, DSOp.DS_STOREXCHG_2ADDR_STRIDE64_RTN_B32)
+ds_cmpstore_rtn_b32 = functools.partial(DS, DSOp.DS_CMPSTORE_RTN_B32)
+ds_min_num_rtn_f32 = functools.partial(DS, DSOp.DS_MIN_NUM_RTN_F32)
+ds_max_num_rtn_f32 = functools.partial(DS, DSOp.DS_MAX_NUM_RTN_F32)
+ds_swizzle_b32 = functools.partial(DS, DSOp.DS_SWIZZLE_B32)
+ds_load_b32 = functools.partial(DS, DSOp.DS_LOAD_B32)
+ds_load_2addr_b32 = functools.partial(DS, DSOp.DS_LOAD_2ADDR_B32)
+ds_load_2addr_stride64_b32 = functools.partial(DS, DSOp.DS_LOAD_2ADDR_STRIDE64_B32)
+ds_load_i8 = functools.partial(DS, DSOp.DS_LOAD_I8)
+ds_load_u8 = functools.partial(DS, DSOp.DS_LOAD_U8)
+ds_load_i16 = functools.partial(DS, DSOp.DS_LOAD_I16)
+ds_load_u16 = functools.partial(DS, DSOp.DS_LOAD_U16)
+ds_consume = functools.partial(DS, DSOp.DS_CONSUME)
+ds_append = functools.partial(DS, DSOp.DS_APPEND)
+ds_add_u64 = functools.partial(DS, DSOp.DS_ADD_U64)
+ds_sub_u64 = functools.partial(DS, DSOp.DS_SUB_U64)
+ds_rsub_u64 = functools.partial(DS, DSOp.DS_RSUB_U64)
+ds_inc_u64 = functools.partial(DS, DSOp.DS_INC_U64)
+ds_dec_u64 = functools.partial(DS, DSOp.DS_DEC_U64)
+ds_min_i64 = functools.partial(DS, DSOp.DS_MIN_I64)
+ds_max_i64 = functools.partial(DS, DSOp.DS_MAX_I64)
+ds_min_u64 = functools.partial(DS, DSOp.DS_MIN_U64)
+ds_max_u64 = functools.partial(DS, DSOp.DS_MAX_U64)
+ds_and_b64 = functools.partial(DS, DSOp.DS_AND_B64)
+ds_or_b64 = functools.partial(DS, DSOp.DS_OR_B64)
+ds_xor_b64 = functools.partial(DS, DSOp.DS_XOR_B64)
+ds_mskor_b64 = functools.partial(DS, DSOp.DS_MSKOR_B64)
+ds_store_b64 = functools.partial(DS, DSOp.DS_STORE_B64)
+ds_store_2addr_b64 = functools.partial(DS, DSOp.DS_STORE_2ADDR_B64)
+ds_store_2addr_stride64_b64 = functools.partial(DS, DSOp.DS_STORE_2ADDR_STRIDE64_B64)
+ds_cmpstore_b64 = functools.partial(DS, DSOp.DS_CMPSTORE_B64)
+ds_min_num_f64 = functools.partial(DS, DSOp.DS_MIN_NUM_F64)
+ds_max_num_f64 = functools.partial(DS, DSOp.DS_MAX_NUM_F64)
+ds_add_rtn_u64 = functools.partial(DS, DSOp.DS_ADD_RTN_U64)
+ds_sub_rtn_u64 = functools.partial(DS, DSOp.DS_SUB_RTN_U64)
+ds_rsub_rtn_u64 = functools.partial(DS, DSOp.DS_RSUB_RTN_U64)
+ds_inc_rtn_u64 = functools.partial(DS, DSOp.DS_INC_RTN_U64)
+ds_dec_rtn_u64 = functools.partial(DS, DSOp.DS_DEC_RTN_U64)
+ds_min_rtn_i64 = functools.partial(DS, DSOp.DS_MIN_RTN_I64)
+ds_max_rtn_i64 = functools.partial(DS, DSOp.DS_MAX_RTN_I64)
+ds_min_rtn_u64 = functools.partial(DS, DSOp.DS_MIN_RTN_U64)
+ds_max_rtn_u64 = functools.partial(DS, DSOp.DS_MAX_RTN_U64)
+ds_and_rtn_b64 = functools.partial(DS, DSOp.DS_AND_RTN_B64)
+ds_or_rtn_b64 = functools.partial(DS, DSOp.DS_OR_RTN_B64)
+ds_xor_rtn_b64 = functools.partial(DS, DSOp.DS_XOR_RTN_B64)
+ds_mskor_rtn_b64 = functools.partial(DS, DSOp.DS_MSKOR_RTN_B64)
+ds_storexchg_rtn_b64 = functools.partial(DS, DSOp.DS_STOREXCHG_RTN_B64)
+ds_storexchg_2addr_rtn_b64 = functools.partial(DS, DSOp.DS_STOREXCHG_2ADDR_RTN_B64)
+ds_storexchg_2addr_stride64_rtn_b64 = functools.partial(DS, DSOp.DS_STOREXCHG_2ADDR_STRIDE64_RTN_B64)
+ds_cmpstore_rtn_b64 = functools.partial(DS, DSOp.DS_CMPSTORE_RTN_B64)
+ds_min_num_rtn_f64 = functools.partial(DS, DSOp.DS_MIN_NUM_RTN_F64)
+ds_max_num_rtn_f64 = functools.partial(DS, DSOp.DS_MAX_NUM_RTN_F64)
+ds_load_b64 = functools.partial(DS, DSOp.DS_LOAD_B64)
+ds_load_2addr_b64 = functools.partial(DS, DSOp.DS_LOAD_2ADDR_B64)
+ds_load_2addr_stride64_b64 = functools.partial(DS, DSOp.DS_LOAD_2ADDR_STRIDE64_B64)
+ds_add_rtn_f32 = functools.partial(DS, DSOp.DS_ADD_RTN_F32)
+ds_condxchg32_rtn_b64 = functools.partial(DS, DSOp.DS_CONDXCHG32_RTN_B64)
+ds_cond_sub_u32 = functools.partial(DS, DSOp.DS_COND_SUB_U32)
+ds_sub_clamp_u32 = functools.partial(DS, DSOp.DS_SUB_CLAMP_U32)
+ds_pk_add_f16 = functools.partial(DS, DSOp.DS_PK_ADD_F16)
+ds_pk_add_bf16 = functools.partial(DS, DSOp.DS_PK_ADD_BF16)
+ds_store_b8_d16_hi = functools.partial(DS, DSOp.DS_STORE_B8_D16_HI)
+ds_store_b16_d16_hi = functools.partial(DS, DSOp.DS_STORE_B16_D16_HI)
+ds_load_u8_d16 = functools.partial(DS, DSOp.DS_LOAD_U8_D16)
+ds_load_u8_d16_hi = functools.partial(DS, DSOp.DS_LOAD_U8_D16_HI)
+ds_load_i8_d16 = functools.partial(DS, DSOp.DS_LOAD_I8_D16)
+ds_load_i8_d16_hi = functools.partial(DS, DSOp.DS_LOAD_I8_D16_HI)
+ds_load_u16_d16 = functools.partial(DS, DSOp.DS_LOAD_U16_D16)
+ds_load_u16_d16_hi = functools.partial(DS, DSOp.DS_LOAD_U16_D16_HI)
+ds_cond_sub_rtn_u32 = functools.partial(DS, DSOp.DS_COND_SUB_RTN_U32)
+ds_sub_clamp_rtn_u32 = functools.partial(DS, DSOp.DS_SUB_CLAMP_RTN_U32)
+ds_pk_add_rtn_f16 = functools.partial(DS, DSOp.DS_PK_ADD_RTN_F16)
+ds_pk_add_rtn_bf16 = functools.partial(DS, DSOp.DS_PK_ADD_RTN_BF16)
+ds_store_addtid_b32 = functools.partial(DS, DSOp.DS_STORE_ADDTID_B32)
+ds_load_addtid_b32 = functools.partial(DS, DSOp.DS_LOAD_ADDTID_B32)
+ds_permute_b32 = functools.partial(DS, DSOp.DS_PERMUTE_B32)
+ds_bpermute_b32 = functools.partial(DS, DSOp.DS_BPERMUTE_B32)
+ds_bpermute_fi_b32 = functools.partial(DS, DSOp.DS_BPERMUTE_FI_B32)
+ds_store_b96 = functools.partial(DS, DSOp.DS_STORE_B96)
+ds_store_b128 = functools.partial(DS, DSOp.DS_STORE_B128)
+ds_bvh_stack_push4_pop1_rtn_b32 = functools.partial(DS, DSOp.DS_BVH_STACK_PUSH4_POP1_RTN_B32)
+ds_bvh_stack_push8_pop1_rtn_b32 = functools.partial(DS, DSOp.DS_BVH_STACK_PUSH8_POP1_RTN_B32)
+ds_bvh_stack_push8_pop2_rtn_b64 = functools.partial(DS, DSOp.DS_BVH_STACK_PUSH8_POP2_RTN_B64)
+ds_load_b96 = functools.partial(DS, DSOp.DS_LOAD_B96)
+ds_load_b128 = functools.partial(DS, DSOp.DS_LOAD_B128)
 s_load_b32 = functools.partial(SMEM, SMEMOp.S_LOAD_B32)
 s_load_b64 = functools.partial(SMEM, SMEMOp.S_LOAD_B64)
 s_load_b128 = functools.partial(SMEM, SMEMOp.S_LOAD_B128)
@@ -647,126 +729,6 @@ tbuffer_store_d16_format_xyz = functools.partial(VBUFFER, VBUFFEROp.TBUFFER_STOR
 tbuffer_store_d16_format_xyzw = functools.partial(VBUFFER, VBUFFEROp.TBUFFER_STORE_D16_FORMAT_XYZW)
 ds_param_load = functools.partial(VDSDIR, VDSDIROp.DS_PARAM_LOAD)
 ds_direct_load = functools.partial(VDSDIR, VDSDIROp.DS_DIRECT_LOAD)
-flat_load_u8 = functools.partial(VFLAT, VFLATOp.FLAT_LOAD_U8)
-flat_load_i8 = functools.partial(VFLAT, VFLATOp.FLAT_LOAD_I8)
-flat_load_u16 = functools.partial(VFLAT, VFLATOp.FLAT_LOAD_U16)
-flat_load_i16 = functools.partial(VFLAT, VFLATOp.FLAT_LOAD_I16)
-flat_load_b32 = functools.partial(VFLAT, VFLATOp.FLAT_LOAD_B32)
-flat_load_b64 = functools.partial(VFLAT, VFLATOp.FLAT_LOAD_B64)
-flat_load_b96 = functools.partial(VFLAT, VFLATOp.FLAT_LOAD_B96)
-flat_load_b128 = functools.partial(VFLAT, VFLATOp.FLAT_LOAD_B128)
-flat_store_b8 = functools.partial(VFLAT, VFLATOp.FLAT_STORE_B8)
-flat_store_b16 = functools.partial(VFLAT, VFLATOp.FLAT_STORE_B16)
-flat_store_b32 = functools.partial(VFLAT, VFLATOp.FLAT_STORE_B32)
-flat_store_b64 = functools.partial(VFLAT, VFLATOp.FLAT_STORE_B64)
-flat_store_b96 = functools.partial(VFLAT, VFLATOp.FLAT_STORE_B96)
-flat_store_b128 = functools.partial(VFLAT, VFLATOp.FLAT_STORE_B128)
-flat_load_d16_u8 = functools.partial(VFLAT, VFLATOp.FLAT_LOAD_D16_U8)
-flat_load_d16_i8 = functools.partial(VFLAT, VFLATOp.FLAT_LOAD_D16_I8)
-flat_load_d16_b16 = functools.partial(VFLAT, VFLATOp.FLAT_LOAD_D16_B16)
-flat_load_d16_hi_u8 = functools.partial(VFLAT, VFLATOp.FLAT_LOAD_D16_HI_U8)
-flat_load_d16_hi_i8 = functools.partial(VFLAT, VFLATOp.FLAT_LOAD_D16_HI_I8)
-flat_load_d16_hi_b16 = functools.partial(VFLAT, VFLATOp.FLAT_LOAD_D16_HI_B16)
-flat_store_d16_hi_b8 = functools.partial(VFLAT, VFLATOp.FLAT_STORE_D16_HI_B8)
-flat_store_d16_hi_b16 = functools.partial(VFLAT, VFLATOp.FLAT_STORE_D16_HI_B16)
-flat_atomic_swap_b32 = functools.partial(VFLAT, VFLATOp.FLAT_ATOMIC_SWAP_B32)
-flat_atomic_cmpswap_b32 = functools.partial(VFLAT, VFLATOp.FLAT_ATOMIC_CMPSWAP_B32)
-flat_atomic_add_u32 = functools.partial(VFLAT, VFLATOp.FLAT_ATOMIC_ADD_U32)
-flat_atomic_sub_u32 = functools.partial(VFLAT, VFLATOp.FLAT_ATOMIC_SUB_U32)
-flat_atomic_sub_clamp_u32 = functools.partial(VFLAT, VFLATOp.FLAT_ATOMIC_SUB_CLAMP_U32)
-flat_atomic_min_i32 = functools.partial(VFLAT, VFLATOp.FLAT_ATOMIC_MIN_I32)
-flat_atomic_min_u32 = functools.partial(VFLAT, VFLATOp.FLAT_ATOMIC_MIN_U32)
-flat_atomic_max_i32 = functools.partial(VFLAT, VFLATOp.FLAT_ATOMIC_MAX_I32)
-flat_atomic_max_u32 = functools.partial(VFLAT, VFLATOp.FLAT_ATOMIC_MAX_U32)
-flat_atomic_and_b32 = functools.partial(VFLAT, VFLATOp.FLAT_ATOMIC_AND_B32)
-flat_atomic_or_b32 = functools.partial(VFLAT, VFLATOp.FLAT_ATOMIC_OR_B32)
-flat_atomic_xor_b32 = functools.partial(VFLAT, VFLATOp.FLAT_ATOMIC_XOR_B32)
-flat_atomic_inc_u32 = functools.partial(VFLAT, VFLATOp.FLAT_ATOMIC_INC_U32)
-flat_atomic_dec_u32 = functools.partial(VFLAT, VFLATOp.FLAT_ATOMIC_DEC_U32)
-flat_atomic_swap_b64 = functools.partial(VFLAT, VFLATOp.FLAT_ATOMIC_SWAP_B64)
-flat_atomic_cmpswap_b64 = functools.partial(VFLAT, VFLATOp.FLAT_ATOMIC_CMPSWAP_B64)
-flat_atomic_add_u64 = functools.partial(VFLAT, VFLATOp.FLAT_ATOMIC_ADD_U64)
-flat_atomic_sub_u64 = functools.partial(VFLAT, VFLATOp.FLAT_ATOMIC_SUB_U64)
-flat_atomic_min_i64 = functools.partial(VFLAT, VFLATOp.FLAT_ATOMIC_MIN_I64)
-flat_atomic_min_u64 = functools.partial(VFLAT, VFLATOp.FLAT_ATOMIC_MIN_U64)
-flat_atomic_max_i64 = functools.partial(VFLAT, VFLATOp.FLAT_ATOMIC_MAX_I64)
-flat_atomic_max_u64 = functools.partial(VFLAT, VFLATOp.FLAT_ATOMIC_MAX_U64)
-flat_atomic_and_b64 = functools.partial(VFLAT, VFLATOp.FLAT_ATOMIC_AND_B64)
-flat_atomic_or_b64 = functools.partial(VFLAT, VFLATOp.FLAT_ATOMIC_OR_B64)
-flat_atomic_xor_b64 = functools.partial(VFLAT, VFLATOp.FLAT_ATOMIC_XOR_B64)
-flat_atomic_inc_u64 = functools.partial(VFLAT, VFLATOp.FLAT_ATOMIC_INC_U64)
-flat_atomic_dec_u64 = functools.partial(VFLAT, VFLATOp.FLAT_ATOMIC_DEC_U64)
-flat_atomic_cond_sub_u32 = functools.partial(VFLAT, VFLATOp.FLAT_ATOMIC_COND_SUB_U32)
-flat_atomic_min_num_f32 = functools.partial(VFLAT, VFLATOp.FLAT_ATOMIC_MIN_NUM_F32)
-flat_atomic_max_num_f32 = functools.partial(VFLAT, VFLATOp.FLAT_ATOMIC_MAX_NUM_F32)
-flat_atomic_add_f32 = functools.partial(VFLAT, VFLATOp.FLAT_ATOMIC_ADD_F32)
-flat_atomic_pk_add_f16 = functools.partial(VFLAT, VFLATOp.FLAT_ATOMIC_PK_ADD_F16)
-flat_atomic_pk_add_bf16 = functools.partial(VFLAT, VFLATOp.FLAT_ATOMIC_PK_ADD_BF16)
-global_load_u8 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_LOAD_U8)
-global_load_i8 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_LOAD_I8)
-global_load_u16 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_LOAD_U16)
-global_load_i16 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_LOAD_I16)
-global_load_b32 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_LOAD_B32)
-global_load_b64 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_LOAD_B64)
-global_load_b96 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_LOAD_B96)
-global_load_b128 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_LOAD_B128)
-global_store_b8 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_STORE_B8)
-global_store_b16 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_STORE_B16)
-global_store_b32 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_STORE_B32)
-global_store_b64 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_STORE_B64)
-global_store_b96 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_STORE_B96)
-global_store_b128 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_STORE_B128)
-global_load_d16_u8 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_LOAD_D16_U8)
-global_load_d16_i8 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_LOAD_D16_I8)
-global_load_d16_b16 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_LOAD_D16_B16)
-global_load_d16_hi_u8 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_LOAD_D16_HI_U8)
-global_load_d16_hi_i8 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_LOAD_D16_HI_I8)
-global_load_d16_hi_b16 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_LOAD_D16_HI_B16)
-global_store_d16_hi_b8 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_STORE_D16_HI_B8)
-global_store_d16_hi_b16 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_STORE_D16_HI_B16)
-global_load_addtid_b32 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_LOAD_ADDTID_B32)
-global_store_addtid_b32 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_STORE_ADDTID_B32)
-global_inv = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_INV)
-global_wb = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_WB)
-global_atomic_swap_b32 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_ATOMIC_SWAP_B32)
-global_atomic_cmpswap_b32 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_ATOMIC_CMPSWAP_B32)
-global_atomic_add_u32 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_ATOMIC_ADD_U32)
-global_atomic_sub_u32 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_ATOMIC_SUB_U32)
-global_atomic_sub_clamp_u32 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_ATOMIC_SUB_CLAMP_U32)
-global_atomic_min_i32 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_ATOMIC_MIN_I32)
-global_atomic_min_u32 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_ATOMIC_MIN_U32)
-global_atomic_max_i32 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_ATOMIC_MAX_I32)
-global_atomic_max_u32 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_ATOMIC_MAX_U32)
-global_atomic_and_b32 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_ATOMIC_AND_B32)
-global_atomic_or_b32 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_ATOMIC_OR_B32)
-global_atomic_xor_b32 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_ATOMIC_XOR_B32)
-global_atomic_inc_u32 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_ATOMIC_INC_U32)
-global_atomic_dec_u32 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_ATOMIC_DEC_U32)
-global_atomic_swap_b64 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_ATOMIC_SWAP_B64)
-global_atomic_cmpswap_b64 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_ATOMIC_CMPSWAP_B64)
-global_atomic_add_u64 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_ATOMIC_ADD_U64)
-global_atomic_sub_u64 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_ATOMIC_SUB_U64)
-global_atomic_min_i64 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_ATOMIC_MIN_I64)
-global_atomic_min_u64 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_ATOMIC_MIN_U64)
-global_atomic_max_i64 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_ATOMIC_MAX_I64)
-global_atomic_max_u64 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_ATOMIC_MAX_U64)
-global_atomic_and_b64 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_ATOMIC_AND_B64)
-global_atomic_or_b64 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_ATOMIC_OR_B64)
-global_atomic_xor_b64 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_ATOMIC_XOR_B64)
-global_atomic_inc_u64 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_ATOMIC_INC_U64)
-global_atomic_dec_u64 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_ATOMIC_DEC_U64)
-global_wbinv = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_WBINV)
-global_atomic_cond_sub_u32 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_ATOMIC_COND_SUB_U32)
-global_atomic_min_num_f32 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_ATOMIC_MIN_NUM_F32)
-global_atomic_max_num_f32 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_ATOMIC_MAX_NUM_F32)
-global_load_block = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_LOAD_BLOCK)
-global_store_block = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_STORE_BLOCK)
-global_atomic_add_f32 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_ATOMIC_ADD_F32)
-global_load_tr_b128 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_LOAD_TR_B128)
-global_load_tr_b64 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_LOAD_TR_B64)
-global_atomic_pk_add_f16 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_ATOMIC_PK_ADD_F16)
-global_atomic_pk_add_bf16 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_ATOMIC_PK_ADD_BF16)
-global_atomic_ordered_add_b64 = functools.partial(VGLOBAL, VGLOBALOp.GLOBAL_ATOMIC_ORDERED_ADD_B64)
 image_load = functools.partial(VIMAGE, VIMAGEOp.IMAGE_LOAD)
 image_load_mip = functools.partial(VIMAGE, VIMAGEOp.IMAGE_LOAD_MIP)
 image_load_pck = functools.partial(VIMAGE, VIMAGEOp.IMAGE_LOAD_PCK)
@@ -931,8 +893,8 @@ v_add_nc_u32_e32 = functools.partial(VOP2, VOP2Op.V_ADD_NC_U32)
 v_sub_nc_u32_e32 = functools.partial(VOP2, VOP2Op.V_SUB_NC_U32)
 v_subrev_nc_u32_e32 = functools.partial(VOP2, VOP2Op.V_SUBREV_NC_U32)
 v_fmac_f32_e32 = functools.partial(VOP2, VOP2Op.V_FMAC_F32)
-def v_fmamk_f32_e32(vdst, src0, K, vsrc1): return VOP2(VOP2Op.V_FMAMK_F32, vdst, src0, vsrc1, literal=K)
-def v_fmaak_f32_e32(vdst, src0, vsrc1, K): return VOP2(VOP2Op.V_FMAAK_F32, vdst, src0, vsrc1, literal=K)
+v_fmamk_f32_e32 = functools.partial(VOP2, VOP2Op.V_FMAMK_F32)
+v_fmaak_f32_e32 = functools.partial(VOP2, VOP2Op.V_FMAAK_F32)
 v_cvt_pk_rtz_f16_f32_e32 = functools.partial(VOP2, VOP2Op.V_CVT_PK_RTZ_F16_F32)
 v_min_num_f16_e32 = functools.partial(VOP2, VOP2Op.V_MIN_NUM_F16)
 v_max_num_f16_e32 = functools.partial(VOP2, VOP2Op.V_MAX_NUM_F16)
@@ -941,8 +903,8 @@ v_sub_f16_e32 = functools.partial(VOP2, VOP2Op.V_SUB_F16)
 v_subrev_f16_e32 = functools.partial(VOP2, VOP2Op.V_SUBREV_F16)
 v_mul_f16_e32 = functools.partial(VOP2, VOP2Op.V_MUL_F16)
 v_fmac_f16_e32 = functools.partial(VOP2, VOP2Op.V_FMAC_F16)
-def v_fmamk_f16_e32(vdst, src0, K, vsrc1): return VOP2(VOP2Op.V_FMAMK_F16, vdst, src0, vsrc1, literal=K)
-def v_fmaak_f16_e32(vdst, src0, vsrc1, K): return VOP2(VOP2Op.V_FMAAK_F16, vdst, src0, vsrc1, literal=K)
+v_fmamk_f16_e32 = functools.partial(VOP2, VOP2Op.V_FMAMK_F16)
+v_fmaak_f16_e32 = functools.partial(VOP2, VOP2Op.V_FMAAK_F16)
 v_ldexp_f16_e32 = functools.partial(VOP2, VOP2Op.V_LDEXP_F16)
 v_pk_fmac_f16_e32 = functools.partial(VOP2, VOP2Op.V_PK_FMAC_F16)
 v_cmp_lt_f16_e64 = functools.partial(VOP3, VOP3Op.V_CMP_LT_F16)
@@ -1435,7 +1397,6 @@ v_swmmac_f32_16x16x32_fp8_fp8 = functools.partial(VOP3P, VOP3POp.V_SWMMAC_F32_16
 v_swmmac_f32_16x16x32_fp8_bf8 = functools.partial(VOP3P, VOP3POp.V_SWMMAC_F32_16X16X32_FP8_BF8)
 v_swmmac_f32_16x16x32_bf8_fp8 = functools.partial(VOP3P, VOP3POp.V_SWMMAC_F32_16X16X32_BF8_FP8)
 v_swmmac_f32_16x16x32_bf8_bf8 = functools.partial(VOP3P, VOP3POp.V_SWMMAC_F32_16X16X32_BF8_BF8)
-dword = functools.partial(VOP3SD, VOP3SDOp.DWORD)
 v_add_co_ci_u32 = functools.partial(VOP3SD, VOP3SDOp.V_ADD_CO_CI_U32)
 v_sub_co_ci_u32 = functools.partial(VOP3SD, VOP3SDOp.V_SUB_CO_CI_U32)
 v_subrev_co_ci_u32 = functools.partial(VOP3SD, VOP3SDOp.V_SUBREV_CO_CI_U32)
@@ -1682,55 +1643,4 @@ image_gather4_c_cl = functools.partial(VSAMPLE, VSAMPLEOp.IMAGE_GATHER4_C_CL)
 image_gather4_c_l = functools.partial(VSAMPLE, VSAMPLEOp.IMAGE_GATHER4_C_L)
 image_gather4_c_b = functools.partial(VSAMPLE, VSAMPLEOp.IMAGE_GATHER4_C_B)
 image_gather4_c_b_cl = functools.partial(VSAMPLE, VSAMPLEOp.IMAGE_GATHER4_C_B_CL)
-image_gather4h = functools.partial(VSAMPLE, VSAMPLEOp.IMAGE_GATHER4H)
-scratch_load_u8 = functools.partial(VSCRATCH, VSCRATCHOp.SCRATCH_LOAD_U8)
-scratch_load_i8 = functools.partial(VSCRATCH, VSCRATCHOp.SCRATCH_LOAD_I8)
-scratch_load_u16 = functools.partial(VSCRATCH, VSCRATCHOp.SCRATCH_LOAD_U16)
-scratch_load_i16 = functools.partial(VSCRATCH, VSCRATCHOp.SCRATCH_LOAD_I16)
-scratch_load_b32 = functools.partial(VSCRATCH, VSCRATCHOp.SCRATCH_LOAD_B32)
-scratch_load_b64 = functools.partial(VSCRATCH, VSCRATCHOp.SCRATCH_LOAD_B64)
-scratch_load_b96 = functools.partial(VSCRATCH, VSCRATCHOp.SCRATCH_LOAD_B96)
-scratch_load_b128 = functools.partial(VSCRATCH, VSCRATCHOp.SCRATCH_LOAD_B128)
-scratch_store_b8 = functools.partial(VSCRATCH, VSCRATCHOp.SCRATCH_STORE_B8)
-scratch_store_b16 = functools.partial(VSCRATCH, VSCRATCHOp.SCRATCH_STORE_B16)
-scratch_store_b32 = functools.partial(VSCRATCH, VSCRATCHOp.SCRATCH_STORE_B32)
-scratch_store_b64 = functools.partial(VSCRATCH, VSCRATCHOp.SCRATCH_STORE_B64)
-scratch_store_b96 = functools.partial(VSCRATCH, VSCRATCHOp.SCRATCH_STORE_B96)
-scratch_store_b128 = functools.partial(VSCRATCH, VSCRATCHOp.SCRATCH_STORE_B128)
-scratch_load_d16_u8 = functools.partial(VSCRATCH, VSCRATCHOp.SCRATCH_LOAD_D16_U8)
-scratch_load_d16_i8 = functools.partial(VSCRATCH, VSCRATCHOp.SCRATCH_LOAD_D16_I8)
-scratch_load_d16_b16 = functools.partial(VSCRATCH, VSCRATCHOp.SCRATCH_LOAD_D16_B16)
-scratch_load_d16_hi_u8 = functools.partial(VSCRATCH, VSCRATCHOp.SCRATCH_LOAD_D16_HI_U8)
-scratch_load_d16_hi_i8 = functools.partial(VSCRATCH, VSCRATCHOp.SCRATCH_LOAD_D16_HI_I8)
-scratch_load_d16_hi_b16 = functools.partial(VSCRATCH, VSCRATCHOp.SCRATCH_LOAD_D16_HI_B16)
-scratch_store_d16_hi_b8 = functools.partial(VSCRATCH, VSCRATCHOp.SCRATCH_STORE_D16_HI_B8)
-scratch_store_d16_hi_b16 = functools.partial(VSCRATCH, VSCRATCHOp.SCRATCH_STORE_D16_HI_B16)
-scratch_load_block = functools.partial(VSCRATCH, VSCRATCHOp.SCRATCH_LOAD_BLOCK)
-scratch_store_block = functools.partial(VSCRATCH, VSCRATCHOp.SCRATCH_STORE_BLOCK)
-
-VCC_LO = SrcEnum.VCC_LO
-VCC_HI = SrcEnum.VCC_HI
-NULL = SrcEnum.NULL
-M0 = SrcEnum.M0
-EXEC_LO = SrcEnum.EXEC_LO
-EXEC_HI = SrcEnum.EXEC_HI
-ZERO = SrcEnum.ZERO
-DPP8FI = SrcEnum.DPP8FI
-SHARED_BASE = SrcEnum.SHARED_BASE
-SHARED_LIMIT = SrcEnum.SHARED_LIMIT
-PRIVATE_BASE = SrcEnum.PRIVATE_BASE
-PRIVATE_LIMIT = SrcEnum.PRIVATE_LIMIT
-POS_HALF = SrcEnum.POS_HALF
-NEG_HALF = SrcEnum.NEG_HALF
-POS_ONE = SrcEnum.POS_ONE
-NEG_ONE = SrcEnum.NEG_ONE
-POS_TWO = SrcEnum.POS_TWO
-NEG_TWO = SrcEnum.NEG_TWO
-POS_FOUR = SrcEnum.POS_FOUR
-NEG_FOUR = SrcEnum.NEG_FOUR
-INV_2PI = SrcEnum.INV_2PI
-VCCZ = SrcEnum.VCCZ
-EXECZ = SrcEnum.EXECZ
-SCC = SrcEnum.SCC
-LDS_DIRECT = SrcEnum.LDS_DIRECT
-OFF = NULL
+image_gather4h = functools.partial(VSAMPLE, VSAMPLEOp.IMAGE_GATHER4H)
--- a/extra/assembly/amd/autogen/rdna4/str_pcode.py
+++ b/extra/assembly/amd/autogen/rdna4/str_pcode.py
--- a/extra/assembly/amd/dsl.py
+++ b/extra/assembly/amd/dsl.py
@@ -9,6 +9,18 @@ from extra.assembly.amd.autogen.rdna3.enum import (VOP1Op, VOP2Op, VOP3Op, VOP3S
  SOPCOp, SOPKOp, SOPPOp, SMEMOp, DSOp, FLATOp, MUBUFOp, MTBUFOp, MIMGOp, VINTERPOp)
 from extra.assembly.amd.autogen.cdna.enum import VOP1Op as CDNA_VOP1Op, VOP2Op as CDNA_VOP2Op

+# Source operand encoding - constant across all AMD ISAs. Every member except DPP8 and DPP16 is also bound to a module-level name below.
+class SrcEnum(IntEnum):
+  VCC_LO=106; VCC_HI=107; NULL=124; M0=125; EXEC_LO=126; EXEC_HI=127; ZERO=128
+  DPP8=233; DPP8FI=234; SHARED_BASE=235; SHARED_LIMIT=236; PRIVATE_BASE=237; PRIVATE_LIMIT=238
+  POS_HALF=240; NEG_HALF=241; POS_ONE=242; NEG_ONE=243; POS_TWO=244; NEG_TWO=245
+  POS_FOUR=246; NEG_FOUR=247; INV_2PI=248; DPP16=250; VCCZ=251; EXECZ=252; SCC=253; LDS_DIRECT=254
+VCC_LO, VCC_HI, NULL, M0, EXEC_LO, EXEC_HI, ZERO = SrcEnum.VCC_LO, SrcEnum.VCC_HI, SrcEnum.NULL, SrcEnum.M0, SrcEnum.EXEC_LO, SrcEnum.EXEC_HI, SrcEnum.ZERO
+DPP8FI, SHARED_BASE, SHARED_LIMIT, PRIVATE_BASE, PRIVATE_LIMIT = SrcEnum.DPP8FI, SrcEnum.SHARED_BASE, SrcEnum.SHARED_LIMIT, SrcEnum.PRIVATE_BASE, SrcEnum.PRIVATE_LIMIT
+POS_HALF, NEG_HALF, POS_ONE, NEG_ONE, POS_TWO, NEG_TWO = SrcEnum.POS_HALF, SrcEnum.NEG_HALF, SrcEnum.POS_ONE, SrcEnum.NEG_ONE, SrcEnum.POS_TWO, SrcEnum.NEG_TWO
+POS_FOUR, NEG_FOUR, INV_2PI, VCCZ, EXECZ, SCC, LDS_DIRECT = SrcEnum.POS_FOUR, SrcEnum.NEG_FOUR, SrcEnum.INV_2PI, SrcEnum.VCCZ, SrcEnum.EXECZ, SrcEnum.SCC, SrcEnum.LDS_DIRECT
+OFF = NULL  # second name for SrcEnum.NULL (124)
+
 # Common masks and bit conversion functions
 MASK32, MASK64, MASK128 = 0xffffffff, 0xffffffffffffffff, (1 << 128) - 1
 _struct_f, _struct_I = struct.Struct("<f"), struct.Struct("<I")
@@ -288,7 +300,16 @@ class Inst:

  def __init_subclass__(cls, **kwargs):
    super().__init_subclass__(**kwargs)
-    cls._fields = {n: v[0] if isinstance(v, tuple) else v for n, v in cls.__dict__.items() if isinstance(v, BitField) or (isinstance(v, tuple) and len(v) == 2 and isinstance(v[0], BitField))}
+    # Merge fields from parent classes; the MRO is walked base-first so a nearer ancestor overrides a farther one
+    cls._fields = {}
+    for base in reversed(cls.__mro__):
+      if base is Inst or not hasattr(base, '_fields'): continue
+      cls._fields.update(base._fields)
+    # Add this class's own fields (overrides parents); a 2-tuple whose first item is a BitField contributes that BitField
+    cls._fields.update({n: v[0] if isinstance(v, tuple) else v for n, v in cls.__dict__.items() if isinstance(v, BitField) or (isinstance(v, tuple) and len(v) == 2 and isinstance(v[0], BitField))})
+    # Compute size in bytes from max bit (exclude optional fields starting at bit 64+, e.g. MIMG NSA)
+    max_bit = max((bf.hi for bf in cls._fields.values() if bf.lo < 64), default=0)  # default=0 covers a class with no fields
+    cls._sz = 12 if max_bit > 63 else 8 if max_bit > 31 else 4
    if 'encoding' in cls._fields and isinstance(cls.__dict__.get('encoding'), tuple): cls._encoding = cls.__dict__['encoding']

  def _or_field(self, name: str, bit: int):
@@ -352,6 +373,16 @@ class Inst:
    field_names = [n for n in self._fields if n != 'encoding']
    # Map Python-friendly names to actual field names (abs_ -> abs for Python reserved word)
    if 'abs_' in kwargs: kwargs['abs'] = kwargs.pop('abs_')
+    # If more args than fields, treat extra arg as literal (for FMAAK/FMAMK style instructions)
+    # FMAMK has K in middle (vdst, src0, K, vsrc1), FMAAK has K at end (vdst, src0, vsrc1, K)
+    args = list(args)
+    if len(args) > len(field_names) and literal is None:
+      for i, a in enumerate(args):
+        if isinstance(a, int) and not isinstance(a, SrcEnum) and i < len(field_names) and field_names[i] in ('vsrc1',):
+          literal = args.pop(i)
+          break
+      else:
+        literal = args.pop()  # fallback: last arg is literal
    orig_args = dict(zip(field_names, args)) | kwargs
    self._values.update(orig_args)
    self._precompute()
@@ -450,7 +481,7 @@ class Inst:
    return result + (lit32 & MASK32).to_bytes(4, 'little')

  @classmethod
-  def _size(cls) -> int: return 4 if issubclass(cls, Inst32) else 12 if issubclass(cls, Inst96) else 8
+  def _size(cls) -> int: return cls._sz  # base encoding size in bytes (literal excluded), precomputed per class as cls._sz
  def size(self) -> int:
    # Literal is always 4 bytes in the binary (for 64-bit ops, it's in high 32 bits)
    return self._size() + (4 if self._literal is not None else 0)
@@ -583,6 +614,4 @@ class Inst:
  def is_64bit(self) -> bool: return spec_is_64bit(self.op_name)
  def is_dst_16(self) -> bool: return self._spec_regs[0] == 1 and is_dtype_16(self._spec_dtype[0])

-class Inst32(Inst): pass
-class Inst64(Inst): pass
-class Inst96(Inst): pass
+
--- a/extra/assembly/amd/emu.py
+++ b/extra/assembly/amd/emu.py
@@ -7,8 +7,9 @@ from extra.assembly.amd.dsl import Inst, unwrap, FLOAT_ENC, MASK32, MASK64, _f32
 from extra.assembly.amd.asm import detect_format
 from extra.assembly.amd.pcode import compile_pseudocode
 from extra.assembly.amd.autogen.rdna3.str_pcode import PSEUDOCODE_STRINGS
+from extra.assembly.amd.dsl import SrcEnum
 from extra.assembly.amd.autogen.rdna3.ins import (SOP1, SOP2, SOPC, SOPK, SOPP, SMEM, VOP1, VOP2, VOP3, VOP3SD, VOP3P, VOPC, DS, FLAT, VOPD,
-  SrcEnum, SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, SMEMOp, VOP1Op, VOP2Op, VOP3Op, VOP3SDOp, VOP3POp, VOPCOp, DSOp, FLATOp, GLOBALOp, SCRATCHOp, VOPDOp)
+  SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, SMEMOp, VOP1Op, VOP2Op, VOP3Op, VOP3SDOp, VOP3POp, VOPCOp, DSOp, FLATOp, GLOBALOp, SCRATCHOp, VOPDOp)

 WAVE_SIZE, SGPR_COUNT, VGPR_COUNT = 32, 128, 256
 VCC_LO, VCC_HI, NULL, EXEC_LO, EXEC_HI, SCC = SrcEnum.VCC_LO, SrcEnum.VCC_HI, SrcEnum.NULL, SrcEnum.EXEC_LO, SrcEnum.EXEC_HI, SrcEnum.SCC
--- a/extra/assembly/amd/pcode.py
+++ b/extra/assembly/amd/pcode.py
@@ -447,6 +447,27 @@ TWO_OVER_PI_1201 = Reg(0x0145f306dc9c882a53f84eafa3ea69bb81b6c52b3278872083fca2c
 # COMPILER: pseudocode -> Python (minimal transforms)
 # ═══════════════════════════════════════════════════════════════════════════════

+def _filter_pseudocode(pseudocode: str) -> str:
+  """Filter raw PDF pseudocode to only include actual code lines (blank, example, lambda-definition and prose lines are dropped)."""
+  pcode_lines, in_lambda, depth = [], 0, 0  # kept lines, open lambda definitions, open 'if' blocks
+  for line in pseudocode.split('\n'):
+    s = line.strip()
+    if not s: continue
+    if '=>' in s or re.match(r'^[A-Z_]+\(', s): continue  # Skip example lines
+    if '= lambda(' in s: in_lambda += 0 if s.endswith(');') else 1; continue  # Skip lambda definitions (a one-line lambda is already closed)
+    if in_lambda > 0:
+      if s.endswith(');'): in_lambda -= 1  # ');' ends the innermost open lambda body
+      continue
+    # Only include lines that look like pseudocode (depth is read here before this line updates it below)
+    is_code = (any(p in s for p in ['D0.', 'D1.', 'S0.', 'S1.', 'S2.', 'SCC =', 'SCC ?', 'VCC', 'EXEC', 'tmp =', 'tmp[', 'lane =', 'PC =',
+                                    'D0[', 'D1[', 'S0[', 'S1[', 'S2[', 'MEM[', 'RETURN_DATA', 'VADDR', 'VDATA', 'VDST', 'SADDR', 'OFFSET']) or
+               s.startswith(('if ', 'else', 'elsif', 'endif', 'declare ', 'for ', 'endfor', '//')) or
+               re.match(r'^[a-z_]+\s*=', s) or re.match(r'^[a-z_]+\[', s) or (depth > 0 and '=' in s))
+    if s.startswith('if '): depth += 1
+    elif s.startswith('endif'): depth = max(0, depth - 1)
+    if is_code: pcode_lines.append(s)
+  return '\n'.join(pcode_lines)
+
 def _compile_pseudocode(pseudocode: str) -> str:
  """Compile pseudocode to Python. Transforms are minimal - most syntax just works."""
  pseudocode = re.sub(r'\bpass\b', 'pass_', pseudocode)  # 'pass' is Python keyword
@@ -756,9 +777,10 @@ _PCODE_GLOBALS = {
@functools.cache
 def compile_pseudocode(cls_name: str, op_name: str, pseudocode: str):
  """Compile pseudocode string to executable function. Cached for performance."""
-  code = _compile_pseudocode(pseudocode)
+  filtered = _filter_pseudocode(pseudocode)
+  code = _compile_pseudocode(filtered)
  code = _apply_pseudocode_fixes(op_name, code)
-  fn_code = _generate_function(cls_name, op_name, pseudocode, code)
+  fn_code = _generate_function(cls_name, op_name, filtered, code)
  fn_name = f"_{cls_name}_{op_name}"
  local_ns = {}
  exec(fn_code, _PCODE_GLOBALS, local_ns)
--- a/extra/assembly/amd/pdf.py
+++ b/extra/assembly/amd/pdf.py
@@ -1,457 +1,305 @@
-# Generate AMD ISA autogen files from PDF documentation
-# Combines format/enum generation (previously in dsl.py) and pseudocode compilation (previously in pcode.py)
-# Usage: python -m extra.assembly.amd.pdf [--arch rdna3|rdna4|cdna|all]
-import re, functools
-from pathlib import Path
-from concurrent.futures import ProcessPoolExecutor
+# Generic PDF text extractor plus AMD ISA autogen - no external PDF library (only re, zlib and tinygrad.helpers)
+import re, zlib
+from tinygrad.helpers import fetch, merge_dicts

 PDF_URLS = {
  "rdna3": "https://docs.amd.com/api/khub/documents/UVVZM22UN7tMUeiW_4ShTQ/content",
  "rdna4": "https://docs.amd.com/api/khub/documents/uQpkEvk3pv~kfAb2x~j4uw/content",
-  "cdna": ["https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/instruction-set-architectures/amd-instinct-mi300-cdna3-instruction-set-architecture.pdf",
-           "https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/instruction-set-architectures/amd-instinct-cdna4-instruction-set-architecture.pdf"],
+  "cdna": "https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/instruction-set-architectures/amd-instinct-cdna4-instruction-set-architecture.pdf",
 }

-# Field type mappings and ordering
-FIELD_TYPES = {'SSRC0': 'SSrc', 'SSRC1': 'SSrc', 'SOFFSET': 'SSrc', 'SADDR': 'SSrc', 'SRC0': 'Src', 'SRC1': 'Src', 'SRC2': 'Src',
-  'SDST': 'SGPRField', 'SBASE': 'SGPRField', 'SDATA': 'SGPRField', 'SRSRC': 'SGPRField', 'VDST': 'VGPRField', 'VSRC1': 'VGPRField',
-  'VDATA': 'VGPRField', 'VADDR': 'VGPRField', 'ADDR': 'VGPRField', 'DATA': 'VGPRField', 'DATA0': 'VGPRField', 'DATA1': 'VGPRField',
-  'SIMM16': 'SImm', 'OFFSET': 'Imm', 'OPX': 'VOPDOp', 'OPY': 'VOPDOp', 'SRCX0': 'Src', 'SRCY0': 'Src',
-  'VSRCX1': 'VGPRField', 'VSRCY1': 'VGPRField', 'VDSTX': 'VGPRField', 'VDSTY': 'VDSTYEnc'}
-FIELD_ORDER = {
-  'SOP2': ['op', 'sdst', 'ssrc0', 'ssrc1'], 'SOP1': ['op', 'sdst', 'ssrc0'], 'SOPC': ['op', 'ssrc0', 'ssrc1'],
-  'SOPK': ['op', 'sdst', 'simm16'], 'SOPP': ['op', 'simm16'], 'VOP1': ['op', 'vdst', 'src0'], 'VOPC': ['op', 'src0', 'vsrc1'],
-  'VOP2': ['op', 'vdst', 'src0', 'vsrc1'], 'VOP3SD': ['op', 'vdst', 'sdst', 'src0', 'src1', 'src2', 'clmp'],
-  'SMEM': ['op', 'sdata', 'sbase', 'soffset', 'offset', 'glc', 'dlc'], 'DS': ['op', 'vdst', 'addr', 'data0', 'data1'],
-  'VOP3': ['op', 'vdst', 'src0', 'src1', 'src2', 'omod', 'neg', 'abs', 'clmp', 'opsel'],
-  'VOP3P': ['op', 'vdst', 'src0', 'src1', 'src2', 'neg', 'neg_hi', 'opsel', 'opsel_hi', 'clmp'],
-  'FLAT': ['op', 'vdst', 'addr', 'data', 'saddr', 'offset', 'seg', 'dlc', 'glc', 'slc'],
-  'MUBUF': ['op', 'vdata', 'vaddr', 'srsrc', 'soffset', 'offset', 'offen', 'idxen', 'glc', 'dlc', 'slc', 'tfe'],
-  'MTBUF': ['op', 'vdata', 'vaddr', 'srsrc', 'soffset', 'offset', 'format', 'offen', 'idxen', 'glc', 'dlc', 'slc', 'tfe'],
-  'MIMG': ['op', 'vdata', 'vaddr', 'srsrc', 'ssamp', 'dmask', 'dim', 'unrm', 'dlc', 'glc', 'slc'],
-  'EXP': ['en', 'target', 'vsrc0', 'vsrc1', 'vsrc2', 'vsrc3', 'done', 'row'],
-  'VINTERP': ['op', 'vdst', 'src0', 'src1', 'src2', 'waitexp', 'clmp', 'opsel', 'neg'],
-  'VOPD': ['opx', 'opy', 'vdstx', 'vdsty', 'srcx0', 'vsrcx1', 'srcy0', 'vsrcy1'],
-  'LDSDIR': ['op', 'vdst', 'attr', 'attr_chan', 'wait_va']}
-SRC_EXTRAS = {233: 'DPP8', 234: 'DPP8FI', 250: 'DPP16', 251: 'VCCZ', 252: 'EXECZ', 254: 'LDS_DIRECT'}
-FLOAT_MAP = {'0.5': 'POS_HALF', '-0.5': 'NEG_HALF', '1.0': 'POS_ONE', '-1.0': 'NEG_ONE', '2.0': 'POS_TWO', '-2.0': 'NEG_TWO',
-  '4.0': 'POS_FOUR', '-4.0': 'NEG_FOUR', '1/(2*PI)': 'INV_2PI', '0': 'ZERO'}
-INST_PATTERN = re.compile(r'^([SVD]S?_[A-Z0-9_]+|(?:FLAT|GLOBAL|SCRATCH)_[A-Z0-9_]+)\s+(\d+)\s*$', re.M)
-
-
-
 # ═══════════════════════════════════════════════════════════════════════════════
-# PDF PARSING WITH PAGE CACHING
+# Generic PDF extraction tools
 # ═══════════════════════════════════════════════════════════════════════════════

-class CachedPDF:
-  """PDF wrapper with page text/table caching for faster repeated access."""
-  def __init__(self, pdf):
-    self._pdf, self._text_cache, self._table_cache = pdf, {}, {}
-  def __len__(self): return len(self._pdf.pages)
-  def text(self, i):
-    if i not in self._text_cache: self._text_cache[i] = self._pdf.pages[i].extract_text() or ''
-    return self._text_cache[i]
-  def tables(self, i):
-    if i not in self._table_cache: self._table_cache[i] = [t.extract() for t in self._pdf.pages[i].find_tables()]
-    return self._table_cache[i]
+def extract(url: str) -> list[list[tuple[float, float, str, str]]]:
+  """Extract positioned text from the PDF at url. Returns, per page, a list of (x, y, text, font) elements sorted by descending y then ascending x."""
+  data = fetch(url).read_bytes()

-def _parse_bits(s: str) -> tuple[int, int] | None:
-  return (int(m.group(1)), int(m.group(2) or m.group(1))) if (m := re.match(r'\[(\d+)(?::(\d+))?\]', s)) else None
+  # Parse xref table to locate objects: maps object number -> byte offset in data
+  xref: dict[int, int] = {}
+  pos = int(re.search(rb'startxref\s+(\d+)', data).group(1)) + 4  # first 'startxref' match in the file; + 4 presumably skips the 'xref' keyword
+  while data[pos:pos+7] != b'trailer':  # each iteration consumes one "start count" subsection header plus its entries
+    while data[pos:pos+1] in b' \r\n': pos += 1
+    line_end = data.find(b'\n', pos)
+    start_obj, count = map(int, data[pos:line_end].split()[:2])
+    pos = line_end + 1
+    for i in range(count):
+      if data[pos+17:pos+18] == b'n' and (off := int(data[pos:pos+10])) > 0: xref[start_obj + i] = off  # keep only entries flagged 'n' with a nonzero offset
+      pos += 20  # entries are read as fixed-width 20-byte records

-def _parse_fields_table(table: list, fmt: str, enums: set[str]) -> list[tuple]:
-  fields = []
-  for row in table[1:]:
-    if not row or not row[0]: continue
-    name, bits_str = row[0].split('\n')[0].strip(), (row[1] or '').split('\n')[0].strip()
-    if not (bits := _parse_bits(bits_str)): continue
-    enc_val, hi, lo = None, bits[0], bits[1]
-    if name == 'ENCODING' and row[2]:
-      desc = row[2]
-      # Handle shared FLAT/GLOBAL/SCRATCH table: look for format-specific encoding
-      fmt_key = fmt.lstrip('V').lower().capitalize()  # VFLAT -> Flat, VGLOBAL -> Global
-      if m := re.search(rf"{fmt_key}='b([01_]+)", desc):
-        enc_bits = m.group(1).replace('_', '')
-      elif m := re.search(r"(?:'b|Must be:\s*)([01_]+)", desc):
-        enc_bits = m.group(1).replace('_', '')
-      else:
-        enc_bits = None
-      if enc_bits:
-        enc_val, declared_width, actual_width = int(enc_bits, 2), hi - lo + 1, len(enc_bits)
-        if actual_width > declared_width: lo = hi - actual_width + 1
-    ftype = f"{fmt}Op" if name == 'OP' and f"{fmt}Op" in enums else FIELD_TYPES.get(name.upper())
-    fields.append((name, hi, lo, enc_val, ftype))
-  return fields
+  def get_stream(n: int) -> bytes:  # payload of object n's stream, zlib-decompressed when the object contains '/FlateDecode'
+    obj = data[xref[n]:data.find(b'endobj', xref[n])]
+    raw = obj[obj.find(b'stream\n') + 7:obj.find(b'\nendstream')]
+    return zlib.decompress(raw) if b'/FlateDecode' in obj else raw

-def _parse_single_pdf(url: str):
-  """Parse a single PDF and return (formats, enums, src_enum, doc_name, instructions)."""
-  import pdfplumber
-  from tinygrad.helpers import fetch
+  # Find page content streams and extract text
+  pages = []
+  for n in sorted(xref):
+    if b'/Type /Page' not in data[xref[n]:xref[n]+500]: continue  # only the first 500 bytes of each object are inspected
+    if not (m := re.search(rb'/Contents (\d+) 0 R', data[xref[n]:xref[n]+500])): continue  # objects without a '/Contents N 0 R' reference in that window are skipped
+    stream = get_stream(int(m.group(1))).decode('latin-1')
+    elements, font = [], ''
+    for bt in re.finditer(r'BT(.*?)ET', stream, re.S):  # one BT...ET span at a time; x, y restart at 0 per span, font carries over
+      x, y = 0.0, 0.0
+      for m in re.finditer(r'(/F[\d.]+) [\d.]+ Tf|([\d.+-]+) ([\d.+-]+) Td|[\d.+-]+ [\d.+-]+ [\d.+-]+ [\d.+-]+ ([\d.+-]+) ([\d.+-]+) Tm|<([0-9A-Fa-f]+)>.*?Tj|\[([^\]]+)\] TJ', bt.group(1)):
+        if m.group(1): font = m.group(1)  # Tf: remember the font name
+        elif m.group(2): x, y = x + float(m.group(2)), y + float(m.group(3))  # Td: move relative to the current position
+        elif m.group(4): x, y = float(m.group(4)), float(m.group(5))  # Tm: take the last two operands as the absolute position
+        elif m.group(6) and (t := bytes.fromhex(m.group(6)).decode('latin-1')).strip(): elements.append((x, y, t, font))  # Tj: one hex string; whitespace-only text is dropped
+        elif m.group(7) and (t := ''.join(bytes.fromhex(h).decode('latin-1') for h in re.findall(r'<([0-9A-Fa-f]+)>', m.group(7)))).strip(): elements.append((x, y, t, font))  # TJ: hex strings in the array are concatenated, other array content is ignored
+    pages.append(sorted(elements, key=lambda e: (-e[1], e[0])))
+  return pages

-  pdf = CachedPDF(pdfplumber.open(fetch(url)))
-  total_pages = len(pdf)
+def extract_tables(pages: list[list[tuple[float, float, str, str]]]) -> dict[int, tuple[str, list[list[str]]]]:
+  """Extract numbered tables from PDF pages. Returns {table_num: (title, rows)} where rows is list of cells per row, each row's cells ordered by x."""
+  def group_by_y(texts, key=lambda y: round(y)):  # bucket (x, y, text) by key(y); the font element is dropped
+    by_y: dict[int, list[tuple[float, float, str]]] = {}
+    for x, y, t, _ in texts:
+      by_y.setdefault(key(y), []).append((x, y, t))
+    return by_y

-  # Auto-detect document type
-  first_page = pdf.text(0)
-  is_cdna4, is_cdna3 = 'CDNA4' in first_page or 'CDNA 4' in first_page, 'CDNA3' in first_page or 'MI300' in first_page
-  is_cdna, is_rdna4 = is_cdna3 or is_cdna4, 'RDNA4' in first_page or 'RDNA 4' in first_page
-  is_rdna35, is_rdna3 = 'RDNA3.5' in first_page or 'RDNA 3.5' in first_page, 'RDNA3' in first_page and 'RDNA3.5' not in first_page
-  doc_name = "CDNA4" if is_cdna4 else "CDNA3" if is_cdna3 else "RDNA4" if is_rdna4 else "RDNA3.5" if is_rdna35 else "RDNA3" if is_rdna3 else "Unknown"
+  # Find all table headers by merging text on same line
+  table_positions = []  # (table number, title, page index, y of the first element on the header line)
+  for page_idx, texts in enumerate(pages):
+    for items in group_by_y(texts).values():
+      line = ''.join(t for _, t in sorted((x, t) for x, _, t in items))
+      if m := re.search(r'Table (\d+)\. (.+)', line):
+        table_positions.append((int(m.group(1)), m.group(2).strip(), page_idx, items[0][1]))
+  table_positions.sort(key=lambda t: (t[2], -t[3]))  # by page, then descending y within the page

-  # Find Microcode Formats section (for formats/enums)
-  microcode_start = next((i for i in range(int(total_pages * 0.2), total_pages)
-                          if re.search(r'\d+\.\d+\.\d+\.\s+SOP2\b|Chapter \d+\.\s+Microcode Formats', pdf.text(i))), int(total_pages * 0.9))
-  # Find Instructions section (for pseudocode)
-  instr_start = next((i for i in range(int(total_pages * 0.1), int(total_pages * 0.5))
-                      if re.search(r'Chapter \d+\.\s+Instructions\b', pdf.text(i))), total_pages // 3)
-  instr_end = next((i for start in [int(total_pages * 0.6), int(total_pages * 0.5), instr_start]
-                    for i in range(start, min(start + 100, total_pages))
-                    if re.search(r'Chapter \d+\.\s+Microcode Formats', pdf.text(i))), total_pages)
-
-  # Parse src enum from SSRC encoding table
-  src_enum = dict(SRC_EXTRAS)
-  for i in range(microcode_start, min(microcode_start + 10, total_pages)):
-    text = pdf.text(i)
-    if 'SSRC0' in text and 'VCC_LO' in text:
-      for m in re.finditer(r'^(\d+)\s+(\S+)', text, re.M):
-        val, name = int(m.group(1)), m.group(2).rstrip('.:')
-        if name in FLOAT_MAP: src_enum[val] = FLOAT_MAP[name]
-        elif re.match(r'^[A-Z][A-Z0-9_]*$', name): src_enum[val] = name
+  # For each table, find rows with matching X positions
+  result: dict[int, tuple[str, list[list[str]]]] = {}
+  for num, title, start_page, header_y in table_positions:
+    rows, col_xs = [], None  # col_xs: rounded x positions of the first multi-element row, used as the column reference
+    for page_idx in range(start_page, len(pages)):
+      page_texts = [(x, y, t) for x, y, t, _ in pages[page_idx] if 30 < y < 760 and (page_idx > start_page or y < header_y)]  # keep 30 < y < 760; on the start page additionally require y < header_y
+      for items in sorted(group_by_y([(x, y, t, '') for x, y, t in page_texts], key=lambda y: round(y / 5)).values(), key=lambda items: -items[0][1]):  # rows are round(y / 5) buckets, visited by descending y of each bucket's first element
+        xs = tuple(sorted(round(x) for x, _, _ in items))
+        if col_xs is None:
+          if len(xs) < 2: continue  # Skip single-column rows before table starts
+          col_xs = xs
+        elif len(xs) == 1 and xs[0] in col_xs: continue  # Skip continuation rows at known column positions
+        elif not any(c in xs for c in col_xs[:2]): break  # Row missing first columns = end of table
+        rows.append([t for _, t in sorted((x, t) for x, _, t in items)])
+      else: continue  # page exhausted without hitting the end-of-table break: keep scanning the next page
      break
+    if rows: result[num] = (title, rows)
+  return result

-  # Parse opcode tables
-  full_text = '\n'.join(pdf.text(i) for i in range(microcode_start, min(microcode_start + 50, total_pages)))
+# ═══════════════════════════════════════════════════════════════════════════════
+# AMD specific extraction
+# ═══════════════════════════════════════════════════════════════════════════════
+
+def extract_enums(tables: dict[int, tuple[str, list[list[str]]]]) -> dict[str, dict[int, str]]:
+  """Extract all enums from tables. Returns {enum_name: {value: name}}; enum_name is the format from an 'XXX Opcodes' title ('VOPD' for Y-Opcodes) or 'BufFmt'."""
  enums: dict[str, dict[int, str]] = {}
-  for m in re.finditer(r'Table \d+\. (\w+) Opcodes(.*?)(?=Table \d+\.|\n\d+\.\d+\.\d+\.\s+\w+\s*\nDescription|$)', full_text, re.S):
-    if ops := {int(x.group(1)): x.group(2) for x in re.finditer(r'(\d+)\s+([A-Z][A-Z0-9_]+)', m.group(2))}:
-      enums[m.group(1) + "Op"] = ops
-  if vopd_m := re.search(r'Table \d+\. VOPD Y-Opcodes\n(.*?)(?=Table \d+\.|15\.\d)', full_text, re.S):
-    if ops := {int(x.group(1)): x.group(2) for x in re.finditer(r'(\d+)\s+(V_DUAL_\w+)', vopd_m.group(1))}:
-      enums["VOPDOp"] = ops
-  enum_names = set(enums.keys())
+  for num, (title, rows) in tables.items():
+    # Opcode enums from "XXX Opcodes" tables
+    if m := re.match(r'(\w+) (?:Y-)?Opcodes', title):
+      fmt_name = 'VOPD' if 'Y-Opcodes' in title else m.group(1)
+      ops: dict[int, str] = {}
+      for row in rows:
+        for i in range(0, len(row) - 1, 2):  # cells are read as consecutive (value, name) pairs
+          if row[i].isdigit() and re.match(r'^[A-Z][A-Z0-9_]+$', row[i + 1]):
+            ops[int(row[i])] = row[i + 1]
+      if ops: enums[fmt_name] = ops  # a later table yielding the same fmt_name replaces the earlier entry
+    # BufFmt from "Data Format" tables
+    if 'Data Format' in title:
+      for row in rows:
+        for i in range(0, len(row) - 1, 2):
+          if row[i].isdigit() and re.match(r'^[\dA-Z_]+$', row[i + 1]) and 'INVALID' not in row[i + 1]:  # names containing INVALID are skipped
+            enums.setdefault('BufFmt', {})[int(row[i])] = row[i + 1]  # entries from every matching table accumulate into one BufFmt dict
+  return enums

-  # Parse instruction formats
-  def is_fields_table(t): return t and len(t) > 1 and t[0] and 'Field' in str(t[0][0] or '')
-  def has_encoding(fields): return any(f[0] == 'ENCODING' for f in fields)
-  def has_header_before_fields(text): return (pos := text.find('Field Name')) != -1 and bool(re.search(r'\d+\.\d+\.\d+\.\s+\w+\s*\n', text[:pos]))
+def extract_ins(tables: dict[int, tuple[str, list[list[str]]]]) -> tuple[dict[str, list[tuple[str, int, int]]], dict[str, str]]:
+  """Extract formats and encodings from 'XXX Fields' tables. Returns (formats, encodings): formats maps format name to (lowercased field, hi, lo) tuples, encodings maps format name to its encoding bit string."""
+  formats: dict[str, list[tuple[str, int, int]]] = {}
+  encodings: dict[str, str] = {}
+  for num, (title, rows) in tables.items():
+    if not (m := re.match(r'(\w+) Fields$', title)): continue
+    fmt_name = m.group(1)
+    fields = []
+    for row in rows:
+      if len(row) < 2: continue
+      if (bits := re.match(r'\[?(\d+):(\d+)\]?$', row[1])) or (bits := re.match(r'\[(\d+)\]$', row[1])):  # "hi:lo" range (brackets optional) or single-bit "[n]"; other rows are ignored
+        field_name = row[0].lower()
+        hi, lo = int(bits.group(1)), int(bits.group(2)) if bits.lastindex >= 2 else int(bits.group(1))  # single-bit fields get hi == lo
+        if field_name == 'encoding' and len(row) >= 3:
+          enc_bits = None
+          if "'b" in row[2]: enc_bits = row[2].split("'b")[-1].replace('_', '')  # everything after the last 'b marker, underscores removed
+          elif (enc := re.search(r':\s*([01_]+)', row[2])): enc_bits = enc.group(1).replace('_', '')
+          if enc_bits:
+            # If encoding bits exceed field width, extend field to match (AMD docs sometimes have this)
+            declared_width, actual_width = hi - lo + 1, len(enc_bits)
+            if actual_width > declared_width: lo = hi - actual_width + 1
+            encodings[fmt_name] = enc_bits
+        fields.append((field_name, hi, lo))
+    if fields: formats[fmt_name] = fields
+  return formats, encodings

-  format_headers = []
-  for i in range(50):
-    if microcode_start + i >= total_pages: break
-    text = pdf.text(microcode_start + i)
-    for m in re.finditer(r'\d+\.\d+\.\d+\.\s+(\w+)\s*\n?Description', text): format_headers.append((m.group(1), i, m.start()))
-    for m in re.finditer(r'\d+\.\d+\.\d+\.\s+(\w+)\s*\n', text):
-      fmt_name = m.group(1)
-      if is_cdna and fmt_name.isupper() and len(fmt_name) >= 2: format_headers.append((fmt_name, i, m.start()))
-      elif m.start() > len(text) - 200 and 'Description' not in text[m.end():] and i + 1 < 50:
-        next_text = pdf.text(microcode_start + i + 1).lstrip()
-        if next_text.startswith('Description') or (next_text.startswith('"RDNA') and 'Description' in next_text[:200]):
-          format_headers.append((fmt_name, i, m.start()))
-    # RDNA4: Look for "Table X. Y Fields" patterns (e.g., VIMAGE, VSAMPLE, or shared FLAT/GLOBAL/SCRATCH)
-    for m in re.finditer(r'Table \d+\.\s+([\w,\s]+?)\s+Fields', text):
-      table_name = m.group(1).strip()
-      # Handle shared table like "FLAT, GLOBAL and SCRATCH"
-      if ',' in table_name or ' and ' in table_name:
-        for part in re.split(r',\s*|\s+and\s+', table_name):
-          fmt_name = 'V' + part.strip()
-          if fmt_name not in [h[0] for h in format_headers]: format_headers.append((fmt_name, i, m.start()))
-      elif table_name.startswith('V'):
-        if table_name not in [h[0] for h in format_headers]: format_headers.append((table_name, i, m.start()))
+def extract_pcode(pages: list[list[tuple[float, float, str, str]]], enums: dict[str, dict[int, str]]) -> dict[tuple[str, int], str]:
+  """Extract pseudocode for instructions. Returns {(name, opcode): pseudocode} with pseudocode lines joined by newlines."""
+  # Build lookup from instruction name to opcode (used below only to recognise instruction names)
+  name_to_op = {name: op for ops in enums.values() for op, name in ops.items()}

-  formats: dict[str, list] = {}
-  for fmt_name, rel_idx, header_pos in format_headers:
-    if fmt_name in formats: continue
-    page_idx = microcode_start + rel_idx
-    text = pdf.text(page_idx)
-    field_pos = text.find('Field Name', header_pos)
-    fields = None
-    for offset in range(3):
-      if page_idx + offset >= total_pages: break
-      if offset > 0 and has_header_before_fields(pdf.text(page_idx + offset)): break
-      for t in pdf.tables(page_idx + offset) if offset > 0 or field_pos > header_pos else []:
-        if is_fields_table(t) and (f := _parse_fields_table(t, fmt_name, enum_names)) and has_encoding(f): fields = f; break
-      if fields: break
-    if not fields and field_pos > header_pos:
-      for t in pdf.tables(page_idx):
-        if is_fields_table(t) and (f := _parse_fields_table(t, fmt_name, enum_names)): fields = f; break
-    if not fields: continue
-    field_names = {f[0] for f in fields}
-    for pg_offset in range(1, 3):
-      if page_idx + pg_offset >= total_pages or has_header_before_fields(pdf.text(page_idx + pg_offset)): break
-      for t in pdf.tables(page_idx + pg_offset):
-        if is_fields_table(t) and (extra := _parse_fields_table(t, fmt_name, enum_names)) and not has_encoding(extra):
-          for ef in extra:
-            if ef[0] not in field_names: fields.append(ef); field_names.add(ef[0])
-          break
-    formats[fmt_name] = fields
+  # First pass: find all instruction headers across all pages
+  all_instructions: list[tuple[int, float, str, int]] = []  # (page_idx, y, name, opcode)
+  for page_idx, page in enumerate(pages):
+    by_y: dict[int, list[tuple[float, str]]] = {}
+    for x, y, t, _ in page:
+      by_y.setdefault(round(y), []).append((x, t))
+    for y, items in sorted(by_y.items(), reverse=True):
+      left = [(x, t) for x, t in items if 55 < x < 65]  # elements in the left x band: candidate instruction name
+      right = [(x, t) for x, t in items if 535 < x < 550]  # elements in the right x band: candidate opcode number
+      if left and right and left[0][1] in name_to_op and right[0][1].isdigit():
+        all_instructions.append((page_idx, y, left[0][1], int(right[0][1])))  # opcode comes from the page text, not from name_to_op

-  # Fix known PDF errors (RDNA-specific SMEM bit positions)
-  if 'SMEM' in formats and not is_cdna:
-    formats['SMEM'] = [(n, 13 if n == 'DLC' else 14 if n == 'GLC' else h, 13 if n == 'DLC' else 14 if n == 'GLC' else l, e, t)
-                       for n, h, l, e, t in formats['SMEM']]
-  # RDNA4: VFLAT/VGLOBAL/VSCRATCH OP field is [20:14] not [20:13] (PDF documentation error)
-  for fmt_name in ['VFLAT', 'VGLOBAL', 'VSCRATCH']:
-    if fmt_name in formats:
-      formats[fmt_name] = [(n, h, 14 if n == 'OP' else l, e, t) for n, h, l, e, t in formats[fmt_name]]
-  if doc_name in ('RDNA3', 'RDNA3.5'):
-    if 'SOPPOp' in enums:
-      for k, v in {8: 'S_WAITCNT_DEPCTR', 58: 'S_TTRACEDATA', 59: 'S_TTRACEDATA_IMM'}.items():
-        assert k not in enums['SOPPOp']; enums['SOPPOp'][k] = v
-    if 'SOPKOp' in enums:
-      for k, v in {22: 'S_SUBVECTOR_LOOP_BEGIN', 23: 'S_SUBVECTOR_LOOP_END'}.items():
-        assert k not in enums['SOPKOp']; enums['SOPKOp'][k] = v
-    if 'SMEMOp' in enums:
-      for k, v in {34: 'S_ATC_PROBE', 35: 'S_ATC_PROBE_BUFFER'}.items():
-        assert k not in enums['SMEMOp']; enums['SMEMOp'][k] = v
-    if 'DSOp' in enums:
-      for k, v in {24: 'DS_GWS_SEMA_RELEASE_ALL', 25: 'DS_GWS_INIT', 26: 'DS_GWS_SEMA_V', 27: 'DS_GWS_SEMA_BR', 28: 'DS_GWS_SEMA_P', 29: 'DS_GWS_BARRIER'}.items():
-        assert k not in enums['DSOp']; enums['DSOp'][k] = v
-    if 'FLATOp' in enums:
-      for k, v in {40: 'GLOBAL_LOAD_ADDTID_B32', 41: 'GLOBAL_STORE_ADDTID_B32', 55: 'FLAT_ATOMIC_CSUB_U32'}.items():
-        assert k not in enums['FLATOp']; enums['FLATOp'][k] = v
-  # CDNA MTBUF: PDF is missing the FORMAT field (bits[25:19]) which is required for tbuffer_* instructions
-  if is_cdna and 'MTBUF' in formats:
-    field_names = {f[0] for f in formats['MTBUF']}
-    if 'FORMAT' not in field_names:
-      formats['MTBUF'].append(('FORMAT', 25, 19, None, None))
-  # CDNA SDWA/DPP: PDF only has modifier fields, need VOP1/VOP2 overlay for correct encoding
-  if is_cdna:
-    if 'SDWA' in formats:
-      formats['SDWA'] = [('ENCODING', 8, 0, 0xf9, None), ('VOP_OP', 16, 9, None, None), ('VDST', 24, 17, None, 'VGPRField'), ('VOP2_OP', 31, 25, None, None)] + \
-                        [f for f in formats['SDWA'] if f[0] not in ('ENCODING', 'SDST', 'SD', 'ROW_MASK')]
-    if 'DPP' in formats:
-      formats['DPP'] = [('ENCODING', 8, 0, 0xfa, None), ('VOP_OP', 16, 9, None, None), ('VDST', 24, 17, None, 'VGPRField'), ('VOP2_OP', 31, 25, None, None),
-        ('SRC0', 39, 32, None, 'Src'), ('DPP_CTRL', 48, 40, None, None), ('BOUND_CTRL', 51, 51, None, None), ('SRC0_NEG', 52, 52, None, None), ('SRC0_ABS', 53, 53, None, None),
-        ('SRC1_NEG', 54, 54, None, None), ('SRC1_ABS', 55, 55, None, None), ('BANK_MASK', 59, 56, None, None), ('ROW_MASK', 63, 60, None, None)]
-
-  # Extract pseudocode for instructions
-  all_text = '\n'.join(pdf.text(i) for i in range(instr_start, instr_end))
-  matches = list(INST_PATTERN.finditer(all_text))
-  raw_pseudocode: dict[tuple[str, int], str] = {}
-  for i, match in enumerate(matches):
-    name, opcode = match.group(1), int(match.group(2))
-    start, end = match.end(), matches[i + 1].start() if i + 1 < len(matches) else match.end() + 2000
-    snippet = all_text[start:end].strip()
-    if pseudocode := _extract_pseudocode(snippet): raw_pseudocode[(name, opcode)] = pseudocode
-
-  # Extract unified buffer format table (RDNA only, for MTBUF format field)
-  buf_fmt = {}
-  if not is_cdna:
-    for i in range(total_pages):
-      for t in pdf.tables(i):
-        if t and len(t) > 2 and t[0] and '#' in str(t[0][0]) and 'Format' in str(t[0]):
-          for row in t[1:]:
-            for j in range(0, len(row) - 1, 3):  # table has 3-column groups: #, Format, (empty)
-              if row[j] and row[j].isdigit() and row[j+1] and re.match(r'^[\d_]+_(UNORM|SNORM|USCALED|SSCALED|UINT|SINT|FLOAT)$', row[j+1]):
-                buf_fmt[int(row[j])] = row[j+1]
-          if buf_fmt: break
-      if buf_fmt: break
-
-  return {"formats": formats, "enums": enums, "src_enum": src_enum, "doc_name": doc_name, "pseudocode": raw_pseudocode, "is_cdna": is_cdna, "buf_fmt": buf_fmt}
-
-def _extract_pseudocode(text: str) -> str | None:
-  """Extract pseudocode from an instruction description snippet."""
-  lines, result, depth, in_lambda = text.split('\n'), [], 0, 0
-  for line in lines:
-    s = line.strip()
-    if not s or re.match(r'^\d+ of \d+$', s) or re.match(r'^\d+\.\d+\..*Instructions', s): continue
-    if s.startswith(('Notes', 'Functional examples', '•', '-')): break  # Stop at notes/bullets
-    if s.startswith(('"RDNA', 'AMD ', 'CDNA')): continue
-    if '•' in s or '–' in s: continue  # Skip lines with bullets/dashes
-    if '= lambda(' in s: in_lambda += 1; continue
-    if in_lambda > 0:
-      if s.endswith(');'): in_lambda -= 1
-      continue
-    if s.startswith('if '): depth += 1
-    elif s.startswith('endif'): depth = max(0, depth - 1)
-    if s.endswith('.') and not any(p in s for p in ['D0', 'D1', 'S0', 'S1', 'S2', 'SCC', 'VCC', 'tmp', '=']): continue
-    if re.match(r'^[a-z].*\.$', s) and '=' not in s: continue
-    is_code = (any(p in s for p in ['D0.', 'D1.', 'S0.', 'S1.', 'S2.', 'SCC =', 'SCC ?', 'VCC', 'EXEC', 'tmp =', 'tmp[', 'lane =', 'PC =',
-                                    'D0[', 'D1[', 'S0[', 'S1[', 'S2[', 'MEM[', 'RETURN_DATA',
-                                    'VADDR', 'VDATA', 'VDST', 'SADDR', 'OFFSET']) or
-               s.startswith(('if ', 'else', 'elsif', 'endif', 'declare ', 'for ', 'endfor', '//')) or
-               re.match(r'^[a-z_]+\s*=', s) or re.match(r'^[a-z_]+\[', s) or (depth > 0 and '=' in s))
-    if is_code: result.append(s)
-  return '\n'.join(result) if result else None
-
-def _merge_results(results: list[dict]) -> dict:
-  """Merge multiple PDF parse results into a superset."""
-  merged = {"formats": {}, "enums": {}, "src_enum": dict(SRC_EXTRAS), "doc_names": [], "pseudocode": {}, "is_cdna": False, "buf_fmt": {}}
-  for r in results:
-    merged["doc_names"].append(r["doc_name"])
-    merged["is_cdna"] = merged["is_cdna"] or r["is_cdna"]
-    for val, name in r["src_enum"].items():
-      if val in merged["src_enum"]: assert merged["src_enum"][val] == name
-      else: merged["src_enum"][val] = name
-    for enum_name, ops in r["enums"].items():
-      if enum_name not in merged["enums"]: merged["enums"][enum_name] = {}
-      for val, name in ops.items():
-        if val in merged["enums"][enum_name]: assert merged["enums"][enum_name][val] == name
-        else: merged["enums"][enum_name][val] = name
-    for fmt_name, fields in r["formats"].items():
-      if fmt_name not in merged["formats"]: merged["formats"][fmt_name] = list(fields)
-      else:
-        existing = {f[0]: (f[1], f[2]) for f in merged["formats"][fmt_name]}
-        for f in fields:
-          if f[0] in existing: assert existing[f[0]] == (f[1], f[2])
-          else: merged["formats"][fmt_name].append(f)
-    for key, pc in r["pseudocode"].items():
-      if key not in merged["pseudocode"]: merged["pseudocode"][key] = pc
-    for val, name in r.get("buf_fmt", {}).items():
-      if val not in merged["buf_fmt"]: merged["buf_fmt"][val] = name
-  return merged
+  # Second pass: extract pseudocode between consecutive instructions
+  pcode: dict[tuple[str, int], str] = {}
+  for i, (page_idx, y, name, opcode) in enumerate(all_instructions):
+    # Get end boundary from next instruction
+    if i + 1 < len(all_instructions):
+      next_page, next_y = all_instructions[i + 1][0], all_instructions[i + 1][1]
+    else:
+      next_page, next_y = page_idx, 0  # last instruction: scan down to y = 0 on its own page
+    # Collect text set in font /F6.0 or /F7.0 from current position to next instruction
+    lines = []
+    for p in range(page_idx, next_page + 1):
+      start_y = y if p == page_idx else 800  # 800 is the upper y bound used on continuation pages
+      end_y = next_y if p == next_page else 0
+      lines.extend((p, y2, t) for x, y2, t, f in pages[p] if f in ('/F6.0', '/F7.0') and end_y < y2 < start_y)
+    if lines:
+      # Sort by page first, then by y descending within each page (higher y = earlier text in PDF)
+      pcode_lines = [t.replace('Ê', '').strip() for _, _, t in sorted(lines, key=lambda x: (x[0], -x[1]))]
+      if pcode_lines: pcode[(name, opcode)] = '\n'.join(pcode_lines)
+  return pcode

 # ═══════════════════════════════════════════════════════════════════════════════
-# CODE GENERATION
+# Write autogen files
 # ═══════════════════════════════════════════════════════════════════════════════

-def _generate_enum_py(enums, src_enum, doc_name, buf_fmt=None) -> str:
-  """Generate enum.py content (just enums, no dsl.py dependency)."""
-  def enum_lines(name, items): return [f"class {name}(IntEnum):"] + [f"  {n} = {v}" for v, n in sorted(items.items())] + [""]
-  lines = [f"# autogenerated from AMD {doc_name} ISA PDF by pdf.py - do not edit", "from enum import IntEnum", ""]
-  lines += enum_lines("SrcEnum", src_enum) + sum([enum_lines(n, ops) for n, ops in sorted(enums.items())], [])
-  if buf_fmt: lines += enum_lines("BufFmt", {v: f"BUF_FMT_{n}" for v, n in buf_fmt.items() if 1 <= v <= 63})
-  return '\n'.join(lines)
-
-def _generate_ins_py(formats, enums, src_enum, doc_name) -> str:
-  """Generate ins.py content (instruction formats and helpers, imports dsl.py and enum.py)."""
-  def field_key(f, order): return order.index(f[0].lower()) if f[0].lower() in order else 1000
-  lines = [f"# autogenerated from AMD {doc_name} ISA PDF by pdf.py - do not edit",
-           "# ruff: noqa: F401,F403", "from typing import Annotated",
-           "from extra.assembly.amd.dsl import bits, BitField, Inst32, Inst64, Inst96, SGPR, VGPR, TTMP as TTMP, s as s, v as v, ttmp as ttmp, SSrc, Src, SImm, Imm, VDSTYEnc, SGPRField, VGPRField",
-           "from extra.assembly.amd.autogen.{arch}.enum import *",
-           "import functools", ""]
-  format_defaults = {'VOP3P': {'opsel_hi': 3, 'opsel_hi2': 1}}
-  lines.append("# instruction formats")
-  # MIMG has optional NSA (Non-Sequential Address) fields that extend beyond 64 bits, but base encoding is 64-bit
-  inst64_override = {'MIMG'}
-  for fmt_name, fields in sorted(formats.items()):
-    max_bit = max(f[1] for f in fields)
-    if fmt_name in inst64_override: base = "Inst64"
-    else: base = "Inst96" if max_bit > 63 else "Inst64" if max_bit > 31 or fmt_name == 'VOP3SD' else "Inst32"
-    order = FIELD_ORDER.get(fmt_name, [])
-    lines.append(f"class {fmt_name}({base}):")
-    if enc := next((f for f in fields if f[0] == 'ENCODING'), None):
-      lines.append(f"  encoding = bits[{enc[1]}:{enc[2]}] == 0b{enc[3]:b}" if enc[1] != enc[2] else f"  encoding = bits[{enc[1]}] == {enc[3]}")
-    if defaults := format_defaults.get(fmt_name): lines.append(f"  _defaults = {defaults}")
-    for name, hi, lo, _, ftype in sorted([f for f in fields if f[0] != 'ENCODING'], key=lambda f: field_key(f, order)):
-      ann = f":Annotated[BitField, {ftype}]" if ftype and ftype.endswith('Op') else f":{ftype}" if ftype else ""
-      lines.append(f"  {name.lower()}{ann} = bits[{hi}]" if hi == lo else f"  {name.lower()}{ann} = bits[{hi}:{lo}]")
+def write_enums(enums: dict[str, dict[int, str]], arch: str, path: str):
+  """Write enum.py file from extracted enums: one IntEnum class per entry, written to path. arch is not used in the body."""
+  lines = ["# autogenerated from AMD ISA PDF by pdf.py - do not edit", "from enum import IntEnum", ""]
+  for name, values in sorted(enums.items()):
+    suffix = "Op" if name not in ('Src', 'BufFmt') else ("Enum" if name == 'Src' else "")  # class names: SrcEnum, BufFmt, otherwise <name>Op
+    prefix = "BUF_FMT_" if name == 'BufFmt' else ""  # only BufFmt members get a name prefix
+    lines.append(f"class {name}{suffix}(IntEnum):")
+    for val, member in sorted(values.items()):
+      lines.append(f"  {prefix}{member} = {val}")
    lines.append("")
+  with open(path, "w") as f:
+    f.write("\n".join(lines))
+
+def write_ins(formats: dict[str, list[tuple[str, int, int]]], encodings: dict[str, str], enums: dict[str, dict[int, str]], arch: str, path: str):
+  """Write `path` as ins.py: one Inst subclass per format (fields given as (name, hi, lo)), then a functools.partial helper per opcode."""
+  # Field types and ordering: field_type maps a field name to the annotation emitted for it (None means no annotation)
+  def field_type(name, fmt):
+    if name == 'op' and fmt in enums: return f'Annotated[BitField, {fmt}Op]'
+    if name in ('opx', 'opy'): return 'Annotated[BitField, VOPDOp]'
+    if name == 'vdsty': return 'VDSTYEnc'
+    if name in ('vdst', 'vsrc1', 'vaddr', 'vdata', 'data', 'data0', 'data1', 'addr', 'vsrc0', 'vsrc2', 'vsrc3'): return 'VGPRField'
+    if name in ('sdst', 'sbase', 'sdata', 'srsrc', 'ssamp'): return 'SGPRField'
+    if name.startswith('ssrc') or name in ('saddr', 'soffset'): return 'SSrc'
+    if name in ('src0', 'srcx0', 'srcy0') or name.startswith('src') and name[3:].isdigit(): return 'Src'
+    if name.startswith('simm'): return 'SImm'
+    if name == 'offset' or name.startswith('imm'): return 'Imm'
+    return None
+  field_priority = ['encoding', 'op', 'opx', 'opy', 'vdst', 'vdstx', 'vdsty', 'sdst', 'vdata', 'sdata', 'addr', 'vaddr', 'data', 'data0', 'data1',
+                    'src0', 'srcx0', 'srcy0', 'vsrc0', 'ssrc0', 'src1', 'vsrc1', 'vsrcx1', 'vsrcy1', 'ssrc1', 'src2', 'vsrc2', 'src3', 'vsrc3',
+                    'saddr', 'sbase', 'srsrc', 'ssamp', 'soffset', 'offset', 'simm16', 'en', 'target', 'attr', 'attr_chan',
+                    'omod', 'neg', 'neg_hi', 'abs', 'clmp', 'opsel', 'opsel_hi', 'waitexp', 'wait_va',
+                    'dmask', 'dim', 'seg', 'format', 'offen', 'idxen', 'glc', 'dlc', 'slc', 'tfe', 'unrm', 'done', 'row']
+  order = {name: i for i, name in enumerate(field_priority)}  # rank table built once; names not listed sort last (rank 1000)
+  def sort_fields(fields):
+    return sorted(fields, key=lambda f: (order.get(f[0], 1000), f[2]))  # equal ranks are ordered by f[2], the low bit
+
+  # Generate format classes: file header and imports, then one class per format in name order
+  lines = ["# autogenerated from AMD ISA PDF by pdf.py - do not edit", "# ruff: noqa: F401,F403",
+           "from typing import Annotated",
+           "from extra.assembly.amd.dsl import *",
+           f"from extra.assembly.amd.autogen.{arch}.enum import *", "import functools", ""]
+  for fmt_name, fields in sorted(formats.items()):
+    lines.append(f"class {fmt_name}(Inst):")
+    for name, hi, lo in sort_fields(fields):
+      bits_str = f"bits[{hi}:{lo}]" if hi != lo else f"bits[{hi}]"
+      if name == 'encoding' and fmt_name in encodings: lines.append(f"  encoding = {bits_str} == 0b{encodings[fmt_name]}")
+      else:
+        ftype = field_type(name, fmt_name)
+        lines.append(f"  {name}{f':{ftype}' if ftype else ''} = {bits_str}")
+    lines.append("")
+
+  # Generate instruction helpers: one lowercase functools.partial per opcode of each enum that has a format (or is GLOBAL/SCRATCH)
  lines.append("# instruction helpers")
-  for cls_name, ops in sorted(enums.items()):
-    fmt = cls_name[:-2]
-    for op_val, name in sorted(ops.items()):
-      seg = {"GLOBAL": ", seg=2", "SCRATCH": ", seg=1"}.get(fmt, "")
-      tgt = {"GLOBAL": "FLAT, GLOBALOp", "SCRATCH": "FLAT, SCRATCHOp"}.get(fmt, f"{fmt}, {cls_name}")
-      if fmt in formats or fmt in ("GLOBAL", "SCRATCH"):
-        suffix = "_e32" if fmt in ("VOP1", "VOP2", "VOPC") else "_e64" if fmt == "VOP3" and op_val < 512 else ""
-        if name in ('V_FMAMK_F32', 'V_FMAMK_F16'):
-          lines.append(f"def {name.lower()}{suffix}(vdst, src0, K, vsrc1): return {fmt}({cls_name}.{name}, vdst, src0, vsrc1, literal=K)")
-        elif name in ('V_FMAAK_F32', 'V_FMAAK_F16'):
-          lines.append(f"def {name.lower()}{suffix}(vdst, src0, vsrc1, K): return {fmt}({cls_name}.{name}, vdst, src0, vsrc1, literal=K)")
-        else: lines.append(f"{name.lower()}{suffix} = functools.partial({tgt}.{name}{seg})")
-  src_names = {name for _, name in src_enum.items()}
-  lines += [""] + [f"{name} = SrcEnum.{name}" for _, name in sorted(src_enum.items()) if name not in {'DPP8', 'DPP16'}]
-  if "NULL" in src_names: lines.append("OFF = NULL\n")
-  return '\n'.join(lines)
+  for fmt_name, ops in sorted(enums.items()):
+    seg = {"GLOBAL": ", seg=2", "SCRATCH": ", seg=1"}.get(fmt_name, "")
+    tgt = {"GLOBAL": "FLAT, GLOBALOp", "SCRATCH": "FLAT, SCRATCHOp"}.get(fmt_name, f"{fmt_name}, {fmt_name}Op")
+    suffix = "_e32" if fmt_name in ("VOP1", "VOP2", "VOPC") else "_e64" if fmt_name == "VOP3" else ""
+    if fmt_name in formats or fmt_name in ("GLOBAL", "SCRATCH"):
+      for op_val, name in sorted(ops.items()):
+        fn_suffix = suffix if fmt_name != "VOP3" or op_val < 512 else ""  # VOP3 opcodes >= 512 get no _e64 suffix
+        lines.append(f"{name.lower()}{fn_suffix} = functools.partial({tgt}.{name}{seg})")

-def _generate_str_pcode_py(enums, pseudocode, arch) -> str:
-  """Generate str_pcode.py content (raw pseudocode strings)."""
-  # Get op enums for this arch (import from .ins which re-exports from .enum)
-  import importlib
-  autogen = importlib.import_module(f"extra.assembly.amd.autogen.{arch}.ins")
-  OP_ENUMS = [getattr(autogen, name) for name in ['SOP1Op', 'SOP2Op', 'SOPCOp', 'SOPKOp', 'SOPPOp', 'SMEMOp', 'VOP1Op', 'VOP2Op', 'VOP3Op', 'VOP3SDOp', 'VOP3POp', 'VOPCOp', 'VOP3AOp', 'VOP3BOp', 'DSOp', 'FLATOp', 'GLOBALOp', 'SCRATCHOp'] if hasattr(autogen, name)]
+  with open(path, "w", encoding="utf-8") as f:  # pin the encoding so the output does not depend on the locale default
+    f.write("\n".join(lines))

-  # Build defined ops mapping
-  defined_ops: dict[tuple, list] = {}
-  for enum_cls in OP_ENUMS:
-    for op in enum_cls:
-      if op.name.startswith(('S_', 'V_', 'DS_', 'FLAT_', 'GLOBAL_', 'SCRATCH_')): defined_ops.setdefault((op.name, op.value), []).append((enum_cls, op))
-
-  enum_names = [e.__name__ for e in OP_ENUMS]
-  instructions: dict = {cls: {} for cls in OP_ENUMS}
-  for key, pc in pseudocode.items():
-    if key in defined_ops:
-      for enum_cls, enum_val in defined_ops[key]: instructions[enum_cls][enum_val] = pc
-
-  # Build string dictionaries for each enum
-  lines = [f'''# autogenerated by pdf.py - do not edit
-# to regenerate: python -m extra.assembly.amd.pdf --arch {arch}
-# ruff: noqa: E501
-from extra.assembly.amd.autogen.{arch}.enum import {", ".join(enum_names)}
-''']
-  all_dict_entries: dict = {}
-  for enum_cls in OP_ENUMS:
-    cls_name = enum_cls.__name__
-    if not instructions.get(enum_cls): continue
-    dict_entries = [(op, repr(pc)) for op, pc in instructions[enum_cls].items()]
-    if dict_entries:
-      all_dict_entries[enum_cls] = dict_entries
-      lines.append(f'{cls_name}_PCODE = {{')
-      for op, escaped in dict_entries: lines.append(f"  {cls_name}.{op.name}: {escaped},")
-      lines.append('}\n')
-
-  lines.append('PSEUDOCODE_STRINGS = {')
-  for enum_cls in OP_ENUMS:
-    if all_dict_entries.get(enum_cls): lines.append(f'  {enum_cls.__name__}: {enum_cls.__name__}_PCODE,')
-  lines.append('}')
-  return '\n'.join(lines)
-
-# ═══════════════════════════════════════════════════════════════════════════════
-# MAIN GENERATION
-# ═══════════════════════════════════════════════════════════════════════════════
-
-def generate_arch(arch: str) -> dict:
-  """Generate enum.py, ins.py and str_pcode.py for a single architecture."""
-  urls = PDF_URLS[arch]
-  if isinstance(urls, str): urls = [urls]
-
-  print(f"\n{'='*60}\nGenerating {arch}...")
-  print(f"Parsing {len(urls)} PDF(s)...")
-  results = [_parse_single_pdf(url) for url in urls]
-  merged = _merge_results(results) if len(results) > 1 else results[0]
-  doc_name = "+".join(merged["doc_names"]) if len(results) > 1 else merged["doc_name"]
-
-  base_path = Path(f"extra/assembly/amd/autogen/{arch}")
-  base_path.mkdir(parents=True, exist_ok=True)
-  (base_path / "__init__.py").touch()
-
-  # Write enum.py (enums only, no dsl.py dependency)
-  enum_path = base_path / "enum.py"
-  enum_content = _generate_enum_py(merged["enums"], merged["src_enum"], doc_name, merged.get("buf_fmt"))
-  enum_path.write_text(enum_content)
-  buf_fmt_count = len([v for v in merged.get("buf_fmt", {}) if 1 <= v <= 63])
-  print(f"Generated {enum_path}: SrcEnum ({len(merged['src_enum'])}) + {len(merged['enums'])} enums" + (f" + BufFmt ({buf_fmt_count})" if buf_fmt_count else ""))
-
-  # Write ins.py (instruction formats and helpers, imports dsl.py and enum.py)
-  ins_path = base_path / "ins.py"
-  ins_content = _generate_ins_py(merged["formats"], merged["enums"], merged["src_enum"], doc_name).replace("{arch}", arch)
-  ins_path.write_text(ins_content)
-  print(f"Generated {ins_path}: {len(merged['formats'])} formats")
-
-  # Write str_pcode.py (needs enum.py to exist first for imports)
-  pcode_path = base_path / "str_pcode.py"
-  pcode_content = _generate_str_pcode_py(merged["enums"], merged["pseudocode"], arch)
-  pcode_path.write_text(pcode_content)
-  print(f"Generated {pcode_path}: {len(merged['pseudocode'])} instructions")
-
-  return merged
-
-def _generate_arch_wrapper(arch: str):
-  """Wrapper for multiprocessing - returns arch name for ordering."""
-  generate_arch(arch)
-  return arch
-
-def generate_all():
-  """Generate all architectures in parallel."""
-  with ProcessPoolExecutor() as executor:
-    list(executor.map(_generate_arch_wrapper, PDF_URLS.keys()))
+def write_pcode(pcode: dict[tuple[str, int], str], enums: dict[str, dict[int, str]], arch: str, path: str):
+  """Write `path` as str_pcode.py: one *_PCODE dict per enum that has pseudocode, plus PSEUDOCODE_STRINGS; `pcode` is keyed by (name, opcode)."""
+  # Group pseudocode by enum class; ops without a (name, opcode) entry in pcode are skipped
+  by_enum: dict[str, list[tuple[str, int, str]]] = {}
+  for fmt_name, ops in enums.items():
+    for opcode, name in ops.items():
+      if (name, opcode) in pcode: by_enum.setdefault(f"{fmt_name}Op", []).append((name, opcode, pcode[(name, opcode)]))
+  # Generate file: header and import, the per-enum dicts (entries in opcode order), then the combined table
+  enum_names = sorted(by_enum)
+  lines = ["# autogenerated by pdf.py - do not edit", "# to regenerate: python -m extra.assembly.amd.pdf",
+           "# ruff: noqa: E501", f"from extra.assembly.amd.autogen.{arch}.enum import {', '.join(enum_names)}", ""]
+  for enum_name in enum_names:
+    lines.append(f"{enum_name}_PCODE = {{")
+    for name, _, code in sorted(by_enum[enum_name], key=lambda x: x[1]):
+      lines.append(f"  {enum_name}.{name}: {code!r},")
+    lines.append("}\n")
+  lines.append(f"PSEUDOCODE_STRINGS = {{{', '.join(f'{e}: {e}_PCODE' for e in enum_names)}}}")
+  with open(path, "w", encoding="utf-8") as f:  # pin the encoding: repr() keeps non-ASCII characters, so the locale default is not safe
+    f.write("\n".join(lines))

 if __name__ == "__main__":
-  import argparse
-  parser = argparse.ArgumentParser(description="Generate AMD ISA autogen files from PDF documentation")
-  parser.add_argument("--arch", choices=list(PDF_URLS.keys()) + ["all"], default="rdna3")
-  args = parser.parse_args()
-  if args.arch == "all": generate_all()
-  else: generate_arch(args.arch)
+  import pathlib
+  for arch, url in PDF_URLS.items():  # regenerate enum.py, ins.py and str_pcode.py for every architecture in PDF_URLS
+    print(f"Processing {arch}...")
+    pages = extract(url)
+    tables = extract_tables(pages)
+    enums = extract_enums(tables)
+    formats, encodings = extract_ins(tables)
+    pcode = extract_pcode(pages, enums)
+    # Fix known PDF errors: hand-written patches applied to the extracted data before any file is written
+    if arch == 'rdna3':
+      fixes = {'SOPP': {8: 'S_WAITCNT_DEPCTR', 58: 'S_TTRACEDATA', 59: 'S_TTRACEDATA_IMM'},  # format -> {opcode: name}
+               'SOPK': {22: 'S_SUBVECTOR_LOOP_BEGIN', 23: 'S_SUBVECTOR_LOOP_END'},
+               'SMEM': {34: 'S_ATC_PROBE', 35: 'S_ATC_PROBE_BUFFER'},
+               'DS': {24: 'DS_GWS_SEMA_RELEASE_ALL', 25: 'DS_GWS_INIT', 26: 'DS_GWS_SEMA_V', 27: 'DS_GWS_SEMA_BR', 28: 'DS_GWS_SEMA_P', 29: 'DS_GWS_BARRIER'},
+               'FLAT': {40: 'GLOBAL_LOAD_ADDTID_B32', 41: 'GLOBAL_STORE_ADDTID_B32', 55: 'FLAT_ATOMIC_CSUB_U32'}}
+      for fmt, ops in fixes.items(): enums[fmt] = merge_dicts([enums[fmt], ops])  # every patched format must already be in enums
+    if arch in ('rdna3', 'rdna4'):
+      # RDNA SMEM: PDF says DLC=[14], GLC=[16] but hardware uses DLC=[13], GLC=[14]
+      if 'SMEM' in formats:
+        formats['SMEM'] = [(n, 13 if n == 'dlc' else 14 if n == 'glc' else h, 13 if n == 'dlc' else 14 if n == 'glc' else l)
+                           for n, h, l in formats['SMEM']]
+    if arch == 'cdna':
+      # CDNA DS: PDF is missing the GDS field (bit 16)
+      if 'DS' in formats and not any(n == 'gds' for n, _, _ in formats['DS']):
+        formats['DS'].append(('gds', 16, 16))
+      # CDNA DPP/SDWA: PDF only documents modifier fields (bits[63:32]), need to add VOP overlay fields (bits[31:0])
+      vop_overlay = [('encoding', 8, 0), ('vop_op', 16, 9), ('vdst', 24, 17), ('vop2_op', 31, 25)]  # (name, hi, lo), same shape as formats entries
+      if 'DPP' in formats and not any(n == 'encoding' for n, _, _ in formats['DPP']):
+        formats['DPP'] = vop_overlay + [('bc' if n == 'bound_ctrl' else n, h, l) for n, h, l in formats['DPP']]  # also renames bound_ctrl to bc
+        encodings['DPP'] = '11111010'
+      if 'SDWA' in formats and not any(n == 'encoding' for n, _, _ in formats['SDWA']):
+        formats['SDWA'] = vop_overlay + [(n, h, l) for n, h, l in formats['SDWA']]
+        encodings['SDWA'] = '11111001'
+    base = pathlib.Path(__file__).parent / "autogen" / arch  # output directory: autogen/ARCH next to this file
+    write_enums(enums, arch, base / "enum.py")
+    write_ins(formats, encodings, enums, arch, base / "ins.py")
+    write_pcode(pcode, enums, arch, base / "str_pcode.py")
+    print(f"  {len(tables)} tables, {len(pcode)} pcode -> {base}")
--- a/extra/assembly/amd/test/hw/test_vop3.py
+++ b/extra/assembly/amd/test/hw/test_vop3.py
@@ -1615,7 +1615,7 @@ class TestCarryBorrow(unittest.TestCase):
      v_mov_b32_e32(v[2], s[2]),
      v_mov_b32_e32(v[3], s[3]),
      v_add_co_u32(v[4], VCC, v[0], v[2]),
-      v_add_co_ci_u32_e32(v[5], VCC, v[1], v[3]),
+      v_add_co_ci_u32_e32(v[5], v[1], v[3]),
    ]
    st = run_program(instructions, n_lanes=1)
    self.assertEqual(st.vgpr[0][4], 0x00000000, "lo result")
--- a/extra/assembly/amd/test/hw/test_vop3p.py
+++ b/extra/assembly/amd/test/hw/test_vop3p.py
@@ -271,7 +271,7 @@ class TestVOP3P(unittest.TestCase):
      s_mov_b32(s[1], 0x44004200),  # hi=4.0, lo=3.0
      v_mov_b32_e32(v[0], s[0]),
      v_mov_b32_e32(v[1], s[1]),
-      v_pk_add_f16(v[2], v[0], v[1]),
+      v_pk_add_f16(v[2], v[0], v[1], opsel_hi=3, opsel_hi2=1),
    ]
    st = run_program(instructions, n_lanes=1)
    result = st.vgpr[0][2]
@@ -288,7 +288,7 @@ class TestVOP3P(unittest.TestCase):
      s_mov_b32(s[1], 0x45004400),  # hi=5.0, lo=4.0
      v_mov_b32_e32(v[0], s[0]),
      v_mov_b32_e32(v[1], s[1]),
-      v_pk_mul_f16(v[2], v[0], v[1]),
+      v_pk_mul_f16(v[2], v[0], v[1], opsel_hi=3, opsel_hi2=1),
    ]
    st = run_program(instructions, n_lanes=1)
    result = st.vgpr[0][2]
@@ -307,7 +307,7 @@ class TestVOP3P(unittest.TestCase):
      v_mov_b32_e32(v[0], s[0]),
      v_mov_b32_e32(v[1], s[1]),
      v_mov_b32_e32(v[2], s[2]),
-      v_pk_fma_f16(v[3], v[0], v[1], v[2]),
+      v_pk_fma_f16(v[3], v[0], v[1], v[2], opsel_hi=3, opsel_hi2=1),
    ]
    st = run_program(instructions, n_lanes=1)
    result = st.vgpr[0][3]
@@ -325,7 +325,7 @@ class TestVOP3P(unittest.TestCase):
    instructions = [
      s_mov_b32(s[0], 0x3c003c00),  # packed f16: hi=1.0, lo=1.0
      v_mov_b32_e32(v[0], s[0]),
-      v_pk_add_f16(v[1], v[0], SrcEnum.POS_ONE),  # Add inline constant 1.0
+      v_pk_add_f16(v[1], v[0], SrcEnum.POS_ONE, opsel_hi=3, opsel_hi2=1),  # Add inline constant 1.0
    ]
    st = run_program(instructions, n_lanes=1)
    result = st.vgpr[0][1]
@@ -345,7 +345,7 @@ class TestVOP3P(unittest.TestCase):
    instructions = [
      s_mov_b32(s[0], 0x44004200),  # packed f16: hi=4.0, lo=3.0
      v_mov_b32_e32(v[0], s[0]),
-      v_pk_mul_f16(v[1], v[0], SrcEnum.POS_TWO),
+      v_pk_mul_f16(v[1], v[0], SrcEnum.POS_TWO, opsel_hi=3, opsel_hi2=1),
    ]
    st = run_program(instructions, n_lanes=1)
    result = st.vgpr[0][1]
@@ -486,12 +486,12 @@ class TestSpecialOps(unittest.TestCase):
    """V_DOT2_F32_BF16 computes dot product of bf16 pairs."""
    # bf16 1.0 = 0x3f80, bf16 2.0 = 0x4000
    instructions = [
-      s_mov_b32(s[0], 0x3f803f80),  # packed bf16: 1.0, 1.0
-      s_mov_b32(s[1], 0x40003f80),  # packed bf16: 2.0, 1.0
+      s_mov_b32(s[0], 0x3f803f80),  # packed bf16: lo=1.0, hi=1.0
+      s_mov_b32(s[1], 0x40003f80),  # packed bf16: lo=1.0, hi=2.0
      v_mov_b32_e32(v[0], s[0]),
      v_mov_b32_e32(v[1], s[1]),
      v_mov_b32_e32(v[2], 0),
-      v_dot2_f32_bf16(v[3], v[0], v[1], v[2]),
+      v_dot2_f32_bf16(v[3], v[0], v[1], v[2], opsel_hi=3, opsel_hi2=1),
    ]
    st = run_program(instructions, n_lanes=1)
    # 1.0*1.0 + 1.0*2.0 + 0 = 3.0
@@ -510,7 +510,7 @@ class TestPackedMixedSigns(unittest.TestCase):
      s_mov_b32(s[1], 0x3c003c00),  # packed: hi=1.0, lo=1.0
      v_mov_b32_e32(v[0], s[0]),
      v_mov_b32_e32(v[1], s[1]),
-      v_pk_add_f16(v[2], v[0], v[1]),
+      v_pk_add_f16(v[2], v[0], v[1], opsel_hi=3, opsel_hi2=1),
    ]
    st = run_program(instructions, n_lanes=1)
    result = st.vgpr[0][2]
--- a/extra/assembly/amd/test/test_pdf.py
+++ b/extra/assembly/amd/test/test_pdf.py
@@ -0,0 +1,50 @@
+#!/usr/bin/env python3
+"""Test pdf.py PDF parser and enum generation."""
+import unittest, tempfile, importlib.util
+from extra.assembly.amd.pdf import extract, extract_tables, extract_enums, write_enums, PDF_URLS
+
+EXPECTED = {  # per arch: extracted page count, table count, SOP2Op member count, and the name of the SOP2Op member with value 0
+  "rdna3": {"pages": 655, "tables": 115, "sop2_ops": 67, "sop2_first": "S_ADD_U32"},
+  "rdna4": {"pages": 711, "tables": 125, "sop2_ops": 74, "sop2_first": "S_ADD_CO_U32"},
+  "cdna":  {"pages": 610, "tables": 104, "sop2_ops": 52, "sop2_first": "S_ADD_U32"},
+}
+
+class TestPDF2(unittest.TestCase):  # runs the extraction pipeline on every entry of PDF_URLS and checks it against EXPECTED
+  @classmethod
+  def setUpClass(cls):  # extract every PDF once; pages, tables and enums are shared by all tests
+    cls.data = {name: extract(url) for name, url in PDF_URLS.items()}
+    cls.tables = {name: extract_tables(pages) for name, pages in cls.data.items()}
+    cls.enums = {name: extract_enums(cls.tables[name]) for name in PDF_URLS}
+
+  def test_page_counts(self):
+    for name, exp in EXPECTED.items():
+      self.assertEqual(len(self.data[name]), exp["pages"], f"{name} page count")
+
+  def test_table_counts(self):
+    for name, exp in EXPECTED.items():
+      self.assertEqual(len(self.tables[name]), exp["tables"], f"{name} table count")
+
+  def test_tables_sequential(self):  # table numbers must cover 1..max with no gaps
+    for name in PDF_URLS:
+      nums = sorted(self.tables[name].keys())
+      missing = set(range(1, max(nums) + 1)) - set(nums)
+      self.assertEqual(missing, set(), f"{name} missing tables: {missing}")
+
+  def test_generate_enums(self):  # the written enum module must import cleanly and expose the expected SOP2Op
+    for name, exp in EXPECTED.items():
+      with tempfile.TemporaryDirectory() as tmp:  # the directory and the generated module are deleted when the block exits
+        write_enums(self.enums[name], name, path := f"{tmp}/enum.py")
+        spec = importlib.util.spec_from_file_location("enum", path)
+        mod = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(mod)
+        # Check SOP2Op
+        self.assertTrue(hasattr(mod, 'SOP2Op'), f"{name} missing SOP2Op")
+        self.assertEqual(len(mod.SOP2Op), exp["sop2_ops"], f"{name} SOP2Op count")
+        self.assertEqual(mod.SOP2Op(0).name, exp["sop2_first"], f"{name} SOP2Op first")
+        # Check all enums have at least 2 ops
+        for attr in dir(mod):
+          if attr.endswith('Op'):
+            self.assertGreaterEqual(len(getattr(mod, attr)), 2, f"{name} {attr} has too few ops")
+
+if __name__ == "__main__":
+  unittest.main()
--- a/extra/assembly/amd/test/test_pdf_parser.py
+++ b/extra/assembly/amd/test/test_pdf_parser.py
@@ -1,150 +0,0 @@
-#!/usr/bin/env python3
-"""Test that PDF parser correctly extracts format fields."""
-import unittest, os
-from extra.assembly.amd.autogen.rdna3.ins import SOP1, SOP2, SOPK, SOPP, VOP1, VOP2, VOP3SD, VOPC, FLAT, VOPD, SOP1Op, SOP2Op, VOP1Op, VOP3Op
-
-# expected formats with key fields and whether they have ENCODING
-EXPECTED_FORMATS = {
-  'DPP16': (['SRC0', 'DPP_CTRL', 'BANK_MASK', 'ROW_MASK'], False),
-  'DPP8': (['SRC0', 'LANE_SEL0', 'LANE_SEL7'], False),
-  'DS': (['OP', 'ADDR', 'DATA0', 'DATA1', 'VDST'], True),
-  'EXP': (['EN', 'TARGET', 'VSRC0', 'VSRC1', 'VSRC2', 'VSRC3'], True),
-  'FLAT': (['OP', 'ADDR', 'DATA', 'SADDR', 'VDST', 'OFFSET'], True),
-  'LDSDIR': (['VDST', 'OP'], True),
-  'MIMG': (['OP', 'VADDR', 'VDATA', 'SRSRC', 'DMASK'], True),
-  'MTBUF': (['OP', 'VADDR', 'VDATA', 'SRSRC', 'FORMAT', 'SOFFSET'], True),
-  'MUBUF': (['OP', 'VADDR', 'VDATA', 'SRSRC', 'SOFFSET'], True),
-  'SMEM': (['OP', 'SBASE', 'SDATA', 'OFFSET', 'SOFFSET'], True),
-  'SOP1': (['OP', 'SDST', 'SSRC0'], True),
-  'SOP2': (['OP', 'SDST', 'SSRC0', 'SSRC1'], True),
-  'SOPC': (['OP', 'SSRC0', 'SSRC1'], True),
-  'SOPK': (['OP', 'SDST', 'SIMM16'], True),
-  'SOPP': (['OP', 'SIMM16'], True),
-  'VINTERP': (['OP', 'VDST', 'SRC0', 'SRC1', 'SRC2'], True),
-  'VOP1': (['OP', 'VDST', 'SRC0'], True),
-  'VOP2': (['OP', 'VDST', 'SRC0', 'VSRC1'], True),
-  'VOP3': (['OP', 'VDST', 'SRC0', 'SRC1', 'SRC2'], True),
-  'VOP3P': (['OP', 'VDST', 'SRC0', 'SRC1', 'SRC2'], True),
-  'VOP3SD': (['OP', 'VDST', 'SDST', 'SRC0', 'SRC1', 'SRC2'], True),
-  'VOPC': (['OP', 'SRC0', 'VSRC1'], True),
-  'VOPD': (['OPX', 'OPY', 'SRCX0', 'SRCY0', 'VDSTX', 'VDSTY'], True),
-}
-
-# Skip PDF parsing tests by default - only run with TEST_PDF_PARSER=1
-# These are slow (~5s) and only needed when regenerating autogen/
-@unittest.skipUnless(os.environ.get("TEST_PDF_PARSER"), "set TEST_PDF_PARSER=1 to run PDF parser tests")
-class TestPDFParserGenerate(unittest.TestCase):
-  """Test the PDF parser by running generate() and checking results."""
-
-  def test_pdf_parser(self):
-    """Single test that validates all PDF parser outputs."""
-    from extra.assembly.amd.dsl import generate
-    result = generate()
-
-    # test_all_formats_present
-    for fmt_name in EXPECTED_FORMATS:
-      self.assertIn(fmt_name, result["formats"], f"missing format {fmt_name}")
-
-    # test_format_count
-    self.assertEqual(len(result["formats"]), 23)
-
-    # test_no_duplicate_fields
-    for fmt_name, fields in result["formats"].items():
-      field_names = [f[0] for f in fields]
-      self.assertEqual(len(field_names), len(set(field_names)), f"{fmt_name} has duplicate fields: {field_names}")
-
-    # test_expected_fields
-    for fmt_name, (expected_fields, has_encoding) in EXPECTED_FORMATS.items():
-      fields = {f[0] for f in result["formats"].get(fmt_name, [])}
-      for field in expected_fields:
-        self.assertIn(field, fields, f"{fmt_name} missing {field}")
-      if has_encoding:
-        self.assertIn("ENCODING", fields, f"{fmt_name} should have ENCODING")
-      else:
-        self.assertNotIn("ENCODING", fields, f"{fmt_name} should not have ENCODING")
-
-    # test_vopd_no_dpp16_fields
-    vopd_fields = {f[0] for f in result["formats"].get("VOPD", [])}
-    for field in ['DPP_CTRL', 'BANK_MASK', 'ROW_MASK']:
-      self.assertNotIn(field, vopd_fields, f"VOPD should not have {field}")
-
-    # test_dpp16_no_vinterp_fields
-    dpp16_fields = {f[0] for f in result["formats"].get("DPP16", [])}
-    for field in ['VDST', 'WAITEXP']:
-      self.assertNotIn(field, dpp16_fields, f"DPP16 should not have {field}")
-
-    # test_sopp_no_smem_fields
-    sopp_fields = {f[0] for f in result["formats"].get("SOPP", [])}
-    for field in ['SBASE', 'SDATA']:
-      self.assertNotIn(field, sopp_fields, f"SOPP should not have {field}")
-
-class TestPDFParser(unittest.TestCase):
-  """Verify format classes have correct fields from PDF parsing."""
-
-  def test_sop2_fields(self):
-    """SOP2 should have op, sdst, ssrc0, ssrc1."""
-    for field in ['op', 'sdst', 'ssrc0', 'ssrc1']:
-      self.assertIn(field, SOP2._fields)
-    self.assertEqual(SOP2._fields['op'].hi, 29)
-    self.assertEqual(SOP2._fields['op'].lo, 23)
-
-  def test_sop1_fields(self):
-    """SOP1 should have op, sdst, ssrc0 with correct bit positions."""
-    for field in ['op', 'sdst', 'ssrc0']:
-      self.assertIn(field, SOP1._fields)
-    self.assertNotIn('simm16', SOP1._fields)
-    self.assertEqual(SOP1._fields['ssrc0'].hi, 7)
-    self.assertEqual(SOP1._fields['ssrc0'].lo, 0)
-    assert SOP1._encoding is not None
-    self.assertEqual(SOP1._encoding[0].hi, 31)
-    self.assertEqual(SOP1._encoding[1], 0b101111101)
-
-  def test_vop3sd_fields(self):
-    """VOP3SD should have all fields including src0/src1/src2 from page continuation."""
-    for field in ['op', 'vdst', 'sdst', 'src0', 'src1', 'src2']:
-      self.assertIn(field, VOP3SD._fields)
-    self.assertEqual(VOP3SD._fields['src0'].hi, 40)
-    self.assertEqual(VOP3SD._fields['src0'].lo, 32)
-    self.assertEqual(VOP3SD._size(), 8)
-
-  def test_flat_has_vdst(self):
-    """FLAT should have vdst field."""
-    self.assertIn('vdst', FLAT._fields)
-    self.assertEqual(FLAT._fields['vdst'].hi, 63)
-    self.assertEqual(FLAT._fields['vdst'].lo, 56)
-
-  def test_encoding_bits(self):
-    """Verify encoding bits are correct for major formats."""
-    tests = [
-      (SOP2, 31, 30, 0b10),
-      (SOPK, 31, 28, 0b1011),
-      (SOPP, 31, 23, 0b101111111),
-      (VOP1, 31, 25, 0b0111111),
-      (VOP2, 31, 31, 0b0),
-      (VOPC, 31, 25, 0b0111110),
-      (FLAT, 31, 26, 0b110111),
-    ]
-    for cls, hi, lo, val in tests:
-      assert cls._encoding is not None
-      self.assertEqual(cls._encoding[0].hi, hi, f"{cls.__name__} encoding hi")
-      self.assertEqual(cls._encoding[0].lo, lo, f"{cls.__name__} encoding lo")
-      self.assertEqual(cls._encoding[1], val, f"{cls.__name__} encoding val")
-
-  def test_opcode_enums_exist(self):
-    """Verify opcode enums are generated with expected counts."""
-    self.assertGreater(len(SOP1Op), 50)
-    self.assertGreater(len(SOP2Op), 50)
-    self.assertGreater(len(VOP1Op), 50)
-    self.assertGreater(len(VOP3Op), 200)
-
-  def test_vopd_no_duplicate_fields(self):
-    """VOPD should not have duplicate fields and should not include DPP16 fields."""
-    field_names = list(VOPD._fields.keys())
-    self.assertEqual(len(field_names), len(set(field_names)))
-    for field in ['srcx0', 'srcy0', 'opx', 'opy']:
-      self.assertIn(field, VOPD._fields)
-    for field in ['dpp_ctrl', 'bank_mask', 'row_mask']:
-      self.assertNotIn(field, VOPD._fields)
-
-if __name__ == "__main__":
-  unittest.main()