Files
ROCm/scripts/amd/tikzplot.tex
Lixun Zhang 670ae8054d Add a cute tool to plot blocked, dotOperand, and mfma layout (#407)
* Add commands to plot blocked, dotOperand, and mfma layout

* Add commands to plot LDS layout and wmma instruction layout
2023-11-29 09:35:33 -06:00

875 lines
32 KiB
TeX
Executable File

\newcommand{\drawBlockedWave}[5]{
  %%
  %% Draw the coverage of one wave under a blocked layout: one filled
  %% rectangle per thread, followed by an outline around the whole wave.
  %%
  %% Expected to exist before the call:
  %%   Wave TL: coordinate of the top-left corner of the wave
  %%   \elem:   edge length of one tensor element
  %%
  %% #1: sizePerThread[0]  --> \sizePerThreadM
  %% #2: sizePerThread[1]  --> \sizePerThreadN
  %% #3: threadsPerWarp[0] --> \threadsPerWarpM
  %% #4: threadsPerWarp[1] --> \threadsPerWarpN
  %% #5: fastest changing dim --> \order
  %%     (stored only; thread ids below always advance along N first)
  \pgfmathsetmacro{\sizePerThreadM}{#1}
  \pgfmathsetmacro{\sizePerThreadN}{#2}
  \pgfmathsetmacro{\threadsPerWarpM}{#3}
  \pgfmathsetmacro{\threadsPerWarpN}{#4}
  \pgfmathsetmacro{\order}{#5}
  \pgfmathsetmacro{\waveSizeM}{\sizePerThreadM*\threadsPerWarpM}
  \pgfmathsetmacro{\waveSizeN}{\sizePerThreadN*\threadsPerWarpN}
  %% The number of threads follows from the thread grid instead of being
  %% fixed at 64, so the per-thread boxes always tile the wave outline.
  %% The truncating variant keeps the loop bound an integer.
  \pgfmathtruncatemacro{\maxTid}{\threadsPerWarpM*\threadsPerWarpN-1}
  \foreach \tid in {0,...,\maxTid}{
    %% Row-major position of the thread inside the thread grid
    \pgfmathsetmacro{\tidM}{int(\tid/\threadsPerWarpN)}
    \pgfmathsetmacro{\tidN}{mod(\tid,\threadsPerWarpN)}
    \coordinate (Thread TL) at ($(Wave TL)+(\tidN*\sizePerThreadN*\elem, -\tidM*\sizePerThreadM*\elem)$);
    %% The blue tint gets stronger with the thread row
    \pgfmathsetmacro{\ratio}{\tidM*10}
    \ifthenelse{\tid = 0}{
      %% Thread 0 is highlighted in red
      \draw [line width = 0.01mm, fill=red] (Thread TL)
      rectangle ++(\sizePerThreadN*\elem, -\sizePerThreadM*\elem);
    }{
      \draw [line width = 0.01mm, fill=blue!\ratio!white] (Thread TL)
      rectangle ++(\sizePerThreadN*\elem, -\sizePerThreadM*\elem);
    }
  }
  %% Outline of the whole wave
  \draw (Wave TL) rectangle ++(\waveSizeN*\elem, -\waveSizeM*\elem);
}
\newcommand{\drawBlockedCTA}[7]{
  %%
  %% Render one CTA of a blocked layout: a per-thread view drawn once at
  %% the CTA origin, a labelled box for every wave, and a thick outline
  %% around the CTA.
  %%
  %% Must exist before the call:
  %%   CTA TL: coordinate of the CTA's top-left corner
  %%   \elem:  edge length of one tensor element
  %%   \scale: scaling factor applied to the wave labels
  %%
  %% #1: sizePerThread[0]  --> \sizePerThreadM
  %% #2: sizePerThread[1]  --> \sizePerThreadN
  %% #3: threadsPerWarp[0] --> \threadsPerWarpM
  %% #4: threadsPerWarp[1] --> \threadsPerWarpN
  %% #5: warpsPerCTA[0]    --> \warpsPerCTAM
  %% #6: warpsPerCTA[1]    --> \warpsPerCTAN
  %% #7: fastest changing dim --> \order
  %%     (1: wave ids advance along N first; anything else: along M first)
  \pgfmathsetmacro{\sizePerThreadM}{#1}
  \pgfmathsetmacro{\sizePerThreadN}{#2}
  \pgfmathsetmacro{\threadsPerWarpM}{#3}
  \pgfmathsetmacro{\threadsPerWarpN}{#4}
  \pgfmathsetmacro{\warpsPerCTAM}{#5}
  \pgfmathsetmacro{\warpsPerCTAN}{#6}
  \pgfmathsetmacro{\order}{#7}
  %% Extent of one wave and of the whole CTA, counted in elements
  \pgfmathsetmacro{\waveSizeM}{\sizePerThreadM*\threadsPerWarpM}
  \pgfmathsetmacro{\waveSizeN}{\sizePerThreadN*\threadsPerWarpN}
  \pgfmathsetmacro{\CTASizeM}{\sizePerThreadM*\threadsPerWarpM*\warpsPerCTAM}
  \pgfmathsetmacro{\CTASizeN}{\sizePerThreadN*\threadsPerWarpN*\warpsPerCTAN}
  \pgfmathsetmacro{\maxWaveId}{\warpsPerCTAM*\warpsPerCTAN-1}
  %% Per-thread view, anchored at the CTA's top-left corner
  \coordinate (Wave TL) at (CTA TL);
  \drawBlockedWave{\sizePerThreadM}{\sizePerThreadN}{\threadsPerWarpM}{\threadsPerWarpN}{\order}
  %% One labelled box per wave
  \foreach \waveId in {0,...,\maxWaveId}{
    \ifthenelse{\order=1}
    {
      %% N is the fastest dim: fill a row of waves first, upright label
      \pgfmathsetmacro{\waveCoordM}{int(\waveId/\warpsPerCTAN)}
      \pgfmathsetmacro{\waveCoordN}{mod(\waveId,\warpsPerCTAN)}
      \pgfmathsetmacro{\rot}{0}
    }{
      %% M is the fastest dim: fill a column of waves first, rotated label
      \pgfmathsetmacro{\waveCoordM}{mod(\waveId,\warpsPerCTAM)}
      \pgfmathsetmacro{\waveCoordN}{int(\waveId/\warpsPerCTAM)}
      \pgfmathsetmacro{\rot}{90}
    }
    \coordinate (Wave TL) at ($(CTA TL)+(\waveCoordN*\waveSizeN*\elem, -\waveCoordM*\waveSizeM*\elem)$);
    \draw [ultra thin] (Wave TL) rectangle ++(\waveSizeN*\elem, -\waveSizeM*\elem)
    node [pos=.5, scale=.6*\scale, inner sep=0, fill=white, rotate=\rot] {wave\waveId};
  }
  %% CTA outline
  \draw [thick] (CTA TL) rectangle ++(\CTASizeN*\elem, -\CTASizeM*\elem);
}
\newcommand{\drawBlockedTensor}[8]{
  %%
  %% Tile a whole tensor with CTAs of a blocked layout described by
  %%   sizePerThread[2], threadsPerWarp[2], warpsPerCTA[2], order[2]
  %% and add dimension labels plus a zoomed-in view of thread 0's elements.
  %%
  %% Must exist before the call:
  %%   TL:     coordinate of the tensor's top-left corner
  %%   \elem:  edge length of one tensor element
  %%   \scale: scaling factor applied to the labels
  %%
  %% #1: tensorShape[0]    --> \M
  %% #2: tensorShape[1]    --> \N
  %% #3: sizePerThread[0]  --> \sizePerThreadM
  %% #4: sizePerThread[1]  --> \sizePerThreadN
  %% #5: threadsPerWarp[0] --> \threadsPerWarpM
  %%     threadsPerWarp[1] is not an argument; it is derived as
  %%     64/threadsPerWarp[0]
  %% #6: warpsPerCTA[0]    --> \warpsPerCTAM
  %% #7: warpsPerCTA[1]    --> \warpsPerCTAN
  %% #8: fastest changing dim --> \order
  \pgfmathsetmacro{\M}{#1}
  \pgfmathsetmacro{\N}{#2}
  \pgfmathsetmacro{\sizePerThreadM}{#3}
  \pgfmathsetmacro{\sizePerThreadN}{#4}
  \pgfmathsetmacro{\threadsPerWarpM}{#5}
  \pgfmathsetmacro{\warpsPerCTAM}{#6}
  \pgfmathsetmacro{\warpsPerCTAN}{#7}
  \pgfmathsetmacro{\order}{#8}
  \pgfmathsetmacro{\threadsPerWarpN}{64/\threadsPerWarpM}
  %% CTA extent in elements, and how many CTAs fit along each dim
  \pgfmathsetmacro{\CTASizeM}{\sizePerThreadM*\threadsPerWarpM*\warpsPerCTAM}
  \pgfmathsetmacro{\CTASizeN}{\sizePerThreadN*\threadsPerWarpN*\warpsPerCTAN}
  \pgfmathsetmacro{\CTARepM}{\M/\CTASizeM}
  \pgfmathsetmacro{\CTARepN}{\N/\CTASizeN}
  \pgfmathsetmacro{\maxCTAId}{\CTARepM*\CTARepN-1}
  %% CTAs are placed in row-major order
  \foreach \ctaId in {0,...,\maxCTAId}{
    \pgfmathsetmacro{\ctaCoordM}{int(\ctaId/\CTARepN)}
    \pgfmathsetmacro{\ctaCoordN}{mod(\ctaId,\CTARepN)}
    \coordinate (CTA TL) at ($(TL)+(\ctaCoordN*\CTASizeN*\elem, -\ctaCoordM*\CTASizeM*\elem)$);
    \drawBlockedCTA{\sizePerThreadM}{\sizePerThreadN}{\threadsPerWarpM}{\threadsPerWarpN}{\warpsPerCTAM}{\warpsPerCTAN}{\order}
  }
  %% Dimension labels.
  %% NOTE(review): the horizontal label prints "K=" although the value is \N
  \node [scale=.7*\scale, above, rotate=90] at ($(TL)+(0, -.5*\M*\elem)$) {M=\M};
  \node [scale=.7*\scale, above] at ($(TL)+(.5*\N*\elem, 0)$) {K=\N};
  %% Zoomed-in grid (factor \zoomR) of the elements owned by thread 0,
  %% placed slightly above the tensor
  \def\zoomR{1.5}
  \coordinate (zoomin BL) at ($(TL)+(0, .3)$);
  %% horizontal grid lines
  \foreach \hl in {0,...,\sizePerThreadM}{
    \draw ($(zoomin BL)+(0, \hl*\elem*\zoomR)$) -- ++(\sizePerThreadN*\elem*\zoomR,0);
  }
  %% vertical grid lines
  \foreach \vl in {0,...,\sizePerThreadN}{
    \draw ($(zoomin BL)+(\vl*\elem*\zoomR, 0)$) -- ++(0, \sizePerThreadM*\elem*\zoomR);
  }
  \node [scale=.6*\scale, left] at ($(zoomin BL)+(0, .5*\sizePerThreadM*\elem*\zoomR)$) {$t_0$};
  \node [scale=.6*\scale, right] at ($(zoomin BL)+(\sizePerThreadN*\elem*\zoomR, .5*\sizePerThreadM*\elem*\zoomR)$) {\sizePerThreadM$\times$\sizePerThreadN};
  %% Dotted guides from the tensor corner to the zoomed-in grid
  \draw [densely dotted] (TL) -- (zoomin BL);
  \draw [densely dotted] ($(TL)+(\sizePerThreadN*\elem, 0)$) -- ($(zoomin BL)+(\sizePerThreadN*\elem*\zoomR, 0)$);
  %% Re-draw thread 0's patch in red on top of everything else
  \draw [fill=red] (TL) rectangle ++(\sizePerThreadN*\elem, -\sizePerThreadM*\elem);
}
\newcommand{\drawBlockMFMALayoutLarge}[2]{
  %%
  %% Draw the output layout of a single MFMA_32x32x8xf16 block as
  %% 4 groups x 2 thread groups x 32 threads, each thread owning a
  %% 4-element vector, with a thick 32x32 outline on top.
  %%
  %% Must exist before the call:
  %%   block TL: coordinate of the block's top-left corner
  %%   \elem:    edge length of one tensor element
  %%   \scale:   scaling factor for the labels (only used when #2 is not 0)
  %%
  %% #1: 1 for mfma.trans, 0 for normal mfma
  %% #2: verbose. 0 draws plain boxes; otherwise each box is labelled
  %%     with its thread id
  \pgfmathsetmacro{\trans}{#1}
  \pgfmathsetmacro{\nonTrans}{1-#1}
  \pgfmathsetmacro{\verbose}{#2}
  %% \trans and \nonTrans act as 0/1 switches inside the coordinate
  %% expressions: exactly one of the two terms of each sum is non-zero.
  \foreach \iVec in {0,1,2,3} {
    %% Each group spans 2*4 elements: downwards when not transposed,
    %% to the right when transposed
    \coordinate (wave TL) at ($(block TL)+(\trans*\iVec*2*4*\elem, -\nonTrans*\iVec*2*4*\elem)$);
    %% Thread group 0 is blue (t0..t31), thread group 1 is orange (t32..t63)
    \foreach \col/\tg in {blue/0,orange/1}{
      \foreach \tid in {0,...,31} {
        %% Tint strength grows with the thread id
        \pgfmathsetmacro{\ratio}{\tid*2.5+15}
        \ifthenelse{\verbose=0}{
          %% Plain box
          \draw [line width=0.005mm, fill=\col!\ratio!white]
          ($(wave TL)+(\nonTrans*\tid*\elem+\tg*\trans*4*\elem, -\trans*\tid*\elem-\tg*\nonTrans*4*\elem)$)
          rectangle ++(\nonTrans*\elem+\trans*4*\elem, -\nonTrans*4*\elem-\trans*\elem);
        }{
          %% Same box, labelled with the thread id
          \pgfmathsetmacro{\drawTid}{int(\tid+\tg*32)}
          \draw [line width=0.005mm, fill=\col!\ratio!white]
          ($(wave TL)+(\nonTrans*\tid*\elem+\tg*\trans*4*\elem, -\trans*\tid*\elem-\tg*\nonTrans*4*\elem)$)
          rectangle ++(\nonTrans*\elem+\trans*4*\elem, -\nonTrans*4*\elem-\trans*\elem)
          node [pos=.5, scale=.35*\scale, rotate=90*\nonTrans] {t\drawTid};
        }
      }
    }
  }
  %% Outline of the 32x32 block
  \draw [thick] (block TL) rectangle ++(32*\elem, -32*\elem);
}
\newcommand{\drawTensorMFMALayout}[6]{
  %%
  %% Tile a tensor with CTA repetitions of an mfma layout. Every CTA gets
  %% a detailed block view at its origin, one labelled box per wave and
  %% an extra-thick outline.
  %%
  %% Must exist before the call:
  %%   C TL:   coordinate of the tensor's top-left corner
  %%   TL:     any previously defined coordinate; it is saved on entry,
  %%           temporarily aliased to C TL, and restored on exit
  %%   \elem:  edge length of one tensor element
  %%   \scale: scaling factor applied to the wave labels
  %%
  %% #1: M
  %% #2: N
  %% #3: MFMA nonKDim
  %% #4: warpsPerCTA[0]
  %% #5: warpsPerCTA[1]
  %% #6: 1 for mfma.trans, 0 for normal mfma
  \pgfmathsetmacro{\tensorShapeH}{#1}
  \pgfmathsetmacro{\tensorShapeW}{#2}
  \pgfmathsetmacro{\mfmaNonKDim}{#3}
  \pgfmathsetmacro{\warpsPerCTAH}{#4}
  \pgfmathsetmacro{\warpsPerCTAW}{#5}
  \pgfmathsetmacro{\mfmaTrans}{#6}
  %% Save the caller's TL and work relative to C TL
  \coordinate (old TL) at (TL);
  \coordinate (TL) at (C TL);
  %% CTA repetitions along each dim, and the extent of one CTA
  \pgfmathsetmacro{\CTARepH}{\tensorShapeH/\mfmaNonKDim/\warpsPerCTAH}
  \pgfmathsetmacro{\CTARepW}{\tensorShapeW/\mfmaNonKDim/\warpsPerCTAW}
  \pgfmathsetmacro{\maxCTAId}{\CTARepH*\CTARepW-1}
  \pgfmathsetmacro{\maxWaveId}{\warpsPerCTAH*\warpsPerCTAW-1}
  \pgfmathsetmacro{\CTASizeH}{\warpsPerCTAH*\mfmaNonKDim}
  \pgfmathsetmacro{\CTASizeW}{\warpsPerCTAW*\mfmaNonKDim}
  %% CTAs are placed in row-major order
  \foreach \ctaId in {0,...,\maxCTAId}{
    \pgfmathsetmacro{\ctaCoordH}{int(\ctaId/\CTARepW)}
    \pgfmathsetmacro{\ctaCoordW}{mod(\ctaId,\CTARepW)}
    \coordinate (CTA TL) at ($(TL)+(\ctaCoordW*\CTASizeW*\elem, -\ctaCoordH*\CTASizeH*\elem)$);
    %% Detailed (non-verbose) block view at the CTA origin
    \coordinate (block TL) at (CTA TL);
    \drawBlockMFMALayoutLarge{\mfmaTrans}{0}
    %% Every wave, in row-major order, is a plain labelled rectangle
    \foreach \waveId in {0,...,\maxWaveId}{
      \pgfmathsetmacro{\waveCoordH}{int(\waveId/\warpsPerCTAW)}
      \pgfmathsetmacro{\waveCoordW}{mod(\waveId,\warpsPerCTAW)}
      \coordinate (block TL) at ($(CTA TL)+(\waveCoordW*\mfmaNonKDim*\elem, -\waveCoordH*\mfmaNonKDim*\elem)$);
      \draw [ultra thin] (block TL) rectangle ++(\mfmaNonKDim*\elem, -\mfmaNonKDim*\elem)
      node [scale=.7*\scale, pos=.5, fill=white, inner sep=0] {wave\waveId};
    }
    %% Outline of this CTA repetition
    \draw [ultra thick] (CTA TL) rectangle ++(\CTASizeW*\elem, -\CTASizeH*\elem);
  }
  %% Hand the caller's TL back
  \coordinate (TL) at (old TL);
}
\newcommand{\drawMFMAOperand}[4]{
  %%
  %% Draw a single mfma operand as 2 thread groups x 32 threads, each
  %% thread owning a vector of kpack elements.
  %%
  %% Must exist before the call:
  %%   mfma op TL: coordinate of the operand's top-left corner
  %%   \elem:      edge length of one tensor element
  %%   \scale:     scaling factor for the labels (only used when #4 is not 0)
  %%   \opColorAL, \opColorAR, \opColorBL, \opColorBR: fill colors of the
  %%               two thread groups of opA and opB
  %%
  %% #1: mfmNonKDim (stored in \nonKDim; not used by the drawing below)
  %% #2: kpack
  %% #3: 0 for opA and 1 for opB
  %% #4: verbose. 0 draws plain boxes; otherwise each box is labelled
  %%     with its thread id
  \pgfmathsetmacro{\nonKDim}{#1}
  \pgfmathsetmacro{\kpack}{#2}
  %% Naming caveat: \opIdxA holds #3, so it is 1 for opB and 0 for opA;
  %% \opIdxB is its complement. Both act as 0/1 switches in the
  %% coordinate expressions below.
  \pgfmathsetmacro{\opIdxA}{#3}
  \pgfmathsetmacro{\opIdxB}{1-\opIdxA}
  \pgfmathsetmacro{\verbose}{#4}
  %% Select the color pair that belongs to this operand
  \ifthenelse{\opIdxA = 0}{
    \def\opColorL{\opColorAL}
    \def\opColorR{\opColorAR}
  }{
    \def\opColorL{\opColorBL}
    \def\opColorR{\opColorBR}
  }
  %% Thread group 0 covers t0..t31, thread group 1 covers t32..t63
  \foreach \col/\tg in {\opColorL/0,\opColorR/1}{
    \foreach \tid in {0,...,31} {
      \ifthenelse{\verbose=0}{
        %% Plain box
        \draw [line width=0.005mm, fill=\col]
        ($(mfma op TL)+(\tg*\kpack*\elem*\opIdxB+\tid*\elem*\opIdxA, -\tid*\elem*\opIdxB-\tg*\kpack*\elem*\opIdxA)$)
        rectangle ++(\kpack*\elem*\opIdxB + \elem*\opIdxA, -\elem*\opIdxB-\kpack*\elem*\opIdxA);
      }{
        %% Same box, labelled with the thread id
        \pgfmathsetmacro{\drawTid}{int(\tid+\tg*32)}
        \draw [line width=0.005mm, fill=\col]
        ($(mfma op TL)+(\tg*\kpack*\elem*\opIdxB+\tid*\elem*\opIdxA, -\tid*\elem*\opIdxB-\tg*\kpack*\elem*\opIdxA)$)
        rectangle ++(\kpack*\elem*\opIdxB + \elem*\opIdxA, -\elem*\opIdxB-\kpack*\elem*\opIdxA)
        node [pos=.5, scale=.35*\scale, rotate=90*\opIdxA] {t\drawTid};
      }
    }
  }
}
\newcommand{\drawWaveOperand}[4]{
  %%
  %% Draw the slice of an operand tensor that one wave consumes, as a
  %% sequence of mfma operands laid out along the K dimension.
  %%
  %% Must exist before the call:
  %%   Op TL: coordinate of the operand's top-left corner
  %%   \elem: edge length of one tensor element
  %% Side effect: the coordinate (TL) is overwritten with (Op TL) and is
  %% not restored.
  %%
  %% #1: K
  %% #2: mfmNonKDim
  %% #3: kpack
  %% #4: 0 for opA and 1 for opB
  \pgfmathsetmacro{\K}{#1}
  \pgfmathsetmacro{\nonKDim}{#2}
  \pgfmathsetmacro{\kpack}{#3}
  %% \opIdx and \opIdxOther act as 0/1 switches: opA repeats to the
  %% right, opB repeats downwards
  \pgfmathsetmacro{\opIdx}{#4}
  \pgfmathsetmacro{\opIdxOther}{1-\opIdx}
  \coordinate (TL) at (Op TL);
  %% One mfma operand spans 2*kpack elements along K
  \pgfmathsetmacro{\numKRep}{\K/\kpack/2}
  \pgfmathsetmacro{\maxKRepId}{\numKRep-1}
  \foreach \repId in {0,...,\maxKRepId}{
    \coordinate (mfma op TL) at ($(TL)+(\repId*2*\kpack*\elem*\opIdxOther, -\repId*2*\kpack*\elem*\opIdx)$);
    %% Non-verbose operand view, framed by a thick rectangle
    \drawMFMAOperand{\nonKDim}{\kpack}{\opIdx}{0}
    \draw [thick] (mfma op TL) rectangle
    ++(2*\kpack*\elem*\opIdxOther+\nonKDim*\opIdx*\elem, -\nonKDim*\opIdxOther*\elem-2*\kpack*\elem*\opIdx);
  }
}
\newcommand{\drawDotOperands}[7]{
  %%
  %% Draw the two operand tensors of a dot: A (M x K) tiled downwards
  %% and B (K x N) tiled to the right.
  %%
  %% Must exist before the call:
  %%   A TL, B TL: coordinates of the top-left corners of A and B
  %%   \elem:      edge length of one tensor element
  %%
  %% #1: M
  %% #2: N
  %% #3: K
  %% #4: MFMA nonKDim
  %% #5: warpsPerCTA[0]
  %% #6: warpsPerCTA[1]
  %% #7: kpack
  %%
  %% The extent of one wave along M (for A) and along N (for B) is taken
  %% from \mfmaNonKDim rather than a literal 32, so the operand tiling
  %% agrees with the result tiling drawn by \drawTensorMFMALayout. For
  %% nonKDim = 32 the picture is unchanged.
  \pgfmathsetmacro{\M}{#1}
  \pgfmathsetmacro{\N}{#2}
  \pgfmathsetmacro{\K}{#3}
  \pgfmathsetmacro{\mfmaNonKDim}{#4}
  \pgfmathsetmacro{\warpsPerCTAM}{#5}
  \pgfmathsetmacro{\warpsPerCTAN}{#6}
  \pgfmathsetmacro{\kpack}{#7}
  %% operand A: CTA repetitions and waves are stacked along M
  \pgfmathsetmacro{\CTARepM}{\M/\warpsPerCTAM/\mfmaNonKDim}
  \pgfmathsetmacro{\maxCTAIdM}{\CTARepM-1}
  \pgfmathsetmacro{\maxWaveId}{\warpsPerCTAM-1}
  \foreach \ctaId in {0,...,\maxCTAIdM}{
    \coordinate (CTA TL) at ($(A TL)+(0, -\ctaId*\warpsPerCTAM*\mfmaNonKDim*\elem)$);
    \foreach \waveId in {0,...,\maxWaveId}{
      \coordinate (wave TL) at ($(CTA TL)+(0, -\waveId*\mfmaNonKDim*\elem)$);
      \draw [ultra thin] (wave TL) rectangle ++(\K*\elem, -\mfmaNonKDim*\elem);
    }
    %% Only draw the detailed view of the first wave in CTA
    \coordinate (Op TL) at (CTA TL);
    \drawWaveOperand{\K}{\mfmaNonKDim}{\kpack}{0}
    %% Draw the outline of each CTA rep
    \draw [ultra thick] (CTA TL) rectangle ++(\K*\elem, -\warpsPerCTAM*\mfmaNonKDim*\elem);
  }
  \draw [ultra thin] (A TL) rectangle ++(\K*\elem, -\M*\elem);
  %% operand B: CTA repetitions and waves are placed side by side along N
  \pgfmathsetmacro{\CTARepN}{\N/\warpsPerCTAN/\mfmaNonKDim}
  \pgfmathsetmacro{\maxCTAIdN}{\CTARepN-1}
  \pgfmathsetmacro{\maxWaveId}{\warpsPerCTAN-1}
  \foreach \ctaId in {0,...,\maxCTAIdN}{
    \coordinate (CTA TL) at ($(B TL)+(\ctaId*\warpsPerCTAN*\mfmaNonKDim*\elem, 0)$);
    \foreach \waveId in {0,...,\maxWaveId}{
      \coordinate (wave TL) at ($(CTA TL)+(\waveId*\mfmaNonKDim*\elem ,0)$);
      \draw [ultra thin] (wave TL) rectangle ++(\mfmaNonKDim*\elem, -\K*\elem);
    }
    %% Only draw the detailed view of the first wave in CTA
    \coordinate (Op TL) at (CTA TL);
    \drawWaveOperand{\K}{\mfmaNonKDim}{\kpack}{1}
    %% Draw the outline of each CTA rep
    \draw [ultra thick] (CTA TL) rectangle ++(\warpsPerCTAN*\mfmaNonKDim*\elem, -\K*\elem);
  }
  \draw [ultra thin] (B TL) rectangle ++(\N*\elem, -\K*\elem);
}
\newcommand{\drawDot}[8]{
  %%
  %% Draw the complete picture of C = dot A, B: both operands, the
  %% result tensor in mfma layout, and all labels.
  %%
  %% Must exist before the call:
  %%   C TL:   coordinate of the result tensor's top-left corner
  %%   \elem:  edge length of one tensor element
  %%   \scale: scaling factor applied to the labels
  %%
  %% #1: M
  %% #2: N
  %% #3: K
  %% #4: MFMA nonKDim
  %% #5: warpsPerCTA[0]
  %% #6: warpsPerCTA[1]
  %% #7: 1 for mfma.trans, 0 for normal mfma
  %% #8: kpack
  \pgfmathsetmacro{\M}{#1}
  \pgfmathsetmacro{\N}{#2}
  \pgfmathsetmacro{\K}{#3}
  \pgfmathsetmacro{\mfmaNonKDim}{#4}
  \pgfmathsetmacro{\warpsPerCTAM}{#5}
  \pgfmathsetmacro{\warpsPerCTAN}{#6}
  \pgfmathsetmacro{\mfmaTrans}{#7}
  \pgfmathsetmacro{\kpack}{#8}
  %% \kdim is the number printed by the kpack labels at the bottom
  \pgfmathsetmacro{\kdim}{int(2*\kpack)}
  %% A sits to the left of C and B above C, each separated by \gap
  \pgfmathsetmacro{\gap}{\elem*20}
  \coordinate (A TL) at ($(C TL)+(-\gap-\K*\elem, 0)$);
  \coordinate (B TL) at ($(C TL)+(0, \gap+\K*\elem)$);
  \drawDotOperands{\M}{\N}{\K}{\mfmaNonKDim}{\warpsPerCTAM}{\warpsPerCTAN}{\kpack}
  \drawTensorMFMALayout{\M}{\N}{\mfmaNonKDim}{\warpsPerCTAM}{\warpsPerCTAN}{\mfmaTrans}
  %% Dimension labels of A and B
  \node [scale=\scale, above] at ($(A TL)+(.5*\K*\elem, 0)$) {K=\K};
  \node [scale=\scale, above, rotate=90] at ($(A TL)+(0, -.5*\M*\elem)$) {M=\M};
  \node [scale=\scale, above, rotate=90] at ($(B TL)+(0, -.5*\K*\elem)$) {K=\K};
  \node [scale=\scale, above] at ($(B TL)+(.5*\N*\elem, 0)$) {N=\N};
  %% Tensor names at the top-left corners
  \node [scale=\scale, above left] at (A TL) {A};
  \node [scale=\scale, above left] at (B TL) {B};
  \node [scale=\scale, above left] at (C TL) {C};
  %% nonKDim labels, centered on the first wave tile of A and of B
  \node [scale=.8*\scale, left] at ($(A TL)+(0, -.5*\mfmaNonKDim*\elem)$) {\mfmaNonKDim};
  \node [scale=.8*\scale, above] at ($(B TL)+(.5*\mfmaNonKDim*\elem, 0)$) {\mfmaNonKDim};
  %% kpack labels: placed kpack elements into the operand, printing \kdim
  \node [scale=.8*\scale, above] at ($(A TL)+(\kpack*\elem, 0)$) {\kdim};
  \node [scale=.8*\scale, left] at ($(B TL)+(0, -\kpack*\elem)$) {\kdim};
}
%% Palette of 16 color names, written as a pgfmath array so that it can
%% be indexed as \Colors[i] inside \pgfmathsetmacro.
%% NOTE(review): the capitalised names (YellowGreen, Maroon, Green,
%% BlueGreen) presumably come from xcolor's dvipsnames set -- confirm the
%% including document loads it.
\newcommand{\Colors}{{
  "red", "YellowGreen", "blue", "Maroon",
  "orange", "cyan", "magenta", "brown",
  "teal", "purple", "gray", "Green",
  "BlueGreen", "violet", "olive", "darkgray",
}}
\newcommand{\drawTensorLayoutGlobalMem}{
  %%
  %% Draw tensor layout in global memory without any swizzling
  %%
  %% This macro takes no arguments. Everything it needs must be defined
  %% before the call:
  %%   TL:      coordinate of the tensor's top-left corner
  %%   \elem:   edge length of one tensor element
  %%   \scale:  scaling factor applied to the labels
  %%   \Colors: array of 16 color names
  %%   \M:      number of rows of the tensor (only printed in a label)
  %%   \K:      number of columns of the tensor
  %%   \vec:    number of elements in a group (vector)
  \pgfmathsetmacro{\numVecK}{\K/\vec}
  %% Vectors are drawn in detail for the first 16 rows only
  \pgfmathsetmacro{\maxVecId}{16*\numVecK-1}
  %% The outline of the tensor spans \drawM rows
  \pgfmathsetmacro{\drawM}{20}
  \draw (TL) rectangle ++(\K*\elem, -\drawM*\elem);
  %% Draw detailed vec view of the tensor, in row-major order
  \foreach \vecId in {0,...,\maxVecId}{
    \pgfmathsetmacro{\vecCoordM}{int(\vecId/\numVecK)}
    \pgfmathsetmacro{\vecCoordK}{mod(\vecId,\numVecK)}
    \coordinate (vec TL) at ($(TL)+(\vecCoordK*\vec*\elem, -\vecCoordM*\elem)$);
    %% The color is chosen by the vec column (mod 16); every further
    %% block of 16 columns is tinted 40 percent lighter
    \pgfmathsetmacro{\colorIdxK}{int(mod(\vecCoordK,16))}
    \pgfmathsetmacro{\vecColor}{\Colors[\colorIdxK]}
    \pgfmathsetmacro{\ratio}{100-floor(\vecCoordK/16)*40}
    \draw [ultra thin, fill=\vecColor!\ratio!white] (vec TL) rectangle ++(\vec*\elem, -\elem)
    node [pos=.5, scale=.6*\scale, white] {m\vecCoordM};
  }
  %% M and K dim
  \node [scale=\scale, rotate=90, above] at ($(TL)+(0, -.5*\drawM*\elem-8*\elem)$) {M=\M};
  \node [scale=.8*\scale, left] at ($(TL)+(0, -.5*16*\elem)$) {16};
  \node [scale=\scale, above] at ($(TL)+(.5*\K*\elem, 0)$) {K=\K};
  %% Zoomed-in view (factor \vecR) of a single vec, with its size label
  \def\vecR{1.5}
  \coordinate (vec TL) at ($(TL)+(-.25*\vec*\elem, 3*\elem*\vecR)$);
  \pgfmathsetmacro{\maxVec}{\vec-1}
  \foreach \vecId in {0,...,\maxVec}{
    \draw ($(vec TL)+(\vecId*\elem*\vecR, 0)$) rectangle ++(\elem*\vecR, -\elem*\vecR);
  }
  %% Dotted guides from the first vec of the tensor to the zoomed-in view
  \draw [densely dotted] (TL) -- ($(vec TL)+(0, -\elem*\vecR)$);
  \draw [densely dotted] ($(TL)+(\vec*\elem, 0)$) -- ($(vec TL)+(\vec*\elem*\vecR, -\elem*\vecR)$);
  \node [scale=.8*\scale, above] at ($(vec TL)+(.5*\vec*\elem*\vecR, 0)$) {vec=\vec};
}
\newcommand{\drawLDSLayoutTritonSwizzling}[2]{
%%
%% Draw the layout of a tensor in LDS, either unchanged, swizzled or
%% padded, and optionally highlight the vecs touched by an access pattern.
%%
%% The following names must be defined before the call:
%% TL: top-left coordinates of the drawing
%% \elem: edge length of one element
%% \scale: scaling factor applied to the labels
%% \Colors: an array of 16 colors
%% \bitwiseXor: two-argument helper used to compute the swizzled column
%% (not defined in this macro)
%% \M: number of rows of the tensor
%% \K: number of columns of the tensor
%% \vec: number of elements in a group
%%
%% #1: hasSwizzle, 0 means no swizzling and no padding,
%% 1 means optimal swizzling
%% 2 means padding
%% #2: access mode, 0 means draw nothing, 1 means ds_read, 2 means ds_write
%% For ds_write access, the following variables are assumed to be pre defined
%% \sizePerThreadK
%% \sizePerThreadM
%% \threadsPerWarpK
\pgfmathsetmacro{\hasSwizzle}{#1}
\pgfmathsetmacro{\accessMode}{#2}
\pgfmathsetmacro{\numVecK}{\K/\vec}
%% Assuming fp16 data type
%% \LDSK: number of elements in one drawn LDS row
\pgfmathsetmacro{\LDSK}{64}
\pgfmathsetmacro{\numLDSVec}{\LDSK/\vec}
\pgfmathsetmacro{\swizzleK}{max(\LDSK, \K)}
%% \LDSM: number of LDS rows the whole tensor occupies (only printed in a label)
\pgfmathsetmacro{\LDSM}{int(\M/\LDSK*\K)}
\ifthenelse{\accessMode = 2}{
%% \accessMode == 2, draw the vecs of 8 tensor rows
\pgfmathsetmacro{\maxVecId}{8*\numVecK-1}
\pgfmathsetmacro{\drawM}{8*\K/\LDSK+4}
}{
%% \accessMode == 0 or 1, draw the vecs of 16 tensor rows
\pgfmathsetmacro{\maxVecId}{16*\numVecK-1}
\pgfmathsetmacro{\drawM}{16*\K/\LDSK+4}
}
%% Parameters used for swizzling
\pgfmathsetmacro{\numVecSwizzleK}{\swizzleK/\vec}
%% perPhase = ceil(LDSK / K)
%% The number of the rows of the tensor that can share the same swizzling pattern
\pgfmathsetmacro{\perPhase}{ceil(\LDSK/\K)}
%% maxPhase: the total number of different swizzling patterns
\ifthenelse{\hasSwizzle=0}{
%% When swizzling is disabled
\pgfmathsetmacro{\maxPhase}{1}
}{
%% When vec is small enough, we want 16/perPhase different swizzling patterns
%% When vec is large, we can only have 64 / \vec different swizzling pattern at most
\pgfmathsetmacro{\maxPhase}{min(16/\perPhase,64/\vec)}
}
%% Draw the outline of the LDS region, \drawM rows high
\draw (TL) rectangle ++(\LDSK*\elem, -\drawM*\elem);
%% Draw detailed vec view of LDS, one iteration per vec of the tensor
\foreach \vecId in {0,...,\maxVecId}{
%% Row-major position of the vec in the original tensor
\pgfmathsetmacro{\vecCoordM}{int(\vecId/\numVecK)}
\pgfmathsetmacro{\vecCoordK}{int(mod(\vecId,\numVecK))}
\pgfmathsetmacro{\rawPhase}{floor(\vecId/\numVecSwizzleK)}
%% vec color: chosen by the vec column (mod 16); every further block of
%% 16 columns is tinted 40 percent lighter
\pgfmathsetmacro{\colorIdxK}{int(mod(\vecCoordK,16))}
%% NOTE(review): \colorIdxM is computed but never used below
\pgfmathsetmacro{\colorIdxM}{mod(\vecCoordM,16)}
\pgfmathsetmacro{\ratio}{100-floor(\vecCoordK/16)*40}
\pgfmathsetmacro{\vecColor}{\Colors[\colorIdxK]}
%% old vec coordinates (position in the original tensor)
\coordinate (vec TL) at ($(TL)+(\vecCoordK*\vec*\elem, -\vecCoordM*\elem)$);
%% new vec coordinates in LDS by swizzling
%% The following two conditions correspond to the relation between \LDSK and \K
\ifthenelse{\LDSK < \K}{
%% A tensor row is wider than an LDS row: it wraps over several LDS rows
\pgfmathsetmacro{\vecLDSM}{\vecCoordM*\K/\LDSK+floor(\vecCoordK*\vec/\LDSK)}
\pgfmathsetmacro{\vecLDSK}{int(mod(\vecCoordK, \LDSK/\vec))}
}{
%% Otherwise \perPhase tensor rows are packed into one LDS row
\pgfmathsetmacro{\vecLDSM}{floor(\vecCoordM/\perPhase)}
\pgfmathsetmacro{\vecLDSK}{int(\vecCoordK+mod(\vecCoordM,\perPhase)*\numVecK)}
}
%% The phase cycles through \maxPhase different patterns
\pgfmathsetmacro{\phase}{int(mod(\rawPhase, \maxPhase))}
%% Compute the swizzled col id
\pgfmathsetmacro{\vecLDSKSwizzled}{\bitwiseXor{\vecLDSK}{\phase}}
%% new vec coordinates in LDS by padding
%% One pad is added per \numLDSVec vecs; with 2 elements per bank
%% (fp16 assumption above) a vec starts at bank \vec/2*\vecId+\numPads
\pgfmathsetmacro{\numPads}{floor(\vecId/\numLDSVec)}
\pgfmathsetmacro{\bankId}{\vec/2*\vecId+\numPads}
\pgfmathsetmacro{\vecPadM}{int(\bankId/32)}
\pgfmathsetmacro{\vecPadK}{int(mod(\bankId,32))}
\ifthenelse{\hasSwizzle = 2}{
%% vec coordinates by padding
\coordinate (new vec TL) at ($(TL)+(\vecPadK*2*\elem, -\vecPadM*\elem)$);
%% Bank index of the last bank covered by this vec
\pgfmathsetmacro{\tailBankId}{int(\vecPadK+\vec/2-1)}
}{
%% vec coordinates by swizzling
\coordinate (new vec TL) at ($(TL)+(\vecLDSKSwizzled*\vec*\elem, -\vecLDSM*\elem)$);
%% 0 keeps the split-vec branch below from being taken
\pgfmathsetmacro{\tailBankId}{0}
}
\ifthenelse{\hasSwizzle = 2 \AND \tailBankId > 31}{
%% Padded vec runs past bank 31: draw it in two pieces, the second one
%% at the start of the next row
\pgfmathsetmacro{\nextBanks}{\tailBankId-31}
\pgfmathsetmacro{\leftBanks}{\vec/2 - \nextBanks}
\draw [ultra thin, fill=\vecColor!\ratio!white] (new vec TL) rectangle ++(\leftBanks*2*\elem, -\elem)
node [pos=.5, scale=.6*\scale, white] {m\vecCoordM};
\draw [ultra thin, fill=\vecColor!\ratio!white] ($(TL)+(0, -\vecPadM*\elem-\elem)$)
rectangle ++(\nextBanks*2*\elem, -\elem) node [pos=.5, scale=.6*\scale, white] {m\vecCoordM};
}{
%% The vec fits in one row: draw it in one piece
\draw [ultra thin, fill=\vecColor!\ratio!white] (new vec TL) rectangle ++(\vec*\elem, -\elem)
node [pos=.5, scale=.6*\scale, white] {m\vecCoordM};
}
%% ds_read
%% Highlight the elements the first 16 threads access in the first cycle
%% This is used to visualize bank conflicts
%% The highlight is a white box with a cross on the first element of
%% every vec in column 0
\ifthenelse{\accessMode = 1}{
\ifthenelse{\vecCoordK = 0}{
\draw [fill=white] (new vec TL) rectangle ++(\elem, -\elem);
\draw (new vec TL) -- ++(\elem, -\elem);
\draw ($(new vec TL)+(0, -\elem)$) -- ++(\elem, \elem);
}{}
}{}
%% Draw ds_write pattern
\ifthenelse{\accessMode = 2}{
%% First compute the coverage of the first 16 threads
%% (\covK in vecs along K, \covM in rows along M)
\pgfmathsetmacro{\covK}{min(16, \threadsPerWarpK)*\sizePerThreadK/\vec}
\pgfmathsetmacro{\covM}{ceil(16/\threadsPerWarpK)*\sizePerThreadM}
%% Check conditions for the first 16 threads
%% \vecInThread is 0 for the first vec owned by a thread
\pgfmathsetmacro{\vecInThread}{int(mod(\vecCoordK, \sizePerThreadK/\vec))}
\ifthenelse{\vecInThread=0}{
\ifthenelse{\vecCoordK<\covK \AND \vecCoordM<\covM}{
\draw [fill=white] (new vec TL) rectangle ++(\elem, -\elem);
\draw (new vec TL) -- ++(\elem, -\elem);
\draw ($(new vec TL)+(0, -\elem)$) -- ++(\elem, \elem);
}{}
}{}
}{}
%% Label the phase of each line unless padding is used: the label is
%% attached to the vec that lands in the last vec column of the row
\ifthenelse{\hasSwizzle = 2}{}{
\pgfmathsetmacro{\lastVecId}{int(64/\vec)-1}
\ifthenelse{\vecLDSKSwizzled = \lastVecId}{
\draw [ultra thin] ($(new vec TL)+(\vec*\elem, -.5*\elem)$) -- ++(\elem, 0)
node [scale=.6*\scale, right] {\phase};
}{}
}
}
%% Draw boundary of 32 banks
%% Assume fp16 data type
\foreach \bank in {0,...,31}{
\draw [ultra thin, gray] ($(TL)+(\bank*2*\elem, 0)$) -- ++(0, 2*\elem)
node [scale=.6*\scale, right, black] {\bank};
}
%% Closing boundary to the right of bank 31
\draw [ultra thin, gray] ($(TL)+(32*2*\elem, 0)$) -- ++(0, 2*\elem);
\node [scale=.6*\scale, left, black] at ($(TL)+(0, 2*\elem)$) {bank id};
\node [scale=\scale, above] at ($(TL)+(.5*\LDSK*\elem, 3*\elem)$) {LDS 32 banks};
\node [scale=\scale, rotate=90, above] at ($(TL)+(0, -.5*\drawM*\elem)$) {LDSM=\LDSM};
%% Heading of the phase column, omitted when padding is used
\ifthenelse{\hasSwizzle = 2}{}{
\node [scale=.6*\scale, above right] at($(TL)+(32*2*\elem, 0)$) {phase};
}
}
\newcommand{\drawMFMAInstr}[3]{
  %%
  %% Draw the layout of one mfma instruction with thread ids labelled:
  %% both operands, the output block, and zoomed-in vec views.
  %%
  %% Must exist before the call:
  %%   C TL:   coordinate of the output matrix's top-left corner
  %%   \elem:  edge length of one tensor element
  %%   \scale: scaling factor applied to the labels
  %%
  %% #1: mfmaNonKDim
  %% #2: kpack
  %% #3: mfmaTrans
  \pgfmathsetmacro{\mfmaNonKDim}{#1}
  \pgfmathsetmacro{\kpack}{#2}
  \pgfmathsetmacro{\mfmaTrans}{#3}
  \pgfmathsetmacro{\nonTrans}{1-#3}
  \pgfmathsetmacro{\gap}{\elem*5}
  %% Operand to the left of the output block (verbose view)
  \coordinate (mfma opA TL) at ($(C TL)+(-.5*\gap-1.2*\nonTrans*\gap-2*\kpack*\elem, 0)$);
  \coordinate (mfma op TL) at (mfma opA TL);
  \drawMFMAOperand{\mfmaNonKDim}{\kpack}{0}{1}
  %% Operand above the output block (verbose view); (mfma op TL) keeps
  %% pointing at this operand for the rest of the macro
  \coordinate (mfma op TL) at ($(C TL)+(0, 1.5*\gap+.5*\mfmaTrans*\gap+2*\kpack*\elem)$);
  \drawMFMAOperand{\mfmaNonKDim}{\kpack}{1}{1}
  %% Output block (verbose view)
  \coordinate (block TL) at (C TL);
  \drawBlockMFMALayoutLarge{\mfmaTrans}{1}
  %% Zoomed-in horizontal vec (factor \vecR) above the left operand
  \def\vecR{1.5}
  \coordinate (vec TL) at ($(mfma opA TL)+(-.25*\kpack*\elem, 3*\elem*\vecR)$);
  \pgfmathsetmacro{\maxVec}{\kpack-1}
  \foreach \vecId in {0,...,\maxVec}{
    \draw ($(vec TL)+(\vecId*\elem*\vecR, 0)$) rectangle ++(\elem*\vecR, -\elem*\vecR);
  }
  \draw [densely dotted] (mfma opA TL) -- ($(vec TL)+(0, -\elem*\vecR)$);
  \draw [densely dotted] ($(mfma opA TL)+(\kpack*\elem, 0)$) -- ($(vec TL)+(\kpack*\elem*\vecR, -\elem*\vecR)$);
  \node [scale=.8*\scale, above] at ($(vec TL)+(.5*\kpack*\elem*\vecR, 0)$) {vec=\kpack};
  %% Zoomed-in vertical vec to the left of the top operand
  \coordinate (vec TL) at ($(mfma op TL)+(-3*\elem*\vecR, .25*\kpack*\elem)$);
  \foreach \vecId in {0,...,\maxVec}{
    \draw ($(vec TL)+(0, -\vecId*\elem*\vecR)$) rectangle ++(\elem*\vecR, -\elem*\vecR);
  }
  \draw [densely dotted] (mfma op TL) -- ($(vec TL)+(\elem*\vecR,0)$);
  \draw [densely dotted] ($(mfma op TL)+(0, -\kpack*\elem)$) -- ($(vec TL)+(\elem*\vecR, -\kpack*\elem*\vecR)$);
  \node [scale=.8*\scale, above, rotate=90] at ($(vec TL)+(0, -.5*\kpack*\elem*\vecR)$) {vec=\kpack};
  %% Name of the output block
  \node [scale=\scale, below] at ($(block TL)+(.5*\mfmaNonKDim*\elem,-\mfmaNonKDim*\elem)$) {outC};
  %% Operand names and the output vec view depend on mfmaTrans
  \ifthenelse{\mfmaTrans=0}{
    %% Not transposed: the left operand is opA, the top one is opB, and
    %% the output vec of 4 is shown vertically to the left of the block
    \node [scale=\scale, below] at ($(mfma opA TL)+(\kpack*\elem, -\mfmaNonKDim*\elem)$) {opA};
    \node [scale=\scale, above] at (mfma op TL) {opB};
    \coordinate (vec TL) at ($(block TL)+(-3*\elem-\elem*\vecR, .25*4*\elem)$);
    \foreach \vecId in {0,1,2,3}{
      \draw ($(vec TL)+(0, -\vecId*\elem*\vecR)$) rectangle ++(\elem*\vecR, -\elem*\vecR);
    }
    \draw [densely dotted] (block TL) -- ++(-3*\elem, .25*4*\elem);
    \draw [densely dotted] ($(block TL)+(0, -4*\elem)$) -- ++(-3*\elem, -.25*4*\elem);
    \node [scale=.8*\scale, above, rotate=90] at ($(vec TL)+(0, -.5*4*\elem*\vecR)$) {vec=4};
    \node [scale=.8*\scale, above, align=center] at ($(block TL)+(16*\elem, 0)$) {mfmaLayout\\trans=False};
  }{
    %% Transposed: the operand names swap, and the output vec of 4 is
    %% shown horizontally above the block
    \node [scale=\scale, below] at ($(mfma opA TL)+(\kpack*\elem, -\mfmaNonKDim*\elem)$) {opB};
    \node [scale=\scale, above] at (mfma op TL) {opA};
    \coordinate (vec TL) at ($(block TL)+(-.25*4*\elem, 3*\elem+\elem*\vecR)$);
    \foreach \vecId in {0,1,2,3}{
      \draw ($(vec TL)+(\vecId*\elem*\vecR, 0)$) rectangle ++(\elem*\vecR, -\elem*\vecR);
    }
    \draw [densely dotted] (block TL) -- ++(-.25*4*\elem, 3*\elem);
    \draw [densely dotted] ($(block TL)+(4*\elem, 0)$) -- ++(.25*4*\elem, 3*\elem);
    \node [scale=.8*\scale, above] at ($(vec TL)+(.5*4*\elem*\vecR, 0)$) {vec=4};
    \node [scale=.8*\scale, above, align=center] at ($(block TL)+(16*\elem, 0)$) {mfmaLayout\\trans=True};
  }
}
\newcommand{\drawWMMAOperand}[3]{
  %%
  %% Draw one operand of a WMMA instruction as 16 strips of 16 elements,
  %% each labelled with the thread ids that hold it.
  %%
  %% Must exist before the call:
  %%   wmma op TL: coordinate of the operand matrix's top-left corner
  %%   \elem:      edge length of one tensor element
  %%   \scale:     scaling factor applied to the labels
  %%
  %% #1: opIdx. 0 for opA, 1 for opB
  %% #2: verbose (stored in \verbose but never tested here: the thread
  %%     ids are always drawn)
  %% #3: mode. 0 for w32, 1 for w64
  \pgfmathsetmacro{\isOpB}{#1}
  \pgfmathsetmacro{\isOpA}{1-\isOpB}
  \pgfmathsetmacro{\verbose}{#2}
  \pgfmathsetmacro{\isWLarge}{#3}
  %% \isOpA and \isOpB act as 0/1 switches: opA strips are horizontal
  %% and stacked downwards, opB strips are vertical and placed side by side
  \foreach \row in {0,...,15}{
    \pgfmathsetmacro{\ratio}{\row*5+15}
    \coordinate (vec TL) at ($(wmma op TL)+(\row*\isOpB*\elem, -\row*\elem*\isOpA)$);
    %% Both modes label the strip with t<row> and t<row+16>
    \pgfmathsetmacro{\tidone}{int(\row+16)}
    \ifthenelse{\isWLarge=1}{
      %% w64 additionally lists t<row+32> and t<row+48>
      \pgfmathsetmacro{\tidtwo}{int(\row+32)}
      \pgfmathsetmacro{\tidthree}{int(\row+48)}
      \draw [line width=0.005mm, fill=brown!\ratio!white] (vec TL)
      rectangle ++(16*\elem*\isOpA+\elem*\isOpB, -\elem*\isOpA-16*\elem*\isOpB)
      node [scale=0.4*\scale, pos=.5, rotate=90*\isOpB] {t\row, t\tidone, t\tidtwo, t\tidthree};
    }{
      \draw [line width=0.005mm, fill=brown!\ratio!white] (vec TL)
      rectangle ++(16*\elem*\isOpA+\elem*\isOpB, -\elem*\isOpA-16*\elem*\isOpB)
      node [scale=0.4*\scale, pos=.5, rotate=90*\isOpB] {t\row, t\tidone};
    }
  }
}
\newcommand{\drawWMMAResult}[2]{
  %%
  %% Draw the 16x16 WMMA result tensor, one box per element, labelled
  %% with the id of the thread that holds it.
  %%
  %% Must exist before the call:
  %%   C TL:    coordinate of the result's top-left corner
  %%   \elem:   edge length of one tensor element
  %%   \scale:  scaling factor applied to the labels
  %%   \Colors: array of color names
  %%
  %% #1: verbose (stored in \verbose but never tested here: the thread
  %%     ids are always drawn)
  %% #2: mode. 0 for w32, 1 for w64
  \pgfmathsetmacro{\verbose}{#1}
  \pgfmathsetmacro{\isWLarge}{#2}
  \pgfmathsetmacro{\numElem}{256}
  \pgfmathsetmacro{\maxElemId}{\numElem-1}
  %% Elements are visited in row-major order, 16 per row
  \foreach \elemId in {0,...,\maxElemId}{
    \pgfmathsetmacro{\colId}{mod(\elemId,16)}
    \pgfmathsetmacro{\rowId}{floor(\elemId/16)}
    %% Thread ids wrap around after 64 (w64) or 32 (w32) elements;
    %% \tid is the integer used in the label, \laneId feeds the color
    \ifthenelse{\isWLarge=1}{
      \pgfmathsetmacro{\tid}{int(mod(\elemId,64))}
      \pgfmathsetmacro{\laneId}{mod(\elemId,64)}
    }{
      \pgfmathsetmacro{\tid}{int(mod(\elemId,32))}
      \pgfmathsetmacro{\laneId}{mod(\elemId,32)}
    }
    %% Every group of 16 consecutive lanes shares one color
    \pgfmathsetmacro{\colorId}{floor(\laneId/16)}
    \pgfmathsetmacro{\vecColor}{\Colors[\colorId]}
    %% Place and draw the element
    \coordinate (vec TL) at ($(C TL)+(\colId*\elem, -\rowId*\elem)$);
    \draw [line width=0.005mm, fill=\vecColor!60!white] (vec TL) rectangle ++(\elem, -\elem)
    node [scale=.4*\scale, pos=.5] {t\tid};
  }
}
\newcommand{\drawWMMAInstr}[2]{
  %%
  %% Draw the layouts of a 16x16x16 wmma instruction: operand A to the
  %% left of the result, operand B above it, plus names and size arrows.
  %%
  %% Must exist before the call:
  %%   C TL:   coordinate of the output matrix's top-left corner
  %%   \elem:  edge length of one tensor element
  %%   \scale: scaling factor applied to the labels
  %%
  %% #1: mode. 0 for w32, 1 for w64
  %% #2: verbose, forwarded to \drawWMMAOperand; the result is always
  %%     drawn with verbose set to 1
  \pgfmathsetmacro{\isWLarge}{#1}
  \pgfmathsetmacro{\verbose}{#2}
  %% Distance between the operands and the result
  \pgfmathsetmacro{\gap}{\elem*2}
  %% Operand A; its corner is kept in (wmma opA TL) for the labels
  \coordinate (wmma op TL) at ($(C TL)+(-\gap-16*\elem, 0)$);
  \coordinate (wmma opA TL) at (wmma op TL);
  \drawWMMAOperand{0}{\verbose}{\isWLarge}
  %% Operand B; (wmma op TL) refers to B from here on
  \coordinate (wmma op TL) at ($(C TL)+(0, \gap+16*\elem)$);
  \drawWMMAOperand{1}{\verbose}{\isWLarge}
  %% Result
  \drawWMMAResult{1}{\isWLarge}
  %% From here on \gap is the offset of the size arrows from the matrices
  \pgfmathsetmacro{\gap}{\elem}
  %% Matrix names
  \node [above left, scale=\scale] at (wmma opA TL) {A};
  \node [above left, scale=\scale] at (wmma op TL) {B};
  \node [above right, scale=\scale] at ($(C TL)+(16*\elem, 0)$) {C};
  %% Size arrow above A (horizontal)
  \node [scale=.8*\scale] (k dim A) at ($(wmma opA TL)+(8*\elem,\gap)$) {16};
  \draw [->, >=stealth] (k dim A.west) -- ($(wmma opA TL)+(0, \gap)$);
  \draw [->, >=stealth] (k dim A.east) -- ($(wmma opA TL)+(16*\elem, \gap)$);
  %% Size arrow to the left of B (vertical)
  \node [scale=.8*\scale, rotate=90] (k dim B) at ($(wmma op TL)+(-\gap, -8*\elem)$) {16};
  \draw [->, >=stealth] (k dim B.east) -- ($(wmma op TL)+(-\gap, 0)$);
  \draw [->, >=stealth] (k dim B.west) -- ($(wmma op TL)+(-\gap, -16*\elem)$);
  %% Size arrow below C (horizontal)
  \node [scale=.8*\scale] (m dim) at ($(C TL)+(8*\elem,-16*\elem-\gap)$) {16};
  \draw [->, >=stealth] (m dim.west) -- ($(C TL)+(0, -16*\elem-\gap)$);
  \draw [->, >=stealth] (m dim.east) -- ($(C TL)+(16*\elem, -16*\elem-\gap)$);
  %% Size arrow to the right of C (vertical)
  \node [scale=.8*\scale, rotate=-90] (n dim) at ($(C TL)+(16*\elem+\gap, -8*\elem)$) {16};
  \draw [->, >=stealth] (n dim.west) -- ($(C TL)+(16*\elem+\gap, 0)$);
  \draw [->, >=stealth] (n dim.east) -- ($(C TL)+(16*\elem+\gap, -16*\elem)$);
}