Files
linea-monorepo/prover/crypto/ringsis/templates/transversal_hash.go.tmpl
Lakshminarayanan Nandakumar 8ddd4c1d3d Prover/wizard performance monitor (#768)
* wizard runtime perf monitor

* ring-sis before revert

* sanity check successful

* rm test config file
2025-03-12 08:43:31 +00:00

188 lines
6.0 KiB
Cheetah

package ringsis_{{.ModulusDegree}}_{{.LogTwoBound}}
import (
"github.com/consensys/gnark-crypto/ecc/bls12-377/fr/fft"
"github.com/consensys/linea-monorepo/prover/maths/common/smartvectors"
"github.com/consensys/linea-monorepo/prover/maths/common/vector"
"github.com/consensys/linea-monorepo/prover/maths/field"
"github.com/consensys/linea-monorepo/prover/utils"
"github.com/consensys/linea-monorepo/prover/utils/parallel"
ppool "github.com/consensys/linea-monorepo/prover/utils/parallel/pool"
)
{{- $bitPerField := 256}}
{{- $limbPerField := div $bitPerField .LogTwoBound}}
{{- $fieldPerPoly := div .ModulusDegree $limbPerField}}
{{- $numMask := pow 2 $fieldPerPoly}}
// TransversalHash hashes the vectors of `pols` position by position. For every
// position `col` in `[0, pols[0].Len())` it produces {{.ModulusDegree}} field
// elements, stored in the returned slice at
// `[col*{{.ModulusDegree}}, (col+1)*{{.ModulusDegree}})`.
//
// The entries of `pols` are consumed by groups of {{$fieldPerPoly}} consecutive
// rows; group number `g` is multiplied pointwise with `ag[g]` and accumulated:
//
//   - rows that are `*smartvectors.Regular` or `*smartvectors.Pooled` are
//     processed for every position, in parallel jobs of 16 positions each;
//   - every other row is type-asserted to `*smartvectors.Constant`; its
//     contribution is identical for all positions, so it is computed once (by
//     job 0) and added to every position at the end.
//
// Finally an inverse FFT (on coset) is applied to each position's accumulator.
//
// The function panics if `pols` is empty (`pols[0]` is read unconditionally) or
// if a row is none of the three smart-vector types above (failed type
// assertion). NOTE(review): `utils.DivExact` presumably panics when
// `pols[0].Len()` is not a multiple of 16 — confirm against its definition.
func TransversalHash(
// The ring-SIS key: `ag[g]` is the vector multiplied with the g-th group of rows
ag [][]field.Element,
// The vectors to hash, all of the same length. Below, `pols` is indexed by
// `row` and the entries of each vector by `col`.
pols []smartvectors.SmartVector,
// The precomputed twiddle cosets for the forward FFT
twiddleCosets []field.Element,
// The domain for the final inverse-FFT
domain *fft.Domain,
) []field.Element {
var (
// Each field element is encoded in {{$limbPerField}} limbs but the degree is {{.ModulusDegree}}. So, each
// polynomial multiplication "hashes" {{$fieldPerPoly}} field elements at once. This is
// important to know for parallelization.
resultSize = pols[0].Len() * {{.ModulusDegree}}
// To optimize memory usage, we limit ourselves to hashing only 16 columns per
// iteration.
numColumnPerJob int = 16
// In theory, it should be a div ceil. But in practice we only process a
// power-of-two number of columns. If that's not the case, then the function will panic
// but we can always change that if this is needed. The rationale for the current
// design is simplicity.
numJobs = utils.DivExact(pols[0].Len(), numColumnPerJob) // we make blocks of 16 columns
// Main result of the hashing
mainResults = make([]field.Element, resultSize)
// When we encounter a const row, it will have the same additive contribution
// to the result on every column. So we compute the contribution only once and
// accumulate it with the other "constant column contributions". And it is only
// performed by the first job (i == 0), so no other job writes to this buffer.
constResults = make([]field.Element, {{.ModulusDegree}})
)
ppool.ExecutePoolChunky(numJobs, func(i int) {
// We process the columns per segment of `numColumnPerJob`
var (
// Accumulators of this job: one slot of {{.ModulusDegree}} elements per column
localResult = make([]field.Element, numColumnPerJob*{{.ModulusDegree}})
// Scratch buffer holding the limbs, reused for every column and row group
limbs = make([]field.Element, {{.ModulusDegree}})
// Each segment is processed by packet of `numFieldPerPoly={{$fieldPerPoly}}` rows
startFromCol = i * numColumnPerJob
stopAtCol = (i + 1) * numColumnPerJob
)
for row := 0; row < len(pols); row += {{$fieldPerPoly}} {
var (
// chunksFull[j] is the [startFromCol, stopAtCol) window of row `row+j`
// when that row is Regular/Pooled, and nil otherwise.
chunksFull = make([][]field.Element, {{$fieldPerPoly}})
// Bit j of mask is set iff chunksFull[j] is populated.
mask = 0
)
for j := 0; j < {{$fieldPerPoly}}; j++ {
// Past the last row: the slot stays nil and its mask bit stays clear.
if row+j >= len(pols) {
continue
}
pReg, pIsReg := pols[row+j].(*smartvectors.Regular)
if pIsReg {
chunksFull[j] = (*pReg)[startFromCol:stopAtCol]
mask |= (1 << j)
continue
}
pPool, pIsPool := pols[row+j].(*smartvectors.Pooled)
if pIsPool {
chunksFull[j] = pPool.Regular[startFromCol:stopAtCol]
mask |= (1 << j)
continue
}
}
// Nothing to do per column when the group has no Regular/Pooled row.
if mask > 0 {
for col := 0; col < (stopAtCol - startFromCol); col++ {
// Gather the entries of the current column; slots of rows that are
// not Regular/Pooled are left at zero.
colChunk := [{{$fieldPerPoly}}]field.Element{}
for j := 0; j < {{$fieldPerPoly}}; j++ {
if chunksFull[j] != nil {
colChunk[j] = chunksFull[j][col]
}
}
limbDecompose(limbs, colChunk[:])
// NOTE(review): partialFFT is defined elsewhere; it is selected by `mask`,
// presumably to skip the inputs known to be zero — confirm.
partialFFT[mask](limbs, twiddleCosets)
mulModAcc(localResult[col*{{.ModulusDegree}}:(col+1)*{{$.ModulusDegree}}], limbs, ag[row/{{$fieldPerPoly}}])
}
}
// The contribution of the remaining rows does not depend on the column,
// so only the first job computes it.
if i == 0 {
var (
// Complement of mask over the {{$fieldPerPoly}} slots of the group: the rows
// that were not handled above (including slots past the last row).
cMask = ((1 << {{$fieldPerPoly}}) - 1) ^ mask
chunkConst = make([]field.Element, {{$fieldPerPoly}})
)
if cMask > 0 {
for j := 0; j < {{$fieldPerPoly}}; j++ {
// Past the last row: the slot stays zero.
if row+j >= len(pols) {
continue
}
if (cMask>>j)&1 == 1 {
// Panics if the row is not a *smartvectors.Constant.
chunkConst[j] = pols[row+j].(*smartvectors.Constant).Get(0)
}
}
limbDecompose(limbs, chunkConst)
partialFFT[cMask](limbs, twiddleCosets)
mulModAcc(constResults, limbs, ag[row/{{$fieldPerPoly}}])
}
}
}
// copy the segment into the main result at the end
copy(mainResults[startFromCol*{{.ModulusDegree}}:stopAtCol*{{.ModulusDegree}}], localResult)
})
// Now, we need to reconcile the per-column results with the shared
// constant contribution computed by the first job.
parallel.Execute(pols[0].Len(), func(start, stop int) {
for col := start; col < stop; col++ {
// Accumulate the const
vector.Add(mainResults[col*{{.ModulusDegree}}:(col+1)*{{.ModulusDegree}}], mainResults[col*{{.ModulusDegree}}:(col+1)*{{.ModulusDegree}}], constResults)
// And run the reverse FFT, in place, on the column's slot
domain.FFTInverse(mainResults[col*{{.ModulusDegree}}:(col+1)*{{.ModulusDegree}}], fft.DIT, fft.OnCoset(), fft.WithNbTasks(1))
}
})
return mainResults
}
// _zeroes is an all-zero vector of {{.ModulusDegree}} field elements. zeroize
// does not depend on it; it is kept because other files of the generated
// package may still reference it (not verifiable from this template).
var _zeroes []field.Element = make([]field.Element, {{.ModulusDegree}})

// zeroize sets every entry of `buf` to the zero value of field.Element,
// whatever the length of `buf`. A nil or empty `buf` is a no-op.
//
// Copying from a fixed-size zero vector would only clear the first
// {{.ModulusDegree}} entries and silently leave the tail of a longer buffer
// untouched; ranging over `buf` itself has no such limit. The loop below is the
// clearing idiom that the Go compiler lowers to a single memory clear.
func zeroize(buf []field.Element) {
	for i := range buf {
		buf[i] = field.Element{}
	}
}
// mulModAcc accumulates the entrywise product of `a` and `b` into `res`: for
// every index `i` of `res` it performs `res[i] += a[i] * b[i]`. The three
// vectors are trusted to have the same length; no check is performed, so a
// shorter `a` or `b` results in an out-of-range panic.
func mulModAcc(res, a, b []field.Element) {
	for i := 0; i < len(res); i++ {
		var prod field.Element
		prod.Mul(&a[i], &b[i])
		res[i].Add(&res[i], &prod)
	}
}
// limbDecompose splits the first {{$fieldPerPoly}} entries of `x` into limbs of
// {{.LogTwoBound}} bits and writes them into `result`, one limb per entry of
// `result`. `result` is zeroed first and must hold at least {{.ModulusDegree}}
// entries; `x` must hold at least {{$fieldPerPoly}} entries.
//
// The body is fully unrolled by the template. For each input `x[k]`, the 32
// bytes returned by `Bytes()` are cut into {{$limbPerField}} limbs; limb number
// `i` (counting from byte 0) is stored at index `(k+1)*{{$limbPerField}} - i - 1`,
// i.e. in reverse order within the slots reserved for `x[k]`. With 16-bit limbs
// each byte pair is combined with the lower-indexed byte as the high byte.
// Only word 0 of the destination element is written, the other words stay zero.
//
// The template only emits assignments when LogTwoBound is 8 or 16; for any
// other value the generated function just zeroes `result`.
//
// NOTE(review): this assumes `Bytes()` returns a big-endian encoding, so that
// the last limb of each group is the most significant one — confirm.
func limbDecompose(result []field.Element, x []field.Element) {
zeroize(result)
var bytesBuffer = [32]byte{}{{"\n"}}
{{- range $k := iterate 0 $fieldPerPoly}}
{{- $pos := mul (add $k 1) $limbPerField -}}
{{- "\n\t"}}bytesBuffer = x[{{$k}}].Bytes(){{"\n\n"}}
{{- range $i := iterate 0 $limbPerField }}
{{- $resPos := sub (sub $pos $i) 1 }}
{{- if eq $.LogTwoBound 8 -}}
{{- $inpPos0 := $i -}}
{{"\t"}}result[{{$resPos}}][0] = uint64(bytesBuffer[{{$inpPos0}}]){{"\n"}}
{{- else if eq $.LogTwoBound 16 }}
{{- $inpPos0 := mul $i 2 }}
{{- $inpPos1 := add $inpPos0 1 -}}
{{"\t"}}result[{{$resPos}}][0] = uint64(bytesBuffer[{{$inpPos1}}]) | (uint64(bytesBuffer[{{$inpPos0}}]) << 8){{"\n"}}
{{- end}}
{{- end}}{{end}}
{{- "}\n" -}}