linea-monorepo/prover/crypto/ringsis/templates/transversal_hash.go.tmpl

package ringsis_{{.ModulusDegree}}_{{.LogTwoBound}}

import (
	"github.com/consensys/gnark-crypto/ecc/bls12-377/fr/fft"
	"github.com/consensys/linea-monorepo/prover/maths/common/smartvectors"
	"github.com/consensys/linea-monorepo/prover/maths/common/vector"
	"github.com/consensys/linea-monorepo/prover/maths/field"
	"github.com/consensys/linea-monorepo/prover/utils"
	"github.com/consensys/linea-monorepo/prover/utils/parallel"
	ppool "github.com/consensys/linea-monorepo/prover/utils/parallel/pool"
)

{{- $bitPerField := 256}}
{{- $limbPerField := div $bitPerField .LogTwoBound}}
{{- $fieldPerPoly := div .ModulusDegree $limbPerField}}
{{- $numMask := pow 2 $fieldPerPoly}}

// TransversalHash hashes, column by column, the matrix whose rows are given by
// `pols`, and returns the concatenation of the per-column results.
//
// The returned slice has `pols[0].Len() * ModulusDegree` entries; the result
// for column `col` occupies the `col`-th consecutive window of ModulusDegree
// entries.
//
// Rows are consumed in groups of `fieldPerPoly` consecutive rows, and the group
// starting at `row` is multiplied entrywise with `ag[row/fieldPerPoly]`.
// Columns are split into jobs of 16 columns each, dispatched through
// `ppool.ExecutePoolChunky`.
//
// Rows of type *smartvectors.Regular or *smartvectors.Pooled are read
// column-wise. Every other in-range row is type-asserted to
// *smartvectors.Constant without a comma-ok check, so any other row type
// panics. The contribution of these constant rows is computed once (by job 0)
// into a shared buffer and added to every column just before the final
// inverse FFT.
//
// `pols` must be non-empty since `pols[0]` is read unconditionally.
// NOTE(review): the column count is expected to be a multiple of 16; per the
// comment on `numJobs`, `utils.DivExact` panics otherwise — confirm against
// its implementation.
func TransversalHash(
	// the Ag for ring-sis
	ag [][]field.Element,
	// A non-transposed list of columns
	// All of the same length
	pols []smartvectors.SmartVector,
	// The precomputed twiddle cosets for the forward FFT
	twiddleCosets []field.Element,
	// The domain for the final inverse-FFT
	domain *fft.Domain,
) []field.Element {

	var (
		// Each field element is encoded in {{$limbPerField}} limbs but the degree is {{.ModulusDegree}}. So, each
		// polynomial multiplication "hashes" {{$fieldPerPoly}} field elements at once. This is
		// important to know for parallelization.
		resultSize = pols[0].Len() * {{.ModulusDegree}}

		// To optimize memory usage, we limit ourselves to hashing only 16
		// columns per iteration.
		numColumnPerJob int = 16

		// In theory, it should be a div ceil. But in practice we only process
		// power-of-two numbers of columns. If that's not the case, then the
		// function will panic, but we can always change that if this is needed.
		// The rationale for the current design is simplicity.
		numJobs = utils.DivExact(pols[0].Len(), numColumnPerJob) // we make blocks of 16 columns

		// Main result of the hashing
		mainResults = make([]field.Element, resultSize)
		// When we encounter a const row, it will have the same additive contribution
		// to the result on every column. So we compute the contribution only once and
		// accumulate it with the other "constant column contributions". And it is only
		// performed by the first thread.
		constResults = make([]field.Element, {{.ModulusDegree}})
	)

	ppool.ExecutePoolChunky(numJobs, func(i int) {
		// We process the columns per segment of `numColumnPerJob`
		var (
			// Per-job accumulator (one window per column of the segment) and a
			// scratch buffer for the limbs; both are private to this job.
			localResult = make([]field.Element, numColumnPerJob*{{.ModulusDegree}})
			limbs       = make([]field.Element, {{.ModulusDegree}})

			// Each segment is processed by packet of `numFieldPerPoly={{$fieldPerPoly}}` rows
			startFromCol = i * numColumnPerJob
			stopAtCol    = (i + 1) * numColumnPerJob
		)

		for row := 0; row < len(pols); row += {{$fieldPerPoly}} {

			var (
				// chunksFull[j] is the window of row `row+j` restricted to this
				// job's columns, or nil when that row is not Regular/Pooled.
				// Bit j of mask is set exactly when chunksFull[j] is non-nil.
				chunksFull = make([][]field.Element, {{$fieldPerPoly}})
				mask       = 0
			)

			for j := 0; j < {{$fieldPerPoly}}; j++ {
				// The last group may be incomplete: rows past the end are
				// left as nil / unset bit.
				if row+j >= len(pols) {
					continue
				}

				pReg, pIsReg := pols[row+j].(*smartvectors.Regular)
				if pIsReg {
					chunksFull[j] = (*pReg)[startFromCol:stopAtCol]
					mask |= (1 << j)
					continue
				}

				pPool, pIsPool := pols[row+j].(*smartvectors.Pooled)
				if pIsPool {
					chunksFull[j] = pPool.Regular[startFromCol:stopAtCol]
					mask |= (1 << j)
					continue
				}
			}

			// At least one row of the group carries explicit per-column values.
			if mask > 0 {
				for col := 0; col < (stopAtCol - startFromCol); col++ {
					// Gather the group's values for this column; entries for
					// rows outside of mask stay at the zero value.
					colChunk := [{{$fieldPerPoly}}]field.Element{}
					for j := 0; j < {{$fieldPerPoly}}; j++ {
						if chunksFull[j] != nil {
							colChunk[j] = chunksFull[j][col]
						}
					}

					limbDecompose(limbs, colChunk[:])
					// NOTE(review): partialFFT is defined elsewhere; it is
					// indexed here by the mask of populated rows.
					partialFFT[mask](limbs, twiddleCosets)
					mulModAcc(localResult[col*{{.ModulusDegree}}:(col+1)*{{$.ModulusDegree}}], limbs, ag[row/{{$fieldPerPoly}}])
				}
			}

			// Only job 0 accumulates the column-independent contribution, so
			// constResults has a single writer.
			if i == 0 {

				var (
					// Complement of mask over the group's bits. It also has
					// the bits of out-of-range rows set; their chunkConst
					// entries simply stay zero.
					cMask      = ((1 << {{$fieldPerPoly}}) - 1) ^ mask
					chunkConst = make([]field.Element, {{$fieldPerPoly}})
				)

				if cMask > 0 {
					for j := 0; j < {{$fieldPerPoly}}; j++ {
						if row+j >= len(pols) {
							continue
						}

						if (cMask>>j)&1 == 1 {
							// Unchecked assertion: panics if the row is not a
							// *smartvectors.Constant.
							chunkConst[j] = pols[row+j].(*smartvectors.Constant).Get(0)
						}
					}

					limbDecompose(limbs, chunkConst)
					partialFFT[cMask](limbs, twiddleCosets)
					mulModAcc(constResults, limbs, ag[row/{{$fieldPerPoly}}])
				}
			}
		}

		// copy the segment into the main result at the end
		copy(mainResults[startFromCol*{{.ModulusDegree}}:stopAtCol*{{.ModulusDegree}}], localResult)
	})

	// Now, we need to reconcile the constant-row buffer with the per-column
	// results computed by each job.
	// NOTE(review): this relies on ExecutePoolChunky returning only once all
	// jobs are done — confirm against its implementation.
	parallel.Execute(pols[0].Len(), func(start, stop int) {
		for col := start; col < stop; col++ {
			// Accumulate the const
			vector.Add(mainResults[col*{{.ModulusDegree}}:(col+1)*{{.ModulusDegree}}], mainResults[col*{{.ModulusDegree}}:(col+1)*{{.ModulusDegree}}], constResults)
			// And run the reverse FFT
			domain.FFTInverse(mainResults[col*{{.ModulusDegree}}:(col+1)*{{.ModulusDegree}}], fft.DIT, fft.OnCoset(), fft.WithNbTasks(1))
		}
	})

	return mainResults
}

// _zeroes is a package-level all-zero vector whose length is the template's
// ModulusDegree. zeroize no longer reads it; the declaration is kept so that
// any other code in the package referring to it keeps compiling.
var _zeroes []field.Element = make([]field.Element, {{.ModulusDegree}})

// zeroize overwrites every entry of `buf` with the zero field element.
//
// The whole slice is cleared regardless of its length. Copying from a
// fixed-size zero vector would only clear the first ModulusDegree entries and
// silently leave the tail of a longer buffer untouched.
func zeroize(buf []field.Element) {
	for i := range buf {
		buf[i] = field.Element{}
	}
}

// mulModAcc performs an entrywise multiply-accumulate: for every index `i` of
// `res`, it sets `res[i] = res[i] + a[i]*b[i]`. No length check is done here;
// the caller is trusted to pass `a` and `b` with the same length as `res`.
func mulModAcc(res, a, b []field.Element) {
	for idx := 0; idx < len(res); idx++ {
		// The product goes through a scratch element so that neither `a` nor
		// `b` is written to.
		var prod field.Element
		prod.Mul(&a[idx], &b[idx])
		res[idx].Add(&res[idx], &prod)
	}
}

// limbDecompose splits the leading entries of `x` into small limbs and writes
// one limb per entry of `result`.
//
// `result` is first cleared with zeroize. Then, for each of the fieldPerPoly
// leading entries `x[k]`, the 32 bytes returned by `x[k].Bytes()` are cut into
// limbPerField limbs of LogTwoBound bits each. The limb read at position `i`
// of the byte buffer is stored at `result[(k+1)*limbPerField - 1 - i]`, i.e.
// in reverse order within the block reserved for `x[k]`. For 16-bit limbs,
// byte `2i` is the high byte and byte `2i+1` the low byte.
//
// The limb is written straight into word 0 of the destination element; the
// other words keep the zero left by zeroize.
// NOTE(review): no field-level setter is used, so the element's internal
// representation is whatever that raw word denotes — confirm this is what the
// downstream FFT expects.
//
// Only LogTwoBound values of 8 and 16 generate assignments; for any other
// value the generated body just calls Bytes() and leaves `result` at zero.
//
// `x` must hold at least fieldPerPoly entries and `result` at least
// fieldPerPoly*limbPerField entries; no bounds are checked here.
func limbDecompose(result []field.Element, x []field.Element) {

	// Start from an all-zero output so that only word 0 of each limb needs to
	// be written below.
	zeroize(result)
	var bytesBuffer = [32]byte{}{{"\n"}}
	{{- range $k := iterate 0 $fieldPerPoly}}
	{{- $pos := mul (add $k 1) $limbPerField -}}
	{{- "\n\t"}}bytesBuffer = x[{{$k}}].Bytes(){{"\n\n"}}
	{{- range $i := iterate 0 $limbPerField }}
	{{- $resPos := sub (sub $pos $i) 1 }}
	{{- if eq $.LogTwoBound 8 -}}
	{{- $inpPos0 := $i -}}
	{{"\t"}}result[{{$resPos}}][0] = uint64(bytesBuffer[{{$inpPos0}}]){{"\n"}}
	{{- else if eq $.LogTwoBound 16 }}
	{{- $inpPos0 := mul $i 2 }}
	{{- $inpPos1 := add $inpPos0 1 -}}
	{{"\t"}}result[{{$resPos}}][0] = uint64(bytesBuffer[{{$inpPos1}}]) | (uint64(bytesBuffer[{{$inpPos0}}]) << 8){{"\n"}}
	{{- end}}
	{{- end}}{{end}}
{{- "}\n" -}}