Use memory efficient toHex in pubkey2index map (#3561)

* Research the memory cost of bytes hex

* Benchmark overhead of Uint8Array

* Test template strings

* Improve PubkeyIndexMap memory efficiency
This commit is contained in:
Lion - dapplion
2022-01-04 10:08:09 +01:00
committed by GitHub
parent f49aa8e391
commit c384b36a1b
7 changed files with 533 additions and 7 deletions

1
.gitignore vendored
View File

@@ -31,6 +31,7 @@ packages/lodestar/test-logs/
packages/beacon-state-transition/test-cache
benchmark_data
invalidSszObjects/
packages/lodestar/mainnet_pubkeys.csv
# Autogenerated docs
packages/**/docs

View File

@@ -1,4 +1,4 @@
import {ByteVector, hash, toHexString, BitList, List, readonlyValuesListOfLeafNodeStruct} from "@chainsafe/ssz";
import {ByteVector, hash, BitList, List, readonlyValuesListOfLeafNodeStruct} from "@chainsafe/ssz";
import bls, {CoordType, PublicKey} from "@chainsafe/bls";
import {
BLSSignature,
@@ -67,8 +67,23 @@ export type EpochContextOpts = {
type PubkeyHex = string;
function toHexStringMaybe(hex: ByteVector | string): string {
return typeof hex === "string" ? hex : toHexString(hex);
/**
 * Converts a pubkey into the compact hex string form used as map key.
 *
 * toHexString() builds its result via string concatenation, which is very memory inefficient.
 * Memory benchmarks show that Buffer.toString("hex") produces strings that take 10x less memory.
 *
 * The "0x" prefix is never part of the result, to save memory: binary input is encoded without
 * it, and input that already is a string gets a leading "0x" removed. Strings without the
 * prefix are returned unchanged.
 *
 * See https://github.com/ChainSafe/lodestar/issues/3446
 */
function toMemoryEfficientHexStr(hex: ByteVector | Uint8Array | string): string {
  // Binary input: encode directly, Buffer's hex encoding emits no prefix
  if (typeof hex !== "string") {
    return Buffer.from(hex as Uint8Array).toString("hex");
  }

  return hex.startsWith("0x") ? hex.slice(2) : hex;
}
export class PubkeyIndexMap {
@@ -79,12 +94,15 @@ export class PubkeyIndexMap {
return this.map.size;
}
get(key: ByteVector | PubkeyHex): ValidatorIndex | undefined {
return this.map.get(toHexStringMaybe(key));
/**
 * Returns the value stored for a pubkey, or undefined when the pubkey is not in the map.
 *
 * Reading with a string key must be supported, since the API provides pubkeys as strings.
 */
get(key: ByteVector | Uint8Array | PubkeyHex): ValidatorIndex | undefined {
  const hexKey = toMemoryEfficientHexStr(key);
  return this.map.get(hexKey);
}
set(key: ByteVector | PubkeyHex, value: ValidatorIndex): void {
this.map.set(toHexStringMaybe(key), value);
/**
 * Stores `value` under the memory efficient hex representation of the binary pubkey `key`.
 */
set(key: ByteVector | Uint8Array, value: ValidatorIndex): void {
  const hexKey = toMemoryEfficientHexStr(key);
  this.map.set(hexKey, value);
}
}

View File

@@ -0,0 +1,65 @@
import {toHexString} from "@chainsafe/ssz";
import crypto from "crypto";
import {testRunnerMemory} from "./testRunnerMemory";
// Results in Linux Dec 2021
//
// Bytes32 toHexString() - 902.8 bytes / instance
// Bytes32 Buffer.toString(hex) - 86.9 bytes / instance
// Bytes32 Buffer.toString(hex) from Uint8Array - 87.6 bytes / instance
// Bytes32 Buffer.toString(hex) + 0x - 121.7 bytes / instance
// Bytes32 randomBytes32Template() - 924.7 bytes / instance
// Every case creates one 32 byte value in a different string representation.
// The expressions inside getInstance are the subject of the measurement.
const bytes32HexCases = [
  {
    getInstance: () => toHexString(crypto.randomBytes(32)),
    id: "Bytes32 toHexString()",
  },
  {
    getInstance: () => crypto.randomBytes(32).toString("hex"),
    id: "Bytes32 Buffer.toString(hex)",
  },
  {
    getInstance: () => Buffer.from(randomBytesUint8Array(32)).toString("hex"),
    id: "Bytes32 Buffer.toString(hex) from Uint8Array",
  },
  {
    getInstance: () => "0x" + crypto.randomBytes(32).toString("hex"),
    id: "Bytes32 Buffer.toString(hex) + 0x",
  },
  {
    getInstance: () => randomBytes32Template(),
    id: "Bytes32 randomBytes32Template()",
  },
];

testRunnerMemoryBpi(bytes32HexCases);
/**
 * Measures the bytes per instance of different representations of raw binary data.
 *
 * Each test case is passed to `testRunnerMemory` with a convergence factor of 0.2%, and one
 * line per case is printed, with the ids padded to the same width so the numbers line up.
 *
 * @param testCases `id` labels the printed line, `getInstance` creates one measured instance
 */
function testRunnerMemoryBpi(testCases: {getInstance: (bytes: number) => unknown; id: string}[]): void {
  // Width of the longest label, used to align the printed results
  const idColumnWidth = Math.max(...testCases.map((testCase) => testCase.id.length));

  testCases.forEach((testCase) => {
    const bytesPerInstance = testRunnerMemory({
      getInstance: testCase.getInstance,
      convergeFactor: 0.2 / 100,
    });

    const label = testCase.id.padEnd(idColumnWidth);
    // eslint-disable-next-line no-console
    console.log(`${label} - ${bytesPerInstance.toFixed(1)} bytes / instance`);
  });
}
/**
 * Returns `bytes` random bytes held in a plain Uint8Array.
 *
 * The random data comes from `crypto.randomBytes` and is copied into a newly allocated
 * Uint8Array, so the result is not a Buffer instance.
 */
function randomBytesUint8Array(bytes: number): Uint8Array {
  const randomSource = crypto.randomBytes(bytes);
  const plainArray = new Uint8Array(bytes);
  plainArray.set(randomSource);
  return plainArray;
}
/**
 * Builds a string out of 32 random bytes using one single template literal.
 *
 * Each byte is interpolated as a number, so the result is the concatenation of the
 * decimal values of the bytes (not a hex encoding), and its length varies with the bytes.
 * The template literal form is written out by hand on purpose: it is the construct
 * whose memory cost is measured.
 */
function randomBytes32Template(): string {
const buf = crypto.randomBytes(32);
return `${buf[0]}${buf[1]}${buf[2]}${buf[3]}${buf[4]}${buf[5]}${buf[6]}${buf[7]}${buf[8]}${buf[9]}${buf[10]}${buf[11]}${buf[12]}${buf[13]}${buf[14]}${buf[15]}${buf[16]}${buf[17]}${buf[18]}${buf[19]}${buf[20]}${buf[21]}${buf[22]}${buf[23]}${buf[24]}${buf[25]}${buf[26]}${buf[27]}${buf[28]}${buf[29]}${buf[30]}${buf[31]}`;
}

View File

@@ -0,0 +1,58 @@
import {toHexString} from "@chainsafe/ssz";
import crypto from "crypto";
import {testRunnerMemory} from "./testRunnerMemory";
// Results in Linux Jan 2022
//
// (pkCount = 100_000)
// Map BLS pubkey 48 bytes toHexString - 144437161.0 bytes / instance
// Map BLS pubkey 48 bytes hex - 14868449.0 bytes / instance
// Map BLS pubkey 4 bytes hex - 6070482.9 bytes / instance
//
// (pkCount = 10_000)
// Map BLS pubkey 48 bytes toHexString - 14539050.1 bytes / instance
// Map BLS pubkey 48 bytes hex - 1578660.8 bytes / instance
// Map BLS pubkey 4 bytes hex - 698867.0 bytes / instance
// Number of keys inserted in every measured Map
const pkCount = 10000;

// Every case creates one Map with `pkCount` random keys in a different key representation.
// The expressions that generate the keys are the subject of the measurement.
const pubkeyMapCases = [
  {
    getInstance: () => getRandomMap(pkCount, () => toHexString(crypto.randomBytes(48))),
    id: "Map BLS pubkey 48 bytes toHexString",
  },
  {
    getInstance: () => getRandomMap(pkCount, () => crypto.randomBytes(48).toString("hex")),
    id: "Map BLS pubkey 48 bytes hex",
  },
  {
    getInstance: () => getRandomMap(pkCount, () => crypto.randomBytes(4).toString("hex")),
    id: "Map BLS pubkey 4 bytes hex",
  },
];

testRunnerMemoryBpi(pubkeyMapCases);
/**
 * Creates a Map with one entry per index in `[0, n)`.
 *
 * The key of each entry is `getKey(index)` and its value is the index itself. If `getKey`
 * returns the same key more than once, the later index overwrites the earlier one.
 */
function getRandomMap(n: number, getKey: (i: number) => string): Map<string, unknown> {
  const indexByKey = new Map<string, unknown>();

  let index = 0;
  while (index < n) {
    indexByKey.set(getKey(index), index);
    index += 1;
  }

  return indexByKey;
}
/**
 * Measures the bytes per instance of different representations of raw binary data.
 *
 * Each test case is passed to `testRunnerMemory`, sampling memory every 5 instances with a
 * convergence factor of 1%. One line per case is printed, with the ids padded to the same
 * width so the numbers line up.
 *
 * @param testCases `id` labels the printed line, `getInstance` creates one measured instance
 */
function testRunnerMemoryBpi(testCases: {getInstance: (bytes: number) => unknown; id: string}[]): void {
  // Width of the longest label, used to align the printed results
  const idColumnWidth = testCases.reduce((widest, testCase) => Math.max(widest, testCase.id.length), -Infinity);

  for (let caseIndex = 0; caseIndex < testCases.length; caseIndex++) {
    const testCase = testCases[caseIndex];
    const bytesPerInstance = testRunnerMemory({
      getInstance: testCase.getInstance,
      convergeFactor: 1 / 100,
      sampleEvery: 5,
    });

    const label = testCase.id.padEnd(idColumnWidth);
    // eslint-disable-next-line no-console
    console.log(`${label} - ${bytesPerInstance.toFixed(1)} bytes / instance`);
  }
}

View File

@@ -0,0 +1,215 @@
/**
 * Options accepted by the memory test runners in this file.
 *
 * `testRunnerMemoryGc` only reads `getInstance` and `computeUsedMemory`;
 * `testRunnerMemory` reads every field.
 */
export type TestRunnerMemoryOpts<T> = {
// Factory for the measured instances. Called with the index of the instance being created;
// the returned value is kept referenced by the runner.
getInstance: (i: number) => T;
// Take a memory sample every `sampleEvery` created instances (testRunnerMemory defaults to 1000)
sampleEvery?: number;
// RSS budget in bytes (testRunnerMemory defaults to 2e9)
maxRssBytes?: number;
// Upper bound on the number of created instances (testRunnerMemory defaults to Infinity)
maxInstances?: number;
// How to turn a `process.memoryUsage()` result into a single number.
// Both runners default to `heapUsed + external`.
computeUsedMemory?: (memoryUsage: NodeJS.MemoryUsage) => number;
// When truthy, testRunnerMemory logs every sample that did not end the run
logEachSample?: boolean;
// Relative change of the estimated slope below which testRunnerMemory considers
// the result converged (defaults to 0.2 / 100)
convergeFactor?: number;
};
/**
 * Measures how much used memory grows when creating a fixed batch of instances.
 *
 * Runs 10 rounds. Each round samples the used memory, creates 1000 instances with
 * `getInstance`, samples the used memory again and logs the difference together with a
 * linear regression of all differences so far against the round number.
 * Garbage collection is forced around each sample with `global.gc()`, which is only
 * defined when Node.js is started with `--expose-gc`.
 *
 * Nothing is returned, the results are only logged to the console.
 */
export async function testRunnerMemoryGc<T>(opts: TestRunnerMemoryOpts<T>): Promise<void> {
const {
getInstance,
/**
* How to compute the total memory usage.
* Defaults to `heapUsed + external`.
* https://nodejs.org/api/process.html#processmemoryusage
*/
computeUsedMemory = (memoryUsage) => memoryUsage.heapUsed + memoryUsage.external,
} = opts;
// Fixed size of the experiment: 10 rounds of 1000 instances each
const rounds = 10;
const instancesPerRound = 1000;
// xs holds the round numbers, usedMemoryArr the memory difference measured in each round
const xs: number[] = [];
const usedMemoryArr: number[] = [];
for (let n = 0; n < rounds; n++) {
// Force garbage collection, wait 100 ms and force it again before taking the baseline sample
global.gc();
global.gc();
await new Promise((r) => setTimeout(r, 100));
global.gc();
global.gc();
const totalUsedMemoryPrev = computeUsedMemory(process.memoryUsage());
// Instances created in this round, held in an array local to the round
const refs: T[] = [];
for (let i = 0; i < instancesPerRound; i++) {
refs.push(getInstance(i));
}
// Same garbage collection sequence as above before taking the second sample
global.gc();
global.gc();
await new Promise((r) => setTimeout(r, 100));
global.gc();
global.gc();
const totalUsedMemory = computeUsedMemory(process.memoryUsage());
const totalUsedMemoryDiff = totalUsedMemory - totalUsedMemoryPrev;
// NOTE(review): presumably touches `refs` after the second sample so the instances are
// still reachable while the memory is measured - confirm
refs.push(null as any);
xs.push(n);
usedMemoryArr.push(totalUsedMemoryDiff);
// Regression of the per-round differences against the round number
const usedMemoryReg = linearRegression(xs, usedMemoryArr);
// eslint-disable-next-line no-console
console.log("totalUsedMemoryDiff", totalUsedMemoryDiff, usedMemoryReg);
}
}
/**
 * Estimates the memory retained per instance created by `getInstance`.
 *
 * Instances are created one by one and kept referenced. Every `sampleEvery` instances garbage
 * collection is forced and the used memory is sampled. The result is the slope of a linear
 * regression of used memory against the number of created instances, i.e. bytes per instance.
 *
 * The run stops at the first of:
 * - the slope has converged within `convergeFactor`
 * - `process.memoryUsage().rss` exceeds `maxRssBytes`
 * - `maxInstances` instances have been created
 *
 * @returns the estimated bytes per instance (slope of the regression). It is `0` if only one
 * sample was taken and `NaN` if none was taken.
 * @throws Error if `global.gc` is not available, i.e. Node.js was not started with `--expose-gc`
 */
export function testRunnerMemory<T>(opts: TestRunnerMemoryOpts<T>): number {
  const {
    getInstance,
    /**
     * Sample memory usage every `sampleEvery` instances
     */
    sampleEvery = 1000,
    /**
     * Stop when `process.memoryUsage().rss > maxRssBytes`.
     */
    maxRssBytes = 2e9,
    /**
     * Stop after creating `maxInstances` instances.
     */
    maxInstances = Infinity,
    /**
     * How to compute the total memory usage.
     * Defaults to `heapUsed + external`.
     * https://nodejs.org/api/process.html#processmemoryusage
     */
    computeUsedMemory = (memoryUsage) => memoryUsage.heapUsed + memoryUsage.external,
    logEachSample,
    convergeFactor = 0.2 / 100, // 0.2%
  } = opts;

  // Fail early with an actionable message instead of "global.gc is not a function"
  const runGc = global.gc;
  if (typeof runGc !== "function") {
    throw new Error("testRunnerMemory requires global.gc, run Node.js with the --expose-gc flag");
  }

  const refs: T[] = [];
  const xs: number[] = [];
  const usedMemoryArr: number[] = [];

  // Slopes computed at the two previous samples, prevM0 being the oldest
  let prevM0 = 0;
  let prevM1 = 0;

  for (let i = 0; i < maxInstances; i++) {
    refs.push(getInstance(i));

    // Only two numbers are stored per sample (every `sampleEvery` pushes to refs).
    // The added memory should be negligible against refs, and linearRegression
    // local vars will get garbage collected and won't show up in the .m result
    if (i % sampleEvery !== 0) {
      continue;
    }

    runGc();
    runGc();

    const memoryUsage = process.memoryUsage();
    const usedMemory = computeUsedMemory(memoryUsage);

    xs.push(i);
    usedMemoryArr.push(usedMemory);

    if (usedMemoryArr.length > 1) {
      // When is a good time to stop? A naive answer is after N instances.
      // This code aims to stop when the estimated bytes per instance has converged at a value
      // within a given convergence factor. It stores the two past slopes to be able to compute
      // a very rough linear and quadratic convergence.
      const m = linearRegression(xs, usedMemoryArr).m;

      // Compute convergence (1st order + 2nd order)
      const a = prevM0;
      const b = prevM1;
      const c = m;

      // Approx linear convergence
      const convergence1 = Math.abs(c - a);
      // Approx quadratic convergence
      const convergence2 = Math.abs(b - (a + c) / 2);
      // Take the greater of both to enforce that linear and quadratic are below convergeFactor.
      // Normalize by the magnitude of `a`: dividing by a negative slope would yield a negative
      // ratio, which is always below convergeFactor and would stop the run prematurely.
      // While `a` is still 0 the ratio is Infinity or NaN, which never passes the check below.
      const convergence = Math.max(convergence1, convergence2) / Math.abs(a);

      // Has converged, stop now
      if (convergence < convergeFactor) {
        return m;
      }

      if (logEachSample) {
        // eslint-disable-next-line no-console
        console.log(i, memoryUsage.rss / maxRssBytes, {m});
      }

      prevM0 = prevM1;
      prevM1 = m;
    }

    // Enforce the memory budget, otherwise a run that never converges grows until the
    // process runs out of memory
    if (memoryUsage.rss > maxRssBytes) {
      break;
    }
  }

  return linearRegression(xs, usedMemoryArr).m;
}
/**
 * Adapted from https://github.com/simple-statistics/simple-statistics/blob/d0d177baf74976a2421638bce98ab028c5afb537/src/linear_regression.js
 *
 * [Simple linear regression](http://en.wikipedia.org/wiki/Simple_linear_regression)
 * fits a line `y = m * x + b` to the points `(xs[i], ys[i])` using the least sum of squares.
 *
 * @param xs x coordinates of the points, its length defines the number of points
 * @param ys y coordinates of the points, paired with `xs` by index
 * @returns object containing slope `m` and y-intercept `b` of the regression line
 * @example
 * linearRegression([0, 1], [0, 1]); // => { m: 1, b: 0 }
 */
export function linearRegression(xs: number[], ys: number[]): {m: number; b: number} {
  const pointCount = xs.length;

  // A single point does not define a slope: arbitrarily choose 0,
  // with the y-intercept being the y of that point
  if (pointCount === 1) {
    return {m: 0, b: ys[0]};
  }

  // Sum of all x values, all y values, and of x^2 and (x*y) for each point.
  // In math notation, these would be SS_x, SS_y, SS_xx, and SS_xy
  let sumX = 0;
  let sumY = 0;
  let sumXX = 0;
  let sumXY = 0;

  for (let idx = 0; idx < pointCount; idx++) {
    const px = xs[idx];
    const py = ys[idx];
    sumX += px;
    sumY += py;
    sumXX += px * px;
    sumXY += px * py;
  }

  // Slope of the regression line
  const slope = (pointCount * sumXY - sumX * sumY) / (pointCount * sumXX - sumX * sumX);
  // y-intercept of the regression line
  const intercept = sumY / pointCount - (slope * sumX) / pointCount;

  return {m: slope, b: intercept};
}

View File

@@ -0,0 +1,41 @@
import crypto from "crypto";
import {itBench} from "@dapplion/benchmark";
import {toHexString} from "@chainsafe/ssz";
// Results in Linux Dec 2021
//
// misc / bytes32 to hex
// ✓ bytes32 toHexString 1248439 ops/s 801.0000 ns/op - 731181 runs 0.808 s
// ✓ bytes32 Buffer.toString(hex) 1610306 ops/s 621.0000 ns/op - 871116 runs 0.808 s
// ✓ bytes32 Buffer.toString(hex) from Uint8Array 1321004 ops/s 757.0000 ns/op - 567231 runs 0.606 s
// ✓ bytes32 Buffer.toString(hex) + 0x 1647446 ops/s 607.0000 ns/op - 446039 runs 0.404 s
// Compares the run time of the different ways of converting 32 bytes to a hex string
describe("misc / bytes32 to hex", () => {
  // Inputs are created once, outside of the measured callbacks
  const bufferInput = crypto.randomBytes(32);
  const uint8ArrayInput = randomBytesUint8Array(32);

  itBench("bytes32 toHexString", () => {
    toHexString(bufferInput);
  });

  itBench("bytes32 Buffer.toString(hex)", () => {
    bufferInput.toString("hex");
  });

  // Includes the cost of wrapping the Uint8Array in a Buffer
  itBench("bytes32 Buffer.toString(hex) from Uint8Array", () => {
    Buffer.from(uint8ArrayInput).toString("hex");
  });

  itBench("bytes32 Buffer.toString(hex) + 0x", () => {
    "0x" + bufferInput.toString("hex");
  });
});
/**
 * Returns `bytes` random bytes held in a plain Uint8Array.
 *
 * The random data comes from `crypto.randomBytes` and is copied element by element into a
 * new Uint8Array, so the result is not a Buffer instance.
 */
function randomBytesUint8Array(bytes: number): Uint8Array {
  const randomSource = crypto.randomBytes(bytes);
  return Uint8Array.from(randomSource);
}

View File

@@ -0,0 +1,128 @@
import fs from "fs";
import {getClient} from "@chainsafe/lodestar-api";
import {config} from "@chainsafe/lodestar-config/default";
import {newZeroedArray} from "@chainsafe/lodestar-beacon-state-transition";
import SHA256 from "@chainsafe/as-sha256";
// Script to analyze whether raw BLS pubkey bytes are sufficiently evenly distributed.
// If so, a shorter slice of the pubkey bytes can be used as key for the pubkey to index map.
//
// # How to use
// ```
// INFURA_ETH2_URL=https://someurl ../../node_modules/.bin/ts-node test/scripts/blsPubkeyBytesFrequency.ts collisions
// ```
// Available commands:
// - `frequency`
// - `collisions`
//
// # Results
// - byte pubkey[0] is not evenly distributed, since it includes some flags. Median frequency is 0.
// - bytes pubkey[1:5] are very evenly distributed.
//
// # Collision rates
// (not hashed, byte offset = 1)
// bytes 1, collision rate 1
// bytes 2, collision rate 0.92230224609375
// bytes 3, collision rate 0.00013267993927001953
// bytes 4, collision rate 2.0954757928848267e-9
//
// (hashed)
// bytes 1, collision rate 1
// bytes 2, collision rate 0.92401123046875
// bytes 3, collision rate 0.00013625621795654297
// bytes 4, collision rate 3.026798367500305e-9
// Local cache file with one hex encoded pubkey per line.
// The path is relative, so it is resolved against the current working directory.
const filepath = "mainnet_pubkeys.csv";
/**
 * Script entry point.
 *
 * Makes sure the pubkeys are cached locally in `filepath` (fetching them if the file does
 * not exist), reads them as one pubkey per line and runs the analysis selected by the first
 * CLI argument: `frequency` or `collisions`.
 *
 * @throws Error if the command is missing or is not one of the available commands
 */
async function run(): Promise<void> {
  // Cache locally to prevent re-fetch
  if (!fs.existsSync(filepath)) await writePubkeys();

  const pubkeys = fs.readFileSync(filepath, "utf8").trim().split("\n");

  const command = process.argv[2];
  switch (command) {
    case "frequency":
      return analyzeBytesFrequencies(pubkeys);

    case "collisions":
      return analyzeBytesCollisions(pubkeys);

    default:
      // Do not exit silently with success on a missing or misspelled command
      throw Error(`Unknown command '${command}', available commands: frequency, collisions`);
  }
}
/**
 * Prints, for each of the first 5 bytes of the pubkeys, how often every possible byte value
 * occurs at that position.
 *
 * The counts are scaled by `255 / pubkeys.length` before printing.
 * NOTE(review): with 256 possible byte values a factor of 256 would map a perfectly even
 * distribution to 1 - confirm whether 255 is intended.
 *
 * @param pubkeys hex encoded pubkeys without prefix, so that byte `i` is at chars `[2i, 2i + 2)`
 */
function analyzeBytesFrequencies(pubkeys: string[]): void {
  for (let byteIndex = 0; byteIndex < 5; byteIndex++) {
    // Position of the analyzed byte inside the hex string
    const hexStart = byteIndex * 2;
    const hexEnd = (byteIndex + 1) * 2;

    // Number of occurrences of each byte value at this position
    const counts = newZeroedArray(256);
    for (const pubkeyHex of pubkeys) {
      const byteValue = parseInt(pubkeyHex.slice(hexStart, hexEnd), 16);
      counts[byteValue] = 1 + (counts[byteValue] ?? 0);
    }

    const scaledCounts = counts.map((count) => (count * 255) / pubkeys.length);

    // eslint-disable-next-line no-console
    console.log(`Byte[${byteIndex}] frequency distribution`, JSON.stringify(scaledCounts, null, 2));
  }
}
/**
 * Prints how often short keys derived from the pubkeys collide, for key lengths of 1 to 4 bytes.
 *
 * A key is a slice of `i` bytes starting at byte `offset`, taken from the SHA256 digest of
 * the pubkey when `useHash` is set, or from the pubkey itself otherwise. The printed rate is
 * the number of distinct keys that were produced more than once divided by the number of
 * possible keys, `256 ** i`.
 *
 * @param pubkeys hex encoded pubkeys without prefix
 */
function analyzeBytesCollisions(pubkeys: string[]): void {
  // Knobs of the experiment: first byte of the slice, and whether to hash the pubkey first
  const offset = 1;
  const useHash = true;

  for (let keyBytes = 1; keyBytes <= 4; keyBytes++) {
    const seenKeys = new Set<string>();
    // Number of repeated occurrences per key, only keys seen more than once are present
    const repeatsByKey = new Map<string, number>();

    const deriveKey = (pubkeyHex: string): string => {
      if (!useHash) {
        return pubkeyHex.slice(offset * 2, (offset + keyBytes) * 2);
      }
      const pubkeyBytes = Buffer.from(pubkeyHex, "hex");
      const digest = SHA256.digest(pubkeyBytes);
      return Buffer.from(digest.slice(offset, offset + keyBytes)).toString("hex");
    };

    for (const pubkeyHex of pubkeys) {
      const key = deriveKey(pubkeyHex);

      if (!seenKeys.has(key)) {
        seenKeys.add(key);
        continue;
      }
      repeatsByKey.set(key, 1 + (repeatsByKey.get(key) ?? 0));
    }

    // eslint-disable-next-line no-console
    console.log(`bytes ${keyBytes}, collision rate ${repeatsByKey.size / 256 ** keyBytes}`);
  }
}
/**
 * Fetches the finalized state from the beacon API at `INFURA_ETH2_URL` and writes the pubkey
 * of every validator to `filepath`, hex encoded without prefix, one pubkey per line.
 *
 * @throws Error if the `INFURA_ETH2_URL` environment variable is not set
 */
async function writePubkeys(): Promise<void> {
  const baseUrl = process.env.INFURA_ETH2_URL;
  if (!baseUrl) {
    throw Error(`
Must run with INFURA_ETH2_URL ENV, where the URL has the format:
https://\${INFURA_CREDENTIALS}@eth2-beacon-\${NETWORK}.infura.io
`);
  }

  const client = getClient(config, {baseUrl});
  const {data: state} = await client.debug.getStateV2("finalized");

  const pubkeys = Array.from(state.validators).map((validator) =>
    Buffer.from(validator.pubkey as Uint8Array).toString("hex")
  );

  // Write to the same path that run() checks and reads, instead of a duplicated literal
  fs.writeFileSync(filepath, pubkeys.join("\n"));
}
// Start the script. Any failure is printed and turned into a non-zero exit code
run().catch((error) => {
  // eslint-disable-next-line no-console
  console.error(error);
  process.exit(1);
});