Profiler for powdr asm compiled from Rust (#1394)

Rust profiler (#1174) implemented by tracking the execution of the
`riscv-executor`.
This commit is contained in:
Leandro Pacheco
2024-05-31 11:50:40 -03:00
committed by GitHub
parent adee683796
commit 328cf5474f
7 changed files with 449 additions and 7 deletions

View File

@@ -9,7 +9,9 @@ use log::LevelFilter;
use powdr_number::{BigUint, Bn254Field, FieldElement, GoldilocksField};
use powdr_pipeline::Pipeline;
use powdr_riscv_executor::ProfilerOptions;
use std::ffi::OsStr;
use std::io;
use std::{borrow::Cow, io::Write, path::Path};
use strum::{Display, EnumString, EnumVariantNames};
@@ -122,6 +124,16 @@ enum Commands {
#[arg(short, long)]
#[arg(default_value_t = false)]
witness: bool,
/// Generate a flamegraph plot of the execution ("[file].svg")
#[arg(long)]
#[arg(default_value_t = false)]
generate_flamegraph: bool,
/// Generate callgrind file of the execution ("[file].callgrind")
#[arg(long)]
#[arg(default_value_t = false)]
generate_callgrind: bool,
},
}
@@ -220,13 +232,29 @@ fn run_command(command: Commands) {
output_directory,
continuations,
witness,
generate_flamegraph,
generate_callgrind,
} => {
let profiling = if generate_callgrind || generate_flamegraph {
Some(ProfilerOptions {
file_stem: Path::new(&file)
.file_stem()
.and_then(OsStr::to_str)
.map(String::from),
output_directory: output_directory.clone(),
flamegraph: generate_flamegraph,
callgrind: generate_callgrind,
})
} else {
None
};
call_with_field!(execute::<field>(
Path::new(&file),
split_inputs(&inputs),
Path::new(&output_directory),
continuations,
witness
witness,
profiling
))
}
};
@@ -297,6 +325,7 @@ fn execute<F: FieldElement>(
output_dir: &Path,
continuations: bool,
witness: bool,
profiling: Option<ProfilerOptions>,
) -> Result<(), Vec<String>> {
let mut pipeline = Pipeline::<F>::default()
.from_file(file_name.to_path_buf())
@@ -304,7 +333,7 @@ fn execute<F: FieldElement>(
let bootloader_inputs = if continuations {
pipeline = pipeline.with_prover_inputs(inputs.clone());
powdr_riscv::continuations::rust_continuations_dry_run(&mut pipeline)
powdr_riscv::continuations::rust_continuations_dry_run(&mut pipeline, profiling.clone())
} else {
vec![]
};
@@ -327,6 +356,7 @@ fn execute<F: FieldElement>(
pipeline.data_callback().unwrap(),
&[],
powdr_riscv_executor::ExecMode::Fast,
profiling,
);
log::info!("Execution trace length: {}", trace.len);
}

View File

@@ -22,5 +22,8 @@ num-bigint = "0.4.3"
num-traits = "0.2.15"
k256 = { version = "0.13.3", features = [] }
rustc-demangle = "0.1"
inferno = "0.11.19"
[lints.clippy]
uninlined_format_args = "deny"

View File

@@ -10,7 +10,7 @@
//! from execution.
use std::{
collections::HashMap,
collections::{BTreeMap, HashMap},
fmt::{self, Display, Formatter},
io,
};
@@ -28,9 +28,13 @@ use powdr_ast::{
};
use powdr_number::{FieldElement, LargeInt};
use powdr_riscv_syscalls::SYSCALL_REGISTERS;
pub use profiler::ProfilerOptions;
pub mod arith;
pub mod poseidon_gl;
mod profiler;
use crate::profiler::Profiler;
/// Initial value of the PC.
///
@@ -492,10 +496,18 @@ pub fn get_main_machine(program: &AnalysisASMFile) -> &Machine {
}
struct PreprocessedMain<'a, T: FieldElement> {
/// list of all statements (batches expanded)
statements: Vec<&'a FunctionStatement>,
/// label to batch number
label_map: HashMap<&'a str, Elem<T>>,
/// batch number to its first statement idx
batch_to_line_map: Vec<u32>,
/// file number to (dir,name)
debug_files: Vec<(&'a str, &'a str)>,
/// function label to batch number
function_starts: BTreeMap<usize, &'a str>,
/// .debug loc to batch number
location_starts: BTreeMap<usize, (usize, usize)>,
}
/// Returns the list of instructions, directly indexable by PC, the map from
@@ -511,6 +523,8 @@ fn preprocess_main_function<T: FieldElement>(machine: &Machine) -> PreprocessedM
let mut label_map = HashMap::new();
let mut batch_to_line_map = vec![0; PC_INITIAL_VAL];
let mut debug_files = Vec::new();
let mut function_starts = BTreeMap::new();
let mut location_starts = BTreeMap::new();
for (batch_idx, batch) in orig_statements.iter_batches().enumerate() {
batch_to_line_map.push(statements.len() as u32);
@@ -531,7 +545,11 @@ fn preprocess_main_function<T: FieldElement>(machine: &Machine) -> PreprocessedM
assert_eq!(*idx, debug_files.len() + 1);
debug_files.push((dir.as_str(), file.as_str()));
}
DebugDirective::Loc(_, _, _) | DebugDirective::OriginalInstruction(_) => {
DebugDirective::Loc(file, line, _) => {
location_starts.insert(batch_idx + PC_INITIAL_VAL, (*file, *line));
statements.push(s);
}
DebugDirective::OriginalInstruction(_) => {
// keep debug locs for debugging purposes
statements.push(s);
}
@@ -541,6 +559,10 @@ fn preprocess_main_function<T: FieldElement>(machine: &Machine) -> PreprocessedM
// assert there are no statements in the middle of a block
assert!(!statement_seen);
label_map.insert(name.as_str(), (batch_idx + PC_INITIAL_VAL).into());
// TODO: would looking for "___dot_Lfunc_begin" be less hacky? would require more work to handle ecalls though...
if !name.contains("___dot_L") {
function_starts.insert(batch_idx + PC_INITIAL_VAL, name.as_str());
}
}
}
}
@@ -555,6 +577,8 @@ fn preprocess_main_function<T: FieldElement>(machine: &Machine) -> PreprocessedM
label_map,
batch_to_line_map,
debug_files,
function_starts,
location_starts,
}
}
@@ -979,6 +1003,16 @@ impl<'a, 'b, F: FieldElement> Executor<'a, 'b, F> {
}
}
/// return true if the expression is a jump instruction
fn is_jump(e: &Expression) -> bool {
    match e {
        Expression::FunctionCall(_, FunctionCall { function, .. }) => match function.as_ref() {
            // the callee must be a plain identifier naming one of the jump instructions
            Expression::Reference(_, f) => matches!(
                f.try_to_identifier().unwrap().as_str(),
                "jump" | "jump_dyn"
            ),
            _ => false,
        },
        _ => false,
    }
}
pub fn execute_ast<T: FieldElement>(
program: &AnalysisASMFile,
initial_memory: MemoryState,
@@ -986,6 +1020,7 @@ pub fn execute_ast<T: FieldElement>(
bootloader_inputs: &[Elem<T>],
max_steps_to_execute: usize,
mode: ExecMode,
profiling: Option<ProfilerOptions>,
) -> (ExecutionTrace<T>, MemoryState) {
let main_machine = get_main_machine(program);
let PreprocessedMain {
@@ -993,6 +1028,8 @@ pub fn execute_ast<T: FieldElement>(
label_map,
batch_to_line_map,
debug_files,
function_starts,
location_starts,
} = preprocess_main_function(main_machine);
let proc = match TraceBuilder::<'_, T>::new(
@@ -1014,6 +1051,9 @@ pub fn execute_ast<T: FieldElement>(
_stdout: io::stdout(),
};
let mut profiler =
profiling.map(|opt| Profiler::new(opt, &debug_files[..], function_starts, location_starts));
let mut curr_pc = 0u32;
loop {
let stm = statements[curr_pc as usize];
@@ -1022,13 +1062,39 @@ pub fn execute_ast<T: FieldElement>(
match stm {
FunctionStatement::Assignment(a) => {
if let Some(p) = &mut profiler {
p.add_instruction_cost(e.proc.get_pc().u() as usize);
}
let pc_before = e.proc.get_reg("pc").u() as usize;
let results = e.eval_expression(a.rhs.as_ref());
assert_eq!(a.lhs_with_reg.len(), results.len());
let pc_after = e.proc.get_reg("pc").u() as usize;
if is_jump(a.rhs.as_ref()) {
let pc_return = results[0].u() as usize;
assert_eq!(a.lhs_with_reg.len(), 1);
if let Some(p) = &mut profiler {
// in the generated powdr asm, writing to `tmp1` means the returning pc is ignored
if a.lhs_with_reg[0].0 == "tmp1" {
p.jump(pc_after);
} else {
p.jump_and_link(pc_before, pc_after, pc_return);
}
}
}
for ((dest, _), val) in a.lhs_with_reg.iter().zip(results) {
e.proc.set_reg(dest, val);
}
}
FunctionStatement::Instruction(i) => {
assert!(!["jump", "jump_dyn"].contains(&i.instruction.as_str()));
if let Some(p) = &mut profiler {
p.add_instruction_cost(e.proc.get_pc().u() as usize);
}
e.exec_instruction(&i.instruction, &i.inputs);
}
FunctionStatement::Return(_) => break,
@@ -1055,6 +1121,9 @@ pub fn execute_ast<T: FieldElement>(
};
}
if let Some(mut p) = profiler {
p.finish();
}
e.proc.finish()
}
@@ -1073,6 +1142,7 @@ pub fn execute<F: FieldElement>(
inputs: &Callback<F>,
bootloader_inputs: &[Elem<F>],
mode: ExecMode,
profiling: Option<ProfilerOptions>,
) -> (ExecutionTrace<F>, MemoryState) {
log::info!("Parsing...");
let parsed = powdr_parser::parse_asm(None, asm_source).unwrap();
@@ -1089,6 +1159,7 @@ pub fn execute<F: FieldElement>(
bootloader_inputs,
usize::MAX,
mode,
profiling,
)
}

View File

@@ -0,0 +1,333 @@
use std::{
collections::BTreeMap,
fs::File,
io::BufWriter,
io::Write,
path::{Path, PathBuf},
};
use itertools::Itertools;
use rustc_demangle::demangle;
/// A profiled call edge: the source location performing the call and the
/// location of the callee's entry point.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct Call<'a> {
    /// location of the call site
    from: Loc<'a>,
    /// location of the called function's entry
    target: Loc<'a>,
}
/// RISC-V asm profiler.
/// Tracks the self-cost of functions and the cumulative cost of specific function calls (i.e., callgrind style).
pub struct Profiler<'a> {
    /// profiling options
    options: ProfilerOptions,
    /// file number to (dir,file)
    debug_files: &'a [(&'a str, &'a str)],
    /// pc value of function beginnings
    function_begin: BTreeMap<usize, &'a str>,
    /// pc value of .debug loc statements
    location_begin: BTreeMap<usize, (usize, usize)>,
    /// current call stack, each entry includes the running self-cost of the call
    call_stack: Vec<(Call<'a>, usize)>,
    /// saved return address of "jump and link" instructions,
    /// used by `jump` to tell a function return apart from a tail call
    return_pc_stack: Vec<usize>,
    /// self-cost (instruction count) of each location
    location_stats: BTreeMap<Loc<'a>, usize>,
    /// (count, cumulative cost) of calls
    call_stats: BTreeMap<Call<'a>, (usize, usize)>,
    /// stack sampling format for FlameGraph
    folded_stack_stats: BTreeMap<Vec<&'a str>, usize>,
}
/// User-facing configuration for the profiler output.
#[derive(Default, Clone)]
pub struct ProfilerOptions {
    /// directory where output files are written
    pub output_directory: String,
    /// base name for the output files ("out" if `None`)
    pub file_stem: Option<String>,
    /// generate a flamegraph SVG ("[file_stem].svg")
    pub flamegraph: bool,
    /// generate a callgrind file ("[file_stem].callgrind")
    pub callgrind: bool,
}
/// A source location: function name plus `.debug loc` file/line information.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct Loc<'a> {
    function: &'a str,
    /// debug file number (1-based index into the debug file table)
    file: usize,
    line: usize,
}
impl<'a> Profiler<'a> {
    /// Build a profiler from the preprocessed program information:
    /// the debug file table, the pc of each function label and the pc of
    /// each `.debug loc` directive.
    pub fn new(
        options: ProfilerOptions,
        debug_files: &'a [(&'a str, &'a str)],
        function_begin: BTreeMap<usize, &'a str>,
        location_begin: BTreeMap<usize, (usize, usize)>,
    ) -> Self {
        Profiler {
            options,
            debug_files,
            function_begin,
            location_begin,
            call_stack: Default::default(),
            return_pc_stack: Default::default(),
            location_stats: Default::default(),
            call_stats: Default::default(),
            folded_stack_stats: Default::default(),
        }
    }

    /// Write the collected statistics to `path` in callgrind format:
    /// one `fn=` section per function, with per-line self costs followed by
    /// the call edges (`cfn=`/`calls=`) originating from that function.
    pub fn write_callgrind<P: AsRef<Path>>(&self, path: P) {
        log::info!("Writing callgrind data to {:?}", path.as_ref());
        let file = File::create(path).unwrap();
        let mut w = BufWriter::new(file);
        // header: all costs below are instruction counts
        writeln!(&mut w, "events: Instructions\n").unwrap();
        for func in self.function_begin.values() {
            // per-line self cost of this function, sorted by (file, line)
            let loc_stats: Vec<_> = self
                .location_stats
                .iter()
                .filter_map(|(loc, cost)| {
                    if &loc.function == func {
                        Some((loc.file, loc.line, cost))
                    } else {
                        None
                    }
                })
                .sorted()
                .collect();
            // call edges whose call site is inside this function
            let call_stats: Vec<_> = self
                .call_stats
                .iter()
                .filter_map(|(call, (count, cost))| {
                    if &call.from.function == func {
                        Some((call, count, cost))
                    } else {
                        None
                    }
                })
                .collect();
            // skip functions that never executed
            if loc_stats.is_empty() && call_stats.is_empty() {
                continue;
            }
            writeln!(w, "fn={}", format_function_name(func)).unwrap();
            // `fl=`/`cfi=` stay in effect until changed, so only emit them
            // when the current file actually changes
            let mut curr_file = None;
            for (file_nr, line, cost) in loc_stats {
                if Some(file_nr) != curr_file {
                    curr_file = Some(file_nr);
                    // debug file numbers are 1-based
                    let file = self.debug_files[file_nr - 1];
                    writeln!(w, "fl={}/{}", file.0, file.1).unwrap();
                }
                writeln!(w, "{line} {cost}").unwrap();
            }
            for (call, count, cost) in call_stats {
                let target_file_nr = call.target.file;
                if Some(target_file_nr) != curr_file {
                    curr_file = Some(target_file_nr);
                    let file = self.debug_files[target_file_nr - 1];
                    writeln!(w, "cfi={}/{}", file.0, file.1).unwrap();
                }
                writeln!(w, "cfn={}", format_function_name(call.target.function)).unwrap();
                writeln!(w, "calls={count} {}", call.target.line).unwrap();
                // cost line: call-site line followed by the call's cumulative cost
                writeln!(w, "{} {cost}", call.from.line).unwrap();
            }
            writeln!(w).unwrap();
        }
    }

    /// Render the folded stack samples as a flamegraph SVG at `path`.
    pub fn write_flamegraph<P: AsRef<Path>>(&self, path: P) {
        log::info!("Writing flamegraph to {:?}", path.as_ref());
        // folded stack format expected by inferno: "func_a;func_b;func_c count"
        let lines: Vec<_> = self
            .folded_stack_stats
            .iter()
            .map(|(stack, count)| {
                let stack = stack
                    .iter()
                    .map(|function| format_function_name(function))
                    .join(";");
                format!("{stack} {count}")
            })
            .collect();
        let mut options = Default::default();
        let file = File::create(path).unwrap();
        let w = BufWriter::new(file);
        inferno::flamegraph::from_lines(&mut options, lines.iter().map(|s| s.as_str()), w).unwrap();
    }

    /// calculate totals and write out results
    /// (files go to "[output_directory]/[file_stem or "out"].{svg,callgrind}")
    pub fn finish(&mut self) {
        let mut path = PathBuf::from(&self.options.output_directory)
            .join(self.options.file_stem.as_deref().unwrap_or("out"));
        if self.options.flamegraph {
            path.set_extension("svg");
            self.write_flamegraph(&path);
        }
        if self.options.callgrind {
            path.set_extension("callgrind");
            self.write_callgrind(&path);
        }
    }

    /// profiling only starts once "__runtime_start" is reached
    pub fn is_running(&self) -> bool {
        !self.call_stack.is_empty()
    }

    /// function at the top of the call stack
    pub fn curr_function(&self) -> Option<&'a str> {
        self.call_stack.last().map(|(c, _)| c.target.function)
    }

    /// get the function name and source location for a given pc value,
    /// or `None` if `pc` precedes the first known function label.
    ///
    /// NOTE(review): the function and the `.debug loc` are looked up
    /// independently (each as the closest entry at or before `pc`), so for a
    /// pc between a function label and that function's first `.debug loc` the
    /// file/line may come from the preceding function — confirm intended.
    pub fn location_at(&self, pc: usize) -> Option<Loc<'a>> {
        self.function_begin
            .range(..=pc)
            .last()
            .and_then(|(_, function)| {
                self.location_begin
                    .range(..=pc)
                    .last()
                    .map(|(_, (file, line))| Loc {
                        function,
                        file: *file,
                        line: *line,
                    })
            })
    }

    /// add cost for instruction/row
    pub fn add_instruction_cost(&mut self, curr_pc: usize) {
        // costs are only tracked once profiling has started
        if !self.is_running() {
            return;
        }
        // add cost to current location. AFAIU need the function name from the call stack to handle inlining
        let function = self.curr_function().unwrap();
        let Loc { file, line, .. } = self.location_at(curr_pc).unwrap();
        *self
            .location_stats
            .entry(Loc {
                function,
                file,
                line,
            })
            .or_default() += 1;
        // add cost to current call
        self.call_stack.last_mut().unwrap().1 += 1;
        // add sample to folded stacks
        let stack: Vec<_> = self
            .call_stack
            .iter()
            .map(|(call, _)| call.target.function)
            .collect();
        *self.folded_stack_stats.entry(stack).or_default() += 1;
    }

    /// Should be called for instructions that jump and save the returning address in an actual RISC-V register.
    /// This is handled as a "call" into a function.
    pub fn jump_and_link(&mut self, curr_pc: usize, target_pc: usize, return_pc: usize) {
        if let Some(mut target) = self.location_at(target_pc) {
            if let Some(curr_function) = self.curr_function() {
                let Loc {
                    file: curr_file,
                    line: curr_line,
                    ..
                } = self.location_at(curr_pc).unwrap();
                // ecall handler code have a ".debug loc", so we keep current file/line
                if target.function == "__ecall_handler" {
                    target.file = curr_file;
                    target.line = curr_line;
                }
                let call = Call {
                    from: Loc {
                        function: curr_function,
                        file: curr_file,
                        line: curr_line,
                    },
                    target,
                };
                // increase call count
                self.call_stats.entry(call.clone()).or_default().0 += 1;
                // new stack entry starts with zero running cost
                self.call_stack.push((call, 0));
                // remember where this call should return to, so `jump` can
                // distinguish a return from a tail call
                self.return_pc_stack.push(return_pc);
            } else {
                // we start profiling on the initial call to "__runtime_start";
                // calls seen before that (with an empty stack) are ignored
                if target.function == "__runtime_start" {
                    let call = Call {
                        from: Loc {
                            // synthetic root: there is no caller location yet
                            function: "",
                            file: 0,
                            line: 0,
                        },
                        target,
                    };
                    // increase call count
                    self.call_stats.entry(call.clone()).or_default().0 += 1;
                    self.call_stack.push((call, 0));
                    self.return_pc_stack.push(return_pc);
                }
            }
        } else {
            // no known function at the target: only acceptable before profiling starts
            assert!(!self.is_running());
        }
    }

    /// Should be called for jumps that don't save the returning address.
    /// This is handled as one of 3 cases:
    /// - "return" from function: target_pc equal to last `jump_and_link` saved pc
    /// - "tail call": next_function != current_function
    /// - control flow: next_function == current_function
    pub fn jump(&mut self, target_pc: usize) {
        if !self.is_running() {
            return;
        }
        if self
            .return_pc_stack
            .last()
            .is_some_and(|saved_pc| *saved_pc == target_pc)
        {
            // "return" from current function
            let (done_call, cost) = self.call_stack.pop().unwrap();
            self.return_pc_stack.pop();
            // add to cumulative cost of call and to running cost of caller
            // NOTE(review): if this was the outermost call, its cumulative
            // cost is never recorded in `call_stats` — confirm intended
            if let Some((_curr_call, curr_cost)) = self.call_stack.last_mut() {
                self.call_stats.get_mut(&done_call).unwrap().1 += cost;
                *curr_cost += cost;
            }
        } else {
            let target = self.location_at(target_pc).unwrap();
            let curr_function = self.curr_function().unwrap();
            if target.function != curr_function {
                // "tail call": replace the current call in the stack
                // (the saved return pc stays: the tail callee returns for us)
                let (done_call, cost) = self.call_stack.pop().unwrap();
                // add to cumulative cost of call and to running cost of caller
                if let Some((_curr_call, curr_cost)) = self.call_stack.last_mut() {
                    self.call_stats.get_mut(&done_call).unwrap().1 += cost;
                    *curr_cost += cost;
                }
                // push new call.
                // here we keep the origin of the current call as the origin of the tail call replacing it
                let new_call = Call {
                    from: done_call.from,
                    target,
                };
                self.call_stats.entry(new_call.clone()).or_default().0 += 1;
                self.call_stack.push((new_call, 0));
            } else {
                // "control flow" (or "tail call" to self, if that is a thing), don't think this needs special handling
            }
        }
    }
}
fn format_function_name(name: &str) -> String {
if let Some(prefix) = name.find("___ZN") {
format!("{}", demangle(&name[prefix + 2..]))
} else {
format!("{}", demangle(name))
}
}

View File

@@ -9,7 +9,7 @@ use powdr_ast::{
};
use powdr_number::FieldElement;
use powdr_pipeline::Pipeline;
use powdr_riscv_executor::{get_main_machine, Elem, ExecutionTrace, MemoryState};
use powdr_riscv_executor::{get_main_machine, Elem, ExecutionTrace, MemoryState, ProfilerOptions};
pub mod bootloader;
mod memory_merkle_tree;
@@ -206,6 +206,7 @@ pub fn load_initial_memory(program: &AnalysisASMFile) -> MemoryState {
/// - The number of rows after which the prover should jump to the shutdown routine.
pub fn rust_continuations_dry_run<F: FieldElement>(
pipeline: &mut Pipeline<F>,
profiler_opt: Option<ProfilerOptions>,
) -> Vec<(Vec<F>, u64)> {
// All inputs for all chunks.
let mut bootloader_inputs_and_num_rows = vec![];
@@ -242,6 +243,7 @@ pub fn rust_continuations_dry_run<F: FieldElement>(
&default_input(&[]),
usize::MAX,
powdr_riscv_executor::ExecMode::Trace,
profiler_opt,
)
.0;
(transposed_trace::<F>(&trace), trace.mem_ops)
@@ -342,6 +344,8 @@ pub fn rust_continuations_dry_run<F: FieldElement>(
&bootloader_inputs,
num_rows,
powdr_riscv_executor::ExecMode::Trace,
// profiling was done when full trace was generated
None,
);
(transposed_trace(&trace), memory_snapshot_update)
};

View File

@@ -31,6 +31,7 @@ pub fn verify_riscv_asm_string<S: serde::Serialize + Send + Sync + 'static>(
&[],
usize::MAX,
powdr_riscv_executor::ExecMode::Fast,
Default::default(),
);
verify_pipeline(pipeline, backend).unwrap();
}

View File

@@ -40,7 +40,7 @@ pub fn test_continuations(case: &str) {
Ok(())
};
let bootloader_inputs = rust_continuations_dry_run(&mut pipeline);
let bootloader_inputs = rust_continuations_dry_run(&mut pipeline, Default::default());
rust_continuations(pipeline, pipeline_callback, bootloader_inputs).unwrap();
}
@@ -272,7 +272,7 @@ fn many_chunks_dry() {
let mut pipeline = Pipeline::default()
.from_asm_string(powdr_asm, Some(PathBuf::from(case)))
.with_prover_inputs(Default::default());
rust_continuations_dry_run::<GoldilocksField>(&mut pipeline);
rust_continuations_dry_run::<GoldilocksField>(&mut pipeline, Default::default());
}
use serde::{Deserialize, Serialize};