diff --git a/cli-rs/src/main.rs b/cli-rs/src/main.rs index ec2897b23..a5de43653 100644 --- a/cli-rs/src/main.rs +++ b/cli-rs/src/main.rs @@ -9,7 +9,9 @@ use log::LevelFilter; use powdr_number::{BigUint, Bn254Field, FieldElement, GoldilocksField}; use powdr_pipeline::Pipeline; +use powdr_riscv_executor::ProfilerOptions; +use std::ffi::OsStr; use std::io; use std::{borrow::Cow, io::Write, path::Path}; use strum::{Display, EnumString, EnumVariantNames}; @@ -122,6 +124,16 @@ enum Commands { #[arg(short, long)] #[arg(default_value_t = false)] witness: bool, + + /// Generate a flamegraph plot of the execution ("[file].svg") + #[arg(long)] + #[arg(default_value_t = false)] + generate_flamegraph: bool, + + /// Generate callgrind file of the execution ("[file].callgrind") + #[arg(long)] + #[arg(default_value_t = false)] + generate_callgrind: bool, }, } @@ -220,13 +232,29 @@ fn run_command(command: Commands) { output_directory, continuations, witness, + generate_flamegraph, + generate_callgrind, } => { + let profiling = if generate_callgrind || generate_flamegraph { + Some(ProfilerOptions { + file_stem: Path::new(&file) + .file_stem() + .and_then(OsStr::to_str) + .map(String::from), + output_directory: output_directory.clone(), + flamegraph: generate_flamegraph, + callgrind: generate_callgrind, + }) + } else { + None + }; call_with_field!(execute::( Path::new(&file), split_inputs(&inputs), Path::new(&output_directory), continuations, - witness + witness, + profiling )) } }; @@ -297,6 +325,7 @@ fn execute( output_dir: &Path, continuations: bool, witness: bool, + profiling: Option, ) -> Result<(), Vec> { let mut pipeline = Pipeline::::default() .from_file(file_name.to_path_buf()) @@ -304,7 +333,7 @@ fn execute( let bootloader_inputs = if continuations { pipeline = pipeline.with_prover_inputs(inputs.clone()); - powdr_riscv::continuations::rust_continuations_dry_run(&mut pipeline) + powdr_riscv::continuations::rust_continuations_dry_run(&mut pipeline, profiling.clone()) } else { vec![] }; @@ -327,6 +356,7 @@ fn execute( pipeline.data_callback().unwrap(), &[], powdr_riscv_executor::ExecMode::Fast, + profiling, ); log::info!("Execution trace length: {}", trace.len); } diff --git a/riscv-executor/Cargo.toml b/riscv-executor/Cargo.toml index febb1cbac..fb71b1434 100644 --- a/riscv-executor/Cargo.toml +++ b/riscv-executor/Cargo.toml @@ -22,5 +22,8 @@ num-bigint = "0.4.3" num-traits = "0.2.15" k256 = { version = "0.13.3", features = [] } +rustc-demangle = "0.1" +inferno = "0.11.19" + [lints.clippy] uninlined_format_args = "deny" diff --git a/riscv-executor/src/lib.rs b/riscv-executor/src/lib.rs index 4f43794a9..b01359cd5 100644 --- a/riscv-executor/src/lib.rs +++ b/riscv-executor/src/lib.rs @@ -10,7 +10,7 @@ //! from execution. use std::{ - collections::HashMap, + collections::{BTreeMap, HashMap}, fmt::{self, Display, Formatter}, io, }; @@ -28,9 +28,13 @@ use powdr_ast::{ }; use powdr_number::{FieldElement, LargeInt}; use powdr_riscv_syscalls::SYSCALL_REGISTERS; +pub use profiler::ProfilerOptions; pub mod arith; pub mod poseidon_gl; +mod profiler; + +use crate::profiler::Profiler; /// Initial value of the PC. /// @@ -492,10 +496,18 @@ pub fn get_main_machine(program: &AnalysisASMFile) -> &Machine { } struct PreprocessedMain<'a, T: FieldElement> { + /// list of all statements (batches expanded) statements: Vec<&'a FunctionStatement>, + /// label to batch number label_map: HashMap<&'a str, Elem>, + /// batch number to its first statement idx batch_to_line_map: Vec, + /// file number to (dir,name) debug_files: Vec<(&'a str, &'a str)>, + /// function label to batch number + function_starts: BTreeMap, + /// .debug loc to batch number + location_starts: BTreeMap, } /// Returns the list of instructions, directly indexable by PC, the map from @@ -511,6 +523,8 @@ fn preprocess_main_function(machine: &Machine) -> PreprocessedM let mut label_map = HashMap::new(); let mut batch_to_line_map = vec![0; PC_INITIAL_VAL]; let mut debug_files = Vec::new(); + let mut function_starts = BTreeMap::new(); + let mut location_starts = BTreeMap::new(); for (batch_idx, batch) in orig_statements.iter_batches().enumerate() { batch_to_line_map.push(statements.len() as u32); @@ -531,7 +545,11 @@ fn preprocess_main_function(machine: &Machine) -> PreprocessedM assert_eq!(*idx, debug_files.len() + 1); debug_files.push((dir.as_str(), file.as_str())); } - DebugDirective::Loc(_, _, _) | DebugDirective::OriginalInstruction(_) => { + DebugDirective::Loc(file, line, _) => { + location_starts.insert(batch_idx + PC_INITIAL_VAL, (*file, *line)); + statements.push(s); + } + DebugDirective::OriginalInstruction(_) => { // keep debug locs for debugging purposes statements.push(s); } @@ -541,6 +559,10 @@ fn preprocess_main_function(machine: &Machine) -> PreprocessedM // assert there are no statements in the middle of a block assert!(!statement_seen); label_map.insert(name.as_str(), (batch_idx + PC_INITIAL_VAL).into()); + // TODO: would looking for "___dot_Lfunc_begin" be less hacky? would require more work to handle ecalls though... + if !name.contains("___dot_L") { + function_starts.insert(batch_idx + PC_INITIAL_VAL, name.as_str()); + } } } } @@ -555,6 +577,8 @@ fn preprocess_main_function(machine: &Machine) -> PreprocessedM label_map, batch_to_line_map, debug_files, + function_starts, + location_starts, } } @@ -979,6 +1003,16 @@ impl<'a, 'b, F: FieldElement> Executor<'a, 'b, F> { } } +/// return true if the expression is a jump instruction +fn is_jump(e: &Expression) -> bool { + if let Expression::FunctionCall(_, FunctionCall { function, .. }) = e { + if let Expression::Reference(_, f) = function.as_ref() { + return ["jump", "jump_dyn"].contains(&f.try_to_identifier().unwrap().as_str()); + } + } + false +} + pub fn execute_ast( program: &AnalysisASMFile, initial_memory: MemoryState, @@ -986,6 +1020,7 @@ pub fn execute_ast( bootloader_inputs: &[Elem], max_steps_to_execute: usize, mode: ExecMode, + profiling: Option, ) -> (ExecutionTrace, MemoryState) { let main_machine = get_main_machine(program); let PreprocessedMain { @@ -993,6 +1028,8 @@ pub fn execute_ast( label_map, batch_to_line_map, debug_files, + function_starts, + location_starts, } = preprocess_main_function(main_machine); let proc = match TraceBuilder::<'_, T>::new( @@ -1014,6 +1051,9 @@ pub fn execute_ast( _stdout: io::stdout(), }; + let mut profiler = + profiling.map(|opt| Profiler::new(opt, &debug_files[..], function_starts, location_starts)); + let mut curr_pc = 0u32; loop { let stm = statements[curr_pc as usize]; @@ -1022,13 +1062,39 @@ pub fn execute_ast( match stm { FunctionStatement::Assignment(a) => { + if let Some(p) = &mut profiler { + p.add_instruction_cost(e.proc.get_pc().u() as usize); + } + + let pc_before = e.proc.get_reg("pc").u() as usize; + let results = e.eval_expression(a.rhs.as_ref()); assert_eq!(a.lhs_with_reg.len(), results.len()); + + let pc_after = e.proc.get_reg("pc").u() as usize; + + if is_jump(a.rhs.as_ref()) { + let pc_return = results[0].u() as usize; + assert_eq!(a.lhs_with_reg.len(), 1); + if let Some(p) = &mut profiler { + // in the generated powdr asm, writing to `tmp1` means the returning pc is ignored + if a.lhs_with_reg[0].0 == "tmp1" { + p.jump(pc_after); + } else { + p.jump_and_link(pc_before, pc_after, pc_return); + } + } + } + for ((dest, _), val) in a.lhs_with_reg.iter().zip(results) { e.proc.set_reg(dest, val); } } FunctionStatement::Instruction(i) => { + assert!(!["jump", "jump_dyn"].contains(&i.instruction.as_str())); + if let Some(p) = &mut profiler { + p.add_instruction_cost(e.proc.get_pc().u() as usize); + } e.exec_instruction(&i.instruction, &i.inputs); } FunctionStatement::Return(_) => break, @@ -1055,6 +1121,9 @@ pub fn execute_ast( }; } + if let Some(mut p) = profiler { + p.finish(); + } e.proc.finish() } @@ -1073,6 +1142,7 @@ pub fn execute( inputs: &Callback, bootloader_inputs: &[Elem], mode: ExecMode, + profiling: Option, ) -> (ExecutionTrace, MemoryState) { log::info!("Parsing..."); let parsed = powdr_parser::parse_asm(None, asm_source).unwrap(); @@ -1089,6 +1159,7 @@ pub fn execute( bootloader_inputs, usize::MAX, mode, + profiling, ) } diff --git a/riscv-executor/src/profiler.rs b/riscv-executor/src/profiler.rs new file mode 100644 index 000000000..32968451e --- /dev/null +++ b/riscv-executor/src/profiler.rs @@ -0,0 +1,333 @@ +use std::{ + collections::BTreeMap, + fs::File, + io::BufWriter, + io::Write, + path::{Path, PathBuf}, +}; + +use itertools::Itertools; + +use rustc_demangle::demangle; + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] +pub struct Call<'a> { + from: Loc<'a>, + target: Loc<'a>, +} + +/// RISC-V asm profiler. +/// Tracks the self-cost of functions and the cumulative cost of specific function calls (i.e., callgrind style). +pub struct Profiler<'a> { + /// profiling options + options: ProfilerOptions, + /// file number to (dir,file) + debug_files: &'a [(&'a str, &'a str)], + /// pc value of function beginnings + function_begin: BTreeMap, + /// pc value of .debug loc statements + location_begin: BTreeMap, + /// current call stack, entries include running cost + call_stack: Vec<(Call<'a>, usize)>, + /// saved return address of "jump and link" instructions + return_pc_stack: Vec, + /// cost of each location + location_stats: BTreeMap, usize>, + /// (count, cumulative cost) of calls + call_stats: BTreeMap, (usize, usize)>, + /// stack sampling format for FlameGraph + folded_stack_stats: BTreeMap, usize>, +} + +#[derive(Default, Clone)] +pub struct ProfilerOptions { + pub output_directory: String, + pub file_stem: Option, + pub flamegraph: bool, + pub callgrind: bool, +} + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] +pub struct Loc<'a> { + function: &'a str, + file: usize, + line: usize, +} + +impl<'a> Profiler<'a> { + pub fn new( + options: ProfilerOptions, + debug_files: &'a [(&'a str, &'a str)], + function_begin: BTreeMap, + location_begin: BTreeMap, + ) -> Self { + Profiler { + options, + debug_files, + function_begin, + location_begin, + call_stack: Default::default(), + return_pc_stack: Default::default(), + location_stats: Default::default(), + call_stats: Default::default(), + folded_stack_stats: Default::default(), + } + } + + pub fn write_callgrind>(&self, path: P) { + log::info!("Writing callgrind data to {:?}", path.as_ref()); + let file = File::create(path).unwrap(); + let mut w = BufWriter::new(file); + writeln!(&mut w, "events: Instructions\n").unwrap(); + for func in self.function_begin.values() { + let loc_stats: Vec<_> = self + .location_stats + .iter() + .filter_map(|(loc, cost)| { + if &loc.function == func { + Some((loc.file, loc.line, cost)) + } else { + None + } + }) + .sorted() + .collect(); + let call_stats: Vec<_> = self + .call_stats + .iter() + .filter_map(|(call, (count, cost))| { + if &call.from.function == func { + Some((call, count, cost)) + } else { + None + } + }) + .collect(); + + if loc_stats.is_empty() && call_stats.is_empty() { + continue; + } + + writeln!(w, "fn={}", format_function_name(func)).unwrap(); + + let mut curr_file = None; + for (file_nr, line, cost) in loc_stats { + if Some(file_nr) != curr_file { + curr_file = Some(file_nr); + let file = self.debug_files[file_nr - 1]; + writeln!(w, "fl={}/{}", file.0, file.1).unwrap(); + } + writeln!(w, "{line} {cost}").unwrap(); + } + for (call, count, cost) in call_stats { + let target_file_nr = call.target.file; + if Some(target_file_nr) != curr_file { + curr_file = Some(target_file_nr); + let file = self.debug_files[target_file_nr - 1]; + writeln!(w, "cfi={}/{}", file.0, file.1).unwrap(); + } + writeln!(w, "cfn={}", format_function_name(call.target.function)).unwrap(); + writeln!(w, "calls={count} {}", call.target.line).unwrap(); + writeln!(w, "{} {cost}", call.from.line).unwrap(); + } + writeln!(w).unwrap(); + } + } + + pub fn write_flamegraph>(&self, path: P) { + log::info!("Writing flamegraph to {:?}", path.as_ref()); + let lines: Vec<_> = self + .folded_stack_stats + .iter() + .map(|(stack, count)| { + let stack = stack + .iter() + .map(|function| format_function_name(function)) + .join(";"); + format!("{stack} {count}") + }) + .collect(); + let mut options = Default::default(); + let file = File::create(path).unwrap(); + let w = BufWriter::new(file); + inferno::flamegraph::from_lines(&mut options, lines.iter().map(|s| s.as_str()), w).unwrap(); + } + + /// calculate totals and write out results + pub fn finish(&mut self) { + let mut path = PathBuf::from(&self.options.output_directory) + .join(self.options.file_stem.as_deref().unwrap_or("out")); + if self.options.flamegraph { + path.set_extension("svg"); + self.write_flamegraph(&path); + } + if self.options.callgrind { + path.set_extension("callgrind"); + self.write_callgrind(&path); + } + } + + /// profiling only starts once "__runtime_start" is reached + pub fn is_running(&self) -> bool { + !self.call_stack.is_empty() + } + + /// function at the top of the call stack + pub fn curr_function(&self) -> Option<&'a str> { + self.call_stack.last().map(|(c, _)| c.target.function) + } + + /// get the function name and source location for a given pc value + pub fn location_at(&self, pc: usize) -> Option> { + self.function_begin + .range(..=pc) + .last() + .and_then(|(_, function)| { + self.location_begin + .range(..=pc) + .last() + .map(|(_, (file, line))| Loc { + function, + file: *file, + line: *line, + }) + }) + } + + /// add cost for instruction/row + pub fn add_instruction_cost(&mut self, curr_pc: usize) { + if !self.is_running() { + return; + } + + // add cost to current location. AFAIU need the function name from the call stack to handle inlining + let function = self.curr_function().unwrap(); + let Loc { file, line, .. } = self.location_at(curr_pc).unwrap(); + *self + .location_stats + .entry(Loc { + function, + file, + line, + }) + .or_default() += 1; + + // add cost to current call + self.call_stack.last_mut().unwrap().1 += 1; + + // add sample to folded stacks + let stack: Vec<_> = self + .call_stack + .iter() + .map(|(call, _)| call.target.function) + .collect(); + *self.folded_stack_stats.entry(stack).or_default() += 1; + } + + /// Should be called for instructions that jump and save the returning address in an actual RISC-V register. + /// This is handled as a "call" into a function. + pub fn jump_and_link(&mut self, curr_pc: usize, target_pc: usize, return_pc: usize) { + if let Some(mut target) = self.location_at(target_pc) { + if let Some(curr_function) = self.curr_function() { + let Loc { + file: curr_file, + line: curr_line, + .. + } = self.location_at(curr_pc).unwrap(); + // ecall handler code have a ".debug loc", so we keep current file/line + if target.function == "__ecall_handler" { + target.file = curr_file; + target.line = curr_line; + } + let call = Call { + from: Loc { + function: curr_function, + file: curr_file, + line: curr_line, + }, + target, + }; + // increase call count + self.call_stats.entry(call.clone()).or_default().0 += 1; + self.call_stack.push((call, 0)); + self.return_pc_stack.push(return_pc); + } else { + // we start profiling on the initial call to "__runtime_start" + if target.function == "__runtime_start" { + let call = Call { + from: Loc { + function: "", + file: 0, + line: 0, + }, + target, + }; + // increase call count + self.call_stats.entry(call.clone()).or_default().0 += 1; + self.call_stack.push((call, 0)); + self.return_pc_stack.push(return_pc); + } + } + } else { + assert!(!self.is_running()); + } + } + + /// Should be called for jumps that don't save the returning address. + /// This is handled as one of 3 cases: + /// - "return" from function: target_pc equal to last `jump_and_link` saved pc + /// - "tail call": next_function != current_function + /// - control flow: next_function == current_function + pub fn jump(&mut self, target_pc: usize) { + if !self.is_running() { + return; + } + + if self + .return_pc_stack + .last() + .is_some_and(|saved_pc| *saved_pc == target_pc) + { + // "return" from current function + let (done_call, cost) = self.call_stack.pop().unwrap(); + self.return_pc_stack.pop(); + // add to cumulative cost of call and to running cost of caller + if let Some((_curr_call, curr_cost)) = self.call_stack.last_mut() { + self.call_stats.get_mut(&done_call).unwrap().1 += cost; + *curr_cost += cost; + } + } else { + let target = self.location_at(target_pc).unwrap(); + let curr_function = self.curr_function().unwrap(); + if target.function != curr_function { + // "tail call": replace the current call in the stack + let (done_call, cost) = self.call_stack.pop().unwrap(); + + // add to cumulative cost of call and to running cost of caller + if let Some((_curr_call, curr_cost)) = self.call_stack.last_mut() { + self.call_stats.get_mut(&done_call).unwrap().1 += cost; + *curr_cost += cost; + } + + // push new call. + // here we keep the origin of the current call as the origin of the tail call replacing it + let new_call = Call { + from: done_call.from, + target, + }; + self.call_stats.entry(new_call.clone()).or_default().0 += 1; + self.call_stack.push((new_call, 0)); + } else { + // "control flow" (or "tail call" to self, if that is a thing), don't think this needs special handling + } + } + } +} + +fn format_function_name(name: &str) -> String { + if let Some(prefix) = name.find("___ZN") { + format!("{}", demangle(&name[prefix + 2..])) + } else { + format!("{}", demangle(name)) + } +} diff --git a/riscv/src/continuations.rs b/riscv/src/continuations.rs index bf3000166..d912d69e8 100644 --- a/riscv/src/continuations.rs +++ b/riscv/src/continuations.rs @@ -9,7 +9,7 @@ use powdr_ast::{ }; use powdr_number::FieldElement; use powdr_pipeline::Pipeline; -use powdr_riscv_executor::{get_main_machine, Elem, ExecutionTrace, MemoryState}; +use powdr_riscv_executor::{get_main_machine, Elem, ExecutionTrace, MemoryState, ProfilerOptions}; pub mod bootloader; mod memory_merkle_tree; @@ -206,6 +206,7 @@ pub fn load_initial_memory(program: &AnalysisASMFile) -> MemoryState { /// - The number of rows after which the prover should jump to the shutdown routine. pub fn rust_continuations_dry_run( pipeline: &mut Pipeline, + profiler_opt: Option, ) -> Vec<(Vec, u64)> { // All inputs for all chunks. let mut bootloader_inputs_and_num_rows = vec![]; @@ -242,6 +243,7 @@ pub fn rust_continuations_dry_run( &default_input(&[]), usize::MAX, powdr_riscv_executor::ExecMode::Trace, + profiler_opt, ) .0; (transposed_trace::(&trace), trace.mem_ops) @@ -342,6 +344,8 @@ pub fn rust_continuations_dry_run( &bootloader_inputs, num_rows, powdr_riscv_executor::ExecMode::Trace, + // profiling was done when full trace was generated + None, ); (transposed_trace(&trace), memory_snapshot_update) }; diff --git a/riscv/tests/common/mod.rs b/riscv/tests/common/mod.rs index 96560d0c4..00f946fc8 100644 --- a/riscv/tests/common/mod.rs +++ b/riscv/tests/common/mod.rs @@ -31,6 +31,7 @@ pub fn verify_riscv_asm_string( &[], usize::MAX, powdr_riscv_executor::ExecMode::Fast, + Default::default(), ); verify_pipeline(pipeline, backend).unwrap(); } diff --git a/riscv/tests/riscv.rs b/riscv/tests/riscv.rs index 81a39f090..c283ff4f4 100644 --- a/riscv/tests/riscv.rs +++ b/riscv/tests/riscv.rs @@ -40,7 +40,7 @@ pub fn test_continuations(case: &str) { Ok(()) }; - let bootloader_inputs = rust_continuations_dry_run(&mut pipeline); + let bootloader_inputs = rust_continuations_dry_run(&mut pipeline, Default::default()); rust_continuations(pipeline, pipeline_callback, bootloader_inputs).unwrap(); } @@ -272,7 +272,7 @@ fn many_chunks_dry() { let mut pipeline = Pipeline::default() .from_asm_string(powdr_asm, Some(PathBuf::from(case))) .with_prover_inputs(Default::default()); - rust_continuations_dry_run::(&mut pipeline); + rust_continuations_dry_run::(&mut pipeline, Default::default()); } use serde::{Deserialize, Serialize};