initial support for label reading for rv64 (#3060)

This PR adds a module to the `riscv-elf` crate that collects labels, their addresses, and jump destinations from an RV64 ELF binary. It does not compute debug information or any of the other data that the current 32-bit module produces. --------- Co-authored-by: Steve Wang <qian.wang.wg24@wharton.upenn.edu>
2026-01-10 11:38:11 -05:00 · 2025-07-22 12:34:13 +02:00
parent 9a97fca7c6
commit e6ff8810b8
4 changed files with 512 additions and 0 deletions
--- a/riscv-elf/Cargo.toml
+++ b/riscv-elf/Cargo.toml
@@ -24,3 +24,7 @@ workspace = true

 [lib]
 bench = false # See https://github.com/bheisler/criterion.rs/issues/458
+
+[[bin]]
+name = "elf-labels"
+path = "src/bin/elf-labels.rs"
--- a/riscv-elf/src/bin/elf-labels.rs
+++ b/riscv-elf/src/bin/elf-labels.rs
@@ -0,0 +1,245 @@
+#![allow(clippy::print_stdout)]
+
+use goblin::elf::{
+    header::{EI_CLASS, ELFCLASS32, ELFCLASS64},
+    Elf,
+};
+use powdr_riscv_elf::{load_elf, rv64};
+use std::env;
+use std::fs;
+use std::panic;
+use std::path::Path;
+use std::process;
+
+/// Command-line entry point: prints label and jump-destination information
+/// for a RISC-V ELF file.
+///
+/// Expects exactly one argument, the path to the ELF file. The ELF class byte
+/// selects the analysis: 32-bit files are loaded with `load_elf`, 64-bit files
+/// are analyzed by the `rv64` module. The process exits with status 1 on a
+/// usage error, a missing/unreadable/unparsable file, an unsupported ELF
+/// class, or when the selected loader panics.
+fn main() {
+    let args: Vec<String> = env::args().collect();
+
+    if args.len() != 2 {
+        // `args` can be empty on some platforms, so do not index it blindly.
+        let program_name = args.first().map(String::as_str).unwrap_or("elf-labels");
+        eprintln!("Usage: {program_name} <elf-file>");
+        process::exit(1);
+    }
+
+    let elf_path = Path::new(&args[1]);
+
+    if !elf_path.exists() {
+        eprintln!("Error: File '{}' does not exist", elf_path.display());
+        process::exit(1);
+    }
+
+    // Read the file to check if it's 32-bit or 64-bit
+    let file_buffer = match fs::read(elf_path) {
+        Ok(buffer) => buffer,
+        Err(e) => {
+            eprintln!("Error reading file: {e}");
+            process::exit(1);
+        }
+    };
+
+    let elf = match Elf::parse(&file_buffer) {
+        Ok(elf) => elf,
+        Err(e) => {
+            eprintln!("Error parsing ELF header: {e}");
+            process::exit(1);
+        }
+    };
+
+    match elf.header.e_ident[EI_CLASS] {
+        ELFCLASS32 => {
+            // `load_elf` panics on errors, so we catch the unwind and report it.
+            let result = panic::catch_unwind(|| load_elf(elf_path));
+
+            match result {
+                Ok(program) => {
+                    println!(
+                        "RV32 ELF file analyzed successfully: {}",
+                        elf_path.display()
+                    );
+                    println!();
+                    print_elf_info_32(&program);
+                }
+                Err(_) => {
+                    eprintln!("Error loading RV32 ELF file: The file may be corrupted or not a valid RISC-V ELF");
+                    process::exit(1);
+                }
+            }
+        }
+        ELFCLASS64 => {
+            // The rv64 analysis panics on errors, so we catch the unwind and
+            // report it. The buffer read above is reused instead of reading
+            // the file from disk a second time.
+            let result =
+                panic::catch_unwind(|| rv64::compute_jumpdests_from_buffer(&file_buffer));
+
+            match result {
+                Ok(labels) => {
+                    println!(
+                        "RV64 ELF file analyzed successfully: {}",
+                        elf_path.display()
+                    );
+                    println!();
+                    print_elf_info_64(&labels);
+                }
+                Err(_) => {
+                    eprintln!("Error loading RV64 ELF file: The file may be corrupted or not a valid RISC-V ELF");
+                    process::exit(1);
+                }
+            }
+        }
+        _ => {
+            eprintln!("Unsupported ELF class");
+            process::exit(1);
+        }
+    }
+}
+
+/// Prints a report for a loaded 32-bit program.
+///
+/// The report lists every text label address, then counts how many of those
+/// addresses have a symbol in the debug info, lists the symbols that look
+/// like functions (name neither starts with `$` nor contains `.`), and
+/// finally prints the debug notes ordered by address.
+fn print_elf_info_32(program: &powdr_riscv_elf::ElfProgram) {
+    let text_labels = program.text_labels();
+
+    match text_labels.len() {
+        0 => println!("No text labels found in the ELF file."),
+        label_count => {
+            println!("Text labels found: {label_count}");
+            println!();
+            println!("{:<16}", "Address");
+            println!("{}", "-".repeat(16));
+
+            // The label collection iterates in ascending address order.
+            for address in text_labels {
+                println!("0x{address:08x}");
+            }
+        }
+    }
+
+    let debug_info = program.debug_info();
+    println!();
+    println!("Debug information:");
+
+    // The symbol table is only queried by address, so probe it once per
+    // text label and keep the hits.
+    let resolved: Vec<_> = text_labels
+        .iter()
+        .filter_map(|&addr| {
+            debug_info
+                .symbols
+                .try_get_one(addr)
+                .map(|name| (addr, name))
+        })
+        .collect();
+    let symbol_count = resolved.len();
+
+    // Heuristic for functions: the name doesn't start with $ or contain a dot.
+    let function_symbols: Vec<_> = resolved
+        .into_iter()
+        .filter(|(_, name)| !name.starts_with("$") && !name.contains("."))
+        .collect();
+
+    println!("  Symbols at text label addresses: {symbol_count}");
+    println!("  Function symbols: {}", function_symbols.len());
+
+    if !function_symbols.is_empty() {
+        println!();
+        println!("Function symbols:");
+        println!("{:<16} {:<40}", "Address", "Symbol");
+        println!("{}", "-".repeat(60));
+
+        for (address, name) in function_symbols {
+            println!("0x{address:08x}      {name}");
+        }
+    }
+
+    // Notes are optional; print them ordered by address when present.
+    if debug_info.notes.is_empty() {
+        return;
+    }
+
+    println!();
+    println!("Debug notes:");
+    let mut notes: Vec<_> = debug_info.notes.iter().collect();
+    notes.sort_by_key(|(addr, _)| *addr);
+
+    for (addr, note) in notes {
+        println!("0x{addr:08x}: {note}");
+    }
+}
+
+/// Prints a report for the labels collected from a 64-bit ELF file.
+///
+/// Always prints the entry point and PC base. When at least one jump
+/// destination exists it additionally prints:
+/// - every jump destination with the first symbol recorded for that address,
+/// - a summary with counts,
+/// - the symbols sorted by name,
+/// - the jump destinations that have no symbol, with the instructions that
+///   target them.
+fn print_elf_info_64(labels: &rv64::Rv64Labels) {
+    println!("Entry point: 0x{:016x}", labels.entry_point);
+    println!("PC base: 0x{:016x}", labels.pc_base);
+    println!();
+
+    if labels.jumpdests.is_empty() {
+        println!("No text labels or jump destinations found.");
+    } else {
+        println!(
+            "Text labels and jump destinations found: {}",
+            labels.jumpdests.len()
+        );
+        println!();
+
+        // Show all labels with symbols if available
+        println!("{:<20} {:<40}", "Address", "Symbol (if available)");
+        println!("{}", "-".repeat(60));
+
+        // Index the symbols by address once instead of scanning the whole
+        // list for every jump destination. `or_insert` keeps the first
+        // symbol seen for an address, matching a front-to-back search.
+        let mut first_symbol_at = std::collections::BTreeMap::new();
+        for (sym_addr, name) in &labels.symbols {
+            first_symbol_at.entry(*sym_addr).or_insert(name.as_str());
+        }
+
+        for &addr in &labels.jumpdests {
+            let symbol = first_symbol_at.get(&addr).copied().unwrap_or("");
+
+            println!("0x{addr:016x}  {symbol}");
+        }
+
+        // Summary of symbols
+        println!();
+        println!("Summary:");
+        println!("  Total labels/jumpdests: {}", labels.jumpdests.len());
+        println!("  Named symbols: {}", labels.symbols.len());
+        println!(
+            "  Jumpdests without symbols: {}",
+            labels.jumpdests_with_debug_info.len()
+        );
+
+        // Count function-like symbols: name doesn't start with $ or contain a dot.
+        let function_symbol_count = labels
+            .symbols
+            .iter()
+            .filter(|(_, name)| !name.starts_with("$") && !name.contains("."))
+            .count();
+
+        if function_symbol_count != 0 {
+            println!("  Function symbols: {function_symbol_count}");
+        }
+
+        // Show label to address map
+        println!();
+        println!("=== Label to Address Map ===");
+        println!("{:<40} {:<20}", "Label", "Address");
+        println!("{}", "-".repeat(60));
+
+        // Sort references by name; the stable sort keeps the original
+        // relative order of equal names without cloning any string.
+        let mut sorted_symbols: Vec<_> = labels.symbols.iter().collect();
+        sorted_symbols.sort_by(|a, b| a.1.cmp(&b.1));
+
+        for (addr, name) in sorted_symbols {
+            println!("{name:<40} 0x{addr:016x}");
+        }
+
+        // Show jumpdests that are not labels
+        println!();
+        println!("=== Jump Destinations Without Symbols ===");
+        println!(
+            "{:<20} {:<20} {:<40}",
+            "Target Address", "From Address", "Instruction"
+        );
+        println!("{}", "-".repeat(80));
+
+        // The map is keyed by target address and iterates in ascending order.
+        for (target_addr, sources) in &labels.jumpdests_with_debug_info {
+            for source in sources {
+                println!(
+                    "0x{:016x}  0x{:016x}  {}",
+                    target_addr, source.from_addr, source.instruction
+                );
+            }
+        }
+
+        println!();
+        println!("PC Base: 0x{:016x}", labels.pc_base);
+    }
+}
--- a/riscv-elf/src/lib.rs
+++ b/riscv-elf/src/lib.rs
@@ -26,6 +26,7 @@ use powdr_riscv_types::{
 };

 pub mod debug_info;
+pub mod rv64;

 use self::debug_info::{DebugInfo, SymbolTable};

--- a/riscv-elf/src/rv64.rs
+++ b/riscv-elf/src/rv64.rs
@@ -0,0 +1,262 @@
+use std::collections::BTreeSet;
+use std::fs;
+use std::path::Path;
+
+use goblin::elf::{
+    header::{EI_CLASS, EI_DATA, ELFCLASS64, ELFDATA2LSB, EM_RISCV},
+    Elf,
+};
+use raki::{decode::Decode, instruction::OpcodeKind as Op, Isa};
+
+/// One instruction that jumps or branches to a given destination.
+#[derive(Debug, Clone)]
+pub struct JumpDest {
+    /// Address of the instruction that performs the jump.
+    pub from_addr: u64,
+    /// Human-readable rendering of the jumping instruction.
+    pub instruction: String,
+}
+
+/// Minimal RV64 ELF program representation for label/jumpdest collection.
+#[derive(Debug, Clone)]
+pub struct Rv64Labels {
+    /// All text labels and jump destinations, in ascending address order.
+    pub jumpdests: BTreeSet<u64>,
+    /// Entry point address taken from the ELF header.
+    pub entry_point: u64,
+    /// Named symbols located in executable segments, as `(address, name)` pairs.
+    pub symbols: Vec<(u64, String)>,
+    /// Jump destinations that are not symbols (address -> source instructions).
+    pub jumpdests_with_debug_info: BTreeMap<u64, Vec<JumpDest>>,
+    /// PC base (lowest executable address).
+    pub pc_base: u64,
+}
+
+/// Reads the RV64 ELF file at `file_name` and collects its labels and jump
+/// destinations.
+///
+/// # Panics
+///
+/// Panics if the file cannot be read, or if
+/// [`compute_jumpdests_from_buffer`] panics on its contents.
+pub fn compute_jumpdests(file_name: &Path) -> Rv64Labels {
+    log::info!("Loading RV64 ELF file: {}", file_name.display());
+    let file_buffer = fs::read(file_name).unwrap_or_else(|e| {
+        panic!("Failed to read ELF file '{}': {e}", file_name.display())
+    });
+    compute_jumpdests_from_buffer(&file_buffer)
+}
+
+/// Collects labels and jump destinations from the bytes of an RV64 ELF file.
+///
+/// The result contains the entry point, every non-zero symbol address that
+/// lies inside an executable program header, and every jump/branch target
+/// found by scanning the executable segments.
+///
+/// # Panics
+///
+/// Panics if the buffer is not a parsable ELF file, if it is not a 64-bit
+/// little-endian RISC-V ELF, if an executable segment lies outside the
+/// buffer, or if the instruction scan hits bytes it cannot handle.
+pub fn compute_jumpdests_from_buffer(file_buffer: &[u8]) -> Rv64Labels {
+    let elf =
+        Elf::parse(file_buffer).unwrap_or_else(|e| panic!("Failed to parse ELF file: {e}"));
+
+    // Verify it's a 64-bit RISC-V ELF
+    assert_eq!(
+        elf.header.e_ident[EI_CLASS], ELFCLASS64,
+        "Only 64-bit ELF files are supported by rv64 module!"
+    );
+    assert_eq!(
+        elf.header.e_ident[EI_DATA], ELFDATA2LSB,
+        "Only little-endian ELF files are supported!"
+    );
+    assert_eq!(
+        elf.header.e_machine, EM_RISCV,
+        "Only RISC-V ELF files are supported!"
+    );
+
+    let mut jumpdests = BTreeSet::new();
+    let mut jumpdests_with_debug_info = BTreeMap::new();
+
+    // Add entry point
+    jumpdests.insert(elf.entry);
+
+    // Every later step only cares about executable program headers, so
+    // select them once.
+    let exec_segments: Vec<_> = elf
+        .program_headers
+        .iter()
+        .filter(|ph| ph.is_executable())
+        .collect();
+
+    // Find PC base (lowest executable address)
+    let pc_base = exec_segments
+        .iter()
+        .map(|ph| ph.p_vaddr)
+        .min()
+        .unwrap_or(0);
+
+    // Collect symbols that are in executable segments
+    let mut symbols = Vec::new();
+    let mut symbol_addrs = BTreeSet::new();
+    for sym in elf.syms.iter() {
+        let sym_addr = sym.st_value;
+        if sym_addr == 0 {
+            continue;
+        }
+
+        // Compare the offset into the segment against its size rather than
+        // computing `p_vaddr + p_memsz`, which could overflow for a crafted
+        // header.
+        let in_text = exec_segments
+            .iter()
+            .any(|ph| sym_addr >= ph.p_vaddr && sym_addr - ph.p_vaddr < ph.p_memsz);
+        if !in_text {
+            continue;
+        }
+
+        jumpdests.insert(sym_addr);
+        symbol_addrs.insert(sym_addr);
+        if let Some(name) = elf.strtab.get_at(sym.st_name) {
+            symbols.push((sym_addr, name.to_string()));
+        }
+    }
+
+    // Scan executable segments for jump destinations
+    for ph in &exec_segments {
+        // Resolve the segment's byte range with checked arithmetic so that a
+        // truncated or malformed file produces a descriptive panic.
+        let seg = usize::try_from(ph.p_offset)
+            .ok()
+            .zip(usize::try_from(ph.p_filesz).ok())
+            .and_then(|(start, len)| Some(start..start.checked_add(len)?))
+            .and_then(|range| file_buffer.get(range))
+            .unwrap_or_else(|| {
+                panic!(
+                    "Executable segment at 0x{:x} (file offset 0x{:x}, size 0x{:x}) lies outside the file",
+                    ph.p_vaddr, ph.p_offset, ph.p_filesz
+                )
+            });
+        scan_for_jump_targets(
+            ph.p_vaddr,
+            seg,
+            &mut jumpdests,
+            &mut jumpdests_with_debug_info,
+            &symbol_addrs,
+        );
+    }
+
+    Rv64Labels {
+        jumpdests,
+        entry_point: elf.entry,
+        symbols,
+        jumpdests_with_debug_info,
+        pc_base,
+    }
+}
+
+use std::collections::BTreeMap;
+
+/// Formats an optional register index as `x<N>`, or `?` when it is absent.
+fn reg_name<R: std::fmt::Display>(reg: Option<R>) -> String {
+    reg.map_or_else(|| "?".to_string(), |r| format!("x{r}"))
+}
+
+/// Records `target` as a jump destination and, when it is not a known label,
+/// remembers the jumping instruction (rendered lazily by `describe`).
+fn record_jump_target(
+    target: u64,
+    from_addr: u64,
+    describe: impl FnOnce() -> String,
+    jumpdests: &mut BTreeSet<u64>,
+    jumpdests_with_debug_info: &mut BTreeMap<u64, Vec<JumpDest>>,
+    label_addrs: &BTreeSet<u64>,
+) {
+    jumpdests.insert(target);
+
+    // Track non-symbol jumpdests
+    if !label_addrs.contains(&target) {
+        jumpdests_with_debug_info
+            .entry(target)
+            .or_default()
+            .push(JumpDest {
+                from_addr,
+                instruction: describe(),
+            });
+    }
+}
+
+/// Scans `data`, the contents of a segment loaded at `base_addr`, for
+/// statically resolvable jump targets.
+///
+/// Every four bytes are decoded as one RV64 instruction. Targets of `jal`,
+/// of the conditional branches, and of `auipc` + `jalr` pairs (where the
+/// `jalr` source register is the `auipc` destination) are inserted into
+/// `jumpdests`. Targets that are not in `label_addrs` are additionally
+/// recorded in `jumpdests_with_debug_info` together with the instruction
+/// that jumps there. A `jalr` that does not directly follow an `auipc` cannot
+/// be resolved statically and is only reported through `log::debug!`, unless
+/// it is the standard return `jalr x0, x1, 0`.
+///
+/// Trailing bytes that do not fill a whole 4-byte word are ignored.
+///
+/// # Panics
+///
+/// Panics if a word does not have the 32-bit instruction encoding (low two
+/// bits `0b11`, i.e. compressed instructions are not supported) or if a word
+/// cannot be decoded.
+fn scan_for_jump_targets(
+    base_addr: u64,
+    data: &[u8],
+    jumpdests: &mut BTreeSet<u64>,
+    jumpdests_with_debug_info: &mut BTreeMap<u64, Vec<JumpDest>>,
+    label_addrs: &BTreeSet<u64>,
+) {
+    let mut addr = base_addr;
+    let mut remaining = data;
+    let mut last_was_auipc = false;
+
+    while remaining.len() >= 4 {
+        // Assert that we have a 32-bit instruction.
+        assert!(
+            remaining[0] & 0b11 == 0b11,
+            "Unsupported non-32-bit (compressed) instruction at 0x{addr:x}"
+        );
+        let insn_bytes = u32::from_le_bytes(remaining[0..4].try_into().unwrap());
+        let insn = insn_bytes.decode(Isa::Rv64).unwrap_or_else(|_| {
+            panic!("Could not decode instruction 0x{insn_bytes:08x} at 0x{addr:x}")
+        });
+
+        // Check for jump/branch instructions
+        match insn.opc {
+            Op::JAL => {
+                // JAL has a PC-relative immediate. Wrapping arithmetic keeps
+                // the two's-complement result without tripping overflow
+                // checks on addresses near the ends of the address space.
+                if let Some(imm) = insn.imm {
+                    let target = (addr as i64).wrapping_add(imm as i64) as u64;
+                    record_jump_target(
+                        target,
+                        addr,
+                        || format!("jal {}, 0x{:x}", reg_name(insn.rd), target),
+                        jumpdests,
+                        jumpdests_with_debug_info,
+                        label_addrs,
+                    );
+                }
+            }
+            Op::BEQ | Op::BNE | Op::BLT | Op::BGE | Op::BLTU | Op::BGEU => {
+                // Conditional branches have PC-relative immediates
+                if let Some(imm) = insn.imm {
+                    let target = (addr as i64).wrapping_add(imm as i64) as u64;
+                    record_jump_target(
+                        target,
+                        addr,
+                        || {
+                            format!(
+                                "{} {}, {}, 0x{:x}",
+                                format!("{:?}", insn.opc).to_lowercase(),
+                                reg_name(insn.rs1),
+                                reg_name(insn.rs2),
+                                target
+                            )
+                        },
+                        jumpdests,
+                        jumpdests_with_debug_info,
+                        label_addrs,
+                    );
+                }
+            }
+            Op::AUIPC => {
+                // AUIPC is often followed by JALR for function calls and long jumps.
+                // In statically linked binaries, these usually target known symbols.
+                if let Some(next_bytes) = remaining.get(4..8) {
+                    let next_insn_bytes = u32::from_le_bytes(next_bytes.try_into().unwrap());
+                    if let Ok(next_insn) = next_insn_bytes.decode(Isa::Rv64) {
+                        if matches!(next_insn.opc, Op::JALR) && insn.rd == next_insn.rs1 {
+                            // This is an AUIPC+JALR pair
+                            if let (Some(auipc_imm), Some(jalr_imm)) = (insn.imm, next_insn.imm)
+                            {
+                                // NOTE(review): this assumes the decoder reports the
+                                // AUIPC immediate already shifted left by 12 bits;
+                                // confirm against the raki version in use.
+                                let target = (addr as i64)
+                                    .wrapping_add(auipc_imm as i64)
+                                    .wrapping_add(jalr_imm as i64)
+                                    as u64;
+                                record_jump_target(
+                                    target,
+                                    addr,
+                                    || format!("auipc+jalr -> 0x{target:x}"),
+                                    jumpdests,
+                                    jumpdests_with_debug_info,
+                                    label_addrs,
+                                );
+                            }
+                        }
+                    }
+                }
+            }
+            Op::JALR => {
+                // Only process if this JALR does not directly follow an AUIPC
+                if !last_was_auipc {
+                    // Standalone JALR without preceding AUIPC
+                    // These are dynamic jumps we can't resolve statically:
+                    // - Return instructions (jalr x0, x1, 0)
+                    // - Indirect calls through function pointers
+                    // - Computed jumps (switch statements, vtables)
+                    // We just note their existence for completeness
+                    let imm = insn.imm.unwrap_or(0);
+                    let is_standard_return =
+                        insn.rd == Some(0) && insn.rs1 == Some(1) && imm == 0;
+
+                    // Only log if it's not a standard return (jalr x0, x1, 0).
+                    // The register names are built only when actually needed.
+                    if !is_standard_return {
+                        let rd_str = reg_name(insn.rd);
+                        let rs1_str = reg_name(insn.rs1);
+                        log::debug!(
+                            "Note: Dynamic jump at 0x{addr:x}: jalr {rd_str}, {rs1_str}, {imm}",
+                        );
+                    }
+                }
+            }
+            _ => {}
+        }
+
+        // Update for next iteration
+        last_was_auipc = matches!(insn.opc, Op::AUIPC);
+        addr += 4;
+        remaining = &remaining[4..];
+    }
+}