diff --git a/README.md b/README.md index 466fd91..e849925 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ Candidates from all three signals are merged and scored. ELF-specific false-posi When the binary contains an `.eh_frame` section, resurgo parses its FDE (Frame Description Entry) records and uses their `initial_location` fields as a high-confidence function entry set. These addresses were written by the compiler - not inferred by heuristics - and are typically present in stripped ELF binaries where `.symtab` and `.debug_*` are long gone. -When `.eh_frame` is present it acts as an authoritative whitelist: disassembly candidates not covered by any FDE are dropped as noise, and FDE entries with no matching disassembly candidate are promoted directly. See [docs/CFI.md](docs/CFI.md). +The `EhFrameDetector` emits these addresses as candidates. The `EhFrameFilter` then retains only candidates confirmed by an FDE, dropping disassembly noise. See [docs/CFI.md](docs/CFI.md). ## Usage @@ -45,15 +45,15 @@ When `.eh_frame` is present it acts as an authoritative whitelist: disassembly c package main import ( + "debug/elf" "fmt" "log" - "os" "github.com/maxgio92/resurgo" ) func main() { - f, err := os.Open("./myapp") + f, err := elf.Open("./myapp") if err != nil { log.Fatal(err) } @@ -83,50 +83,46 @@ func main() { ### Raw bytes (format-agnostic) +For non-ELF binaries or raw memory dumps, use the lower-level primitives directly: + ```go -candidates, err := resurgo.DetectFunctions(data, 0x400000, resurgo.ArchAMD64) +prologues, err := resurgo.DetectPrologues(data, 0x400000, resurgo.ArchAMD64) +edges, err := resurgo.DetectCallSites(data, 0x400000, resurgo.ArchAMD64) ``` ## API Reference ```go -// DetectFunctions merges prologue, call-site, and boundary signals on raw -// machine code bytes. baseAddr is the virtual address of the first byte of -// code. arch selects architecture-specific detection logic. -func DetectFunctions(code []byte, baseAddr uint64, arch Arch) ([]FunctionCandidate, error) - -// DetectFunctionsFromELF parses an ELF binary, runs all detection signals, -// applies false-positive filters (PLT ranges, intra-function jump targets), -// and, when .eh_frame is present, uses CFI FDE entries as a whitelist. -// Architecture is inferred from the ELF header. opts may include WithFilters -// to replace the default filter pipeline. -func DetectFunctionsFromELF(r io.ReaderAt, opts ...Option) ([]FunctionCandidate, error) - -// WithFilters replaces the active filter pipeline. filters run in the order -// provided. Pass no arguments to disable all filters. +// DetectFunctionsFromELF runs all detectors then all filters against f and +// returns a deduplicated, sorted slice of function candidates. +// Architecture is inferred from the ELF header. +// opts may include WithDetectors or WithFilters to replace either pipeline. +func DetectFunctionsFromELF(f *elf.File, opts ...Option) ([]FunctionCandidate, error) + +// WithDetectors replaces the default detector pipeline. +// Detectors run in order; results are merged before filtering. +func WithDetectors(detectors ...CandidateDetector) Option + +// WithFilters replaces the default filter pipeline. +// Filters run in order. Pass no arguments to disable all filters. func WithFilters(filters ...CandidateFilter) Option +// Built-in detectors, enabled by default in the order listed: +var DisasmDetector CandidateDetector // prologue, call-site, and alignment-boundary detection +var EhFrameDetector CandidateDetector // emits candidates from .eh_frame FDE records + // Built-in filters, enabled by default in the order listed: -var PLTFilter CandidateFilter // removes PLT-section candidates var CETFilter CandidateFilter // drops non-ENDBR64 aligned entries on CET AMD64 binaries -var EhFrameFilter CandidateFilter // applies .eh_frame FDE whitelist +var EhFrameFilter CandidateFilter // retains only FDE-confirmed candidates +var PLTFilter CandidateFilter // removes PLT-section candidates (always last) // DetectPrologues scans raw machine code bytes for architecture-specific // function prologue patterns. Works on any binary format. func DetectPrologues(code []byte, baseAddr uint64, arch Arch) ([]Prologue, error) -// DetectProloguesFromELF parses an ELF binary and returns detected function -// prologues. Architecture is inferred from the ELF header. -func DetectProloguesFromELF(r io.ReaderAt) ([]Prologue, error) - // DetectCallSites scans raw machine code bytes for CALL and JMP instructions // and returns their resolved target addresses. Works on any binary format. func DetectCallSites(code []byte, baseAddr uint64, arch Arch) ([]CallSiteEdge, error) - -// DetectCallSitesFromELF parses an ELF binary and returns detected call sites, -// filtered to targets within the .text section. -// Architecture is inferred from the ELF header. -func DetectCallSitesFromELF(r io.ReaderAt) ([]CallSiteEdge, error) ``` Key types: @@ -157,40 +153,43 @@ type FunctionCandidate struct { ``` +------------------+ -| ELF Binary | -+------------------+ - | - v +| *elf.File | +------------------+ -| ELF Parser | (debug/elf) -+--------+---------+ | +-------------------------------+ | | v v +------------------+ +------------------+ -| Disassembler | | CFI Parser | +| DisasmDetector | | EhFrameDetector | | (.text bytes) | | (.eh_frame) | +---+---------+----+ +--------+---------+ | | | | v v v v +------+ +------+ +--------+ +------------------+ |Prolog| |Call | |Boundary| | FDE entry VAs | -|ues | |Sites | |Analysis| | (whitelist) | +|ues | |Sites | |Analysis| | (DetectionCFI) | +--+---+ +--+---+ +---+----+ +--------+---------+ | | | | - +--------+---------+ | - v | - +------------------+ | - | DetectFunctions | | - | (merge + score) | | - +--------+---------+ | - | | - v | - +------------------+ | - | FP Filters | <--------------+ - | PLT, anchor, | - | CFI whitelist | + +--------+---------+----------------+ + v + +------------------+ + | mergeCandidates | + | (dedup by addr) | + +--------+---------+ + | + v + +------------------+ + | CETFilter | drops non-ENDBR64 aligned entries (CET binaries) + +--------+---------+ + | + v + +------------------+ + | EhFrameFilter | retains only FDE-confirmed candidates + +--------+---------+ + | + v + +------------------+ + | PLTFilter | removes PLT-section candidates +--------+---------+ | v diff --git a/callsite.go b/callsite.go index 6aadf0b..2c962a1 100644 --- a/callsite.go +++ b/callsite.go @@ -1,9 +1,7 @@ package resurgo import ( - "debug/elf" "fmt" - "io" "golang.org/x/arch/arm64/arm64asm" "golang.org/x/arch/x86/x86asm" @@ -70,55 +68,6 @@ func DetectCallSites(code []byte, baseAddr uint64, arch Arch) ([]CallSiteEdge, e } } -// DetectCallSitesFromELF parses an ELF binary from the given reader, extracts -// the .text section, and returns detected call sites. -// The architecture is inferred from the ELF header. -func DetectCallSitesFromELF(r io.ReaderAt) ([]CallSiteEdge, error) { - f, err := elf.NewFile(r) - if err != nil { - return nil, fmt.Errorf("failed to parse ELF file: %w", err) - } - defer f.Close() - - textSec := f.Section(".text") - if textSec == nil { - return nil, fmt.Errorf("no .text section found") - } - - code, err := textSec.Data() - if err != nil && err != io.EOF { - return nil, fmt.Errorf("failed to read .text section: %w", err) - } - - var edges []CallSiteEdge - switch f.Machine { - case elf.EM_X86_64: - edges, err = detectCallSitesAMD64(code, textSec.Addr) - case elf.EM_AARCH64: - edges, err = detectCallSitesARM64(code, textSec.Addr) - default: - return nil, fmt.Errorf("unsupported ELF machine: %s", f.Machine) - } - - if err != nil { - return nil, err - } - - // Filter edges to only include targets within the .text section - filtered := make([]CallSiteEdge, 0, len(edges)) - textStart := textSec.Addr - textEnd := textSec.Addr + textSec.Size - for _, edge := range edges { - // Only include edges with resolvable targets within .text - if edge.Confidence != ConfidenceNone && - edge.TargetAddr >= textStart && - edge.TargetAddr < textEnd { - filtered = append(filtered, edge) - } - } - - return filtered, nil -} func detectCallSitesAMD64(code []byte, baseAddr uint64) ([]CallSiteEdge, error) { var result []CallSiteEdge diff --git a/callsite_test.go b/callsite_test.go index 6c72a37..6a78b27 100644 --- a/callsite_test.go +++ b/callsite_test.go @@ -2,6 +2,7 @@ package resurgo_test import ( "bytes" + "debug/elf" "os" "os/exec" "path/filepath" @@ -347,178 +348,6 @@ func TestDetectCallSites_UnsupportedArch(t *testing.T) { } } -func TestDetectCallSitesFromELF_Go(t *testing.T) { - tests := []struct { - name string - goarch string - minCalls int - }{ - { - name: "amd64", - goarch: "amd64", - minCalls: 1, - }, - { - name: "arm64", - goarch: "arm64", - minCalls: 1, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - binPath := filepath.Join(t.TempDir(), "demo-app") - args := []string{"build", "-o", binPath, "testdata/demo-app.go"} - - cmd := exec.Command("go", args...) - cmd.Env = append(os.Environ(), "CGO_ENABLED=0", "GOARCH="+tt.goarch) - if out, err := cmd.CombinedOutput(); err != nil { - t.Fatalf("failed to compile demo-app: %v\n%s", err, out) - } - - f, err := os.Open(binPath) - if err != nil { - t.Fatalf("failed to open compiled binary: %v", err) - } - defer f.Close() - - edges, err := resurgo.DetectCallSitesFromELF(f) - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - if len(edges) == 0 { - t.Fatal("expected at least one call site edge, got none") - } - - // Count by type - calls := 0 - jumps := 0 - for _, e := range edges { - if e.Type == resurgo.CallSiteCall { - calls++ - } else if e.Type == resurgo.CallSiteJump { - jumps++ - } - } - t.Logf("total edges: %d (calls: %d, jumps: %d)", len(edges), calls, jumps) - - if calls < tt.minCalls { - t.Errorf("expected at least %d calls, got %d", tt.minCalls, calls) - } - }) - } -} - -func TestDetectCallSitesFromELF_InvalidReader(t *testing.T) { - r := bytes.NewReader([]byte{0x00, 0x01, 0x02, 0x03}) - _, err := resurgo.DetectCallSitesFromELF(r) - if err == nil { - t.Fatal("expected error for invalid ELF data, got nil") - } -} - -func TestDetectFunctions(t *testing.T) { - // Test combined prologue + call site detection - // Create code with: - // 1. A function with prologue that is also called - // 2. A function with prologue but not called - // 3. A called address without prologue - - // AMD64 code: - // 0x00: push rbp (0x55) - // 0x01: mov rbp, rsp (0x48 0x89 0xe5) - // 0x04: call 0x20 (0xE8 0x17 0x00 0x00 0x00) - calls function at 0x20 - // 0x09: ret (0xC3) - // 0x0A: padding - // ... - // 0x20: push rbp (0x55) - function with prologue, called from 0x04 - // 0x21: mov rbp, rsp (0x48 0x89 0xe5) - // 0x24: ret (0xC3) - // 0x25: padding - // ... - // 0x30: push rbx (0x53) - function with prologue, not called - // 0x31: ret (0xC3) - // 0x32: padding - // ... - // 0x40: ret (0xC3) - called target without prologue - - code := make([]byte, 0x50) - // Function at 0x00 with prologue - code[0x00] = 0x55 // push rbp - code[0x01] = 0x48 // mov rbp, rsp - code[0x02] = 0x89 - code[0x03] = 0xe5 - // Call to 0x20 (rel32 = 0x20 - (0x04 + 5) = 0x17) - code[0x04] = 0xE8 // call - code[0x05] = 0x17 - code[0x06] = 0x00 - code[0x07] = 0x00 - code[0x08] = 0x00 - code[0x09] = 0xC3 // ret - - // Function at 0x20 with prologue, called - code[0x20] = 0x55 // push rbp - code[0x21] = 0x48 // mov rbp, rsp - code[0x22] = 0x89 - code[0x23] = 0xe5 - code[0x24] = 0xC3 // ret - - // Add RET before function at 0x30 to establish function boundary - code[0x2F] = 0xC3 // ret - - // Function at 0x30 with prologue, not called - code[0x30] = 0x53 // push rbx (callee-saved) - code[0x31] = 0xC3 // ret - - // No code at 0x40 (would be called target without prologue in a real scenario) - - candidates, err := resurgo.DetectFunctions(code, 0, resurgo.ArchAMD64) - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - - // Should find at least 3 candidates: - // - 0x00: prologue-only (main function, might not be called internally) - // - 0x20: both (prologue + called) - // - 0x30: prologue-only (not called) - - if len(candidates) < 3 { - t.Fatalf("expected at least 3 candidates, got %d: %+v", len(candidates), candidates) - } - - // Find candidate at 0x20 (should have both prologue and call target) - var candidate0x20 *resurgo.FunctionCandidate - for i := range candidates { - if candidates[i].Address == 0x20 { - candidate0x20 = &candidates[i] - break - } - } - - if candidate0x20 == nil { - t.Fatal("expected candidate at address 0x20, got none") - } - - if candidate0x20.DetectionType != resurgo.DetectionPrologueCallSite { - t.Errorf("expected detection type 'both', got %s", candidate0x20.DetectionType) - } - - if candidate0x20.Confidence != resurgo.ConfidenceHigh { - t.Errorf("expected high confidence, got %s", candidate0x20.Confidence) - } - - if len(candidate0x20.CalledFrom) == 0 { - t.Error("expected at least one caller, got none") - } - - t.Logf("Found %d function candidates", len(candidates)) - for _, c := range candidates { - t.Logf(" 0x%x: %s (confidence: %s, called from: %d, jumped from: %d)", - c.Address, c.DetectionType, c.Confidence, - len(c.CalledFrom), len(c.JumpedFrom)) - } -} - func TestDetectCallSitesAMD64_ENDBR(t *testing.T) { // ENDBR64 (f3 0f 1e fa) followed by a call should detect the call, // skipping the ENDBR64 instruction transparently. @@ -582,24 +411,11 @@ func TestDetectCallSites_EmptyInput(t *testing.T) { } } -func TestDetectFunctions_UnsupportedArch(t *testing.T) { - _, err := resurgo.DetectFunctions([]byte{0x00}, 0, resurgo.Arch("mips")) - if err == nil { - t.Fatal("expected error for unsupported architecture, got nil") - } -} - -func TestDetectFunctions_JumpTarget(t *testing.T) { - // Verify that unconditional JMPs create jump-target candidates - // (fix #2: medium-confidence edges now pass the filter). - // - // AMD64 code: - // 0x00: jmp 0x10 (E9 0B 00 00 00) - jump to 0x10 - // 0x05: nop padding... - // 0x10: ret (C3) - jump target without prologue +func TestDetectCallSites_JumpTarget(t *testing.T) { + // Verify that unconditional JMPs create jump-target candidates. + // jmp 0x10: E9 0B 00 00 00 at 0x00, target = 0x10 code := make([]byte, 0x20) - // jmp to 0x10: rel32 = 0x10 - (0x00 + 5) = 0x0B code[0x00] = 0xE9 code[0x01] = 0x0B code[0x02] = 0x00 @@ -607,30 +423,20 @@ func TestDetectFunctions_JumpTarget(t *testing.T) { code[0x04] = 0x00 code[0x10] = 0xC3 // ret - candidates, err := resurgo.DetectFunctions(code, 0, resurgo.ArchAMD64) + edges, err := resurgo.DetectCallSites(code, 0, resurgo.ArchAMD64) if err != nil { t.Fatalf("unexpected error: %v", err) } - // Look for a jump-target candidate at 0x10 - var found *resurgo.FunctionCandidate - for i := range candidates { - if candidates[i].Address == 0x10 { - found = &candidates[i] + var found *resurgo.CallSiteEdge + for i := range edges { + if edges[i].TargetAddr == 0x10 { + found = &edges[i] break } } - if found == nil { - t.Fatal("expected candidate at address 0x10, got none") - } - - if found.DetectionType != resurgo.DetectionJumpTarget { - t.Errorf("expected detection type 'jump-target', got %s", found.DetectionType) - } - - if len(found.JumpedFrom) == 0 { - t.Error("expected at least one JumpedFrom entry, got none") + t.Fatal("expected jump edge to 0x10, got none") } } @@ -644,9 +450,9 @@ func TestDetectFunctionsFromELF(t *testing.T) { t.Fatalf("failed to compile demo-app: %v\n%s", err, out) } - f, err := os.Open(binPath) + f, err := elf.Open(binPath) if err != nil { - t.Fatalf("failed to open compiled binary: %v", err) + t.Fatalf("failed to open ELF binary: %v", err) } defer f.Close() @@ -659,24 +465,75 @@ func TestDetectFunctionsFromELF(t *testing.T) { t.Fatal("expected at least one function candidate, got none") } - // Count by detection type counts := make(map[resurgo.DetectionType]int) for _, c := range candidates { counts[c.DetectionType]++ } t.Logf("total candidates: %d, by type: %v", len(candidates), counts) - // Should have both prologue-only and call-target candidates at minimum if counts[resurgo.DetectionPrologueOnly] == 0 { t.Error("expected at least one prologue-only candidate") } } -func TestDetectFunctionsFromELF_InvalidReader(t *testing.T) { +// TestDisasmDetector verifies that DisasmDetector, when run against a real ELF +// binary, produces candidates with the expected detection types and that +// functions both called and matching a prologue pattern are promoted to +// DetectionPrologueCallSite with ConfidenceHigh. +func TestDisasmDetector(t *testing.T) { + binPath := filepath.Join(t.TempDir(), "demo-app") + cmd := exec.Command("go", "build", "-o", binPath, "testdata/demo-app.go") + cmd.Env = append(os.Environ(), "CGO_ENABLED=0", "GOARCH=amd64") + if out, err := cmd.CombinedOutput(); err != nil { + t.Fatalf("failed to compile demo-app: %v\n%s", err, out) + } + + f, err := elf.Open(binPath) + if err != nil { + t.Fatalf("failed to open ELF binary: %v", err) + } + defer f.Close() + + candidates, err := resurgo.DisasmDetector(f) + if err != nil { + t.Fatalf("DisasmDetector: %v", err) + } + + if len(candidates) == 0 { + t.Fatal("expected at least one candidate, got none") + } + + counts := make(map[resurgo.DetectionType]int) + for _, c := range candidates { + counts[c.DetectionType]++ + } + t.Logf("total candidates: %d, by type: %v", len(candidates), counts) + + // Disasm must find functions via prologue pattern. + if counts[resurgo.DetectionPrologueOnly] == 0 && counts[resurgo.DetectionPrologueCallSite] == 0 { + t.Error("expected prologue-based candidates, got none") + } + + // Functions confirmed by both prologue and call-site must be ConfidenceHigh + // and must carry at least one caller address. + for _, c := range candidates { + if c.DetectionType == resurgo.DetectionPrologueCallSite { + if c.Confidence != resurgo.ConfidenceHigh { + t.Errorf("0x%x: expected ConfidenceHigh for prologue-callsite, got %s", c.Address, c.Confidence) + } + if len(c.CalledFrom) == 0 && len(c.JumpedFrom) == 0 { + t.Errorf("0x%x: prologue-callsite candidate has no caller or jump source", c.Address) + } + } + } +} + +func TestDetectFunctionsFromELF_InvalidELF(t *testing.T) { r := bytes.NewReader([]byte{0x00, 0x01, 0x02, 0x03}) - _, err := resurgo.DetectFunctionsFromELF(r) + f, err := elf.NewFile(r) if err == nil { - t.Fatal("expected error for invalid ELF data, got nil") + f.Close() + t.Fatal("expected elf.NewFile to fail on invalid data") } } @@ -709,3 +566,65 @@ func TestDetectCallSitesARM64_BConditional(t *testing.T) { t.Errorf("expected low confidence for conditional branch, got %s", edge.Confidence) } } + +func TestDetectCallSites_Go(t *testing.T) { + tests := []struct { + name string + goarch string + minCalls int + }{ + {name: "amd64", goarch: "amd64", minCalls: 1}, + {name: "arm64", goarch: "arm64", minCalls: 1}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + binPath := filepath.Join(t.TempDir(), "demo-app") + cmd := exec.Command("go", "build", "-o", binPath, "testdata/demo-app.go") + cmd.Env = append(os.Environ(), "CGO_ENABLED=0", "GOARCH="+tt.goarch) + if out, err := cmd.CombinedOutput(); err != nil { + t.Fatalf("failed to compile demo-app: %v\n%s", err, out) + } + + f, err := elf.Open(binPath) + if err != nil { + t.Fatalf("failed to open ELF: %v", err) + } + defer f.Close() + + textSec := f.Section(".text") + if textSec == nil { + t.Fatal("no .text section") + } + code, err := textSec.Data() + if err != nil { + t.Fatalf("failed to read .text: %v", err) + } + + arch := resurgo.ArchAMD64 + if tt.goarch == "arm64" { + arch = resurgo.ArchARM64 + } + + edges, err := resurgo.DetectCallSites(code, textSec.Addr, arch) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if len(edges) == 0 { + t.Fatal("expected at least one call site edge, got none") + } + + calls := 0 + for _, e := range edges { + if e.Type == resurgo.CallSiteCall { + calls++ + } + } + t.Logf("total edges: %d (calls: %d)", len(edges), calls) + + if calls < tt.minCalls { + t.Errorf("expected at least %d calls, got %d", tt.minCalls, calls) + } + }) + } +} diff --git a/convergence_test.go b/convergence_test.go index 355c9a5..9bc7601 100644 --- a/convergence_test.go +++ b/convergence_test.go @@ -31,56 +31,78 @@ func arm64BranchInsn(opBase uint32, source, target uint64) uint32 { return opBase | imm26 } -// assertConvergence calls DetectFunctions, logs all candidates, and asserts -// minimum convergence between prologue and call-site detection. +// assertConvergence checks convergence between prologue and call-site detection +// by running both independently and counting addresses found by both signals. // minTotal is the minimum number of candidates expected, minBoth the minimum -// number of "prologue-callsite" candidates, and minRatio the minimum convergence ratio -// (both / total). +// number of addresses confirmed by both signals, and minRatio the minimum +// convergence ratio (both / total). func assertConvergence(t *testing.T, code []byte, baseAddr uint64, arch resurgo.Arch, minTotal, minBoth int, minRatio float64) { t.Helper() - candidates, err := resurgo.DetectFunctions(code, baseAddr, arch) + prologues, err := resurgo.DetectPrologues(code, baseAddr, arch) if err != nil { - t.Fatalf("DetectFunctions: %v", err) + t.Fatalf("DetectPrologues: %v", err) + } + edges, err := resurgo.DetectCallSites(code, baseAddr, arch) + if err != nil { + t.Fatalf("DetectCallSites: %v", err) } - counts := make(map[resurgo.DetectionType]int) - for _, c := range candidates { - counts[c.DetectionType]++ - t.Logf(" 0x%x: %-15s (prologue: %s, calls: %d, jumps: %d)", - c.Address, c.DetectionType, c.PrologueType, - len(c.CalledFrom), len(c.JumpedFrom)) + prologueSet := make(map[uint64]resurgo.PrologueType, len(prologues)) + for _, p := range prologues { + prologueSet[p.Address] = p.Type + } + callSet := make(map[uint64]struct{}, len(edges)) + for _, e := range edges { + callSet[e.TargetAddr] = struct{}{} } - total := len(candidates) - both := counts[resurgo.DetectionPrologueCallSite] - ratio := float64(both) / float64(total) + allAddrs := make(map[uint64]struct{}) + for _, p := range prologues { + allAddrs[p.Address] = struct{}{} + } + for _, e := range edges { + allAddrs[e.TargetAddr] = struct{}{} + } - t.Logf("total=%d both=%d prologue-only=%d call-target=%d jump-target=%d ratio=%.3f", - total, both, - counts[resurgo.DetectionPrologueOnly], - counts[resurgo.DetectionCallTarget], - counts[resurgo.DetectionJumpTarget], - ratio) + var bothCount, prologueOnly, callTarget int + for addr := range allAddrs { + _, hasPrologue := prologueSet[addr] + _, hasCall := callSet[addr] + switch { + case hasPrologue && hasCall: + bothCount++ + t.Logf(" 0x%x: %-15s (prologue: %s)", addr, resurgo.DetectionPrologueCallSite, prologueSet[addr]) + case hasPrologue: + prologueOnly++ + t.Logf(" 0x%x: %-15s (prologue: %s)", addr, resurgo.DetectionPrologueOnly, prologueSet[addr]) + case hasCall: + callTarget++ + t.Logf(" 0x%x: %-15s", addr, resurgo.DetectionCallTarget) + } + } + + total := len(allAddrs) + ratio := float64(bothCount) / float64(total) + + t.Logf("total=%d both=%d prologue-only=%d call-target=%d ratio=%.3f", + total, bothCount, prologueOnly, callTarget, ratio) if total < minTotal { t.Errorf("expected >= %d candidates, got %d", minTotal, total) } - if both < minBoth { - t.Errorf("expected >= %d 'both' candidates, got %d", minBoth, both) + if bothCount < minBoth { + t.Errorf("expected >= %d 'both' candidates, got %d", minBoth, bothCount) } if ratio < minRatio { t.Errorf("convergence ratio %.3f < %.3f", ratio, minRatio) } - if counts[resurgo.DetectionPrologueOnly] < 1 { + if prologueOnly < 1 { t.Error("expected at least one prologue-only candidate") } - if counts[resurgo.DetectionCallTarget] < 1 { + if callTarget < 1 { t.Error("expected at least one call-target candidate") } - if counts[resurgo.DetectionJumpTarget] < 1 { - t.Error("expected at least one jump-target candidate") - } } // buildSyntheticAMD64 builds a synthetic AMD64 .text section with 12 functions @@ -295,7 +317,7 @@ func buildSyntheticARM64() (code []byte, baseAddr uint64) { return code, base } -func TestDetectFunctions_Convergence(t *testing.T) { +func TestDetectFunctionsFromELF_Convergence(t *testing.T) { // Call graph (both architectures): // main → funcA, funcB, funcC (calls) // funcA → funcD, funcE, funcI (calls) diff --git a/detector.go b/detector.go index 73785c8..bac39fd 100644 --- a/detector.go +++ b/detector.go @@ -62,30 +62,96 @@ type FunctionCandidate struct { Confidence Confidence `json:"confidence"` } -// isENDBR reports whether the 4 bytes at code[i:i+4] encode an ENDBR64 -// (F3 0F 1E FA) or ENDBR32 (F3 0F 1E FB) instruction. -// golang.org/x/arch/x86/x86asm does not recognise these CET instructions, -// so callers must skip them explicitly before invoking the decoder. -func isENDBR(code []byte, i int) bool { - return i+4 <= len(code) && - code[i] == endbr64Byte0 && - code[i+1] == endbr64Byte1 && - code[i+2] == endbr64Byte2 && - (code[i+3] == endbr64Byte3 || code[i+3] == endbr32Byte3) +// CandidateDetector reads an ELF file and emits function candidates. +// Detectors run before filters; their results are merged with those of other +// detectors (deduplicated by address) before the filter pipeline is applied. +type CandidateDetector func(*elf.File) ([]FunctionCandidate, error) + +// Option configures the behaviour of DetectFunctionsFromELF. +type Option func(*options) + +type options struct { + detectors []CandidateDetector + filters []CandidateFilter +} + +// WithDetectors replaces the default detector pipeline with the provided +// detectors. They run in the order provided and their results are merged +// before filtering. Pass no arguments to disable all detectors. +func WithDetectors(detectors ...CandidateDetector) Option { + return func(o *options) { + o.detectors = detectors + } +} + +// DetectFunctionsFromELF returns detected function candidates from f by running all +// detectors then all filters in order. +// +// By default the detector pipeline is [DisasmDetector, EhFrameDetector] and +// the filter pipeline is [CETFilter, EhFrameFilter, PLTFilter]. +// opts may include WithDetectors or WithFilters to replace either pipeline. +func DetectFunctionsFromELF(f *elf.File, opts ...Option) ([]FunctionCandidate, error) { + o := &options{ + detectors: []CandidateDetector{DisasmDetector, EhFrameDetector}, + filters: []CandidateFilter{CETFilter, EhFrameFilter, PLTFilter}, + } + for _, opt := range opts { + opt(o) + } + + var candidates []FunctionCandidate + for _, detect := range o.detectors { + candidate, err := detect(f) + if err != nil { + return nil, err + } + candidates = mergeCandidates(candidates, candidate) + } + + var err error + for _, filter := range o.filters { + candidates, err = filter(candidates, f) + if err != nil { + return nil, err + } + } + + return candidates, nil } -// DetectFunctions combines prologue detection, call site analysis, and -// alignment-based boundary detection to identify function entry points. -// Functions detected by multiple methods receive higher confidence ratings. -func DetectFunctions(code []byte, baseAddr uint64, arch Arch) ([]FunctionCandidate, error) { +// DisasmDetector is a CandidateDetector that runs the disassembly-based +// pipeline (prologue matching, call-site analysis, alignment-based boundary +// detection) against the .text section of f. +// The architecture is inferred from the ELF header. +func DisasmDetector(f *elf.File) ([]FunctionCandidate, error) { + textSec := f.Section(".text") + if textSec == nil { + return nil, fmt.Errorf("no .text section found") + } + + code, err := textSec.Data() + if err != nil && err != io.EOF { + return nil, fmt.Errorf("failed to read .text section: %w", err) + } + + var arch Arch + switch f.Machine { + case elf.EM_X86_64: + arch = ArchAMD64 + case elf.EM_AARCH64: + arch = ArchARM64 + default: + return nil, fmt.Errorf("unsupported ELF machine: %s", f.Machine) + } + // Detect prologues - prologues, err := DetectPrologues(code, baseAddr, arch) + prologues, err := DetectPrologues(code, textSec.Addr, arch) if err != nil { return nil, fmt.Errorf("failed to detect prologues: %w", err) } // Detect call sites - edges, err := DetectCallSites(code, baseAddr, arch) + edges, err := DetectCallSites(code, textSec.Addr, arch) if err != nil { return nil, fmt.Errorf("failed to detect call sites: %w", err) } @@ -109,9 +175,7 @@ func DetectFunctions(code []byte, baseAddr uint64, arch Arch) ([]FunctionCandida if edge.Confidence != ConfidenceHigh && edge.Confidence != ConfidenceMedium { continue } - - candidate, exists := candidates[edge.TargetAddr] - if exists { + if candidate, exists := candidates[edge.TargetAddr]; exists { // Address has both prologue and is called/jumped to - highest confidence candidate.DetectionType = DetectionPrologueCallSite candidate.Confidence = ConfidenceHigh @@ -126,7 +190,6 @@ func DetectFunctions(code []byte, baseAddr uint64, arch Arch) ([]FunctionCandida if edge.Type == CallSiteJump { detType = DetectionJumpTarget } - calledFrom := []uint64{} jumpedFrom := []uint64{} if edge.Type == CallSiteCall { @@ -134,7 +197,6 @@ func DetectFunctions(code []byte, baseAddr uint64, arch Arch) ([]FunctionCandida } else { jumpedFrom = []uint64{edge.SourceAddr} } - candidates[edge.TargetAddr] = &FunctionCandidate{ Address: edge.TargetAddr, DetectionType: detType, @@ -149,15 +211,15 @@ func DetectFunctions(code []byte, baseAddr uint64, arch Arch) ([]FunctionCandida // no call-site signal (e.g. pure-leaf functions with external linkage // that were never called due to inlining or compile-time evaluation). // - // These receive ConfidenceLow because the pattern (ret + NOP padding → + // These receive ConfidenceLow because the pattern (ret + NOP padding -> // 16-byte aligned address) is reliable for function separators but can // also match intra-function alignment at loop heads. var alignedEntries []uint64 switch arch { case ArchAMD64: - alignedEntries = detectAlignedEntriesAMD64(code, baseAddr) + alignedEntries = detectAlignedEntriesAMD64(code, textSec.Addr) case ArchARM64: - alignedEntries = detectAlignedEntriesARM64(code, baseAddr) + alignedEntries = detectAlignedEntriesARM64(code, textSec.Addr) } for _, addr := range alignedEntries { if _, exists := candidates[addr]; !exists { @@ -176,74 +238,12 @@ func DetectFunctions(code []byte, baseAddr uint64, arch Arch) ([]FunctionCandida for _, candidate := range candidates { result = append(result, *candidate) } - slices.SortFunc(result, func(a, b FunctionCandidate) int { return cmp.Compare(a.Address, b.Address) }) - return result, nil } -// DetectFunctionsFromELF parses an ELF binary from the given reader, extracts -// the .text section, and returns detected function candidates using combined -// prologue detection, call site analysis, and alignment-based boundary -// detection, followed by FP filters (PLT section ranges, intra-function jump -// targets). When .eh_frame is present, FDE entries are used as a whitelist to -// discard disassembly candidates that are not confirmed by the compiler, and -// any function entries visible only in .eh_frame are added to the result. -// The architecture is inferred from the ELF header. -// -// By default the full filter pipeline (PLTFilter, CETFilter, EhFrameFilter) -// is applied. opts may include WithFilters to replace the default pipeline. -func DetectFunctionsFromELF(r io.ReaderAt, opts ...Option) ([]FunctionCandidate, error) { - o := &options{ - filters: []CandidateFilter{PLTFilter, CETFilter, EhFrameFilter}, - } - for _, opt := range opts { - opt(o) - } - - f, err := elf.NewFile(r) - if err != nil { - return nil, fmt.Errorf("failed to parse ELF file: %w", err) - } - defer f.Close() - - textSec := f.Section(".text") - if textSec == nil { - return nil, fmt.Errorf("no .text section found") - } - - code, err := textSec.Data() - if err != nil && err != io.EOF { - return nil, fmt.Errorf("failed to read .text section: %w", err) - } - - var arch Arch - switch f.Machine { - case elf.EM_X86_64: - arch = ArchAMD64 - case elf.EM_AARCH64: - arch = ArchARM64 - default: - return nil, fmt.Errorf("unsupported ELF machine: %s", f.Machine) - } - - candidates, err := DetectFunctions(code, textSec.Addr, arch) - if err != nil { - return nil, err - } - - for _, filter := range o.filters { - candidates, err = filter(candidates, f) - if err != nil { - return nil, err - } - } - - return candidates, nil -} - // DetectPrologues analyzes raw machine code bytes and returns detected function // prologues. baseAddr is the virtual address corresponding to the start of code. // arch selects the architecture-specific detection logic. @@ -259,6 +259,37 @@ func DetectPrologues(code []byte, baseAddr uint64, arch Arch) ([]Prologue, error } } +// mergeCandidates merges two candidate slices, deduplicating by address. +// When the same address appears in both, the entry from a takes precedence. +func mergeCandidates(a, b []FunctionCandidate) []FunctionCandidate { + seen := make(map[uint64]struct{}, len(a)) + for _, candidate := range a { + seen[candidate.Address] = struct{}{} + } + merged := append([]FunctionCandidate(nil), a...) + for _, candidate := range b { + if _, ok := seen[candidate.Address]; !ok { + merged = append(merged, candidate) + } + } + slices.SortFunc(merged, func(a, b FunctionCandidate) int { + return cmp.Compare(a.Address, b.Address) + }) + return merged +} + +// isENDBR reports whether the 4 bytes at code[i:i+4] encode an ENDBR64 +// (F3 0F 1E FA) or ENDBR32 (F3 0F 1E FB) instruction. +// golang.org/x/arch/x86/x86asm does not recognise these CET instructions, +// so callers must skip them explicitly before invoking the decoder. +func isENDBR(code []byte, i int) bool { + return i+4 <= len(code) && + code[i] == endbr64Byte0 && + code[i+1] == endbr64Byte1 && + code[i+2] == endbr64Byte2 && + (code[i+3] == endbr64Byte3 || code[i+3] == endbr32Byte3) +} + func detectProloguesAMD64(code []byte, baseAddr uint64) ([]Prologue, error) { var result []Prologue @@ -440,33 +471,3 @@ func detectProloguesARM64(code []byte, baseAddr uint64) ([]Prologue, error) { return result, nil } - -// DetectProloguesFromELF parses an ELF binary from the given reader, extracts -// the .text section, and returns detected function prologues. -// The architecture is inferred from the ELF header. -func DetectProloguesFromELF(r io.ReaderAt) ([]Prologue, error) { - f, err := elf.NewFile(r) - if err != nil { - return nil, fmt.Errorf("failed to parse ELF file: %w", err) - } - defer f.Close() - - textSec := f.Section(".text") - if textSec == nil { - return nil, fmt.Errorf("no .text section found") - } - - code, err := textSec.Data() - if err != nil && err != io.EOF { - return nil, fmt.Errorf("failed to read .text section: %w", err) - } - - switch f.Machine { - case elf.EM_X86_64: - return detectProloguesAMD64(code, textSec.Addr) - case elf.EM_AARCH64: - return detectProloguesARM64(code, textSec.Addr) - default: - return nil, fmt.Errorf("unsupported ELF machine: %s", f.Machine) - } -} diff --git a/detector_test.go b/detector_test.go index d89f27f..d18c285 100644 --- a/detector_test.go +++ b/detector_test.go @@ -1,7 +1,7 @@ package resurgo_test import ( - "bytes" + "debug/elf" "encoding/binary" "os" "os/exec" @@ -215,7 +215,16 @@ func TestDetectPrologues_UnsupportedArch(t *testing.T) { } } -func TestDetectProloguesFromELF_Go(t *testing.T) { +// arm64Insn encodes ARM64 instructions as little-endian bytes. +func arm64Insn(insns ...uint32) []byte { + buf := make([]byte, 4*len(insns)) + for i, insn := range insns { + binary.LittleEndian.PutUint32(buf[i*4:], insn) + } + return buf +} + +func TestDetectPrologues_Go(t *testing.T) { tests := []struct { name string goarch string @@ -269,13 +278,27 @@ func TestDetectProloguesFromELF_Go(t *testing.T) { t.Fatalf("failed to compile demo-app: %v\n%s", err, out) } - f, err := os.Open(binPath) + f, err := elf.Open(binPath) if err != nil { - t.Fatalf("failed to open compiled binary: %v", err) + t.Fatalf("failed to open ELF: %v", err) } defer f.Close() - prologues, err := resurgo.DetectProloguesFromELF(f) + textSec := f.Section(".text") + if textSec == nil { + t.Fatal("no .text section") + } + code, err := textSec.Data() + if err != nil { + t.Fatalf("failed to read .text: %v", err) + } + + arch := resurgo.ArchAMD64 + if tt.goarch == "arm64" { + arch = resurgo.ArchARM64 + } + + prologues, err := resurgo.DetectPrologues(code, textSec.Addr, arch) if err != nil { t.Fatalf("unexpected error: %v", err) } @@ -298,7 +321,7 @@ func TestDetectProloguesFromELF_Go(t *testing.T) { } } -func TestDetectProloguesFromELF_C(t *testing.T) { +func TestDetectPrologues_C(t *testing.T) { const cSource = "testdata/demo-app.c" tests := []struct { @@ -344,29 +367,12 @@ func TestDetectProloguesFromELF_C(t *testing.T) { if tt.name == "amd64/gcc/optimized" { minCounts = gccOptimizedExpectations(t) } - prologues := compileAndDetect(t, tt.compiler, tt.args, cSource) + prologues := compileAndDetectPrologues(t, tt.compiler, tt.args, cSource) assertPrologues(t, prologues, minCounts) }) } } -func TestDetectProloguesFromELF_InvalidReader(t *testing.T) { - r := bytes.NewReader([]byte{0x00, 0x01, 0x02, 0x03}) - _, err := resurgo.DetectProloguesFromELF(r) - if err == nil { - t.Fatal("expected error for invalid ELF data, got nil") - } -} - -// arm64Insn encodes ARM64 instructions as little-endian bytes. -func arm64Insn(insns ...uint32) []byte { - buf := make([]byte, 4*len(insns)) - for i, insn := range insns { - binary.LittleEndian.PutUint32(buf[i*4:], insn) - } - return buf -} - // gccMajorVersion returns the major version of the GCC compiler at the given // path, or 0 if it cannot be determined. func gccMajorVersion(compiler string) int { @@ -384,24 +390,10 @@ func gccMajorVersion(compiler string) int { // gccOptimizedExpectations returns the expected prologue types for GCC -O2 // output based on the installed GCC version. -// -// GCC version determines which prologue patterns are generated when frame -// pointers are omitted (-O2 default): -// -// - GCC >= 15: emits push rbp (callee-saved) followed by interleaved movs -// before sub rsp; the push rbp at the function boundary is detected as -// PushOnly. -// - GCC 13-14: emits endbr64 (CET) followed immediately by push rbx; -// sub rsp, which is detected as NoFramePointer after the ENDBR skip -// and relaxed boundary check. func gccOptimizedExpectations(t *testing.T) map[resurgo.PrologueType]int { t.Helper() v := gccMajorVersion("gcc") switch { - case v >= 15: - return map[resurgo.PrologueType]int{ - resurgo.ProloguePushOnly: 1, - } case v >= 13: return map[resurgo.PrologueType]int{ resurgo.ProloguePushOnly: 1, @@ -412,9 +404,9 @@ func gccOptimizedExpectations(t *testing.T) map[resurgo.PrologueType]int { } } -// compileAndDetect compiles cSource with the given compiler and flags, runs -// prologue detection on the result, and returns the detected prologues. -func compileAndDetect(t *testing.T, compiler string, args []string, cSource string) []resurgo.Prologue { +// compileAndDetectPrologues compiles cSource with the given compiler and flags, +// extracts the .text section, and returns prologues detected on the raw bytes. +func compileAndDetectPrologues(t *testing.T, compiler string, args []string, cSource string) []resurgo.Prologue { t.Helper() if _, err := exec.LookPath(compiler); err != nil { t.Skipf("%s not found, skipping", compiler) @@ -428,13 +420,27 @@ func compileAndDetect(t *testing.T, compiler string, args []string, cSource stri t.Fatalf("failed to compile %s: %v\n%s", cSource, err, out) } - f, err := os.Open(outPath) + f, err := elf.Open(outPath) if err != nil { - t.Fatalf("failed to open compiled binary: %v", err) + t.Fatalf("failed to open ELF: %v", err) } defer f.Close() - prologues, err := resurgo.DetectProloguesFromELF(f) + textSec := f.Section(".text") + if textSec == nil { + t.Fatal("no .text section") + } + code, err := textSec.Data() + if err != nil { + t.Fatalf("failed to read .text: %v", err) + } + + arch := resurgo.ArchAMD64 + if f.Machine == elf.EM_AARCH64 { + arch = resurgo.ArchARM64 + } + + prologues, err := resurgo.DetectPrologues(code, textSec.Addr, arch) if err != nil { t.Fatalf("unexpected error: %v", err) } diff --git a/doc.go b/doc.go index c5c04bf..05d940b 100644 --- a/doc.go +++ b/doc.go @@ -9,19 +9,14 @@ // - DWARF CFI-based: when the binary contains an .eh_frame section, the // initial_location fields from its FDE records are used as an authoritative // whitelist. These addresses were written by the compiler and survive -// making CFI the highest-confidence source available on stripped binaries. +// stripping, making CFI the highest-confidence source available. // -// The primary entry point for most callers is [DetectFunctionsFromELF], which -// accepts an [io.ReaderAt] (e.g. *os.File), infers the target architecture from -// the ELF header, and returns a deduplicated, filtered slice of -// [FunctionCandidate] values. Each candidate carries its virtual address, -// detection type, and a confidence rating. +// The primary entry point is [DetectFunctionsFromELF], which accepts a parsed +// [*elf.File], runs all detectors and filters, and returns a deduplicated, +// filtered slice of [FunctionCandidate] values. // -// For format-agnostic use (non-ELF binaries, raw memory dumps) use -// [DetectFunctions], which accepts raw machine code bytes and a base address. -// -// Lower-level APIs ([DetectPrologues], [DetectCallSites] and their FromELF -// variants) are available when only a single signal is needed. +// For format-agnostic use (non-ELF binaries, raw memory dumps) the lower-level +// [DetectPrologues] and [DetectCallSites] APIs accept raw machine code bytes. // // Supported architectures: x86_64 (AMD64) and ARM64 (AArch64). package resurgo diff --git a/docs/CALLSITES.md b/docs/CALLSITES.md index 83f88aa..46c62f7 100644 --- a/docs/CALLSITES.md +++ b/docs/CALLSITES.md @@ -127,7 +127,7 @@ Confidence indicates the likelihood that a detected edge points to a function en ### Confidence Escalation -When combined with prologue detection using `DetectFunctions()`: +When combined with prologue detection via `DetectFunctionsFromELF()`: - Prologue + called -> **High confidence** - Prologue only -> **Medium confidence** - Called only -> **Medium confidence** @@ -148,17 +148,26 @@ When combined with prologue detection using `DetectFunctions()`: ### 1. Use Combined Analysis -For best results, use `DetectFunctions()` which merges both signals: +For best results, use `DetectFunctionsFromELF()` which runs all detectors and filters: ```go -candidates, err := resurgo.DetectFunctions(code, baseAddr, arch) +f, err := elf.Open("./myapp") +// ... +candidates, err := resurgo.DetectFunctionsFromELF(f) ``` This provides: -- **Highest confidence** for functions detected by both methods +- **Highest confidence** for functions detected by both disassembly and CFI - **Broader coverage** than either method alone - **Source tracking** (which addresses call each function) +For raw bytes, combine the primitives manually: + +```go +prologues, _ := resurgo.DetectPrologues(code, baseAddr, arch) +edges, _ := resurgo.DetectCallSites(code, baseAddr, arch) +``` + ### 2. Filter by Confidence Focus on high-confidence edges for function identification: @@ -260,7 +269,8 @@ for _, e := range edges { ### Building a Call Graph ```go -candidates, _ := resurgo.DetectFunctions(code, baseAddr, arch) +f, _ := elf.Open("./myapp") +candidates, _ := resurgo.DetectFunctionsFromELF(f) // Create adjacency list callGraph := make(map[uint64][]uint64) @@ -274,7 +284,8 @@ for _, c := range candidates { ### Identifying Entry Points ```go -candidates, _ := resurgo.DetectFunctions(code, baseAddr, arch) +f, _ := elf.Open("./myapp") +candidates, _ := resurgo.DetectFunctionsFromELF(f) // Functions never called (potential entry points) for _, c := range candidates { diff --git a/docs/CFI.md b/docs/CFI.md index 1f11f9d..7fa566b 100644 --- a/docs/CFI.md +++ b/docs/CFI.md @@ -180,19 +180,19 @@ Walking algorithm: ## Integration in `DetectFunctionsFromELF` -`DetectFunctionsFromELF` runs an ordered list of `CandidateFilter` functions -after the disassembly pipeline. The CFI filter is one entry in that list. -When called it: +CFI data is handled by two independent pipeline components: -1. Calls `parseEhFrameEntries(f)` to get all FDE entry addresses. -2. If the result is empty (`.eh_frame` absent): returns candidates unchanged. -3. Drops disassembly candidates not confirmed by any FDE (FP elimination). -4. Appends pure CFI hits: functions visible only in `.eh_frame`, not found - by any disassembly heuristic, emitted as `DetectionCFI` with high - confidence. -5. Re-sorts the result by address. +**`EhFrameDetector`** (detector phase) calls `parseEhFrameEntries(f)` and +emits one `FunctionCandidate` per FDE address with `DetectionCFI` and +`ConfidenceHigh`. If `.eh_frame` is absent it returns an empty slice; the +pipeline falls back to disassembly-only results. -PLT filtering still runs before this step. +**`EhFrameFilter`** (filter phase) retains only candidates whose address +appears in the FDE set, upgrading their confidence to `ConfidenceHigh`. It is +a pure filter - it only removes candidates, never adds them. + +The two components are independent: callers can use `EhFrameDetector` alone +via `WithDetectors`, or `EhFrameFilter` alone via `WithFilters`. ## Confidence and detection type @@ -208,8 +208,9 @@ promoted or merged with the richer disassembly metadata. has FDE entries if the programmer adds `.cfi_*` directives. - **Aggressive stripping:** `strip -R .eh_frame` removes the section entirely. The fallback path handles this transparently. -- **ELF-specific:** this strategy lives in `DetectFunctionsFromELF` only. - The format-agnostic `DetectFunctions` is not affected. +- **ELF-specific:** this strategy is only available via `DetectFunctionsFromELF` + (or directly through `EhFrameDetector`/`EhFrameFilter`). The raw-bytes APIs + `DetectPrologues` and `DetectCallSites` are not affected. - **Inlined regions:** a single function can produce multiple FDEs if the compiler splits it into hot/cold regions. The current parser emits one candidate per FDE; deduplication by VA handles this correctly. diff --git a/dwarf.go b/dwarf.go index 2fbbb1f..ea6f3b1 100644 --- a/dwarf.go +++ b/dwarf.go @@ -1,11 +1,9 @@ package resurgo import ( - "cmp" "debug/elf" "encoding/binary" "fmt" - "slices" ) const ( @@ -37,25 +35,36 @@ type cieInfo struct { fdeEncoding byte // DW_EH_PE_* byte from 'R' augmentation datum } -// EhFrameFilter parses .eh_frame from f and applies the FDE whitelist to cs. -// See applyEhFrame for the merge logic. -func EhFrameFilter(cs []FunctionCandidate, f *elf.File) ([]FunctionCandidate, error) { +// EhFrameDetector is a CandidateDetector that emits function candidates +// sourced from .eh_frame FDE records. Each candidate carries DetectionCFI +// and ConfidenceHigh. Returns an empty slice (no error) when .eh_frame is +// absent; the caller falls back to disassembly-only results. +func EhFrameDetector(f *elf.File) ([]FunctionCandidate, error) { fdeVAs, err := parseEhFrameEntries(f) if err != nil { return nil, fmt.Errorf("parse .eh_frame: %w", err) } - return applyEhFrame(cs, fdeVAs), nil + candidates := make([]FunctionCandidate, 0, len(fdeVAs)) + for _, va := range fdeVAs { + candidates = append(candidates, FunctionCandidate{ + Address: va, + DetectionType: DetectionCFI, + Confidence: ConfidenceHigh, + }) + } + return candidates, nil } -// applyEhFrame applies .eh_frame FDE data to the candidate slice. -// When fdeVAs is empty it returns candidates unchanged (fallback for binaries -// without .eh_frame). Otherwise it: -// - drops candidates whose address is not confirmed by any FDE -// - appends pure FDE hits (functions invisible to disassembly heuristics) -// - re-sorts the result by address -func applyEhFrame(candidates []FunctionCandidate, fdeVAs []uint64) []FunctionCandidate { +// EhFrameFilter retains only candidates whose address is confirmed by an FDE +// record in .eh_frame, upgrading their confidence to ConfidenceHigh. +// When .eh_frame is absent the slice is returned unchanged. +func EhFrameFilter(candidates []FunctionCandidate, f *elf.File) ([]FunctionCandidate, error) { + fdeVAs, err := parseEhFrameEntries(f) + if err != nil { + return nil, fmt.Errorf("parse .eh_frame: %w", err) + } if len(fdeVAs) == 0 { - return candidates + return candidates, nil } fdeSet := make(map[uint64]struct{}, len(fdeVAs)) @@ -71,27 +80,7 @@ func applyEhFrame(candidates []FunctionCandidate, fdeVAs []uint64) []FunctionCan filtered = append(filtered, c) } } - candidates = filtered - - // Append FDE-only hits not already found by disassembly. - disasmSet := make(map[uint64]struct{}, len(candidates)) - for _, c := range candidates { - disasmSet[c.Address] = struct{}{} - } - for _, va := range fdeVAs { - if _, ok := disasmSet[va]; !ok { - candidates = append(candidates, FunctionCandidate{ - Address: va, - DetectionType: DetectionCFI, - Confidence: ConfidenceHigh, - }) - } - } - - slices.SortFunc(candidates, func(a, b FunctionCandidate) int { - return cmp.Compare(a.Address, b.Address) - }) - return candidates + return filtered, nil } // parseEhFrameEntries parses the .eh_frame section of f and returns the diff --git a/e2e/e2e_test.go b/e2e/e2e_test.go index 0e1c1a0..deb5bc9 100644 --- a/e2e/e2e_test.go +++ b/e2e/e2e_test.go @@ -264,7 +264,7 @@ func measure( t.Fatalf("ground truth missing functions: %v", missing) } - f, err := os.Open(stripped) + f, err := elf.Open(stripped) if err != nil { t.Fatalf("os.Open: %v", err) } @@ -321,14 +321,14 @@ func measure( return byVA, truth, stats } -// TestDetectFunctionsFromELF_StrippedC_Unoptimized verifies that +// TestDetectFunctions_StrippedC_Unoptimized verifies that // DetectFunctionsFromELF finds all user-defined functions in a stripped C // binary compiled without optimisation. // // Source: testdata/stripped-app.c - the same 16-function realistic fixture // used by the optimized tests. At -O0 -fno-inline all 16 functions survive // as distinct symbols; 100% recall is required. -func TestDetectFunctionsFromELF_StrippedC_Unoptimized(t *testing.T) { +func TestDetectFunctions_StrippedC_Unoptimized(t *testing.T) { userFuncs := []string{ "word_count", "longest_word", "vowel_count", "char_count", "is_printable", "checksum", @@ -369,7 +369,7 @@ func TestDetectFunctionsFromELF_StrippedC_Unoptimized(t *testing.T) { logStatsTable(t, statsRow{"result", stats}) } -// TestDetectFunctionsFromELF_StrippedC_Unoptimized_ARM64 verifies that +// TestDetectFunctions_StrippedC_Unoptimized_ARM64 verifies that // DetectFunctionsFromELF finds all user-defined functions in a // cross-compiled ARM64 stripped binary compiled without optimisation. // @@ -378,7 +378,7 @@ func TestDetectFunctionsFromELF_StrippedC_Unoptimized(t *testing.T) { // boundaries without prologues, so 100% recall is expected. // // Skipped if aarch64-linux-gnu-gcc or aarch64-linux-gnu-strip are not in PATH. -func TestDetectFunctionsFromELF_StrippedC_Unoptimized_ARM64(t *testing.T) { +func TestDetectFunctions_StrippedC_Unoptimized_ARM64(t *testing.T) { userFuncs := []string{ "word_count", "longest_word", "vowel_count", "char_count", "is_printable", "checksum", @@ -419,7 +419,7 @@ func TestDetectFunctionsFromELF_StrippedC_Unoptimized_ARM64(t *testing.T) { logStatsTable(t, statsRow{"result", stats}) } -// TestDetectFunctionsFromELF_StrippedC_Optimized validates that +// TestDetectFunctions_StrippedC_Optimized validates that // DetectFunctionsFromELF correctly identifies all user functions in a // stripped C binary compiled at -O2. // @@ -427,7 +427,7 @@ func TestDetectFunctionsFromELF_StrippedC_Unoptimized_ARM64(t *testing.T) { // functions covering a range of shapes: loop-heavy leaves, multi-caller // aggregators, a nested-loop sort, and two recursive functions (fib, gcd). // gcc -O2 preserves all 16 as distinct symbols on AMD64. -func TestDetectFunctionsFromELF_StrippedC_Optimized(t *testing.T) { +func TestDetectFunctions_StrippedC_Optimized(t *testing.T) { userFuncs := []string{ "word_count", "longest_word", "vowel_count", "char_count", "is_printable", "checksum", @@ -467,7 +467,7 @@ func TestDetectFunctionsFromELF_StrippedC_Optimized(t *testing.T) { logStatsTable(t, statsRow{"result", stats}) } -// TestDetectFunctionsFromELF_RealWorld_Grep_AMD64 validates detection on a +// TestDetectFunctions_RealWorld_Grep_AMD64 validates detection on a // real-world AMD64 stripped binary: Debian grep 3.11-4 compiled with full // gcc hardening. // @@ -490,7 +490,7 @@ func TestDetectFunctionsFromELF_StrippedC_Optimized(t *testing.T) { // // Skipped if /usr/bin/grep is not stripped or grep-dbgsym is not installed. -func TestDetectFunctionsFromELF_RealWorld_Grep_AMD64(t *testing.T) { +func TestDetectFunctions_RealWorld_Grep_AMD64(t *testing.T) { const binPath = "/usr/bin/grep" if !isStripped(t, binPath) { @@ -518,7 +518,7 @@ func TestDetectFunctionsFromELF_RealWorld_Grep_AMD64(t *testing.T) { } } - f, err := os.Open(binPath) + f, err := elf.Open(binPath) if err != nil { t.Fatalf("os.Open(%s): %v", binPath, err) } @@ -640,7 +640,7 @@ func TestDetectFunctionsFromELF_RealWorld_Grep_AMD64(t *testing.T) { } } -// TestDetectFunctionsFromELF_RealWorld_Grep_ARM64 validates detection on a +// TestDetectFunctions_RealWorld_Grep_ARM64 validates detection on a // real-world ARM64 stripped binary: the same Debian grep package built for // arm64. // @@ -656,7 +656,7 @@ func TestDetectFunctionsFromELF_RealWorld_Grep_AMD64(t *testing.T) { // // Skipped if the binary or its debug file is not present (e.g. outside the // e2e Docker image or CI container). -func TestDetectFunctionsFromELF_RealWorld_Grep_ARM64(t *testing.T) { +func TestDetectFunctions_RealWorld_Grep_ARM64(t *testing.T) { const binPath = "/opt/grep-arm64/usr/bin/grep" if _, err := os.Stat(binPath); err != nil { @@ -688,7 +688,7 @@ func TestDetectFunctionsFromELF_RealWorld_Grep_ARM64(t *testing.T) { } } - f, err := os.Open(binPath) + f, err := elf.Open(binPath) if err != nil { t.Fatalf("os.Open(%s): %v", binPath, err) } @@ -809,7 +809,7 @@ func TestDetectFunctionsFromELF_RealWorld_Grep_ARM64(t *testing.T) { } } -// TestDetectFunctionsFromELF_StrippedC_Optimized_ARM64 validates detection +// TestDetectFunctions_StrippedC_Optimized_ARM64 validates detection // on a cross-compiled ARM64 optimized stripped binary. // // The test cross-compiles testdata/stripped-app.c with aarch64-linux-gnu-gcc @@ -819,7 +819,7 @@ func TestDetectFunctionsFromELF_RealWorld_Grep_ARM64(t *testing.T) { // is now expected. // // Skipped if aarch64-linux-gnu-gcc or aarch64-linux-gnu-strip are not in PATH. -func TestDetectFunctionsFromELF_StrippedC_Optimized_ARM64(t *testing.T) { +func TestDetectFunctions_StrippedC_Optimized_ARM64(t *testing.T) { userFuncs := []string{ "word_count", "longest_word", "vowel_count", "char_count", "is_printable", "checksum", diff --git a/example_test.go b/example_test.go index 79aaa7b..9104f29 100644 --- a/example_test.go +++ b/example_test.go @@ -1,9 +1,9 @@ package resurgo_test import ( + "debug/elf" "fmt" "log" - "os" "github.com/maxgio92/resurgo" ) @@ -22,23 +22,6 @@ func ExampleDetectPrologues() { // [classic] 0x1001: push rbp; mov rbp, rsp } -func ExampleDetectProloguesFromELF() { - f, err := os.Open("/usr/bin/ls") - if err != nil { - log.Fatal(err) - } - defer f.Close() - - prologues, err := resurgo.DetectProloguesFromELF(f) - if err != nil { - log.Fatal(err) - } - - for _, p := range prologues { - fmt.Printf("[%s] 0x%x: %s\n", p.Type, p.Address, p.Instructions) - } -} - func ExampleDetectCallSites() { // x86-64 machine code: call $+0x20 (E8 1B 00 00 00) // At address 0x1000, calls target at 0x1000 + 5 + 0x1B = 0x1020 @@ -55,64 +38,24 @@ func ExampleDetectCallSites() { // [call] 0x1000 -> 0x1020 (pc-relative, high) } -func ExampleDetectCallSitesFromELF() { - f, err := os.Open("/usr/bin/ls") +func ExampleDetectFunctionsFromELF() { + f, err := elf.Open("/usr/bin/ls") if err != nil { log.Fatal(err) } defer f.Close() - edges, err := resurgo.DetectCallSitesFromELF(f) - if err != nil { - log.Fatal(err) - } - - // Show first 5 high-confidence call edges - count := 0 - for _, e := range edges { - if e.Type == resurgo.CallSiteCall && e.Confidence == resurgo.ConfidenceHigh { - fmt.Printf("[%s] 0x%x -> 0x%x (%s)\n", - e.Type, e.SourceAddr, e.TargetAddr, e.AddressMode) - count++ - if count >= 5 { - break - } - } - } -} - -func ExampleDetectFunctions() { - // x86-64 code with prologue and call: - // 0x1000: push rbp; mov rbp, rsp - // 0x1004: call 0x1020 - // ... - // 0x1020: push rbp; mov rbp, rsp (called function) - code := make([]byte, 0x30) - code[0x00] = 0x55 // push rbp - code[0x01] = 0x48 // mov rbp, rsp - code[0x02] = 0x89 - code[0x03] = 0xe5 - code[0x04] = 0xE8 // call - code[0x05] = 0x17 // rel32 = 0x17 - code[0x06] = 0x00 - code[0x07] = 0x00 - code[0x08] = 0x00 - code[0x09] = 0xC3 // ret (establish function boundary) - code[0x20] = 0x55 // push rbp at target - code[0x21] = 0x48 // mov rbp, rsp - code[0x22] = 0x89 - code[0x23] = 0xe5 - - candidates, err := resurgo.DetectFunctions(code, 0x1000, resurgo.ArchAMD64) + candidates, err := resurgo.DetectFunctionsFromELF(f) if err != nil { log.Fatal(err) } + // Count candidates by detection type. + counts := make(map[resurgo.DetectionType]int) for _, c := range candidates { - fmt.Printf("0x%x: %s (confidence: %s)\n", - c.Address, c.DetectionType, c.Confidence) + counts[c.DetectionType]++ } - // Output: - // 0x1000: prologue-only (confidence: medium) - // 0x1020: prologue-callsite (confidence: high) + fmt.Printf("total: %d\n", len(candidates)) + fmt.Printf("prologue+callsite: %d\n", counts[resurgo.DetectionPrologueCallSite]) + fmt.Printf("cfi: %d\n", counts[resurgo.DetectionCFI]) } diff --git a/filter.go b/filter.go index b03bc39..05692d6 100644 --- a/filter.go +++ b/filter.go @@ -9,51 +9,44 @@ import ( // Each filter reads only what it needs from f and returns the updated slice. type CandidateFilter func([]FunctionCandidate, *elf.File) ([]FunctionCandidate, error) -// Option configures the behaviour of DetectFunctionsFromELF. -type Option func(*options) - -type options struct { - filters []CandidateFilter -} - -// WithFilters sets the filter pipeline applied after disassembly. filters run -// in the order provided. Pass no arguments to disable all filters. +// WithFilters replaces the default filter pipeline with the provided filters. +// They run in the order provided. Pass no arguments to disable all filters. func WithFilters(filters ...CandidateFilter) Option { return func(o *options) { o.filters = filters } } -// PLTFilter removes candidates from cs that land inside linker-generated PLT +// PLTFilter removes candidates that land inside linker-generated PLT // sections (.plt, .plt.got, .plt.sec, .iplt) as reported by f. -func PLTFilter(cs []FunctionCandidate, f *elf.File) ([]FunctionCandidate, error) { +func PLTFilter(candidates []FunctionCandidate, f *elf.File) ([]FunctionCandidate, error) { var pltRanges [][2]uint64 for _, name := range []string{".plt", ".plt.got", ".plt.sec", ".iplt"} { if sec := f.Section(name); sec != nil { pltRanges = append(pltRanges, [2]uint64{sec.Addr, sec.Addr + sec.Size}) } } - return filterCandidatesInRanges(cs, pltRanges), nil + return filterCandidatesInRanges(candidates, pltRanges), nil } -// CETFilter filters cs using the CET-aware ENDBR64 heuristic, reading the -// .text section from f. Non-AMD64 binaries are returned unchanged. The filter -// must run before EhFrameFilter so that any aligned-entry candidate it drops -// can be recovered as DetectionCFI when its address appears in an FDE record -// (e.g. _start has no ENDBR64 but does have an FDE entry). -func CETFilter(cs []FunctionCandidate, f *elf.File) ([]FunctionCandidate, error) { +// CETFilter filters candidates using the CET-aware ENDBR64 heuristic, reading +// the .text section from f. Non-AMD64 binaries are returned unchanged. +// The ELF entry point is exempt from the ENDBR64 requirement: it is not an +// indirect branch target and therefore never carries ENDBR64 even in CET +// binaries (e.g. _start). The filter must run before EhFrameFilter. +func CETFilter(candidates []FunctionCandidate, f *elf.File) ([]FunctionCandidate, error) { if f.Machine != elf.EM_X86_64 { - return cs, nil + return candidates, nil } textSec := f.Section(".text") if textSec == nil { - return cs, nil + return candidates, nil } textBytes, err := textSec.Data() if err != nil { return nil, err } - return filterAlignedEntriesCETAMD64(cs, textBytes, textSec.Addr, f.Entry), nil + return filterAlignedEntriesCETAMD64(candidates, textBytes, textSec.Addr, f.Entry), nil } // filterAlignedEntriesCETAMD64 drops aligned-entry candidates lacking ENDBR64