From 4fd3af3960dc6c5cd54676a608021258453a7cb0 Mon Sep 17 00:00:00 2001 From: Devin AI Date: Sat, 25 Apr 2026 17:23:19 +0000 Subject: [PATCH] feat: initial shape-scan implementation Adds entropy and topological-shape file scanner: - Shannon entropy (whole-file + sliding window) in bits/byte - Byte-bigram transition graph with density, joint/conditional entropy, per-row entropy stats, and stable structural fingerprint - Section-aware analysis for ELF, PE, and Mach-O via goblin - CLI with scan/shape/entropy subcommands (text/json/markdown output) - Heuristic risk score with documented weights and small-file dampening - 12 unit tests covering entropy, shape, scoring, and I/O - GitHub Actions CI (fmt + clippy + test on linux/macos/windows) --- .github/workflows/ci.yml | 33 ++ .gitignore | 4 + Cargo.lock | 713 +++++++++++++++++++++++++++++++++++++++ Cargo.toml | 36 ++ LICENSE-APACHE | 15 + LICENSE-MIT | 21 ++ README.md | 111 ++++++ src/entropy.rs | 195 +++++++++++ src/lib.rs | 15 + src/main.rs | 326 ++++++++++++++++++ src/scan.rs | 223 ++++++++++++ src/sections.rs | 167 +++++++++ src/shape.rs | 188 +++++++++++ 13 files changed, 2047 insertions(+) create mode 100644 .github/workflows/ci.yml create mode 100644 .gitignore create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 LICENSE-APACHE create mode 100644 LICENSE-MIT create mode 100644 README.md create mode 100644 src/entropy.rs create mode 100644 src/lib.rs create mode 100644 src/main.rs create mode 100644 src/scan.rs create mode 100644 src/sections.rs create mode 100644 src/shape.rs diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..4293947 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,33 @@ +name: CI + +on: + push: + branches: [main] + pull_request: + +env: + CARGO_TERM_COLOR: always + RUSTFLAGS: -Dwarnings + +jobs: + test: + name: test (${{ matrix.os }}) + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: 
[ubuntu-latest, macos-latest, windows-latest] + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable + with: + components: rustfmt, clippy + - uses: Swatinem/rust-cache@v2 + - name: cargo fmt + run: cargo fmt --all -- --check + - name: cargo clippy + run: cargo clippy --all-targets --all-features -- -D warnings + - name: cargo test + run: cargo test --all-features --verbose + - name: cargo build (release) + run: cargo build --release --verbose diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9ba5eb1 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +/target +Cargo.lock.bak +*.swp +.DS_Store diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..3e15e5d --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,713 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "anstream" +version = "0.6.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" + +[[package]] +name = "anstyle-parse" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.11" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys", +] + +[[package]] +name = "anyhow" +version = "1.0.102" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" + +[[package]] +name = "bitflags" +version = "2.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3" + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = "clap" +version = "4.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90bc066a67923782aa8515dbaea16946c5bcc5addbd668bb80af688e53e548a0" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae129e2e766ae0ec03484e609954119f123cc1fe650337e155d03b022f24f7b4" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "528131438037fd55894f62d6e9f068b8f45ac57ffa77517819645d10aed04f64" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "0.7.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3e64b0cc0439b12df2fa678eae89a1c56a529fd067a9115f7827f1fffd22b32" + +[[package]] +name = "colorchoice" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" + 
+[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "errno" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" +dependencies = [ + "libc", + "windows-sys", +] + +[[package]] +name = "fastrand" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6" + +[[package]] +name = "foldhash" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" + +[[package]] +name = "getrandom" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" +dependencies = [ + "cfg-if", + "libc", + 
"r-efi", + "wasip2", + "wasip3", +] + +[[package]] +name = "goblin" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b363a30c165f666402fe6a3024d3bec7ebc898f96a4a23bd1c99f8dbf3f4f47" +dependencies = [ + "log", + "plain", + "scroll", +] + +[[package]] +name = "hashbrown" +version = "0.15.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" +dependencies = [ + "foldhash", +] + +[[package]] +name = "hashbrown" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "id-arena" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" + +[[package]] +name = "indexmap" +version = "2.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" +dependencies = [ + "equivalent", + "hashbrown 0.17.0", + "serde", + "serde_core", +] + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" + +[[package]] +name = "itoa" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" + +[[package]] +name = "leb128fmt" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" + +[[package]] +name = "libc" +version = "0.2.186" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" + +[[package]] +name = "linux-raw-sys" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" + +[[package]] +name = "log" +version = "0.4.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" + +[[package]] +name = "memchr" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" + +[[package]] +name = "once_cell" +version = "1.21.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" + +[[package]] +name = "once_cell_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" + +[[package]] +name = "plain" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6" + +[[package]] +name = "prettyplease" +version = "0.2.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" +dependencies = [ + "proc-macro2", + "syn", +] + +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = 
"1.0.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "r-efi" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" + +[[package]] +name = "rayon" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb39b166781f92d482534ef4b4b1b2568f42613b53e5b6c160e24cfbfa30926d" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "rustix" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys", +] + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "scroll" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ab8598aa408498679922eff7fa985c25d58a90771bd6be794434c5277eab1a6" +dependencies = [ + "scroll_derive", +] + +[[package]] +name = "scroll_derive" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1783eabc414609e28a5ba76aee5ddd52199f7107a0b24c2e9746a1ecc34a683d" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "semver" +version = "1.0.28" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd" + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.149" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" +dependencies = [ + "itoa", + "memchr", + "serde", + "serde_core", + "zmij", +] + +[[package]] +name = "shape-scan" +version = "0.1.0" +dependencies = [ + "anyhow", + "clap", + "goblin", + "rayon", + "serde", + "serde_json", + "tempfile", + "thiserror", + "walkdir", +] + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "syn" +version = "2.0.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "tempfile" +version = "3.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" +dependencies = [ + "fastrand", + "getrandom", + "once_cell", + "rustix", + "windows-sys", +] + +[[package]] +name = "thiserror" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "unicode-ident" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" + +[[package]] +name = "unicode-xid" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" + +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "wasip2" +version = "1.0.3+wasi-0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20064672db26d7cdc89c7798c48a0fdfac8213434a1186e5ef29fd560ae223d6" +dependencies = [ + "wit-bindgen 0.57.1", +] + +[[package]] +name = "wasip3" +version = "0.4.0+wasi-0.3.0-rc-2026-01-06" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" 
+dependencies = [ + "wit-bindgen 0.51.0", +] + +[[package]] +name = "wasm-encoder" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" +dependencies = [ + "leb128fmt", + "wasmparser", +] + +[[package]] +name = "wasm-metadata" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" +dependencies = [ + "anyhow", + "indexmap", + "wasm-encoder", + "wasmparser", +] + +[[package]] +name = "wasmparser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" +dependencies = [ + "bitflags", + "hashbrown 0.15.5", + "indexmap", + "semver", +] + +[[package]] +name = "winapi-util" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] + +[[package]] +name = "wit-bindgen" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" +dependencies = [ + "wit-bindgen-rust-macro", +] + +[[package]] +name = "wit-bindgen" +version = "0.57.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e" + +[[package]] +name = "wit-bindgen-core" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" +dependencies = [ + "anyhow", + "heck", + "wit-parser", +] + +[[package]] +name = "wit-bindgen-rust" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" +dependencies = [ + "anyhow", + "heck", + "indexmap", + "prettyplease", + "syn", + "wasm-metadata", + "wit-bindgen-core", + "wit-component", +] + +[[package]] +name = "wit-bindgen-rust-macro" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" +dependencies = [ + "anyhow", + "prettyplease", + "proc-macro2", + "quote", + "syn", + "wit-bindgen-core", + "wit-bindgen-rust", +] + +[[package]] +name = "wit-component" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" +dependencies = [ + "anyhow", + "bitflags", + "indexmap", + "log", + "serde", + "serde_derive", + "serde_json", + "wasm-encoder", + "wasm-metadata", + "wasmparser", + "wit-parser", +] + +[[package]] +name = "wit-parser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" +dependencies = [ + "anyhow", + "id-arena", + "indexmap", + "log", + "semver", + "serde", + "serde_derive", + "serde_json", + "unicode-xid", + "wasmparser", +] + +[[package]] +name = "zmij" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" diff --git a/Cargo.toml 
b/Cargo.toml new file mode 100644 index 0000000..c40d344 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,36 @@ +[package] +name = "shape-scan" +version = "0.1.0" +edition = "2021" +description = "CLI scanner that measures the entropy and topological shape of files to flag suspicious binaries (e.g. packed/encrypted payloads)." +license = "MIT OR Apache-2.0" +repository = "https://github.com/DimaMenetro/shape-scan" +readme = "README.md" +keywords = ["entropy", "malware", "topology", "binary-analysis", "cli"] +categories = ["command-line-utilities", "filesystem"] + +[[bin]] +name = "shape-scan" +path = "src/main.rs" + +[lib] +name = "shape_scan" +path = "src/lib.rs" + +[dependencies] +anyhow = "1.0" +clap = { version = "4.5", features = ["derive"] } +goblin = { version = "0.8", default-features = false, features = ["std", "elf32", "elf64", "pe32", "pe64", "mach32", "mach64", "endian_fd"] } +rayon = "1.10" +serde = { version = "1", features = ["derive"] } +serde_json = "1" +thiserror = "1" +walkdir = "2.5" + +[dev-dependencies] +tempfile = "3.10" + +[profile.release] +lto = "thin" +codegen-units = 1 +strip = "symbols" diff --git a/LICENSE-APACHE b/LICENSE-APACHE new file mode 100644 index 0000000..d07ae9a --- /dev/null +++ b/LICENSE-APACHE @@ -0,0 +1,15 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/LICENSE-MIT b/LICENSE-MIT new file mode 100644 index 0000000..1e4b82d --- /dev/null +++ b/LICENSE-MIT @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2025 Dima Menetro + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..4e6c1b0 --- /dev/null +++ b/README.md @@ -0,0 +1,111 @@ +# shape-scan + +`shape-scan` is a small command-line tool that measures the **Shannon entropy** +and **topological shape** of files. It's intended as a triage signal during +binary analysis: files that look statistically similar to packed, encrypted, or +otherwise obfuscated content (a common malware pattern) are surfaced with a +heuristic risk score. + +> **Honest claim:** `shape-scan` is **not** a malware classifier and it cannot +> make malware "impossible to get past". 
File entropy and byte-graph shape are +> well-known, well-studied features — sophisticated malware authors deliberately +> tune their payloads to evade exactly these checks (e.g. by stuffing English +> text or padding into otherwise random sections). Use `shape-scan` the way +> you'd use `file(1)` or `strings(1)`: as a fast, statistically-grounded signal +> that helps a human prioritise what to look at next. + +## What it computes + +For every file you point it at: + +1. **Shannon entropy** of the whole file, in bits/byte (max 8.0). +2. **Sliding-window entropy**: per-window mean, std-dev, min/max, and the + fraction of windows above 7.5 bits/byte (a common "looks encrypted" + threshold). +3. **Per-section entropy** for ELF, PE, and Mach-O binaries (via `goblin`). +4. **Topological shape** of the byte stream, treated as a Markov chain over its + bytes: + - `|V|` — number of distinct byte values present (≤ 256) + - `|E|` — number of distinct adjacent byte pairs (≤ 65 536) + - **edge density** — `|E| / 65 536`, in `[0, 1]` + - **bigram entropy** — joint Shannon entropy of the 256×256 transition + matrix, in bits/pair + - **conditional entropy** `H(b_{i+1} | b_i)` + - **mean per-row entropy** ± std-dev across the rows of the transition + matrix + - **structural fingerprint** — a stable 64-bit hash of the quantised + transition matrix +5. **Combined risk score** in `[0.0, 1.0]` plus a coarse `low`/`medium`/`high` + bucket and a list of human-readable indicators explaining the score. + +## Install + +```sh +cargo install --path . 
+``` + +Or build a release binary: + +```sh +cargo build --release +./target/release/shape-scan --help +``` + +## Usage + +```sh +# Scan a single file +shape-scan scan ./suspect.bin + +# Scan a directory recursively, only show medium-or-higher risk, JSON output +shape-scan scan ./samples -r --min-risk medium --format json + +# Just the topology of one file +shape-scan shape ./suspect.bin + +# Just the entropy profile, with a 1 KiB sliding window +shape-scan entropy ./suspect.bin --window 1024 +``` + +Exit codes: + +- `0` — completed; no high-risk files found +- `1` — completed; at least one high-risk file found +- `2` — error (bad path, I/O failure, etc.) + +## How the score is composed + +The score is a **weighted sum of independent indicators**, each clamped so no +single feature can dominate: + +| Indicator | Weight | +|----------------------------------------------------------------|--------| +| Whole-file entropy ≥ 7.5 bits/byte | +0.35 | +| Whole-file entropy 7.0–7.5 | +0.15 | +| ≥ 50% of sliding windows above 7.5 bits/byte | +0.20 | +| Window-entropy std-dev ≥ 1.5 | +0.05 | +| Bigram-graph edge density ≥ 0.85 | +0.15 | +| Conditional entropy ≥ 7.5 bits/byte | +0.10 | +| ELF/PE/Mach-O section ≥ 256 B with entropy ≥ 7.5 (max once) | +0.15 | +| Files smaller than 1 KiB get the score scaled by 0.4 | — | + +Buckets: `< 0.45` → `low`, `< 0.75` → `medium`, otherwise `high`. + +## Use it as a library + +The crate also exposes a small library API: + +```rust +use shape_scan::{scan_path, RiskLevel}; + +let report = scan_path(std::path::Path::new("suspect.bin"))?; +println!("{:?}", report.risk_level); +for ind in &report.indicators { + println!("- {ind}"); +} +``` + +## License + +Dual-licensed under either of [MIT](LICENSE-MIT) or +[Apache-2.0](LICENSE-APACHE), at your option. diff --git a/src/entropy.rs b/src/entropy.rs new file mode 100644 index 0000000..0225331 --- /dev/null +++ b/src/entropy.rs @@ -0,0 +1,195 @@ +//! Shannon-entropy primitives. +//! +//! 
All entropy values are in **bits per byte** and therefore lie in the +//! closed interval `[0.0, 8.0]`. A uniform distribution over the 256 +//! possible byte values reaches the maximum of 8 bits/byte. + +use serde::Serialize; + +/// Default sliding-window size used by `WindowEntropy::from_bytes`. +/// +/// 4 KiB is the de-facto industry default for "block entropy" analysis — +/// it's large enough that uniform-random bytes saturate near 8.0 +/// (small-sample bias is ≲ 0.05 bits/byte) but small enough that local +/// regions of structured data still stand out. +pub const DEFAULT_WINDOW: usize = 4096; + +/// Compute the byte-frequency histogram of `data`. +#[inline] +pub fn histogram(data: &[u8]) -> [u64; 256] { + let mut counts = [0u64; 256]; + for &b in data { + counts[b as usize] += 1; + } + counts +} + +/// Shannon entropy of a byte histogram, in bits per byte. +pub fn entropy_from_histogram(counts: &[u64; 256], total: u64) -> f64 { + if total == 0 { + return 0.0; + } + let total_f = total as f64; + let mut h = 0.0f64; + for &c in counts.iter() { + if c == 0 { + continue; + } + let p = c as f64 / total_f; + h -= p * p.log2(); + } + h +} + +/// Shannon entropy of `data`, in bits per byte. +#[inline] +pub fn shannon_entropy(data: &[u8]) -> f64 { + let counts = histogram(data); + entropy_from_histogram(&counts, data.len() as u64) +} + +/// Aggregated entropy statistics for a byte slice. +#[derive(Debug, Clone, Serialize)] +pub struct EntropyReport { + /// Total number of bytes analysed. + pub size: u64, + /// Shannon entropy of the entire slice. + pub shannon_bits_per_byte: f64, + /// Normalised entropy (`shannon / 8`), in `[0, 1]`. + pub normalised: f64, + /// Sliding-window analysis (omitted for empty inputs). + pub windows: Option<WindowEntropy>, +} + +impl EntropyReport { + /// Compute a full entropy report using the default window size. 
+ pub fn from_bytes(data: &[u8]) -> Self { + Self::from_bytes_with_window(data, DEFAULT_WINDOW) + } + + /// Compute a full entropy report with a caller-specified window size. + pub fn from_bytes_with_window(data: &[u8], window: usize) -> Self { + let shannon = shannon_entropy(data); + let windows = if data.is_empty() { + None + } else { + Some(WindowEntropy::from_bytes(data, window)) + }; + Self { + size: data.len() as u64, + shannon_bits_per_byte: shannon, + normalised: shannon / 8.0, + windows, + } + } +} + +/// Sliding-window entropy summary. +#[derive(Debug, Clone, Serialize)] +pub struct WindowEntropy { + /// Window size in bytes. + pub window_size: usize, + /// Number of windows analysed. + pub count: usize, + /// Mean per-window entropy (bits/byte). + pub mean: f64, + /// Min per-window entropy. + pub min: f64, + /// Max per-window entropy. + pub max: f64, + /// Standard deviation across windows. + pub stddev: f64, + /// Fraction of windows with entropy >= 7.5 bits/byte (typical + /// "high entropy" threshold for compressed/encrypted data). + pub high_entropy_fraction: f64, +} + +impl WindowEntropy { + /// Compute sliding-window entropy. Windows are non-overlapping and + /// the trailing partial window (if any) is included. 
+ pub fn from_bytes(data: &[u8], window: usize) -> Self { + let window = window.max(1); + if data.is_empty() { + return Self { + window_size: window, + count: 0, + mean: 0.0, + min: 0.0, + max: 0.0, + stddev: 0.0, + high_entropy_fraction: 0.0, + }; + } + let mut entropies = Vec::with_capacity(data.len().div_ceil(window)); + for chunk in data.chunks(window) { + entropies.push(shannon_entropy(chunk)); + } + let count = entropies.len(); + let mean = entropies.iter().sum::() / count as f64; + let mut min = f64::INFINITY; + let mut max = f64::NEG_INFINITY; + let mut high = 0usize; + for &e in &entropies { + if e < min { + min = e; + } + if e > max { + max = e; + } + if e >= 7.5 { + high += 1; + } + } + let var = entropies + .iter() + .map(|e| { + let d = e - mean; + d * d + }) + .sum::() + / count as f64; + Self { + window_size: window, + count, + mean, + min, + max, + stddev: var.sqrt(), + high_entropy_fraction: high as f64 / count as f64, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn entropy_of_empty_is_zero() { + assert_eq!(shannon_entropy(&[]), 0.0); + } + + #[test] + fn entropy_of_constant_is_zero() { + let data = vec![0xAAu8; 1024]; + assert!(shannon_entropy(&data).abs() < 1e-12); + } + + #[test] + fn entropy_of_uniform_is_eight() { + // One copy of each possible byte → perfectly uniform. + let data: Vec = (0u16..256).map(|x| x as u8).collect(); + let h = shannon_entropy(&data); + assert!((h - 8.0).abs() < 1e-9, "entropy was {h}"); + } + + #[test] + fn report_contains_window_stats() { + let data: Vec = (0..4096).map(|i| (i * 7) as u8).collect(); + let r = EntropyReport::from_bytes(&data); + assert!(r.shannon_bits_per_byte > 7.0); + let w = r.windows.expect("windows present for non-empty input"); + assert!(w.count > 0); + assert!(w.mean > 0.0); + } +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..413dd15 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,15 @@ +//! Library crate for `shape-scan`. +//! +//! 
// Top-level command-line definition. clap derives the argument parser from
// this struct; its only field is the subcommand the user selected.
// The help text is set explicitly through `about` / `long_about` below, so
// plain `//` comments are used here to keep them out of the generated help.
#[derive(Parser, Debug)]
#[command(
    name = "shape-scan",
    version,
    about = "Measure the entropy and topological shape of files to flag suspicious binaries.",
    long_about = "shape-scan analyses every byte of a file (and, where possible, every section \
        of an executable) to produce two complementary signals: a Shannon-entropy profile and a \
        byte-bigram-graph (\"shape\") profile. It then combines them into a heuristic risk score. \
        This is a triage signal, not a verdict — sophisticated malware can be tuned to evade \
        entropy and shape heuristics."
)]
struct Cli {
    // Which subcommand to run (`scan`, `shape` or `entropy`).
    #[command(subcommand)]
    command: Command,
}
+ #[arg(short, long, value_enum, default_value_t = Format::Text)] + format: Format, + /// Only show files at or above this risk level. + #[arg(long, value_enum)] + min_risk: Option, + /// Number of parallel workers (0 = auto). + #[arg(short, long, default_value_t = 0)] + jobs: usize, + }, + /// Print only the topological shape report for a single file. + Shape { + path: PathBuf, + #[arg(short, long, value_enum, default_value_t = Format::Text)] + format: Format, + }, + /// Print only the entropy report for a single file. + Entropy { + path: PathBuf, + #[arg(short, long, value_enum, default_value_t = Format::Text)] + format: Format, + /// Sliding-window size in bytes. + #[arg(short, long, default_value_t = 4096)] + window: usize, + }, +} + +#[derive(Copy, Clone, Debug, ValueEnum)] +enum Format { + Text, + Json, + Markdown, +} + +#[derive(Copy, Clone, Debug, ValueEnum, PartialEq, Eq)] +enum RiskFilter { + Low, + Medium, + High, +} + +impl RiskFilter { + fn includes(self, r: RiskLevel) -> bool { + let order = |x| match x { + RiskLevel::Low => 0, + RiskLevel::Medium => 1, + RiskLevel::High => 2, + }; + let threshold = match self { + Self::Low => 0, + Self::Medium => 1, + Self::High => 2, + }; + order(r) >= threshold + } +} + +fn main() -> ExitCode { + let cli = Cli::parse(); + let res = match cli.command { + Command::Scan { + paths, + recursive, + max_size_mib, + format, + min_risk, + jobs, + } => run_scan(paths, recursive, max_size_mib, format, min_risk, jobs), + Command::Shape { path, format } => run_shape(&path, format), + Command::Entropy { + path, + format, + window, + } => run_entropy(&path, format, window), + }; + match res { + Ok(code) => code, + Err(e) => { + eprintln!("shape-scan: error: {e:#}"); + ExitCode::from(2) + } + } +} + +fn run_scan( + paths: Vec, + recursive: bool, + max_size_mib: u64, + format: Format, + min_risk: Option, + jobs: usize, +) -> Result { + if jobs > 0 { + rayon::ThreadPoolBuilder::new() + .num_threads(jobs) + .build_global() + .ok(); + 
} + + let max_bytes = if max_size_mib == 0 { + u64::MAX + } else { + max_size_mib * 1024 * 1024 + }; + + let targets = collect_targets(&paths, recursive, max_bytes)?; + + let reports: Vec = targets + .par_iter() + .filter_map(|p| match scan_path(p) { + Ok(r) => Some(r), + Err(e) => { + eprintln!("shape-scan: skipping {}: {e:#}", p.display()); + None + } + }) + .filter(|r| match min_risk { + Some(filter) => filter.includes(r.risk_level), + None => true, + }) + .collect(); + + let mut sorted = reports; + sorted.sort_by(|a, b| b.risk_score.partial_cmp(&a.risk_score).unwrap()); + + match format { + Format::Json => { + println!("{}", serde_json::to_string_pretty(&sorted)?); + } + Format::Text => print_text(&sorted), + Format::Markdown => print_markdown(&sorted), + } + + let any_high = sorted.iter().any(|r| r.risk_level == RiskLevel::High); + Ok(if any_high { + ExitCode::from(1) + } else { + ExitCode::SUCCESS + }) +} + +fn collect_targets(paths: &[PathBuf], recursive: bool, max_bytes: u64) -> Result> { + let mut out = Vec::new(); + for p in paths { + if !p.exists() { + anyhow::bail!("path does not exist: {}", p.display()); + } + if p.is_file() { + if file_size(p)? <= max_bytes { + out.push(p.clone()); + } + continue; + } + if p.is_dir() { + let walker = WalkDir::new(p).follow_links(false); + let walker = if recursive { + walker + } else { + walker.max_depth(1) + }; + for entry in walker.into_iter().filter_map(|e| e.ok()) { + if entry.file_type().is_file() { + let path = entry.path().to_path_buf(); + if file_size(&path).unwrap_or(0) <= max_bytes { + out.push(path); + } + } + } + } + } + Ok(out) +} + +fn file_size(p: &Path) -> Result { + Ok(std::fs::metadata(p) + .with_context(|| format!("stat {}", p.display()))? 
+ .len()) +} + +fn run_shape(path: &Path, format: Format) -> Result { + let report = scan_path(path)?; + match format { + Format::Json => println!("{}", serde_json::to_string_pretty(&report.shape)?), + Format::Text | Format::Markdown => { + let s = &report.shape; + println!("shape report for {}", path.display()); + println!(" distinct bytes : {}", s.distinct_bytes); + println!(" distinct bigrams : {}", s.distinct_bigrams); + println!(" edge density : {:.4}", s.edge_density); + println!( + " bigram entropy : {:.4} bits/pair", + s.bigram_entropy_bits + ); + println!( + " conditional entropy : {:.4} bits/byte", + s.conditional_entropy_bits + ); + println!( + " mean row entropy : {:.4} ± {:.4} bits/byte", + s.mean_row_entropy_bits, s.row_entropy_stddev + ); + println!(" fingerprint : {}", s.structural_fingerprint); + } + } + Ok(ExitCode::SUCCESS) +} + +fn run_entropy(path: &Path, format: Format, window: usize) -> Result { + let data = std::fs::read(path).with_context(|| format!("failed to read {}", path.display()))?; + let report = shape_scan::EntropyReport::from_bytes_with_window(&data, window); + match format { + Format::Json => println!("{}", serde_json::to_string_pretty(&report)?), + Format::Text | Format::Markdown => { + println!("entropy report for {}", path.display()); + println!(" size : {} bytes", report.size); + println!( + " Shannon entropy : {:.4} bits/byte ({:.2}% of max)", + report.shannon_bits_per_byte, + report.normalised * 100.0 + ); + if let Some(w) = report.windows { + println!(" window size : {} bytes", w.window_size); + println!(" windows : {}", w.count); + println!( + " per-window mean : {:.4} ± {:.4} bits/byte", + w.mean, w.stddev + ); + println!(" per-window range : [{:.4}, {:.4}]", w.min, w.max); + println!( + " high-entropy frac : {:.2}% (windows ≥ 7.5 bits/byte)", + w.high_entropy_fraction * 100.0 + ); + } + } + } + Ok(ExitCode::SUCCESS) +} + +fn print_text(reports: &[FileReport]) { + if reports.is_empty() { + println!("(no files matched)"); 
+ return; + } + for r in reports { + println!( + "[{level:^6}] score={score:.2} entropy={ent:.2} bits/byte density={dens:.2} {path}", + level = r.risk_level.as_str(), + score = r.risk_score, + ent = r.entropy.shannon_bits_per_byte, + dens = r.shape.edge_density, + path = r.path.display() + ); + for ind in &r.indicators { + println!(" - {ind}"); + } + } +} + +fn print_markdown(reports: &[FileReport]) { + println!("# shape-scan report"); + println!(); + println!("| risk | score | entropy | density | path |"); + println!("|------|-------|---------|---------|------|"); + for r in reports { + println!( + "| {} | {:.2} | {:.2} | {:.2} | `{}` |", + r.risk_level.as_str(), + r.risk_score, + r.entropy.shannon_bits_per_byte, + r.shape.edge_density, + r.path.display() + ); + } + println!(); + for r in reports { + if r.indicators.is_empty() { + continue; + } + println!("## `{}`", r.path.display()); + for ind in &r.indicators { + println!("- {ind}"); + } + println!(); + } +} diff --git a/src/scan.rs b/src/scan.rs new file mode 100644 index 0000000..59e7cc4 --- /dev/null +++ b/src/scan.rs @@ -0,0 +1,223 @@ +//! High-level per-file scanning + risk scoring. +//! +//! The scoring function combines several entropy and shape features +//! into a single `risk_score` in `[0.0, 1.0]`. This is **not** a +//! malware classifier — it is a heuristic ranking signal that +//! highlights files which look statistically similar to packed, +//! encrypted, or otherwise obfuscated content. Use it as input to +//! human review, not as a verdict. + +use std::fs; +use std::path::{Path, PathBuf}; + +use anyhow::{Context, Result}; +use serde::Serialize; + +use crate::entropy::EntropyReport; +use crate::sections::SectionReport; +use crate::shape::ShapeReport; + +/// Coarse risk bucket derived from `risk_score`. 
+#[derive(Debug, Clone, Copy, Serialize, PartialEq, Eq)] +#[serde(rename_all = "lowercase")] +pub enum RiskLevel { + Low, + Medium, + High, +} + +impl RiskLevel { + pub fn from_score(s: f64) -> Self { + if s >= 0.75 { + Self::High + } else if s >= 0.45 { + Self::Medium + } else { + Self::Low + } + } + + pub fn as_str(self) -> &'static str { + match self { + Self::Low => "low", + Self::Medium => "medium", + Self::High => "high", + } + } +} + +/// Full per-file analysis result. +#[derive(Debug, Clone, Serialize)] +pub struct FileReport { + pub path: PathBuf, + pub size: u64, + pub entropy: EntropyReport, + pub shape: ShapeReport, + pub sections: SectionReport, + /// Heuristic indicators that contributed to the score. + pub indicators: Vec, + /// Combined risk score in `[0.0, 1.0]`. + pub risk_score: f64, + pub risk_level: RiskLevel, +} + +impl FileReport { + pub fn analyse_bytes(path: impl Into, data: &[u8]) -> Self { + let entropy = EntropyReport::from_bytes(data); + let shape = ShapeReport::from_bytes(data); + let sections = SectionReport::from_bytes(data); + let (indicators, risk_score) = score(&entropy, &shape, §ions); + Self { + path: path.into(), + size: data.len() as u64, + entropy, + shape, + sections, + indicators, + risk_score, + risk_level: RiskLevel::from_score(risk_score), + } + } +} + +/// Read `path` from disk and analyse it. +pub fn scan_path(path: &Path) -> Result { + let data = fs::read(path).with_context(|| format!("failed to read {}", path.display()))?; + Ok(FileReport::analyse_bytes(path, &data)) +} + +fn score(e: &EntropyReport, s: &ShapeReport, sec: &SectionReport) -> (Vec, f64) { + let mut indicators = Vec::new(); + let mut score = 0.0f64; + + // 1. Whole-file entropy. >7.0 is suspicious, >7.5 is strongly so. 
+ if e.shannon_bits_per_byte >= 7.5 { + indicators.push(format!( + "high overall entropy ({:.2} bits/byte) — typical of packed or encrypted data", + e.shannon_bits_per_byte + )); + score += 0.35; + } else if e.shannon_bits_per_byte >= 7.0 { + indicators.push(format!( + "elevated overall entropy ({:.2} bits/byte)", + e.shannon_bits_per_byte + )); + score += 0.15; + } + + // 2. Sliding window: a high fraction of high-entropy windows is the + // most reliable single signal. + if let Some(w) = &e.windows { + if w.high_entropy_fraction >= 0.5 { + indicators.push(format!( + "{:.0}% of {}-byte windows are high-entropy", + w.high_entropy_fraction * 100.0, + w.window_size + )); + score += 0.2; + } + if w.stddev >= 1.5 { + indicators.push(format!( + "highly variable entropy across the file (stddev={:.2})", + w.stddev + )); + score += 0.05; + } + } + + // 3. Topological density: random-looking data fills the bigram graph. + if s.edge_density >= 0.85 { + indicators.push(format!( + "near-complete byte-bigram graph (edge density {:.2})", + s.edge_density + )); + score += 0.15; + } + if s.conditional_entropy_bits >= 7.5 { + indicators.push(format!( + "uniform conditional entropy ({:.2} bits/byte) — bytes are nearly memoryless", + s.conditional_entropy_bits + )); + score += 0.1; + } + + // 4. Section-level red flags: any executable section with high + // entropy is a classic packing indicator. + if sec.format == "pe" || sec.format == "elf" || sec.format == "mach" { + for sec_e in &sec.sections { + if sec_e.size >= 256 && sec_e.shannon_bits_per_byte >= 7.5 { + indicators.push(format!( + "{} section `{}` has high entropy ({:.2} bits/byte, {} B)", + sec.format.to_uppercase(), + sec_e.name, + sec_e.shannon_bits_per_byte, + sec_e.size + )); + score += 0.15; + // Only credit once per file to avoid runaway scores on + // many-section binaries. + break; + } + } + } + + // 5. Tiny files don't carry meaningful statistics — dampen the score. 
+ if e.size < 1024 { + score *= 0.4; + indicators.push(format!( + "small file ({} B) — entropy/shape signals are less reliable", + e.size + )); + } + + (indicators, score.clamp(0.0, 1.0)) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::io::Write; + + #[test] + fn scoring_low_for_text() { + let s = b"Lorem ipsum dolor sit amet. ".repeat(512); + let r = FileReport::analyse_bytes("text.txt", &s); + assert_eq!(r.risk_level, RiskLevel::Low); + assert!(r.risk_score < 0.45, "score was {}", r.risk_score); + } + + #[test] + fn scoring_high_for_random() { + // Pseudorandom data via SplitMix64 — much better quality than + // raw xorshift's low bits. 256 KiB is enough for the bigram + // graph to saturate and for conditional entropy to converge. + let mut state: u64 = 0xdead_beef_cafe_babe; + let n = 256 * 1024; + let mut data = Vec::with_capacity(n); + while data.len() < n { + state = state.wrapping_add(0x9E37_79B9_7F4A_7C15); + let mut z = state; + z = (z ^ (z >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9); + z = (z ^ (z >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB); + z ^= z >> 31; + data.extend_from_slice(&z.to_le_bytes()); + } + data.truncate(n); + let r = FileReport::analyse_bytes("rand.bin", &data); + assert!( + r.risk_score >= 0.75, + "expected high risk, got {} ({:?})", + r.risk_score, + r.indicators + ); + assert_eq!(r.risk_level, RiskLevel::High); + } + + #[test] + fn scan_path_reads_file() { + let mut f = tempfile::NamedTempFile::new().unwrap(); + f.write_all(b"hello shape-scan").unwrap(); + let r = scan_path(f.path()).unwrap(); + assert_eq!(r.size, 16); + } +} diff --git a/src/sections.rs b/src/sections.rs new file mode 100644 index 0000000..5b6425c --- /dev/null +++ b/src/sections.rs @@ -0,0 +1,167 @@ +//! Section-aware analysis for known executable formats (ELF, PE, Mach-O). +//! +//! Falls back gracefully to a single "" pseudo-section for +//! anything that isn't recognised by `goblin`. 
+ +use goblin::Object; +use serde::Serialize; + +use crate::entropy::shannon_entropy; + +/// Per-section entropy summary. +#[derive(Debug, Clone, Serialize)] +pub struct SectionEntropy { + pub name: String, + pub size: u64, + pub shannon_bits_per_byte: f64, +} + +/// Top-level section report. +#[derive(Debug, Clone, Serialize)] +pub struct SectionReport { + /// Detected format: "elf", "pe", "mach", "archive", or "unknown". + pub format: String, + pub sections: Vec, +} + +impl SectionReport { + /// Parse `data` and produce per-section entropy. Never fails: on + /// unrecognised input it returns a single pseudo-section covering + /// the whole file. + pub fn from_bytes(data: &[u8]) -> Self { + match Object::parse(data) { + Ok(Object::Elf(elf)) => Self::from_elf(&elf, data), + Ok(Object::PE(pe)) => Self::from_pe(&pe, data), + Ok(Object::Mach(mach)) => Self::from_mach(&mach, data), + Ok(Object::Archive(_)) => Self::fallback("archive", data), + _ => Self::fallback("unknown", data), + } + } + + fn fallback(format: &str, data: &[u8]) -> Self { + Self { + format: format.to_string(), + sections: vec![SectionEntropy { + name: "".to_string(), + size: data.len() as u64, + shannon_bits_per_byte: shannon_entropy(data), + }], + } + } + + fn from_elf(elf: &goblin::elf::Elf, data: &[u8]) -> Self { + let mut sections = Vec::with_capacity(elf.section_headers.len()); + for sh in &elf.section_headers { + let name = elf + .shdr_strtab + .get_at(sh.sh_name) + .unwrap_or("") + .to_string(); + let start = sh.sh_offset as usize; + let size = sh.sh_size as usize; + let end = start.saturating_add(size).min(data.len()); + let bytes = if start < data.len() { + &data[start..end] + } else { + &[][..] 
+ }; + sections.push(SectionEntropy { + name, + size: bytes.len() as u64, + shannon_bits_per_byte: shannon_entropy(bytes), + }); + } + Self { + format: "elf".to_string(), + sections, + } + } + + fn from_pe(pe: &goblin::pe::PE, data: &[u8]) -> Self { + let mut sections = Vec::with_capacity(pe.sections.len()); + for s in &pe.sections { + let name = String::from_utf8_lossy( + &s.name[..s.name.iter().position(|&b| b == 0).unwrap_or(s.name.len())], + ) + .into_owned(); + let start = s.pointer_to_raw_data as usize; + let size = s.size_of_raw_data as usize; + let end = start.saturating_add(size).min(data.len()); + let bytes = if start < data.len() { + &data[start..end] + } else { + &[][..] + }; + sections.push(SectionEntropy { + name, + size: bytes.len() as u64, + shannon_bits_per_byte: shannon_entropy(bytes), + }); + } + Self { + format: "pe".to_string(), + sections, + } + } + + fn from_mach(mach: &goblin::mach::Mach, data: &[u8]) -> Self { + let mut sections = Vec::new(); + match mach { + goblin::mach::Mach::Binary(bin) => { + Self::push_mach_sections(bin, data, &mut sections); + } + goblin::mach::Mach::Fat(fat) => { + if let Ok(arches) = fat.arches() { + for arch in arches { + if let Ok(goblin::mach::SingleArch::MachO(bin)) = + fat.get(arch.offset as usize) + { + Self::push_mach_sections(&bin, data, &mut sections); + } + } + } + } + } + Self { + format: "mach".to_string(), + sections, + } + } + + fn push_mach_sections(bin: &goblin::mach::MachO, data: &[u8], out: &mut Vec) { + for seg in &bin.segments { + if let Ok(secs) = seg.sections() { + for (s, _bytes) in secs { + let name = + format!("{},{}", s.segname().unwrap_or("?"), s.name().unwrap_or("?")); + let start = s.offset as usize; + let size = s.size as usize; + let end = start.saturating_add(size).min(data.len()); + let slice = if start < data.len() { + &data[start..end] + } else { + &[][..] 
+ }; + out.push(SectionEntropy { + name, + size: slice.len() as u64, + shannon_bits_per_byte: shannon_entropy(slice), + }); + } + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn unknown_input_yields_pseudo_section() { + let r = SectionReport::from_bytes(b"hello, world"); + assert_eq!(r.format, "unknown"); + assert_eq!(r.sections.len(), 1); + assert_eq!(r.sections[0].name, ""); + } +} diff --git a/src/shape.rs b/src/shape.rs new file mode 100644 index 0000000..e96caaa --- /dev/null +++ b/src/shape.rs @@ -0,0 +1,188 @@ +//! Topological / structural "shape" features of a byte stream. +//! +//! We treat the file as a Markov chain over its bytes: each byte is a +//! node, and each adjacent pair `(b_i, b_{i+1})` is a directed edge. +//! The resulting 256-node transition graph is dense for random data +//! and sparse / clumpy for structured data, which makes a number of +//! simple graph statistics useful as a fingerprint. + +use serde::Serialize; + +use crate::entropy::entropy_from_histogram; + +/// 256-bin byte histogram (re-export friendly helper). +pub fn byte_histogram(data: &[u8]) -> [u64; 256] { + crate::entropy::histogram(data) +} + +/// Topological summary of a byte stream. +#[derive(Debug, Clone, Serialize)] +pub struct ShapeReport { + /// Number of distinct byte values present (`|V|`, max 256). + pub distinct_bytes: u32, + /// Number of distinct adjacent byte pairs present (`|E|`, max 65_536). + pub distinct_bigrams: u32, + /// Edge density: `|E| / 65_536` in `[0, 1]`. + pub edge_density: f64, + /// Joint Shannon entropy of the bigram distribution, in bits per + /// pair. Maxes out at 16 bits for a uniform 256×256 distribution. + pub bigram_entropy_bits: f64, + /// Conditional entropy `H(b_{i+1} | b_i)` in bits per byte. + pub conditional_entropy_bits: f64, + /// Spectral-style summary of the row-stochastic transition matrix: + /// the mean Shannon entropy of each row that has any outgoing + /// transitions, in bits per byte. 
+ pub mean_row_entropy_bits: f64, + /// Standard deviation of per-row entropies. + pub row_entropy_stddev: f64, + /// Stable 64-bit fingerprint derived from the bigram graph. + pub structural_fingerprint: String, +} + +impl ShapeReport { + /// Compute the full shape report. Cost is `O(n + 65_536)`. + pub fn from_bytes(data: &[u8]) -> Self { + if data.len() < 2 { + return Self::empty(); + } + + // 1. Byte histogram and bigram matrix. + let unigram = byte_histogram(data); + let mut bigram = vec![0u64; 256 * 256]; + for w in data.windows(2) { + let i = (w[0] as usize) * 256 + (w[1] as usize); + bigram[i] += 1; + } + let pair_total: u64 = (data.len() as u64) - 1; + + // 2. Vertex / edge counts. + let distinct_bytes = unigram.iter().filter(|&&c| c > 0).count() as u32; + let distinct_bigrams = bigram.iter().filter(|&&c| c > 0).count() as u32; + let edge_density = distinct_bigrams as f64 / 65_536.0; + + // 3. Joint entropy of the bigram distribution. + let pair_total_f = pair_total as f64; + let mut bigram_entropy = 0.0f64; + for &c in &bigram { + if c == 0 { + continue; + } + let p = c as f64 / pair_total_f; + bigram_entropy -= p * p.log2(); + } + + // 4. Per-row stats: H(b_{i+1} | b_i = r) for each row r. + let mut row_entropies = Vec::with_capacity(256); + let mut weighted_conditional = 0.0f64; + for row in 0..256 { + let row_slice = &bigram[row * 256..(row + 1) * 256]; + let row_total: u64 = row_slice.iter().sum(); + if row_total == 0 { + continue; + } + let mut row_counts = [0u64; 256]; + row_counts.copy_from_slice(row_slice); + let h = entropy_from_histogram(&row_counts, row_total); + row_entropies.push(h); + weighted_conditional += (row_total as f64 / pair_total_f) * h; + } + let row_count = row_entropies.len().max(1) as f64; + let mean_row_entropy = row_entropies.iter().sum::() / row_count; + let row_var = row_entropies + .iter() + .map(|e| { + let d = e - mean_row_entropy; + d * d + }) + .sum::() + / row_count; + + // 5. 
/// Hash a bigram count table into a 16-digit lowercase hex fingerprint.
///
/// Each cell is reduced to a coarse bucket: `0` for an absent pair,
/// otherwise `1..=8` derived from `-log2(count / total)`, so small changes
/// in counts usually leave the bucket unchanged. The cell index and its
/// bucket are then folded into a 64-bit FNV-style hash. A `total` of zero
/// is treated as one to avoid dividing by zero.
fn fingerprint_from_bigram(bigram: &[u64], total: u64) -> String {
    const FNV_OFFSET: u64 = 0xcbf2_9ce4_8422_2325;
    const FNV_PRIME: u64 = 0x0000_0100_0000_01b3;
    const INDEX_MIX: u64 = 0x9E37_79B9_7F4A_7C15;

    let denom = (total as f64).max(1.0);
    let bucket_of = |count: u64| -> u64 {
        if count == 0 {
            return 0;
        }
        let p = count as f64 / denom;
        // Surprisal clamped to [0, 16] bits and scaled onto 0..=7, then
        // shifted by one so that 0 stays reserved for "pair never seen".
        let level = ((-p.log2()).clamp(0.0, 16.0) / 16.0 * 7.0).round() as u8 + 1;
        u64::from(level)
    };

    let digest = bigram
        .iter()
        .enumerate()
        .fold(FNV_OFFSET, |h, (cell, &count)| {
            let h = (h ^ (cell as u64).wrapping_mul(INDEX_MIX)).wrapping_mul(FNV_PRIME);
            (h ^ bucket_of(count)).wrapping_mul(FNV_PRIME)
        });
    format!("{digest:016x}")
}