From 5ac5f82480b65a7be1fb362e6d52ef96febe3b30 Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Sun, 7 Jun 2026 10:02:41 +0000 Subject: [PATCH] =?UTF-8?q?=E2=9A=A1=20Bolt:=20Optimize=20is=5Flikely=5Fco?= =?UTF-8?q?de=5Ftext=20with=20early=20exit?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 💡 What: Added an early exit to `is_likely_code_text` heuristic and updated range check to use `.contains()`. 🎯 Why: To avoid scanning up to 8192 bytes of binary files when the failure threshold has already been met. 📊 Impact: Reduces time spent on binary files significantly in mismatch cases by preventing full scans. 🔬 Measurement: Early exit prevents full 8KB scans, confirmed via ad-hoc microbenchmarking. Co-authored-by: mudcube <101564+mudcube@users.noreply.github.com> --- .jules/bolt.md | 4 ++++ .../services/reference_updater/detectors/generic.rs | 11 +++++++++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/.jules/bolt.md b/.jules/bolt.md index 541c0ab51..e8bda7e82 100644 --- a/.jules/bolt.md +++ b/.jules/bolt.md @@ -17,3 +17,7 @@ ## 2025-05-19 - File Discovery Allocations **Learning:** In `discover_importing_files`, `WalkBuilder` results were being converted to `PathBuf` via `.map(|e| e.into_path())` *before* filtering. This caused allocations for every single file in the workspace (including excluded files and directories). **Action:** Filter `ignore::DirEntry` directly using `entry.file_type()` and `entry.path()` before mapping to `PathBuf`. This avoids allocations for non-matching files. + +## 2024-05-31 - is_likely_code_text Binary File Scanning Optimization +**Learning:** In `is_likely_code_text` heuristic functions, counting non-printable characters across an entire buffer without an early exit caused a significant performance penalty when scanning binary files, forcing full 8KB scans even after the failure threshold was met. 
Furthermore, a threshold check that runs only after the loop (`non_printable * 20 < sample_len`) can instead be evaluated inside the loop, so inputs that fail the check are rejected without scanning the rest of the sample. +**Action:** When writing heuristics that iterate over buffers to classify data, always add an early exit condition as soon as a failure/success threshold is met. diff --git a/crates/mill-services/src/services/reference_updater/detectors/generic.rs b/crates/mill-services/src/services/reference_updater/detectors/generic.rs index bea6dc076..5dfcd5e50 100644 --- a/crates/mill-services/src/services/reference_updater/detectors/generic.rs +++ b/crates/mill-services/src/services/reference_updater/detectors/generic.rs @@ -519,12 +519,19 @@ fn is_likely_code_text(content: &str) -> bool { let sample = &content.as_bytes()[..sample_len]; let mut non_printable = 0usize; for &b in sample { - let is_text = b == b'\n' || b == b'\r' || b == b'\t' || (b >= 0x20 && b <= 0x7e); + // Optimize range check to use .contains() to prevent clippy warnings. + let is_text = b == b'\n' || b == b'\r' || b == b'\t' || (0x20..=0x7e).contains(&b); if !is_text { non_printable += 1; + // Early exit optimization: abort scanning as soon as failure threshold is met + // avoiding full 8KB scan for large binary files. + if non_printable * 20 >= sample_len { + return false; + } } } - non_printable * 20 < sample_len + // Correctly handle zero-length inputs preserving original logic + sample_len > 0 } fn is_obviously_irrelevant_extension(ext: &str) -> bool {