diff --git a/.jules/bolt.md b/.jules/bolt.md index 541c0ab51..1bc3ff1e2 100644 --- a/.jules/bolt.md +++ b/.jules/bolt.md @@ -17,3 +17,7 @@ ## 2025-05-19 - File Discovery Allocations **Learning:** In `discover_importing_files`, `WalkBuilder` results were being converted to `PathBuf` via `.map(|e| e.into_path())` *before* filtering. This caused allocations for every single file in the workspace (including excluded files and directories). **Action:** Filter `ignore::DirEntry` directly using `entry.file_type()` and `entry.path()` before mapping to `PathBuf`. This avoids allocations for non-matching files. + +## 2024-06-09 - String Match Line Number Rescanning +**Learning:** In `find_literal_matches`, calculating line and column numbers by rescanning the string from index 0 for every match found via `.match_indices()` results in O(N*M) time complexity, where N is the content length and M is the number of matches. +**Action:** Track line and column numbers statefully by maintaining `current_byte_idx`, `current_line`, and `current_column` variables across the loop, scanning only the characters in the string slice between the previous match and the current match to achieve O(N) complexity. diff --git a/crates/mill-handlers/src/handlers/workspace/literal_matcher.rs b/crates/mill-handlers/src/handlers/workspace/literal_matcher.rs index 24fb81d6b..57b0e6613 100644 --- a/crates/mill-handlers/src/handlers/workspace/literal_matcher.rs +++ b/crates/mill-handlers/src/handlers/workspace/literal_matcher.rs @@ -59,30 +59,6 @@ fn has_word_boundary_at(content: &str, byte_pos: usize) -> bool { .unwrap_or(true) } -/// Convert byte offset to line and column (both 1-indexed) -/// -/// This handles UTF-8 correctly by counting characters, not bytes. -/// Line breaks are detected by '\n' (supports Unix, Windows CRLF is counted correctly). 
-fn byte_offset_to_line_column(content: &str, byte_offset: usize) -> (u32, u32) { - let mut line = 1u32; - let mut column = 1u32; - - for (byte_idx, ch) in content.char_indices() { - if byte_idx >= byte_offset { - break; - } - - if ch == '\n' { - line += 1; - column = 1; - } else { - column += 1; - } - } - - (line, column) -} - /// Find all literal matches of a pattern in content /// /// # Arguments @@ -111,6 +87,9 @@ pub fn find_literal_matches(content: &str, pattern: &str, whole_word: bool) -> V } let mut matches = Vec::new(); + let mut current_byte_idx = 0; + let mut current_line = 1u32; + let mut current_column = 1u32; // Use efficient string search (std::str::match_indices uses Boyer-Moore-like algorithm) for (byte_offset, matched_str) in content.match_indices(pattern) { @@ -143,14 +122,23 @@ pub fn find_literal_matches(content: &str, pattern: &str, whole_word: bool) -> V } } - let (line, column) = byte_offset_to_line_column(content, byte_offset); + // Update line and column based on characters since the last match + for ch in content[current_byte_idx..byte_offset].chars() { + if ch == '\n' { + current_line += 1; + current_column = 1; + } else { + current_column += 1; + } + } + current_byte_idx = byte_offset; matches.push(Match { start_byte: byte_offset, end_byte: byte_offset + pattern.len(), matched_text: matched_str.to_string(), - line, - column, + line: current_line, + column: current_column, }); } @@ -284,20 +272,6 @@ mod tests { assert_eq!(matches[1].column, 8); // After emoji } - #[test] - fn test_line_column_calculation() { - let content = "abc\ndefgh\nijkl"; - // Position of 'i' is at byte 10, line 3, column 1 - let (line, column) = byte_offset_to_line_column(content, 10); - assert_eq!(line, 3); - assert_eq!(column, 1); - - // Position of 'e' is at byte 5, line 2, column 2 - let (line, column) = byte_offset_to_line_column(content, 5); - assert_eq!(line, 2); - assert_eq!(column, 2); - } - #[test] fn test_windows_line_endings() { let content = 
"line1\r\nline2\r\nline3";