From 19fc3111a941ff16478ef96b3916ad0c96012d6b Mon Sep 17 00:00:00 2001
From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com>
Date: Thu, 29 Jan 2026 19:02:38 +0000
Subject: [PATCH] Optimize repository discovery by refactoring WalkBuilder logic

Refactored `find_repos_from_path` in `src/core/discovery.rs` to move
repository detection logic inside the `WalkBuilder::filter_entry` closure.

Previously, the walker would yield every directory, and the visitor would
perform a `path.join(".git").exists()` check, incurring a PathBuf
allocation and a syscall for every visited directory.

The new implementation:
1. Sets `.hidden(false)` on the walker so that it yields hidden entries,
   including `.git`.
2. Detects `.git` entries directly in `filter_entry`.
3. When `.git` is found, validates the repository and adds the parent path
   to the `DashMap`, then returns `false` to prevent descending into `.git`.
4. Manually filters other hidden files and directories below the search
   root (e.g. `.config`, `.ssh`) to preserve the default behavior of
   ignoring hidden entries.
5. Retains the optimization of checking `contains_key` before allocation to
   handle duplicates efficiently (though duplicates are rare with this
   walker configuration).

This change eliminates the per-directory `PathBuf` allocation and `.git`
existence syscall (O(N) of each, where N is the number of directories
visited), significantly improving efficiency for deep directory trees and
sparse repository structures.
Co-authored-by: mudcube <101564+mudcube@users.noreply.github.com> --- src/core/discovery.rs | 98 ++++++++++++++++++------------------------- 1 file changed, 41 insertions(+), 57 deletions(-) diff --git a/src/core/discovery.rs b/src/core/discovery.rs index 74606f8..8aa5faa 100644 --- a/src/core/discovery.rs +++ b/src/core/discovery.rs @@ -47,91 +47,59 @@ pub fn find_repos_from_path(search_path: impl AsRef) -> Vec<(String, PathB let name_counts = Arc::new(DashMap::with_capacity(ESTIMATED_REPO_COUNT)); let search_path_buf = search_path.to_path_buf(); + // Clone for closure + let repos_map_clone = Arc::clone(&repos_map); + let name_counts_clone = Arc::clone(&name_counts); + let search_path_buf_clone = search_path_buf.clone(); + // Build parallel walker with optimizations let walker = WalkBuilder::new(search_path) .follow_links(true) // Follow symlinks to find symlinked repos .max_depth(Some(MAX_SCAN_DEPTH)) // Limit depth to avoid deep recursion .threads(num_cpus::get().min(8)) // Use up to 8 threads for directory walking - .filter_entry(|entry| { + .hidden(false) // Enable hidden files to see .git + .filter_entry(move |entry| { let file_name = entry.file_name().to_str().unwrap_or(""); - // Skip common build/dependency directories - if SKIP_DIRECTORIES.contains(&file_name) { - return false; - } - - // Skip .git directories themselves (don't descend into them) - // This prevents scanning thousands of files in .git/objects/, etc. 
+ // If we find .git, check if it's a valid repo and add the parent directory if file_name == ".git" { - return false; - } - - true - }) - .build_parallel(); - - // Walk the directory tree in parallel - walker.run(|| { - let repos_map = Arc::clone(&repos_map); - let name_counts = Arc::clone(&name_counts); - let search_path_buf = search_path_buf.clone(); - - Box::new(move |result| { - use ignore::WalkState; - - if let Ok(entry) = result { let path = entry.path(); - - // Only check directories - if !entry.file_type().is_some_and(|ft| ft.is_dir()) { - return WalkState::Continue; - } - - // Check if this directory contains a .git entry - let git_path = path.join(".git"); - - if git_path.exists() { - let is_git_repo = if git_path.is_dir() { + // .git found, check its parent (the repo root) + if let Some(repo_path) = path.parent() { + let is_git_repo = if entry.file_type().is_some_and(|ft| ft.is_dir()) { true - } else if git_path.is_file() { - // Submodules and worktrees expose a .git file - is_git_file(&git_path) } else { - false + // Submodules and worktrees expose a .git file + is_git_file(path) }; if is_git_repo { // Skip if we've already seen this exact path // Check existence first to avoid allocation - if repos_map.contains_key(path) { - return WalkState::Continue; - } - - let path_buf = path.to_path_buf(); + if !repos_map_clone.contains_key(repo_path) { + let path_buf = repo_path.to_path_buf(); - // Use entry API to atomically check and insert - // This avoids allocating a second PathBuf copy if the entry is new - match repos_map.entry(path_buf) { - Entry::Occupied(_) => return WalkState::Continue, - Entry::Vacant(entry) => { - let base_name = if path == search_path_buf { + // Use entry API to atomically check and insert + if let Entry::Vacant(entry) = repos_map_clone.entry(path_buf) { + let base_name = if repo_path == search_path_buf_clone { // If this is the search path itself, use its directory name - search_path_buf + search_path_buf_clone .file_name() 
.and_then(|n| n.to_str()) .unwrap_or(DEFAULT_REPO_NAME) .to_string() } else { - path.file_name() + repo_path + .file_name() .and_then(|n| n.to_str()) .unwrap_or(UNKNOWN_REPO_NAME) .to_string() }; // Handle duplicate names by adding a suffix - // DashMap's entry API provides atomic counter increment let repo_name = { - let mut entry = name_counts.entry(base_name.clone()).or_insert(0); + let mut entry = + name_counts_clone.entry(base_name.clone()).or_insert(0); *entry += 1; let count = *entry; if count > 1 { @@ -146,11 +114,27 @@ pub fn find_repos_from_path(search_path: impl AsRef) -> Vec<(String, PathB } } } + // Don't descend into .git + return false; + } + + // Skip common build/dependency directories + if SKIP_DIRECTORIES.contains(&file_name) { + return false; } - WalkState::Continue + // Skip hidden files/directories (emulate default behavior), except root + // This prevents scanning .config, .ssh, etc. but allows .git (handled above) + if entry.depth() > 0 && file_name.starts_with('.') { + return false; + } + + true }) - }); + .build_parallel(); + + // Walk the directory tree in parallel - logic is now in filter_entry + walker.run(|| Box::new(|_| ignore::WalkState::Continue)); // Extract repositories from DashMap // Convert DashMap to Vec<(String, PathBuf)>