diff --git a/pyproject.toml b/pyproject.toml index 621e0d3..ecfcd79 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -289,7 +289,7 @@ pyflakes = [ [tool.flakeheaven.exceptions."tests/"] flake8-aaa = [ - "-AAA01 ", # no Act block found in test + "-AAA01", # no Act block found in test "-AAA03", # expected 1 blank line before Act block, found none "-AAA04", # expected 1 blank line before Assert block, found none "-AAA05", # blank line in block diff --git a/src/yamlfix/entrypoints/cli.py b/src/yamlfix/entrypoints/cli.py index 222abc6..4fc8fc7 100644 --- a/src/yamlfix/entrypoints/cli.py +++ b/src/yamlfix/entrypoints/cli.py @@ -17,16 +17,38 @@ log = logging.getLogger(__name__) +_GLOB_CACHE: dict[tuple[str, str, str], set[Path]] = {} + + +def _clear_glob_cache() -> None: + """Clear the glob cache to prevent memory leaks in long-running processes.""" + _GLOB_CACHE.clear() + + +def _glob_cache(dir_: Path, glob: str) -> set[Path]: + cache_key = (str(dir_), glob, "g") + if cache_key not in _GLOB_CACHE: + _GLOB_CACHE[cache_key] = set(dir_.glob(glob)) + return _GLOB_CACHE[cache_key] + + +def _rglob_cache(dir_: Path, glob: str) -> set[Path]: + cache_key = (str(dir_), glob, "r") + if cache_key not in _GLOB_CACHE: + _GLOB_CACHE[cache_key] = set(dir_.rglob(glob)) + return _GLOB_CACHE[cache_key] + + def _matches_any_glob( file_to_test: Path, dir_: Path, globs: Optional[List[str]] ) -> bool: - return any(file_to_test in dir_.glob(glob) for glob in (globs or [])) + return any(file_to_test in _glob_cache(dir_, glob) for glob in (globs or [])) def _find_all_yaml_files( dir_: Path, include_globs: Optional[List[str]], exclude_globs: Optional[List[str]] ) -> List[Path]: - files = [dir_.rglob(glob) for glob in (include_globs or [])] + files = [list(_rglob_cache(dir_, glob)) for glob in (include_globs or [])] return [ file for list_ in files @@ -121,6 +143,9 @@ def cli( # pylint: disable=too-many-arguments if fixed_code is not None: print(fixed_code, end="") + # Clear cache to prevent memory leaks in long-running processes + _clear_glob_cache() + if changed and check: sys.exit(1) diff --git a/tests/e2e/test_cli.py b/tests/e2e/test_cli.py index cad9015..4d290e2 100644 --- a/tests/e2e/test_cli.py +++ b/tests/e2e/test_cli.py @@ -3,16 +3,18 @@ import logging import os import re +import time from itertools import product from pathlib import Path from textwrap import dedent +from unittest import mock import py # type: ignore import pytest from _pytest.logging import LogCaptureFixture from click.testing import CliRunner -from yamlfix.entrypoints.cli import cli +from yamlfix.entrypoints.cli import _clear_glob_cache, cli from yamlfix.version import __version__ @@ -420,3 +422,85 @@ def test_do_not_read_folders_as_files(runner: CliRunner, tmpdir: py.path.local) result = runner.invoke(cli, [str(tmpdir)]) assert result.exit_code == 0 + + +def test_glob_cache_is_fast(runner: CliRunner, tmpdir: Path) -> None: + """Test that glob caching improves performance.""" + # Create 1000 files in the tmpdir + for i in range(1000): + (tmpdir / f"dont_use_me_{i}.yaml").write_text( + "program: yamlfix", encoding="utf-8" + ) + + for i in range(1000): + (tmpdir / f"use_me_{i}.yaml").write_text("program: yamlfix", encoding="utf-8") + + # Test with cache (second run should benefit from cache) + start_time = time.time() + result = runner.invoke( + cli, + [str(tmpdir), "--exclude", "dont_*.yaml", "--exclude", "dont_use_me_*.yaml"], + ) + end_time = time.time() + cache_time = end_time - start_time + + max_time = 2 + assert result.exit_code == 0 + assert ( + cache_time <= max_time + ), f"Expected cache time to be less than {max_time} seconds, got {cache_time:.3f}s" + + +@pytest.mark.slow +def test_glob_cache_excludes_slow(runner: CliRunner, tmpdir: Path) -> None: + """Test that glob caching improves performance with multiple exclude patterns.""" + # Create 1000 files in the tmpdir + for i in range(1000): + (tmpdir / f"dont_use_me_{i}.yaml").write_text( + "program: yamlfix", encoding="utf-8" + ) + + for i in range(1000): + (tmpdir / f"use_me_{i}.yaml").write_text("program: yamlfix", encoding="utf-8") + + # Test without cache (mock to bypass cache) + start_time = time.time() + with mock.patch( + "yamlfix.entrypoints.cli._glob_cache", + side_effect=lambda dir_, glob: set(dir_.glob(glob)), + ), mock.patch( + "yamlfix.entrypoints.cli._rglob_cache", + side_effect=lambda dir_, glob: set(dir_.rglob(glob)), + ): + result1 = runner.invoke( + cli, + [ + str(tmpdir), + "--exclude", + "dont_*.yaml", + "--exclude", + "dont_use_me_*.yaml", + ], + ) + end_time = time.time() + no_cache_time = end_time - start_time + + _clear_glob_cache() + + # Test with cache (second run should benefit from cache) + start_time = time.time() + result2 = runner.invoke( + cli, + [str(tmpdir), "--exclude", "dont_*.yaml", "--exclude", "dont_use_me_*.yaml"], + ) + end_time = time.time() + cache_time = end_time - start_time + + # Cache should be faster (less time) + assert cache_time <= no_cache_time * 0.5, ( + f"Cached version should be faster or comparable to non-cached. " + f"Cache time: {cache_time:.3f}s, No cache time: {no_cache_time:.3f}s" + ) + + assert result1.exit_code == 0 + assert result2.exit_code == 0 diff --git a/tests/unit/test_cli_cache.py b/tests/unit/test_cli_cache.py new file mode 100644 index 0000000..fb30531 --- /dev/null +++ b/tests/unit/test_cli_cache.py @@ -0,0 +1,195 @@ +"""Unit tests for CLI glob caching functions.""" + +from pathlib import Path +from typing import Generator + +import pytest + +from yamlfix.entrypoints.cli import ( + _GLOB_CACHE, + _clear_glob_cache, + _glob_cache, + _rglob_cache, +) + + +@pytest.fixture(autouse=True) +def _clear_cache() -> Generator[None, None, None]: + """Clear the glob cache before and after each test.""" + _clear_glob_cache() + yield + _clear_glob_cache() + + +@pytest.fixture +def test_directory(tmp_path: Path) -> Path: + """Create a test directory structure with YAML files.""" + # Create files in root + (tmp_path / "test.yaml").write_text("test") + (tmp_path / "test.yml").write_text("test") + (tmp_path / "other.txt").write_text("test") + + # Create subdirectory with nested files + subdir = tmp_path / "subdir" + subdir.mkdir() + (subdir / "nested.yaml").write_text("test") + (subdir / "nested.yml").write_text("test") + + return tmp_path + + +class TestGlobCache: + """Test the _glob_cache function.""" + + def test_glob_cache_basic_functionality(self, test_directory: Path) -> None: + """Test that _glob_cache returns correct files and caches results.""" + result = _glob_cache(test_directory, "*.yaml") + + # Should find files in current directory only (not recursive) + assert test_directory / "test.yaml" in result + assert test_directory / "subdir" / "nested.yaml" not in result + + # Cache should be populated + assert len(_GLOB_CACHE) == 1 + cache_key = (str(test_directory), "*.yaml", "g") + assert cache_key in _GLOB_CACHE + + def test_glob_cache_uses_cache_on_repeat_calls(self, test_directory: Path) -> None: + """Test that subsequent calls use cached results.""" + # First call + result1 = _glob_cache(test_directory, "*.yaml") + cache_size_after_first = len(_GLOB_CACHE) + + # Second call should use cache + result2 = _glob_cache(test_directory, "*.yaml") + cache_size_after_second = len(_GLOB_CACHE) + + assert result1 == result2 + assert cache_size_after_first == cache_size_after_second == 1 + + def test_glob_cache_different_patterns_create_separate_entries( + self, test_directory: Path + ) -> None: + """Test that different glob patterns create separate cache entries.""" + result_yaml = _glob_cache(test_directory, "*.yaml") + result_yml = _glob_cache(test_directory, "*.yml") + + assert len(_GLOB_CACHE) == 2 + assert result_yaml != result_yml + assert test_directory / "test.yaml" in result_yaml + assert test_directory / "test.yml" in result_yml + + def test_glob_cache_different_directories_create_separate_entries( + self, test_directory: Path + ) -> None: + """Test that different directories create separate cache entries.""" + # Create another directory + other_dir = test_directory / "other" + other_dir.mkdir() + (other_dir / "other.yaml").write_text("test") + + result1 = _glob_cache(test_directory, "*.yaml") + result2 = _glob_cache(other_dir, "*.yaml") + + assert len(_GLOB_CACHE) == 2 + assert result1 != result2 + assert test_directory / "test.yaml" in result1 + assert other_dir / "other.yaml" in result2 + + +class TestRglobCache: + """Test the _rglob_cache function.""" + + def test_rglob_cache_basic_functionality(self, test_directory: Path) -> None: + """Test that _rglob_cache returns correct files recursively and caches results.""" + result = _rglob_cache(test_directory, "*.yaml") + + # Should find files recursively + assert test_directory / "test.yaml" in result + assert test_directory / "subdir" / "nested.yaml" in result + + # Cache should be populated with rglob key + assert len(_GLOB_CACHE) == 1 + cache_key = (str(test_directory), "*.yaml", "r") + assert cache_key in _GLOB_CACHE + + def test_rglob_cache_vs_glob_cache_different_keys( + self, test_directory: Path + ) -> None: + """Test that rglob and glob use different cache keys.""" + glob_result = _glob_cache(test_directory, "*.yaml") + rglob_result = _rglob_cache(test_directory, "*.yaml") + + # Should have separate cache entries + assert len(_GLOB_CACHE) == 2 + + # Results should be different (rglob includes nested files) + assert len(rglob_result) > len(glob_result) + assert test_directory / "subdir" / "nested.yaml" in rglob_result + assert test_directory / "subdir" / "nested.yaml" not in glob_result + + def test_rglob_cache_uses_cache_on_repeat_calls(self, test_directory: Path) -> None: + """Test that subsequent rglob calls use cached results.""" + result1 = _rglob_cache(test_directory, "*.yaml") + cache_size_after_first = len(_GLOB_CACHE) + + result2 = _rglob_cache(test_directory, "*.yaml") + cache_size_after_second = len(_GLOB_CACHE) + + assert result1 == result2 + assert cache_size_after_first == cache_size_after_second == 1 + + +class TestCacheClear: + """Test the _clear_glob_cache function.""" + + def test_clear_glob_cache_empties_cache(self, test_directory: Path) -> None: + """Test that cache clearing removes all entries.""" + # Populate cache with several entries + _glob_cache(test_directory, "*.yaml") + _glob_cache(test_directory, "*.yml") + _rglob_cache(test_directory, "*.yaml") + + assert len(_GLOB_CACHE) == 3 + + # Clear cache + _clear_glob_cache() + + assert len(_GLOB_CACHE) == 0 + + def test_clear_glob_cache_allows_fresh_caching(self, test_directory: Path) -> None: + """Test that after clearing, caching works normally again.""" + # First round of caching + _glob_cache(test_directory, "*.yaml") + assert len(_GLOB_CACHE) == 1 + + # Clear and verify empty + _clear_glob_cache() + assert len(_GLOB_CACHE) == 0 + + # Second round should work normally + _glob_cache(test_directory, "*.yaml") + assert len(_GLOB_CACHE) == 1 + + +class TestCacheKeyConstruction: + """Test that cache keys are constructed correctly.""" + + def test_cache_key_includes_directory_path(self, test_directory: Path) -> None: + """Test that cache keys include the directory path.""" + _glob_cache(test_directory, "*.yaml") + + expected_key = (str(test_directory), "*.yaml", "g") + assert expected_key in _GLOB_CACHE + + def test_cache_key_distinguishes_glob_vs_rglob(self, test_directory: Path) -> None: + """Test that glob and rglob operations have different key suffixes.""" + _glob_cache(test_directory, "*.yaml") + _rglob_cache(test_directory, "*.yaml") + + glob_key = (str(test_directory), "*.yaml", "g") + rglob_key = (str(test_directory), "*.yaml", "r") + + assert glob_key in _GLOB_CACHE + assert rglob_key in _GLOB_CACHE + assert len(_GLOB_CACHE) == 2