From 15e88a997829fa23d6446bf38dbd51c6d8495640 Mon Sep 17 00:00:00 2001 From: Antonio Feregrino Date: Fri, 8 Aug 2025 11:14:34 +0100 Subject: [PATCH 1/3] Cache globbed files for speed increase --- src/yamlfix/entrypoints/cli.py | 29 ++++- tests/e2e/test_cli.py | 92 ++++++++++++++++ tests/unit/test_cli_cache.py | 192 +++++++++++++++++++++++++++++++++ 3 files changed, 311 insertions(+), 2 deletions(-) create mode 100644 tests/unit/test_cli_cache.py diff --git a/src/yamlfix/entrypoints/cli.py b/src/yamlfix/entrypoints/cli.py index 222abc6..78ca96c 100644 --- a/src/yamlfix/entrypoints/cli.py +++ b/src/yamlfix/entrypoints/cli.py @@ -17,16 +17,38 @@ log = logging.getLogger(__name__) +_GLOB_CACHE = {} + + +def _clear_glob_cache() -> None: + """Clear the glob cache to prevent memory leaks in long-running processes.""" + _GLOB_CACHE.clear() + + +def _glob_cache(dir_: Path, glob: str) -> set: + cache_key = (str(dir_), glob, "g") + if cache_key not in _GLOB_CACHE: + _GLOB_CACHE[cache_key] = set(dir_.glob(glob)) + return _GLOB_CACHE[cache_key] + + +def _rglob_cache(dir_: Path, glob: str) -> set: + cache_key = (str(dir_), glob, "r") + if cache_key not in _GLOB_CACHE: + _GLOB_CACHE[cache_key] = set(dir_.rglob(glob)) + return _GLOB_CACHE[cache_key] + + def _matches_any_glob( file_to_test: Path, dir_: Path, globs: Optional[List[str]] ) -> bool: - return any(file_to_test in dir_.glob(glob) for glob in (globs or [])) + return any(file_to_test in _glob_cache(dir_, glob) for glob in (globs or [])) def _find_all_yaml_files( dir_: Path, include_globs: Optional[List[str]], exclude_globs: Optional[List[str]] ) -> List[Path]: - files = [dir_.rglob(glob) for glob in (include_globs or [])] + files = [list(_rglob_cache(dir_, glob)) for glob in (include_globs or [])] return [ file for list_ in files @@ -121,6 +143,9 @@ def cli( # pylint: disable=too-many-arguments if fixed_code is not None: print(fixed_code, end="") + # Clear cache to prevent memory leaks in long-running processes + _clear_glob_cache() + if changed and check: sys.exit(1) diff --git a/tests/e2e/test_cli.py b/tests/e2e/test_cli.py index cad9015..59487cb 100644 --- a/tests/e2e/test_cli.py +++ b/tests/e2e/test_cli.py @@ -3,9 +3,11 @@ import logging import os import re +import time from itertools import product from pathlib import Path from textwrap import dedent +from unittest import mock import py # type: ignore import pytest @@ -420,3 +422,93 @@ def test_do_not_read_folders_as_files(runner: CliRunner, tmpdir: py.path.local) result = runner.invoke(cli, [str(tmpdir)]) assert result.exit_code == 0 + + +def test_glob_cache_is_fast(runner: CliRunner, tmpdir: Path) -> None: + """Test that glob caching improves performance.""" + # Create 1000 files in the tmpdir + for i in range(1000): + (tmpdir / f"dont_use_me_{i}.yaml").write_text( + "program: yamlfix", encoding="utf-8" + ) + + for i in range(1000): + (tmpdir / f"use_me_{i}.yaml").write_text("program: yamlfix", encoding="utf-8") + + # Clear any existing cache + from yamlfix.entrypoints.cli import _clear_glob_cache + + _clear_glob_cache() + + # Test with cache (second run should benefit from cache) + start_time = time.time() + result = runner.invoke( + cli, + [str(tmpdir), "--exclude", "dont_*.yaml", "--exclude", "dont_use_me_*.yaml"], + ) + end_time = time.time() + cache_time = end_time - start_time + + assert result.exit_code == 0 + assert ( + cache_time <= 1 + ), f"Expected cache time to be less than 1 second, got {cache_time:.3f}s" + + +@pytest.mark.slow +def test_glob_cache_excludes_slow(runner: CliRunner, tmpdir: Path) -> None: + """Test that glob caching improves performance with multiple exclude patterns.""" + # Create 1000 files in the tmpdir + for i in range(1000): + (tmpdir / f"dont_use_me_{i}.yaml").write_text( + "program: yamlfix", encoding="utf-8" + ) + + for i in range(1000): + (tmpdir / f"use_me_{i}.yaml").write_text("program: yamlfix", encoding="utf-8") + + # Test without cache (mock to bypass cache) + start_time = time.time() + with mock.patch( + "yamlfix.entrypoints.cli._glob_cache", + side_effect=lambda dir_, glob: set(dir_.glob(glob)), + ): + with mock.patch( + "yamlfix.entrypoints.cli._rglob_cache", + side_effect=lambda dir_, glob: set(dir_.rglob(glob)), + ): + result1 = runner.invoke( + cli, + [ + str(tmpdir), + "--exclude", + "dont_*.yaml", + "--exclude", + "dont_use_me_*.yaml", + ], + ) + end_time = time.time() + no_cache_time = end_time - start_time + + # Clear any existing cache + from yamlfix.entrypoints.cli import _clear_glob_cache + + _clear_glob_cache() + + # Test with cache (second run should benefit from cache) + start_time = time.time() + result2 = runner.invoke( + cli, + [str(tmpdir), "--exclude", "dont_*.yaml", "--exclude", "dont_use_me_*.yaml"], + ) + end_time = time.time() + cache_time = end_time - start_time + + # Cache should be faster (less time) + assert cache_time <= no_cache_time * 0.5, ( + f"Cached version should be faster or comparable to non-cached. " + f"Cache time: {cache_time:.3f}s, No cache time: {no_cache_time:.3f}s" + ) + + assert result1.exit_code == 0 + assert result2.exit_code == 0 diff --git a/tests/unit/test_cli_cache.py b/tests/unit/test_cli_cache.py new file mode 100644 index 0000000..ec0d0d5 --- /dev/null +++ b/tests/unit/test_cli_cache.py @@ -0,0 +1,192 @@ +"""Unit tests for CLI glob caching functions.""" + +from pathlib import Path + +import pytest + +from yamlfix.entrypoints.cli import ( + _GLOB_CACHE, + _clear_glob_cache, + _glob_cache, + _rglob_cache, +) + + +@pytest.fixture(autouse=True) +def clear_cache(): + """Clear the glob cache before and after each test.""" + _clear_glob_cache() + yield + _clear_glob_cache() + + +@pytest.fixture +def test_directory(tmp_path: Path) -> Path: + """Create a test directory structure with YAML files.""" + # Create files in root + (tmp_path / "test.yaml").write_text("test") + (tmp_path / "test.yml").write_text("test") + (tmp_path / "other.txt").write_text("test") + + # Create subdirectory with nested files + subdir = tmp_path / "subdir" + subdir.mkdir() + (subdir / "nested.yaml").write_text("test") + (subdir / "nested.yml").write_text("test") + + return tmp_path + + +class TestGlobCache: + """Test the _glob_cache function.""" + + def test_glob_cache_basic_functionality(self, test_directory: Path): + """Test that _glob_cache returns correct files and caches results.""" + result = _glob_cache(test_directory, "*.yaml") + + # Should find files in current directory only (not recursive) + assert test_directory / "test.yaml" in result + assert test_directory / "subdir" / "nested.yaml" not in result + + # Cache should be populated + assert len(_GLOB_CACHE) == 1 + cache_key = (str(test_directory), "*.yaml", "g") + assert cache_key in _GLOB_CACHE + + def test_glob_cache_uses_cache_on_repeat_calls(self, test_directory: Path): + """Test that subsequent calls use cached results.""" + # First call + result1 = _glob_cache(test_directory, "*.yaml") + cache_size_after_first = len(_GLOB_CACHE) + + # Second call should use cache + result2 = _glob_cache(test_directory, "*.yaml") + cache_size_after_second = len(_GLOB_CACHE) + + assert result1 == result2 + assert cache_size_after_first == cache_size_after_second == 1 + + def test_glob_cache_different_patterns_create_separate_entries( + self, test_directory: Path + ): + """Test that different glob patterns create separate cache entries.""" + result_yaml = _glob_cache(test_directory, "*.yaml") + result_yml = _glob_cache(test_directory, "*.yml") + + assert len(_GLOB_CACHE) == 2 + assert result_yaml != result_yml + assert test_directory / "test.yaml" in result_yaml + assert test_directory / "test.yml" in result_yml + + def test_glob_cache_different_directories_create_separate_entries( + self, test_directory: Path + ): + """Test that different directories create separate cache entries.""" + # Create another directory + other_dir = test_directory / "other" + other_dir.mkdir() + (other_dir / "other.yaml").write_text("test") + + result1 = _glob_cache(test_directory, "*.yaml") + result2 = _glob_cache(other_dir, "*.yaml") + + assert len(_GLOB_CACHE) == 2 + assert result1 != result2 + assert test_directory / "test.yaml" in result1 + assert other_dir / "other.yaml" in result2 + + +class TestRglobCache: + """Test the _rglob_cache function.""" + + def test_rglob_cache_basic_functionality(self, test_directory: Path): + """Test that _rglob_cache returns correct files recursively and caches results.""" + result = _rglob_cache(test_directory, "*.yaml") + + # Should find files recursively + assert test_directory / "test.yaml" in result + assert test_directory / "subdir" / "nested.yaml" in result + + # Cache should be populated with rglob key + assert len(_GLOB_CACHE) == 1 + cache_key = (str(test_directory), "*.yaml", "r") + assert cache_key in _GLOB_CACHE + + def test_rglob_cache_vs_glob_cache_different_keys(self, test_directory: Path): + """Test that rglob and glob use different cache keys.""" + glob_result = _glob_cache(test_directory, "*.yaml") + rglob_result = _rglob_cache(test_directory, "*.yaml") + + # Should have separate cache entries + assert len(_GLOB_CACHE) == 2 + + # Results should be different (rglob includes nested files) + assert len(rglob_result) > len(glob_result) + assert test_directory / "subdir" / "nested.yaml" in rglob_result + assert test_directory / "subdir" / "nested.yaml" not in glob_result + + def test_rglob_cache_uses_cache_on_repeat_calls(self, test_directory: Path): + """Test that subsequent rglob calls use cached results.""" + result1 = _rglob_cache(test_directory, "*.yaml") + cache_size_after_first = len(_GLOB_CACHE) + + result2 = _rglob_cache(test_directory, "*.yaml") + cache_size_after_second = len(_GLOB_CACHE) + + assert result1 == result2 + assert cache_size_after_first == cache_size_after_second == 1 + + +class TestCacheClear: + """Test the _clear_glob_cache function.""" + + def test_clear_glob_cache_empties_cache(self, test_directory: Path): + """Test that cache clearing removes all entries.""" + # Populate cache with several entries + _glob_cache(test_directory, "*.yaml") + _glob_cache(test_directory, "*.yml") + _rglob_cache(test_directory, "*.yaml") + + assert len(_GLOB_CACHE) == 3 + + # Clear cache + _clear_glob_cache() + + assert len(_GLOB_CACHE) == 0 + + def test_clear_glob_cache_allows_fresh_caching(self, test_directory: Path): + """Test that after clearing, caching works normally again.""" + # First round of caching + _glob_cache(test_directory, "*.yaml") + assert len(_GLOB_CACHE) == 1 + + # Clear and verify empty + _clear_glob_cache() + assert len(_GLOB_CACHE) == 0 + + # Second round should work normally + _glob_cache(test_directory, "*.yaml") + assert len(_GLOB_CACHE) == 1 + + +class TestCacheKeyConstruction: + """Test that cache keys are constructed correctly.""" + + def test_cache_key_includes_directory_path(self, test_directory: Path): + """Test that cache keys include the directory path.""" + _glob_cache(test_directory, "*.yaml") + + expected_key = (str(test_directory), "*.yaml", "g") + assert expected_key in _GLOB_CACHE + + def test_cache_key_distinguishes_glob_vs_rglob(self, test_directory: Path): + """Test that glob and rglob operations have different key suffixes.""" + _glob_cache(test_directory, "*.yaml") + _rglob_cache(test_directory, "*.yaml") + + glob_key = (str(test_directory), "*.yaml", "g") + rglob_key = (str(test_directory), "*.yaml", "r") + + assert glob_key in _GLOB_CACHE + assert rglob_key in _GLOB_CACHE + assert len(_GLOB_CACHE) == 2 From c23dadcf23946282a8cd34acab67f453ff61514a Mon Sep 17 00:00:00 2001 From: Antonio Feregrino Date: Mon, 1 Sep 2025 20:42:56 +0100 Subject: [PATCH 2/3] Update test to conform to linter --- pyproject.toml | 2 +- src/yamlfix/entrypoints/cli.py | 6 +++--- tests/e2e/test_cli.py | 37 +++++++++++++--------------------- tests/unit/test_cli_cache.py | 27 ++++++++++++++----------- 4 files changed, 33 insertions(+), 39 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 621e0d3..ecfcd79 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -289,7 +289,7 @@ pyflakes = [ [tool.flakeheaven.exceptions."tests/"] flake8-aaa = [ - "-AAA01 ", # no Act block found in test + "-AAA01", # no Act block found in test "-AAA03", # expected 1 blank line before Act block, found none "-AAA04", # expected 1 blank line before Assert block, found none "-AAA05", # blank line in block diff --git a/src/yamlfix/entrypoints/cli.py b/src/yamlfix/entrypoints/cli.py index 78ca96c..4fc8fc7 100644 --- a/src/yamlfix/entrypoints/cli.py +++ b/src/yamlfix/entrypoints/cli.py @@ -17,7 +17,7 @@ log = logging.getLogger(__name__) -_GLOB_CACHE = {} +_GLOB_CACHE: dict[tuple[str, str, str], set[Path]] = {} def _clear_glob_cache() -> None: @@ -25,14 +25,14 @@ def _clear_glob_cache() -> None: _GLOB_CACHE.clear() -def _glob_cache(dir_: Path, glob: str) -> set: +def _glob_cache(dir_: Path, glob: str) -> set[Path]: cache_key = (str(dir_), glob, "g") if cache_key not in _GLOB_CACHE: _GLOB_CACHE[cache_key] = set(dir_.glob(glob)) return _GLOB_CACHE[cache_key] -def _rglob_cache(dir_: Path, glob: str) -> set: +def _rglob_cache(dir_: Path, glob: str) -> set[Path]: cache_key = (str(dir_), glob, "r") if cache_key not in _GLOB_CACHE: _GLOB_CACHE[cache_key] = set(dir_.rglob(glob)) diff --git a/tests/e2e/test_cli.py b/tests/e2e/test_cli.py index 59487cb..a17aa50 100644 --- a/tests/e2e/test_cli.py +++ b/tests/e2e/test_cli.py @@ -14,7 +14,7 @@ from _pytest.logging import LogCaptureFixture from click.testing import CliRunner -from yamlfix.entrypoints.cli import cli +from yamlfix.entrypoints.cli import _clear_glob_cache, cli from yamlfix.version import __version__ @@ -435,11 +435,6 @@ def test_glob_cache_is_fast(runner: CliRunner, tmpdir: Path) -> None: for i in range(1000): (tmpdir / f"use_me_{i}.yaml").write_text("program: yamlfix", encoding="utf-8") - # Clear any existing cache - from yamlfix.entrypoints.cli import _clear_glob_cache - - _clear_glob_cache() - # Test with cache (second run should benefit from cache) start_time = time.time() result = runner.invoke( @@ -472,27 +467,23 @@ def test_glob_cache_excludes_slow(runner: CliRunner, tmpdir: Path) -> None: with mock.patch( "yamlfix.entrypoints.cli._glob_cache", side_effect=lambda dir_, glob: set(dir_.glob(glob)), + ), mock.patch( + "yamlfix.entrypoints.cli._rglob_cache", + side_effect=lambda dir_, glob: set(dir_.rglob(glob)), ): - with mock.patch( - "yamlfix.entrypoints.cli._rglob_cache", - side_effect=lambda dir_, glob: set(dir_.rglob(glob)), - ): - result1 = runner.invoke( - cli, - [ - str(tmpdir), - "--exclude", - "dont_*.yaml", - "--exclude", - "dont_use_me_*.yaml", - ], - ) + result1 = runner.invoke( + cli, + [ + str(tmpdir), + "--exclude", + "dont_*.yaml", + "--exclude", + "dont_use_me_*.yaml", + ], + ) end_time = time.time() no_cache_time = end_time - start_time - # Clear any existing cache - from yamlfix.entrypoints.cli import _clear_glob_cache - _clear_glob_cache() # Test with cache (second run should benefit from cache) diff --git a/tests/unit/test_cli_cache.py b/tests/unit/test_cli_cache.py index ec0d0d5..fb30531 100644 --- a/tests/unit/test_cli_cache.py +++ b/tests/unit/test_cli_cache.py @@ -1,6 +1,7 @@ """Unit tests for CLI glob caching functions.""" from pathlib import Path +from typing import Generator import pytest @@ -13,7 +14,7 @@ @pytest.fixture(autouse=True) -def clear_cache(): +def _clear_cache() -> Generator[None, None, None]: """Clear the glob cache before and after each test.""" _clear_glob_cache() yield @@ -40,7 +41,7 @@ def test_directory(tmp_path: Path) -> Path: class TestGlobCache: """Test the _glob_cache function.""" - def test_glob_cache_basic_functionality(self, test_directory: Path): + def test_glob_cache_basic_functionality(self, test_directory: Path) -> None: """Test that _glob_cache returns correct files and caches results.""" result = _glob_cache(test_directory, "*.yaml") @@ -53,7 +54,7 @@ def test_glob_cache_basic_functionality(self, test_directory: Path): cache_key = (str(test_directory), "*.yaml", "g") assert cache_key in _GLOB_CACHE - def test_glob_cache_uses_cache_on_repeat_calls(self, test_directory: Path): + def test_glob_cache_uses_cache_on_repeat_calls(self, test_directory: Path) -> None: """Test that subsequent calls use cached results.""" # First call result1 = _glob_cache(test_directory, "*.yaml") @@ -68,7 +69,7 @@ def test_glob_cache_uses_cache_on_repeat_calls(self, test_directory: Path): def test_glob_cache_different_patterns_create_separate_entries( self, test_directory: Path - ): + ) -> None: """Test that different glob patterns create separate cache entries.""" result_yaml = _glob_cache(test_directory, "*.yaml") result_yml = _glob_cache(test_directory, "*.yml") @@ -80,7 +81,7 @@ def test_glob_cache_different_patterns_create_separate_entries( def test_glob_cache_different_directories_create_separate_entries( self, test_directory: Path - ): + ) -> None: """Test that different directories create separate cache entries.""" # Create another directory other_dir = test_directory / "other" @@ -99,7 +100,7 @@ def test_glob_cache_different_directories_create_separate_entries( class TestRglobCache: """Test the _rglob_cache function.""" - def test_rglob_cache_basic_functionality(self, test_directory: Path): + def test_rglob_cache_basic_functionality(self, test_directory: Path) -> None: """Test that _rglob_cache returns correct files recursively and caches results.""" result = _rglob_cache(test_directory, "*.yaml") @@ -112,7 +113,9 @@ def test_rglob_cache_basic_functionality(self, test_directory: Path): cache_key = (str(test_directory), "*.yaml", "r") assert cache_key in _GLOB_CACHE - def test_rglob_cache_vs_glob_cache_different_keys(self, test_directory: Path): + def test_rglob_cache_vs_glob_cache_different_keys( + self, test_directory: Path + ) -> None: """Test that rglob and glob use different cache keys.""" glob_result = _glob_cache(test_directory, "*.yaml") rglob_result = _rglob_cache(test_directory, "*.yaml") @@ -125,7 +128,7 @@ def test_rglob_cache_vs_glob_cache_different_keys(self, test_directory: Path): assert test_directory / "subdir" / "nested.yaml" in rglob_result assert test_directory / "subdir" / "nested.yaml" not in glob_result - def test_rglob_cache_uses_cache_on_repeat_calls(self, test_directory: Path): + def test_rglob_cache_uses_cache_on_repeat_calls(self, test_directory: Path) -> None: """Test that subsequent rglob calls use cached results.""" result1 = _rglob_cache(test_directory, "*.yaml") cache_size_after_first = len(_GLOB_CACHE) @@ -140,7 +143,7 @@ def test_rglob_cache_uses_cache_on_repeat_calls(self, test_directory: Path): class TestCacheClear: """Test the _clear_glob_cache function.""" - def test_clear_glob_cache_empties_cache(self, test_directory: Path): + def test_clear_glob_cache_empties_cache(self, test_directory: Path) -> None: """Test that cache clearing removes all entries.""" # Populate cache with several entries _glob_cache(test_directory, "*.yaml") @@ -154,7 +157,7 @@ def test_clear_glob_cache_empties_cache(self, test_directory: Path): assert len(_GLOB_CACHE) == 0 - def test_clear_glob_cache_allows_fresh_caching(self, test_directory: Path): + def test_clear_glob_cache_allows_fresh_caching(self, test_directory: Path) -> None: """Test that after clearing, caching works normally again.""" # First round of caching _glob_cache(test_directory, "*.yaml") @@ -172,14 +175,14 @@ def test_clear_glob_cache_allows_fresh_caching(self, test_directory: Path): class TestCacheKeyConstruction: """Test that cache keys are constructed correctly.""" - def test_cache_key_includes_directory_path(self, test_directory: Path): + def test_cache_key_includes_directory_path(self, test_directory: Path) -> None: """Test that cache keys include the directory path.""" _glob_cache(test_directory, "*.yaml") expected_key = (str(test_directory), "*.yaml", "g") assert expected_key in _GLOB_CACHE - def test_cache_key_distinguishes_glob_vs_rglob(self, test_directory: Path): + def test_cache_key_distinguishes_glob_vs_rglob(self, test_directory: Path) -> None: """Test that glob and rglob operations have different key suffixes.""" _glob_cache(test_directory, "*.yaml") _rglob_cache(test_directory, "*.yaml") From 12d7822856e424dbf72e35df61591d0f6b88ad38 Mon Sep 17 00:00:00 2001 From: Antonio Feregrino Date: Thu, 4 Sep 2025 19:55:38 +0100 Subject: [PATCH 3/3] Increase time It seems that the runners are slower --- tests/e2e/test_cli.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/e2e/test_cli.py b/tests/e2e/test_cli.py index a17aa50..4d290e2 100644 --- a/tests/e2e/test_cli.py +++ b/tests/e2e/test_cli.py @@ -444,10 +444,11 @@ def test_glob_cache_is_fast(runner: CliRunner, tmpdir: Path) -> None: end_time = time.time() cache_time = end_time - start_time + max_time = 2 assert result.exit_code == 0 assert ( - cache_time <= 1 - ), f"Expected cache time to be less than 1 second, got {cache_time:.3f}s" + cache_time <= max_time + ), f"Expected cache time to be less than {max_time} seconds, got {cache_time:.3f}s" @pytest.mark.slow