Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ pyflakes = [

[tool.flakeheaven.exceptions."tests/"]
flake8-aaa = [
"-AAA01 ", # no Act block found in test
"-AAA01", # no Act block found in test
"-AAA03", # expected 1 blank line before Act block, found none
"-AAA04", # expected 1 blank line before Assert block, found none
"-AAA05", # blank line in block
Expand Down
29 changes: 27 additions & 2 deletions src/yamlfix/entrypoints/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,38 @@
log = logging.getLogger(__name__)


_GLOB_CACHE: dict[tuple[str, str, str], set[Path]] = {}


def _clear_glob_cache() -> None:
"""Clear the glob cache to prevent memory leaks in long-running processes."""
_GLOB_CACHE.clear()


def _glob_cache(dir_: Path, glob: str) -> set[Path]:
cache_key = (str(dir_), glob, "g")
if cache_key not in _GLOB_CACHE:
_GLOB_CACHE[cache_key] = set(dir_.glob(glob))
return _GLOB_CACHE[cache_key]


def _rglob_cache(dir_: Path, glob: str) -> set[Path]:
cache_key = (str(dir_), glob, "r")
if cache_key not in _GLOB_CACHE:
_GLOB_CACHE[cache_key] = set(dir_.rglob(glob))
return _GLOB_CACHE[cache_key]


def _matches_any_glob(
file_to_test: Path, dir_: Path, globs: Optional[List[str]]
) -> bool:
return any(file_to_test in dir_.glob(glob) for glob in (globs or []))
return any(file_to_test in _glob_cache(dir_, glob) for glob in (globs or []))


def _find_all_yaml_files(
dir_: Path, include_globs: Optional[List[str]], exclude_globs: Optional[List[str]]
) -> List[Path]:
files = [dir_.rglob(glob) for glob in (include_globs or [])]
files = [list(_rglob_cache(dir_, glob)) for glob in (include_globs or [])]
return [
file
for list_ in files
Expand Down Expand Up @@ -121,6 +143,9 @@ def cli( # pylint: disable=too-many-arguments
if fixed_code is not None:
print(fixed_code, end="")

# Clear cache to prevent memory leaks in long-running processes
_clear_glob_cache()

if changed and check:
sys.exit(1)

Expand Down
86 changes: 85 additions & 1 deletion tests/e2e/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,18 @@
import logging
import os
import re
import time
from itertools import product
from pathlib import Path
from textwrap import dedent
from unittest import mock

import py # type: ignore
import pytest
from _pytest.logging import LogCaptureFixture
from click.testing import CliRunner

from yamlfix.entrypoints.cli import cli
from yamlfix.entrypoints.cli import _clear_glob_cache, cli
from yamlfix.version import __version__


Expand Down Expand Up @@ -420,3 +422,85 @@ def test_do_not_read_folders_as_files(runner: CliRunner, tmpdir: py.path.local)
result = runner.invoke(cli, [str(tmpdir)])

assert result.exit_code == 0


def test_glob_cache_is_fast(runner: CliRunner, tmpdir: Path) -> None:
"""Test that glob caching improves performance."""
# Create 1000 files in the tmpdir
for i in range(1000):
(tmpdir / f"dont_use_me_{i}.yaml").write_text(
"program: yamlfix", encoding="utf-8"
)

for i in range(1000):
(tmpdir / f"use_me_{i}.yaml").write_text("program: yamlfix", encoding="utf-8")

# Test with cache (second run should benefit from cache)
start_time = time.time()
result = runner.invoke(
cli,
[str(tmpdir), "--exclude", "dont_*.yaml", "--exclude", "dont_use_me_*.yaml"],
)
end_time = time.time()
cache_time = end_time - start_time

max_time = 2
assert result.exit_code == 0
assert (
cache_time <= max_time
), f"Expected cache time to be less than {max_time} seconds, got {cache_time:.3f}s"


@pytest.mark.slow
def test_glob_cache_excludes_slow(runner: CliRunner, tmpdir: Path) -> None:
"""Test that glob caching improves performance with multiple exclude patterns."""
# Create 1000 files in the tmpdir
for i in range(1000):
(tmpdir / f"dont_use_me_{i}.yaml").write_text(
"program: yamlfix", encoding="utf-8"
)

for i in range(1000):
(tmpdir / f"use_me_{i}.yaml").write_text("program: yamlfix", encoding="utf-8")

# Test without cache (mock to bypass cache)
start_time = time.time()
with mock.patch(
"yamlfix.entrypoints.cli._glob_cache",
side_effect=lambda dir_, glob: set(dir_.glob(glob)),
), mock.patch(
"yamlfix.entrypoints.cli._rglob_cache",
side_effect=lambda dir_, glob: set(dir_.rglob(glob)),
):
result1 = runner.invoke(
cli,
[
str(tmpdir),
"--exclude",
"dont_*.yaml",
"--exclude",
"dont_use_me_*.yaml",
],
)
end_time = time.time()
no_cache_time = end_time - start_time

_clear_glob_cache()

# Test with cache (second run should benefit from cache)
start_time = time.time()
result2 = runner.invoke(
cli,
[str(tmpdir), "--exclude", "dont_*.yaml", "--exclude", "dont_use_me_*.yaml"],
)
end_time = time.time()
cache_time = end_time - start_time

# Cache should be faster (less time)
assert cache_time <= no_cache_time * 0.5, (
f"Cached version should be faster or comparable to non-cached. "
f"Cache time: {cache_time:.3f}s, No cache time: {no_cache_time:.3f}s"
)

assert result1.exit_code == 0
assert result2.exit_code == 0
195 changes: 195 additions & 0 deletions tests/unit/test_cli_cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
"""Unit tests for CLI glob caching functions."""

from pathlib import Path
from typing import Generator

import pytest

from yamlfix.entrypoints.cli import (
_GLOB_CACHE,
_clear_glob_cache,
_glob_cache,
_rglob_cache,
)


@pytest.fixture(autouse=True)
def _clear_cache() -> Generator[None, None, None]:
"""Clear the glob cache before and after each test."""
_clear_glob_cache()
yield
_clear_glob_cache()


@pytest.fixture
def test_directory(tmp_path: Path) -> Path:
"""Create a test directory structure with YAML files."""
# Create files in root
(tmp_path / "test.yaml").write_text("test")
(tmp_path / "test.yml").write_text("test")
(tmp_path / "other.txt").write_text("test")

# Create subdirectory with nested files
subdir = tmp_path / "subdir"
subdir.mkdir()
(subdir / "nested.yaml").write_text("test")
(subdir / "nested.yml").write_text("test")

return tmp_path


class TestGlobCache:
"""Test the _glob_cache function."""

def test_glob_cache_basic_functionality(self, test_directory: Path) -> None:
"""Test that _glob_cache returns correct files and caches results."""
result = _glob_cache(test_directory, "*.yaml")

# Should find files in current directory only (not recursive)
assert test_directory / "test.yaml" in result
assert test_directory / "subdir" / "nested.yaml" not in result

# Cache should be populated
assert len(_GLOB_CACHE) == 1
cache_key = (str(test_directory), "*.yaml", "g")
assert cache_key in _GLOB_CACHE

def test_glob_cache_uses_cache_on_repeat_calls(self, test_directory: Path) -> None:
"""Test that subsequent calls use cached results."""
# First call
result1 = _glob_cache(test_directory, "*.yaml")
cache_size_after_first = len(_GLOB_CACHE)

# Second call should use cache
result2 = _glob_cache(test_directory, "*.yaml")
cache_size_after_second = len(_GLOB_CACHE)

assert result1 == result2
assert cache_size_after_first == cache_size_after_second == 1

def test_glob_cache_different_patterns_create_separate_entries(
self, test_directory: Path
) -> None:
"""Test that different glob patterns create separate cache entries."""
result_yaml = _glob_cache(test_directory, "*.yaml")
result_yml = _glob_cache(test_directory, "*.yml")

assert len(_GLOB_CACHE) == 2
assert result_yaml != result_yml
assert test_directory / "test.yaml" in result_yaml
assert test_directory / "test.yml" in result_yml

def test_glob_cache_different_directories_create_separate_entries(
self, test_directory: Path
) -> None:
"""Test that different directories create separate cache entries."""
# Create another directory
other_dir = test_directory / "other"
other_dir.mkdir()
(other_dir / "other.yaml").write_text("test")

result1 = _glob_cache(test_directory, "*.yaml")
result2 = _glob_cache(other_dir, "*.yaml")

assert len(_GLOB_CACHE) == 2
assert result1 != result2
assert test_directory / "test.yaml" in result1
assert other_dir / "other.yaml" in result2


class TestRglobCache:
"""Test the _rglob_cache function."""

def test_rglob_cache_basic_functionality(self, test_directory: Path) -> None:
"""Test that _rglob_cache returns correct files recursively and caches results."""
result = _rglob_cache(test_directory, "*.yaml")

# Should find files recursively
assert test_directory / "test.yaml" in result
assert test_directory / "subdir" / "nested.yaml" in result

# Cache should be populated with rglob key
assert len(_GLOB_CACHE) == 1
cache_key = (str(test_directory), "*.yaml", "r")
assert cache_key in _GLOB_CACHE

def test_rglob_cache_vs_glob_cache_different_keys(
self, test_directory: Path
) -> None:
"""Test that rglob and glob use different cache keys."""
glob_result = _glob_cache(test_directory, "*.yaml")
rglob_result = _rglob_cache(test_directory, "*.yaml")

# Should have separate cache entries
assert len(_GLOB_CACHE) == 2

# Results should be different (rglob includes nested files)
assert len(rglob_result) > len(glob_result)
assert test_directory / "subdir" / "nested.yaml" in rglob_result
assert test_directory / "subdir" / "nested.yaml" not in glob_result

def test_rglob_cache_uses_cache_on_repeat_calls(self, test_directory: Path) -> None:
"""Test that subsequent rglob calls use cached results."""
result1 = _rglob_cache(test_directory, "*.yaml")
cache_size_after_first = len(_GLOB_CACHE)

result2 = _rglob_cache(test_directory, "*.yaml")
cache_size_after_second = len(_GLOB_CACHE)

assert result1 == result2
assert cache_size_after_first == cache_size_after_second == 1


class TestCacheClear:
"""Test the _clear_glob_cache function."""

def test_clear_glob_cache_empties_cache(self, test_directory: Path) -> None:
"""Test that cache clearing removes all entries."""
# Populate cache with several entries
_glob_cache(test_directory, "*.yaml")
_glob_cache(test_directory, "*.yml")
_rglob_cache(test_directory, "*.yaml")

assert len(_GLOB_CACHE) == 3

# Clear cache
_clear_glob_cache()

assert len(_GLOB_CACHE) == 0

def test_clear_glob_cache_allows_fresh_caching(self, test_directory: Path) -> None:
"""Test that after clearing, caching works normally again."""
# First round of caching
_glob_cache(test_directory, "*.yaml")
assert len(_GLOB_CACHE) == 1

# Clear and verify empty
_clear_glob_cache()
assert len(_GLOB_CACHE) == 0

# Second round should work normally
_glob_cache(test_directory, "*.yaml")
assert len(_GLOB_CACHE) == 1


class TestCacheKeyConstruction:
"""Test that cache keys are constructed correctly."""

def test_cache_key_includes_directory_path(self, test_directory: Path) -> None:
"""Test that cache keys include the directory path."""
_glob_cache(test_directory, "*.yaml")

expected_key = (str(test_directory), "*.yaml", "g")
assert expected_key in _GLOB_CACHE

def test_cache_key_distinguishes_glob_vs_rglob(self, test_directory: Path) -> None:
"""Test that glob and rglob operations have different key suffixes."""
_glob_cache(test_directory, "*.yaml")
_rglob_cache(test_directory, "*.yaml")

glob_key = (str(test_directory), "*.yaml", "g")
rglob_key = (str(test_directory), "*.yaml", "r")

assert glob_key in _GLOB_CACHE
assert rglob_key in _GLOB_CACHE
assert len(_GLOB_CACHE) == 2