Commit c4ed07a: support caching metrics
1 parent 285e17a

3 files changed: 224 additions & 0 deletions

README.md (83 additions & 0 deletions)
@@ -333,6 +333,89 @@ Notes: one file per key; atomic writes; optional compression and dedupe to skip
---

### Custom Storage

Implement your own storage backend by following the `CacheStorage` protocol:

```python
from advanced_caching import CacheStorage, CacheEntry, TTLCache
from typing import Any

class MyCustomStorage:
    """Custom cache storage implementation."""

    def get(self, key: str) -> Any | None:
        """Retrieve value by key, or None if not found/expired."""
        ...

    def get_entry(self, key: str) -> CacheEntry | None:
        """Retrieve full cache entry with metadata."""
        ...

    def set(self, key: str, value: Any, ttl: int | None = None) -> None:
        """Store value with optional TTL in seconds."""
        ...

    def set_if_not_exists(self, key: str, value: Any, ttl: int | None = None) -> bool:
        """Atomic set-if-not-exists. Returns True if set, False if key exists."""
        ...

    def delete(self, key: str) -> None:
        """Remove key from storage."""
        ...

    def exists(self, key: str) -> bool:
        """Check if key exists and is not expired."""
        ...

# Validate the implementation
from advanced_caching import validate_cache_storage

validate_cache_storage(MyCustomStorage())

# Use with decorators
@TTLCache.cached("user:{id}", ttl=60, cache=MyCustomStorage())
def get_user(id: int):
    return {"id": id}
```
**Exposing Metrics:**

To track cache operations in your custom storage, wrap it with `InstrumentedStorage`:

```python
from advanced_caching.storage import InstrumentedStorage
from advanced_caching.metrics import InMemoryMetrics

# Create metrics collector
metrics = InMemoryMetrics()

# Wrap your custom storage
instrumented = InstrumentedStorage(
    storage=MyCustomStorage(),
    metrics=metrics,
    cache_name="my_custom_cache",
)

# Use instrumented storage
@TTLCache.cached("user:{id}", ttl=60, cache=instrumented)
def get_user(id: int):
    return {"id": id}

# Query metrics
stats = metrics.get_stats()
# Includes: hits, misses, latency, errors, memory usage for "my_custom_cache"
```
`InstrumentedStorage` automatically tracks:

- All cache operations (get, set, delete)
- Operation latency (p50/p95/p99 percentiles)
- Errors with exception types
- Memory usage (if your storage supports it)

See [Metrics Documentation](docs/metrics.md) for details.
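The latency percentiles can be illustrated with the standard library (a sketch of how p50/p95/p99 might be derived from raw samples, not the library's actual implementation):

```python
import statistics

def latency_percentiles(samples_ms: list[float]) -> dict[str, float]:
    """Approximate p50/p95/p99 from raw latency samples in milliseconds."""
    # quantiles(n=100) returns the 99 cut points p1..p99
    cuts = statistics.quantiles(samples_ms, n=100)
    return {"p50_ms": cuts[49], "p95_ms": cuts[94], "p99_ms": cuts[98]}

# 90 fast calls and 10 slow ones: the tail shows up in p95/p99, not p50
samples = [0.1] * 90 + [5.0] * 10
percentiles = latency_percentiles(samples)
```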
---

## BGCache (Background)

Single-writer/multi-reader pattern with background refresh and optional independent reader caches.

docs/metrics.md (95 additions & 0 deletions)

@@ -27,6 +27,101 @@ stats = metrics.get_stats()
# Returns: hits, misses, hit_rate, latency percentiles, errors, memory, background_refresh

## Metrics Reference

All metrics collectors track the following operations and expose them through their respective backends.

| Metric Name | Type | What It Represents | When Recorded | Use Case | Labels/Dimensions |
|-------------|------|-------------------|---------------|----------|-------------------|
| **`cache.hits`** | Counter | Number of times data was successfully retrieved from cache without executing the underlying function | Every time a cache lookup finds valid (non-expired) data | Calculate cache effectiveness. High hit count indicates good cache utilization | `cache_name`, `operation` (always "get") |
| **`cache.misses`** | Counter | Number of times data was not found in cache or was expired, requiring function execution | When cache lookup fails (key not found or TTL expired) | Identify cold cache scenarios or TTL tuning needs. High miss rate may indicate TTL is too short | `cache_name`, `operation` (always "get") |
| **`cache.sets`** | Counter | Number of times data was written to cache after function execution | After the underlying function completes successfully and result is stored | Track cache write operations. Should roughly equal misses in normal operation | `cache_name`, `operation` (always "set") |
| **`cache.deletes`** | Counter | Number of explicit cache entry removals (not TTL expirations) | When cache entries are manually deleted or evicted by cache policy | Monitor cache invalidation patterns. Debug cache coherency issues | `cache_name`, `operation` (always "delete") |
| **`cache.hit_rate_percent`** | Gauge (Calculated) | Percentage of cache lookups that resulted in hits: `(hits / (hits + misses)) * 100` | Calculated on-demand (InMemoryMetrics) or periodically (exporters) | **Primary effectiveness metric.** Target: >80% for most apps, >95% for read-heavy workloads. Values: `95.5` = 95.5% from cache, `50.0` = half hit/miss, `0.0` = cold cache | `cache_name` |
| **`cache.operation.duration`** | Histogram/Timer | Time spent in cache operations (get, set, delete) in milliseconds. Provides p50, p95, p99, avg aggregations | For every cache operation, wrapping the storage backend call | Detect storage backend performance issues. Compare local vs remote cache (Redis, S3, GCS). **Example:** `get_p50_ms: 0.12` = fast in-memory, `get_p99_ms: 45.0` = 1% take up to 45ms (network spike?) | `cache_name`, `operation` (get/set/delete) |
| **`cache.errors`** | Counter | Number of errors encountered during cache operations | When cache operations raise exceptions (network failures, serialization errors, Redis connection issues) | Alert on storage backend failures. Identify problematic cache keys. Monitor Redis connection health. Breakdown by `error_type` (e.g., ConnectionError, TimeoutError) | `cache_name`, `operation`, `error_type` |
| **`cache.background_refresh`** | Counter (success/failure breakdown) | Number of background refresh operations for SWRCache (stale refresh) and BGCache (scheduled refresh) | **SWRCache:** When serving stale data triggers background refresh<br>**BGCache:** On every scheduled loader execution | Monitor SWR effectiveness (serving stale while updating). Track BGCache job reliability. High failure rate indicates unreliable data source, network issues, or function errors | `cache_name`, `status` (success/failure) |
| **`cache.memory.bytes`** | Gauge | Approximate memory usage of cached entries in bytes. Also provides `mb` (megabytes) and `entries` (item count) | Periodically or on-demand when using `InstrumentedStorage` wrapper | Prevent memory exhaustion in long-running processes. Size L1 cache appropriately in HybridCache. Trigger eviction at threshold | `cache_name` |
| **`cache.entry.count`** | Gauge | Number of entries currently stored in cache | Tracked alongside memory metrics | Monitor cache growth over time. Validate cache eviction policies. Estimate memory per entry (bytes / entries) | `cache_name` |

---
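The hit-rate calculation in the table can be expressed as a small helper (a sketch matching the stated formula, including the cold-cache edge case where no lookups have occurred; the function name is illustrative):

```python
def hit_rate_percent(hits: int, misses: int) -> float:
    """cache.hit_rate_percent as defined above: (hits / (hits + misses)) * 100."""
    total = hits + misses
    return round(100.0 * hits / total, 2) if total else 0.0

hit_rate_percent(100, 20)  # → 83.33
hit_rate_percent(0, 0)     # → 0.0 (cold cache, no lookups yet)
```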
49+
## Metric Naming Conventions

### InMemoryMetrics

Returns a nested dictionary structure:

```json
{
  "uptime_seconds": 3600.5,
  "caches": {
    "get_user": {
      "hits": 100,
      "misses": 20,
      "sets": 20,
      "deletes": 5,
      "hit_rate_percent": 83.33
    },
    "get_product": {
      "hits": 50,
      "misses": 10,
      "sets": 10,
      "deletes": 2,
      "hit_rate_percent": 83.33
    }
  },
  "latency": {
    "get_user.get_p50_ms": 0.15,
    "get_user.get_p95_ms": 2.5,
    "get_user.get_p99_ms": 10.0,
    "get_user.get_avg_ms": 0.8,
    "get_product.get_p50_ms": 0.12,
    "get_product.set_p50_ms": 1.2
  },
  "errors": {
    "get_user.get": {
      "ConnectionError": 5,
      "TimeoutError": 2
    }
  },
  "memory": {
    "my_cache": {
      "bytes": 1048576,
      "mb": 1.0,
      "entries": 100
    },
    "another_cache": {
      "bytes": 524288,
      "mb": 0.5,
      "entries": 50
    }
  },
  "background_refresh": {
    "get_user": {
      "success": 50,
      "failure": 2
    }
  }
}
```
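Because the structure is nested per cache name, consumers can walk `stats["caches"]` directly. A sketch (with made-up numbers, assuming the shape shown above) that flags caches below a target hit rate:

```python
# Hypothetical stats in the shape returned by InMemoryMetrics.get_stats()
stats = {
    "caches": {
        "get_user": {"hits": 100, "misses": 20, "hit_rate_percent": 83.33},
        "get_product": {"hits": 5, "misses": 45, "hit_rate_percent": 10.0},
    }
}

def low_hit_rate_caches(stats: dict, threshold: float = 80.0) -> list[str]:
    """Names of caches whose hit rate falls below the threshold."""
    return [
        name
        for name, cache in stats["caches"].items()
        if cache["hit_rate_percent"] < threshold
    ]

low_hit_rate_caches(stats)  # → ['get_product']
```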
**Note:** Memory metrics are tracked **per-cache-name** when using the `InstrumentedStorage` wrapper. If multiple functions share the same metrics collector but use different storage backends, each gets its own memory entry under the cache name you pass to `InstrumentedStorage(storage, metrics, "cache_name")`.

### OpenTelemetry

Metric names follow OpenTelemetry conventions:

- `cache.hits` (Counter with `cache_name` attribute)
- `cache.misses` (Counter with `cache_name` attribute)
- `cache.operation.duration` (Histogram with `cache_name`, `operation` attributes)

### GCP Cloud Monitoring

Uses custom metric paths under your configured prefix:

- `custom.googleapis.com/<prefix>/hits`
- `custom.googleapis.com/<prefix>/misses`
- `custom.googleapis.com/<prefix>/latency`

Labels: `cache_name`, `operation`

---

## InMemoryMetrics

Built-in collector for API endpoints. Zero external dependencies, thread-safe.

tests/test_metrics.py (46 additions & 0 deletions)

@@ -394,6 +394,52 @@ def test_instrumented_storage_memory_usage():
    assert metrics.memory_usages[0][2] == 2  # entry_count

```python
def test_memory_metrics_per_cache_name():
    """Test that memory metrics are tracked separately per cache name."""
    from advanced_caching.metrics import InMemoryMetrics

    metrics = InMemoryMetrics()

    # Create two separate caches with different names
    cache1 = InMemCache()
    instrumented1 = InstrumentedStorage(cache1, metrics, "cache_one")

    cache2 = InMemCache()
    instrumented2 = InstrumentedStorage(cache2, metrics, "cache_two")

    # Add data to first cache
    instrumented1.set("key1", "x" * 1000, ttl=60)
    instrumented1.set("key2", "y" * 2000, ttl=60)

    # Add data to second cache
    instrumented2.set("key1", "a" * 500, ttl=60)

    # Get memory usage for each cache
    usage1 = instrumented1.get_memory_usage()
    usage2 = instrumented2.get_memory_usage()

    # Get stats from shared metrics collector
    stats = metrics.get_stats()

    # Verify memory is tracked per cache name
    assert "memory" in stats
    assert "cache_one" in stats["memory"]
    assert "cache_two" in stats["memory"]

    # Verify each cache has its own memory stats
    assert stats["memory"]["cache_one"]["entries"] == 2
    assert stats["memory"]["cache_two"]["entries"] == 1

    # Verify bytes are different for each cache
    assert stats["memory"]["cache_one"]["bytes"] > stats["memory"]["cache_two"]["bytes"]
    assert stats["memory"]["cache_one"]["mb"] > 0
    assert stats["memory"]["cache_two"]["mb"] > 0

    print("\n✓ Memory metrics tracked separately:")
    print(f" - cache_one: {stats['memory']['cache_one']}")
    print(f" - cache_two: {stats['memory']['cache_two']}")
```
```python
def test_metrics_latency_overhead():
    """Benchmark test to ensure metrics add minimal overhead."""
    import timeit
```
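The benchmark body is truncated in this diff. As a rough sketch of how such an overhead measurement can be structured with `timeit` (illustrative only; the names and iteration counts are not from the committed test):

```python
import time
import timeit

store = {"k": 1}
latencies: list[float] = []

def bare_get():
    return store.get("k")

def instrumented_get():
    # Record per-call latency, as an InstrumentedStorage-style wrapper might
    start = time.perf_counter()
    value = store.get("k")
    latencies.append((time.perf_counter() - start) * 1000)
    return value

N = 100_000
base = timeit.timeit(bare_get, number=N)
wrapped = timeit.timeit(instrumented_get, number=N)
overhead_us = (wrapped - base) / N * 1e6  # approximate per-call cost of instrumentation
```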
