Merge pull request #51 from itrajanovska/add_custom_inference_model

MatteoGiomi · web-flow · commit c8649e9417e1 · 2025-12-09T11:36:31.000+01:00
Add custom inference model
diff --git a/.gitignore b/.gitignore
@@ -128,7 +128,8 @@ dmypy.json
 # Pyre type checker
 .pyre/
 
-# vscode
+# IDE
 .vscode/
+.idea/
 
-**/*DS_Store*
+**/*DS_Store*
diff --git a/README.md b/README.md
@@ -159,4 +159,3 @@ This `bibtex` entry can be used to refer to the paper:
 ### License
 
 Licensed under Clear BSD License, see `LICENSE.md` to see the full license text. Patent-pending code (application US-20230401336-A1).
-
diff --git a/src/anonymeter/evaluators/inference_evaluator.py b/src/anonymeter/evaluators/inference_evaluator.py
@@ -2,26 +2,27 @@
 # Copyright (c) 2022 Anonos IP LLC.
 # See https://github.com/statice/anonymeter/blob/main/LICENSE.md for details.
 """Privacy evaluator that measures the inference risk."""
-
 from typing import Optional
 
 import numpy as np
 import numpy.typing as npt
 import pandas as pd
 
-from anonymeter.neighbors.mixed_types_kneighbors import MixedTypeKNeighbors
+from anonymeter.evaluators.inference_predictor import InferencePredictor
+from anonymeter.neighbors.mixed_types_kneighbors import KNNInferencePredictor
 from anonymeter.stats.confidence import EvaluationResults, PrivacyRisk
 
 
 def _run_attack(
-    target: pd.DataFrame,
-    syn: pd.DataFrame,
-    n_attacks: int,
-    aux_cols: list[str],
-    secret: str,
-    n_jobs: int,
-    naive: bool,
-    regression: Optional[bool],
+        target: pd.DataFrame,
+        syn: pd.DataFrame,
+        n_attacks: int,
+        aux_cols: list[str],
+        secret: str,
+        n_jobs: int,
+        naive: bool,
+        regression: Optional[bool],
+        inference_model: Optional[InferencePredictor],
 ) -> int:
     if regression is None:
         regression = pd.api.types.is_numeric_dtype(target[secret])
@@ -30,21 +31,17 @@ def _run_attack(
 
     if naive:
         guesses = syn.sample(n_attacks)[secret]
-
     else:
-        nn = MixedTypeKNeighbors(n_jobs=n_jobs, n_neighbors=1).fit(candidates=syn[aux_cols])
-
-        guesses_idx = nn.kneighbors(queries=targets[aux_cols])
-        if isinstance(guesses_idx, tuple):
-            raise RuntimeError("guesses_idx cannot be a tuple")
-
-        guesses = syn.iloc[guesses_idx.flatten()][secret]
+        # Instantiate the default KNN model if no other model is passed through `inference_model`.
+        if inference_model is None:
+            inference_model = KNNInferencePredictor(data=syn, columns=aux_cols, target_col=secret, n_jobs=n_jobs)
+        guesses = inference_model.predict(targets)
 
     return evaluate_inference_guesses(guesses=guesses, secrets=targets[secret], regression=regression).sum()
 
 
 def evaluate_inference_guesses(
-    guesses: pd.Series, secrets: pd.Series, regression: bool, tolerance: float = 0.05
+        guesses: pd.Series, secrets: pd.Series, regression: bool, tolerance: float = 0.05
 ) -> npt.NDArray:
     """Evaluate the success of an inference attack.
 
@@ -142,23 +139,33 @@ class InferenceEvaluator:
         the variable.
     n_attacks : int, default is 500
         Number of attack attempts.
+        In case the whole dataset size should be used, set this to np.inf.
+    inference_model : Optional[InferencePredictor], default is None
+        An ML model fitted on `syn` as training data, and `secret` as target, that supports ::predict(x).
+        If not None, it is used in the attack instead of the default nearest-neighbor (KNN) predictor.
 
     """
 
     def __init__(
-        self,
-        ori: pd.DataFrame,
-        syn: pd.DataFrame,
-        aux_cols: list[str],
-        secret: str,
-        regression: Optional[bool] = None,
-        n_attacks: int = 500,
-        control: Optional[pd.DataFrame] = None,
+            self,
+            ori: pd.DataFrame,
+            syn: pd.DataFrame,
+            aux_cols: list[str],
+            secret: str,
+            regression: Optional[bool] = None,
+            n_attacks: int = 500,
+            control: Optional[pd.DataFrame] = None,
+            inference_model: Optional[InferencePredictor] = None
     ):
         self._ori = ori
         self._syn = syn
         self._control = control
         self._n_attacks = n_attacks
+        self._inference_model = inference_model
+
+        self._n_attacks_ori = min(n_attacks, self._ori.shape[0])
+        self._n_attacks_baseline = min(self._syn.shape[0], self._n_attacks_ori)
+        self._n_attacks_control = -1 if self._control is None else min(n_attacks, self._control.shape[0])
 
         # check if secret is a string column
         if not isinstance(secret, str):
@@ -173,16 +180,17 @@ def __init__(
         self._aux_cols = aux_cols
         self._evaluated = False
 
-    def _attack(self, target: pd.DataFrame, naive: bool, n_jobs: int) -> int:
+    def _attack(self, target: pd.DataFrame, naive: bool, n_jobs: int, n_attacks: int) -> int:
         return _run_attack(
             target=target,
             syn=self._syn,
-            n_attacks=self._n_attacks,
+            n_attacks=n_attacks,
             aux_cols=self._aux_cols,
             secret=self._secret,
             n_jobs=n_jobs,
             naive=naive,
             regression=self._regression,
+            inference_model=self._inference_model,
         )
 
     def evaluate(self, n_jobs: int = -2) -> "InferenceEvaluator":
@@ -199,11 +207,14 @@ def evaluate(self, n_jobs: int = -2) -> "InferenceEvaluator":
             The evaluated ``InferenceEvaluator`` object.
 
         """
-        self._n_baseline = self._attack(target=self._ori, naive=True, n_jobs=n_jobs)
-        self._n_success = self._attack(target=self._ori, naive=False, n_jobs=n_jobs)
+        self._n_baseline = self._attack(target=self._ori, naive=True, n_jobs=n_jobs,
+                                        n_attacks=self._n_attacks_baseline)
+        self._n_success = self._attack(target=self._ori, naive=False, n_jobs=n_jobs,
+                                       n_attacks=self._n_attacks_ori)
         self._n_control = (
-            None if self._control is None else self._attack(target=self._control, naive=False, n_jobs=n_jobs)
-        )
+            None if self._control is None else self._attack(target=self._control, naive=False, n_jobs=n_jobs,
+                                                            n_attacks=self._n_attacks_control)
+            )
 
         self._evaluated = True
         return self
@@ -226,7 +237,7 @@ def results(self, confidence_level: float = 0.95) -> EvaluationResults:
             raise RuntimeError("The inference evaluator wasn't evaluated yet. Please, run `evaluate()` first.")
 
         return EvaluationResults(
-            n_attacks=self._n_attacks,
+            n_attacks=(self._n_attacks_ori, self._n_attacks_baseline, self._n_attacks_control),
             n_success=self._n_success,
             n_baseline=self._n_baseline,
             n_control=self._n_control,
diff --git a/src/anonymeter/evaluators/inference_predictor.py b/src/anonymeter/evaluators/inference_predictor.py
@@ -0,0 +1,31 @@
+# This file is part of Anonymeter and is released under BSD 3-Clause Clear License.
+# Copyright (c) 2022 Anonos IP LLC.
+# See https://github.com/statice/anonymeter/blob/main/LICENSE.md for details.
+"""A protocol for a custom inference predictor."""
+from typing import Protocol
+
+import pandas as pd
+
+
+class InferencePredictor(Protocol):
+    """Structural interface that a custom inference model has to satisfy.
+
+    It is used as `inference_model` in the InferenceEvaluator in inference_evaluator.py.
+    Only ``predict`` is part of the protocol; fitting is left to the implementation.
+    For an example usage refer to the SklearnInferencePredictor in sklearn_inference_predictor.py.
+    """
+    def predict(self, x: pd.DataFrame) -> pd.Series:
+        """Predict the targets for input `x`.
+
+        Parameters
+        ----------
+        x : pd.DataFrame
+            The input data to predict, one row per record.
+
+        Returns
+        -------
+        pd.Series
+            The predictions as pd.Series, one per row of `x`.
+
+        """
+        ...
diff --git a/src/anonymeter/evaluators/sklearn_inference_predictor.py b/src/anonymeter/evaluators/sklearn_inference_predictor.py
@@ -0,0 +1,44 @@
+# This file is part of Anonymeter and is released under BSD 3-Clause Clear License.
+# Copyright (c) 2022 Anonos IP LLC.
+# See https://github.com/statice/anonymeter/blob/main/LICENSE.md for details.
+"""A wrapper class around a sklearn model implementing the InferencePredictor."""
+import pandas as pd
+from sklearn.base import BaseEstimator, is_classifier, is_regressor
+
+from anonymeter.evaluators.inference_predictor import InferencePredictor
+
+
+class SklearnInferencePredictor(InferencePredictor):
+    """Wrapper class to use sklearn methods in the inference evaluator.
+
+    Parameters
+    ----------
+    model : sklearn.base.BaseEstimator
+        A fitted classifier or regressor implementing ``predict``. It must contain its own
+        preprocessing pipeline, and it needs to respect the index of the input data.
+        A ValueError is raised if it is not a classifier/regressor or lacks ``predict``.
+
+    """
+    def __init__(self, model: BaseEstimator):
+        if not (is_classifier(estimator=model) or is_regressor(estimator=model)):
+            raise ValueError(f"Model must be classifier or regressor, got {model!r}")
+        if not hasattr(model, "predict"):
+            raise ValueError(f"Model must have a predict method, got {model!r}")
+        self._model = model
+
+    def predict(self, x: pd.DataFrame) -> pd.Series:
+        """Predict the targets for input `x`.
+
+        Parameters
+        ----------
+        x : pd.DataFrame
+            The input data to predict. It is handed to the wrapped model unchanged.
+
+        Returns
+        -------
+        pd.Series
+            The model predictions, labelled with the index of `x`.
+
+        """
+        prediction = self._model.predict(x)
+        return pd.Series(prediction, index=x.index)
diff --git a/src/anonymeter/neighbors/mixed_types_kneighbors.py b/src/anonymeter/neighbors/mixed_types_kneighbors.py
@@ -12,6 +12,7 @@
 from joblib import Parallel, delayed
 from numba import jit
 
+from anonymeter.evaluators.inference_predictor import InferencePredictor
 from anonymeter.preprocessing.transformations import mixed_types_transform
 from anonymeter.preprocessing.type_detection import detect_consistent_col_types
 
@@ -75,7 +76,7 @@ def gower_distance(r0: npt.NDArray, r1: npt.NDArray, cat_cols_index: int) -> flo
 
 @jit(nopython=True, nogil=True)
 def _nearest_neighbors(
-    queries: npt.NDArray, candidates: npt.NDArray, cat_cols_index: int, n_neighbors: int
+        queries: npt.NDArray, candidates: npt.NDArray, cat_cols_index: int, n_neighbors: int
 ) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.float64]]:
     r"""For every element of ``queries``, find its nearest neighbors in ``candidates``.
 
@@ -166,7 +167,7 @@ def fit(self, candidates: pd.DataFrame, ctypes: Optional[dict[str, list[str]]] =
         return self
 
     def kneighbors(
-        self, queries: pd.DataFrame, n_neighbors: Optional[int] = None, return_distance: bool = False
+            self, queries: pd.DataFrame, n_neighbors: Optional[int] = None, return_distance: bool = False
     ) -> Union[tuple[npt.NDArray, npt.NDArray], npt.NDArray]:
         """Find the nearest neighbors for a set of query points.
 
@@ -220,7 +221,7 @@ def kneighbors(
         with Parallel(n_jobs=self._n_jobs, backend="threading") as executor:
             res = executor(
                 delayed(_nearest_neighbors)(
-                    queries=queries[ii : ii + 1],
+                    queries=queries[ii: ii + 1],
                     candidates=candidates,
                     cat_cols_index=len(self._ctypes["num"]),
                     n_neighbors=n_neighbors,
@@ -235,3 +236,45 @@ def kneighbors(
             return distances, indexes
 
         return indexes
+
+
+class KNNInferencePredictor(InferencePredictor):
+    """Wrapper class to use MixedTypeKNeighbors in the inference evaluator.
+
+    Parameters
+    ----------
+    data : pd.DataFrame
+        The train data to fit the model on (usually the synthetic data).
+    columns : list[str]
+        The auxiliary columns of `data`, used as input to the model.
+    target_col : str
+        The target column of `data`.
+    n_jobs : int, default is -2
+        Number of jobs to use. It follows joblib convention, so that ``n_jobs = -1``
+        means all available cores.
+
+    """
+
+    def __init__(self, data: pd.DataFrame, columns: list[str], target_col: str, n_jobs: int = -2):
+        self._nn = MixedTypeKNeighbors(n_jobs=n_jobs, n_neighbors=1).fit(candidates=data[columns])
+        self._target_series = data[target_col]
+        self._columns = columns
+
+    def predict(self, x: pd.DataFrame) -> pd.Series:
+        """Predict the targets for input `x` from its nearest neighbor in the train data.
+
+        Parameters
+        ----------
+        x : pd.DataFrame
+            The input data to predict. Only the auxiliary columns given at construction are used.
+
+        Returns
+        -------
+        pd.Series
+            The target values of the matched train rows; it keeps their index labels, not those of `x`.
+
+        """
+        guesses_idx = self._nn.kneighbors(queries=x[self._columns])
+        if isinstance(guesses_idx, tuple):
+            raise RuntimeError("guesses_idx cannot be a tuple")
+        return self._target_series.iloc[guesses_idx.flatten()]
diff --git a/src/anonymeter/stats/confidence.py b/src/anonymeter/stats/confidence.py
@@ -5,7 +5,7 @@
 
 import warnings
 from math import sqrt
-from typing import NamedTuple, Optional
+from typing import NamedTuple, Optional, Union
 
 from scipy.stats import norm
 
@@ -174,8 +174,12 @@ class EvaluationResults:
 
     Parameters
     ----------
-    n_attacks : int
+    n_attacks : Union[int, tuple[int, int, int]]
         Total number of attacks performed.
+        It can be a single number (int) which will apply to all three: main (ori), baseline, and control attack,
+        or a tuple (n_attacks_ori, n_attacks_baseline, n_attacks_control) - (int, int, int) which will contain
+        different numbers of attacks in case the user wants to perform different number of attacks for each
+        main (ori), baseline and control target dataset.
     n_success : int
         Number of successful attacks.
     n_baseline : int
@@ -194,23 +198,31 @@ class EvaluationResults:
 
     def __init__(
         self,
-        n_attacks: int,
+        n_attacks: Union[int, tuple[int, int, int]],
         n_success: int,
         n_baseline: int,
         n_control: Optional[int] = None,
         confidence_level: float = 0.95,
     ):
-        self.attack_rate = success_rate(n_total=n_attacks, n_success=n_success, confidence_level=confidence_level)
+        if isinstance(n_attacks, int):
+            self.n_attacks_ori = n_attacks
+            self.n_attacks_baseline = n_attacks
+            self.n_attacks_control = n_attacks
+        elif isinstance(n_attacks, tuple):
+            self.n_attacks_ori, self.n_attacks_baseline, self.n_attacks_control = n_attacks
+        else:
+            raise ValueError(f"n_attacks must be an integer or a tuple of three integers, got {n_attacks}")
+
+        self.attack_rate = success_rate(n_total=self.n_attacks_ori, n_success=n_success, confidence_level=confidence_level)
 
-        self.baseline_rate = success_rate(n_total=n_attacks, n_success=n_baseline, confidence_level=confidence_level)
+        self.baseline_rate = success_rate(n_total=self.n_attacks_baseline, n_success=n_baseline, confidence_level=confidence_level)
 
         self.control_rate = (
             None
             if n_control is None
-            else success_rate(n_total=n_attacks, n_success=n_control, confidence_level=confidence_level)
+            else success_rate(n_total=self.n_attacks_control, n_success=n_control, confidence_level=confidence_level)
         )
 
-        self.n_attacks = n_attacks
         self.n_success = n_success
         self.n_baseline = n_baseline
         self.n_control = n_control
diff --git a/tests/test_inference_evaluator.py b/tests/test_inference_evaluator.py
diff --git a/tests/test_sklearn_inference_model.py b/tests/test_sklearn_inference_model.py

Original file line number	Diff line number	Diff line change
@@ -159,4 +159,3 @@ This `bibtex` entry can be used to refer to the paper:
`159`	`159`	`### License`
`160`	`160`
`161`	`161`	Licensed under Clear BSD License, see `LICENSE.md` to see the full license text. Patent-pending code (application US-20230401336-A1).
`162`		`-`