Skip to content

Commit c8649e9

Browse files
authored
Merge pull request #51 from itrajanovska/add_custom_inference_model
Add custom inference model
2 parents 714ff2c + 3c53dfd commit c8649e9

9 files changed

Lines changed: 247 additions & 50 deletions

.gitignore

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,8 @@ dmypy.json
128128
# Pyre type checker
129129
.pyre/
130130

131-
# vscode
131+
# IDE
132132
.vscode/
133+
.idea/
133134

134-
**/*DS_Store*
135+
**/*DS_Store*

README.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,4 +159,3 @@ This `bibtex` entry can be used to refer to the paper:
159159
### License
160160

161161
Licensed under the Clear BSD License; see `LICENSE.md` for the full license text. Patent-pending code (application US-20230401336-A1).
162-

src/anonymeter/evaluators/inference_evaluator.py

Lines changed: 45 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -2,26 +2,27 @@
22
# Copyright (c) 2022 Anonos IP LLC.
33
# See https://github.com/statice/anonymeter/blob/main/LICENSE.md for details.
44
"""Privacy evaluator that measures the inference risk."""
5-
65
from typing import Optional
76

87
import numpy as np
98
import numpy.typing as npt
109
import pandas as pd
1110

12-
from anonymeter.neighbors.mixed_types_kneighbors import MixedTypeKNeighbors
11+
from anonymeter.evaluators.inference_predictor import InferencePredictor
12+
from anonymeter.neighbors.mixed_types_kneighbors import KNNInferencePredictor
1313
from anonymeter.stats.confidence import EvaluationResults, PrivacyRisk
1414

1515

1616
def _run_attack(
17-
target: pd.DataFrame,
18-
syn: pd.DataFrame,
19-
n_attacks: int,
20-
aux_cols: list[str],
21-
secret: str,
22-
n_jobs: int,
23-
naive: bool,
24-
regression: Optional[bool],
17+
target: pd.DataFrame,
18+
syn: pd.DataFrame,
19+
n_attacks: int,
20+
aux_cols: list[str],
21+
secret: str,
22+
n_jobs: int,
23+
naive: bool,
24+
regression: Optional[bool],
25+
inference_model: Optional[InferencePredictor],
2526
) -> int:
2627
if regression is None:
2728
regression = pd.api.types.is_numeric_dtype(target[secret])
@@ -30,21 +31,17 @@ def _run_attack(
3031

3132
if naive:
3233
guesses = syn.sample(n_attacks)[secret]
33-
3434
else:
35-
nn = MixedTypeKNeighbors(n_jobs=n_jobs, n_neighbors=1).fit(candidates=syn[aux_cols])
36-
37-
guesses_idx = nn.kneighbors(queries=targets[aux_cols])
38-
if isinstance(guesses_idx, tuple):
39-
raise RuntimeError("guesses_idx cannot be a tuple")
40-
41-
guesses = syn.iloc[guesses_idx.flatten()][secret]
35+
# Instantiate the default KNN model if no other model is passed through `inference_model`.
36+
if inference_model is None:
37+
inference_model = KNNInferencePredictor(data=syn, columns=aux_cols, target_col=secret, n_jobs=n_jobs)
38+
guesses = inference_model.predict(targets)
4239

4340
return evaluate_inference_guesses(guesses=guesses, secrets=targets[secret], regression=regression).sum()
4441

4542

4643
def evaluate_inference_guesses(
47-
guesses: pd.Series, secrets: pd.Series, regression: bool, tolerance: float = 0.05
44+
guesses: pd.Series, secrets: pd.Series, regression: bool, tolerance: float = 0.05
4845
) -> npt.NDArray:
4946
"""Evaluate the success of an inference attack.
5047
@@ -142,23 +139,33 @@ class InferenceEvaluator:
142139
the variable.
143140
n_attacks : int, default is 500
144141
Number of attack attempts.
142+
In case the whole dataset size should be used, set this to np.inf.
143+
inference_model: InferencePredictor
144+
An ML model fitted on `syn` as training data, with `secret` as the target, that supports `predict(x)`.
145+
If not None, it is used in the attack instead of the default MixedTypeKNeighbors-based predictor.
145146
146147
"""
147148

148149
def __init__(
149-
self,
150-
ori: pd.DataFrame,
151-
syn: pd.DataFrame,
152-
aux_cols: list[str],
153-
secret: str,
154-
regression: Optional[bool] = None,
155-
n_attacks: int = 500,
156-
control: Optional[pd.DataFrame] = None,
150+
self,
151+
ori: pd.DataFrame,
152+
syn: pd.DataFrame,
153+
aux_cols: list[str],
154+
secret: str,
155+
regression: Optional[bool] = None,
156+
n_attacks: int = 500,
157+
control: Optional[pd.DataFrame] = None,
158+
inference_model: Optional[InferencePredictor] = None
157159
):
158160
self._ori = ori
159161
self._syn = syn
160162
self._control = control
161163
self._n_attacks = n_attacks
164+
self._inference_model = inference_model
165+
166+
self._n_attacks_ori = min(n_attacks, self._ori.shape[0])
167+
self._n_attacks_baseline = min(self._syn.shape[0], self._n_attacks_ori)
168+
self._n_attacks_control = -1 if self._control is None else min(n_attacks, self._control.shape[0])
162169

163170
# check if secret is a string column
164171
if not isinstance(secret, str):
@@ -173,16 +180,17 @@ def __init__(
173180
self._aux_cols = aux_cols
174181
self._evaluated = False
175182

176-
def _attack(self, target: pd.DataFrame, naive: bool, n_jobs: int) -> int:
183+
def _attack(self, target: pd.DataFrame, naive: bool, n_jobs: int, n_attacks: int) -> int:
177184
return _run_attack(
178185
target=target,
179186
syn=self._syn,
180-
n_attacks=self._n_attacks,
187+
n_attacks=n_attacks,
181188
aux_cols=self._aux_cols,
182189
secret=self._secret,
183190
n_jobs=n_jobs,
184191
naive=naive,
185192
regression=self._regression,
193+
inference_model=self._inference_model,
186194
)
187195

188196
def evaluate(self, n_jobs: int = -2) -> "InferenceEvaluator":
@@ -199,11 +207,14 @@ def evaluate(self, n_jobs: int = -2) -> "InferenceEvaluator":
199207
The evaluated ``InferenceEvaluator`` object.
200208
201209
"""
202-
self._n_baseline = self._attack(target=self._ori, naive=True, n_jobs=n_jobs)
203-
self._n_success = self._attack(target=self._ori, naive=False, n_jobs=n_jobs)
210+
self._n_baseline = self._attack(target=self._ori, naive=True, n_jobs=n_jobs,
211+
n_attacks=self._n_attacks_baseline)
212+
self._n_success = self._attack(target=self._ori, naive=False, n_jobs=n_jobs,
213+
n_attacks=self._n_attacks_ori)
204214
self._n_control = (
205-
None if self._control is None else self._attack(target=self._control, naive=False, n_jobs=n_jobs)
206-
)
215+
None if self._control is None else self._attack(target=self._control, naive=False, n_jobs=n_jobs,
216+
n_attacks=self._n_attacks_control)
217+
)
207218

208219
self._evaluated = True
209220
return self
@@ -226,7 +237,7 @@ def results(self, confidence_level: float = 0.95) -> EvaluationResults:
226237
raise RuntimeError("The inference evaluator wasn't evaluated yet. Please, run `evaluate()` first.")
227238

228239
return EvaluationResults(
229-
n_attacks=self._n_attacks,
240+
n_attacks=(self._n_attacks_ori, self._n_attacks_baseline, self._n_attacks_control),
230241
n_success=self._n_success,
231242
n_baseline=self._n_baseline,
232243
n_control=self._n_control,
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# This file is part of Anonymeter and is released under BSD 3-Clause Clear License.
2+
# Copyright (c) 2022 Anonos IP LLC.
3+
# See https://github.com/statice/anonymeter/blob/main/LICENSE.md for details.
4+
"""A protocol for a custom inference predictor."""
5+
from typing import Protocol
6+
7+
import pandas as pd
8+
9+
10+
class InferencePredictor(Protocol):
11+
"""Interface for custom inference models.
12+
13+
It is used as `inference_model` in the InferenceEvaluator in inference_evaluator.py.
14+
15+
For an example of its usage, refer to the SklearnInferencePredictor in sklearn_inference_predictor.py.
16+
"""
17+
def predict(self, x: pd.DataFrame) -> pd.Series:
18+
"""Predict the targets for input `x`.
19+
20+
Parameters
21+
----------
22+
x : pd.DataFrame
23+
The input data to predict.
24+
25+
Returns
26+
-------
27+
pd.Series
28+
The predictions as pd.Series.
29+
30+
"""
31+
...
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
# This file is part of Anonymeter and is released under BSD 3-Clause Clear License.
2+
# Copyright (c) 2022 Anonos IP LLC.
3+
# See https://github.com/statice/anonymeter/blob/main/LICENSE.md for details.
4+
"""A wrapper class around a sklearn model implementing the InferencePredictor."""
5+
import pandas as pd
6+
from sklearn.base import BaseEstimator, is_classifier, is_regressor
7+
8+
from anonymeter.evaluators.inference_predictor import InferencePredictor
9+
10+
11+
class SklearnInferencePredictor(InferencePredictor):
12+
"""Wrapper class to use sklearn methods in the inference evaluator.
13+
14+
Parameters
15+
----------
16+
model : sklearn.base.BaseEstimator
17+
A classifier or regressor which implements ::predict().
18+
The model needs to be fitted, it must contain its own preprocessing pipeline,
19+
and it needs to respect the index of the input data.
20+
21+
"""
22+
def __init__(self, model: BaseEstimator):
23+
if not (is_classifier(estimator=model) or is_regressor(estimator=model)):
24+
raise ValueError("Model must be classifier or regressor %s", model)
25+
if not hasattr(model, "predict"):
26+
raise ValueError("Model must have a predict method, %s", model)
27+
self._model = model
28+
29+
def predict(self, x: pd.DataFrame) -> pd.Series:
30+
"""Predict the targets for input `x`.
31+
32+
Parameters
33+
----------
34+
x : pd.DataFrame
35+
The input data to predict.
36+
37+
Returns
38+
-------
39+
pd.Series
40+
The predictions as pd.Series.
41+
42+
"""
43+
prediction = self._model.predict(x)
44+
return pd.Series(prediction, index=x.index)

src/anonymeter/neighbors/mixed_types_kneighbors.py

Lines changed: 46 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from joblib import Parallel, delayed
1313
from numba import jit
1414

15+
from anonymeter.evaluators.inference_predictor import InferencePredictor
1516
from anonymeter.preprocessing.transformations import mixed_types_transform
1617
from anonymeter.preprocessing.type_detection import detect_consistent_col_types
1718

@@ -75,7 +76,7 @@ def gower_distance(r0: npt.NDArray, r1: npt.NDArray, cat_cols_index: int) -> flo
7576

7677
@jit(nopython=True, nogil=True)
7778
def _nearest_neighbors(
78-
queries: npt.NDArray, candidates: npt.NDArray, cat_cols_index: int, n_neighbors: int
79+
queries: npt.NDArray, candidates: npt.NDArray, cat_cols_index: int, n_neighbors: int
7980
) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.float64]]:
8081
r"""For every element of ``queries``, find its nearest neighbors in ``candidates``.
8182
@@ -166,7 +167,7 @@ def fit(self, candidates: pd.DataFrame, ctypes: Optional[dict[str, list[str]]] =
166167
return self
167168

168169
def kneighbors(
169-
self, queries: pd.DataFrame, n_neighbors: Optional[int] = None, return_distance: bool = False
170+
self, queries: pd.DataFrame, n_neighbors: Optional[int] = None, return_distance: bool = False
170171
) -> Union[tuple[npt.NDArray, npt.NDArray], npt.NDArray]:
171172
"""Find the nearest neighbors for a set of query points.
172173
@@ -220,7 +221,7 @@ def kneighbors(
220221
with Parallel(n_jobs=self._n_jobs, backend="threading") as executor:
221222
res = executor(
222223
delayed(_nearest_neighbors)(
223-
queries=queries[ii : ii + 1],
224+
queries=queries[ii: ii + 1],
224225
candidates=candidates,
225226
cat_cols_index=len(self._ctypes["num"]),
226227
n_neighbors=n_neighbors,
@@ -235,3 +236,45 @@ def kneighbors(
235236
return distances, indexes
236237

237238
return indexes
239+
240+
241+
class KNNInferencePredictor(InferencePredictor):
242+
"""Wrapper class to use MixedTypeKNeighbors in the inference evaluator.
243+
244+
Parameters
245+
----------
246+
data : pd.DataFrame
247+
The train data to fit the model on (usually the synthetic data).
248+
columns : list[str]
249+
The auxiliary columns of `data`, used as input to the model.
250+
target_col : str
251+
The target column of `data`.
252+
n_jobs : int
253+
Number of jobs to use. It follows joblib convention, so that ``n_jobs = -1``
254+
means all available cores.
255+
256+
"""
257+
258+
def __init__(self, data: pd.DataFrame, columns: list[str], target_col: str, n_jobs: int):
259+
self._nn = MixedTypeKNeighbors(n_jobs=n_jobs, n_neighbors=1).fit(candidates=data[columns])
260+
self._target_series = data[target_col]
261+
self._columns = columns
262+
263+
def predict(self, x: pd.DataFrame) -> pd.Series:
264+
"""Predict the targets for input `x`.
265+
266+
Parameters
267+
----------
268+
x : pd.DataFrame
269+
The input data to predict.
270+
271+
Returns
272+
-------
273+
pd.Series
274+
The predictions as pd.Series.
275+
276+
"""
277+
guesses_idx = self._nn.kneighbors(queries=x[self._columns])
278+
if isinstance(guesses_idx, tuple):
279+
raise RuntimeError("guesses_idx cannot be a tuple")
280+
return self._target_series.iloc[guesses_idx.flatten()]

src/anonymeter/stats/confidence.py

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
import warnings
77
from math import sqrt
8-
from typing import NamedTuple, Optional
8+
from typing import NamedTuple, Optional, Union
99

1010
from scipy.stats import norm
1111

@@ -174,8 +174,12 @@ class EvaluationResults:
174174
175175
Parameters
176176
----------
177-
n_attacks : int
177+
n_attacks : Union[int, tuple[int, int, int]]
178178
Total number of attacks performed.
179+
It can be a single int, which applies to all three attacks (main/ori, baseline, and control),
180+
or a tuple (n_attacks_ori, n_attacks_baseline, n_attacks_control) of three ints, which gives
181+
a separate number of attacks for each of the main (ori), baseline, and control
182+
target datasets.
179183
n_success : int
180184
Number of successful attacks.
181185
n_baseline : int
@@ -194,23 +198,31 @@ class EvaluationResults:
194198

195199
def __init__(
196200
self,
197-
n_attacks: int,
201+
n_attacks: Union[int, tuple[int, int, int]],
198202
n_success: int,
199203
n_baseline: int,
200204
n_control: Optional[int] = None,
201205
confidence_level: float = 0.95,
202206
):
203-
self.attack_rate = success_rate(n_total=n_attacks, n_success=n_success, confidence_level=confidence_level)
207+
if isinstance(n_attacks, int):
208+
self.n_attacks_ori = n_attacks
209+
self.n_attacks_baseline = n_attacks
210+
self.n_attacks_control = n_attacks
211+
elif isinstance(n_attacks, tuple):
212+
self.n_attacks_ori, self.n_attacks_baseline, self.n_attacks_control = n_attacks
213+
else:
214+
raise ValueError(f"n_attacks must be an integer or a tuple of three integers, got {n_attacks}")
215+
216+
self.attack_rate = success_rate(n_total=self.n_attacks_ori, n_success=n_success, confidence_level=confidence_level)
204217

205-
self.baseline_rate = success_rate(n_total=n_attacks, n_success=n_baseline, confidence_level=confidence_level)
218+
self.baseline_rate = success_rate(n_total=self.n_attacks_baseline, n_success=n_baseline, confidence_level=confidence_level)
206219

207220
self.control_rate = (
208221
None
209222
if n_control is None
210-
else success_rate(n_total=n_attacks, n_success=n_control, confidence_level=confidence_level)
223+
else success_rate(n_total=self.n_attacks_control, n_success=n_control, confidence_level=confidence_level)
211224
)
212225

213-
self.n_attacks = n_attacks
214226
self.n_success = n_success
215227
self.n_baseline = n_baseline
216228
self.n_control = n_control

0 commit comments

Comments
 (0)