From 0f7137f02978a4b10c4df8eb34731c5c14696158 Mon Sep 17 00:00:00 2001 From: Tom Kaitchuck Date: Fri, 8 Jan 2016 15:27:10 -0800 Subject: [PATCH 01/24] Clear cached rda files and regenerate test-data each run. --- bin/test.sh | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/bin/test.sh b/bin/test.sh index bdfcc687..3fe692fa 100755 --- a/bin/test.sh +++ b/bin/test.sh @@ -46,7 +46,7 @@ decode-assoc-help() { write-assoc-testdata() { mkdir -p _tmp - + rm _tmp/*.rda cat >_tmp/true_values.csv < Date: Fri, 8 Jan 2016 15:29:03 -0800 Subject: [PATCH 02/24] Make sure we are using a matrix --- analysis/R/decode.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/analysis/R/decode.R b/analysis/R/decode.R index b75a2902..3e098132 100644 --- a/analysis/R/decode.R +++ b/analysis/R/decode.R @@ -314,6 +314,8 @@ Resample <- function(e) { Decode <- function(counts, map, params, alpha = 0.05, correction = c("Bonferroni"), quiet = FALSE, ...) { + counts = as.matrix(counts) + k <- params$k p <- params$p q <- params$q From b05be2e9bb8ef35193c1ab6ec08551e161061ed6 Mon Sep 17 00:00:00 2001 From: Tom Kaitchuck Date: Fri, 8 Jan 2016 15:30:50 -0800 Subject: [PATCH 03/24] Add new test options as examples. --- tests/regtest_spec.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py index 6774e400..30b8b90f 100755 --- a/tests/regtest_spec.py +++ b/tests/regtest_spec.py @@ -23,6 +23,16 @@ ('demo3 exp 100 10000 10', '16 2 64', '0.1 0.9 0.2', '10 v[0-9]*9$'), ('demo4 zipf1 100 10000 10', '16 2 64', '0.1 0.9 0.2', '10 v[0-9]*9$'), ('demo5 zipf1.5 100 10000 10', '16 2 64', '0.1 0.9 0.2', '10 v[0-9]*9$'), + + ('eval-demo1-8 gauss 20 10000 10', '32 2 32', '0.25 0.75 0.25', '20 v[0-9]*9$'), +# ('eval-demo1-32 exp 100 100000 10', '32 2 128', '0.25 0.75 0.50', '10 v[0-9]*9$'), +# ('eval-demo1-256 exp 100 100000 10', '256 2 128', '0.25 0.75 0.50', '10 v[0-9]*9$'), + +# ('eval-demo2-8a exp 100 100000 10', '8 1 128', '0.984375 1.0 0.25', '10 v[0-9]*9$'), +# ('eval-demo2-8 exp 100 100000 10', '8 1 128', '0.25 0.75 0.125', '10 v[0-9]*9$'), +# ('eval-demo2-32 exp 100 100000 10', '32 1 128', '0.25 0.75 0.125', '10 v[0-9]*9$'), +# ('eval-demo2-256 exp 100 100000 10', '256 1 128', '0.25 0.75 0.125', '10 v[0-9]*9$'), + ) DISTRIBUTIONS = ( From 24430fac3458fce975631f3f1fdbde75641380bd Mon Sep 17 00:00:00 2001 From: Tom Kaitchuck Date: Fri, 8 Jan 2016 15:32:01 -0800 Subject: [PATCH 04/24] Assign cohort randomly. --- tests/rappor_sim.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/rappor_sim.py b/tests/rappor_sim.py index 41145a3e..cc0fbeaa 100755 --- a/tests/rappor_sim.py +++ b/tests/rappor_sim.py @@ -111,8 +111,8 @@ def GenAssocTestdata(params1, params2, irr_rand, assoc_testdata_count, for i in xrange(n): for v1, v2 in rows: client_str = 'c%d' % report_index - cohort = report_index % params1.num_cohorts - + cohort = int(random.random() * params1.num_cohorts) + string_encoder = rappor.Encoder(params1, cohort, client_str, irr_rand) bool_encoder = rappor.Encoder(params2, cohort, client_str, irr_rand) From 785804b511816a3693fc914400c5740ad94829f1 Mon Sep 17 00:00:00 2001 From: Tom Kaitchuck Date: Fri, 8 Jan 2016 15:33:29 -0800 Subject: [PATCH 05/24] Making parameters a mandatory argument. --- analysis/R/read_input.R | 27 +++++++++++---------------- bin/decode_assoc.R | 2 +- bin/decode_dist.R | 2 +- tests/compare_dist.R | 4 ++-- tests/gen_counts.R | 2 +- 5 files changed, 16 insertions(+), 21 deletions(-) diff --git a/analysis/R/read_input.R b/analysis/R/read_input.R index bb4afe31..4045ff8d 100644 --- a/analysis/R/read_input.R +++ b/analysis/R/read_input.R @@ -40,7 +40,7 @@ ReadParameterFile <- function(params_file) { params } -ReadCountsFile <- function(counts_file, params = NULL) { +ReadCountsFile <- function(counts_file, params) { # Read in the counts file. if (!file.exists(counts_file)) { return(NULL) @@ -65,7 +65,7 @@ ReadCountsFile <- function(counts_file, params = NULL) { counts } -ReadMapFile <- function(map_file, params = NULL) { +ReadMapFile <- function(map_file, params) { # Read in the map file which is in the following format (two hash functions): # str1, h11, h12, h21 + k, h22 + k, h31 + 2k, h32 + 2k ... # str2, ... @@ -82,13 +82,12 @@ ReadMapFile <- function(map_file, params = NULL) { strs <- strs[ind] map_pos <- map_pos[ind, ] - if (!is.null(params)) { - n <- ncol(map_pos) - 1 - if (n != (params$h * params$m)) { - stop(paste0("Map file: number of columns should equal hm + 1:", - n, "_", params$h * params$m)) - } + n <- ncol(map_pos) - 1 + if (n != (params$h * params$m)) { + stop(paste0("Map file: number of columns should equal hm + 1:", + n, "_", params$h * params$m)) } + row_pos <- unlist(map_pos[, -1], use.names = FALSE) col_pos <- rep(1:nrow(map_pos), times = ncol(map_pos) - 1) removed <- which(is.na(row_pos)) @@ -97,17 +96,13 @@ ReadMapFile <- function(map_file, params = NULL) { col_pos <- col_pos[-removed] } - if (!is.null(params)) { - map <- sparseMatrix(row_pos, col_pos, - dims = c(params$m * params$k, length(strs))) - } else { - map <- sparseMatrix(row_pos, col_pos) - } + map <- sparseMatrix(row_pos, col_pos, dims = c(params$m * params$k, length(strs))) + colnames(map) <- strs list(map = map, strs = strs, map_pos = map_pos) } -LoadMapFile <- function(map_file, params = NULL) { +LoadMapFile <- function(map_file, params) { # Reads the map file, caching an .rda (R binary data) version of it to speed # up future loads. @@ -119,7 +114,7 @@ LoadMapFile <- function(map_file, params = NULL) { # First save to a temp file, and then atomically rename to the destination. if (!file.exists(rda_path)) { Log("Reading %s", map_file) - map <- ReadMapFile(map_file, params = params) + map <- ReadMapFile(map_file, params) Log("Saving %s as an rda file for faster access", map_file) save(map, file = tmp_path) diff --git a/bin/decode_assoc.R b/bin/decode_assoc.R index 4d6a9d3e..96cd654d 100755 --- a/bin/decode_assoc.R +++ b/bin/decode_assoc.R @@ -244,7 +244,7 @@ main <- function(opts) { UsageError("--map1 must be provided when --var1 is a string (var = %s)", opts$var1) } - LoadMapFile(opts$map1) + LoadMapFile(opts$map1, params) # for 100k map file: 31 seconds to load map and write cache; 2.2 seconds to # read cache # LoadMapFile has the side effect of putting 'map' in the global enviroment. diff --git a/bin/decode_dist.R b/bin/decode_dist.R index 3554fdaf..401cbec1 100755 --- a/bin/decode_dist.R +++ b/bin/decode_dist.R @@ -127,7 +127,7 @@ main <- function(opts) { counts <- AdjustCounts(counts, params) - LoadMapFile(opts$map) + LoadMapFile(opts$map, params) val <- ValidateInput(params, counts, map$map) # NOTE: using global map if (val != "valid") { diff --git a/tests/compare_dist.R b/tests/compare_dist.R index e54978b4..a428b52b 100755 --- a/tests/compare_dist.R +++ b/tests/compare_dist.R @@ -77,10 +77,10 @@ RunRappor <- function(prefix_case, prefix_instance, ctx) { # ctx: context file with params field filled in c <- paste0(prefix_instance, '_counts.csv') - counts <- ReadCountsFile(c) + counts <- ReadCountsFile(c, ctx$params) m <- paste0(prefix_case, '_map.csv') - map <- ReadMapFile(m) # Switch to LoadMapFile if want to cache the result + map <- ReadMapFile(m, ctx$params) # Switch to LoadMapFile if want to cache the result # Main decode.R API timing <- system.time({ diff --git a/tests/gen_counts.R b/tests/gen_counts.R index 576cc1c6..769677c4 100755 --- a/tests/gen_counts.R +++ b/tests/gen_counts.R @@ -178,7 +178,7 @@ main <- function(argv) { params <- ReadParameterFile(params_file) - true_map <- ReadMapFile(true_map_file) + true_map <- ReadMapFile(true_map_file, params) num_unique_values <- length(true_map$strs) From 7133e45af99312c96d46ce27d3a59b378919bd47 Mon Sep 17 00:00:00 2001 From: Tom Kaitchuck Date: Tue, 2 Feb 2016 17:55:28 -0800 Subject: [PATCH 06/24] Change python to use secure random numbers for the IRR --- client/python/rappor.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/client/python/rappor.py b/client/python/rappor.py index 1af7cdc1..66e7e112 100644 --- a/client/python/rappor.py +++ b/client/python/rappor.py @@ -23,7 +23,7 @@ import hashlib import hmac import json -import random +import os import struct import sys @@ -122,36 +122,34 @@ def from_csv(f): class _SimpleRandom(object): """Returns an integer where each bit has probability p of being 1.""" - def __init__(self, prob_one, num_bits, _rand=None): + def __init__(self, prob_one, num_bits): self.prob_one = prob_one self.num_bits = num_bits - self._rand = _rand or random.Random() def __call__(self): p = self.prob_one - rand_fn = self._rand.random # cache it for speed + bytes = os.urandom(self.num_bits) r = 0 - for i in xrange(self.num_bits): - bit = rand_fn() < p + for i, b in enumerate(xrange(self.num_bits)): + bit = b < p * 255.0 r |= (bit << i) # using bool as int return r class SimpleIrrRand(object): - """Pure Python randomness.""" + """Python's os.random()""" - def __init__(self, params, _rand=None): + def __init__(self, params): """ Args: params: rappor.Params - _rand: Python Random object, for testing ONLY """ num_bits = params.num_bloombits # IRR probabilities - self.p_gen = _SimpleRandom(params.prob_p, num_bits, _rand=_rand) - self.q_gen = _SimpleRandom(params.prob_q, num_bits, _rand=_rand) + self.p_gen = _SimpleRandom(params.prob_p, num_bits) + self.q_gen = _SimpleRandom(params.prob_q, num_bits) def to_big_endian(i): From 1de6f39add1b48216171cdad065d20346a53e58b Mon Sep 17 00:00:00 2001 From: Tom Kaitchuck Date: Wed, 3 Feb 2016 17:22:22 -0800 Subject: [PATCH 07/24] Added code to visualize the effects of different parameters. --- parameter_viz.py | 82 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 parameter_viz.py diff --git a/parameter_viz.py b/parameter_viz.py new file mode 100644 index 00000000..0133f2d4 --- /dev/null +++ b/parameter_viz.py @@ -0,0 +1,82 @@ +from numpy import arange +from numpy import log + +from mpl_toolkits.mplot3d import Axes3D +import matplotlib.pyplot as plt + + +def pid(x, f, p, q): + return .5*f*(q**x + (1-q)**x) + (1-.5*f)*((1-p)**x + p**x) + +def pother(x, f, p, q): + b = .5 * f * q + (1-.5*f) * p + return ( .5*f*(q*b + (1-q)*(1-b)) + (1-.5*f)*(p*b**(x-1) + (1-p)*(1-b)**(x-1)) ) + + +def delta(x, f, p, q): + return pid(x,f,p,q) - pother(x,f,p,q) + +def predictN(prob, delta): + if delta == 0: + return float('inf') + return prob * (1-prob) * 4 / (delta**2) + +def valueFPQ(f, p, q): + total = 0 + for i in range(2,4): + total += valueIFPQ(i,f,p,q) + return total + +def valueIFPQ(i,f,p,q): + p_id = pid(i, f, p, q) + p_other = pother(i,f,p,q) + return predictN(max(p_id, p_other), abs(p_id - p_other)) + +def signal(f, p, q, h): + pStar = .5 * f * q + (1-.5*f) * p # Probability of a bit being 1 from a true value of 0 in the irr + qStar = (1-.5*f) * q + .5 * f * p + if p == q: + return 0 + elif pStar < qStar: + return 1.0 / predictN(pStar**h, (qStar**h-pStar**h)) + else: + return 1.0 / predictN((1-pStar)**h, (1-qStar)**h - (1-pStar)**h) + +def value(x) : + return - signal(x[0],x[1],x[2], 1) * valueFPQ(x[0], x[1], x[2], 1) + +def printDelta(x): + printDelta(x[1], x[0], x[2]) + +def printDelta(f, p, q): + for i in range(2,10): + p_id = pid(i,f,p,q) + p_other = pother(i,f,p,q) + print(i, p_id, ' vs ', p_other, ' delta ', p_id-p_other, ' for a sum of ', predictN(max(p_id,p_other), p_id-p_other)) + +def eInf(f, h): + return 2 * h * log( (1-.5*f)/(.5*f) ) + +def getData(): + for h in (1,2): + for f in (.125,.25,.5,.75) : + for p in (.0,.1,.25,.4,.5,.6,.75,.9,1) : + for q in (.0,.1,.25,.5,.75,.9,1) : + if abs(p-q) > 0.05 : + yield (f, p, q, h, 1/(.5*f), eInf(f,h), signal(f, p, q, h), valueIFPQ(2,f,p,q), valueIFPQ(3,f,p,q), valueIFPQ(10000,f,p,q)) + +def toColor(color): + x = max(1, min(255, int(round(color * 256.0)))) + return hex(x*256*256 + x*256 + x)[2:] + +def makePlot(): + fig = plt.figure() + ax = fig.add_subplot(111, projection='3d') + for f, p, q, h, s, e, sig, val2, val3, val1000 in getData(): + ax.scatter(e, log(val1000), log(sig), s=(h*h*10), c='%s'%f, marker='o') + ax.set_xlabel('e') + ax.set_ylabel('log(val1000)') + ax.set_zlabel('log(sig)') + plt.show() + +makePlot() \ No newline at end of file From 3c06b15ce9529f4c9d1e221bbcf828f48fc2c324 Mon Sep 17 00:00:00 2001 From: Tom Kaitchuck Date: Thu, 4 Feb 2016 14:28:00 -0800 Subject: [PATCH 08/24] Add better labels to paramerters --- parameter_viz.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/parameter_viz.py b/parameter_viz.py index 0133f2d4..ddaaf93c 100644 --- a/parameter_viz.py +++ b/parameter_viz.py @@ -55,15 +55,17 @@ def printDelta(f, p, q): print(i, p_id, ' vs ', p_other, ' delta ', p_id-p_other, ' for a sum of ', predictN(max(p_id,p_other), p_id-p_other)) def eInf(f, h): - return 2 * h * log( (1-.5*f)/(.5*f) ) + if f <= 1.0: + return 2 * h * log( (1-.5*f)/(.5*f) ) + else: + return 2 * h * log( (.5*f)/(1-.5*f) ) def getData(): for h in (1,2): - for f in (.125,.25,.5,.75) : - for p in (.0,.1,.25,.4,.5,.6,.75,.9,1) : - for q in (.0,.1,.25,.5,.75,.9,1) : - if abs(p-q) > 0.05 : - yield (f, p, q, h, 1/(.5*f), eInf(f,h), signal(f, p, q, h), valueIFPQ(2,f,p,q), valueIFPQ(3,f,p,q), valueIFPQ(10000,f,p,q)) + for f in (.125,.2,.25,.3,.4,.5,.75,1,1.25,1.5,1.75) : + for p in (.0,.1,.2,.3,.4,.5,.6,.7,.8,.9) : + for q in (.15,.25,.35,.45,.55,.65,.75,.85,1) : + yield (f, p, q, h, 1/(.5*f), eInf(f,h), signal(f, p, q, h), valueIFPQ(2,f,p,q), valueIFPQ(3,f,p,q), valueIFPQ(10000,f,p,q)) def toColor(color): x = max(1, min(255, int(round(color * 256.0)))) @@ -73,10 +75,12 @@ def makePlot(): fig = plt.figure() ax = fig.add_subplot(111, projection='3d') for f, p, q, h, s, e, sig, val2, val3, val1000 in getData(): - ax.scatter(e, log(val1000), log(sig), s=(h*h*10), c='%s'%f, marker='o') - ax.set_xlabel('e') - ax.set_ylabel('log(val1000)') - ax.set_zlabel('log(sig)') + ax.scatter(e, log(val1000), log(sig), s=(h*h*10), c=(0.5*f,p,q), marker='o') + ax.set_xlabel('e \n Epesolon of privacy bound') + ax.set_ylabel('log(val1000) \n Log of number of bits of K needed to form a identifier that could distinguish two users') + ax.set_zlabel('log(sig) \n The log scale of the amount of data gained per repport. \n (The inverse of the number of repports needed to distinguish something from nothing)') + ax.text(0,9,1.5,"Good") + ax.text(12,-1,-9,"Bad") plt.show() makePlot() \ No newline at end of file From 5259f63c3bba024fbb8ecb20b76747dc3c45d9b3 Mon Sep 17 00:00:00 2001 From: Tom Kaitchuck Date: Fri, 8 Jan 2016 15:27:10 -0800 Subject: [PATCH 09/24] Clear cached rda files and regenerate test-data each run. --- bin/test.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bin/test.sh b/bin/test.sh index b1f2fdf5..deeb69af 100755 --- a/bin/test.sh +++ b/bin/test.sh @@ -161,8 +161,10 @@ EOF banner "Wrote testdata in $input_dir (intermediate files in $build_dir)" } + # Helper function to run decode-assoc with testdata. decode-assoc-helper() { +# Clear cached rda files and regenerate test-data each run. write-assoc-testdata local output_dir=$1 @@ -180,7 +182,7 @@ decode-assoc-helper() { --var2 flag..HTTPS \ --map1 $input_dir/domain_map.csv \ --create-bool-map \ - --max-em-iters 10 \ + --max-em-iters 1000 \ --num-cores 2 \ --output-dir $output_dir \ --tmp-dir $output_dir \ From 74f1bf9ee034deb3e81e0b057f0a477535e5c616 Mon Sep 17 00:00:00 2001 From: Tom Kaitchuck Date: Fri, 8 Jan 2016 15:29:03 -0800 Subject: [PATCH 10/24] Make sure we are using a matrix --- analysis/R/decode.R | 1 + 1 file changed, 1 insertion(+) diff --git a/analysis/R/decode.R b/analysis/R/decode.R index 3311a121..02e2e7d9 100644 --- a/analysis/R/decode.R +++ b/analysis/R/decode.R @@ -345,6 +345,7 @@ Decode <- function(counts, map, params, alpha = 0.05, stop(error_msg) } + counts = as.matrix(counts) # Make sure we are using a matrix k <- params$k p <- params$p q <- params$q From 7857c785a9863973b087807f8aa1f2bd5295a678 Mon Sep 17 00:00:00 2001 From: Tom Kaitchuck Date: Fri, 8 Jan 2016 15:30:50 -0800 Subject: [PATCH 11/24] Add new test options as examples. --- tests/regtest_spec.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py index 5a29f39e..ad7751f3 100755 --- a/tests/regtest_spec.py +++ b/tests/regtest_spec.py @@ -18,11 +18,13 @@ # (case_name distr num_unique_values num_clients values_per_client) # (num_bits num_hashes num_cohorts) # (p q f) (num_additional regexp_to_remove) + ('demo1 unif 100 100000 10', '32 1 64', '0.25 0.75 0.5', '100 v[0-9]*9$'), ('demo2 gauss 100 100000 10', '32 1 64', '0.25 0.75 0.5', '100 v[0-9]*9$'), ('demo3 exp 100 100000 10', '32 1 64', '0.25 0.75 0.5', '100 v[0-9]*9$'), ('demo4 zipf1 100 100000 10', '32 1 64', '0.25 0.75 0.5', '100 v[0-9]*9$'), ('demo5 zipf1.5 100 100000 10', '32 1 64', '0.25 0.75 0.5', '100 v[0-9]*9$'), + ) DISTRIBUTIONS = ( From 65b02c3c61fd2de22df2918009e849bfcfa32934 Mon Sep 17 00:00:00 2001 From: Tom Kaitchuck Date: Fri, 8 Jan 2016 15:32:01 -0800 Subject: [PATCH 12/24] Assign cohort randomly. --- tests/rappor_sim.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/rappor_sim.py b/tests/rappor_sim.py index acb61399..3c458479 100755 --- a/tests/rappor_sim.py +++ b/tests/rappor_sim.py @@ -112,8 +112,7 @@ def GenAssocTestdata(params1, params2, irr_rand, assoc_testdata_count, for v1, v2 in rows: client_str = 'c%d' % report_index # randint(a, b) gives i such that a <= i <= b - cohort = random.randint(0, params1.num_cohorts - 1) - + cohort = random.randint(0, params1.num_cohorts - 1) # Assign cohort randomly. string_encoder = rappor.Encoder(params1, cohort, client_str, irr_rand) bool_encoder = rappor.Encoder(params2, cohort, client_str, irr_rand) From ac8d800980a4a60c471325fc3ec6d136459532f6 Mon Sep 17 00:00:00 2001 From: Tom Kaitchuck Date: Fri, 8 Jan 2016 15:33:29 -0800 Subject: [PATCH 13/24] Making parameters a mandatory argument. --- analysis/R/read_input.R | 12 +----------- bin/decode_dist.R | 1 - tests/compare_dist.R | 1 + tests/gen_counts.R | 2 +- 4 files changed, 3 insertions(+), 13 deletions(-) diff --git a/analysis/R/read_input.R b/analysis/R/read_input.R index 47f8be5d..01a84508 100644 --- a/analysis/R/read_input.R +++ b/analysis/R/read_input.R @@ -40,18 +40,8 @@ ReadParameterFile <- function(params_file) { params } -# Handle the case of redundant cohorts, i.e. the counts file needs to be -# further aggregated to obtain counts for the number of cohorts specified in -# the params file. -# -# NOTE: Why is this happening? -AdjustCounts <- function(counts, params) { - apply(counts, 2, function(x) { - tapply(x, rep(1:params$m, nrow(counts) / params$m), sum) - }) -} +ReadCountsFile <- function(counts_file, params) { -ReadCountsFile <- function(counts_file, params, adjust_counts = FALSE) { # Read in the counts file. if (!file.exists(counts_file)) { return(NULL) diff --git a/bin/decode_dist.R b/bin/decode_dist.R index 5c83f741..4bbe1a1b 100755 --- a/bin/decode_dist.R +++ b/bin/decode_dist.R @@ -89,7 +89,6 @@ main <- function(opts) { counts <- ReadCountsFile(opts$counts, params, adjust_counts = opts$adjust_counts_hack) counts <- AdjustCounts(counts, params) - # The left-most column has totals. num_reports <- sum(counts[, 1]) diff --git a/tests/compare_dist.R b/tests/compare_dist.R index 65074c46..eb6521d3 100755 --- a/tests/compare_dist.R +++ b/tests/compare_dist.R @@ -80,6 +80,7 @@ RunRappor <- function(prefix_case, prefix_instance, ctx) { counts <- ReadCountsFile(c, ctx$params) m <- paste0(prefix_case, '_map.csv') + # Switch to LoadMapFile if want to cache the result map <- ReadMapFile(m, ctx$params) diff --git a/tests/gen_counts.R b/tests/gen_counts.R index 576cc1c6..769677c4 100755 --- a/tests/gen_counts.R +++ b/tests/gen_counts.R @@ -178,7 +178,7 @@ main <- function(argv) { params <- ReadParameterFile(params_file) - true_map <- ReadMapFile(true_map_file) + true_map <- ReadMapFile(true_map_file, params) num_unique_values <- length(true_map$strs) From 7fd35cafcc46c0b1a070df1dfa53de273db24f6b Mon Sep 17 00:00:00 2001 From: Tom Kaitchuck Date: Wed, 3 Feb 2016 17:22:22 -0800 Subject: [PATCH 14/24] Added code to visualize the effects of different parameters. --- parameter_viz.py | 82 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 parameter_viz.py diff --git a/parameter_viz.py b/parameter_viz.py new file mode 100644 index 00000000..0133f2d4 --- /dev/null +++ b/parameter_viz.py @@ -0,0 +1,82 @@ +from numpy import arange +from numpy import log + +from mpl_toolkits.mplot3d import Axes3D +import matplotlib.pyplot as plt + + +def pid(x, f, p, q): + return .5*f*(q**x + (1-q)**x) + (1-.5*f)*((1-p)**x + p**x) + +def pother(x, f, p, q): + b = .5 * f * q + (1-.5*f) * p + return ( .5*f*(q*b + (1-q)*(1-b)) + (1-.5*f)*(p*b**(x-1) + (1-p)*(1-b)**(x-1)) ) + + +def delta(x, f, p, q): + return pid(x,f,p,q) - pother(x,f,p,q) + +def predictN(prob, delta): + if delta == 0: + return float('inf') + return prob * (1-prob) * 4 / (delta**2) + +def valueFPQ(f, p, q): + total = 0 + for i in range(2,4): + total += valueIFPQ(i,f,p,q) + return total + +def valueIFPQ(i,f,p,q): + p_id = pid(i, f, p, q) + p_other = pother(i,f,p,q) + return predictN(max(p_id, p_other), abs(p_id - p_other)) + +def signal(f, p, q, h): + pStar = .5 * f * q + (1-.5*f) * p # Probability of a bit being 1 from a true value of 0 in the irr + qStar = (1-.5*f) * q + .5 * f * p + if p == q: + return 0 + elif pStar < qStar: + return 1.0 / predictN(pStar**h, (qStar**h-pStar**h)) + else: + return 1.0 / predictN((1-pStar)**h, (1-qStar)**h - (1-pStar)**h) + +def value(x) : + return - signal(x[0],x[1],x[2], 1) * valueFPQ(x[0], x[1], x[2], 1) + +def printDelta(x): + printDelta(x[1], x[0], x[2]) + +def printDelta(f, p, q): + for i in range(2,10): + p_id = pid(i,f,p,q) + p_other = pother(i,f,p,q) + print(i, p_id, ' vs ', p_other, ' delta ', p_id-p_other, ' for a sum of ', predictN(max(p_id,p_other), p_id-p_other)) + +def eInf(f, h): + return 2 * h * log( (1-.5*f)/(.5*f) ) + +def getData(): + for h in (1,2): + for f in (.125,.25,.5,.75) : + for p in (.0,.1,.25,.4,.5,.6,.75,.9,1) : + for q in (.0,.1,.25,.5,.75,.9,1) : + if abs(p-q) > 0.05 : + yield (f, p, q, h, 1/(.5*f), eInf(f,h), signal(f, p, q, h), valueIFPQ(2,f,p,q), valueIFPQ(3,f,p,q), valueIFPQ(10000,f,p,q)) + +def toColor(color): + x = max(1, min(255, int(round(color * 256.0)))) + return hex(x*256*256 + x*256 + x)[2:] + +def makePlot(): + fig = plt.figure() + ax = fig.add_subplot(111, projection='3d') + for f, p, q, h, s, e, sig, val2, val3, val1000 in getData(): + ax.scatter(e, log(val1000), log(sig), s=(h*h*10), c='%s'%f, marker='o') + ax.set_xlabel('e') + ax.set_ylabel('log(val1000)') + ax.set_zlabel('log(sig)') + plt.show() + +makePlot() \ No newline at end of file From bfc6b2a1d801d7e146d5aa238c3c6cf0c2374533 Mon Sep 17 00:00:00 2001 From: Tom Kaitchuck Date: Thu, 4 Feb 2016 14:28:00 -0800 Subject: [PATCH 15/24] Add better labels to paramerters --- parameter_viz.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/parameter_viz.py b/parameter_viz.py index 0133f2d4..ddaaf93c 100644 --- a/parameter_viz.py +++ b/parameter_viz.py @@ -55,15 +55,17 @@ def printDelta(f, p, q): print(i, p_id, ' vs ', p_other, ' delta ', p_id-p_other, ' for a sum of ', predictN(max(p_id,p_other), p_id-p_other)) def eInf(f, h): - return 2 * h * log( (1-.5*f)/(.5*f) ) + if f <= 1.0: + return 2 * h * log( (1-.5*f)/(.5*f) ) + else: + return 2 * h * log( (.5*f)/(1-.5*f) ) def getData(): for h in (1,2): - for f in (.125,.25,.5,.75) : - for p in (.0,.1,.25,.4,.5,.6,.75,.9,1) : - for q in (.0,.1,.25,.5,.75,.9,1) : - if abs(p-q) > 0.05 : - yield (f, p, q, h, 1/(.5*f), eInf(f,h), signal(f, p, q, h), valueIFPQ(2,f,p,q), valueIFPQ(3,f,p,q), valueIFPQ(10000,f,p,q)) + for f in (.125,.2,.25,.3,.4,.5,.75,1,1.25,1.5,1.75) : + for p in (.0,.1,.2,.3,.4,.5,.6,.7,.8,.9) : + for q in (.15,.25,.35,.45,.55,.65,.75,.85,1) : + yield (f, p, q, h, 1/(.5*f), eInf(f,h), signal(f, p, q, h), valueIFPQ(2,f,p,q), valueIFPQ(3,f,p,q), valueIFPQ(10000,f,p,q)) def toColor(color): x = max(1, min(255, int(round(color * 256.0)))) @@ -73,10 +75,12 @@ def makePlot(): fig = plt.figure() ax = fig.add_subplot(111, projection='3d') for f, p, q, h, s, e, sig, val2, val3, val1000 in getData(): - ax.scatter(e, log(val1000), log(sig), s=(h*h*10), c='%s'%f, marker='o') - ax.set_xlabel('e') - ax.set_ylabel('log(val1000)') - ax.set_zlabel('log(sig)') + ax.scatter(e, log(val1000), log(sig), s=(h*h*10), c=(0.5*f,p,q), marker='o') + ax.set_xlabel('e \n Epesolon of privacy bound') + ax.set_ylabel('log(val1000) \n Log of number of bits of K needed to form a identifier that could distinguish two users') + ax.set_zlabel('log(sig) \n The log scale of the amount of data gained per repport. \n (The inverse of the number of repports needed to distinguish something from nothing)') + ax.text(0,9,1.5,"Good") + ax.text(12,-1,-9,"Bad") plt.show() makePlot() \ No newline at end of file From 8b417c47d5f1b6ed61a354416820369261b8a325 Mon Sep 17 00:00:00 2001 From: Tom Kaitchuck Date: Tue, 23 Feb 2016 15:42:20 -0800 Subject: [PATCH 16/24] Remove unneeded changes introduced in merge --- analysis/R/decode.R | 1 - bin/test.sh | 4 +--- tests/compare_dist.R | 1 - tests/rappor_sim.py | 3 ++- tests/regtest_spec.py | 2 -- 5 files changed, 3 insertions(+), 8 deletions(-) diff --git a/analysis/R/decode.R b/analysis/R/decode.R index dcd6855f..02e2e7d9 100644 --- a/analysis/R/decode.R +++ b/analysis/R/decode.R @@ -339,7 +339,6 @@ CheckDecodeInputs <- function(counts, map, params) { Decode <- function(counts, map, params, alpha = 0.05, correction = c("Bonferroni"), quiet = FALSE, ...) { - counts = as.matrix(counts) error_msg <- CheckDecodeInputs(counts, map, params) if (!is.null(error_msg)) { diff --git a/bin/test.sh b/bin/test.sh index deeb69af..b1f2fdf5 100755 --- a/bin/test.sh +++ b/bin/test.sh @@ -161,10 +161,8 @@ EOF banner "Wrote testdata in $input_dir (intermediate files in $build_dir)" } - # Helper function to run decode-assoc with testdata. decode-assoc-helper() { -# Clear cached rda files and regenerate test-data each run. write-assoc-testdata local output_dir=$1 @@ -182,7 +180,7 @@ decode-assoc-helper() { --var2 flag..HTTPS \ --map1 $input_dir/domain_map.csv \ --create-bool-map \ - --max-em-iters 1000 \ + --max-em-iters 10 \ --num-cores 2 \ --output-dir $output_dir \ --tmp-dir $output_dir \ diff --git a/tests/compare_dist.R b/tests/compare_dist.R index eb6521d3..65074c46 100755 --- a/tests/compare_dist.R +++ b/tests/compare_dist.R @@ -80,7 +80,6 @@ RunRappor <- function(prefix_case, prefix_instance, ctx) { counts <- ReadCountsFile(c, ctx$params) m <- paste0(prefix_case, '_map.csv') - # Switch to LoadMapFile if want to cache the result map <- ReadMapFile(m, ctx$params) diff --git a/tests/rappor_sim.py b/tests/rappor_sim.py index 3c458479..acb61399 100755 --- a/tests/rappor_sim.py +++ b/tests/rappor_sim.py @@ -112,7 +112,8 @@ def GenAssocTestdata(params1, params2, irr_rand, assoc_testdata_count, for v1, v2 in rows: client_str = 'c%d' % report_index # randint(a, b) gives i such that a <= i <= b - cohort = random.randint(0, params1.num_cohorts - 1) # Assign cohort randomly. + cohort = random.randint(0, params1.num_cohorts - 1) + string_encoder = rappor.Encoder(params1, cohort, client_str, irr_rand) bool_encoder = rappor.Encoder(params2, cohort, client_str, irr_rand) diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py index ad7751f3..5a29f39e 100755 --- a/tests/regtest_spec.py +++ b/tests/regtest_spec.py @@ -18,13 +18,11 @@ # (case_name distr num_unique_values num_clients values_per_client) # (num_bits num_hashes num_cohorts) # (p q f) (num_additional regexp_to_remove) - ('demo1 unif 100 100000 10', '32 1 64', '0.25 0.75 0.5', '100 v[0-9]*9$'), ('demo2 gauss 100 100000 10', '32 1 64', '0.25 0.75 0.5', '100 v[0-9]*9$'), ('demo3 exp 100 100000 10', '32 1 64', '0.25 0.75 0.5', '100 v[0-9]*9$'), ('demo4 zipf1 100 100000 10', '32 1 64', '0.25 0.75 0.5', '100 v[0-9]*9$'), ('demo5 zipf1.5 100 100000 10', '32 1 64', '0.25 0.75 0.5', '100 v[0-9]*9$'), - ) DISTRIBUTIONS = ( From 822495ffd1705ff4b2eeabf2645192221ae8fa2b Mon Sep 17 00:00:00 2001 From: Tom Kaitchuck Date: Tue, 23 Feb 2016 16:14:21 -0800 Subject: [PATCH 17/24] Switch logs to be base2. --- parameter_viz.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/parameter_viz.py b/parameter_viz.py index ddaaf93c..71d801d1 100644 --- a/parameter_viz.py +++ b/parameter_viz.py @@ -56,9 +56,9 @@ def printDelta(f, p, q): def eInf(f, h): if f <= 1.0: - return 2 * h * log( (1-.5*f)/(.5*f) ) + return 2 * h * log( (1-.5*f)/(.5*f) ) / log(2) else: - return 2 * h * log( (.5*f)/(1-.5*f) ) + return 2 * h * log( (.5*f)/(1-.5*f) ) / log(2) def getData(): for h in (1,2): @@ -75,7 +75,7 @@ def makePlot(): fig = plt.figure() ax = fig.add_subplot(111, projection='3d') for f, p, q, h, s, e, sig, val2, val3, val1000 in getData(): - ax.scatter(e, log(val1000), log(sig), s=(h*h*10), c=(0.5*f,p,q), marker='o') + ax.scatter(e, log(val1000)/log(2), log(sig)/log(2), s=(h*h*10), c=(0.5*f,p,q), marker='o') ax.set_xlabel('e \n Epesolon of privacy bound') ax.set_ylabel('log(val1000) \n Log of number of bits of K needed to form a identifier that could distinguish two users') ax.set_zlabel('log(sig) \n The log scale of the amount of data gained per repport. \n (The inverse of the number of repports needed to distinguish something from nothing)') From 8b7a3c7764d61d8e4b5774c74ed2e369ce2da4a8 Mon Sep 17 00:00:00 2001 From: Tom Kaitchuck Date: Wed, 24 Feb 2016 11:51:32 -0800 Subject: [PATCH 18/24] Undo change to read_input.R --- analysis/R/read_input.R | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/analysis/R/read_input.R b/analysis/R/read_input.R index 01a84508..55928431 100644 --- a/analysis/R/read_input.R +++ b/analysis/R/read_input.R @@ -40,8 +40,18 @@ ReadParameterFile <- function(params_file) { params } -ReadCountsFile <- function(counts_file, params) { - +# Handle the case of redundant cohorts, i.e. the counts file needs to be +# further aggregated to obtain counts for the number of cohorts specified in +# the params file. +# +# NOTE: Why is this happening? +AdjustCounts <- function(counts, params) { + apply(counts, 2, function(x) { + tapply(x, rep(1:params$m, nrow(counts) / params$m), sum) + }) +} + +ReadCountsFile <- function(counts_file, params, adjust_counts = FALSE) { # Read in the counts file. if (!file.exists(counts_file)) { return(NULL) From 6d4ff16fde931cdc678ddd8cb1a6836f1718abc8 Mon Sep 17 00:00:00 2001 From: Tom Kaitchuck Date: Wed, 24 Feb 2016 11:54:33 -0800 Subject: [PATCH 19/24] Remove whitespace --- analysis/R/read_input.R | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/analysis/R/read_input.R b/analysis/R/read_input.R index 55928431..dbcb980a 100644 --- a/analysis/R/read_input.R +++ b/analysis/R/read_input.R @@ -41,16 +41,16 @@ ReadParameterFile <- function(params_file) { } # Handle the case of redundant cohorts, i.e. the counts file needs to be -# further aggregated to obtain counts for the number of cohorts specified in -# the params file. -# -# NOTE: Why is this happening? -AdjustCounts <- function(counts, params) { - apply(counts, 2, function(x) { - tapply(x, rep(1:params$m, nrow(counts) / params$m), sum) - }) -} - +# further aggregated to obtain counts for the number of cohorts specified in +# the params file. +# +# NOTE: Why is this happening? +AdjustCounts <- function(counts, params) { + apply(counts, 2, function(x) { + tapply(x, rep(1:params$m, nrow(counts) / params$m), sum) + }) +} + ReadCountsFile <- function(counts_file, params, adjust_counts = FALSE) { # Read in the counts file. if (!file.exists(counts_file)) { From 4c6a8f1d8e55c4897c1894fbe6d65ba55d76c029 Mon Sep 17 00:00:00 2001 From: Tom Kaitchuck Date: Wed, 24 Feb 2016 11:55:10 -0800 Subject: [PATCH 20/24] Remove whitespace --- analysis/R/read_input.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/analysis/R/read_input.R b/analysis/R/read_input.R index dbcb980a..47f8be5d 100644 --- a/analysis/R/read_input.R +++ b/analysis/R/read_input.R @@ -47,7 +47,7 @@ ReadParameterFile <- function(params_file) { # NOTE: Why is this happening? AdjustCounts <- function(counts, params) { apply(counts, 2, function(x) { - tapply(x, rep(1:params$m, nrow(counts) / params$m), sum) + tapply(x, rep(1:params$m, nrow(counts) / params$m), sum) }) } From 8cffc74d9b8ea761fb227cf6456d65b1c4c629a4 Mon Sep 17 00:00:00 2001 From: Tom Kaitchuck Date: Wed, 24 Feb 2016 13:07:39 -0800 Subject: [PATCH 21/24] Cleanup parameter_viz. --- parameter_viz.py | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/parameter_viz.py b/parameter_viz.py index ddaaf93c..d5b2bfb4 100644 --- a/parameter_viz.py +++ b/parameter_viz.py @@ -21,12 +21,6 @@ def predictN(prob, delta): return float('inf') return prob * (1-prob) * 4 / (delta**2) -def valueFPQ(f, p, q): - total = 0 - for i in range(2,4): - total += valueIFPQ(i,f,p,q) - return total - def valueIFPQ(i,f,p,q): p_id = pid(i, f, p, q) p_other = pother(i,f,p,q) @@ -42,9 +36,6 @@ def signal(f, p, q, h): else: return 1.0 / predictN((1-pStar)**h, (1-qStar)**h - (1-pStar)**h) -def value(x) : - return - signal(x[0],x[1],x[2], 1) * valueFPQ(x[0], x[1], x[2], 1) - def printDelta(x): printDelta(x[1], x[0], x[2]) @@ -74,10 +65,10 @@ def toColor(color): def makePlot(): fig = plt.figure() ax = fig.add_subplot(111, projection='3d') - for f, p, q, h, s, e, sig, val2, val3, val1000 in getData(): - ax.scatter(e, log(val1000), log(sig), s=(h*h*10), c=(0.5*f,p,q), marker='o') + for f, p, q, h, s, e, sig, val2, val3, val10000 in getData(): + ax.scatter(e, log(val10000), log(sig), s=(h*h*10), c=(0.5*f,p,q), marker='o') ax.set_xlabel('e \n Epesolon of privacy bound') - ax.set_ylabel('log(val1000) \n Log of number of bits of K needed to form a identifier that could distinguish two users') + ax.set_ylabel('log(val10000) \n Log of number of bits of K needed to form a identifier that could distinguish two users') ax.set_zlabel('log(sig) \n The log scale of the amount of data gained per repport. \n (The inverse of the number of repports needed to distinguish something from nothing)') ax.text(0,9,1.5,"Good") ax.text(12,-1,-9,"Bad") From b02e6675882efde699dc259dcfc073c33200a0ae Mon Sep 17 00:00:00 2001 From: Tom Kaitchuck Date: Thu, 25 Feb 2016 15:15:10 -0800 Subject: [PATCH 22/24] Add code to print optimal P and Q values for a given h, f, k set. --- parameter_viz.py | 75 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 58 insertions(+), 17 deletions(-) diff --git a/parameter_viz.py b/parameter_viz.py index d3690d7f..c56a519f 100644 --- a/parameter_viz.py +++ b/parameter_viz.py @@ -1,5 +1,7 @@ from numpy import arange from numpy import log +from numpy import linspace +from numpy import floor from mpl_toolkits.mplot3d import Axes3D import matplotlib.pyplot as plt @@ -26,15 +28,19 @@ def valueIFPQ(i,f,p,q): p_other = pother(i,f,p,q) return predictN(max(p_id, p_other), abs(p_id - p_other)) -def signal(f, p, q, h): +def signal(f, p, q, h, k): pStar = .5 * f * q + (1-.5*f) * p # Probability of a bit being 1 from a true value of 0 in the irr - qStar = (1-.5*f) * q + .5 * f * p - if p == q: + qStar = (1-.5*f) * q + .5 * f * p # Probability of a bit being 0 from a true value of 1 in the irr + if k <= 1: return 0 - elif pStar < qStar: - return 1.0 / predictN(pStar**h, (qStar**h-pStar**h)) + probCollision = (1.0 * h) / k + qPrime = qStar*(1-probCollision) + (probCollision*pStar) + if pStar == qPrime: + return 0 + elif pStar < qPrime: + return 1.0 / predictN(pStar**h, qPrime**h - pStar**h) else: - return 1.0 / predictN((1-pStar)**h, (1-qStar)**h - (1-pStar)**h) + return 1.0 / predictN((1-pStar)**h, (1-qPrime)**h - (1-pStar)**h) def printDelta(x): printDelta(x[1], x[0], x[2]) @@ -45,18 +51,23 @@ def printDelta(f, p, q): p_other = pother(i,f,p,q) print(i, p_id, ' vs ', p_other, ' delta ', p_id-p_other, ' for a sum of ', predictN(max(p_id,p_other), p_id-p_other)) +def toPow2(x): + if x<=1: + return 0 + return 2**floor(log(x)/log(2)) + def eInf(f, h): if f <= 1.0: return 2 * h * log( (1-.5*f)/(.5*f) ) / log(2) else: return 2 * h * log( (.5*f)/(1-.5*f) ) / log(2) -def getData(): - for h in (1,2): - for f in (.125,.2,.25,.3,.4,.5,.75,1,1.25,1.5,1.75) : - for p in (.0,.1,.2,.3,.4,.5,.6,.7,.8,.9) : - for q in (.15,.25,.35,.45,.55,.65,.75,.85,1) : - yield (f, p, q, h, 1/(.5*f), eInf(f,h), signal(f, p, q, h), valueIFPQ(2,f,p,q), valueIFPQ(3,f,p,q), valueIFPQ(10000,f,p,q)) +def getData(h): + for f in (.125,.2,.25,.3,.4,.5,.75,1,1.25,1.5,1.75) : + for p in (.0,.1,.2,.3,.4,.5,.6,.7,.8,.9) : + for q in (.15,.25,.35,.45,.55,.65,.75,.85,1) : + maxk = toPow2(valueIFPQ(2,f,p,q)) + yield (f, p, q, h, maxk, 1/(.5*f), eInf(f,h), signal(f, p, q, h, maxk), valueIFPQ(2,f,p,q), valueIFPQ(10000,f,p,q)) def toColor(color): x = max(1, min(255, int(round(color * 256.0)))) @@ -65,13 +76,43 @@ def toColor(color): def makePlot(): fig = plt.figure() ax = fig.add_subplot(111, projection='3d') - for f, p, q, h, s, e, sig, val2, val3, val10000 in getData(): - ax.scatter(e, log(val10000)/log(2), log(sig)/log(2), s=(h*h*10), c=(0.5*f,p,q), marker='o') + for f, p, q, h, maxk, s, e, sig, val2, val10000 in getData(1): + ax.scatter(e, log(val10000)/log(2), log(sig)/log(2), s=5*log(maxk), c=(0.5*f,p,q), marker='o') ax.set_xlabel('e \n Epsilon of privacy bound') ax.set_ylabel('log(val10000) \n Log of number of bits of K needed to form a identifier that could distinguish two users') ax.set_zlabel('log(sig) \n The log scale of the amount of data gained per repport. \n (The inverse of the number of repports needed to distinguish something from nothing)') - ax.text(0,9,1.5,"Good") - ax.text(12,-1,-9,"Bad") + ax.text(1,9,3,"Good") + ax.text(9,-2,-11,"Bad") plt.show() -makePlot() \ No newline at end of file +makePlot() + + +def value(f, h, k, p, q): + maxk = floor(valueIFPQ(2,f,p,q)) + if maxk < k: + return -maxk + return signal(f, p, q, h, k) + + +def printOptimalPQ(): + print("Optimal choices for P and Q for varrious values:") + for f in (.125,.2,.25,.3,.4,.5,.75): + for h in (1,2): + for k in (8,32,64,126,256): + bestScore = value(f,h,k,.25,.75) + for p in linspace(0.0,1.0,101): + for q in linspace(0.0,1.0,101): + score = value(f,h,k,p,q) + if score > bestScore: + bestScore = score + for p in linspace(0.0,1.0,101): + for q in linspace(0.0,1.0,101): + p=round(p,4) + q=round(q,4) + score = value(f,h,k,p,q) + if score * 1.01 > bestScore: + print( {"h":h,"k":k,"f":f,"p":p,"q":q,"signal":signal(f, p, q, h, k)} ) + +print() +printOptimalPQ() \ No newline at end of file From 87d5309c1f42deafb0e871d4f9a7a8e99f7cfb02 Mon Sep 17 00:00:00 2001 From: Tom Kaitchuck Date: Thu, 25 Feb 2016 16:21:33 -0800 Subject: [PATCH 23/24] Better visualization of optimal parameters. --- parameter_viz.py | 53 +++++++++++++++++++++++++++++++----------------- 1 file changed, 34 insertions(+), 19 deletions(-) diff --git a/parameter_viz.py b/parameter_viz.py index c56a519f..07b7077b 100644 --- a/parameter_viz.py +++ b/parameter_viz.py @@ -62,31 +62,35 @@ def eInf(f, h): else: return 2 * h * log( (.5*f)/(1-.5*f) ) / log(2) -def getData(h): - for f in (.125,.2,.25,.3,.4,.5,.75,1,1.25,1.5,1.75) : - for p in (.0,.1,.2,.3,.4,.5,.6,.7,.8,.9) : - for q in (.15,.25,.35,.45,.55,.65,.75,.85,1) : - maxk = toPow2(valueIFPQ(2,f,p,q)) - yield (f, p, q, h, maxk, 1/(.5*f), eInf(f,h), signal(f, p, q, h, maxk), valueIFPQ(2,f,p,q), valueIFPQ(10000,f,p,q)) +def getData(): + for h in (1, 2): + for f in (.125,.2,.25,.3,.4,.5,.75,1,1.25,1.5,1.75) : + for p in (.0,.1,.2,.3,.4,.5,.6,.7,.8,.9) : + for q in (.15,.25,.35,.45,.55,.65,.75,.85,1) : + maxk = toPow2(valueIFPQ(2,f,p,q)) + sig = signal(f, p, q, h, maxk) + e = eInf(f,h) + if sig > 0 and e < 10: + yield (f, p, q, h, maxk, e, 1/sig, valueIFPQ(2,f,p,q), valueIFPQ(10000,f,p,q)) def toColor(color): x = max(1, min(255, int(round(color * 256.0)))) return hex(x*256*256 + x*256 + x)[2:] -def makePlot(): +def makePlot(pointGenerator): fig = plt.figure() ax = fig.add_subplot(111, projection='3d') - for f, p, q, h, maxk, s, e, sig, val2, val10000 in getData(1): - ax.scatter(e, log(val10000)/log(2), log(sig)/log(2), s=5*log(maxk), c=(0.5*f,p,q), marker='o') + for f, p, q, h, maxk, e, detThres, val2, val10000 in pointGenerator(): + ax.scatter(e, log(val10000)/log(2), log(detThres)/log(2), s=h*20, c=(0.5*f,p,q), marker='o') + ax.view_init(elev=20.,azim=45) + ax.invert_zaxis() ax.set_xlabel('e \n Epsilon of privacy bound') ax.set_ylabel('log(val10000) \n Log of number of bits of K needed to form a identifier that could distinguish two users') - ax.set_zlabel('log(sig) \n The log scale of the amount of data gained per repport. \n (The inverse of the number of repports needed to distinguish something from nothing)') - ax.text(1,9,3,"Good") - ax.text(9,-2,-11,"Bad") + ax.set_zlabel('Detectability theashold \n The log base 2 of the number of repports needed to detect a value') + ax.text(1,10,0,"Good") + ax.text(9,-2,12,"Bad") plt.show() -makePlot() - def value(f, h, k, p, q): maxk = floor(valueIFPQ(2,f,p,q)) @@ -95,10 +99,11 @@ def value(f, h, k, p, q): return signal(f, p, q, h, k) -def printOptimalPQ(): - print("Optimal choices for P and Q for varrious values:") +def getOptimalPQ(): for f in (.125,.2,.25,.3,.4,.5,.75): for h in (1,2): + if h>1 and f<.4: + continue for k in (8,32,64,126,256): bestScore = value(f,h,k,.25,.75) for p in linspace(0.0,1.0,101): @@ -112,7 +117,17 @@ def printOptimalPQ(): q=round(q,4) score = value(f,h,k,p,q) if score * 1.01 > bestScore: - print( {"h":h,"k":k,"f":f,"p":p,"q":q,"signal":signal(f, p, q, h, k)} ) + yield ( f, p, q, h, k, eInf(f,h), 1/signal(f, p, q, h, k), valueIFPQ(2,f,p,q), valueIFPQ(10000,f,p,q) ) -print() -printOptimalPQ() \ No newline at end of file +def printOptimalPQ(): + print("Optimal choices for P and Q for varrious values:") + for f, p, q, h, k, e, detThres, val2, val10000 in getOptimalPQ(): + print( 'h={}, k={:3}, f={:4}, p={:4}, q={:4}, epislon={:5}, signal={:5}'.format(h,k,f,p,q,e,detThres) ) + +print +print("Showing a plot shoing various points in the space. (Not nessicarly optimal ones)") +makePlot(getData) +print +printOptimalPQ() +print +makePlot(getOptimalPQ) \ No newline at end of file From 126656211466fb6be91b72547c035851211bda59 Mon Sep 17 00:00:00 2001 From: Tom Kaitchuck Date: Thu, 25 Feb 2016 18:38:32 -0800 Subject: [PATCH 24/24] Compute optimal points and graph them. --- parameter_viz.py | 96 ++++++++++++++++++++++++++++++------------------ 1 file changed, 61 insertions(+), 35 deletions(-) diff --git a/parameter_viz.py b/parameter_viz.py index 07b7077b..18b123c8 100644 --- a/parameter_viz.py +++ b/parameter_viz.py @@ -2,6 +2,8 @@ from numpy import log from numpy import linspace from numpy import floor +from numpy import ceil +import math from mpl_toolkits.mplot3d import Axes3D import matplotlib.pyplot as plt @@ -21,26 +23,26 @@ def delta(x, f, p, q): def predictN(prob, delta): if delta == 0: return float('inf') - return prob * (1-prob) * 4 / (delta**2) + return max(1, prob * (1-prob) * 4 / (delta**2)) def valueIFPQ(i,f,p,q): p_id = pid(i, f, p, q) p_other = pother(i,f,p,q) return predictN(max(p_id, p_other), abs(p_id - p_other)) -def signal(f, p, q, h, k): +def detetectionThreshold(f, p, q, h, k): pStar = .5 * f * q + (1-.5*f) * p # Probability of a bit being 1 from a true value of 0 in the irr qStar = (1-.5*f) * q + .5 * f * p # Probability of a bit being 0 from a true value of 1 in the irr if k <= 1: - return 0 + return float("inf") probCollision = (1.0 * h) / k qPrime = qStar*(1-probCollision) + (probCollision*pStar) if pStar == qPrime: - return 0 + return float("inf") elif pStar < qPrime: - return 1.0 / predictN(pStar**h, qPrime**h - pStar**h) + return predictN(pStar**h, qPrime**h - pStar**h) else: - return 1.0 / predictN((1-pStar)**h, (1-qPrime)**h - (1-pStar)**h) + return predictN((1-pStar)**h, (1-qPrime)**h - (1-pStar)**h) def printDelta(x): printDelta(x[1], x[0], x[2]) @@ -68,10 +70,11 @@ def getData(): for p in (.0,.1,.2,.3,.4,.5,.6,.7,.8,.9) : for q in (.15,.25,.35,.45,.55,.65,.75,.85,1) : maxk = toPow2(valueIFPQ(2,f,p,q)) - sig = signal(f, p, q, h, maxk) + detThres = detetectionThreshold(f, p, q, h, maxk) e = eInf(f,h) - if sig > 0 and e < 10: - yield (f, p, q, h, maxk, e, 1/sig, valueIFPQ(2,f,p,q), valueIFPQ(10000,f,p,q)) + tradeoff = eInf(f,h) * detThres + if not math.isinf(detThres) and e < 10: + yield (f, p, q, h, maxk, e, detThres, valueIFPQ(2,f,p,q), valueIFPQ(10000,f,p,q), tradeoff) def toColor(color): x = max(1, min(255, int(round(color * 256.0)))) @@ -80,54 +83,77 @@ def toColor(color): def makePlot(pointGenerator): fig = plt.figure() ax = fig.add_subplot(111, projection='3d') - for f, p, q, h, maxk, e, detThres, val2, val10000 in pointGenerator(): - ax.scatter(e, log(val10000)/log(2), log(detThres)/log(2), s=h*20, c=(0.5*f,p,q), marker='o') + for f, p, q, h, maxk, e, detThres, val2, val10000, tradeoff in pointGenerator(): + ax.scatter(e, log(val10000)/log(2), log(ceil(detThres))/log(2), s=h*h*20, c=(0.5*f,p,q), marker='o') ax.view_init(elev=20.,azim=45) ax.invert_zaxis() ax.set_xlabel('e \n Epsilon of privacy bound') ax.set_ylabel('log(val10000) \n Log of number of bits of K needed to form a identifier that could distinguish two users') ax.set_zlabel('Detectability theashold \n The log base 2 of the number of repports needed to detect a value') - ax.text(1,10,0,"Good") + ax.text(1,10,1,"Good") ax.text(9,-2,12,"Bad") plt.show() -def value(f, h, k, p, q): +def value(f, p, q, h, k): maxk = floor(valueIFPQ(2,f,p,q)) if maxk < k: - return -maxk - return signal(f, p, q, h, k) + return float("inf") + return detetectionThreshold(f, p, q, h, k) def getOptimalPQ(): - for f in (.125,.2,.25,.3,.4,.5,.75): + epislons = {} + for f in (.75,.5,.4,.333,.25,.2,.15,.125): for h in (1,2): - if h>1 and f<.4: - continue - for k in (8,32,64,126,256): - bestScore = value(f,h,k,.25,.75) - for p in linspace(0.0,1.0,101): - for q in linspace(0.0,1.0,101): - score = value(f,h,k,p,q) - if score > bestScore: - bestScore = score - for p in linspace(0.0,1.0,101): - for q in linspace(0.0,1.0,101): - p=round(p,4) - q=round(q,4) - score = value(f,h,k,p,q) - if score * 1.01 > bestScore: - yield ( f, p, q, h, k, eInf(f,h), 1/signal(f, p, q, h, k), valueIFPQ(2,f,p,q), valueIFPQ(10000,f,p,q) ) + e = eInf(f,h) + epislons[e] = (f,h) + + smallestTradeoff = float("inf") + for e in sorted(epislons.iterkeys()): + f = epislons[e][0] + h = epislons[e][1] + smallestTradeoffForE = float("inf") + if h>1 and f<.4: + continue + for k in (8,32,64,126,256): + lowest = float("inf") + for p in linspace(0.0,1.0,101): + for q in linspace(0.0,1.0,101): + detThres = value(f, p, q, h, k) + if detThres < lowest: + lowest = detThres + for p in linspace(0.0,1.0,101): + for q in linspace(0.0,1.0,101): + p=round(p,4) + q=round(q,4) + detThres = value(f, p, q, h, k) + if not math.isinf(detThres) and detThres < lowest * 1.01: + tradeoff = e * detThres + if tradeoff < smallestTradeoff: + yield ( f, p, q, h, k, e, detThres, valueIFPQ(2,f,p,q), valueIFPQ(10000,f,p,q), tradeoff ) + if tradeoff < smallestTradeoffForE: + smallestTradeoffForE = tradeoff + if smallestTradeoffForE < smallestTradeoff: + smallestTradeoff = smallestTradeoffForE def printOptimalPQ(): print("Optimal choices for P and Q for varrious values:") - for f, p, q, h, k, e, detThres, val2, val10000 in getOptimalPQ(): - print( 'h={}, k={:3}, f={:4}, p={:4}, q={:4}, epislon={:5}, signal={:5}'.format(h,k,f,p,q,e,detThres) ) + for f, p, q, h, k, e, detThres, val2, val10000, tradeoff in getOptimalPQ(): + print( 'h={}, k={:3}, f={:4}, p={:4}, q={:4}, epislon={:5}, detThres={}'.format(h,k,f,p,q, round(e,4), ceil(detThres)) ) print print("Showing a plot shoing various points in the space. (Not nessicarly optimal ones)") makePlot(getData) print +print("Computing points on the optimal frontier") printOptimalPQ() print -makePlot(getOptimalPQ) \ No newline at end of file +print("Plotting the optimal set") +makePlot(getOptimalPQ) + +print("As you can see the optimal points have a few properties in common:") +print("h is always 1") +print("f is never below .2 (Though the exact lower bound of this threashold will require more experimentation)") +print("P and Q are always some extreme (one of them is either 1 or 0)") +print("Higher values of K corilate with higher values of F")