From 64ac6ada8f9e97a3e56b3db9e038400e66284f46 Mon Sep 17 00:00:00 2001
From: "Konstantin (Tino) Sering"
Date: Tue, 11 Aug 2020 00:03:01 +0200
Subject: [PATCH] Add masking functionality

---
 pyndl/__init__.py                         |   2 +-
 pyndl/error_codes.pxd                     |  14 ++-
 pyndl/ndl.py                              | 123 ++++++++++++++++---
 pyndl/ndl_openmp.pyx                      |  56 ++++++++-
 pyndl/ndl_parallel.pxd                    |   8 +-
 pyndl/ndl_parallel.pyx                    | 141 +++++++++++++++++++++-
 tests/resources/event_file_masking.tab.gz | Bin 0 -> 89 bytes
 tests/test_ndl.py                         |  71 ++++++++---
 8 files changed, 369 insertions(+), 46 deletions(-)
 create mode 100644 tests/resources/event_file_masking.tab.gz

diff --git a/pyndl/__init__.py b/pyndl/__init__.py
index d9dba2e..0948914 100644
--- a/pyndl/__init__.py
+++ b/pyndl/__init__.py
@@ -17,7 +17,7 @@
 __author__ = ('Konstantin Sering, Marc Weitz, '
               'David-Elias Künstle, Lennard Schneider')
 __author_email__ = 'konstantin.sering@uni-tuebingen.de'
-__version__ = '0.7.1'
+__version__ = '0.8.0'
 __license__ = 'MIT'
 __description__ = ('Naive discriminative learning implements learning and '
                    'classification models based on the Rescorla-Wagner '
diff --git a/pyndl/error_codes.pxd b/pyndl/error_codes.pxd
index 41cc294..9d4fb14 100644
--- a/pyndl/error_codes.pxd
+++ b/pyndl/error_codes.pxd
@@ -3,11 +3,13 @@ cdef enum ErrorCode:
     MAGIC_NUMBER_DOES_NOT_MATCH = 1
     VERSION_NUMBER_DOES_NOT_MATCH = 2
     INITIAL_ERROR_CODE = 3
+    START_LARGER_END = 4
 
-ERROR_CODES = """
-    NO_ERROR = 0
-    MAGIC_NUMBER_DOES_NOT_MATCH = 1
-    VERSION_NUMBER_DOES_NOT_MATCH = 2
-    INITIAL_ERROR_CODE = 3
-    """
+#ERROR_CODES = """
+#    NO_ERROR = 0
+#    MAGIC_NUMBER_DOES_NOT_MATCH = 1
+#    VERSION_NUMBER_DOES_NOT_MATCH = 2
+#    INITIAL_ERROR_CODE = 3
+#    START_LARGER_END = 4
+#    """
diff --git a/pyndl/ndl.py b/pyndl/ndl.py
index ab54fad..08fa4b1 100644
--- a/pyndl/ndl.py
+++ b/pyndl/ndl.py
@@ -43,11 +43,43 @@ def events_from_file(event_path):
     return io.events_from_file(event_path)
 
 
+def _create_cue_outcome_map(cues, outcomes, cues_to_mask, old_cues=frozenset(), old_outcomes=frozenset()):
+    """This function returns mask_up_to_excluding together with a cue_map and
+    an outcome_map in which the cues_to_mask occupy the first
+    mask_up_to_excluding indices of both maps."""
+
+    cues = set(cues) | set(old_cues)
+    outcomes = set(outcomes) | set(old_outcomes)
+
+    # cues to mask have to come at the beginning of the cue_map and we need
+    # the same number of indices in the outcome_map
+    if cues_to_mask == 'all':
+        cues_to_mask = set(cues)
+    elif cues_to_mask is None:
+        cues_to_mask = set()
+    mask_up_to_excluding = len(cues_to_mask)  # exclusive upper bound of the masked indices
+    cues_not_to_mask = set(cues) - cues_to_mask
+    outcomes_not_to_mask = set(outcomes) - cues_to_mask
+
+    # fix the order of the sets
+    cues_to_mask = list(cues_to_mask)
+    cues_not_to_mask = list(cues_not_to_mask)
+    outcomes_not_to_mask = list(outcomes_not_to_mask)
+
+    # reassemble cues and outcomes with the specific ordering now
+    cues = cues_to_mask + cues_not_to_mask
+    outcomes = cues_to_mask + outcomes_not_to_mask
+
+    cue_map = OrderedDict(((cue, ii) for ii, cue in enumerate(cues)))
+    outcome_map = OrderedDict(((outcome, ii) for ii, outcome in enumerate(outcomes)))
+
+    return mask_up_to_excluding, cue_map, outcome_map
+
+
 def ndl(events, alpha, betas, lambda_=1.0, *,
         method='openmp', weights=None,
         number_of_threads=8, len_sublists=10, remove_duplicates=None,
         verbose=False, temporary_directory=None,
-        events_per_temporary_file=10000000):
+        events_per_temporary_file=10000000, cues_to_mask=None):
     """
     Calculate the weights for all_outcomes over all events in event_file
     given by the files path.
@@ -85,7 +117,11 @@ def ndl(events, alpha, betas, lambda_=1.0, *,
         if none is provided, the operating system's default will be used
         (/tmp on unix)
     events_per_temporary_file: int
-        Number of events in each temporary binary file. Has to be larger than 1
+        number of events in each temporary binary file. Has to be larger than 1
+    cues_to_mask: set of cues or None or 'all'
+        if None, no masking is applied; otherwise, every cue in the set is
+        masked from itself whenever it also appears as an outcome in the
+        learning events; 'all' indicates that all cues should be masked
 
     Returns
     -------
@@ -111,30 +147,33 @@ def ndl(events, alpha, betas, lambda_=1.0, *,
                                          verbose=verbose)
     cues = list(cues.keys())
     outcomes = list(outcomes.keys())
-    cue_map = OrderedDict(((cue, ii) for ii, cue in enumerate(cues)))
-    outcome_map = OrderedDict(((outcome, ii) for ii, outcome in enumerate(outcomes)))
-
-    all_outcome_indices = [outcome_map[outcome] for outcome in outcomes]
-
-    shape = (len(outcome_map), len(cue_map))
 
     # initialize weights
     if weights is None:
+        mask_up_to_excluding, cue_map, outcome_map = _create_cue_outcome_map(cues, outcomes, cues_to_mask)
+        shape = (len(outcome_map), len(cue_map))
         weights = np.ascontiguousarray(np.zeros(shape, dtype=np.float64, order='C'))
     elif isinstance(weights, xr.DataArray):
         old_cues = weights.coords["cues"].values.tolist()
-        new_cues = list(set(cues) - set(old_cues))
         old_outcomes = weights.coords["outcomes"].values.tolist()
-        new_outcomes = list(set(outcomes) - set(old_outcomes))
+        if cues_to_mask is None:
+            mask_up_to_excluding = 0
+        else:
+            mask_up_to_excluding, cue_map, outcome_map = _create_cue_outcome_map(cues, outcomes,
+                                                                                 cues_to_mask, old_cues, old_outcomes)
+            # TODO: allocate weights and copy them cell-wise from the old
+            # weights to the new weights
+            raise NotImplementedError('continue learning is not yet implemented for masking')
+
+        new_cues = list(set(cues) - set(old_cues))
+        new_outcomes = list(set(outcomes) - set(old_outcomes))
         cues = old_cues + new_cues
         outcomes = old_outcomes + new_outcomes
         cue_map = OrderedDict(((cue, ii) for ii, cue in enumerate(cues)))
         outcome_map = OrderedDict(((outcome, ii) for ii, outcome in enumerate(outcomes)))
-        all_outcome_indices = [outcome_map[outcome] for outcome in outcomes]
-
         weights_tmp = np.concatenate((weights.values,
                                       np.zeros((len(new_outcomes), len(old_cues)),
                                                dtype=np.float64, order='C')),
@@ -168,16 +207,25 @@ def ndl(events, alpha, betas, lambda_=1.0, *,
     if verbose:
         print('start learning...')
     # learning
+    all_outcome_indices_masked = list(range(mask_up_to_excluding))
+    all_outcome_indices_normal = list(range(mask_up_to_excluding, len(outcome_map)))
     if method == 'openmp':
         if sys.platform.startswith('darwin'):
             raise NotImplementedError("OpenMP does not work under MacOs yet."
                                       "Use method='threading' instead.")
+        # 1. learn masked indices
+        ndl_openmp.learn_inplace_masked(binary_files, weights, alpha,
+                                        beta1, beta2, lambda_,
+                                        np.array(all_outcome_indices_masked, dtype=np.uint32),
+                                        len_sublists, number_of_threads)
+        # 2. learn normal indices
         ndl_openmp.learn_inplace(binary_files, weights, alpha,
                                  beta1, beta2, lambda_,
-                                 np.array(all_outcome_indices, dtype=np.uint32),
+                                 np.array(all_outcome_indices_normal, dtype=np.uint32),
                                  len_sublists, number_of_threads)
     elif method == 'threading':
-        part_lists = slice_list(all_outcome_indices, len_sublists)
+        # 1. learn all masked indices
+        part_lists = slice_list(all_outcome_indices_masked, len_sublists)
 
         working_queue = Queue(len(part_lists))
         threads = []
@@ -189,7 +237,7 @@ def worker():
                 if working_queue.empty():
                     break
                 data = working_queue.get()
-                ndl_parallel.learn_inplace(binary_files, weights, alpha,
+                ndl_parallel.learn_inplace_masked(binary_files, weights, alpha,
                                            beta1, beta2, lambda_, data)
 
         with queue_lock:
@@ -203,6 +251,36 @@ def worker():
         for thread in threads:
             thread.join()
+
+        # 2. learn all normal (unmasked) indices
+        part_lists = slice_list(all_outcome_indices_normal, len_sublists)
+
+        working_queue = Queue(len(part_lists))
+        threads = []
+        queue_lock = threading.Lock()
+
+        def worker():
+            while True:
+                with queue_lock:
+                    if working_queue.empty():
+                        break
+                    data = working_queue.get()
+                ndl_parallel.learn_inplace(binary_files, weights,
+                                           alpha, beta1, beta2,
+                                           lambda_, data)
+
+        with queue_lock:
+            for partlist in part_lists:
+                working_queue.put(np.array(partlist, dtype=np.uint32))
+
+        for _ in range(number_of_threads):
+            thread = threading.Thread(target=worker)
+            thread.start()
+            threads.append(thread)
+
+        for thread in threads:
+            thread.join()
+
 
     else:
         raise ValueError('method needs to be either "threading" or "openmp"')
 
@@ -220,6 +298,9 @@ def worker():
                            __name__ + "." + ndl.__name__, method=method,
                            attrs=attrs_to_be_updated)
     # post-processing
+    # extract the right ordering from the cue and outcome maps
+    cues = list(cue_map.keys())
+    outcomes = list(outcome_map.keys())
     weights = xr.DataArray(weights, [('outcomes', outcomes), ('cues', cues)],
                            attrs=attrs)
     return weights
@@ -312,7 +393,7 @@ def attrs(self, attrs):
 
 def dict_ndl(events, alphas, betas, lambda_=1.0, *,
              weights=None, inplace=False, remove_duplicates=None,
-             make_data_array=False, verbose=False):
+             make_data_array=False, verbose=False, cues_to_mask=None):
     """
     Calculate the weights for all_outcomes over all events in event_file.
 
@@ -347,6 +428,10 @@ def dict_ndl(events, alphas, betas, lambda_=1.0, *,
         if True makes a xarray.DataArray out of the dict of dicts.
     verbose : bool
         print some output if True.
+    cues_to_mask: set of cues or None or 'all'
+        if None, no masking is applied; otherwise, every cue in the set is
+        masked from itself whenever it also appears as an outcome in the
+        learning events; 'all' indicates that all cues should be masked
 
     Returns
     -------
@@ -370,6 +455,9 @@ def dict_ndl(events, alphas, betas, lambda_=1.0, *,
     if not (remove_duplicates is None or isinstance(remove_duplicates, bool)):
         raise ValueError("remove_duplicates must be None, True or False")
 
+    if cues_to_mask is None:
+        cues_to_mask = set()
+
     wall_time_start = time.perf_counter()
     cpu_time_start = time.process_time()
     if isinstance(events, str):
@@ -434,6 +522,9 @@ def dict_ndl(events, alphas, betas, lambda_=1.0, *,
             else:
                 update = beta2 * (0 - association_strength)
             for cue in cues:
+                if cues_to_mask == 'all' or cue in cues_to_mask:
+                    if cue == outcome:
+                        continue
                 weights[outcome][cue] += alphas[cue] * update
 
     cpu_time_stop = time.process_time()
diff --git a/pyndl/ndl_openmp.pyx b/pyndl/ndl_openmp.pyx
index 6ebd07b..f071dc6 100644
--- a/pyndl/ndl_openmp.pyx
+++ b/pyndl/ndl_openmp.pyx
@@ -5,8 +5,8 @@ ctypedef np.float64_t dtype_t
 cimport cython
 from cython.parallel cimport parallel, prange
 
-from ndl_parallel cimport learn_inplace_ptr
-from error_codes cimport ErrorCode, NO_ERROR, INITIAL_ERROR_CODE, ERROR_CODES
+from ndl_parallel cimport learn_inplace_ptr, learn_inplace_masked_ptr
+from error_codes cimport ErrorCode, NO_ERROR, INITIAL_ERROR_CODE
 
 
 def learn_inplace(binary_file_paths, np.ndarray[dtype_t, ndim=2] weights,
@@ -23,6 +23,8 @@ def learn_inplace(binary_file_paths, np.ndarray[dtype_t, ndim=2] weights,
     cdef unsigned int start_val, end_val, ii, number_parts
     cdef ErrorCode error = INITIAL_ERROR_CODE
+    if length_all_outcomes == 0:
+        return
 
     # cdef String
     # weights muss contigousarray sein und mode=c, siehe:
@@ -48,4 +50,52 @@ def learn_inplace(binary_file_paths, np.ndarray[dtype_t, ndim=2] weights,
                     break
 
     if (error != NO_ERROR):
-        raise IOError(f'binary files does not have proper format, error code {error}\n{ERROR_CODES}')
+        raise IOError(f'binary file does not have the proper format, error code {error}')
+
+
+# The masked version, in which learning is skipped whenever cue and outcome
+# have the same index. The code is copied so that the unmasked case does not
+# pay the penalty for the extra if statement in the innermost loop.
+
+def learn_inplace_masked(binary_file_paths, np.ndarray[dtype_t, ndim=2] weights,
+                         dtype_t alpha, dtype_t beta1,
+                         dtype_t beta2, dtype_t lambda_,
+                         np.ndarray[unsigned int, ndim=1] all_outcomes,
+                         unsigned int chunksize,
+                         unsigned int number_of_threads):
+
+    cdef unsigned int mm = weights.shape[1]  # number of cues == columns
+    cdef unsigned int* all_outcomes_ptr = <unsigned int *> all_outcomes.data
+    cdef unsigned int length_all_outcomes = all_outcomes.shape[0]
+    cdef char* fname
+    cdef unsigned int start_val, end_val, ii, number_parts
+    cdef ErrorCode error = INITIAL_ERROR_CODE
+
+    if length_all_outcomes == 0:
+        return
+
+    # cdef String
+    # weights must be a contiguous array with mode='c', see:
+    #cdef np.ndarray[np.uint32_t, ndim=3, mode = 'c'] np_buff = np.ascontiguousarray(im, dtype = np.uint32)
+    cdef dtype_t* weights_ptr = <dtype_t *> weights.data  # consider whether [][] or ** or [] or *
+
+    for binary_file_path in binary_file_paths:
+        filename_byte_string = binary_file_path.encode("UTF-8")
+        fname = filename_byte_string
+
+        number_parts = math.ceil(length_all_outcomes / chunksize)
+
+        with nogil, parallel(num_threads=number_of_threads):
+            for ii in prange(number_parts, schedule="dynamic", chunksize=1):
+                start_val = ii * chunksize
+                end_val = min(start_val + chunksize, length_all_outcomes)
+                if start_val == length_all_outcomes:
+                    break
+                error = learn_inplace_masked_ptr(fname, weights_ptr, mm, alpha, beta1,
+                                                 beta2, lambda_, all_outcomes_ptr, start_val,
+                                                 end_val)
+                if error != NO_ERROR:
+                    break
+
+    if (error != NO_ERROR):
+        raise IOError(f'binary file does not have the proper format, error code {error}')
diff --git a/pyndl/ndl_parallel.pxd b/pyndl/ndl_parallel.pxd
index 48351b8..5d49273 100644
--- a/pyndl/ndl_parallel.pxd
+++ b/pyndl/ndl_parallel.pxd
@@ -3,6 +3,10 @@ ctypedef np.float64_t dtype_t
 
 from error_codes cimport ErrorCode
 
-cdef ErrorCode learn_inplace_ptr(char*, dtype_t*, unsigned int, dtype_t, dtype_t,
-                                 dtype_t, dtype_t, unsigned int*, unsigned int,
+cdef ErrorCode learn_inplace_ptr(char*, dtype_t*, unsigned int, dtype_t,
+                                 dtype_t, dtype_t, dtype_t, unsigned int*, unsigned int,
                                  unsigned int) nogil
+
+cdef ErrorCode learn_inplace_masked_ptr(char*, dtype_t*, unsigned int, dtype_t,
+                                        dtype_t, dtype_t, dtype_t, unsigned int*, unsigned int,
+                                        unsigned int) nogil
diff --git a/pyndl/ndl_parallel.pyx b/pyndl/ndl_parallel.pyx
index c51ad8f..2d662dc 100644
--- a/pyndl/ndl_parallel.pyx
+++ b/pyndl/ndl_parallel.pyx
@@ -3,7 +3,8 @@ import math
 from libc.stdlib cimport abort, malloc, free
 from libc.stdio cimport fopen, fread, fclose, FILE
 
-from error_codes cimport ErrorCode, NO_ERROR, MAGIC_NUMBER_DOES_NOT_MATCH, VERSION_NUMBER_DOES_NOT_MATCH, INITIAL_ERROR_CODE, ERROR_CODES
+from error_codes cimport ErrorCode, NO_ERROR, MAGIC_NUMBER_DOES_NOT_MATCH, VERSION_NUMBER_DOES_NOT_MATCH, INITIAL_ERROR_CODE, START_LARGER_END
+
 
 cdef unsigned int MAGIC_NUMBER = 14159265
 cdef unsigned int CURRENT_VERSION_WITH_FREQ = 215
@@ -51,6 +52,9 @@ def learn_inplace(binary_file_paths, np.ndarray[dtype_t, ndim=2] weights,
     cdef unsigned int start_val, end_val
     cdef ErrorCode error = INITIAL_ERROR_CODE
 
+    if length_all_outcomes == 0:
+        return NO_ERROR
+
     # cdef String
     # weights muss contigousarray sein und mode=c, siehe:
     #cdef np.ndarray[np.uint32_t, ndim=3, mode = 'c'] np_buff = np.ascontiguousarray(im, dtype = np.uint32)
@@ -68,7 +72,7 @@ def learn_inplace(binary_file_paths, np.ndarray[dtype_t, ndim=2] weights,
                 break
 
     if (error != NO_ERROR):
-        raise IOError(f'binary files does not have proper format, error code {error}\n{ERROR_CODES}')
+        raise IOError(f'binary file does not have the proper format, error code {error}')
 
 
 cdef int is_element_of(unsigned int elem, unsigned int* arr, unsigned int size) nogil:
@@ -88,6 +92,136 @@ cdef ErrorCode learn_inplace_ptr(char* binary_file_path, dtype_t* weights,
                                  unsigned int start,
                                  unsigned int end) nogil:
 
+    if start == end:
+        return NO_ERROR
+    elif start > end:
+        return START_LARGER_END
+
+    cdef unsigned int number_of_events, number_of_cues, number_of_outcomes
+    cdef dtype_t association_strength, update
+    cdef unsigned int magic_number, version, ii, jj, event, appearance
+    cdef unsigned long long index
+    cdef unsigned int* cue_indices
+    cdef unsigned int* outcome_indices
+    cdef unsigned int max_number_of_cues = 1024
+    cdef unsigned int max_number_of_outcomes = 1024
+
+    cdef FILE* binary_file
+    binary_file = fopen(binary_file_path, "rb")
+
+    read_next_int(&magic_number, binary_file)
+    if not magic_number == MAGIC_NUMBER:
+        fclose(binary_file)
+        return MAGIC_NUMBER_DOES_NOT_MATCH
+    read_next_int(&version, binary_file)
+    if version == CURRENT_VERSION:
+        pass
+    else:
+        fclose(binary_file)
+        return VERSION_NUMBER_DOES_NOT_MATCH
+
+    # preallocate memory
+    cue_indices = <unsigned int *> malloc(sizeof(unsigned int) * max_number_of_cues)
+    outcome_indices = <unsigned int *> malloc(sizeof(unsigned int) * max_number_of_outcomes)
+
+    read_next_int(&number_of_events, binary_file)
+
+    for event in range(number_of_events):
+        # cues
+        read_next_int(&number_of_cues, binary_file)
+        if number_of_cues > max_number_of_cues:
+            max_number_of_cues = number_of_cues
+            free(cue_indices)
+            cue_indices = <unsigned int *> malloc(sizeof(unsigned int) * max_number_of_cues)
+        fread(cue_indices, 4, number_of_cues, binary_file)
+
+        # outcomes
+        read_next_int(&number_of_outcomes, binary_file)
+        if number_of_outcomes > max_number_of_outcomes:
+            max_number_of_outcomes = number_of_outcomes
+            free(outcome_indices)
+            outcome_indices = <unsigned int *> malloc(sizeof(unsigned int) * max_number_of_outcomes)
+        fread(outcome_indices, 4, number_of_outcomes, binary_file)
+
+        # learn
+        for ii in range(start, end):
+            association_strength = 0.0
+            for jj in range(number_of_cues):
+                # this overflows:
+                #index = cue_indices[jj] + mm * all_outcome_indices[ii]
+                index = mm  # implicit cast to unsigned long long
+                index *= all_outcome_indices[ii]  # this can't overflow anymore
+                index += cue_indices[jj]  # this can't overflow anymore
+                # worst case: 4294967295 * 4294967295 + 4294967295 == 18446744069414584320 < 18446744073709551615
+                association_strength += weights[index]
+            if is_element_of(all_outcome_indices[ii], outcome_indices, number_of_outcomes):
+                update = beta1 * (lambda_ - association_strength)
+            else:
+                update = beta2 * (0.0 - association_strength)
+            for jj in range(number_of_cues):
+                index = mm  # implicit cast to unsigned long long
+                index *= all_outcome_indices[ii]  # this can't overflow anymore
+                index += cue_indices[jj]  # this can't overflow anymore
+                weights[index] += alpha * update
+
+    fclose(binary_file)
+    free(cue_indices)
+    free(outcome_indices)
+    return NO_ERROR
+
+
+# The masked version, in which learning is skipped whenever cue and outcome
+# have the same index. The code is copied so that the unmasked case does not
+# pay the penalty for the extra if statement in the innermost loop.
+
+def learn_inplace_masked(binary_file_paths, np.ndarray[dtype_t, ndim=2] weights,
+                         dtype_t alpha, dtype_t beta1,
+                         dtype_t beta2, dtype_t lambda_,
+                         np.ndarray[unsigned int, ndim=1] all_outcomes):
+
+    cdef unsigned int mm = weights.shape[1]  # number of cues == columns
+    cdef unsigned int* all_outcomes_ptr = <unsigned int *> all_outcomes.data
+    cdef unsigned int length_all_outcomes = all_outcomes.shape[0]
+    cdef char* fname
+    cdef unsigned int start_val, end_val
+    cdef ErrorCode error = INITIAL_ERROR_CODE
+
+    if length_all_outcomes == 0:
+        return NO_ERROR
+
+    # cdef String
+    # weights must be a contiguous array with mode='c', see:
+    #cdef np.ndarray[np.uint32_t, ndim=3, mode = 'c'] np_buff = np.ascontiguousarray(im, dtype = np.uint32)
+    cdef dtype_t* weights_ptr = <dtype_t *> weights.data  # consider whether [][] or ** or [] or *
+
+    for binary_file_path in binary_file_paths:
+        filename_byte_string = binary_file_path.encode("UTF-8")
+        fname = filename_byte_string
+
+        with nogil:
+            error = learn_inplace_masked_ptr(fname, weights_ptr, mm, alpha, beta1,
+                                             beta2, lambda_, all_outcomes_ptr, 0,
+                                             length_all_outcomes)
+            if error != NO_ERROR:
+                break
+
+    if (error != NO_ERROR):
+        raise IOError(f'binary file does not have the proper format, error code {error}')
+
+
+# possibly return an exception here instead
+cdef ErrorCode learn_inplace_masked_ptr(char* binary_file_path, dtype_t* weights,
+                                        unsigned int mm,
+                                        dtype_t alpha, dtype_t beta1,
+                                        dtype_t beta2, dtype_t lambda_,
+                                        unsigned int* all_outcome_indices,
+                                        unsigned int start,
+                                        unsigned int end) nogil:
+
+    if start == end:
+        return NO_ERROR
+    elif start > end:
+        return START_LARGER_END
 
     cdef unsigned int number_of_events, number_of_cues, number_of_outcomes
     cdef dtype_t association_strength, update
@@ -151,6 +285,9 @@ cdef ErrorCode learn_inplace_ptr(char* binary_file_path, dtype_t* weights,
             else:
                 update = beta2 * (0.0 - association_strength)
             for jj in range(number_of_cues):
+                # check for masking:
+                if all_outcome_indices[ii] == cue_indices[jj]:
+                    continue
                 index = mm  # implicit cast to unsigned long long
                 index *= all_outcome_indices[ii]  # this can't overflow anymore
                 index += cue_indices[jj]  # this can't overflow anymore
diff --git a/tests/resources/event_file_masking.tab.gz b/tests/resources/event_file_masking.tab.gz
new file mode 100644
index 0000000000000000000000000000000000000000..ddea1361c6471df0c46ceaf3658d1587b30cf70b
GIT binary patch
literal 89
zcmb2|=HMt3H;iXsPAyB#D~V6b%t?*UO)SpN%uClRNlaoW*7MbU_C;UU^W1qL!>h-?
s%l{e6WCgSIq8cUf8Rw@S;S}liXY5*bY`J47!_gfKYS|^f9T*rG0H_rqNB{r;

literal 0
HcmV?d00001

diff --git a/tests/test_ndl.py b/tests/test_ndl.py
index bff7e96..1abdde8 100644
--- a/tests/test_ndl.py
+++ b/tests/test_ndl.py
@@ -19,6 +19,7 @@
 TEST_ROOT = os.path.join(os.path.pardir, os.path.dirname(__file__))
 FILE_PATH_SIMPLE = os.path.join(TEST_ROOT, "resources/event_file_simple.tab.gz")
 FILE_PATH_MULTIPLE_CUES = os.path.join(TEST_ROOT, "resources/event_file_multiple_cues.tab.gz")
+FILE_PATH_MASKING = os.path.join(TEST_ROOT, "resources/event_file_masking.tab.gz")
 REFERENCE_PATH = os.path.join(TEST_ROOT, 'reference/weights_event_file_simple.csv')
 REFERENCE_PATH_NDL2 = os.path.join(TEST_ROOT, 'reference/weights_event_file_simple_ndl2.csv')
 REFERENCE_PATH_MULTIPLE_CUES_NDL2 = os.path.join(TEST_ROOT, 'reference/weights_event_file_multiple_cues_ndl2.csv')
@@ -57,6 +58,36 @@ def result_dict_ndl_data_array():
     return ndl.dict_ndl(FILE_PATH_SIMPLE, ALPHA, BETAS, make_data_array=True)
 
 
+@pytest.fixture(scope='module')
+def result_dict_ndl_mask_all():
+    return ndl.dict_ndl(FILE_PATH_MASKING, ALPHA, BETAS, cues_to_mask='all')
+
+
+@pytest.fixture(scope='module')
+def result_dict_ndl_mask_ab():
+    return ndl.dict_ndl(FILE_PATH_MASKING, ALPHA, BETAS, cues_to_mask={'a', 'b'})
+
+
+@pytest.fixture(scope='module')
+def result_ndl_threading_mask_all():
+    return ndl.ndl(FILE_PATH_MASKING, ALPHA, BETAS, method='threading', cues_to_mask='all')
+
+
+@pytest.fixture(scope='module')
+def result_ndl_threading_mask_ab():
+    return ndl.ndl(FILE_PATH_MASKING, ALPHA, BETAS, method='threading', cues_to_mask={'a', 'b'})
+
+
+@pytest.fixture(scope='module')
+def result_ndl_openmp_mask_all():
+    return ndl.ndl(FILE_PATH_MASKING, ALPHA, BETAS, method='openmp', cues_to_mask='all')
+
+
+@pytest.fixture(scope='module')
+def result_ndl_openmp_mask_ab():
+    return ndl.ndl(FILE_PATH_MASKING, ALPHA, BETAS, method='openmp', cues_to_mask={'a', 'b'})
+
+
 @pytest.fixture(scope='module')
 def result_continue_learning():
     events_simple = pd.read_csv(FILE_PATH_SIMPLE, sep="\t")
@@ -177,6 +208,29 @@ def test_continue_learning_dict_ndl_data_array(result_dict_ndl, result_dict_ndl_
     assert len(unequal) == 0  # pylint: disable=len-as-condition
 
 
+def test_masking_all(result_dict_ndl_mask_all, result_ndl_threading_mask_all, result_ndl_openmp_mask_all):
+    unequal, unequal_ratio = compare_arrays(FILE_PATH_MASKING, result_dict_ndl_mask_all,
+                                            result_ndl_threading_mask_all)
+    print('%.2f ratio unequal' % unequal_ratio)
+    assert len(unequal) == 0  # pylint: disable=len-as-condition
+    unequal, unequal_ratio = compare_arrays(FILE_PATH_MASKING, result_dict_ndl_mask_all,
+                                            result_ndl_openmp_mask_all)
+    print('%.2f ratio unequal' % unequal_ratio)
+    assert len(unequal) == 0  # pylint: disable=len-as-condition
+
+
+def test_masking_ab(result_dict_ndl_mask_ab, result_ndl_threading_mask_ab, result_ndl_openmp_mask_ab):
+    unequal, unequal_ratio = compare_arrays(FILE_PATH_MASKING, result_dict_ndl_mask_ab,
+                                            result_ndl_threading_mask_ab)
+    print('%.2f ratio unequal' % unequal_ratio)
+    assert len(unequal) == 0  # pylint: disable=len-as-condition
+    unequal, unequal_ratio = compare_arrays(FILE_PATH_MASKING, result_dict_ndl_mask_ab,
+                                            result_ndl_openmp_mask_ab)
+    print('%.2f ratio unequal' % unequal_ratio)
+    assert len(unequal) == 0  # pylint: disable=len-as-condition
+
+
 @pytest.mark.nolinux
 def test_continue_learning(result_continue_learning, result_ndl_openmp):
     assert result_continue_learning.shape == result_ndl_openmp.shape
@@ -450,7 +504,6 @@ def clock(func, args, **kwargs):
 def compare_arrays(file_path, arr1, arr2):
     _, cues, outcomes = count.cues_outcomes(file_path)
-    cue_map, outcome_map, _ = generate_mapping(file_path)
 
     unequal = list()
 
@@ -458,11 +511,7 @@ def compare_arrays(file_path, arr1, arr2):
     for cue in cues:
         values = list()
         for array in (arr1, arr2):
-            if isinstance(array, np.ndarray):
-                outcome_index = outcome_map[outcome]
-                cue_index = cue_map[cue]
-                values.append(array[outcome_index][cue_index])
-            elif isinstance(array, xr.DataArray):
+            if isinstance(array, xr.DataArray):
                 values.append(array.loc[{'outcomes': outcome, 'cues': cue}].values)
             elif isinstance(array, pd.DataFrame):
                 values.append(array.loc[outcome][cue])
@@ -475,13 +524,3 @@ def compare_arrays(file_path, arr1, arr2):
 
     unequal_ratio = len(unequal) / (len(outcomes) * len(cues))
     return (unequal, unequal_ratio)
-
-
-def generate_mapping(event_path):
-    _, cues, outcomes = count.cues_outcomes(event_path)
-    all_cues = list(cues.keys())
-    all_outcomes = list(outcomes.keys())
-    cue_map = OrderedDict(((cue, ii) for ii, cue in enumerate(all_cues)))
-    outcome_map = OrderedDict(((outcome, ii) for ii, outcome in enumerate(all_outcomes)))
-
-    return (cue_map, outcome_map, all_outcomes)
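
--
For illustration, a minimal sketch of how the new cues_to_mask argument is
meant to be called from Python. The event file path and the alpha/beta
values below are assumptions made for the example; only the cues_to_mask
values mirror the test fixtures above.

    from pyndl import ndl

    # assumed parameters; any event file in which cues also occur as
    # outcomes (such as the masking test resource added above) will do
    events = 'tests/resources/event_file_masking.tab.gz'
    alpha = 0.1
    betas = (0.1, 0.1)

    # mask every cue from itself: the weight from a cue onto the identical
    # outcome is never updated and therefore stays at its initial 0.0
    weights_all = ndl.ndl(events, alpha, betas, method='threading',
                          cues_to_mask='all')

    # mask only the cues 'a' and 'b' from themselves
    weights_ab = ndl.dict_ndl(events, alpha, betas, cues_to_mask={'a', 'b'})

Because ndl() returns an xarray.DataArray, the effect of the masking can be
checked directly: a self-association such as
weights_all.loc[{'outcomes': 'a', 'cues': 'a'}] stays at zero, while all
unmasked cells receive the usual Rescorla-Wagner updates.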