Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install flake8 pytest pytest-cov
python -m pip install flake8 pytest pytest-cov flake8-pyproject
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Lint with flake8
run: |
Expand Down
11 changes: 11 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,16 @@
# PyProbables Changelog

### Version 0.7.0

***Breaking Changes***
Minor breaking changes; mismatched Bloom filters raise a `SimilarityError` instead of returning `None`

* `BitArray`
* Add ability to read and write as bytes
* Add ability to export
* Updated typing to be more consistent and correct


### Version 0.6.2

* `BloomFilterOnDisk`
Expand Down
2 changes: 2 additions & 0 deletions probables/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
NotSupportedError,
ProbablesBaseException,
RotatingBloomFilterError,
SimilarityError,
)
from probables.quotientfilter import QuotientFilter
from probables.utilities import Bitarray
Expand Down Expand Up @@ -48,4 +49,5 @@
"RotatingBloomFilterError",
"QuotientFilter",
"Bitarray",
"SimilarityError",
]
37 changes: 17 additions & 20 deletions probables/blooms/bloom.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from textwrap import wrap
from typing import Union

from probables.exceptions import InitializationError, NotSupportedError
from probables.exceptions import InitializationError, NotSupportedError, SimilarityError
from probables.hashes import HashFuncT, HashResultsT, KeyT, default_fnv_1a
from probables.utilities import MMap, is_hex_string, is_valid_file, resolve_path

Expand Down Expand Up @@ -368,7 +368,7 @@ def current_false_positive_rate(self) -> float:
exp = math.exp(dbl)
return math.pow((1 - exp), self.number_hashes)

def intersection(self, second: SimpleBloomT) -> Union[SimpleBloomT, None]:
def intersection(self, second: SimpleBloomT) -> SimpleBloomT:
"""Return a new Bloom Filter that contains the intersection of the
two

Expand All @@ -378,15 +378,14 @@ def intersection(self, second: SimpleBloomT) -> Union[SimpleBloomT, None]:
BloomFilter: The new Bloom Filter containing the intersection
Raises:
TypeError: When second is not either a :class:`BloomFilter` or :class:`BloomFilterOnDisk`
SimilarityError: When second is not of the same size (false_positive_rate and est_elements)
Note:
`second` may be a BloomFilterOnDisk object
Note:
If `second` is not of the same size (false_positive_rate and est_elements) then this will return `None`"""
`second` may be a BloomFilterOnDisk object"""
if not _verify_not_type_mismatch(second):
raise TypeError(MISMATCH_MSG)

if self._verify_bloom_similarity(second) is False:
return None
raise SimilarityError("Bloom Filters are not similar")

res = BloomFilter(
self.estimated_elements,
Expand All @@ -399,7 +398,7 @@ def intersection(self, second: SimpleBloomT) -> Union[SimpleBloomT, None]:
res.elements_added = res.estimate_elements()
return res

def union(self, second: SimpleBloomT) -> Union["BloomFilter", None]:
def union(self, second: SimpleBloomT) -> "BloomFilter":
"""Return a new Bloom Filter that contains the union of the two

Args:
Expand All @@ -408,15 +407,14 @@ def union(self, second: SimpleBloomT) -> Union["BloomFilter", None]:
BloomFilter: The new Bloom Filter containing the union
Raises:
TypeError: When second is not either a :class:`BloomFilter` or :class:`BloomFilterOnDisk`
SimilarityError: When second is not of the same size (false_positive_rate and est_elements)
Note:
`second` may be a BloomFilterOnDisk object
Note:
If `second` is not of the same size (false_positive_rate and est_elements) then this will return `None`"""
`second` may be a BloomFilterOnDisk object"""
if not _verify_not_type_mismatch(second):
raise TypeError(MISMATCH_MSG)

if self._verify_bloom_similarity(second) is False:
return None
raise SimilarityError("Bloom Filters are not similar")

res = BloomFilter(
self.estimated_elements,
Expand All @@ -429,7 +427,7 @@ def union(self, second: SimpleBloomT) -> Union["BloomFilter", None]:
res.elements_added = res.estimate_elements()
return res

def jaccard_index(self, second: SimpleBloomT) -> Union[float, None]:
def jaccard_index(self, second: SimpleBloomT) -> float:
"""Calculate the jaccard similarity score between two Bloom Filters

Args:
Expand All @@ -438,15 +436,14 @@ def jaccard_index(self, second: SimpleBloomT) -> Union[float, None]:
float: A numeric value between 0 and 1 where 1 is identical and 0 means completely different
Raises:
TypeError: When second is not either a :class:`BloomFilter` or :class:`BloomFilterOnDisk`
SimilarityError: When second is not of the same size (false_positive_rate and est_elements)
Note:
`second` may be a BloomFilterOnDisk object
Note:
If `second` is not of the same size (false_positive_rate and est_elements) then this will return `None`"""
`second` may be a BloomFilterOnDisk object"""
if not _verify_not_type_mismatch(second):
raise TypeError(MISMATCH_MSG)

if self._verify_bloom_similarity(second) is False:
return None
raise SimilarityError("Bloom Filters are not similar")

count_union = 0

Expand Down Expand Up @@ -694,7 +691,7 @@ def _get_element(self, idx: int) -> int:

def __update(self):
"""update the on disk Bloom Filter and ensure everything is out to disk"""
self._bloom.flush()
self.__file_pointer.seek(-1 * self._UPDATE_OFFSET.size, os.SEEK_END)
self.__file_pointer.write(self._EXPECTED_ELM_STRUCT.pack(self.elements_added))
self.__file_pointer.flush()
self._bloom.flush() # type: ignore
self.__file_pointer.seek(-1 * self._UPDATE_OFFSET.size, os.SEEK_END) # type: ignore
self.__file_pointer.write(self._EXPECTED_ELM_STRUCT.pack(self.elements_added)) # type: ignore
self.__file_pointer.flush() # type: ignore
32 changes: 15 additions & 17 deletions probables/blooms/countingbloom.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

from probables.blooms.bloom import BloomFilter
from probables.constants import UINT32_T_MAX, UINT64_T_MAX
from probables.exceptions import InitializationError
from probables.exceptions import InitializationError, SimilarityError
from probables.hashes import HashFuncT, HashResultsT, KeyT
from probables.utilities import is_hex_string, is_valid_file, resolve_path

Expand Down Expand Up @@ -208,7 +208,7 @@ def remove_alt(self, hashes: HashResultsT, num_els: int = 1) -> int:
self.elements_added -= to_remove
return min_val - to_remove

def intersection(self, second: "CountingBloomFilter") -> Union["CountingBloomFilter", None]: # type: ignore
def intersection(self, second: "CountingBloomFilter") -> "CountingBloomFilter": # type: ignore
"""Take the intersection of two Counting Bloom Filters

Args:
Expand All @@ -217,17 +217,16 @@ def intersection(self, second: "CountingBloomFilter") -> Union["CountingBloomFil
CountingBloomFilter: The new Counting Bloom Filter containing the intersection
Raises:
TypeError: When second is not a :class:`CountingBloomFilter`
SimilarityError: When second is not of the same size (false_positive_rate and est_elements)
Note:
The elements_added property will be set to the estimated number of unique elements \
added as found in estimate_elements()
Note:
If `second` is not of the same size (false_positive_rate and est_elements) then \
this will return `None`"""
added as found in estimate_elements()"""
if not _verify_not_type_mismatch(second):
raise TypeError(MISMATCH_MSG)

if self._verify_bloom_similarity(second) is False:
return None
raise SimilarityError("Counting Bloom Filters are not similar enough to calculate similarity")

res = CountingBloomFilter(
est_elements=self.estimated_elements,
false_positive_rate=self.false_positive_rate,
Expand All @@ -241,7 +240,7 @@ def intersection(self, second: "CountingBloomFilter") -> Union["CountingBloomFil
res.elements_added = res.estimate_elements()
return res

def jaccard_index(self, second: "CountingBloomFilter") -> Union[float, None]: # type:ignore
def jaccard_index(self, second: "CountingBloomFilter") -> float: # type: ignore
"""Take the Jaccard Index of two Counting Bloom Filters

Args:
Expand All @@ -250,15 +249,14 @@ def jaccard_index(self, second: "CountingBloomFilter") -> Union[float, None]: #
float: A numeric value between 0 and 1 where 1 is identical and 0 means completely different
Raises:
TypeError: When second is not a :class:`CountingBloomFilter`
SimilarityError: When second is not of the same size (false_positive_rate and est_elements)
Note:
The Jaccard Index is based on the unique set of elements added and not the number of each element added
Note:
If `second` is not of the same size (false_positive_rate and est_elements) then this will return `None`"""
The Jaccard Index is based on the unique set of elements added and not the number of each element added"""
if not _verify_not_type_mismatch(second):
raise TypeError(MISMATCH_MSG)

if self._verify_bloom_similarity(second) is False:
return None
raise SimilarityError("Counting Bloom Filters are not similar enough to calculate similarity")

count_union = 0
count_inter = 0
Expand All @@ -271,7 +269,7 @@ def jaccard_index(self, second: "CountingBloomFilter") -> Union[float, None]: #
return 1.0
return count_inter / count_union

def union(self, second: "CountingBloomFilter") -> Union["CountingBloomFilter", None]: # type:ignore
def union(self, second: "CountingBloomFilter") -> "CountingBloomFilter": # type:ignore
"""Return a new Counting Bloom Filter that contains the union of
the two

Expand All @@ -281,16 +279,16 @@ def union(self, second: "CountingBloomFilter") -> Union["CountingBloomFilter", N
CountingBloomFilter: The new Counting Bloom Filter containing the union
Raises:
TypeError: When second is not a :class:`CountingBloomFilter`
SimilarityError: When second is not of the same size (false_positive_rate and est_elements)
Note:
The elements_added property will be set to the estimated number of unique elements added as \
found in estimate_elements()
Note:
If `second` is not of the same size (false_positive_rate and est_elements) then this will return `None`"""
found in estimate_elements()"""
if not _verify_not_type_mismatch(second):
raise TypeError(MISMATCH_MSG)

if self._verify_bloom_similarity(second) is False:
return None
raise SimilarityError("Counting Bloom Filters are not similar enough to calculate similarity")

res = CountingBloomFilter(
est_elements=self.estimated_elements,
false_positive_rate=self.false_positive_rate,
Expand Down
2 changes: 1 addition & 1 deletion probables/cuckoo/countingcuckoo.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,7 +304,7 @@ def _parse_buckets(self, d: ByteString) -> None:
start = end
end += bin_size

def _expand_logic(self, extra_fingerprint: "CountingCuckooBin") -> None:
def _expand_logic(self, extra_fingerprint: Union["CountingCuckooBin", None]) -> None:
"""the logic to actually expand the cuckoo filter"""
# get all the fingerprints
fingerprints = self._setup_expand(extra_fingerprint)
Expand Down
4 changes: 2 additions & 2 deletions probables/cuckoo/cuckoo.py
Original file line number Diff line number Diff line change
Expand Up @@ -487,7 +487,7 @@ def _indicies_from_fingerprint(self, fingerprint):
Args:
fingerprint (int): The fingerprint to use for generating indices"""
idx_1 = fingerprint % self.capacity
idx_2 = self.__hash_func(str(fingerprint)) % self.capacity
idx_2 = self.__hash_func(str(fingerprint)) % self.capacity # type: ignore
return idx_1, idx_2

def _generate_fingerprint_info(self, key: KeyT) -> tuple[int, int, int]:
Expand All @@ -497,7 +497,7 @@ def _generate_fingerprint_info(self, key: KeyT) -> tuple[int, int, int]:
key (str): The element for which information is to be generated
"""
# generate the fingerprint along with the two possible indices
hash_val = self.__hash_func(key)
hash_val = self.__hash_func(key) # type: ignore
fingerprint = get_x_bits(hash_val, 64, self.fingerprint_size_bits, True)
idx_1, idx_2 = self._indicies_from_fingerprint(fingerprint)

Expand Down
11 changes: 11 additions & 0 deletions probables/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,17 @@ def __init__(self, message: str) -> None:
super().__init__(self.message)


class SimilarityError(ProbablesBaseException):
    """Exception raised when two filters are not similar enough to be
    combined or compared (e.g., mismatched Bloom filters)

    Args:
        message (str): The error message to be reported"""

    def __init__(self, message: str) -> None:
        # keep the text available on the instance as `.message`
        self.message: str = message
        super().__init__(message)


class CuckooFilterFullError(ProbablesBaseException):
"""Cuckoo Filter Full Exception

Expand Down
5 changes: 3 additions & 2 deletions probables/hashes.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,13 @@

KeyT = Union[str, bytes]
SimpleHashT = Callable[[KeyT, int], int]
SimpleHashBytesT = Callable[[KeyT, int], bytes]
HashResultsT = list[int]
HashFuncT = Callable[[KeyT, int], HashResultsT]
HashFuncBytesT = Callable[[KeyT, int], bytes]


def hash_with_depth_bytes(func: HashFuncBytesT) -> HashFuncT:
def hash_with_depth_bytes(func: Union[HashFuncBytesT, SimpleHashBytesT]) -> HashFuncT:
"""Decorator to turn a function that takes a single key and hashes it to
bytes. Wraps functions to be used in Bloom filters and Count-Min sketch
data structures.
Expand All @@ -40,7 +41,7 @@ def hashing_func(key, depth=1):
return hashing_func


def hash_with_depth_int(func: HashFuncT) -> HashFuncT:
def hash_with_depth_int(func: Union[HashFuncT, SimpleHashT]) -> HashFuncT:
"""Decorator to turn a function that takes a single key and hashes it to
an int. Wraps functions to be used in Bloom filters and Count-Min
sketch data structures.
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ max-line-length = 120
max-line-length = 120

[tool.flake8]
extend-ignore = ["E203"]
max-line-length = 120

[tool.isort]
Expand Down
Loading
Loading