From 69b2496ce072db039730d8cc34a5127b415c5043 Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Mon, 3 Jun 2019 18:52:23 -0700 Subject: [PATCH 1/4] add hashing for index --- python/cudf/dataframe/index.py | 26 ++++++++++++++++++++++++++ python/cudf/tests/test_index.py | 18 ++++++++++++++++++ thirdparty/rmm | 2 +- 3 files changed, 45 insertions(+), 1 deletion(-) diff --git a/python/cudf/dataframe/index.py b/python/cudf/dataframe/index.py index 7eb0b893a1b..500730caa0a 100644 --- a/python/cudf/dataframe/index.py +++ b/python/cudf/dataframe/index.py @@ -23,6 +23,7 @@ from cudf.comm.serialize import register_distributed_serializer import cudf.bindings.copying as cpp_copying +import cudf.bindings.hash as cpp_hash class Index(object): @@ -530,6 +531,31 @@ def find_label_range(self, first, last): end += 1 return begin, end + def hash_index(self): + """Hash the given index and return a new Series + + Returns + ------- + Series : + Absolute value of the int32 hash of each index entry, + with one element per entry of the index. 
+ """ + from cudf.dataframe.series import Series + + initial_hash_values = None + buf = Buffer(rmm.device_array(len(self), dtype=np.int32)) + result = NumericalColumn(data=buf, dtype=buf.dtype) + + _hash = cpp_hash.hash_columns([self.as_column()], + result, initial_hash_values) + + sr = Series(_hash) + + # hash_columns produces negative valuesg + # probably can switch to np.uint32 + # when supported by libcud + return abs(sr) + class DatetimeIndex(GenericIndex): # TODO this constructor should take a timezone or something to be diff --git a/python/cudf/tests/test_index.py b/python/cudf/tests/test_index.py index 822549e9e46..f4425335cf2 100644 --- a/python/cudf/tests/test_index.py +++ b/python/cudf/tests/test_index.py @@ -123,6 +123,24 @@ def test_categorical_index(): assert_eq(pdf.index, gdf2.index) +@pytest.mark.parametrize('index_name', [ + 'num_idx', + 'cat_idx', +]) +def test_hashing_index(index_name): + pdf = pd.DataFrame() + pdf['num_idx'] = [1, 2, 3, 1] + pdf['cat_idx'] = pd.Categorical(['a', 'b', 'c', 'a']) + gdf = DataFrame.from_pandas(pdf) + sr = gdf.set_index(index_name).index.hash_index() + + # values are always positive for modulo calculation + assert_eq(sr, sr[sr > 0]) + assert len(sr) == len(pdf[index_name]) + assert sr.iloc[0] == sr.iloc[-1] + assert len(sr.unique()) == len(sr) - 1 + + def test_pandas_as_index(): # Define Pandas Indexes pdf_int_index = pd.Int64Index([1, 2, 3, 4, 5]) diff --git a/thirdparty/rmm b/thirdparty/rmm index d704d10ec72..df27f2e3fae 160000 --- a/thirdparty/rmm +++ b/thirdparty/rmm @@ -1 +1 @@ -Subproject commit d704d10ec729437e0edc313c38bb7b4a1987b015 +Subproject commit df27f2e3fae2a8fe8ab37c2608f27bbfd2e45c47 From 8f0d01cd98fac448f78ef4db2241439454cac11d Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Mon, 3 Jun 2019 18:55:10 -0700 Subject: [PATCH 2/4] update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9ed851439df..9c5ba7dcced 100644 --- a/CHANGELOG.md 
+++ b/CHANGELOG.md @@ -17,6 +17,7 @@ - PR #1828 JSON Reader: add suport for bool8 columns - PR #1665 Add the point-in-polygon GIS function - PR #1863 Series and Dataframe methods for all and any +- PR #1917 Adds an index hashing method ## Improvements - PR #1538 Replacing LesserRTTI with inequality_comparator From f82cc485dd8359f0fa770635bca25e0730b7e0db Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Mon, 3 Jun 2019 19:00:14 -0700 Subject: [PATCH 3/4] update rmm --- thirdparty/rmm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/rmm b/thirdparty/rmm index df27f2e3fae..d704d10ec72 160000 --- a/thirdparty/rmm +++ b/thirdparty/rmm @@ -1 +1 @@ -Subproject commit df27f2e3fae2a8fe8ab37c2608f27bbfd2e45c47 +Subproject commit d704d10ec729437e0edc313c38bb7b4a1987b015 From 9679f1ee4644394d54ccefae5b2f313df630fee5 Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Mon, 3 Jun 2019 22:06:11 -0400 Subject: [PATCH 4/4] Update python/cudf/dataframe/index.py Co-Authored-By: Keith Kraus --- python/cudf/dataframe/index.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/dataframe/index.py b/python/cudf/dataframe/index.py index 500730caa0a..52ffcb4a6a2 100644 --- a/python/cudf/dataframe/index.py +++ b/python/cudf/dataframe/index.py @@ -553,7 +553,7 @@ def hash_index(self): # hash_columns produces negative valuesg # probably can switch to np.uint32 - # when supported by libcud + # when supported by libcudf return abs(sr)