From 69b2496ce072db039730d8cc34a5127b415c5043 Mon Sep 17 00:00:00 2001
From: Benjamin Zaitlen <quasiben@gmail.com>
Date: Mon, 3 Jun 2019 18:52:23 -0700
Subject: [PATCH 1/4] add hashing for index

---
 python/cudf/dataframe/index.py  | 26 ++++++++++++++++++++++++++
 python/cudf/tests/test_index.py | 18 ++++++++++++++++++
 thirdparty/rmm                  |  2 +-
 3 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/python/cudf/dataframe/index.py b/python/cudf/dataframe/index.py
index 7eb0b893a1b..500730caa0a 100644
--- a/python/cudf/dataframe/index.py
+++ b/python/cudf/dataframe/index.py
@@ -23,6 +23,7 @@
 from cudf.comm.serialize import register_distributed_serializer
 
 import cudf.bindings.copying as cpp_copying
+import cudf.bindings.hash as cpp_hash
 
 
 class Index(object):
@@ -530,6 +531,31 @@ def find_label_range(self, first, last):
             end += 1
         return begin, end
 
+    def hash_index(self):
+        """Hash the given index and return a new Series
+
+        Returns
+        -------
+        Series :
+            One hash value for each entry of the index, returned as
+            the absolute value of the raw hash.
+        """
+        from cudf.dataframe.series import Series
+
+        initial_hash_values = None
+        buf = Buffer(rmm.device_array(len(self), dtype=np.int32))
+        result = NumericalColumn(data=buf, dtype=buf.dtype)
+
+        _hash = cpp_hash.hash_columns([self.as_column()],
+                                      result, initial_hash_values)
+
+        sr = Series(_hash)
+
+        # hash_columns produces negative valuesg
+        # probably can switch to np.uint32
+        # when supported by libcud
+        return abs(sr)
+
 
 class DatetimeIndex(GenericIndex):
     # TODO this constructor should take a timezone or something to be
diff --git a/python/cudf/tests/test_index.py b/python/cudf/tests/test_index.py
index 822549e9e46..f4425335cf2 100644
--- a/python/cudf/tests/test_index.py
+++ b/python/cudf/tests/test_index.py
@@ -123,6 +123,24 @@ def test_categorical_index():
     assert_eq(pdf.index, gdf2.index)
 
 
+@pytest.mark.parametrize('index_name', [
+    'num_idx',
+    'cat_idx',
+])
+def test_hashing_index(index_name):
+    pdf = pd.DataFrame()
+    pdf['num_idx'] = [1, 2, 3, 1]
+    pdf['cat_idx'] = pd.Categorical(['a', 'b', 'c', 'a'])
+    gdf = DataFrame.from_pandas(pdf)
+    sr = gdf.set_index(index_name).index.hash_index()
+
+    # values are always positive for modulo calculation
+    assert_eq(sr, sr[sr > 0])
+    assert len(sr) == len(pdf[index_name])
+    assert sr.iloc[0] == sr.iloc[-1]
+    assert len(sr.unique()) == len(sr) - 1
+
+
 def test_pandas_as_index():
     # Define Pandas Indexes
     pdf_int_index = pd.Int64Index([1, 2, 3, 4, 5])
diff --git a/thirdparty/rmm b/thirdparty/rmm
index d704d10ec72..df27f2e3fae 160000
--- a/thirdparty/rmm
+++ b/thirdparty/rmm
@@ -1 +1 @@
-Subproject commit d704d10ec729437e0edc313c38bb7b4a1987b015
+Subproject commit df27f2e3fae2a8fe8ab37c2608f27bbfd2e45c47

From 8f0d01cd98fac448f78ef4db2241439454cac11d Mon Sep 17 00:00:00 2001
From: Benjamin Zaitlen <quasiben@gmail.com>
Date: Mon, 3 Jun 2019 18:55:10 -0700
Subject: [PATCH 2/4] update changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9ed851439df..9c5ba7dcced 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,6 +17,7 @@
 - PR #1828 JSON Reader: add suport for bool8 columns
 - PR #1665 Add the point-in-polygon GIS function
 - PR #1863 Series and Dataframe methods for all and any
+- PR #1917 Adds an index hashing method
 
 ## Improvements
 - PR #1538 Replacing LesserRTTI with inequality_comparator

From f82cc485dd8359f0fa770635bca25e0730b7e0db Mon Sep 17 00:00:00 2001
From: Benjamin Zaitlen <quasiben@gmail.com>
Date: Mon, 3 Jun 2019 19:00:14 -0700
Subject: [PATCH 3/4] update rmm

---
 thirdparty/rmm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/thirdparty/rmm b/thirdparty/rmm
index df27f2e3fae..d704d10ec72 160000
--- a/thirdparty/rmm
+++ b/thirdparty/rmm
@@ -1 +1 @@
-Subproject commit df27f2e3fae2a8fe8ab37c2608f27bbfd2e45c47
+Subproject commit d704d10ec729437e0edc313c38bb7b4a1987b015

From 9679f1ee4644394d54ccefae5b2f313df630fee5 Mon Sep 17 00:00:00 2001
From: Benjamin Zaitlen <quasiben@users.noreply.github.com>
Date: Mon, 3 Jun 2019 22:06:11 -0400
Subject: [PATCH 4/4] Update python/cudf/dataframe/index.py

Co-Authored-By: Keith Kraus <keith.j.kraus@gmail.com>
---
 python/cudf/dataframe/index.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/cudf/dataframe/index.py b/python/cudf/dataframe/index.py
index 500730caa0a..52ffcb4a6a2 100644
--- a/python/cudf/dataframe/index.py
+++ b/python/cudf/dataframe/index.py
@@ -553,7 +553,7 @@ def hash_index(self):
 
-        # hash_columns produces negative valuesg
+        # hash_columns produces negative values
         # probably can switch to np.uint32
-        # when supported by libcud
+        # when supported by libcudf
         return abs(sr)