From 1449d6ce01ad606ecc7bf713c9863a139590dd7b Mon Sep 17 00:00:00 2001
From: Benjamin Zaitlen <quasiben@gmail.com>
Date: Wed, 5 Jun 2019 11:05:46 -0700
Subject: [PATCH 1/3] define meta creation instead of to_pandas and dispatching
 to dask

---
 dask_cudf/backends.py | 96 +++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 93 insertions(+), 3 deletions(-)

diff --git a/dask_cudf/backends.py b/dask_cudf/backends.py
index 5009963..ce9ea63 100644
--- a/dask_cudf/backends.py
+++ b/dask_cudf/backends.py
@@ -1,9 +1,20 @@
+import numpy as np
+import pandas as pd
+
 from dask.dataframe.methods import concat_dispatch
+from dask.dataframe.utils import is_integer_na_dtype, _scalar_from_dtype
 from dask.dataframe.core import get_parallel_type, meta_nonempty, make_meta
+
 import cudf
+from cudf.dataframe.index import DatetimeIndex, GenericIndex, CategoricalIndex, StringIndex, RangeIndex
+from cudf import MultiIndex
 
-from .core import DataFrame, Series, Index
 
+from pandas.api.types import (is_categorical_dtype, is_scalar, is_sparse,
+                              is_period_dtype, is_datetime64tz_dtype,
+                              is_interval_dtype)
+
+from .core import DataFrame, Series, Index
 
 get_parallel_type.register(cudf.DataFrame, lambda _: DataFrame)
 get_parallel_type.register(cudf.Series, lambda _: Series)
@@ -12,8 +23,13 @@
 
 @meta_nonempty.register((cudf.DataFrame, cudf.Series, cudf.Index))
 def meta_nonempty_cudf(x, index=None):
-    y = meta_nonempty(x.to_pandas())  # TODO: add iloc[:5]
-    return cudf.from_pandas(y)
+
+    idx = _nonempty_index(x.index)
+    data = {i: _nonempty_series(x.iloc[:, i], idx=idx)
+            for i, c in enumerate(x.columns)}
+    res = cudf.DataFrame(data, index=idx)
+    res.columns = x.columns
+    return res
 
 
 @make_meta.register((cudf.Series, cudf.DataFrame))
@@ -31,3 +47,77 @@ def concat_cudf(dfs, axis=0, join="outer", uniform=False, filter_warning=True):
     assert axis == 0
     assert join == "outer"
     return cudf.concat(dfs)
+
+
+@meta_nonempty.register(cudf.Series)
+def _nonempty_series(s, idx=None):
+    if idx is None:
+        idx = _nonempty_index(s.index)
+    dtype = s.dtype
+    if is_datetime64tz_dtype(dtype):
+        entry = pd.Timestamp('1970-01-01', tz=dtype.tz)
+        data = [entry, entry]
+    elif is_categorical_dtype(dtype):
+        if len(s.cat.categories):
+            data = [s.cat.categories[0]] * 2
+            cats = s.cat.categories
+        else:
+            data = _nonempty_index(s.cat.categories)
+            cats = None
+        data = pd.Categorical(data, categories=cats,
+                              ordered=s.cat.ordered)
+    elif is_integer_na_dtype(dtype):
+        data = pd.array([1, None], dtype=dtype)
+    elif is_period_dtype(dtype):
+        # pandas 0.24.0+ should infer this to be Series[Period[freq]]
+        freq = dtype.freq
+        data = [pd.Period('2000', freq), pd.Period('2001', freq)]
+    else:
+        entry = _scalar_from_dtype(dtype)
+        data = np.array([entry, entry], dtype=dtype)
+
+    return cudf.Series(data, name=s.name, index=idx)
+
+
+@meta_nonempty.register(cudf.Index)
+def _nonempty_index(idx):
+    typ = type(idx)
+    if typ is RangeIndex:
+        return typ(2, name=idx.name)
+    elif typ is GenericIndex:
+        return typ([1, 2], name=idx.name)
+    elif typ is StringIndex:
+        return typ(['a', 'b'], name=idx.name)
+    elif typ is CategoricalIndex:
+        if len(idx.categories) == 0:
+            data = pd.Categorical(_nonempty_index(idx.categories),
+                                  ordered=idx.ordered)
+        else:
+            data = pd.Categorical.from_codes(
+                [-1, 0], categories=idx.categories, ordered=idx.ordered)
+        return type(data, name=idx.name)
+    elif typ is DatetimeIndex:
+        start = '1970-01-01'
+        # Need a non-monotonic decreasing index to avoid issues with
+        # partial string indexing see https://github.com/dask/dask/issues/2389
+        # and https://github.com/pandas-dev/pandas/issues/16515
+        # This doesn't mean `_meta_nonempty` should ever rely on
+        # `self.monotonic_increasing` or `self.monotonic_decreasing`
+        try:
+            dates = pd.date_range(start=start, periods=2, freq=idx.freq,
+                                 tz=idx.tz, name=idx.name)
+        except ValueError:  # older pandas versions
+            data = [start, '1970-01-02'] if idx.freq is None else None
+            dates = pd.DatetimeIndex(data, start=start, periods=2, freq=idx.freq,
+                                    tz=idx.tz, name=idx.name)
+        return type(dates, name=idx.name)
+    elif typ is MultiIndex:
+        levels = [_nonempty_index(l) for l in idx.levels]
+        codes = [[0, 0] for i in idx.levels]
+        try:
+            return typ(levels=levels, codes=codes, names=idx.names)
+        except TypeError:  # older pandas versions
+            return typ(levels=levels, labels=codes, names=idx.names)
+
+    raise TypeError("Don't know how to handle index of "
+                    "type {0}".format(typename(type(idx))))

From 71537411b116391e0676e372695c06acbcc82e9f Mon Sep 17 00:00:00 2001
From: Benjamin Zaitlen <quasiben@gmail.com>
Date: Wed, 5 Jun 2019 11:09:24 -0700
Subject: [PATCH 2/3] linting; import typename, drop unused imports

---
 dask_cudf/backends.py | 54 +++++++++++++++++++++++++------------------
 1 file changed, 32 insertions(+), 22 deletions(-)

diff --git a/dask_cudf/backends.py b/dask_cudf/backends.py
index ce9ea63..0dd6bea 100644
--- a/dask_cudf/backends.py
+++ b/dask_cudf/backends.py
@@ -1,18 +1,27 @@
 import numpy as np
 import pandas as pd
 
+from dask.utils import typename
 from dask.dataframe.methods import concat_dispatch
 from dask.dataframe.utils import is_integer_na_dtype, _scalar_from_dtype
 from dask.dataframe.core import get_parallel_type, meta_nonempty, make_meta
 
 import cudf
-from cudf.dataframe.index import DatetimeIndex, GenericIndex, CategoricalIndex, StringIndex, RangeIndex
+from cudf.dataframe.index import (
+    DatetimeIndex,
+    GenericIndex,
+    CategoricalIndex,
+    StringIndex,
+    RangeIndex,
+)
 from cudf import MultiIndex
 
 
-from pandas.api.types import (is_categorical_dtype, is_scalar, is_sparse,
-                              is_period_dtype, is_datetime64tz_dtype,
-                              is_interval_dtype)
+from pandas.api.types import (
+    is_categorical_dtype,
+    is_period_dtype,
+    is_datetime64tz_dtype,
+)
 
 from .core import DataFrame, Series, Index
 
@@ -25,8 +34,7 @@
 def meta_nonempty_cudf(x, index=None):
 
     idx = _nonempty_index(x.index)
-    data = {i: _nonempty_series(x.iloc[:, i], idx=idx)
-            for i, c in enumerate(x.columns)}
+    data = {i: _nonempty_series(x.iloc[:, i], idx=idx) for i, c in enumerate(x.columns)}
     res = cudf.DataFrame(data, index=idx)
     res.columns = x.columns
     return res
@@ -55,7 +63,7 @@ def _nonempty_series(s, idx=None):
         idx = _nonempty_index(s.index)
     dtype = s.dtype
     if is_datetime64tz_dtype(dtype):
-        entry = pd.Timestamp('1970-01-01', tz=dtype.tz)
+        entry = pd.Timestamp("1970-01-01", tz=dtype.tz)
         data = [entry, entry]
     elif is_categorical_dtype(dtype):
         if len(s.cat.categories):
@@ -64,14 +72,13 @@ def _nonempty_series(s, idx=None):
         else:
             data = _nonempty_index(s.cat.categories)
             cats = None
-        data = pd.Categorical(data, categories=cats,
-                              ordered=s.cat.ordered)
+        data = pd.Categorical(data, categories=cats, ordered=s.cat.ordered)
     elif is_integer_na_dtype(dtype):
         data = pd.array([1, None], dtype=dtype)
     elif is_period_dtype(dtype):
         # pandas 0.24.0+ should infer this to be Series[Period[freq]]
         freq = dtype.freq
-        data = [pd.Period('2000', freq), pd.Period('2001', freq)]
+        data = [pd.Period("2000", freq), pd.Period("2001", freq)]
     else:
         entry = _scalar_from_dtype(dtype)
         data = np.array([entry, entry], dtype=dtype)
@@ -87,29 +94,31 @@ def _nonempty_index(idx):
     elif typ is GenericIndex:
         return typ([1, 2], name=idx.name)
     elif typ is StringIndex:
-        return typ(['a', 'b'], name=idx.name)
+        return typ(["a", "b"], name=idx.name)
     elif typ is CategoricalIndex:
         if len(idx.categories) == 0:
-            data = pd.Categorical(_nonempty_index(idx.categories),
-                                  ordered=idx.ordered)
+            data = pd.Categorical(_nonempty_index(idx.categories), ordered=idx.ordered)
         else:
             data = pd.Categorical.from_codes(
-                [-1, 0], categories=idx.categories, ordered=idx.ordered)
+                [-1, 0], categories=idx.categories, ordered=idx.ordered
+            )
         return type(data, name=idx.name)
     elif typ is DatetimeIndex:
-        start = '1970-01-01'
+        start = "1970-01-01"
         # Need a non-monotonic decreasing index to avoid issues with
         # partial string indexing see https://github.com/dask/dask/issues/2389
         # and https://github.com/pandas-dev/pandas/issues/16515
         # This doesn't mean `_meta_nonempty` should ever rely on
         # `self.monotonic_increasing` or `self.monotonic_decreasing`
         try:
-            dates = pd.date_range(start=start, periods=2, freq=idx.freq,
-                                 tz=idx.tz, name=idx.name)
+            dates = pd.date_range(
+                start=start, periods=2, freq=idx.freq, tz=idx.tz, name=idx.name
+            )
         except ValueError:  # older pandas versions
-            data = [start, '1970-01-02'] if idx.freq is None else None
-            dates = pd.DatetimeIndex(data, start=start, periods=2, freq=idx.freq,
-                                    tz=idx.tz, name=idx.name)
+            data = [start, "1970-01-02"] if idx.freq is None else None
+            dates = pd.DatetimeIndex(
+                data, start=start, periods=2, freq=idx.freq, tz=idx.tz, name=idx.name
+            )
         return type(dates, name=idx.name)
     elif typ is MultiIndex:
         levels = [_nonempty_index(l) for l in idx.levels]
@@ -119,5 +128,6 @@ def _nonempty_index(idx):
         except TypeError:  # older pandas versions
             return typ(levels=levels, labels=codes, names=idx.names)
 
-    raise TypeError("Don't know how to handle index of "
-                    "type {0}".format(typename(type(idx))))
+    raise TypeError(
+        "Don't know how to handle index of " "type {0}".format(typename(type(idx)))
+    )

From 3d980467f1dddb92f027a605186c69efb956b7d7 Mon Sep 17 00:00:00 2001
From: Benjamin Zaitlen <quasiben@gmail.com>
Date: Thu, 6 Jun 2019 09:41:37 -0700
Subject: [PATCH 3/3] remove freq/tz access when creating meta objects

---
 dask_cudf/backends.py | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/dask_cudf/backends.py b/dask_cudf/backends.py
index 0dd6bea..0b4cf27 100644
--- a/dask_cudf/backends.py
+++ b/dask_cudf/backends.py
@@ -19,7 +19,6 @@
 
 from pandas.api.types import (
     is_categorical_dtype,
-    is_period_dtype,
     is_datetime64tz_dtype,
 )
 
@@ -75,10 +74,6 @@ def _nonempty_series(s, idx=None):
         data = pd.Categorical(data, categories=cats, ordered=s.cat.ordered)
     elif is_integer_na_dtype(dtype):
         data = pd.array([1, None], dtype=dtype)
-    elif is_period_dtype(dtype):
-        # pandas 0.24.0+ should infer this to be Series[Period[freq]]
-        freq = dtype.freq
-        data = [pd.Period("2000", freq), pd.Period("2001", freq)]
     else:
         entry = _scalar_from_dtype(dtype)
         data = np.array([entry, entry], dtype=dtype)
@@ -105,6 +100,9 @@ def _nonempty_index(idx):
         return type(data, name=idx.name)
     elif typ is DatetimeIndex:
         start = "1970-01-01"
+        freq = None  # cudf does not support frequency
+        tz = None  # cudf does not support a timezone
+
         # Need a non-monotonic decreasing index to avoid issues with
         # partial string indexing see https://github.com/dask/dask/issues/2389
         # and https://github.com/pandas-dev/pandas/issues/16515
@@ -112,14 +110,14 @@ def _nonempty_index(idx):
         # `self.monotonic_increasing` or `self.monotonic_decreasing`
         try:
             dates = pd.date_range(
-                start=start, periods=2, freq=idx.freq, tz=idx.tz, name=idx.name
+                start=start, periods=2, freq=freq, tz=tz, name=idx.name
             )
         except ValueError:  # older pandas versions
-            data = [start, "1970-01-02"] if idx.freq is None else None
+            data = None
             dates = pd.DatetimeIndex(
-                data, start=start, periods=2, freq=idx.freq, tz=idx.tz, name=idx.name
+                data, start=start, periods=2, freq=freq, tz=tz, name=idx.name
             )
-        return type(dates, name=idx.name)
+        return typ(dates, name=idx.name)
     elif typ is MultiIndex:
         levels = [_nonempty_index(l) for l in idx.levels]
         codes = [[0, 0] for i in idx.levels]