From 1449d6ce01ad606ecc7bf713c9863a139590dd7b Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Wed, 5 Jun 2019 11:05:46 -0700 Subject: [PATCH 1/3] define meta creation instead of to_pandas and dispatching to dask --- dask_cudf/backends.py | 96 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 93 insertions(+), 3 deletions(-) diff --git a/dask_cudf/backends.py b/dask_cudf/backends.py index 5009963..ce9ea63 100644 --- a/dask_cudf/backends.py +++ b/dask_cudf/backends.py @@ -1,9 +1,20 @@ +import numpy as np +import pandas as pd + from dask.dataframe.methods import concat_dispatch +from dask.dataframe.utils import is_integer_na_dtype, _scalar_from_dtype from dask.dataframe.core import get_parallel_type, meta_nonempty, make_meta + import cudf +from cudf.dataframe.index import DatetimeIndex, GenericIndex, CategoricalIndex, StringIndex, RangeIndex +from cudf import MultiIndex -from .core import DataFrame, Series, Index +from pandas.api.types import (is_categorical_dtype, is_scalar, is_sparse, + is_period_dtype, is_datetime64tz_dtype, + is_interval_dtype) + +from .core import DataFrame, Series, Index get_parallel_type.register(cudf.DataFrame, lambda _: DataFrame) get_parallel_type.register(cudf.Series, lambda _: Series) @@ -12,8 +23,13 @@ @meta_nonempty.register((cudf.DataFrame, cudf.Series, cudf.Index)) def meta_nonempty_cudf(x, index=None): - y = meta_nonempty(x.to_pandas()) # TODO: add iloc[:5] - return cudf.from_pandas(y) + + idx = _nonempty_index(x.index) + data = {i: _nonempty_series(x.iloc[:, i], idx=idx) + for i, c in enumerate(x.columns)} + res = cudf.DataFrame(data, index=idx) + res.columns = x.columns + return res @make_meta.register((cudf.Series, cudf.DataFrame)) @@ -31,3 +47,77 @@ def concat_cudf(dfs, axis=0, join="outer", uniform=False, filter_warning=True): assert axis == 0 assert join == "outer" return cudf.concat(dfs) + + +@meta_nonempty.register(cudf.Series) +def _nonempty_series(s, idx=None): + if idx is None: + idx = _nonempty_index(s.index) + dtype = s.dtype + if is_datetime64tz_dtype(dtype): + entry = pd.Timestamp('1970-01-01', tz=dtype.tz) + data = [entry, entry] + elif is_categorical_dtype(dtype): + if len(s.cat.categories): + data = [s.cat.categories[0]] * 2 + cats = s.cat.categories + else: + data = _nonempty_index(s.cat.categories) + cats = None + data = pd.Categorical(data, categories=cats, + ordered=s.cat.ordered) + elif is_integer_na_dtype(dtype): + data = pd.array([1, None], dtype=dtype) + elif is_period_dtype(dtype): + # pandas 0.24.0+ should infer this to be Series[Period[freq]] + freq = dtype.freq + data = [pd.Period('2000', freq), pd.Period('2001', freq)] + else: + entry = _scalar_from_dtype(dtype) + data = np.array([entry, entry], dtype=dtype) + + return cudf.Series(data, name=s.name, index=idx) + + +@meta_nonempty.register(cudf.Index) +def _nonempty_index(idx): + typ = type(idx) + if typ is RangeIndex: + return typ(2, name=idx.name) + elif typ is GenericIndex: + return typ([1, 2], name=idx.name) + elif typ is StringIndex: + return typ(['a', 'b'], name=idx.name) + elif typ is CategoricalIndex: + if len(idx.categories) == 0: + data = pd.Categorical(_nonempty_index(idx.categories), + ordered=idx.ordered) + else: + data = pd.Categorical.from_codes( + [-1, 0], categories=idx.categories, ordered=idx.ordered) + return type(data, name=idx.name) + elif typ is DatetimeIndex: + start = '1970-01-01' + # Need a non-monotonic decreasing index to avoid issues with + # partial string indexing see https://github.com/dask/dask/issues/2389 + # and https://github.com/pandas-dev/pandas/issues/16515 + # This doesn't mean `_meta_nonempty` should ever rely on + # `self.monotonic_increasing` or `self.monotonic_decreasing` + try: + dates = pd.date_range(start=start, periods=2, freq=idx.freq, + tz=idx.tz, name=idx.name) + except ValueError: # older pandas versions + data = [start, '1970-01-02'] if idx.freq is None else None + dates = pd.DatetimeIndex(data, start=start, periods=2, freq=idx.freq, + tz=idx.tz, name=idx.name) + return type(dates, name=idx.name) + elif typ is MultiIndex: + levels = [_nonempty_index(l) for l in idx.levels] + codes = [[0, 0] for i in idx.levels] + try: + return typ(levels=levels, codes=codes, names=idx.names) + except TypeError: # older pandas versions + return typ(levels=levels, labels=codes, names=idx.names) + + raise TypeError("Don't know how to handle index of " + "type {0}".format(typename(type(idx)))) From 71537411b116391e0676e372695c06acbcc82e9f Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Wed, 5 Jun 2019 11:09:24 -0700 Subject: [PATCH 2/3] linting --- dask_cudf/backends.py | 54 +++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 22 deletions(-) diff --git a/dask_cudf/backends.py b/dask_cudf/backends.py index ce9ea63..0dd6bea 100644 --- a/dask_cudf/backends.py +++ b/dask_cudf/backends.py @@ -1,18 +1,27 @@ import numpy as np import pandas as pd +from dask.utils import typename from dask.dataframe.methods import concat_dispatch from dask.dataframe.utils import is_integer_na_dtype, _scalar_from_dtype from dask.dataframe.core import get_parallel_type, meta_nonempty, make_meta import cudf -from cudf.dataframe.index import DatetimeIndex, GenericIndex, CategoricalIndex, StringIndex, RangeIndex +from cudf.dataframe.index import ( + DatetimeIndex, + GenericIndex, + CategoricalIndex, + StringIndex, + RangeIndex, +) from cudf import MultiIndex -from pandas.api.types import (is_categorical_dtype, is_scalar, is_sparse, - is_period_dtype, is_datetime64tz_dtype, - is_interval_dtype) +from pandas.api.types import ( + is_categorical_dtype, + is_period_dtype, + is_datetime64tz_dtype, +) from .core import DataFrame, Series, Index @@ -25,8 +34,7 @@ def meta_nonempty_cudf(x, index=None): idx = _nonempty_index(x.index) - data = {i: _nonempty_series(x.iloc[:, i], idx=idx) - for i, c in enumerate(x.columns)} + data = {i: _nonempty_series(x.iloc[:, i], idx=idx) for i, c in enumerate(x.columns)} res = cudf.DataFrame(data, index=idx) res.columns = x.columns return res @@ -55,7 +63,7 @@ def _nonempty_series(s, idx=None): idx = _nonempty_index(s.index) dtype = s.dtype if is_datetime64tz_dtype(dtype): - entry = pd.Timestamp('1970-01-01', tz=dtype.tz) + entry = pd.Timestamp("1970-01-01", tz=dtype.tz) data = [entry, entry] elif is_categorical_dtype(dtype): if len(s.cat.categories): @@ -64,14 +72,13 @@ def _nonempty_series(s, idx=None): else: data = _nonempty_index(s.cat.categories) cats = None - data = pd.Categorical(data, categories=cats, - ordered=s.cat.ordered) + data = pd.Categorical(data, categories=cats, ordered=s.cat.ordered) elif is_integer_na_dtype(dtype): data = pd.array([1, None], dtype=dtype) elif is_period_dtype(dtype): # pandas 0.24.0+ should infer this to be Series[Period[freq]] freq = dtype.freq - data = [pd.Period('2000', freq), pd.Period('2001', freq)] + data = [pd.Period("2000", freq), pd.Period("2001", freq)] else: entry = _scalar_from_dtype(dtype) data = np.array([entry, entry], dtype=dtype) @@ -87,29 +94,31 @@ def _nonempty_index(idx): elif typ is GenericIndex: return typ([1, 2], name=idx.name) elif typ is StringIndex: - return typ(['a', 'b'], name=idx.name) + return typ(["a", "b"], name=idx.name) elif typ is CategoricalIndex: if len(idx.categories) == 0: - data = pd.Categorical(_nonempty_index(idx.categories), - ordered=idx.ordered) + data = pd.Categorical(_nonempty_index(idx.categories), ordered=idx.ordered) else: data = pd.Categorical.from_codes( - [-1, 0], categories=idx.categories, ordered=idx.ordered) + [-1, 0], categories=idx.categories, ordered=idx.ordered + ) return type(data, name=idx.name) elif typ is DatetimeIndex: - start = '1970-01-01' + start = "1970-01-01" # Need a non-monotonic decreasing index to avoid issues with # partial string indexing see https://github.com/dask/dask/issues/2389 # and https://github.com/pandas-dev/pandas/issues/16515 # This doesn't mean `_meta_nonempty` should ever rely on # `self.monotonic_increasing` or `self.monotonic_decreasing` try: - dates = pd.date_range(start=start, periods=2, freq=idx.freq, - tz=idx.tz, name=idx.name) + dates = pd.date_range( + start=start, periods=2, freq=idx.freq, tz=idx.tz, name=idx.name + ) except ValueError: # older pandas versions - data = [start, '1970-01-02'] if idx.freq is None else None - dates = pd.DatetimeIndex(data, start=start, periods=2, freq=idx.freq, - tz=idx.tz, name=idx.name) + data = [start, "1970-01-02"] if idx.freq is None else None + dates = pd.DatetimeIndex( + data, start=start, periods=2, freq=idx.freq, tz=idx.tz, name=idx.name + ) return type(dates, name=idx.name) elif typ is MultiIndex: levels = [_nonempty_index(l) for l in idx.levels] @@ -119,5 +128,6 @@ def _nonempty_index(idx): except TypeError: # older pandas versions return typ(levels=levels, labels=codes, names=idx.names) - raise TypeError("Don't know how to handle index of " - "type {0}".format(typename(type(idx)))) + raise TypeError( + "Don't know how to handle index of " "type {0}".format(typename(type(idx))) + ) From 3d980467f1dddb92f027a605186c69efb956b7d7 Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Thu, 6 Jun 2019 09:41:37 -0700 Subject: [PATCH 3/3] remove freq/tz access when creating meta objects --- dask_cudf/backends.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/dask_cudf/backends.py b/dask_cudf/backends.py index 0dd6bea..0b4cf27 100644 --- a/dask_cudf/backends.py +++ b/dask_cudf/backends.py @@ -19,7 +19,6 @@ from pandas.api.types import ( is_categorical_dtype, - is_period_dtype, is_datetime64tz_dtype, ) @@ -75,10 +74,6 @@ def _nonempty_series(s, idx=None): data = pd.Categorical(data, categories=cats, ordered=s.cat.ordered) elif is_integer_na_dtype(dtype): data = pd.array([1, None], dtype=dtype) - elif is_period_dtype(dtype): - # pandas 0.24.0+ should infer this to be Series[Period[freq]] - freq = dtype.freq - data = [pd.Period("2000", freq), pd.Period("2001", freq)] else: entry = _scalar_from_dtype(dtype) data = np.array([entry, entry], dtype=dtype) @@ -105,6 +100,9 @@ def _nonempty_index(idx): return type(data, name=idx.name) elif typ is DatetimeIndex: start = "1970-01-01" + freq = None # cudf does not support frequency + tz = None # cudf does not support a timezone + # Need a non-monotonic decreasing index to avoid issues with # partial string indexing see https://github.com/dask/dask/issues/2389 # and https://github.com/pandas-dev/pandas/issues/16515 @@ -112,14 +110,14 @@ def _nonempty_index(idx): # `self.monotonic_increasing` or `self.monotonic_decreasing` try: dates = pd.date_range( - start=start, periods=2, freq=idx.freq, tz=idx.tz, name=idx.name + start=start, periods=2, freq=freq, tz=tz, name=idx.name ) except ValueError: # older pandas versions - data = [start, "1970-01-02"] if idx.freq is None else None + data = None dates = pd.DatetimeIndex( - data, start=start, periods=2, freq=idx.freq, tz=idx.tz, name=idx.name + data, start=start, periods=2, freq=freq, tz=tz, name=idx.name ) - return type(dates, name=idx.name) + return typ(dates, name=idx.name) elif typ is MultiIndex: levels = [_nonempty_index(l) for l in idx.levels] codes = [[0, 0] for i in idx.levels]