[DataFrame] Implement IO for ray_df (ray-project#1599)
* Add parquet-cpp to gitignore

* Add read_csv and read_parquet

* Gitignore pytest_cache

* Fix flake8

* Add io to __init__

* Changing Index. Currently running tests, but so far untested.

* Removing issue of reassigning DF in from_pandas

* Fixing lint

* Fix bug

* Fix bug

* Fix bug

* Better performance

* Fixing index issue with sum

* Address comments

* Update io with index

* Updating performance and implementation. Adding tests

* Fixing off-by-1

* Fix lint

* Address Comments

* Make pop compatible with new to_pandas

* Format Code

* Cleanup some index issue

* Bug fix: assigned reset_index back

* Remove unused debug line
simon-mo authored and devin-petersohn committed Feb 27, 2018
1 parent 87e107e commit d78a22f
Showing 5 changed files with 427 additions and 17 deletions.
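The headline addition is the pair of IO entry points exported from ray.dataframe. A minimal usage sketch, assuming read_csv and read_parquet take a file path like their pandas counterparts (the new io.py itself is not shown in this excerpt):

import ray
import ray.dataframe as rdf

ray.init()

# Hypothetical file paths; the exact keyword arguments live in the new io.py.
df = rdf.read_csv("data.csv")
other = rdf.read_parquet("data.parquet")

# Round-trip back to an ordinary pandas DataFrame for inspection.
pdf = rdf.to_pandas(df)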
4 changes: 4 additions & 0 deletions .gitignore
@@ -12,6 +12,7 @@
/src/thirdparty/boost_1_60_0/
/src/thirdparty/catapult/
/src/thirdparty/flatbuffers/
/src/thirdparty/parquet-cpp

# Files generated by flatc should be ignored
/src/common/format/*.py
@@ -137,3 +138,6 @@ build
/site/Gemfile.lock
/site/.sass-cache
/site/_site

# Pytest Cache
**/.pytest_cache
28 changes: 23 additions & 5 deletions python/ray/dataframe/__init__.py
@@ -2,9 +2,27 @@
from __future__ import division
from __future__ import print_function

from .dataframe import DataFrame
from .dataframe import from_pandas
from .dataframe import to_pandas
from .series import Series
DEFAULT_NPARTITIONS = 10

__all__ = ["DataFrame", "from_pandas", "to_pandas", "Series"]

def set_npartition_default(n):
    global DEFAULT_NPARTITIONS
    DEFAULT_NPARTITIONS = n


def get_npartitions():
    return DEFAULT_NPARTITIONS


# We import these files after the two functions above
# because they depend on npartitions.
from .dataframe import DataFrame # noqa: 402
from .dataframe import from_pandas # noqa: 402
from .dataframe import to_pandas # noqa: 402
from .series import Series # noqa: 402
from .io import (read_csv, read_parquet) # noqa: 402

__all__ = [
"DataFrame", "from_pandas", "to_pandas", "Series", "read_csv",
"read_parquet"
]
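
The two module-level helpers added above expose a default partition count; the imports below them are deferred because they depend on it. A small sketch of how they are meant to be called:

import ray.dataframe as rdf

rdf.set_npartition_default(16)      # override the default of 10 partitions
assert rdf.get_npartitions() == 16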
59 changes: 47 additions & 12 deletions python/ray/dataframe/dataframe.py
@@ -373,16 +373,29 @@ def transpose(self, *args, **kwargs):
        temp_index = [idx
                      for _ in range(len(self._df))
                      for idx in self.columns]

        temp_columns = self.index
        local_transpose = self._map_partitions(
            lambda df: df.transpose(*args, **kwargs), index=temp_index)
        local_transpose.columns = temp_columns

        # Sum will collapse the NAs from the groupby
        return local_transpose.reduce_by_index(
        df = local_transpose.reduce_by_index(
            lambda df: df.apply(lambda x: x), axis=1)

        # Reassign the columns within partition to self.index.
        # We have to use _deploy_func instead of _map_partition due to
        # new_labels argument
        def _reassign_columns(df, new_labels):
            df.columns = new_labels
            return df
        df._df = [
            _deploy_func.remote(
                _reassign_columns,
                part,
                self.index) for part in df._df]

        return df

    T = property(transpose)

    def dropna(self, axis, how, thresh=None, subset=[], inplace=False):
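
The column-reassignment step above ships a plain helper to every partition with _deploy_func.remote because, per the comment, _map_partitions cannot pass the extra new_labels argument. A rough stand-alone sketch of the same pattern using bare Ray remote calls (hypothetical partition data, not the project's internal API):

import ray
import pandas as pd

ray.init()

@ray.remote
def reassign_columns(df, new_labels):
    # Runs remotely on a single partition: install new column labels.
    df.columns = new_labels
    return df

parts = [ray.put(pd.DataFrame([[1, 2]])), ray.put(pd.DataFrame([[3, 4]]))]
parts = [reassign_columns.remote(p, ["a", "b"]) for p in parts]
print(ray.get(parts))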
@@ -563,9 +576,15 @@ def count(self, axis=0, level=None, numeric_only=False):
                      for _ in range(len(self._df))
                      for idx in self.columns]

        return sum(ray.get(self._map_partitions(lambda df: df.count(
            axis=axis, level=level, numeric_only=numeric_only
        ), index=temp_index)._df))
        collapsed_df = sum(
            ray.get(
                self._map_partitions(
                    lambda df: df.count(
                        axis=axis,
                        level=level,
                        numeric_only=numeric_only),
                    index=temp_index)._df))
        return collapsed_df

    def cov(self, min_periods=None):
        raise NotImplementedError("Not Yet implemented.")
@@ -865,7 +884,9 @@ def iterrows(self):
        iters = ray.get([
            _deploy_func.remote(
                lambda df: list(df.iterrows()), part) for part in self._df])
        return itertools.chain.from_iterable(iters)
        iters = itertools.chain.from_iterable(iters)
        series = map(lambda idx_series_tuple: idx_series_tuple[1], iters)
        return zip(self.index, series)

    def items(self):
        """Iterator over (column name, Series) pairs.
@@ -884,6 +905,7 @@ def items(self):
        def concat_iters(iterables):
            for partitions in zip(*iterables):
                series = pd.concat([_series for _, _series in partitions])
                series.index = self.index
                yield (series.name, series)

        return concat_iters(iters)
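
The single added line matters because pd.concat keeps each partition's local RangeIndex, so without it the combined Series carries repeated 0..n labels instead of the DataFrame's real index:

import pandas as pd

part1 = pd.Series([1, 2], name="a")
part2 = pd.Series([3, 4], name="a")
combined = pd.concat([part1, part2])
print(list(combined.index))        # [0, 1, 0, 1]  local indices repeat
combined.index = ["r0", "r1", "r2", "r3"]
print(list(combined.index))        # ['r0', 'r1', 'r2', 'r3']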
@@ -919,7 +941,20 @@ def itertuples(self, index=True, name='Pandas'):
            _deploy_func.remote(
                lambda df: list(df.itertuples(index=index, name=name)),
                part) for part in self._df])
        return itertools.chain.from_iterable(iters)
        iters = itertools.chain.from_iterable(iters)

        def _replace_index(row_tuple, idx):
            # We need to use try-except here because
            # isinstance(row_tuple, namedtuple) won't work.
            try:
                row_tuple = row_tuple._replace(Index=idx)
            except AttributeError:  # Tuple not namedtuple
                row_tuple = (idx,) + row_tuple[1:]
            return row_tuple

        if index:
            iters = itertools.starmap(_replace_index, zip(iters, self.index))
        return iters

    def join(self, other, on=None, how='left', lsuffix='', rsuffix='',
             sort=False):
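
The try/except in _replace_index relies on standard namedtuple behaviour: itertuples(index=True) normally yields namedtuples whose Index field _replace can swap out, while name=None yields plain tuples, which raise AttributeError and fall back to tuple slicing. For example:

import pandas as pd

df = pd.DataFrame({"a": [10, 20]})

row = next(df.itertuples(index=True, name="Pandas"))
print(row._replace(Index="r0"))    # Pandas(Index='r0', a=10)

plain = next(df.itertuples(index=True, name=None))
print(("r0",) + plain[1:])         # ('r0', 10)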
@@ -1100,8 +1135,7 @@ def pop(self, item):
        popped = to_pandas(self._map_partitions(
            lambda df: df.pop(item)))
        self._df = self._map_partitions(lambda df: df.drop([item], axis=1))._df
        self.columns = [col for col in self.columns if col != item]

        self.columns = self.columns.drop(item)
        return popped

    def pow(self, other, axis='columns', level=None, fill_value=None):
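
The one-line change in pop swaps a list comprehension for pandas' own Index.drop, which returns a new Index with the given label removed and keeps the result an Index rather than a plain list:

import pandas as pd

cols = pd.Index(["a", "b", "c"])
print(cols.drop("b"))              # Index(['a', 'c'], dtype='object')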
@@ -1949,13 +1983,14 @@ def from_pandas(df, npartitions=None, chunksize=None, sort=True):
    while len(temp_df) > chunksize:
        t_df = temp_df[:chunksize]
        lengths.append(len(t_df))
        # reindex here because we want a pd.RangeIndex within the partitions.
        # It is smaller and sometimes faster.
        t_df.reindex()
        # reset_index here because we want a pd.RangeIndex
        # within the partitions. It is smaller and sometimes faster.
        t_df = t_df.reset_index(drop=True)
        top = ray.put(t_df)
        dataframes.append(top)
        temp_df = temp_df[chunksize:]
    else:
        temp_df = temp_df.reset_index(drop=True)
        dataframes.append(ray.put(temp_df))
        lengths.append(len(temp_df))
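
The fix matters because the old t_df.reindex() returned a new frame that was immediately discarded, leaving each chunk's original index in place; reset_index(drop=True) actually installs a fresh pd.RangeIndex. A pure-pandas sketch of the chunking loop, with ray.put left out:

import pandas as pd

df = pd.DataFrame({"x": range(10)})
chunksize, chunks, lengths = 4, [], []

temp_df = df
while len(temp_df) > chunksize:
    t_df = temp_df[:chunksize].reset_index(drop=True)  # fresh RangeIndex per chunk
    chunks.append(t_df)
    lengths.append(len(t_df))
    temp_df = temp_df[chunksize:]
else:
    chunks.append(temp_df.reset_index(drop=True))
    lengths.append(len(temp_df))

print(lengths)                 # [4, 4, 2]
print(list(chunks[1].index))   # [0, 1, 2, 3]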

(Diffs for the remaining two changed files are not shown in this excerpt.)
