Skip to content

Commit

Permalink
[DataFrame] Implement Inter-DataFrame operations (ray-project#1937)
Browse files Browse the repository at this point in the history
  • Loading branch information
devin-petersohn authored and robertnishihara committed Apr 30, 2018
1 parent 34bc6ce commit 0c477fb
Show file tree
Hide file tree
Showing 4 changed files with 628 additions and 319 deletions.
37 changes: 20 additions & 17 deletions python/ray/dataframe/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from __future__ import print_function

import pandas
import numpy as np
from .dataframe import DataFrame
from .utils import _reindex_helper

Expand Down Expand Up @@ -110,21 +111,23 @@ def name_incrementer(i):
# from remote memory built in the previous line. In the future, we won't be
# building new DataFrames, rather just partitioning the DataFrames.
if axis == 0:
new_rows = [_reindex_helper.remote(part, all_columns[i],
final_columns, axis)
for i in range(len(objs))
for part in objs[i]._row_partitions]

return DataFrame(row_partitions=new_rows,
columns=final_columns,
index=final_index)

new_blocks = np.array([_reindex_helper._submit(
args=tuple([all_columns[i], final_columns, axis,
len(objs[0]._block_partitions)] + part.tolist()),
num_return_vals=len(objs[0]._block_partitions))
for i in range(len(objs))
for part in objs[i]._block_partitions])
else:
new_columns = [_reindex_helper.remote(part, all_index[i],
final_index, axis)
for i in range(len(objs))
for part in objs[i]._col_partitions]

return DataFrame(col_partitions=new_columns,
columns=final_columns,
index=final_index)
# Transposing the block partitions is necessary because the remote task
# treats everything as rows and returns its results in row-major order.
# Luckily, this operation is cheap in numpy: `.T` returns a view, not a copy.
new_blocks = np.array([_reindex_helper._submit(
args=tuple([all_index[i], final_index, axis,
len(objs[0]._block_partitions.T)] + part.tolist()),
num_return_vals=len(objs[0]._block_partitions.T))
for i in range(len(objs))
for part in objs[i]._block_partitions.T]).T

return DataFrame(block_partitions=new_blocks,
columns=final_columns,
index=final_index)
Loading

0 comments on commit 0c477fb

Please sign in to comment.