From ea05559dc748eafe7de4f3ea39401b2d3f21bb4e Mon Sep 17 00:00:00 2001
From: Chathura Widanage <7312649+chathurawidanage@users.noreply.github.com>
Date: Wed, 12 May 2021 10:07:30 -0400
Subject: [PATCH] DOC: Adding Cylon under ecosystem/out of core (#41402)

---
 doc/source/ecosystem.rst | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst
index d53d0556dca04..bc2325f15852c 100644
--- a/doc/source/ecosystem.rst
+++ b/doc/source/ecosystem.rst
@@ -405,6 +405,35 @@ Blaze provides a standard API for doing computations with various in-memory
 and on-disk backends: NumPy, pandas, SQLAlchemy, MongoDB, PyTables, PySpark.
 
+`Cylon <https://cylondata.org/>`__
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Cylon is a fast, scalable, distributed memory parallel runtime with a pandas-like
+Python DataFrame API. "Core Cylon" is implemented with C++ using Apache
+Arrow format to represent the data in-memory. Cylon DataFrame API implements
+most of the core operators of pandas such as merge, filter, join, concat,
+group-by, drop_duplicates, etc. These operators are designed to work across
+thousands of cores to scale applications. It can interoperate with pandas
+DataFrame by reading data from pandas or converting data to pandas so users
+can selectively scale parts of their pandas DataFrame applications.
+
+.. code:: python
+
+    from pycylon import read_csv, DataFrame, CylonEnv
+    from pycylon.net import MPIConfig
+
+    # Initialize Cylon distributed environment
+    config: MPIConfig = MPIConfig()
+    env: CylonEnv = CylonEnv(config=config, distributed=True)
+
+    df1: DataFrame = read_csv('/tmp/csv1.csv')
+    df2: DataFrame = read_csv('/tmp/csv2.csv')
+
+    # Using 1000s of cores across the cluster to compute the join
+    df3: DataFrame = df1.join(other=df2, on=[0], algorithm="hash", env=env)
+
+    print(df3)
+
 `Dask <https://dask.org/>`__
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~