A booster 💪 for your Parquet file sizes.
pip3 install virtual-parquet
or
pip3 install .
A demo can be found at examples/demo.ipynb.
import pandas as pd
import virtual
df = pd.read_csv('file.csv')
...
virtual.to_parquet(df, 'file_virtual.parquet')
% Virtualization finished: Check out 'file_virtual.parquet'.
import virtual
df = virtual.from_parquet('file_virtual.parquet')
import virtual
virtual.query(
'select avg(price) from read_parquet("file_virtual.parquet") where year >= 2024',
engine = 'duckdb'
)
import pandas as pd
import virtual
df = pd.read_csv('file.csv')
functions = virtual.train(df)
% Functions saved under functions.json.
Please do cite our (very) cool work if you use virtual
in your project.
@inproceedings{
virtual,
title={{Lightweight Correlation-Aware Table Compression}},
author={Mihail Stoian and Alexander van Renen and Jan Kobiolka and Ping-Lin Kuo and Josif Grabocka and Andreas Kipf},
booktitle={NeurIPS 2024 Third Table Representation Learning Workshop},
year={2024},
url={https://openreview.net/forum?id=z7eIn3aShi}
}