Skip to content

Commit

Permalink
BENCH: Add benchmarks for np.loadtxt reading from CSV format
Browse files Browse the repository at this point in the history
* NumPy currently has no asv benchmarks for np.loadtxt; now adding
a number of benchmarks for np.loadtxt handling of CSV format files
to prevent performance regressions
  • Loading branch information
tylerjereddy committed Aug 9, 2018
1 parent c0b4340 commit f25b18f
Showing 1 changed file with 177 additions and 0 deletions.
177 changes: 177 additions & 0 deletions benchmarks/benchmarks/bench_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from .common import Benchmark, get_squares

import numpy as np
from io import StringIO


class Copy(Benchmark):
Expand Down Expand Up @@ -62,3 +63,179 @@ def setup(self):

def time_vb_savez_squares(self):
np.savez('tmp.npz', self.squares)

class LoadtxtCSVComments(Benchmark):
# benchmarks for np.loadtxt comment handling
# when reading in CSV files

params = [10, int(1e2), int(1e4), int(1e5)]
param_names = ['num_lines']

def setup(self, num_lines):
data = [u'1,2,3 # comment'] * num_lines
# unfortunately, timeit will only run setup()
# between repeat events, but not for iterations
# within repeats, so the StringIO object
# will have to be rewinded in the benchmark proper
self.data_comments = StringIO(u'\n'.join(data))

def time_comment_loadtxt_csv(self, num_lines):
# benchmark handling of lines with comments
# when loading in from csv files

# inspired by similar benchmark in pandas
# for read_csv

# need to rewind StringIO object (unfortunately
# confounding timing result somewhat) for every
# call to timing test proper
np.loadtxt(self.data_comments,
delimiter=u',')
self.data_comments.seek(0)

class LoadtxtCSVdtypes(Benchmark):
# benchmarks for np.loadtxt operating with
# different dtypes parsed / cast from CSV files

params = (['float32', 'float64', 'int32', 'int64',
'complex128', 'str', 'object'],
[10, int(1e2), int(1e4), int(1e5)])
param_names = ['dtype', 'num_lines']

def setup(self, dtype, num_lines):
data = [u'5, 7, 888'] * num_lines
self.csv_data = StringIO(u'\n'.join(data))

def time_loadtxt_dtypes_csv(self, dtype, num_lines):
# benchmark loading arrays of various dtypes
# from csv files

# state-dependent timing benchmark requires
# rewind of StringIO object

np.loadtxt(self.csv_data,
delimiter=u',',
dtype=dtype)
self.csv_data.seek(0)

class LoadtxtCSVStructured(Benchmark):
# benchmarks for np.loadtxt operating with
# a structured data type & CSV file

def setup(self):
num_lines = 50000
data = [u"M, 21, 72, X, 155"] * num_lines
self.csv_data = StringIO(u'\n'.join(data))

def time_loadtxt_csv_struct_dtype(self):
# obligate rewind of StringIO object
# between iterations of a repeat:

np.loadtxt(self.csv_data,
delimiter=u',',
dtype=[('category_1', 'S1'),
('category_2', 'i4'),
('category_3', 'f8'),
('category_4', 'S1'),
('category_5', 'f8')])
self.csv_data.seek(0)


class LoadtxtCSVSkipRows(Benchmark):
# benchmarks for loadtxt row skipping when
# reading in csv file data; a similar benchmark
# is present in the pandas asv suite

params = [0, 500, 10000]
param_names = ['skiprows']

def setup(self, skiprows):
np.random.seed(123)
test_array = np.random.rand(100000, 3)
self.fname = 'test_array.csv'
np.savetxt(fname=self.fname,
X=test_array,
delimiter=',')

def time_skiprows_csv(self, skiprows):
np.loadtxt(self.fname,
delimiter=',',
skiprows=skiprows)

class LoadtxtReadUint64Integers(Benchmark):
# pandas has a similar CSV reading benchmark
# modified to suit np.loadtxt

params = [550, 1000, 10000]
param_names = ['size']

def setup(self, size):
arr = np.arange(size).astype('uint64') + 2**63
self.data1 = StringIO(u'\n'.join(arr.astype(str).tolist()))
arr = arr.astype(object)
arr[500] = -1
self.data2 = StringIO(u'\n'.join(arr.astype(str).tolist()))

def time_read_uint64(self, size):
# mandatory rewind of StringIO object
# between iterations of a repeat:
np.loadtxt(self.data1)
self.data1.seek(0)

def time_read_uint64_neg_values(self, size):
# mandatory rewind of StringIO object
# between iterations of a repeat:
np.loadtxt(self.data2)
self.data2.seek(0)

class LoadtxtUseColsCSV(Benchmark):
# benchmark selective column reading from CSV files
# using np.loadtxt

params = [2, [1, 3], [1, 3, 5, 7]]
param_names = ['usecols']

def setup(self, usecols):
num_lines = 5000
data = [u'0, 1, 2, 3, 4, 5, 6, 7, 8, 9'] * num_lines
self.csv_data = StringIO(u'\n'.join(data))

def time_loadtxt_usecols_csv(self, usecols):
# must rewind StringIO because of state
# dependence of file reading
np.loadtxt(self.csv_data,
delimiter=u',',
usecols=usecols)
self.csv_data.seek(0)

class LoadtxtCSVDateTime(Benchmark):
# benchmarks for np.loadtxt operating with
# datetime data in a CSV file

params = [20, 200, 2000, 20000]
param_names = ['num_lines']

def setup(self, num_lines):
# create the equivalent of a two-column CSV file
# with date strings in the first column and random
# floating point data in the second column
dates = np.arange('today', 20, dtype=np.datetime64)
np.random.seed(123)
values = np.random.rand(20)
date_line = u''

for date, value in zip(dates, values):
date_line += (str(date) + ',' + str(value) + '\n')

# expand data to specified number of lines
data = date_line * (num_lines // 20)
self.csv_data = StringIO(data)

def time_loadtxt_csv_datetime(self, num_lines):
# rewind StringIO object -- the timing iterations
# are state-dependent
X = np.loadtxt(self.csv_data,
delimiter=u',',
dtype=([('dates', 'M8[us]'),
('values', 'float64')]))
self.csv_data.seek(0)

0 comments on commit f25b18f

Please sign in to comment.