BENCH: Add benchmarks for np.loadtxt reading from CSV format

* NumPy currently has no asv benchmarks for np.loadtxt; now adding a number of benchmarks for np.loadtxt handling of CSV format files to prevent performance regressions
sciunto · Aug 9, 2018 · f25b18f · f25b18f
1 parent c0b4340
commit f25b18f
Showing 1 changed file with 177 additions and 0 deletions.
diff --git a/benchmarks/benchmarks/bench_io.py b/benchmarks/benchmarks/bench_io.py
@@ -3,6 +3,7 @@
 from .common import Benchmark, get_squares
 
 import numpy as np
+from io import StringIO
 
 
 class Copy(Benchmark):
@@ -62,3 +63,179 @@ def setup(self):
 
     def time_vb_savez_squares(self):
         np.savez('tmp.npz', self.squares)
+
+class LoadtxtCSVComments(Benchmark):
+    # benchmarks for np.loadtxt comment handling
+    # when reading in CSV files
+
+    params = [10, int(1e2), int(1e4), int(1e5)]
+    param_names = ['num_lines']
+
+    def setup(self, num_lines):
+        data = [u'1,2,3 # comment'] * num_lines
+        # unfortunately, timeit will only run setup()
+        # between repeat events, but not for iterations
+        # within repeats, so the StringIO object
+        # will have to be rewinded in the benchmark proper
+        self.data_comments = StringIO(u'\n'.join(data))
+
+    def time_comment_loadtxt_csv(self, num_lines):
+        # benchmark handling of lines with comments
+        # when loading in from csv files
+
+        # inspired by similar benchmark in pandas
+        # for read_csv
+
+        # need to rewind StringIO object (unfortunately
+        # confounding timing result somewhat) for every
+        # call to timing test proper
+        np.loadtxt(self.data_comments,
+                   delimiter=u',')
+        self.data_comments.seek(0)
+
+class LoadtxtCSVdtypes(Benchmark):
+    # benchmarks for np.loadtxt operating with
+    # different dtypes parsed / cast from CSV files
+
+    params = (['float32', 'float64', 'int32', 'int64',
+               'complex128', 'str', 'object'],
+              [10, int(1e2), int(1e4), int(1e5)])
+    param_names = ['dtype', 'num_lines']
+
+    def setup(self, dtype, num_lines):
+        data = [u'5, 7, 888'] * num_lines
+        self.csv_data = StringIO(u'\n'.join(data))
+
+    def time_loadtxt_dtypes_csv(self, dtype, num_lines):
+        # benchmark loading arrays of various dtypes
+        # from csv files
+
+        # state-dependent timing benchmark requires
+        # rewind of StringIO object
+
+        np.loadtxt(self.csv_data,
+                   delimiter=u',',
+                   dtype=dtype)
+        self.csv_data.seek(0)
+
+class LoadtxtCSVStructured(Benchmark):
+    # benchmarks for np.loadtxt operating with
+    # a structured data type & CSV file
+
+    def setup(self):
+        num_lines = 50000
+        data = [u"M, 21, 72, X, 155"] * num_lines
+        self.csv_data = StringIO(u'\n'.join(data))
+
+    def time_loadtxt_csv_struct_dtype(self):
+        # obligate rewind of StringIO object
+        # between iterations of a repeat:
+
+        np.loadtxt(self.csv_data,
+                   delimiter=u',',
+                   dtype=[('category_1', 'S1'),
+                          ('category_2', 'i4'),
+                          ('category_3', 'f8'),
+                          ('category_4', 'S1'),
+                          ('category_5', 'f8')])
+        self.csv_data.seek(0)
+
+
+class LoadtxtCSVSkipRows(Benchmark):
+    # benchmarks for loadtxt row skipping when
+    # reading in csv file data; a similar benchmark
+    # is present in the pandas asv suite
+
+    params = [0, 500, 10000]
+    param_names = ['skiprows']
+
+    def setup(self, skiprows):
+        np.random.seed(123)
+        test_array = np.random.rand(100000, 3)
+        self.fname = 'test_array.csv'
+        np.savetxt(fname=self.fname,
+                   X=test_array,
+                   delimiter=',')
+
+    def time_skiprows_csv(self, skiprows):
+        np.loadtxt(self.fname,
+                   delimiter=',',
+                   skiprows=skiprows)
+
+class LoadtxtReadUint64Integers(Benchmark):
+    # pandas has a similar CSV reading benchmark
+    # modified to suit np.loadtxt
+
+    params = [550, 1000, 10000]
+    param_names = ['size']
+
+    def setup(self, size):
+        arr = np.arange(size).astype('uint64') + 2**63
+        self.data1 = StringIO(u'\n'.join(arr.astype(str).tolist()))
+        arr = arr.astype(object)
+        arr[500] = -1
+        self.data2 = StringIO(u'\n'.join(arr.astype(str).tolist()))
+
+    def time_read_uint64(self, size):
+        # mandatory rewind of StringIO object
+        # between iterations of a repeat:
+        np.loadtxt(self.data1)
+        self.data1.seek(0)
+
+    def time_read_uint64_neg_values(self, size):
+        # mandatory rewind of StringIO object
+        # between iterations of a repeat:
+        np.loadtxt(self.data2)
+        self.data2.seek(0)
+
+class LoadtxtUseColsCSV(Benchmark):
+    # benchmark selective column reading from CSV files
+    # using np.loadtxt
+
+    params = [2, [1, 3], [1, 3, 5, 7]]
+    param_names = ['usecols']
+
+    def setup(self, usecols):
+        num_lines = 5000
+        data = [u'0, 1, 2, 3, 4, 5, 6, 7, 8, 9'] * num_lines
+        self.csv_data = StringIO(u'\n'.join(data))
+
+    def time_loadtxt_usecols_csv(self, usecols):
+        # must rewind StringIO because of state
+        # dependence of file reading
+        np.loadtxt(self.csv_data,
+                   delimiter=u',',
+                   usecols=usecols)
+        self.csv_data.seek(0)
+
+class LoadtxtCSVDateTime(Benchmark):
+    # benchmarks for np.loadtxt operating with
+    # datetime data in a CSV file
+
+    params = [20, 200, 2000, 20000]
+    param_names = ['num_lines']
+
+    def setup(self, num_lines):
+        # create the equivalent of a two-column CSV file
+        # with date strings in the first column and random
+        # floating point data in the second column
+        dates = np.arange('today', 20, dtype=np.datetime64)
+        np.random.seed(123)
+        values = np.random.rand(20)
+        date_line = u''
+
+        for date, value in zip(dates, values):
+            date_line += (str(date) + ',' + str(value) + '\n')
+
+        # expand data to specified number of lines
+        data = date_line * (num_lines // 20)
+        self.csv_data = StringIO(data)
+
+    def time_loadtxt_csv_datetime(self, num_lines):
+        # rewind StringIO object -- the timing iterations
+        # are state-dependent
+        X = np.loadtxt(self.csv_data,
+                       delimiter=u',',
+                       dtype=([('dates', 'M8[us]'),
+                               ('values', 'float64')]))
+        self.csv_data.seek(0)