Commit

Merge pull request #1672 from quantopian/narrow-labelarray
narrow labelarray
llllllllll authored Mar 3, 2017
2 parents d140665 + 35338df commit f90cd1c
Showing 4 changed files with 297 additions and 77 deletions.
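
In outline, the change swaps the fixed int64 codes buffers in the string factorizers for the narrowest unsigned integer dtype that can address the number of categories (or, before the categories are known, the number of values), and shrinks again afterwards if the distinct labels turn out to be few. A minimal standalone sketch of the selection rule follows; the helper name is hypothetical, and zipline itself routes this through Cython fused types and unsigned_int_dtype_with_size_in_bytes, as the diff below shows.

import numpy as np

def narrowest_unsigned_dtype(n):
    # Hypothetical helper mirroring the dispatch added in _factorize.pyx:
    # the smallest unsigned dtype whose range covers n distinct codes.
    if n <= 2 ** 8:
        return np.uint8
    elif n <= 2 ** 16:
        return np.uint16
    elif n <= 2 ** 32:
        return np.uint32
    elif n <= 2 ** 64:
        return np.uint64
    raise ValueError('n larger than uint64')

# 256 labels still fit in one byte of code storage; 257 need two.
assert np.dtype(narrowest_unsigned_dtype(256)).itemsize == 1
assert np.dtype(narrowest_unsigned_dtype(257)).itemsize == 2
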
87 changes: 86 additions & 1 deletion tests/test_labelarray.py
@@ -1,10 +1,13 @@
from itertools import product
from operator import eq, ne
import numpy as np
import warnings

import numpy as np
from toolz import take

from zipline.lib.labelarray import LabelArray
from zipline.testing import check_arrays, parameter_space, ZiplineTestCase
from zipline.testing.predicates import assert_equal
from zipline.utils.compat import unicode


@@ -337,3 +340,85 @@ def test_setitem_array(self):
# Write the whole array.
arr[:] = orig_arr
check_arrays(arr, orig_arr)

def test_narrow_code_storage(self):
def check_roundtrip(arr):
assert_equal(
arr.as_string_array(),
LabelArray(
arr.as_string_array(),
arr.missing_value,
).as_string_array(),
)

def create_categories(width, plus_one):
length = int(width / 8) + plus_one
return [
''.join(cs)
for cs in take(
2 ** width + plus_one,
product([chr(c) for c in range(256)], repeat=length),
)
]
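        # Editor's note: create_categories(width, plus_one=False) yields
        # exactly 2 ** width distinct strings of int(width / 8) characters --
        # the most a width-bit unsigned code can address -- while
        # plus_one=True yields one extra string (one character longer),
        # forcing the next wider code dtype.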

# uint8
categories = create_categories(8, plus_one=False)
arr = LabelArray(
[],
missing_value=categories[0],
categories=categories,
)
self.assertEqual(arr.itemsize, 1)
check_roundtrip(arr)

# uint8 inference
arr = LabelArray(categories, missing_value=categories[0])
self.assertEqual(arr.itemsize, 1)
check_roundtrip(arr)

# just over uint8
categories = create_categories(8, plus_one=True)
arr = LabelArray(
[],
missing_value=categories[0],
categories=categories,
)
self.assertEqual(arr.itemsize, 2)
check_roundtrip(arr)

# uint16 inference
arr = LabelArray(categories, missing_value=categories[0])
self.assertEqual(arr.itemsize, 2)
check_roundtrip(arr)

# fits in uint16
categories = create_categories(16, plus_one=False)
arr = LabelArray(
[], missing_value=categories[0],
categories=categories,
)
self.assertEqual(arr.itemsize, 2)
check_roundtrip(arr)

# uint16 inference
arr = LabelArray(categories, missing_value=categories[0])
self.assertEqual(arr.itemsize, 2)
check_roundtrip(arr)

# just over uint16
categories = create_categories(16, plus_one=True)
arr = LabelArray(
[],
missing_value=categories[0],
categories=categories,
)
self.assertEqual(arr.itemsize, 4)
check_roundtrip(arr)

# uint32 inference
arr = LabelArray(categories, missing_value=categories[0])
self.assertEqual(arr.itemsize, 4)
check_roundtrip(arr)

# NOTE: we could do this for 32 and 64; however, no one has enough RAM
# or time for that.
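
The test sits exactly on the width boundaries: 256 categories are the most that one byte of codes can address, 257 force uint16, and likewise at the 16-bit edge. Below is a hedged usage sketch of the same behaviour outside the test harness, assuming LabelArray's constructor, itemsize, and as_string_array behave as exercised above.

from zipline.lib.labelarray import LabelArray

labels = ['%03d' % i for i in range(257)]      # 257 distinct strings

small = LabelArray(labels[:256], missing_value=labels[0])
wide = LabelArray(labels, missing_value=labels[0])

assert small.itemsize == 1                     # 256 categories -> uint8 codes
assert wide.itemsize == 2                      # 257 categories -> uint16 codes
assert list(wide.as_string_array()) == labels  # values survive the round trip
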
233 changes: 169 additions & 64 deletions zipline/lib/_factorize.pyx
@@ -1,85 +1,112 @@
"""
Factorization algorithms.
"""
from numpy cimport ndarray, int64_t, PyArray_Check, import_array
from numpy import arange, asarray, empty, int64, isnan, ndarray, zeros
from libc.math cimport floor, log
cimport numpy as np
import numpy as np

import_array()
from zipline.utils.numpy_utils import unsigned_int_dtype_with_size_in_bytes

np.import_array()

cpdef factorize_strings_known_categories(ndarray[object] values,
list categories,
object missing_value,
int sort):
"""
Factorize an array whose categories are already known.

Any entries not in the specified categories will be given the code for
`missing_value`.
"""
cdef inline double log2(double d):
return log(d) / log(2);


ctypedef fused unsigned_integral:
np.uint8_t
np.uint16_t
np.uint32_t
np.uint64_t


cdef factorize_strings_known_impl(np.ndarray[object] values,
Py_ssize_t nvalues,
list categories,
object missing_value,
bint sort,
np.ndarray[unsigned_integral] codes):
if missing_value not in categories:
categories.insert(0, missing_value)

if sort:
categories = sorted(categories)

cdef:
Py_ssize_t nvalues = len(values)
dict reverse_categories = dict(
zip(categories, range(len(categories)))
)

if not nvalues:
return (
asarray([], dtype=int64),
asarray(categories, dtype=object),
reverse_categories,
)

cdef:
Py_ssize_t i
Py_ssize_t missing_code = reverse_categories[missing_value]
ndarray[int64_t] codes = empty(nvalues, dtype=int64)
cdef dict reverse_categories = dict(
zip(categories, range(len(categories)))
)
cdef Py_ssize_t i
cdef Py_ssize_t missing_code = reverse_categories[missing_value]

for i in range(nvalues):
codes[i] = reverse_categories.get(values[i], missing_code)

return codes, asarray(categories, dtype=object), reverse_categories
return codes, np.asarray(categories, dtype=object), reverse_categories
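
factorize_strings_known_impl is parametrised over the unsigned_integral fused type, so Cython emits one specialisation per code width and the dispatchers below select it with the bracket syntax (factorize_strings_known_impl[np.uint8_t](...)). For orientation only, here is a rough pure-Python sketch of what each specialisation computes; the function name is hypothetical and the real loop runs in typed Cython.

import numpy as np

def factorize_known_sketch(values, categories, missing_value, sort, dtype):
    # Pure-Python analogue; the Cython version mutates the passed list
    # in place, while this sketch copies it.
    categories = list(categories)
    if missing_value not in categories:
        categories.insert(0, missing_value)
    if sort:
        categories = sorted(categories)
    reverse_categories = {c: i for i, c in enumerate(categories)}
    missing_code = reverse_categories[missing_value]
    codes = np.empty(len(values), dtype=dtype)
    for i, v in enumerate(values):
        codes[i] = reverse_categories.get(v, missing_code)
    return codes, np.asarray(categories, dtype=object), reverse_categories
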


cpdef factorize_strings(ndarray[object] values,
                        object missing_value,
                        int sort):
    """
    Factorize an array of (possibly duplicated) labels into an array of indices
    into a unique array of labels.

    This is ~30% faster than pandas.factorize, at the cost of not having
    special treatment for NaN, which we don't care about because we only
    support arrays of strings.

    (Though it's faster even if you throw in the nan checks that pandas does,
    because we're using dict and list instead of PyObjectHashTable and
    ObjectVector. Python's builtin data structures are **really**
    well-optimized.)
    """
cpdef factorize_strings_known_categories(np.ndarray[object] values,
                                         list categories,
                                         object missing_value,
                                         bint sort):
    """
    Factorize an array whose categories are already known.

    Any entries not in the specified categories will be given the code for
    `missing_value`.
    """
cdef:
Py_ssize_t nvalues = len(values)
list categories = [missing_value]
dict reverse_categories = {missing_value: 0}

# Short circuit on empty array.
if not nvalues:
return (
asarray([], dtype=int64),
asarray(categories, dtype=object),
reverse_categories,
cdef Py_ssize_t ncategories = len(categories)
cdef Py_ssize_t nvalues = len(values)
if ncategories <= 2 ** 8:
return factorize_strings_known_impl[np.uint8_t](
values,
nvalues,
categories,
missing_value,
sort,
np.empty(nvalues, dtype=np.uint8)
)
elif ncategories <= 2 ** 16:
return factorize_strings_known_impl[np.uint16_t](
values,
nvalues,
categories,
missing_value,
sort,
np.empty(nvalues, np.uint16),
)
elif ncategories <= 2 ** 32:
return factorize_strings_known_impl[np.uint32_t](
values,
nvalues,
categories,
missing_value,
sort,
np.empty(nvalues, np.uint32),
)
elif ncategories <= 2 ** 64:
return factorize_strings_known_impl[np.uint64_t](
values,
nvalues,
categories,
missing_value,
sort,
np.empty(nvalues, np.uint64),
)
else:
raise ValueError('ncategories larger than uint64')
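
A hedged usage sketch of the dispatcher above, assuming the compiled module imports as zipline.lib._factorize: with 300 known categories the codes come back as uint16, and values outside the category set map to the missing value's code.

import numpy as np
from zipline.lib._factorize import factorize_strings_known_categories

categories = ['label_%04d' % i for i in range(300)]
values = np.array(['label_0001', 'not-a-category', 'label_0299'], dtype=object)

codes, cats, reverse = factorize_strings_known_categories(
    values, list(categories), 'label_0000', 0,
)
assert codes.dtype == np.uint16    # 300 categories exceed the uint8 range
assert list(codes) == [1, 0, 299]  # the unknown value falls back to the missing code
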


cdef:
Py_ssize_t i, code
object key = None
ndarray[int64_t] codes = empty(nvalues, dtype=int64)
cdef factorize_strings_impl(np.ndarray[object] values,
Py_ssize_t nvalues,
object missing_value,
bint sort,
np.ndarray[unsigned_integral] codes):
cdef list categories = [missing_value]
cdef dict reverse_categories = {missing_value: 0}

cdef Py_ssize_t i, code
cdef object key = None

for i in range(nvalues):
key = values[i]
@@ -91,24 +118,102 @@ cpdef factorize_strings(ndarray[object] values,
categories.append(key)
codes[i] = code

cdef ndarray[int64_t, ndim=1] sorter
cdef ndarray[int64_t, ndim=1] reverse_indexer
cdef np.ndarray[np.int64_t, ndim=1] sorter
cdef np.ndarray[unsigned_integral, ndim=1] reverse_indexer
cdef int ncategories
cdef ndarray[object] categories_array = asarray(categories, dtype=object)
cdef np.ndarray[object] categories_array = np.asarray(
categories,
dtype=object,
)

if sort:
# This is all adapted from pandas.core.algorithms.factorize.
ncategories = len(categories_array)
sorter = zeros(ncategories, dtype=int64)
sorter = np.zeros(ncategories, dtype=np.int64)

# Don't include missing_value in the argsort, because None is
# unorderable with bytes/str in py3. Always just sort it to 0.
sorter[1:] = categories_array[1:].argsort() + 1
reverse_indexer = empty(ncategories, dtype=int64)
reverse_indexer.put(sorter, arange(ncategories))
reverse_indexer = np.empty(ncategories, dtype=codes.dtype)
reverse_indexer.put(sorter, np.arange(ncategories))

codes = reverse_indexer.take(codes)
categories_array = categories_array.take(sorter)
reverse_categories = dict(zip(categories_array, range(ncategories)))

return codes, categories_array, reverse_categories
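
The sort branch above is adapted from pandas.core.algorithms.factorize: the categories are argsorted with the missing value pinned to slot 0 (None does not order against str in Python 3), and a reverse indexer translates the codes assigned in first-seen order into the sorted numbering. A small standalone demonstration of that remapping in plain NumPy, with illustrative values only:

import numpy as np

categories = np.asarray([None, 'orange', 'apple'], dtype=object)  # slot 0 = missing
codes = np.array([1, 2, 1], dtype=np.uint8)                       # orange, apple, orange

ncategories = len(categories)
sorter = np.zeros(ncategories, dtype=np.int64)
sorter[1:] = categories[1:].argsort() + 1       # leave the missing value at 0
reverse_indexer = np.empty(ncategories, dtype=codes.dtype)
reverse_indexer.put(sorter, np.arange(ncategories))

codes = reverse_indexer.take(codes)             # -> [2, 1, 2]
categories = categories.take(sorter)            # -> [None, 'apple', 'orange']
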


cpdef factorize_strings(np.ndarray[object] values,
object missing_value,
int sort):
"""
Factorize an array of (possibly duplicated) labels into an array of indices
into a unique array of labels.
This is ~30% faster than pandas.factorize, at the cost of not having
special treatment for NaN, which we don't care about because we only
support arrays of strings.
(Though it's faster even if you throw in the nan checks that pandas does,
because we're using dict and list instead of PyObjectHashTable and
ObjectVector. Python's builtin data structures are **really**
well-optimized.)
"""
cdef Py_ssize_t nvalues = len(values)
cdef np.ndarray codes
cdef np.ndarray categories_array
cdef dict reverse_categories

if nvalues <= 2 ** 8:
# we won't try to shrink because the ``codes`` array cannot get any
# smaller
return factorize_strings_impl[np.uint8_t](
values,
nvalues,
missing_value,
sort,
np.empty(nvalues, dtype=np.uint8)
)
elif nvalues <= 2 ** 16:
(codes,
categories_array,
reverse_categories) = factorize_strings_impl[np.uint16_t](
values,
nvalues,
missing_value,
sort,
np.empty(nvalues, np.uint16),
)
elif nvalues <= 2 ** 32:
(codes,
categories_array,
reverse_categories) = factorize_strings_impl[np.uint32_t](
values,
nvalues,
missing_value,
sort,
np.empty(nvalues, np.uint32),
)
elif nvalues <= 2 ** 64:
(codes,
categories_array,
reverse_categories) = factorize_strings_impl[np.uint64_t](
values,
nvalues,
missing_value,
sort,
np.empty(nvalues, np.uint64),
)
else:
# unreachable
raise ValueError('nvalues larger than uint64')

if len(categories_array) < 2 ** codes.dtype.itemsize:
# if there are a lot of duplicates in the values we may need to shrink
# the width of the ``codes`` array
codes = codes.astype(unsigned_int_dtype_with_size_in_bytes(
floor(log2(len(categories_array))),
))

return codes, categories_array, reverse_categories
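
Because factorize_strings sizes its provisional codes buffer by the number of values, heavy duplication can leave that buffer wider than the category count requires, and the final block above casts it down. A rough illustration of the intent, using a simple threshold helper in place of zipline's unsigned_int_dtype_with_size_in_bytes / log2 arithmetic:

import numpy as np

def shrink_codes(codes, ncategories):
    # Hypothetical helper: cast the provisional codes down to the
    # narrowest unsigned dtype that still represents every category.
    for dtype in (np.uint8, np.uint16, np.uint32, np.uint64):
        if ncategories <= np.iinfo(dtype).max + 1:
            return codes if codes.dtype == dtype else codes.astype(dtype)
    raise ValueError('ncategories larger than uint64')

# 70,000 values but only 300 distinct labels: the buffer sized for the
# values (uint32) narrows to one sized for the categories (uint16).
codes = (np.arange(70000) % 300).astype(np.uint32)
assert shrink_codes(codes, 300).dtype == np.uint16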