Commit

Merge pull request #1672 from quantopian/narrow-labelarray
narrow labelarray
llllllllll authored Mar 3, 2017
2 parents d140665 + 35338df commit f90cd1c
Showing 4 changed files with 297 additions and 77 deletions.
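
In outline, the change swaps the fixed int64 codes buffers in the string factorizers for the narrowest unsigned integer dtype that can address the number of categories (or, before the categories are known, the number of values), and shrinks again afterwards if the distinct labels turn out to be few. A minimal standalone sketch of the selection rule follows; the helper name is hypothetical, and zipline itself routes this through Cython fused types and unsigned_int_dtype_with_size_in_bytes, as the diff below shows.

import numpy as np

def narrowest_unsigned_dtype(n):
    # Hypothetical helper mirroring the dispatch added in _factorize.pyx:
    # the smallest unsigned dtype whose range covers n distinct codes.
    if n <= 2 ** 8:
        return np.uint8
    elif n <= 2 ** 16:
        return np.uint16
    elif n <= 2 ** 32:
        return np.uint32
    elif n <= 2 ** 64:
        return np.uint64
    raise ValueError('n larger than uint64')

# 256 labels still fit in one byte of code storage; 257 need two.
assert np.dtype(narrowest_unsigned_dtype(256)).itemsize == 1
assert np.dtype(narrowest_unsigned_dtype(257)).itemsize == 2
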
87 changes: 86 additions & 1 deletion tests/test_labelarray.py
@@ -1,10 +1,13 @@
from itertools import product
from operator import eq, ne
import numpy as np
import warnings

import numpy as np
from toolz import take

from zipline.lib.labelarray import LabelArray
from zipline.testing import check_arrays, parameter_space, ZiplineTestCase
from zipline.testing.predicates import assert_equal
from zipline.utils.compat import unicode


@@ -337,3 +340,85 @@ def test_setitem_array(self):
# Write the whole array.
arr[:] = orig_arr
check_arrays(arr, orig_arr)

def test_narrow_code_storage(self):
def check_roundtrip(arr):
assert_equal(
arr.as_string_array(),
LabelArray(
arr.as_string_array(),
arr.missing_value,
).as_string_array(),
)

def create_categories(width, plus_one):
length = int(width / 8) + plus_one
return [
''.join(cs)
for cs in take(
2 ** width + plus_one,
product([chr(c) for c in range(256)], repeat=length),
)
]
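        # Editor's note: create_categories(width, plus_one=False) yields
        # exactly 2 ** width distinct strings of int(width / 8) characters --
        # the most a width-bit unsigned code can address -- while
        # plus_one=True yields one extra string (one character longer),
        # forcing the next wider code dtype.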

# uint8
categories = create_categories(8, plus_one=False)
arr = LabelArray(
[],
missing_value=categories[0],
categories=categories,
)
self.assertEqual(arr.itemsize, 1)
check_roundtrip(arr)

# uint8 inference
arr = LabelArray(categories, missing_value=categories[0])
self.assertEqual(arr.itemsize, 1)
check_roundtrip(arr)

# just over uint8
categories = create_categories(8, plus_one=True)
arr = LabelArray(
[],
missing_value=categories[0],
categories=categories,
)
self.assertEqual(arr.itemsize, 2)
check_roundtrip(arr)

# uint16 inference
arr = LabelArray(categories, missing_value=categories[0])
self.assertEqual(arr.itemsize, 2)
check_roundtrip(arr)

# fits in uint16
categories = create_categories(16, plus_one=False)
arr = LabelArray(
[], missing_value=categories[0],
categories=categories,
)
self.assertEqual(arr.itemsize, 2)
check_roundtrip(arr)

# uint16 inference
arr = LabelArray(categories, missing_value=categories[0])
self.assertEqual(arr.itemsize, 2)
check_roundtrip(arr)

# just over uint16
categories = create_categories(16, plus_one=True)
arr = LabelArray(
[],
missing_value=categories[0],
categories=categories,
)
self.assertEqual(arr.itemsize, 4)
check_roundtrip(arr)

# uint32 inference
arr = LabelArray(categories, missing_value=categories[0])
self.assertEqual(arr.itemsize, 4)
check_roundtrip(arr)

# NOTE: we could do this for 32 and 64; however, no one has enough RAM
# or time for that.
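
The test sits exactly on the width boundaries: 256 categories are the most that one byte of codes can address, 257 force uint16, and likewise at the 16-bit edge. Below is a hedged usage sketch of the same behaviour outside the test harness, assuming LabelArray's constructor, itemsize, and as_string_array behave as exercised above.

from zipline.lib.labelarray import LabelArray

labels = ['%03d' % i for i in range(257)]      # 257 distinct strings

small = LabelArray(labels[:256], missing_value=labels[0])
wide = LabelArray(labels, missing_value=labels[0])

assert small.itemsize == 1                     # 256 categories -> uint8 codes
assert wide.itemsize == 2                      # 257 categories -> uint16 codes
assert list(wide.as_string_array()) == labels  # values survive the round trip
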
233 changes: 169 additions & 64 deletions zipline/lib/_factorize.pyx
@@ -1,85 +1,112 @@
"""
Factorization algorithms.
"""
from numpy cimport ndarray, int64_t, PyArray_Check, import_array
from numpy import arange, asarray, empty, int64, isnan, ndarray, zeros
from libc.math cimport floor, log
cimport numpy as np
import numpy as np

import_array()
from zipline.utils.numpy_utils import unsigned_int_dtype_with_size_in_bytes

np.import_array()

cpdef factorize_strings_known_categories(ndarray[object] values,
list categories,
object missing_value,
int sort):
"""
Factorize an array whose categories are already known.

Any entries not in the specified categories will be given the code for
`missing_value`.
"""
cdef inline double log2(double d):
return log(d) / log(2);


ctypedef fused unsigned_integral:
np.uint8_t
np.uint16_t
np.uint32_t
np.uint64_t


cdef factorize_strings_known_impl(np.ndarray[object] values,
Py_ssize_t nvalues,
list categories,
object missing_value,
bint sort,
np.ndarray[unsigned_integral] codes):
if missing_value not in categories:
categories.insert(0, missing_value)

if sort:
categories = sorted(categories)

cdef:
Py_ssize_t nvalues = len(values)
dict reverse_categories = dict(
zip(categories, range(len(categories)))
)

if not nvalues:
return (
asarray([], dtype=int64),
asarray(categories, dtype=object),
reverse_categories,
)

cdef:
Py_ssize_t i
Py_ssize_t missing_code = reverse_categories[missing_value]
ndarray[int64_t] codes = empty(nvalues, dtype=int64)
cdef dict reverse_categories = dict(
zip(categories, range(len(categories)))
)
cdef Py_ssize_t i
cdef Py_ssize_t missing_code = reverse_categories[missing_value]

for i in range(nvalues):
codes[i] = reverse_categories.get(values[i], missing_code)

return codes, asarray(categories, dtype=object), reverse_categories
return codes, np.asarray(categories, dtype=object), reverse_categories
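
factorize_strings_known_impl is parametrised over the unsigned_integral fused type, so Cython emits one specialisation per code width and the dispatchers below select it with the bracket syntax (factorize_strings_known_impl[np.uint8_t](...)). For orientation only, here is a rough pure-Python sketch of what each specialisation computes; the function name is hypothetical and the real loop runs in typed Cython.

import numpy as np

def factorize_known_sketch(values, categories, missing_value, sort, dtype):
    # Pure-Python analogue; the Cython version mutates the passed list
    # in place, while this sketch copies it.
    categories = list(categories)
    if missing_value not in categories:
        categories.insert(0, missing_value)
    if sort:
        categories = sorted(categories)
    reverse_categories = {c: i for i, c in enumerate(categories)}
    missing_code = reverse_categories[missing_value]
    codes = np.empty(len(values), dtype=dtype)
    for i, v in enumerate(values):
        codes[i] = reverse_categories.get(v, missing_code)
    return codes, np.asarray(categories, dtype=object), reverse_categories
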


cpdef factorize_strings(ndarray[object] values,
                        object missing_value,
                        int sort):
    """
    Factorize an array of (possibly duplicated) labels into an array of indices
    into a unique array of labels.

    This is ~30% faster than pandas.factorize, at the cost of not having
    special treatment for NaN, which we don't care about because we only
    support arrays of strings.

    (Though it's faster even if you throw in the nan checks that pandas does,
    because we're using dict and list instead of PyObjectHashTable and
    ObjectVector. Python's builtin data structures are **really**
    well-optimized.)
    """
cpdef factorize_strings_known_categories(np.ndarray[object] values,
                                         list categories,
                                         object missing_value,
                                         bint sort):
    """
    Factorize an array whose categories are already known.

    Any entries not in the specified categories will be given the code for
    `missing_value`.
    """
cdef:
Py_ssize_t nvalues = len(values)
list categories = [missing_value]
dict reverse_categories = {missing_value: 0}

# Short circuit on empty array.
if not nvalues:
return (
asarray([], dtype=int64),
asarray(categories, dtype=object),
reverse_categories,
cdef Py_ssize_t ncategories = len(categories)
cdef Py_ssize_t nvalues = len(values)
if ncategories <= 2 ** 8:
return factorize_strings_known_impl[np.uint8_t](
values,
nvalues,
categories,
missing_value,
sort,
np.empty(nvalues, dtype=np.uint8)
)
elif ncategories <= 2 ** 16:
return factorize_strings_known_impl[np.uint16_t](
values,
nvalues,
categories,
missing_value,
sort,
np.empty(nvalues, np.uint16),
)
elif ncategories <= 2 ** 32:
return factorize_strings_known_impl[np.uint32_t](
values,
nvalues,
categories,
missing_value,
sort,
np.empty(nvalues, np.uint32),
)
elif ncategories <= 2 ** 64:
return factorize_strings_known_impl[np.uint64_t](
values,
nvalues,
categories,
missing_value,
sort,
np.empty(nvalues, np.uint64),
)
else:
raise ValueError('ncategories larger than uint64')
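
A hedged usage sketch of the dispatcher above, assuming the compiled module imports as zipline.lib._factorize: with 300 known categories the codes come back as uint16, and values outside the category set map to the missing value's code.

import numpy as np
from zipline.lib._factorize import factorize_strings_known_categories

categories = ['label_%04d' % i for i in range(300)]
values = np.array(['label_0001', 'not-a-category', 'label_0299'], dtype=object)

codes, cats, reverse = factorize_strings_known_categories(
    values, list(categories), 'label_0000', 0,
)
assert codes.dtype == np.uint16    # 300 categories exceed the uint8 range
assert list(codes) == [1, 0, 299]  # the unknown value falls back to the missing code
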


cdef:
Py_ssize_t i, code
object key = None
ndarray[int64_t] codes = empty(nvalues, dtype=int64)
cdef factorize_strings_impl(np.ndarray[object] values,
Py_ssize_t nvalues,
object missing_value,
bint sort,
np.ndarray[unsigned_integral] codes):
cdef list categories = [missing_value]
cdef dict reverse_categories = {missing_value: 0}

cdef Py_ssize_t i, code
cdef object key = None

for i in range(nvalues):
key = values[i]
@@ -91,24 +118,102 @@ cpdef factorize_strings(ndarray[object] values,
categories.append(key)
codes[i] = code

cdef ndarray[int64_t, ndim=1] sorter
cdef ndarray[int64_t, ndim=1] reverse_indexer
cdef np.ndarray[np.int64_t, ndim=1] sorter
cdef np.ndarray[unsigned_integral, ndim=1] reverse_indexer
cdef int ncategories
cdef ndarray[object] categories_array = asarray(categories, dtype=object)
cdef np.ndarray[object] categories_array = np.asarray(
categories,
dtype=object,
)

if sort:
# This is all adapted from pandas.core.algorithms.factorize.
ncategories = len(categories_array)
sorter = zeros(ncategories, dtype=int64)
sorter = np.zeros(ncategories, dtype=np.int64)

# Don't include missing_value in the argsort, because None is
# unorderable with bytes/str in py3. Always just sort it to 0.
sorter[1:] = categories_array[1:].argsort() + 1
reverse_indexer = empty(ncategories, dtype=int64)
reverse_indexer.put(sorter, arange(ncategories))
reverse_indexer = np.empty(ncategories, dtype=codes.dtype)
reverse_indexer.put(sorter, np.arange(ncategories))

codes = reverse_indexer.take(codes)
categories_array = categories_array.take(sorter)
reverse_categories = dict(zip(categories_array, range(ncategories)))

return codes, categories_array, reverse_categories
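
The sort branch above is adapted from pandas.core.algorithms.factorize: the categories are argsorted with the missing value pinned to slot 0 (None does not order against str in Python 3), and a reverse indexer translates the codes assigned in first-seen order into the sorted numbering. A small standalone demonstration of that remapping in plain NumPy, with illustrative values only:

import numpy as np

categories = np.asarray([None, 'orange', 'apple'], dtype=object)  # slot 0 = missing
codes = np.array([1, 2, 1], dtype=np.uint8)                       # orange, apple, orange

ncategories = len(categories)
sorter = np.zeros(ncategories, dtype=np.int64)
sorter[1:] = categories[1:].argsort() + 1       # leave the missing value at 0
reverse_indexer = np.empty(ncategories, dtype=codes.dtype)
reverse_indexer.put(sorter, np.arange(ncategories))

codes = reverse_indexer.take(codes)             # -> [2, 1, 2]
categories = categories.take(sorter)            # -> [None, 'apple', 'orange']
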


cpdef factorize_strings(np.ndarray[object] values,
object missing_value,
int sort):
"""
Factorize an array of (possibly duplicated) labels into an array of indices
into a unique array of labels.
This is ~30% faster than pandas.factorize, at the cost of not having
special treatment for NaN, which we don't care about because we only
support arrays of strings.
(Though it's faster even if you throw in the nan checks that pandas does,
because we're using dict and list instead of PyObjectHashTable and
ObjectVector. Python's builtin data structures are **really**
well-optimized.)
"""
cdef Py_ssize_t nvalues = len(values)
cdef np.ndarray codes
cdef np.ndarray categories_array
cdef dict reverse_categories

if nvalues <= 2 ** 8:
# we won't try to shrink because the ``codes`` array cannot get any
# smaller
return factorize_strings_impl[np.uint8_t](
values,
nvalues,
missing_value,
sort,
np.empty(nvalues, dtype=np.uint8)
)
elif nvalues <= 2 ** 16:
(codes,
categories_array,
reverse_categories) = factorize_strings_impl[np.uint16_t](
values,
nvalues,
missing_value,
sort,
np.empty(nvalues, np.uint16),
)
elif nvalues <= 2 ** 32:
(codes,
categories_array,
reverse_categories) = factorize_strings_impl[np.uint32_t](
values,
nvalues,
missing_value,
sort,
np.empty(nvalues, np.uint32),
)
elif nvalues <= 2 ** 64:
(codes,
categories_array,
reverse_categories) = factorize_strings_impl[np.uint64_t](
values,
nvalues,
missing_value,
sort,
np.empty(nvalues, np.uint64),
)
else:
# unreachable
raise ValueError('nvalues larger than uint64')

if len(categories_array) < 2 ** codes.dtype.itemsize:
# if there are a lot of duplicates in the values we may need to shrink
# the width of the ``codes`` array
codes = codes.astype(unsigned_int_dtype_with_size_in_bytes(
floor(log2(len(categories_array))),
))

return codes, categories_array, reverse_categories
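
Because factorize_strings sizes its provisional codes buffer by the number of values, heavy duplication can leave that buffer wider than the category count requires, and the final block above casts it down. A rough illustration of the intent, using a simple threshold helper in place of zipline's unsigned_int_dtype_with_size_in_bytes / log2 arithmetic:

import numpy as np

def shrink_codes(codes, ncategories):
    # Hypothetical helper: cast the provisional codes down to the
    # narrowest unsigned dtype that still represents every category.
    for dtype in (np.uint8, np.uint16, np.uint32, np.uint64):
        if ncategories <= np.iinfo(dtype).max + 1:
            return codes if codes.dtype == dtype else codes.astype(dtype)
    raise ValueError('ncategories larger than uint64')

# 70,000 values but only 300 distinct labels: the buffer sized for the
# values (uint32) narrows to one sized for the categories (uint16).
codes = (np.arange(70000) % 300).astype(np.uint32)
assert shrink_codes(codes, 300).dtype == np.uint16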