forked from pandas-dev/pandas
-
Notifications
You must be signed in to change notification settings - Fork 0
/
better_unique.py
77 lines (55 loc) · 1.99 KB
/
better_unique.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
from pandas import DataFrame
import timeit
setup = """
from pandas import Series
import pandas._tseries as _tseries
import random
import numpy as np
def better_unique(values):
uniques = _tseries.fast_unique(values)
id_map = _tseries.map_indices_buf(uniques)
labels = _tseries.get_unique_labels(values, id_map)
return uniques, labels
tot = 100000
def get_test_data(ngroups=100, n=tot):
unique_groups = range(ngroups)
random.shuffle(unique_groups)
arr = np.asarray(np.tile(unique_groups, n / ngroups), dtype=object)
if len(arr) < n:
arr = np.asarray(list(arr) + unique_groups[:n - len(arr)],
dtype=object)
return arr
arr = get_test_data(ngroups=%d)
"""
group_sizes = [10, 100, 1000, 10000,
20000, 30000, 40000,
50000, 60000, 70000,
80000, 90000, 100000]
numbers = [100, 100, 50] + [10] * 10
numpy = []
wes = []
for sz, n in zip(group_sizes, numbers):
# wes_timer = timeit.Timer(stmt='better_unique(arr)',
# setup=setup % sz)
wes_timer = timeit.Timer(stmt='_tseries.fast_unique(arr)',
setup=setup % sz)
numpy_timer = timeit.Timer(stmt='np.unique(arr)',
setup=setup % sz)
print n
numpy_result = numpy_timer.timeit(number=n) / n
wes_result = wes_timer.timeit(number=n) / n
print 'Groups: %d, NumPy: %s, Wes: %s' % (sz, numpy_result, wes_result)
wes.append(wes_result)
numpy.append(numpy_result)
result = DataFrame({'wes': wes, 'numpy': numpy}, index=group_sizes)
def make_plot(numpy, wes):
pass
# def get_test_data(ngroups=100, n=100000):
# unique_groups = range(ngroups)
# random.shuffle(unique_groups)
# arr = np.asarray(np.tile(unique_groups, n / ngroups), dtype=object)
# if len(arr) < n:
# arr = np.asarray(list(arr) + unique_groups[:n - len(arr)],
# dtype=object)
# return arr
# arr = get_test_data(ngroups=1000)