-
Notifications
You must be signed in to change notification settings - Fork 2.8k
/
Copy pathbenchmark_iterating.py
98 lines (79 loc) · 3.68 KB
/
benchmark_iterating.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import json
import os
import tempfile
import datasets
from utils import generate_example_dataset, get_duration
SPEED_TEST_N_EXAMPLES = 50_000
SMALL_TEST = 5_000
RESULTS_BASEPATH, RESULTS_FILENAME = os.path.split(__file__)
RESULTS_FILE_PATH = os.path.join(RESULTS_BASEPATH, "results", RESULTS_FILENAME.replace(".py", ".json"))
@get_duration
def read(dataset: datasets.Dataset, length):
for i in range(length):
_ = dataset[i]
@get_duration
def read_batch(dataset: datasets.Dataset, length, batch_size):
for i in range(0, len(dataset), batch_size):
_ = dataset[i : i + batch_size]
@get_duration
def read_formatted(dataset: datasets.Dataset, length, type):
with dataset.formatted_as(type=type):
for i in range(length):
_ = dataset[i]
@get_duration
def read_formatted_batch(dataset: datasets.Dataset, length, batch_size, type):
with dataset.formatted_as(type=type):
for i in range(0, length, batch_size):
_ = dataset[i : i + batch_size]
def benchmark_iterating():
times = {"num examples": SPEED_TEST_N_EXAMPLES}
functions = [
(read, {"length": SMALL_TEST}),
(read, {"length": SPEED_TEST_N_EXAMPLES}),
(read_batch, {"length": SPEED_TEST_N_EXAMPLES, "batch_size": 10}),
(read_batch, {"length": SPEED_TEST_N_EXAMPLES, "batch_size": 100}),
(read_batch, {"length": SPEED_TEST_N_EXAMPLES, "batch_size": 1_000}),
(read_formatted, {"type": "numpy", "length": SMALL_TEST}),
(read_formatted, {"type": "pandas", "length": SMALL_TEST}),
(read_formatted, {"type": "torch", "length": SMALL_TEST}),
(read_formatted, {"type": "tensorflow", "length": SMALL_TEST}),
(read_formatted_batch, {"type": "numpy", "length": SMALL_TEST, "batch_size": 10}),
(read_formatted_batch, {"type": "numpy", "length": SMALL_TEST, "batch_size": 1_000}),
]
functions_shuffled = [
(read, {"length": SMALL_TEST}),
(read, {"length": SPEED_TEST_N_EXAMPLES}),
(read_batch, {"length": SPEED_TEST_N_EXAMPLES, "batch_size": 10}),
(read_batch, {"length": SPEED_TEST_N_EXAMPLES, "batch_size": 100}),
(read_batch, {"length": SPEED_TEST_N_EXAMPLES, "batch_size": 1_000}),
(read_formatted, {"type": "numpy", "length": SMALL_TEST}),
(read_formatted_batch, {"type": "numpy", "length": SMALL_TEST, "batch_size": 10}),
(read_formatted_batch, {"type": "numpy", "length": SMALL_TEST, "batch_size": 1_000}),
]
with tempfile.TemporaryDirectory() as tmp_dir:
print("generating dataset")
features = datasets.Features(
{"list": datasets.Sequence(datasets.Value("float32")), "numbers": datasets.Value("float32")}
)
dataset = generate_example_dataset(
os.path.join(tmp_dir, "dataset.arrow"),
features,
num_examples=SPEED_TEST_N_EXAMPLES,
seq_shapes={"list": (100,)},
)
print("first set of iterations")
for func, kwargs in functions:
print(func.__name__, str(kwargs))
times[func.__name__ + " " + " ".join(str(v) for v in kwargs.values())] = func(dataset, **kwargs)
print("shuffling dataset")
dataset = dataset.shuffle()
print("Second set of iterations (after shuffling")
for func, kwargs in functions_shuffled:
print("shuffled ", func.__name__, str(kwargs))
times["shuffled " + func.__name__ + " " + " ".join(str(v) for v in kwargs.values())] = func(
dataset, **kwargs
)
with open(RESULTS_FILE_PATH, "wb") as f:
f.write(json.dumps(times).encode("utf-8"))
if __name__ == "__main__": # useful to run the profiler
benchmark_iterating()