-
Notifications
You must be signed in to change notification settings - Fork 70
/
Copy pathtest_imputer_iterators.py
109 lines (88 loc) · 3.78 KB
/
test_imputer_iterators.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not
# use this file except in compliance with the License. A copy of the License
# is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is distributed on
# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
"""
DataWig imputer iterator tests
"""
import itertools
import numpy as np
import pandas as pd
from datawig.column_encoders import (BowEncoder, CategoricalEncoder,
SequentialEncoder)
from datawig.iterators import ImputerIterDf
from datawig.imputer import INSTANCE_WEIGHT_COLUMN
# Column name handed to the feature encoders in the tests below.
feature_col = "features"
# Column name handed to the label encoder (and, in get_new_iterator_df, also
# to the sequential data encoder).
label_col = "labels"
# Value passed as ``max_tokens`` to the encoders built by the helper factories.
max_tokens = 100
# Value passed as ``max_tokens`` to the label encoder in get_new_iterator_df_bow.
num_labels = 10
def test_iter_next_df(data_frame):
    """Check the label array carried by the second batch of the iterator."""
    batches = get_new_iterator_df(data_frame())
    next(batches)  # the first batch is drawn and discarded
    second_batch = next(batches)
    labels = second_batch.label[0].asnumpy()
    print("Array : " + str(labels))
    expected_labels = np.array([[9.], [9.]])
    assert (labels == expected_labels).all()
def test_iter_df_bow(data_frame):
    """Check the first feature vector of the first batch against the vectorizer.

    The first row of the first data array yielded by the iterator is compared
    with the vector obtained by applying the vectorizer of the iterator's
    first data column directly to the ``features`` value of row 0.
    """
    df = data_frame()
    it = get_new_iterator_df_bow(df)
    first_batch = next(it)
    bow = first_batch.data[0].asnumpy()[0, :]
    expected = it.data_columns[0].vectorizer.transform([df.loc[0, 'features']]).toarray()[0]
    # Sum absolute deviations: with a signed sum, positive and negative errors
    # cancel each other and any negative total passes the check trivially.
    assert np.abs(expected - bow).sum() < 1e-5
def test_iter_provide_label_or_data_df(data_frame):
    """Check the names and shapes the iterator reports for its data and labels."""
    iterator = get_new_iterator_df(data_frame())
    # pylint: disable=unsubscriptable-object
    data_descs = iterator.provide_data
    # The first data entry is expected to be the instance-weight column.
    assert data_descs[0][0] == INSTANCE_WEIGHT_COLUMN
    assert data_descs[1][0] == label_col
    assert data_descs[1][1] == (2, 2)

    label_descs = iterator.provide_label
    assert label_descs[0][0] == label_col
    assert label_descs[0][1] == (2, 1)
def test_iter_index_df(data_frame):
    """Check that the batch indices, concatenated in order, equal the frame index."""
    iterator = get_new_iterator_df(data_frame())
    seen_indices = []
    for batch in iterator:
        seen_indices.extend(batch.index)
    expected_indices = data_frame().index.tolist()
    assert seen_indices == expected_indices
def test_iter_decoder_df():
    """Check that decoding the first batch's labels returns the original values."""
    # draw skewed brands
    draws = np.random.exponential(scale=1, size=1000)
    brand_records = [{feature_col: str(int(draw))} for draw in draws]
    brand_df = pd.DataFrame(brand_records)

    sequence_encoder = SequentialEncoder(feature_col, max_tokens=10, seq_len=2)
    category_encoder = CategoricalEncoder(feature_col, max_tokens=100)
    iterator = ImputerIterDf(
        brand_df,
        data_columns=[sequence_encoder],
        label_columns=[category_encoder],
        batch_size=2,
    )

    first_labels = next(iterator).label
    decoded = iterator.decode(first_labels)
    # The first batch should correspond to the first ``batch_size`` rows.
    expected = brand_df[feature_col].head(iterator.batch_size).values
    np.testing.assert_array_equal(decoded[0], expected)
def test_iter_padding_offset():
    """Check ``start_padding_idx`` when the frame has fewer rows than one batch.

    Half of a 36-row frame is sampled and fed to an iterator with batch size
    32; ``start_padding_idx`` must then equal the sampled frame's row count.
    """
    col = 'brand'
    draws = np.random.exponential(scale=1, size=36)
    records = [{col: str(int(draw))} for draw in draws]
    df = pd.DataFrame(records)
    df_train = df.sample(frac=0.5)

    bow_encoder = BowEncoder(col)
    category_encoder = CategoricalEncoder(col, max_tokens=5)
    iterator = ImputerIterDf(
        df_train,
        data_columns=[bow_encoder],
        label_columns=[category_encoder],
        batch_size=32,
    )

    n_train_rows = df_train.shape[0]
    assert iterator.start_padding_idx == n_train_rows
def get_new_iterator_df_bow(df):
    """Build an ``ImputerIterDf`` over *df* with a bag-of-words feature encoder.

    The feature column is encoded with ``BowEncoder`` and the label column
    with ``CategoricalEncoder``; batches contain two rows.
    """
    feature_encoder = BowEncoder(feature_col, max_tokens=max_tokens)
    label_encoder = CategoricalEncoder(label_col, max_tokens=num_labels)
    return ImputerIterDf(
        df,
        data_columns=[feature_encoder],
        label_columns=[label_encoder],
        batch_size=2,
    )
def get_new_iterator_df(df):
    """Build an ``ImputerIterDf`` over *df* that uses the label column twice.

    The label column serves as the data column (via ``SequentialEncoder``
    with sequence length 2) and as the label column (via
    ``CategoricalEncoder``); batches contain two rows.
    """
    sequence_encoder = SequentialEncoder(
        label_col,
        max_tokens=max_tokens,
        seq_len=2,
    )
    label_encoder = CategoricalEncoder(label_col, max_tokens=max_tokens)
    return ImputerIterDf(
        df,
        data_columns=[sequence_encoder],
        label_columns=[label_encoder],
        batch_size=2,
    )