-
Notifications
You must be signed in to change notification settings - Fork 6
/
ctr_model.py
129 lines (97 loc) · 4.2 KB
/
ctr_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
"""Runnable script for RTB strategy evaluation"""
import argparse
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression
from rtb.bidding import (FlatBiddingStrategy,
BidSimulator, GoalBiddingStrategy,
EffectiveCPCBiddingStrategy, RandomBiddingStrategy)
def sample_data(df, sample_col, fraction=500):
    """Downsample the negative class of a 0/1 indicator column.

    Every row where ``df[sample_col] == 1`` is kept. Rows where
    ``df[sample_col] == 0`` are sampled uniformly at random without
    replacement; their number is ``fraction`` percent of the number of
    positive rows (the default of 500 keeps five negatives per positive),
    capped at the number of negative rows actually available.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe
    sample_col : str
        Name of the 0/1 indicator column (sampling indicator)
    fraction : int
        Number of negative rows to keep, expressed as a percentage of the
        number of positive rows

    Returns
    -------
    ds : pd.DataFrame
        All positive rows followed by the sampled negative rows
    """
    positives = df[df[sample_col] == 1]
    negatives = df[df[sample_col] == 0]
    # np.random.choice(..., replace=False) raises ValueError when asked for
    # more items than exist, so never request more negatives than there are.
    n_negatives = min(int(len(positives) * (fraction / 100)), len(negatives))
    sampled_labels = np.random.choice(negatives.index,
                                      size=n_negatives,
                                      replace=False)
    return pd.concat([positives, negatives.loc[sampled_labels]])
def preprocess_data(df):
    """Preprocess data for CTR model.

    The input frame is left untouched; a new frame is returned in which

    * the categorical columns listed in ``one_hot_col_names`` are replaced
      by one-hot indicator columns,
    * ``year``, ``month``, ``day`` and ``weekday`` (Monday == 0) are derived
      from the ``timestamp`` column,
    * ``user_agent``, ``ad_slot``, ``ad_slot_id`` and ``timestamp`` are
      removed.

    Parameters
    ----------
    df : pd.DataFrame
        Raw data containing the columns named above; ``timestamp`` values
        must expose ``year``, ``month``, ``day`` and ``weekday()``

    Returns
    -------
    result : pd.DataFrame
        Preprocessed copy of ``df``
    """
    one_hot_col_names = ['ad_slot_visibility',
                         'browser',
                         'ad_exchange',
                         'device',
                         'os',
                         'region_id']
    # Pass ``columns`` explicitly: without it get_dummies only encodes
    # object/category columns, so numerically coded categories would pass
    # through unencoded and then be lost when the originals are dropped.
    one_hot_cols = pd.get_dummies(df[one_hot_col_names],
                                  columns=one_hot_col_names)
    result = pd.concat([df.drop(one_hot_col_names, axis=1), one_hot_cols],
                       axis=1)

    # extract date features
    timestamps = result['timestamp']
    result['year'] = timestamps.apply(lambda ts: ts.year)
    result['month'] = timestamps.apply(lambda ts: ts.month)
    result['day'] = timestamps.apply(lambda ts: ts.day)
    # weekday is a method, not an attribute: it has to be called, otherwise
    # the column ends up holding bound methods instead of integers
    result['weekday'] = timestamps.apply(lambda ts: ts.weekday())

    # ad_slot has only one value,
    # and user agent and timestamp were parsed before
    return result.drop(['user_agent', 'ad_slot', 'ad_slot_id', 'timestamp'],
                       axis=1)
def main():
    """Train a CTR model and compare bidding strategies on held-out data.

    Steps:

    1. parse the command line and load the ``clicks`` table from the input
       HDF file,
    2. downsample non-click rows and preprocess the features,
    3. split the rows once into a training and a held-out part,
    4. cross-validate and fit a logistic-regression CTR model on the
       training part only,
    5. run every bidding strategy through ``BidSimulator`` on the held-out
       part and print its statistics and metrics report.
    """
    parser = argparse.ArgumentParser(
        description="Preprocess IPinYou RTB Dataset")
    # NOTE(review): --verbose is parsed but nothing below reads it.
    parser.add_argument('--verbose',
                        '-v',
                        action='store_true',
                        help="show percentage of data loaded")
    parser.add_argument('--input', '-i', type=str, default="clicks.hdf",
                        help="Input HDF filename")
    args = parser.parse_args()

    print("Loading data...")
    data = pd.read_hdf(args.input, 'clicks')
    data_preproc = preprocess_data(sample_data(data, 'click'))
    # drop all string columns
    string_cols = data_preproc.select_dtypes(include=['object']).columns
    data_preproc = data_preproc.drop(string_cols, axis=1)

    # Split whole rows (features, label and price together) so the model is
    # never fitted on rows that the simulators later replay. Previously the
    # model was fitted on every row and evaluated on those same rows.
    train_data, test_data = train_test_split(data_preproc,
                                             stratify=data_preproc['click'],
                                             test_size=0.33)
    x_train = train_data.drop(['click', 'paying_price'], axis=1)
    y_train = train_data['click']

    ctr_model = LogisticRegression(C=0.01, class_weight='balanced')
    scores = cross_val_score(ctr_model,
                             x_train,
                             y_train,
                             scoring='roc_auc',
                             cv=7, verbose=3)
    print("Mean CV score: %f" % np.mean(scores))
    ctr_model.fit(x_train, y_train)

    # The data-driven strategy only gets the training rows (presumably it
    # derives its parameters from them -- confirm against rtb.bidding);
    # every simulator replays the held-out rows.
    bid_simulators = [
        BidSimulator(test_data, FlatBiddingStrategy(80)),
        BidSimulator(test_data, RandomBiddingStrategy(80)),
        BidSimulator(test_data, GoalBiddingStrategy(80)),
        BidSimulator(test_data, EffectiveCPCBiddingStrategy(train_data)),
    ]
    for bid_simulator in bid_simulators:
        print(bid_simulator)
        stats = bid_simulator.run(ctr_model)
        print(stats)
        print(BidSimulator.metrics_report(stats))
# Run the evaluation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()