|
| 1 | +import pandas as pd |
| 2 | +from sklearn.ensemble import RandomForestRegressor |
| 3 | +from sklearn.ensemble import BaggingRegressor |
| 4 | +from sklearn.ensemble import ExtraTreesRegressor |
| 5 | +from sklearn.ensemble import AdaBoostRegressor |
| 6 | +from sklearn.ensemble import GradientBoostingRegressor |
| 7 | +from sklearn.ensemble import RandomTreesEmbedding |
| 8 | +from sklearn.neural_network import MLPRegressor |
| 9 | +from sklearn.linear_model import ElasticNet |
| 10 | +from sklearn.gaussian_process import GaussianProcessRegressor |
| 11 | +from sklearn.model_selection import train_test_split |
| 12 | +from sklearn.preprocessing import LabelEncoder |
| 13 | +from sklearn.preprocessing import Imputer |
| 14 | +from sklearn import metrics |
| 15 | +import numpy as np |
| 16 | + |
| 17 | + |
def get_gaussian_process_regressor():
    """Build a default-configured Gaussian Process regressor.

    Returns:
        tuple: ([model], [display_name]) — parallel one-element lists, matching
        the (models, names) convention used by the other factory helpers.
    """
    model = GaussianProcessRegressor()
    return [model], ['Gaussian Process']
| 21 | + |
| 22 | + |
def get_mlp_regressor(num_hidden_units=51):
    """Build a single-hidden-layer MLP regressor.

    Args:
        num_hidden_units: width of the (single) hidden layer.

    Returns:
        tuple: ([model], [display_name]) parallel one-element lists.
    """
    network = MLPRegressor(hidden_layer_sizes=num_hidden_units)
    return [network], ['Multi-Layer Perceptron']
| 26 | + |
| 27 | + |
def get_ensemble_models():
    """Build the suite of tree-based ensemble regressors to compare.

    All models share random_state=42 so runs are reproducible.

    Returns:
        tuple: (models, names) — parallel lists in a fixed order.
    """
    seed = 42
    named_models = [
        ('Random Forests', RandomForestRegressor(n_estimators=51,
                                                 min_samples_leaf=5,
                                                 min_samples_split=3,
                                                 random_state=seed)),
        ('Bagging', BaggingRegressor(n_estimators=51, random_state=seed)),
        ('Extra Trees', ExtraTreesRegressor(n_estimators=71, random_state=seed)),
        ('AdaBoost', AdaBoostRegressor(random_state=seed)),
        ('Gradient Boost', GradientBoostingRegressor(n_estimators=101,
                                                     random_state=seed)),
    ]
    models = [model for _, model in named_models]
    names = [name for name, _ in named_models]
    return models, names
| 37 | + |
| 38 | + |
def get_linear_model():
    """Build a default Elastic Net linear regressor.

    Returns:
        tuple: ([model], [display_name]) parallel one-element lists.
    """
    net = ElasticNet()
    return [net], ['Elastic Net']
| 42 | + |
| 43 | + |
def print_evaluation_metrics(trained_model, trained_model_name, X_test, y_test):
    """Predict on the held-out split and print standard regression metrics.

    FIX: the original used Python 2 `print` statements, which are a syntax
    error on Python 3; converted to print() calls (same comma-separated
    output, since print() also joins arguments with a single space).

    Args:
        trained_model: fitted estimator exposing .predict().
        trained_model_name: human-readable label used in the report header.
        X_test: feature matrix for the held-out split.
        y_test: true target values aligned with X_test.
    """
    print('--------- For Model : ', trained_model_name, ' ---------\n')
    predicted_values = trained_model.predict(X_test)
    print("Mean Absolute Error : ", metrics.mean_absolute_error(y_test, predicted_values))
    print("Median Absolute Error : ", metrics.median_absolute_error(y_test, predicted_values))
    print("Mean Squared Error : ", metrics.mean_squared_error(y_test, predicted_values))
    print("R2 Score : ", metrics.r2_score(y_test, predicted_values))
    print("---------------------------------------\n")
| 52 | + |
| 53 | + |
def label_encode_frame(dataframe):
    """Label-encode every string-valued column of the frame, in place.

    A column is encoded if its first entry is a string. If the first entry is
    NaN (missing), the first ~1000 rows are scanned for a string value before
    deciding — the column may simply start with missing data.

    BUG FIX: the original test was `type(x) is np.nan`, which is always False
    (np.nan is a float *value*, not a type), so the NaN branch was dead code
    and columns starting with a missing value were never encoded.

    NOTE(review): LabelEncoder.fit_transform on a column that still contains
    NaN mixed with strings may raise in recent sklearn — confirm upstream
    cleaning fills missing values first.

    Args:
        dataframe: pandas DataFrame (mutated in place).

    Returns:
        The same DataFrame, with string columns replaced by integer codes.
    """
    encoder = LabelEncoder()
    for column in dataframe.columns:
        first = dataframe[column][0]
        if isinstance(first, float) and np.isnan(first):
            # Scan at most the first 1001 rows (original broke at i > 1000).
            for i in range(min(len(dataframe), 1001)):
                if isinstance(dataframe[column][i], str):
                    dataframe[column] = encoder.fit_transform(dataframe[column].values)
                    break
        elif isinstance(first, str):
            dataframe[column] = encoder.fit_transform(dataframe[column].values)
    return dataframe
| 68 | + |
| 69 | + |
def spilt_date(list_of_date_string, separator='-', format='yyyy-mm-dd'):
    """Split 'yyyy-mm-dd'-style date strings into month, day and year lists.

    Args:
        list_of_date_string: iterable of date strings (surrounding whitespace
            is stripped before splitting).
        separator: character between the date components.
        format: documented expected layout (currently informational only —
            the positions year/month/day are hard-wired).

    Returns:
        tuple: (months, days, years) as parallel lists of strings.
    """
    months, days, years = [], [], []
    for raw in list_of_date_string:
        pieces = raw.strip().split(separator)
        years.append(pieces[0])
        months.append(pieces[1])
        days.append(pieces[2])
    return months, days, years
| 80 | + |
| 81 | + |
def isfloat(value):
    """Return True if `value` can be converted with float(), else False.

    FIX: the original used a bare `except:`, which also swallows
    KeyboardInterrupt/SystemExit; float() only raises ValueError (bad
    string) or TypeError (unconvertible type), so catch exactly those.

    Args:
        value: any object.

    Returns:
        bool: whether float(value) succeeds.
    """
    try:
        float(value)
        return True
    except (TypeError, ValueError):
        return False
| 88 | + |
| 89 | + |
def handle_mixed_data_types(dataframe):
    """Coerce mostly-numeric object columns to floats, imputing with the mean.

    For each column: if at least as many values parse as floats as don't, the
    column is rewritten as floats with every unparseable entry replaced by
    the mean of the parseable ones. Mostly-string columns are left untouched.

    FIXES vs. original:
    - ZeroDivisionError on an empty column (0 floats >= 0 strings passed the
      test, then mean = 0.0/0 crashed); empty/all-string columns now skip.
    - Each value was parsed twice (isfloat() then float()); parsed once now.

    Args:
        dataframe: pandas DataFrame (mutated in place).

    Returns:
        The same DataFrame with qualifying columns converted.
    """
    for column_name in dataframe.columns:
        parsed = []       # float value, or None where parsing failed
        float_sum = 0.0
        float_count = 0
        for value in dataframe[column_name].values:
            try:
                number = float(value)
            except (TypeError, ValueError):
                parsed.append(None)
            else:
                parsed.append(number)
                float_sum += number
                float_count += 1
        string_count = len(parsed) - float_count
        # `float_count > 0` guards the empty-column division by zero.
        if float_count >= string_count and float_count > 0:
            mean = float_sum / float_count
            dataframe[column_name] = [mean if v is None else v for v in parsed]
    return dataframe
| 111 | + |
| 112 | + |
# --- Script entry: train/weather/key join and model comparison pipeline ---
# Expects weather.csv, train.csv and key.csv in the working directory
# (Kaggle "Walmart sales in stormy weather"-style layout — TODO confirm).
weather_filename = 'weather.csv'
train_filename = 'train.csv'
key_filename = 'key.csv'
weather_frame = pd.read_csv(weather_filename)
train_frame = pd.read_csv(train_filename)
key_frame = pd.read_csv(key_filename)
# Drop free-text / sparse weather columns before numeric coercion.
weather_frame.drop(['codesum','depart'],axis=1,inplace=True)
# Coerce mostly-numeric object columns to floats (mean-imputes the rest).
weather_frame = handle_mixed_data_types(weather_frame)
# train -> key on store_nbr, then attach weather on (station_nbr, date).
final_frame = pd.merge(train_frame,key_frame,how='inner',left_on='store_nbr',right_on='store_nbr')
final_frame = pd.merge(final_frame,weather_frame,how='inner',left_on=['station_nbr','date'],right_on=['station_nbr','date'])
# 'units' is the regression target; pull it out before building features.
target_values = list(final_frame['units'].values)
# Expand 'yyyy-mm-dd' date strings into separate month/day/year columns.
final_frame['month'], final_frame['day'], final_frame['year'] = spilt_date(list(final_frame['date'].values))
del final_frame['units']
del final_frame['date']
# 80/20 split with a fixed seed for reproducibility.
X_train,X_test,y_train,y_test = train_test_split(final_frame.values,target_values,test_size=0.2,random_state=42)
# Fit each ensemble regressor and report held-out metrics.
regressor_list,regressor_name_list = get_ensemble_models()
for regressor,regressor_name in zip(regressor_list,regressor_name_list):
    regressor.fit(X_train,y_train)
    print_evaluation_metrics(regressor,regressor_name,X_test,y_test)
0 commit comments