Skip to content

Commit

Permalink
Section 6.4 - Predictions
Browse files Browse the repository at this point in the history
  • Loading branch information
ChristopherGS committed Oct 14, 2019
1 parent b7de0ff commit 0ab5790
Show file tree
Hide file tree
Showing 15 changed files with 133 additions and 67 deletions.
File renamed without changes.
25 changes: 25 additions & 0 deletions packages/regression_model/regression_model/config/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import pathlib

import regression_model


# Directories, all derived from the location of the regression_model package.
PACKAGE_ROOT = pathlib.Path(regression_model.__file__).resolve().parent
TRAINED_MODEL_DIR = PACKAGE_ROOT / 'trained_models'
DATASET_DIR = PACKAGE_ROOT / 'datasets'

# Dataset file names and the name of the target column.
TESTING_DATA_FILE = 'test.csv'
TRAINING_DATA_FILE = 'train.csv'
TARGET = 'SalePrice'


# Input columns selected from the raw data. Order is kept as-is because
# callers index DataFrames with this list directly.
FEATURES = [
    'MSSubClass',
    'MSZoning',
    'Neighborhood',
    'OverallQual',
    'OverallCond',
    'YearRemodAdd',
    'RoofStyle',
    'MasVnrType',
    'BsmtQual',
    'BsmtExposure',
    'HeatingQC',
    'CentralAir',
    '1stFlrSF',
    'GrLivArea',
    'BsmtFullBath',
    'KitchenQual',
    'Fireplaces',
    'FireplaceQu',
    'GarageType',
    'GarageFinish',
    'GarageCars',
    'PavedDrive',
    'LotFrontage',
    # Included only so the temporal variable can be calculated.
    'YrSold',
]
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

import preprocessors as pp
from regression_model import preprocessors as pp


# categorical variables with NA in train set
Expand Down Expand Up @@ -39,7 +39,7 @@
('temporal_variable',
pp.TemporalVariableEstimator(
variables=TEMPORAL_VARS,
reference_variable=TEMPORAL_VARS)),
reference_variable=DROP_FEATURES)),
('rare_label_encoder',
pp.RareLabelCategoricalEncoder(
tol=0.01,
Expand Down
20 changes: 20 additions & 0 deletions packages/regression_model/regression_model/predict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import numpy as np
import pandas as pd

from regression_model.processing.data_management import load_pipeline
from regression_model.config import config


# File name of the persisted pipeline that is handed to load_pipeline.
pipeline_file_name = 'regression_model.pkl'
# Loaded once, at import time; make_prediction reuses this object on every call.
_price_pipe = load_pipeline(file_name=pipeline_file_name)


def make_prediction(*, input_data) -> dict:
    """Run the loaded pipeline on JSON input and return its predictions.

    ``input_data`` is parsed with ``pandas.read_json``; only the columns
    named in ``config.FEATURES`` are passed to the pipeline. The pipeline
    output is exponentiated with ``numpy.exp`` and returned under the
    ``'predictions'`` key.
    """
    frame = pd.read_json(input_data)
    raw_output = _price_pipe.predict(frame[config.FEATURES])
    return {'predictions': np.exp(raw_output)}
File renamed without changes.
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import pandas as pd
from sklearn.externals import joblib
from sklearn.pipeline import Pipeline

from regression_model.config import config


def load_dataset(*, file_name: str) -> pd.DataFrame:
    """Read a CSV file from the package dataset directory.

    Args:
        file_name: Name of the CSV file inside ``config.DATASET_DIR``.

    Returns:
        The parsed file as a DataFrame.
    """
    # config.DATASET_DIR is a pathlib.Path, so join with the path operator
    # rather than formatting a string with a hard-coded '/' separator.
    return pd.read_csv(config.DATASET_DIR / file_name)


def save_pipeline(*, pipeline_to_persist) -> None:
    """Write the given pipeline to the trained-models directory.

    The pipeline is dumped with joblib to ``regression_model.pkl`` inside
    ``config.TRAINED_MODEL_DIR``, after which a confirmation line is
    printed.
    """
    destination = config.TRAINED_MODEL_DIR / 'regression_model.pkl'
    joblib.dump(pipeline_to_persist, destination)
    print('saved pipeline')


def load_pipeline(*, file_name: str) -> Pipeline:
    """Read a previously saved pipeline back from disk.

    ``file_name`` is resolved against ``config.TRAINED_MODEL_DIR`` and the
    resulting file is loaded with joblib.
    """
    return joblib.load(filename=config.TRAINED_MODEL_DIR / file_name)
34 changes: 34 additions & 0 deletions packages/regression_model/regression_model/train_pipeline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import numpy as np
from sklearn.model_selection import train_test_split

from regression_model import pipeline
from regression_model.processing.data_management import (
load_dataset, save_pipeline)
from regression_model.config import config


def run_training() -> None:
    """Fit the price pipeline on the training data and persist it.

    Loads ``config.TRAINING_DATA_FILE``, keeps 90% of the rows for fitting
    (fixed seed), fits ``pipeline.price_pipe`` on the log of the target and
    saves the fitted pipeline with ``save_pipeline``.
    """
    data = load_dataset(file_name=config.TRAINING_DATA_FILE)

    # The split is kept (with its fixed seed) so the fitted model sees the
    # same rows as before. The held-out 10% is not evaluated in this
    # function, so it is discarded instead of being transformed and dropped.
    X_train, _, y_train, _ = train_test_split(
        data[config.FEATURES],
        data[config.TARGET],
        test_size=0.1,
        random_state=0)

    # X_train already holds exactly config.FEATURES, so no re-selection is
    # needed. The model is fitted on the log-transformed target.
    pipeline.price_pipe.fit(X_train, np.log(y_train))

    save_pipeline(pipeline_to_persist=pipeline.price_pipe)


# Allow the module to be run as a script to train and save the model.
if __name__ == '__main__':
    run_training()
Empty file.
Empty file.
18 changes: 18 additions & 0 deletions packages/regression_model/tests/test_predict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import math

from regression_model.predict import make_prediction
from regression_model.processing.data_management import load_dataset


def test_make_single_prediction():
    """One row of test data yields a float prediction with the expected value."""
    # Given
    frame = load_dataset(file_name='test.csv')
    payload = frame[0:1].to_json(orient='records')

    # When
    result = make_prediction(input_data=payload)

    # Then
    assert result is not None
    first_prediction = result.get('predictions')[0]
    assert isinstance(first_prediction, float)
    assert math.ceil(first_prediction) == 112476
64 changes: 0 additions & 64 deletions packages/regression_model/train_pipeline.py

This file was deleted.

5 changes: 4 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
jupyter==1.0.0
matplotlib==3.0.2
pandas==0.23.4
scikit-learn==0.20.2
scikit-learn==0.20.2

# testing
pytest>=4.6.6,<5.0.0

0 comments on commit 0ab5790

Please sign in to comment.