-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
15d9fab
commit 8536cf3
Showing
4 changed files
with
250 additions
and
28 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,9 @@ | ||
pandas | ||
numpy | ||
seaborn | ||
-e . | ||
dill | ||
matplotlib | ||
scikit-learn | ||
catboost | ||
xgboost | ||
# -e . |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,127 @@ | ||
import sys | ||
import data_ingestion | ||
from dataclasses import dataclass | ||
import numpy as np | ||
import pandas as pd | ||
import os | ||
|
||
from sklearn.compose import ColumnTransformer | ||
from sklearn.impute import SimpleImputer | ||
from sklearn.pipeline import Pipeline | ||
from sklearn.preprocessing import OneHotEncoder, StandardScaler | ||
|
||
from src.exception import CustomException | ||
from src.logger import logging | ||
from src.utils import save_object | ||
|
||
|
||
@dataclass
class DataTransformationConfig:
    """Configuration holder for data-transformation artifacts."""

    # Location where the fitted preprocessing pipeline is serialized.
    preprocessor_obj_file_path = os.path.join("artifact", "preprocessor.pkl")
|
||
|
||
class DataTransformation:
    """Builds the preprocessing pipeline for the student-performance dataset
    and applies it to the train/test splits."""

    def __init__(self):
        self.data_transformation_config = DataTransformationConfig()

    def get_data_transformer_obj(self):
        '''
        Responsible for data transformation.

        Returns a ColumnTransformer that median-imputes and scales the
        numerical columns, and most-frequent-imputes, one-hot encodes and
        scales the categorical columns.

        Raises:
            CustomException: wrapping any error raised while building
                the transformer.
        '''
        try:
            numerical_columns = ['writing score', 'reading score']
            categorical_columns = [
                'gender',
                'race/ethnicity',
                'parental level of education',
                'lunch',
                'test preparation course'
            ]

            # BUG FIX: keyword is `strategy` (was misspelled `statergy`)
            # and sklearn expects lowercase "median", not "Median".
            num_pipeline = Pipeline(
                steps=[
                    ("imputer", SimpleImputer(strategy="median")),
                    ("scaler", StandardScaler())
                ]
            )

            # BUG FIX: the step tuples were missing separating commas, so
            # Python tried to *call* one tuple with the next (TypeError at
            # runtime). Also, OneHotEncoder outputs a sparse matrix, which
            # cannot be mean-centered, hence StandardScaler(with_mean=False).
            cat_pipeline = Pipeline(
                steps=[
                    ("imputer", SimpleImputer(strategy="most_frequent")),
                    ("one_hot_encoder", OneHotEncoder()),
                    ("scaler", StandardScaler(with_mean=False))
                ]
            )

            logging.info(f"Categorical columns: {categorical_columns}")
            logging.info(f"Numerical columns: {numerical_columns}")

            preprocessor = ColumnTransformer(
                [
                    ("num_pipeline", num_pipeline, numerical_columns),
                    ("cat_pipeline", cat_pipeline, categorical_columns)
                ]
            )

            return preprocessor

        except Exception as e:
            raise CustomException(e, sys)

    def initiate_data_transformation(self, train_path, test_path):
        """Read the train/test CSVs, fit the preprocessor on the train split,
        transform both splits, persist the fitted preprocessor, and return
        the resulting arrays.

        Args:
            train_path: path to the training CSV.
            test_path: path to the test CSV.

        Returns:
            (train_arr, test_arr, preprocessor_path) — the last column of
            each array is the target column ("math score").

        Raises:
            CustomException: wrapping any error raised during transformation.
        """
        try:
            train_df = pd.read_csv(train_path)
            test_df = pd.read_csv(test_path)

            logging.info("Read train and test data completed")
            logging.info("Obtaining preprocessing object")

            preprocessing_obj = self.get_data_transformer_obj()

            target_column_name = "math score"
            numerical_columns = ['writing score', 'reading score']

            input_feature_train_df = train_df.drop(columns=[target_column_name], axis=1)
            target_feature_train_df = train_df[target_column_name]

            input_feature_test_df = test_df.drop(columns=[target_column_name], axis=1)
            target_feature_test_df = test_df[target_column_name]

            logging.info(
                f"applying preprocessing object on training dataframe and testing dataframe"
            )

            input_feature_train_arr = preprocessing_obj.fit_transform(input_feature_train_df)
            # BUG FIX: the test split must be transformed with the already
            # fitted preprocessor (not re-fitted), and on the TEST dataframe —
            # the original re-ran fit_transform on the train dataframe, which
            # both leaked fitting onto the wrong data and produced a "test"
            # array that was actually the train features.
            input_feature_test_arr = preprocessing_obj.transform(input_feature_test_df)

            # Append the target as the final column of each array.
            train_arr = np.c_[
                input_feature_train_arr,
                np.array(target_feature_train_df)
            ]

            test_arr = np.c_[
                input_feature_test_arr,
                np.array(target_feature_test_df)
            ]

            logging.info(f"Saved preprocessing object.")

            save_object(
                file_path=self.data_transformation_config.preprocessor_obj_file_path,
                obj=preprocessing_obj
            )

            return (
                train_arr,
                test_arr,
                self.data_transformation_config.preprocessor_obj_file_path
            )

        except Exception as e:
            raise CustomException(e, sys)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
import os | ||
import sys | ||
|
||
import numpy as np | ||
import pandas as pd | ||
from src.exception import CustomException | ||
import dill | ||
|
||
|
||
def save_object(file_path, obj):
    """Serialize *obj* to *file_path* with dill, creating the parent
    directory if it does not already exist.

    Raises:
        CustomException: wrapping any error raised during serialization.
    """
    try:
        # Ensure the destination directory exists before writing.
        os.makedirs(os.path.dirname(file_path), exist_ok=True)

        with open(file_path, "wb") as file_obj:
            dill.dump(obj, file_obj)

    except Exception as e:
        raise CustomException(e, sys)