diff --git a/requirements.txt b/requirements.txt
index 0937926..0bfc8db 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,9 @@
 pandas
 numpy
 seaborn
--e .
\ No newline at end of file
+dill
+matplotlib
+scikit-learn
+catboost
+xgboost
+# -e .
\ No newline at end of file
diff --git a/src/components/data_ingestion.py b/src/components/data_ingestion.py
index 2befe6f..dc53acc 100644
--- a/src/components/data_ingestion.py
+++ b/src/components/data_ingestion.py
@@ -1,3 +1,57 @@
+# import os
+# import sys
+# from src.exception import CustomException
+# from src.logger import logging
+# import pandas as pd
+
+# from sklearn.model_selection import train_test_split
+# from dataclasses import dataclass
+# from src.components.data_transformation import DataTransformation
+# from src.components.data_transformation import DataTransformationConfig
+
+
+# @dataclass
+# class DataIngestionConfig:
+#     train_data_path: str=os.path.join('artifacts', "train.csv")
+#     test_data_path: str=os.path.join('artifacts', "test.csv")
+#     raw_data_path: str=os.path.join('artifacts', "data.csv")
+
+# class DataIngestion:
+#     def __init__(self):
+#         self.ingestion_config = DataIngestionConfig()
+
+#     def initiate_data_ingestion(self):
+#         logging.info("Entered Data ingestion method or component.")
+
+#         try:
+#             df = pd.read_csv('notebook/data/StudentsPerformance.csv')
+#             logging.info("Read the datset as dataframe")
+
+#             os.makedirs(os.path.dirname(self.ingestion_config.train_data_path), exist_ok=True)
+
+#             df.to_csv(self.ingestion_config.raw_data_path, index=False, header=True)
+#             logging.info("Train_test_split_initiated.")
+#             train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)
+
+#             train_set.to_csv(self.ingestion_config.train_data_path, index=False, header=True)
+#             test_set.to_csv(self.ingestion_config.test_data_path, index=False, header=True)
+#             logging.info("Ingestion of data is completed.")
+
+
+#             return (
+#                 self.ingestion_config.train_data_path,
+#                 self.ingestion_config.test_data_path
+#             )
+#         except Exception as e:
+#             raise CustomException(e, sys)
+
+# if __name__ == "__main__":
+#     obj = DataIngestion()
+#     train_data, test_data = obj.initiate_data_ingestion()
+
+#     data_transformation = DataTransformation()
+#     data_transformation.initiate_data_transformation(train_data, test_data)
+
 import os
 import sys
 from src.exception import CustomException
@@ -7,41 +61,54 @@
 from sklearn.model_selection import train_test_split
 from dataclasses import dataclass
+from src.components.data_transformation import DataTransformation
+from src.components.data_transformation import DataTransformationConfig
+
+# from src.components.model_trainer import ModelTrainerConfig
+# from src.components.model_trainer import ModelTrainer
 
 @dataclass
 class DataIngestionConfig:
-    train_data_path: str=os.path.join('artifacts', "train.csv")
-    test_data_path: str=os.path.join('artifacts', "test.csv")
-    raw_data_path: str=os.path.join('artifacts', "data.csv")
-
+    train_data_path: str=os.path.join('artifacts',"train.csv")
+    test_data_path: str=os.path.join('artifacts',"test.csv")
+    raw_data_path: str=os.path.join('artifacts',"data.csv")
+
 class DataIngestion:
     def __init__(self):
-        self.ingestion_config = DataIngestionConfig()
-
+        self.ingestion_config=DataIngestionConfig()
+
     def initiate_data_ingestion(self):
-        logging.info("Entered Data ingestion method or component.")
-
+        logging.info("Entered the data ingestion method or component")
         try:
-            df = pd.read_csv('notebook/data/StudentsPerformance.csv')
-            logging.info("Read the datset as dataframe")
-
-            os.makedirs(os.path.dirname(self.ingestion_config.train_data_path), exist_ok=True)
-
-            df.to_csv(self.ingestion_config.raw_data_path, index=False, header=True)
-            logging.info("Train_test_split_initiated.")
-            train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)
-
-            train_set.to_csv(self.ingestion_config.train_data_path, index=False, header=True)
-            test_set.to_csv(self.ingestion_config.test_data_path, index=False, header=True)
-            logging.info("Ingestion of data is completed.")
-
-
-            return (
+            df=pd.read_csv('notebook/data/stud.csv')
+            logging.info('Read the dataset as dataframe')
+
+            os.makedirs(os.path.dirname(self.ingestion_config.train_data_path),exist_ok=True)
+
+            df.to_csv(self.ingestion_config.raw_data_path,index=False,header=True)
+
+            logging.info("Train test split initiated")
+            train_set,test_set=train_test_split(df,test_size=0.2,random_state=42)
+
+            train_set.to_csv(self.ingestion_config.train_data_path,index=False,header=True)
+
+            test_set.to_csv(self.ingestion_config.test_data_path,index=False,header=True)
+
+            logging.info("Ingestion of the data is completed")
+
+            return(
                 self.ingestion_config.train_data_path,
                 self.ingestion_config.test_data_path
+            )
         except Exception as e:
-            raise CustomException(e, sys)
+            raise CustomException(e,sys)
-if __name__ == "__main__":
-    obj = DataIngestion()
-    obj.initiate_data_ingestion()
\ No newline at end of file
+if __name__=="__main__":
+    obj=DataIngestion()
+    train_data,test_data=obj.initiate_data_ingestion()
+
+    data_transformation=DataTransformation()
+    train_arr,test_arr,_=data_transformation.initiate_data_transformation(train_data,test_data)
+
+    # modeltrainer=ModelTrainer()
+    # print(modeltrainer.initiate_model_trainer(train_arr,test_arr))
\ No newline at end of file
diff --git a/src/components/data_transformation.py b/src/components/data_transformation.py
index e69de29..872b58a 100644
--- a/src/components/data_transformation.py
+++ b/src/components/data_transformation.py
@@ -0,0 +1,127 @@
+import sys
+
+from dataclasses import dataclass
+import numpy as np
+import pandas as pd
+import os
+
+from sklearn.compose import ColumnTransformer
+from sklearn.impute import SimpleImputer
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import OneHotEncoder, StandardScaler
+
+from src.exception import CustomException
+from src.logger import logging
+from src.utils import save_object
+
+
+@dataclass
+class DataTransformationConfig:
+    preprocessor_obj_file_path = os.path.join('artifacts', "preprocessor.pkl")
+
+
+class DataTransformation:
+    def __init__(self):
+        self.data_transformation_config = DataTransformationConfig()
+
+
+    def get_data_transformer_obj(self):
+        '''
+        Responsible for data transformation.
+        '''
+        try:
+            numerical_columns = ['writing score', 'reading score']
+            categorical_columns = [
+                'gender',
+                'race/ethnicity',
+                'parental level of education',
+                'lunch',
+                'test preparation course'
+            ]
+
+            num_pipeline = Pipeline(
+                steps = [
+                    ("imputer", SimpleImputer(strategy="median")),
+                    ("scaler", StandardScaler())
+                ]
+            )
+            cat_pipeline = Pipeline(
+                steps = [
+                    ("imputer", SimpleImputer(strategy="most_frequent")),
+                    ("one_hot_encoder", OneHotEncoder()),
+                    ("scaler", StandardScaler(with_mean=False))
+                ]
+            )
+
+            logging.info(f"Categorical columns: {categorical_columns}")
+            logging.info(f"Numerical columns: {numerical_columns}")
+
+            preprocessor = ColumnTransformer(
+                [
+                    ("num_pipeline", num_pipeline, numerical_columns),
+                    ("cat_pipeline", cat_pipeline, categorical_columns)
+                ]
+            )
+
+            return preprocessor
+
+
+        except Exception as e:
+            raise CustomException(e, sys)
+
+    def initiate_data_transformation(self, train_path, test_path):
+
+        try:
+            train_df = pd.read_csv(train_path)
+            test_df = pd.read_csv(test_path)
+
+
+            logging.info("Read train and test data completed")
+
+            logging.info("Obtaining preprocessing object")
+
+            preprocessing_obj = self.get_data_transformer_obj()
+
+            target_column_name = "math score"
+            numerical_columns = ['writing score', 'reading score']
+
+            input_feature_train_df = train_df.drop(columns=[target_column_name], axis=1)
+            target_feature_train_df = train_df[target_column_name]
+
+            input_feature_test_df = test_df.drop(columns=[target_column_name], axis=1)
+            target_feature_test_df = test_df[target_column_name]
+
+            logging.info(
+                "Applying preprocessing object on training dataframe and testing dataframe"
+            )
+
+            input_feature_train_arr = preprocessing_obj.fit_transform(input_feature_train_df)
+            input_feature_test_arr = preprocessing_obj.transform(input_feature_test_df)
+
+
+            train_arr = np.c_[
+                input_feature_train_arr,
+                np.array(target_feature_train_df)
+            ]
+
+            test_arr = np.c_[
+                input_feature_test_arr,
+                np.array(target_feature_test_df)
+            ]
+
+            save_object(
+                file_path=self.data_transformation_config.preprocessor_obj_file_path,
+                obj=preprocessing_obj
+            )
+
+            logging.info("Saved preprocessing object.")
+
+            return (
+                train_arr,
+                test_arr,
+                self.data_transformation_config.preprocessor_obj_file_path
+            )
+
+        except Exception as e:
+            raise CustomException(e, sys)
+
\ No newline at end of file
diff --git a/src/utils.py b/src/utils.py
index e69de29..53f4007 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -0,0 +1,23 @@
+import os
+import sys
+
+import numpy as np
+import pandas as pd
+from src.exception import CustomException
+import dill
+
+
+def save_object(file_path, obj):
+
+    try:
+        dir_path = os.path.dirname(file_path)
+
+        os.makedirs(dir_path, exist_ok=True)
+
+
+        with open(file_path, "wb") as file_obj:
+            dill.dump(obj, file_obj)
+
+
+    except Exception as e:
+        raise CustomException(e, sys)
\ No newline at end of file