Skip to content

Commit

Permalink
Data ingestion pipeline
Browse files Browse the repository at this point in the history
  • Loading branch information
sarthakone105 committed May 6, 2024
1 parent 15d9fab commit 8536cf3
Show file tree
Hide file tree
Showing 4 changed files with 250 additions and 28 deletions.
7 changes: 6 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
pandas
numpy
seaborn
-e .
dill
matplotlib
scikit-learn
catboost
xgboost
# -e .
121 changes: 94 additions & 27 deletions src/components/data_ingestion.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,57 @@
# import os
# import sys
# from src.exception import CustomException
# from src.logger import logging
# import pandas as pd

# from sklearn.model_selection import train_test_split
# from dataclasses import dataclass
# from src.components.data_transformation import DataTransformation
# from src.components.data_transformation import DataTransformationConfig


# @dataclass
# class DataIngestionConfig:
# train_data_path: str=os.path.join('artifacts', "train.csv")
# test_data_path: str=os.path.join('artifacts', "test.csv")
# raw_data_path: str=os.path.join('artifacts', "data.csv")

# class DataIngestion:
# def __init__(self):
# self.ingestion_config = DataIngestionConfig()

# def initiate_data_ingestion(self):
# logging.info("Entered Data ingestion method or component.")

# try:
# df = pd.read_csv('notebook/data/StudentsPerformance.csv')
# logging.info("Read the datset as dataframe")

# os.makedirs(os.path.dirname(self.ingestion_config.train_data_path), exist_ok=True)

# df.to_csv(self.ingestion_config.raw_data_path, index=False, header=True)
# logging.info("Train_test_split_initiated.")
# train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

# train_set.to_csv(self.ingestion_config.train_data_path, index=False, header=True)
# test_set.to_csv(self.ingestion_config.test_data_path, index=False, header=True)
# logging.info("Ingestion of data is completed.")


# return (
# self.ingestion_config.train_data_path,
# self.ingestion_config.test_data_path
# )
# except Exception as e:
# raise CustomException(e, sys)

# if __name__ == "__main__":
# obj = DataIngestion()
# train_data, test_data = obj.initiate_data_ingestion()

# data_transformation = DataTransformation()
# data_transformation.initiate_data_transformation(train_data, test_data)

import os
import sys
from src.exception import CustomException
Expand All @@ -7,41 +61,54 @@
from sklearn.model_selection import train_test_split
from dataclasses import dataclass

from src.components.data_transformation import DataTransformation
from src.components.data_transformation import DataTransformationConfig

# from src.components.model_trainer import ModelTrainerConfig
# from src.components.model_trainer import ModelTrainer
@dataclass
class DataIngestionConfig:
    """Filesystem locations for the CSV artifacts produced by ingestion.

    All three files are written under the ``artifacts`` directory, which
    is created on demand by ``DataIngestion.initiate_data_ingestion``.
    """
    # raw_data_path is the unsplit copy of the source dataset.
    train_data_path: str = os.path.join('artifacts', "train.csv")
    test_data_path: str = os.path.join('artifacts', "test.csv")
    raw_data_path: str = os.path.join('artifacts', "data.csv")

class DataIngestion:
    """Reads the source dataset and materializes raw/train/test CSV artifacts."""

    def __init__(self):
        # Paths are fixed by DataIngestionConfig; no arguments needed.
        self.ingestion_config = DataIngestionConfig()

    def initiate_data_ingestion(self):
        """Load the source CSV, persist a raw copy, and write an 80/20 split.

        Returns:
            tuple[str, str]: paths of the train and test CSV files.

        Raises:
            CustomException: wraps any failure (missing source file,
                unwritable artifacts directory, etc.) with traceback info.
        """
        logging.info("Entered the data ingestion method or component")
        try:
            # os.path.join instead of a backslash literal so the path
            # resolves on POSIX systems as well as Windows.
            df = pd.read_csv(os.path.join('notebook', 'data', 'stud.csv'))
            logging.info('Read the dataset as dataframe')

            # Ensure the artifacts directory exists before any writes.
            os.makedirs(os.path.dirname(self.ingestion_config.train_data_path), exist_ok=True)

            df.to_csv(self.ingestion_config.raw_data_path, index=False, header=True)

            logging.info("Train test split initiated")
            # Fixed random_state keeps the split reproducible across runs.
            train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

            train_set.to_csv(self.ingestion_config.train_data_path, index=False, header=True)
            test_set.to_csv(self.ingestion_config.test_data_path, index=False, header=True)

            logging.info("Ingestion of the data is completed")

            return (
                self.ingestion_config.train_data_path,
                self.ingestion_config.test_data_path
            )
        except Exception as e:
            raise CustomException(e, sys)

if __name__ == "__main__":
    # Manual pipeline driver: ingestion -> transformation.
    obj = DataIngestion()
    train_data, test_data = obj.initiate_data_ingestion()

    data_transformation = DataTransformation()
    train_arr, test_arr, _ = data_transformation.initiate_data_transformation(train_data, test_data)

    # Model training stage is not wired up yet; keep for the next iteration.
    # modeltrainer = ModelTrainer()
    # print(modeltrainer.initiate_model_trainer(train_arr, test_arr))
127 changes: 127 additions & 0 deletions src/components/data_transformation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
import sys
import data_ingestion
from dataclasses import dataclass
import numpy as np
import pandas as pd
import os

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from src.exception import CustomException
from src.logger import logging
from src.utils import save_object


@dataclass
class DataTransformationConfig:
    """Location where the fitted preprocessing pipeline is persisted."""
    # 'artifacts' (plural) to match the directory used by DataIngestionConfig,
    # so all pipeline outputs land in one place.
    preprocessor_obj_file_path: str = os.path.join('artifacts', "preprocessor.pkl")


class DataTransformation:
    """Builds the preprocessing pipeline and applies it to train/test data."""

    def __init__(self):
        self.data_transformation_config = DataTransformationConfig()

    def get_data_transformer_obj(self):
        """Build the ColumnTransformer used for feature preprocessing.

        Numerical columns: median-impute then standard-scale.
        Categorical columns: mode-impute, one-hot encode, then scale.

        Returns:
            sklearn.compose.ColumnTransformer: unfitted preprocessor.

        Raises:
            CustomException: wraps any construction failure.
        """
        try:
            # Column names match the headers of the student-performance CSV.
            numerical_columns = ['writing score', 'reading score']
            categorical_columns = [
                'gender',
                'race/ethnicity',
                'parental level of education',
                'lunch',
                'test preparation course'
            ]

            num_pipeline = Pipeline(
                steps=[
                    # Valid keyword is `strategy` and valid value is "median"
                    # (lowercase) -- the original `statergy="Median"` raised.
                    ("imputer", SimpleImputer(strategy="median")),
                    ("scaler", StandardScaler())
                ]
            )
            cat_pipeline = Pipeline(
                steps=[
                    # Commas between tuples are required; without them the
                    # list held one tuple-call chain and construction failed.
                    ("imputer", SimpleImputer(strategy="most_frequent")),
                    ("one_hot_encoder", OneHotEncoder()),
                    # OneHotEncoder emits a sparse matrix; StandardScaler
                    # must skip mean-centering or it raises on sparse input.
                    ("scaler", StandardScaler(with_mean=False))
                ]
            )

            logging.info(f"Categorical columns: {categorical_columns}")
            logging.info(f"Numerical columns: {numerical_columns}")

            preprocessor = ColumnTransformer(
                [
                    ("num_pipeline", num_pipeline, numerical_columns),
                    ("cat_pipeline", cat_pipeline, categorical_columns)
                ]
            )

            return preprocessor

        except Exception as e:
            raise CustomException(e, sys)

    def initiate_data_transformation(self, train_path, test_path):
        """Fit the preprocessor on train data, transform both splits, save it.

        Args:
            train_path: path to the train CSV produced by ingestion.
            test_path: path to the test CSV produced by ingestion.

        Returns:
            tuple: (train_arr, test_arr, preprocessor_path) where the arrays
            have the target column appended as the last column.

        Raises:
            CustomException: wraps any failure during read/fit/save.
        """
        try:
            train_df = pd.read_csv(train_path)
            test_df = pd.read_csv(test_path)

            logging.info("Read train and test data completed")
            logging.info("Obtaining preprocessing object")

            preprocessing_obj = self.get_data_transformer_obj()

            target_column_name = "math score"
            numerical_columns = ['writing score', 'reading score']

            input_feature_train_df = train_df.drop(columns=[target_column_name], axis=1)
            target_feature_train_df = train_df[target_column_name]

            input_feature_test_df = test_df.drop(columns=[target_column_name], axis=1)
            target_feature_test_df = test_df[target_column_name]

            logging.info(
                "Applying preprocessing object on training dataframe and testing dataframe"
            )

            # Fit only on train; the test split must be transformed with the
            # train-fitted statistics. The original re-fit on the TRAIN frame
            # for the test array, producing leaked/wrong test features.
            input_feature_train_arr = preprocessing_obj.fit_transform(input_feature_train_df)
            input_feature_test_arr = preprocessing_obj.transform(input_feature_test_df)

            # Append the target as the final column of each array.
            train_arr = np.c_[
                input_feature_train_arr,
                np.array(target_feature_train_df)
            ]

            test_arr = np.c_[
                input_feature_test_arr,
                np.array(target_feature_test_df)
            ]

            logging.info("Saved preprocessing object.")

            save_object(
                file_path=self.data_transformation_config.preprocessor_obj_file_path,
                obj=preprocessing_obj
            )

            return (
                train_arr,
                test_arr,
                self.data_transformation_config.preprocessor_obj_file_path
            )

        except Exception as e:
            raise CustomException(e, sys)

23 changes: 23 additions & 0 deletions src/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import os
import sys

import numpy as np
import pandas as pd
from src.exception import CustomException
import dill


def save_object(file_path, obj):
    """Serialize *obj* to *file_path* with dill, creating parent dirs first.

    Raises:
        CustomException: wraps any filesystem or serialization failure.
    """
    try:
        # Make sure the destination directory exists before opening the file.
        os.makedirs(os.path.dirname(file_path), exist_ok=True)

        with open(file_path, "wb") as out_file:
            dill.dump(obj, out_file)

    except Exception as e:
        raise CustomException(e, sys)

0 comments on commit 8536cf3

Please sign in to comment.