
# Iranian-Churn

# Full Code

```python
# -*- coding: utf-8 -*-
"""Classification and Regression on Iranian Churn Dataset.ipynb

Automatically generated by Colaboratory.

Original file is located at
https://colab.research.google.com/drive/1n5Y7iE8Cf0-nXqO6gzCRxL_xRDIdrkC0
"""

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset
df = pd.read_csv('Customer Churn.csv')
df
```

"""# EDA"""

```python
df.describe().T

df.info()

# Give the columns short snake_case names
df = df.rename(columns={
    "Call Failure": "call_failure",
    "Complains": "complains",
    "Subscription Length": "subs_len",
    "Charge Amount": "charge_amount",
    "Seconds of Use": "total_sec_calls",
    "Frequency of use": "total_num_calls",
    "Frequency of SMS": "total_num_sms",
    "Distinct Called Numbers": "distinct_call_nums",
    "Age Group": "age_group",
    "Tariff Plan": "tariff_plan",
    "Status": "status",
    "Age": "age",
    "Customer Value": "customer_value",
})

# Check for missing values
print(df.isna().values.any())  # or df.isnull().values.sum()

df.nunique()
```

```python
plt.figure(figsize=(4, 4))
sns.countplot(data=df, x='Churn')

sns.countplot(data=df, x='age')

sns.countplot(data=df, x='age', hue='Churn')

# Check correlation with the target
df.corr()['Churn']

# Relationship between complaints and churn
plt.scatter(df['complains'], df['Churn'])

# Plot the full correlation matrix as a heatmap
plt.figure(figsize=(16, 12))
sns.heatmap(df.corr(), annot=True, cmap="viridis", linewidths=0.5)
plt.show()

# Feature importance via correlation with the target
df.drop('Churn', axis=1).corrwith(df.Churn).plot(
    kind='barh', figsize=(8, 6), color='blue',
    title="Churn vs all other features")
```

```python
# Predictive Power Score
!pip install ppscore

import ppscore as pps

plt.figure(figsize=(8, 6))
matrix_df = pps.matrix(df).pivot(columns='x', index='y', values='ppscore')
sns.heatmap(matrix_df, annot=True)
plt.tight_layout()
```

```python
# Distribution of the categorical features, split by churn
import matplotlib

plt.figure(figsize=(32, 32))
matplotlib.rc('axes', titlesize=24)  # subplot title size

cat_feature_col = ["complains", "charge_amount", "age_group",
                   "tariff_plan", "status", "age"]
for i, column in enumerate(cat_feature_col, 1):
    plt.subplot(4, 4, i)
    df[df["Churn"] == 0][column].hist(bins=20, color='pink',
                                      label='churn = 0 (non-churn)', alpha=1)
    df[df["Churn"] == 1][column].hist(bins=20, color='tomato',
                                      label='churn = 1 (churn)', alpha=1)
    plt.legend(fontsize='medium')
    plt.title(column)
```

```python
# age_group and age are highly correlated, so drop age_group;
# in general, remove one of any pair of highly correlated features
df = df.drop(columns=["age_group"])

# Check the resulting dataframe
df
```
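The `age`/`age_group` overlap was spotted by eye in the heatmap. As a sanity check that no other near-duplicate features remain after the drop, a small helper can list all strongly correlated pairs; a minimal sketch (the 0.9 cutoff is an illustrative assumption, not from the notebook):

```python
# List feature pairs whose absolute correlation exceeds a cutoff.
# The 0.9 threshold here is an arbitrary choice for illustration.
corr = df.drop('Churn', axis=1).corr().abs()
cols = corr.columns
for i in range(len(cols)):
    for j in range(i + 1, len(cols)):
        if corr.iloc[i, j] > 0.9:
            print(f"{cols[i]} <-> {cols[j]}: {corr.iloc[i, j]:.2f}")
```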

```python
all_feature_col = ["call_failure", "subs_len", "total_sec_calls",
                   "total_num_calls", "total_num_sms",
                   "distinct_call_nums", "customer_value"]
# Pairplot of the numeric features, coloured by churn
sns.pairplot(df[all_feature_col + ['Churn']], hue='Churn')
```

"""## Splitting the dataset into training set and testing set"""

```python
from sklearn.model_selection import train_test_split

# Extract the features and the target for this dataset
X = df.drop('Churn', axis=1)
y = df.loc[:, 'Churn'].values

# Split the data 80/20 into training and test sets, stratified on y
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y)

y_test

y

print("X_train size is", len(X_train))
print("y_train size is", len(y_train))
print('\n')
print("X_test size is", len(X_test))
print("y_test size is", len(y_test))
```

"""## Normalization"""

```python
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training set only, then apply it to both
# sets, so no information leaks from the test set
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_normalized_arr = scaler.transform(X_train)
X_train_normalized_df = pd.DataFrame(X_train_normalized_arr,
                                     columns=list(X.columns))

X_test_normalized_arr = scaler.transform(X_test)
X_test_normalized_df = pd.DataFrame(X_test_normalized_arr,
                                    columns=list(X.columns))

print(len(X_train_normalized_arr))
print(len(X_test_normalized_arr))

print(len(X_train_normalized_df))
print(len(X_test_normalized_df))

X_train_normalized_df

X_test_normalized_df
```
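Fitting the scaler on the training set only (as above) keeps test-set information from leaking into the transformation. The same discipline can be packaged into a scikit-learn `Pipeline`, which re-fits the scaler inside each training fold of any later cross-validation; a minimal sketch, not part of the original notebook:

```python
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier

# Bundle scaling and a scale-sensitive model so that fit() always
# learns the scaling from the training data alone
knn_pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    ('knn', KNeighborsClassifier()),
])
knn_pipeline.fit(X_train, y_train)
print("KNN (scaled) test accuracy:", knn_pipeline.score(X_test, y_test))
```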

"""# Classification"""

```python
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, confusion_matrix)
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.naive_bayes import GaussianNB

# Define a list of classifiers
classifiers = [
    DecisionTreeClassifier(),
    GaussianNB(),
    KNeighborsClassifier(),
    LogisticRegression(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    Perceptron(),
]

# Lists to store the evaluation metrics
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

# Train and evaluate each classifier
# (note that the models are fit on the unscaled features here)
for classifier in classifiers:
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

    print(f"Classifier: {type(classifier).__name__}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))
    print("--------------------------------------------")
```

```python
# Plot the performance of the different models as grouped bars
labels = [type(classifier).__name__ for classifier in classifiers]
x = np.arange(len(classifiers))
width = 0.2  # width of each bar in a group

plt.figure(figsize=(10, 8))
plt.bar(x - 1.5 * width, accuracy_scores, width, label='Accuracy')
plt.bar(x - 0.5 * width, precision_scores, width, label='Precision')
plt.bar(x + 0.5 * width, recall_scores, width, label='Recall')
plt.bar(x + 1.5 * width, f1_scores, width, label='F1-Score')

accuracy_scores_percentage = [score * 100 for score in accuracy_scores]

def add(x, y):
    for i in range(len(x)):
        plt.text(x[i], accuracy_scores[i] / 2, f'{int(y[i])}%',
                 ha='center', color='white')

# Call the function to add value labels to the accuracy bars
add(x - 1.5 * width, accuracy_scores_percentage)

print(accuracy_scores_percentage)

plt.xlabel('Classifiers')
plt.ylabel('Score')
plt.title('Performance of All Classification Models')
plt.xticks(x, labels, rotation=90)
plt.legend(bbox_to_anchor=(1.05, 1.0))
plt.tight_layout()
plt.show()
```
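Note that the loop above fits every model on the unscaled features, even though normalized copies were prepared in the Normalization section. Distance- and gradient-based models (KNN, logistic regression, the perceptron) are sensitive to feature scale, so it is worth re-scoring them on the normalized frames; a minimal sketch reusing the objects defined above:

```python
# Re-evaluate the scale-sensitive classifiers on the normalized data
for clf in [KNeighborsClassifier(), LogisticRegression(), Perceptron()]:
    clf.fit(X_train_normalized_df, y_train)
    acc = accuracy_score(y_test, clf.predict(X_test_normalized_df))
    print(f"{type(clf).__name__} (normalized features): {acc:.4f}")
```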

"""# Regression"""

```python
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

y = df.loc[:, 'Churn'].values
X = df.drop('Churn', axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Models
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree Regressor": DecisionTreeRegressor(random_state=42),
    "SVR (RBF)": SVR(kernel='rbf'),
    "Random Forest Regressor": RandomForestRegressor(random_state=42),
}

# Evaluation metrics
metrics = {
    "Mean Squared Error (MSE)": mean_squared_error,
    "Mean Absolute Error (MAE)": mean_absolute_error,
    "Root Mean Squared Error (RMSE)":
        lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred)),
}

# Train and evaluate each model
for model_name, model in models.items():
    print(f"Training and evaluating {model_name}")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Threshold the continuous predictions at 0.5 to get class labels
    final_y_pred = np.where(y_pred > 0.5, 1, 0)

    # Print evaluation metrics
    print("Evaluation metrics:")
    for metric_name, metric_func in metrics.items():
        metric_value = metric_func(y_test, y_pred)
        print(f"{metric_name}: {metric_value:.2f}")
    print(f"R Squared Value: {model.score(X_test, y_test)}")
    print('\n')
```
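`final_y_pred` is computed inside the loop but never used. Since the target is binary, thresholding a regressor's output at 0.5 turns it into a classifier whose accuracy can be compared with the results of the Classification section; a minimal sketch reusing the models fitted above:

```python
from sklearn.metrics import accuracy_score

# Score the 0.5-thresholded regression outputs as class predictions
for model_name, model in models.items():
    thresholded = np.where(model.predict(X_test) > 0.5, 1, 0)
    print(f"{model_name} accuracy after thresholding: "
          f"{accuracy_score(y_test, thresholded):.4f}")
```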
