# Full Code
"""Classification and Regression on Iranian Churn Dataset.ipynb
Automatically generated by Colaboratory.
Original file is located at https://colab.research.google.com/drive/1n5Y7iE8Cf0-nXqO6gzCRxL_xRDIdrkC0 """
import csv
import pandas as pd
import numpy as np
import seaborn as sns

df = pd.read_csv('Customer Churn.csv')
df
"""# EDA"""
df.describe().T
df.info()
df = df.rename(columns={
    "Call Failure": "call_failure",
    "Complains": "complains",
    "Subscription Length": "subs_len",
    "Charge Amount": "charge_amount",
    "Seconds of Use": "total_sec_calls",
    "Frequency of use": "total_num_calls",
    "Frequency of SMS": "total_num_sms",
    "Distinct Called Numbers": "distinct_call_nums",
    "Age Group": "age_group",
    "Tariff Plan": "tariff_plan",
    "Status": "status",
    "Age": "age",
    "Customer Value": "customer_value",
})
print(df.isna().values.any()) #or df.isnull().values.sum()
df.nunique()
import matplotlib.pyplot as plt

plt.figure(figsize=(4, 4))
sns.countplot(data=df, x='Churn')
sns.countplot(data=df,x='age')
sns.countplot(data=df,x="age",hue='Churn')
# Check correlation of every feature with the target
df.corr()['Churn']
# Visualize the relationship between complaints and churn
plt.scatter(df['complains'], df['Churn'])
# Plot the full correlation matrix as a heatmap
plt.figure(figsize=(16, 12))
sns.heatmap(df.corr(), annot=True, cmap="viridis", linewidths=0.5)
plt.show()
# Feature importance via correlation with the target
df.drop('Churn', axis=1).corrwith(df.Churn).plot(
    kind='barh', figsize=(8, 6), color='blue', title="Churn vs all other features")
# Predictive Power Score
!pip install ppscore
import seaborn as sns
import ppscore as pps

plt.figure(figsize=(8, 6))
matrix_df = pps.matrix(df).pivot(columns='x', index='y', values='ppscore')

sns.heatmap(matrix_df, annot=True)
plt.tight_layout()
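# The matrix above scores every (feature, target) pair; a minimal sketch (not in
# the original notebook) of scoring a single feature against Churn directly with
# ppscore, assuming the package installed above exposes pps.score as documented:
single = pps.score(df, "total_sec_calls", "Churn")
print(single["ppscore"])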
# Distribution of the categorical features, split by churn
import matplotlib

plt.figure(figsize=(32, 32))
matplotlib.rc('axes', titlesize=24)  # subplot title font size

cat_feature_col = ["complains", "charge_amount", "age_group", "tariff_plan", "status", "age"]
for i, column in enumerate(cat_feature_col, 1):
    plt.subplot(4, 4, i)
    df[df["Churn"] == 0][column].hist(bins=20, color='pink', label='churn = 0 (non-churn)', alpha=1)
    df[df["Churn"] == 1][column].hist(bins=20, color='tomato', label='churn = 1 (churn)', alpha=1)
    plt.legend(fontsize='medium')
    plt.title(column)
# age_group and age are highly correlated, so drop age_group
# (remove redundant features that are strongly correlated with another feature)
df = df.drop(columns=["age_group"])
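# The drop above follows from reading the heatmap; as a programmatic cross-check
# (a sketch, not part of the original analysis), list any remaining feature pairs
# whose absolute correlation exceeds a chosen threshold such as 0.9:
corr = df.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
high_pairs = [(row, col, round(upper.loc[row, col], 2))
              for row in upper.index for col in upper.columns
              if upper.loc[row, col] > 0.9]
print(high_pairs)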
# Check the resulting dataframe
df

all_feature_col = ["call_failure", "subs_len", "total_sec_calls", "total_num_calls",
                   "total_num_sms", "distinct_call_nums", "customer_value"]
# Pairplot of the numeric features, colored by churn
sns.pairplot(df[all_feature_col + ['Churn']], hue='Churn')
df
"""## Splitting the dataset into training set and testing set"""
from sklearn.model_selection import train_test_split
X = df.drop('Churn', axis=1)
y = df.loc[:, 'Churn'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
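# Because the split uses stratify=y, the churn ratio should be (nearly) identical
# in both subsets; a quick sanity check, not part of the original notebook:
print(f"Churn rate overall: {y.mean():.3f}")
print(f"Churn rate in train: {y_train.mean():.3f}")
print(f"Churn rate in test:  {y_test.mean():.3f}")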
y_test
y
print("X_train size is", len(X_train)) print("y_train size is", len(y_train)) print('\n')
print("X_test size is", len(X_test)) print("y_test size is", len(y_test))
"""## Normalization"""
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_normalized_arr = scaler.transform(X_train)
X_train_normalized_df = pd.DataFrame(X_train_normalized_arr, columns=list(X.columns))

X_test_normalized_arr = scaler.transform(X_test)
X_test_normalized_df = pd.DataFrame(X_test_normalized_arr, columns=list(X.columns))
print(len(X_train_normalized_arr))
print(len(X_test_normalized_arr))
print(len(X_train_normalized_df))
print(len(X_test_normalized_df))
X_train_normalized_df
X_test_normalized_df
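# Note that the classification section below fits the models on the raw features.
# If the MinMax scaling above should be applied to scale-sensitive models such as
# KNN, one option (a sketch, not part of the original flow) is to bundle scaler
# and estimator in a Pipeline so the scaler is only ever fit on the training data:
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier

knn_scaled = make_pipeline(MinMaxScaler(), KNeighborsClassifier())
knn_scaled.fit(X_train, y_train)
print(knn_scaled.score(X_test, y_test))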
"""# Classification"""
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.naive_bayes import GaussianNB

classifiers = [
    DecisionTreeClassifier(),
    GaussianNB(),
    KNeighborsClassifier(),
    LogisticRegression(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    Perceptron(),
]

accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
for classifier in classifiers:
    # Note: the models are fit on the raw (unnormalized) features here
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

    print(f"Classifier: {type(classifier).__name__}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))
    print("--------------------------------------------")
labels = [type(classifier).__name__ for classifier in classifiers]
x = range(len(classifiers))
plt.figure(figsize=(10, 8))
# Note: the four metric bars share the same x positions, so they are drawn overlaid
plt.bar(x, accuracy_scores, label='Accuracy')
plt.bar(x, precision_scores, label='Precision')
plt.bar(x, recall_scores, label='Recall')
plt.bar(x, f1_scores, label='F1-Score')

accuracy_scores_percentage = [score * 100 for score in accuracy_scores]

def annotate_accuracy(x, y):
    # Write the accuracy percentage in the middle of each classifier's bar
    for i in range(len(x)):
        plt.text(x[i], accuracy_scores[i] / 2, f'{int(y[i])}%', ha='center', color='white')

annotate_accuracy(x, accuracy_scores_percentage)

plt.xlabel('Classifiers')
plt.ylabel('Score')
plt.title('Performance of All Classification Models')
plt.xticks(x, labels, rotation=90)
plt.legend(bbox_to_anchor=(1.05, 1.0))
plt.show()
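# The scores above are weighted averages; with an imbalanced churn label they can
# hide weak performance on the minority (churn) class. A brief sketch (not in the
# original notebook) of per-class precision/recall for one of the fitted models,
# here simply the last classifier from the loop above:
from sklearn.metrics import classification_report
print(classification_report(y_test, classifiers[-1].predict(X_test)))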
"""# Regression"""
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

y = df.loc[:, 'Churn'].values
X = df.drop('Churn', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree Regressor": DecisionTreeRegressor(random_state=42),
    "SVR (RBF)": SVR(kernel='rbf'),
    "Random Forest Regressor": RandomForestRegressor(random_state=42),
}

metrics = {
    "Mean Squared Error (MSE)": mean_squared_error,
    "Mean Absolute Error (MAE)": mean_absolute_error,
    "Root Mean Squared Error (RMSE)": lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred)),
}
for model_name, model in models.items():
    print(f"Training and evaluating {model_name}")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Threshold the continuous predictions at 0.5 to obtain 0/1 churn labels
    final_y_pred = np.where(y_pred > 0.5, 1, 0)

    # Print evaluation metrics
    print("Evaluation metrics:")
    for metric_name, metric_func in metrics.items():
        metric_value = metric_func(y_test, y_pred)
        print(f"{metric_name}: {metric_value:.2f}")
    print(f"R-squared: {model.score(X_test, y_test):.4f}")
    print('\n')
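# The loop above already derives thresholded class labels (final_y_pred) from each
# regressor's continuous output; a minimal sketch (not in the original notebook)
# that scores those thresholded predictions with the classification metrics used
# earlier, so the regressors can be compared against the classifiers directly:
for model_name, model in models.items():
    y_pred_cls = np.where(model.predict(X_test) > 0.5, 1, 0)
    print(f"{model_name}: accuracy={accuracy_score(y_test, y_pred_cls):.4f}, "
          f"F1={f1_score(y_test, y_pred_cls):.4f}")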