diff --git a/machine_learning/k_means_clust.py b/machine_learning/k_means_clust.py
index 86a5dd968779..d5fa31135073 100644
--- a/machine_learning/k_means_clust.py
+++ b/machine_learning/k_means_clust.py
@@ -47,12 +47,18 @@
             k
         )
 
-    5. Have fun..
+    5. Transfers the dataframe into Excel format; it must have a feature
+       called 'Cluster' with the k-means cluster numbers in it.
+
     """
 import numpy as np
+import pandas as pd
 from matplotlib import pyplot as plt
 from sklearn.metrics import pairwise_distances
+import warnings
+
+warnings.filterwarnings("ignore")
 
 TAG = "K-MEANS-CLUST/ "
 
 
@@ -202,3 +208,158 @@ def kmeans(
         verbose=True,
     )
     plot_heterogeneity(heterogeneity, k)
+
+
+def ReportGenerator(
+    df: pd.DataFrame, ClusteringVariables: np.ndarray, FillMissingReport=None
+) -> pd.DataFrame:
+    """
+    Function generates an easy-reading clustering report. It takes 3 arguments as input:
+        DataFrame - dataframe with the predicted cluster column;
+        ClusteringVariables - array of column names used as clustering features;
+        FillMissingReport - dictionary of rules for filling missing values (not included in modeling);
+    in order to run the function, the following libraries must be imported:
+        import pandas as pd
+        import numpy as np
+
+    >>> data = pd.DataFrame()
+    >>> data['numbers'] = [1, 2, 3]
+    >>> data['col1'] = [0.5, 2.5, 4.5]
+    >>> data['col2'] = [100, 200, 300]
+    >>> data['col3'] = [10, 20, 30]
+    >>> data['Cluster'] = [1, 1, 2]
+    >>> ReportGenerator(data, ['col1', 'col2'], 0)
+               Features               Type   Mark           1           2
+    0    # of Customers        ClusterSize  False    2.000000    1.000000
+    1    % of Customers  ClusterProportion  False    0.666667    0.333333
+    2              col1    mean_with_zeros   True    1.500000    4.500000
+    3              col2    mean_with_zeros   True  150.000000  300.000000
+    4           numbers    mean_with_zeros  False    1.500000    3.000000
+    ..              ...                ...    ...         ...         ...
+    99            dummy                 5%  False    1.000000    1.000000
+    100           dummy                95%  False    1.000000    1.000000
+    101           dummy              stdev  False    0.000000         NaN
+    102           dummy               mode  False    1.000000    1.000000
+    103           dummy             median  False    1.000000    1.000000
+    <BLANKLINE>
+    [104 rows x 5 columns]
+    """
+    # Fill missing values with given rules
+    if FillMissingReport:
+        df.fillna(value=FillMissingReport, inplace=True)
+    df["dummy"] = 1
+    numeric_cols = df.select_dtypes(np.number).columns
+    report = (
+        df.groupby(["Cluster"])[  # construct report dataframe
+            numeric_cols
+        ]  # group by cluster number
+        .agg(
+            [
+                ("sum", np.sum),
+                ("mean_with_zeros", lambda x: np.mean(np.nan_to_num(x))),
+                ("mean_without_zeros", lambda x: x.replace(0, np.nan).mean()),
+                (
+                    "mean_25-75",
+                    lambda x: np.mean(
+                        np.nan_to_num(
+                            sorted(x)[
+                                round((len(x) * 25 / 100)) : round(len(x) * 75 / 100)
+                            ]
+                        )
+                    ),
+                ),
+                ("mean_with_na", np.mean),
+                ("min", lambda x: x.min()),
+                ("5%", lambda x: x.quantile(0.05)),
+                ("25%", lambda x: x.quantile(0.25)),
+                ("50%", lambda x: x.quantile(0.50)),
+                ("75%", lambda x: x.quantile(0.75)),
+                ("95%", lambda x: x.quantile(0.95)),
+                ("max", lambda x: x.max()),
+                ("count", lambda x: x.count()),
+                ("stdev", lambda x: x.std()),
+                ("mode", lambda x: x.mode()[0]),
+                ("median", lambda x: x.median()),
+                ("# > 0", lambda x: (x > 0).sum()),
+            ]
+        )
+        .T.reset_index()
+        .rename(index=str, columns={"level_0": "Features", "level_1": "Type"})
+    )  # rename columns
+
+    clustersize = report[
+        (report["Features"] == "dummy") & (report["Type"] == "count")
+    ]  # calculating size of cluster (count of client IDs)
+    clustersize.Type = (
+        "ClusterSize"  # rename created cluster df to match report column names
+    )
+    clustersize.Features = "# of Customers"
+    clusterproportion = pd.DataFrame(
+        clustersize.iloc[:, 2:].values
+        / clustersize.iloc[:, 2:].values.sum()  # calculating proportion of cluster
+    )
+    clusterproportion[
+        "Type"
+    ] = "% of Customers"  # rename created cluster df to match report column names
+    clusterproportion["Features"] = "ClusterProportion"
+    cols = clusterproportion.columns.tolist()
+    cols = cols[-2:] + cols[:-2]
+    clusterproportion = clusterproportion[cols]  # rearrange columns to match report
+    clusterproportion.columns = report.columns
+    a = pd.DataFrame(
+        abs(
+            report[report["Type"] == "count"].iloc[:, 2:].values
+            - clustersize.iloc[:, 2:].values
+        )
+    )  # generating df with count of nan values
+    a["Features"] = 0
+    a["Type"] = "# of nan"
+    a.Features = report[
+        report["Type"] == "count"
+    ].Features.tolist()  # filling values in order to match report
+    cols = a.columns.tolist()
+    cols = cols[-2:] + cols[:-2]
+    a = a[cols]  # rearrange columns to match report
+    a.columns = report.columns  # rename columns to match report
+    report = report.drop(
+        report[report.Type == "count"].index
+    )  # drop count values except cluster size
+    report = pd.concat(
+        [report, a, clustersize, clusterproportion], axis=0
+    )  # concat report with cluster size and nan values
+    report["Mark"] = report["Features"].isin(ClusteringVariables)
+    cols = report.columns.tolist()
+    cols = cols[0:2] + cols[-1:] + cols[2:-1]
+    report = report[cols]
+    sorter1 = {
+        "ClusterSize": 9,
+        "ClusterProportion": 8,
+        "mean_with_zeros": 7,
+        "mean_with_na": 6,
+        "max": 5,
+        "50%": 4,
+        "min": 3,
+        "25%": 2,
+        "75%": 1,
+        "# of nan": 0,
+        "# > 0": -1,
+        "sum_with_na": -2,
+    }
+    report = (
+        report.assign(
+            Sorter1=lambda x: x.Type.map(sorter1),
+            Sorter2=lambda x: list(reversed(range(len(x)))),
+        )
+        .sort_values(["Sorter1", "Mark", "Sorter2"], ascending=False)
+        .drop(["Sorter1", "Sorter2"], axis=1)
+    )
+    report.columns.name = ""
+    report = report.reset_index()
+    report.drop(columns=["index"], inplace=True)
+    return report
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod()
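
Reviewer note: a minimal end-to-end sketch of how the new ReportGenerator could be driven, assuming it has been imported from machine_learning.k_means_clust. The customers dataframe, its column names, and the use of scikit-learn's KMeans below are illustrative assumptions only; the patch itself just requires a numeric dataframe that carries a 'Cluster' column.

    import pandas as pd
    from sklearn.cluster import KMeans

    # Toy data; any numeric dataframe works once a 'Cluster' column is attached.
    customers = pd.DataFrame(
        {"spend": [10.0, 12.0, 95.0, 99.0], "visits": [1, 2, 8, 9]}
    )

    # The report function does not care how the labels are produced;
    # scikit-learn's KMeans is used here purely for illustration.
    customers["Cluster"] = KMeans(n_clusters=2, random_state=0).fit_predict(
        customers[["spend", "visits"]]
    )

    report = ReportGenerator(customers, ["spend", "visits"])
    print(report.head())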