Commit e4fa6f5: Create DMA Clustering File
neelgupta2112 authored Mar 22, 2023 (1 parent: 039e21f)
Showing 1 changed file with 93 additions and 0 deletions: DMA Clustering File

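### Assumed upstream setup (not part of the original commit). The variables below are
### referenced throughout this file but defined elsewhere; the example values are a
### guess based on the 'DMA_CODE' and 'TOTAL_RETAIL' columns used further down, so
### adjust them to match the actual store_sales DataFrame.
# import pandas as pd
# store_sales = pd.read_csv('store_sales.csv')  # hypothetical source file
# store_sales_datecol = 'WEEK_DATE'             # hypothetical date column name
# store_sales_keycol = 'DMA_CODE'
# store_sales_KPIcol = 'TOTAL_RETAIL'
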
### RUN - Pivoting store_sales into wide format needed for segmentation
store_sales_wide = (store_sales[[store_sales_datecol, store_sales_keycol, store_sales_KPIcol]]
                    .pivot(index=store_sales_datecol,
                           columns=store_sales_keycol,
                           values=store_sales_KPIcol)
                    #.fillna(0)
                    )

sales_corr_db = store_sales_wide.corr()

## Parameters for segmentation
# Correlation cutoff for a DMA pair to count as a 'best match' (i.e. an edge in the graph)
# Current best parameters for AZO are 0.86 and 7
corr_cutoff = 0.74
# Minimum DMA count for a group; don't change this lightly, and if you feel like you need
# to go down to 1, this probably isn't a useful tool for your data
min_dma_count = 1
# Minimum share of total sales a clique must represent to become its own group
sales_volume_threshold = 0.07


## Turn this into an adjacency matrix and then create a mapping to hold index info
A = sales_corr_db.to_numpy()
A = (A >= corr_cutoff).astype(int)

mapping = dict(zip(range(len(sales_corr_db)), sales_corr_db.index))

# Graph density is a useful metric for tuning corr_cutoff above
# (subtract the n diagonal entries; each off-diagonal edge is counted twice in the symmetric matrix)
n = len(A)
graph_density = (A.sum() - n) / (n * (n - 1))
print(graph_density)
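
### Optional tuning sketch (not in the original commit): sweep a few candidate cutoffs
### and print the resulting graph density, so corr_cutoff can be set where the graph is
### neither near-empty nor near-complete. The candidate values are illustrative only.
for cutoff in (0.70, 0.74, 0.80, 0.86):
    A_try = (sales_corr_db.to_numpy() >= cutoff).astype(int)
    n_try = len(A_try)
    density_try = (A_try.sum() - n_try) / (n_try * (n_try - 1))
    print(f"cutoff={cutoff:.2f} -> density={density_try:.3f}")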

# Find maximal cliques
import networkx as nx
import time

from networkx.algorithms.clique import find_cliques as maximal_cliques

G = nx.from_numpy_array(A)
start = time.time()
#cliques2 = [s for s in nx.enumerate_all_cliques(G) if len(s) > min_dma_count]
cliques2 = [s for s in maximal_cliques(G) if len(s) >= min_dma_count]
end = time.time()
time_taken = end - start
print(time_taken)

# Create a mapping dictionary from DMA to sales, so we can calculate the sales volume of a clique
sales_mapping = {}
for dma in store_sales['DMA_CODE'].unique():
    sales_mapping[dma] = store_sales[store_sales['DMA_CODE'] == dma]['TOTAL_RETAIL'].sum()
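# Design note (added): the loop above is equivalent to a single pandas groupby, e.g.
# sales_mapping = store_sales.groupby('DMA_CODE')['TOTAL_RETAIL'].sum().to_dict()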

# Function to get the total sales of a clique
def sales_sum(clique1):
    return sum(sales_mapping[dma] for dma in clique1)

# Convert numpy indices back into DMAs
cliques = []
for element in cliques2:
    new_element = []
    for element2 in element:
        new_element.append(mapping[element2])
    cliques.append(new_element)

# Turn maximal cliques into clusters: repeatedly take the clique with the largest sales
# volume, record it as a group, remove its DMAs from the remaining cliques, and iterate
# until no qualifying maximal cliques are left
groups = {}
grouped_DMA = []
total_sales = store_sales['TOTAL_RETAIL'].sum()
i = 1
while len(cliques) != 0:
    largest = max(cliques, key=sales_sum)
    if (sales_sum(largest) / total_sales) >= sales_volume_threshold:
        cliques.remove(largest)
        groupname = 'Group_' + str(i)
        i += 1
        groups.update({groupname: largest})
        grouped_DMA = grouped_DMA + largest
        # Drop the newly grouped DMAs from every remaining clique
        for clique in cliques:
            for item in clique.copy():
                if item in largest:
                    clique.remove(item)
    else:
        break
    cliques = [clique for clique in cliques if len(clique) >= min_dma_count]

groups['Other'] = [dma for (index, dma) in mapping.items() if dma not in grouped_DMA]
print(groups)
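
### Quick check (illustrative, not in the original commit): report each group's DMA count
### and its share of total sales, to confirm every named group clears
### sales_volume_threshold. This assumes the clique members are the same DMA codes used
### as keys in sales_mapping.
for name, dmas in groups.items():
    share = sales_sum(dmas) / total_sales
    print(f"{name}: {len(dmas)} DMAs, {share:.1%} of total sales")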






