import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.decomposition import PCA
# Load in the r magic
%load_ext rpy2.ipython
# We need ggplot2
%R require(ggplot2)

array([1])

df_offers = pd.read_excel("./data/WineKMC.xlsx", sheetname=0)
df_offers.columns = ["offer_id", "campaign", "varietal", "min_qty", "discount", "origin", "past_peak"]
df_offers.head()

df_transactions = pd.read_excel("./data/WineKMC.xlsx", sheetname=1)
df_transactions.columns = ["customer_name", "offer_id"]
df_transactions['n'] = 1
df_transactions.head()

df = pd.merge(df_offers, df_transactions)
df.head()
df.tail()

matrix = df.pivot_table(index=['customer_name'], columns=['offer_id'], values='n')
matrix = matrix.fillna(0).reset_index()
matrix.head()
x_cols = matrix.columns[1:]

range_n_clusters = [2, 3, 4, 5, 6, 7]

for n_clusters in range_n_clusters:
    # initialize kmeans for each n clusters between 2--6
    kmeans = KMeans(n_clusters=n_clusters, random_state=10) # seed of 10 for reproducibility.
    cluster_labels = kmeans.fit_predict(matrix[x_cols])

    # silhouette_score for n clusters
    silhouette_avg = silhouette_score(matrix[x_cols], cluster_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)

('For n_clusters =', 2, 'The average silhouette_score is :', 0.093655732834927133)
('For n_clusters =', 3, 'The average silhouette_score is :', 0.11889942863638556)
('For n_clusters =', 4, 'The average silhouette_score is :', 0.12347053919571699)
('For n_clusters =', 5, 'The average silhouette_score is :', 0.14092516241984757)
('For n_clusters =', 6, 'The average silhouette_score is :', 0.1371798939109807)
('For n_clusters =', 7, 'The average silhouette_score is :', 0.1161092456616906)

cluster = KMeans(n_clusters=5, random_state=4)
matrix['cluster'] = cluster.fit_predict(matrix[x_cols])
matrix.cluster.value_counts()

2    29
1    22
0    21
4    16
3    12
Name: cluster, dtype: int64

%%R -i matrix -w 800 -h 600 -u px

ggplot(matrix, aes(x=factor(cluster))) + 
                            geom_bar() +
geom_text(stat='count', 
          aes(label=..count..), vjust=-1, colour="black") +
                       xlab("Cluster") + 
                       ylab("Customers\n(# in cluster)")

pca = PCA(n_components=2)
matrix['x'] = pca.fit_transform(matrix[x_cols])[:,0]
matrix['y'] = pca.fit_transform(matrix[x_cols])[:,1]
matrix.head()
matrix = matrix.reset_index()
matrix.head()

customer_clusters = matrix[['customer_name', 'cluster', 'x', 'y']]
customer_clusters.head()

df = pd.merge(df_transactions, customer_clusters)
df.head()
df = pd.merge(df_offers, df)
df.head()

%%R -i df -w 800 -h 600 -u px

ggplot(df) + 
geom_point(aes(x=x, y=y, fill=factor(cluster)), size=3, col="#7f7f7f", shape=21) + 
theme_bw(base_family="Helvetica") +
ggtitle("Customers Grouped by Cluster")

cluster_centers = pca.transform(cluster.cluster_centers_)
cluster_centers = pd.DataFrame(cluster_centers, columns=['x', 'y'])
cluster_centers['cluster'] = range(0, len(cluster_centers))
cluster_centers.head()

%%R -i df,cluster_centers -w 800 -h 600 -u px

ggplot(df) + 
geom_point(aes(x=x, y=y, fill=factor(cluster)), size=3, col="#7f7f7f", shape=21) + 
geom_point(data=cluster_centers, aes(x=x, y=y), color="red") +
geom_point(data=cluster_centers, aes(x=x, y=y), color="red", size=100, alpha=.1, legend=FALSE) +
geom_text(data=cluster_centers, aes(x=x, y=y, label=cluster), size=5, vjust=1.5, colour="black") +
theme_bw(base_family="Helvetica") +
ggtitle("Customers Grouped by Cluster")

df['is_2'] = df.cluster==2
df.groupby("is_2").varietal.value_counts()
df.groupby("is_2")[['min_qty', 'discount']].mean()

is_2   varietal          
False  Champagne             57
       Espumante             40
       Prosecco              36
       Pinot Noir            34
       Malbec                25
       Pinot Grigio          16
       Cabernet Sauvignon    12
       Merlot                10
       Chardonnay             4
True   Champagne             24
       Cabernet Sauvignon    20
       Chardonnay            11
       Merlot                10
       Pinot Noir            10
       Malbec                 7
       Prosecco               7
       Pinot Grigio           1
Name: varietal, dtype: int64

	customer_name	offer_id	n
0	Smith	2	1
1	Smith	24	1
2	Johnson	17	1
3	Johnson	24	1
4	Johnson	26	1

offer_id	customer_name	cluster	x	y
0	Adams	4	1.007580	0.108215
1	Allen	2	-0.287539	0.044715
2	Anderson	3	-0.392032	1.038391
3	Bailey	0	0.699477	-0.022542
4	Baker	0	0.088183	-0.471695

	customer_name	offer_id	n	cluster	x	y
0	Smith	2	1	3	-0.300227	0.829765
1	Smith	24	1	3	-0.300227	0.829765
2	Johnson	17	1	3	-0.461726	1.313485
3	Johnson	24	1	3	-0.461726	1.313485
4	Johnson	26	1	3	-0.461726	1.313485

	offer_id	campaign	varietal	min_qty	discount	origin	past_peak	customer_name	n	cluster	x	y
0	1	January	Malbec	72	56	France	False	Thomas	1	2	-1.041879	0.122314
1	1	January	Malbec	72	56	France	False	Jackson	1	1	-0.839578	-0.328823
2	1	January	Malbec	72	56	France	False	Mitchell	1	2	-0.342938	0.302186
3	1	January	Malbec	72	56	France	False	Peterson	1	2	-0.797379	0.475995
4	1	January	Malbec	72	56	France	False	Wood	1	2	-0.551215	-0.395670

	x	y	cluster
0	0.493408	-0.085877	0
1	-0.473904	-0.625128	1
2	-0.367099	0.044973	2
3	-0.402085	1.129321	3
4	0.970952	0.043761	4

	offer_id	campaign	varietal	min_qty	discount	origin	past_peak	customer_name	n
319	31	December	Champagne	72	89	France	False	Fisher	1
320	32	December	Cabernet Sauvignon	72	45	Germany	True	Young	1
321	32	December	Cabernet Sauvignon	72	45	Germany	True	Cooper	1
322	32	December	Cabernet Sauvignon	72	45	Germany	True	Gomez	1
323	32	December	Cabernet Sauvignon	72	45	Germany	True	Kelly	1

offer_id	customer_name	7	9	...	24	26	27	29	30	31
0	Adams	0.0	0.0	...	0.0	0.0	0.0	1.0	1.0	0.0
1	Allen	0.0	1.0	...	0.0	0.0	1.0	0.0	0.0	0.0
2	Anderson	0.0	0.0	...	1.0	1.0	0.0	0.0	0.0	0.0
3	Bailey	1.0	0.0	...	0.0	0.0	0.0	0.0	1.0	0.0
4	Baker	1.0	0.0	...	0.0	0.0	0.0	0.0	0.0	1.0

offer_id	customer_name	7	9	...	26	27	29	30	31	cluster	x	y
0	Adams	0.0	0.0	...	0.0	0.0	1.0	1.0	0.0	4	1.007580	0.108215
1	Allen	0.0	1.0	...	0.0	1.0	0.0	0.0	0.0	2	-0.287539	0.044715
2	Anderson	0.0	0.0	...	1.0	0.0	0.0	0.0	0.0	3	-0.392032	1.038391
3	Bailey	1.0	0.0	...	0.0	0.0	0.0	1.0	0.0	0	0.699477	-0.022542
4	Baker	1.0	0.0	...	0.0	0.0	0.0	0.0	1.0	0	0.088183	-0.471695

offer_id	index	customer_name	7	...	26	27	29	30	31	cluster	x	y
0	0	Adams	0.0	...	0.0	0.0	1.0	1.0	0.0	4	1.007580	0.108215
1	1	Allen	0.0	...	0.0	1.0	0.0	0.0	0.0	2	-0.287539	0.044715
2	2	Anderson	0.0	...	1.0	0.0	0.0	0.0	0.0	3	-0.392032	1.038391
3	3	Bailey	1.0	...	0.0	0.0	0.0	1.0	0.0	0	0.699477	-0.022542
4	4	Baker	1.0	...	0.0	0.0	0.0	0.0	1.0	0	0.088183	-0.471695