In [1]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.decomposition import PCA
# Load in the r magic
%load_ext rpy2.ipython
# We need ggplot2
%R require(ggplot2)
Out[1]:
array([1])
In [2]:
df_offers = pd.read_excel("./data/WineKMC.xlsx", sheetname=0)
df_offers.columns = ["offer_id", "campaign", "varietal", "min_qty", "discount", "origin", "past_peak"]
df_offers.head()
Out[2]:
offer_id campaign varietal min_qty discount origin past_peak
0 1 January Malbec 72 56 France False
1 2 January Pinot Noir 72 17 France False
2 3 February Espumante 144 32 Oregon True
3 4 February Champagne 72 48 France True
4 5 February Cabernet Sauvignon 144 44 New Zealand True
In [3]:
df_transactions = pd.read_excel("./data/WineKMC.xlsx", sheetname=1)
df_transactions.columns = ["customer_name", "offer_id"]
df_transactions['n'] = 1
df_transactions.head()
Out[3]:
customer_name offer_id n
0 Smith 2 1
1 Smith 24 1
2 Johnson 17 1
3 Johnson 24 1
4 Johnson 26 1
In [4]:
df = pd.merge(df_offers, df_transactions)
df.head()
df.tail()
Out[4]:
offer_id campaign varietal min_qty discount origin past_peak customer_name n
0 1 January Malbec 72 56 France False Thomas 1
1 1 January Malbec 72 56 France False Jackson 1
2 1 January Malbec 72 56 France False Mitchell 1
3 1 January Malbec 72 56 France False Peterson 1
4 1 January Malbec 72 56 France False Wood 1
Out[4]:
offer_id campaign varietal min_qty discount origin past_peak customer_name n
319 31 December Champagne 72 89 France False Fisher 1
320 32 December Cabernet Sauvignon 72 45 Germany True Young 1
321 32 December Cabernet Sauvignon 72 45 Germany True Cooper 1
322 32 December Cabernet Sauvignon 72 45 Germany True Gomez 1
323 32 December Cabernet Sauvignon 72 45 Germany True Kelly 1
In [5]:
matrix = df.pivot_table(index=['customer_name'], columns=['offer_id'], values='n')
matrix = matrix.fillna(0).reset_index()
matrix.head()
x_cols = matrix.columns[1:]
Out[5]:
offer_id customer_name 1 2 3 4 5 6 7 8 9 ... 23 24 25 26 27 28 29 30 31 32
0 Adams 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 0.0 0.0
1 Allen 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
2 Anderson 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0
3 Bailey 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
4 Baker 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0

5 rows × 33 columns

In [6]:
range_n_clusters = [2, 3, 4, 5, 6, 7]

for n_clusters in range_n_clusters:
    # initialize kmeans for each n clusters between 2--6
    kmeans = KMeans(n_clusters=n_clusters, random_state=10) # seed of 10 for reproducibility.
    cluster_labels = kmeans.fit_predict(matrix[x_cols])

    # silhouette_score for n clusters
    silhouette_avg = silhouette_score(matrix[x_cols], cluster_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)
('For n_clusters =', 2, 'The average silhouette_score is :', 0.093655732834927133)
('For n_clusters =', 3, 'The average silhouette_score is :', 0.11889942863638556)
('For n_clusters =', 4, 'The average silhouette_score is :', 0.12347053919571699)
('For n_clusters =', 5, 'The average silhouette_score is :', 0.14092516241984757)
('For n_clusters =', 6, 'The average silhouette_score is :', 0.1371798939109807)
('For n_clusters =', 7, 'The average silhouette_score is :', 0.1161092456616906)
In [7]:
cluster = KMeans(n_clusters=5, random_state=4)
matrix['cluster'] = cluster.fit_predict(matrix[x_cols])
matrix.cluster.value_counts()
Out[7]:
2    29
1    22
0    21
4    16
3    12
Name: cluster, dtype: int64
In [8]:
%%R -i matrix -w 800 -h 600 -u px

ggplot(matrix, aes(x=factor(cluster))) + 
                            geom_bar() +
geom_text(stat='count', 
          aes(label=..count..), vjust=-1, colour="black") +
                       xlab("Cluster") + 
                       ylab("Customers\n(# in cluster)")
In [9]:
pca = PCA(n_components=2)
matrix['x'] = pca.fit_transform(matrix[x_cols])[:,0]
matrix['y'] = pca.fit_transform(matrix[x_cols])[:,1]
matrix.head()
matrix = matrix.reset_index()
matrix.head()
Out[9]:
offer_id customer_name 1 2 3 4 5 6 7 8 9 ... 26 27 28 29 30 31 32 cluster x y
0 Adams 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 1.0 1.0 0.0 0.0 4 1.007580 0.108215
1 Allen 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 ... 0.0 1.0 0.0 0.0 0.0 0.0 0.0 2 -0.287539 0.044715
2 Anderson 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 1.0 0.0 0.0 0.0 0.0 0.0 0.0 3 -0.392032 1.038391
3 Bailey 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0 0.699477 -0.022542
4 Baker 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0 0.088183 -0.471695

5 rows × 36 columns

Out[9]:
offer_id index customer_name 1 2 3 4 5 6 7 8 ... 26 27 28 29 30 31 32 cluster x y
0 0 Adams 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 1.0 1.0 0.0 0.0 4 1.007580 0.108215
1 1 Allen 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 1.0 0.0 0.0 0.0 0.0 0.0 2 -0.287539 0.044715
2 2 Anderson 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 1.0 0.0 0.0 0.0 0.0 0.0 0.0 3 -0.392032 1.038391
3 3 Bailey 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 ... 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0 0.699477 -0.022542
4 4 Baker 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0 0.088183 -0.471695

5 rows × 37 columns

In [10]:
customer_clusters = matrix[['customer_name', 'cluster', 'x', 'y']]
customer_clusters.head()
Out[10]:
offer_id customer_name cluster x y
0 Adams 4 1.007580 0.108215
1 Allen 2 -0.287539 0.044715
2 Anderson 3 -0.392032 1.038391
3 Bailey 0 0.699477 -0.022542
4 Baker 0 0.088183 -0.471695
In [11]:
df = pd.merge(df_transactions, customer_clusters)
df.head()
df = pd.merge(df_offers, df)
df.head()
Out[11]:
customer_name offer_id n cluster x y
0 Smith 2 1 3 -0.300227 0.829765
1 Smith 24 1 3 -0.300227 0.829765
2 Johnson 17 1 3 -0.461726 1.313485
3 Johnson 24 1 3 -0.461726 1.313485
4 Johnson 26 1 3 -0.461726 1.313485
Out[11]:
offer_id campaign varietal min_qty discount origin past_peak customer_name n cluster x y
0 1 January Malbec 72 56 France False Thomas 1 2 -1.041879 0.122314
1 1 January Malbec 72 56 France False Jackson 1 1 -0.839578 -0.328823
2 1 January Malbec 72 56 France False Mitchell 1 2 -0.342938 0.302186
3 1 January Malbec 72 56 France False Peterson 1 2 -0.797379 0.475995
4 1 January Malbec 72 56 France False Wood 1 2 -0.551215 -0.395670
In [12]:
%%R -i df -w 800 -h 600 -u px

ggplot(df) + 
geom_point(aes(x=x, y=y, fill=factor(cluster)), size=3, col="#7f7f7f", shape=21) + 
theme_bw(base_family="Helvetica") +
ggtitle("Customers Grouped by Cluster")
In [13]:
cluster_centers = pca.transform(cluster.cluster_centers_)
cluster_centers = pd.DataFrame(cluster_centers, columns=['x', 'y'])
cluster_centers['cluster'] = range(0, len(cluster_centers))
cluster_centers.head()
Out[13]:
x y cluster
0 0.493408 -0.085877 0
1 -0.473904 -0.625128 1
2 -0.367099 0.044973 2
3 -0.402085 1.129321 3
4 0.970952 0.043761 4
In [14]:
%%R -i df,cluster_centers -w 800 -h 600 -u px

ggplot(df) + 
geom_point(aes(x=x, y=y, fill=factor(cluster)), size=3, col="#7f7f7f", shape=21) + 
geom_point(data=cluster_centers, aes(x=x, y=y), color="red") +
geom_point(data=cluster_centers, aes(x=x, y=y), color="red", size=100, alpha=.1, legend=FALSE) +
geom_text(data=cluster_centers, aes(x=x, y=y, label=cluster), size=5, vjust=1.5, colour="black") +
theme_bw(base_family="Helvetica") +
ggtitle("Customers Grouped by Cluster")
In [15]:
df['is_2'] = df.cluster==2
df.groupby("is_2").varietal.value_counts()
df.groupby("is_2")[['min_qty', 'discount']].mean()
Out[15]:
is_2   varietal          
False  Champagne             57
       Espumante             40
       Prosecco              36
       Pinot Noir            34
       Malbec                25
       Pinot Grigio          16
       Cabernet Sauvignon    12
       Merlot                10
       Chardonnay             4
True   Champagne             24
       Cabernet Sauvignon    20
       Chardonnay            11
       Merlot                10
       Pinot Noir            10
       Malbec                 7
       Prosecco               7
       Pinot Grigio           1
Name: varietal, dtype: int64
Out[15]:
min_qty discount
is_2
False 47.358974 57.170940
True 87.133333 65.488889