import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.decomposition import PCA
# Load the rpy2 IPython extension, which provides the %R / %%R magics used below
%load_ext rpy2.ipython
# Attach ggplot2 in the embedded R session; the plotting cells below call ggplot()
%R require(ggplot2)
# Load the first worksheet (index 0) of the workbook and give its columns short
# identifiers; presumably one row per offer -- verify against the workbook.
# The keyword is `sheet_name`: the older `sheetname` spelling was deprecated in
# pandas 0.21 and is rejected by current pandas releases.
df_offers = pd.read_excel("./data/WineKMC.xlsx", sheet_name=0)
df_offers.columns = ["offer_id", "campaign", "varietal", "min_qty", "discount", "origin", "past_peak"]
df_offers.head()
# Load the second worksheet (index 1); presumably one row per
# (customer, offer) transaction -- verify against the workbook.
df_transactions = pd.read_excel("./data/WineKMC.xlsx", sheet_name=1)
df_transactions.columns = ["customer_name", "offer_id"]
# Constant marker column: every transaction row counts as 1 so that it can be
# aggregated into a customer-by-offer table later.
df_transactions['n'] = 1
df_transactions.head()
# Join offers to transactions on the column(s) the two frames share
# (only `offer_id`, given the column names assigned to each frame).
df = df_offers.merge(df_transactions)
df.head()
df.tail()
# Pivot to one row per customer and one column per offer, holding the
# aggregated `n` marker. Customer/offer pairs with no transaction come out as
# NaN and are replaced by 0; `customer_name` is then moved from the index back
# into an ordinary first column.
matrix = (
    df.pivot_table(index=['customer_name'], columns=['offer_id'], values='n')
    .fillna(0)
    .reset_index()
)
matrix.head()
# Feature columns: everything after the leading `customer_name` column.
x_cols = matrix.columns[1:]
# Candidate cluster counts to compare (2 through 7).
range_n_clusters = [2, 3, 4, 5, 6, 7]
for n_clusters in range_n_clusters:
    # Fit k-means with this many clusters; the fixed seed of 10 keeps the
    # assignment reproducible between runs.
    kmeans = KMeans(n_clusters=n_clusters, random_state=10)
    cluster_labels = kmeans.fit_predict(matrix[x_cols])
    # Average silhouette score over all rows for this clustering.
    silhouette_avg = silhouette_score(matrix[x_cols], cluster_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)
# Final model: five clusters with a fixed seed of 4. (The choice of 5 is the
# author's; presumably informed by the silhouette scores -- confirm.)
cluster = KMeans(n_clusters=5, random_state=4)
# Fit on the offer columns and store each customer's cluster label.
cluster.fit(matrix[x_cols])
matrix['cluster'] = cluster.labels_
# Number of customers assigned to each cluster.
matrix['cluster'].value_counts()
%%R -i matrix -w 800 -h 600 -u px
# Bar chart of cluster sizes. `-i matrix` passes the Python data frame `matrix`
# into the R session; `-w 800 -h 600 -u px` set the plot size in pixels.
# One bar per cluster label (treated as a factor), with the row count printed
# above each bar (stat='count' supplies the count, vjust=-1 lifts the label).
# NOTE(review): `..count..` is the older ggplot2 notation for computed
# variables; newer releases spell it after_stat(count). Confirm the installed
# ggplot2 version before changing it.
ggplot(matrix, aes(x=factor(cluster))) +
geom_bar() +
geom_text(stat='count',
aes(label=..count..), vjust=-1, colour="black") +
xlab("Cluster") +
ylab("Customers\n(# in cluster)")
# Reduce the offer columns to two principal components so customers can be
# drawn on a 2-D scatter plot.
pca = PCA(n_components=2)
# Fit once and reuse the projection for both coordinates; calling
# fit_transform separately for each column would refit the model on the same
# data just to read a different column of the result.
components = pca.fit_transform(matrix[x_cols])
matrix['x'] = components[:, 0]
matrix['y'] = components[:, 1]
matrix.head()
# Reset the index again; with the default arguments the current index is moved
# into a new column rather than dropped.
matrix = matrix.reset_index()
matrix.head()
# Per-customer summary: name, assigned cluster and 2-D PCA coordinates.
customer_clusters = matrix.loc[:, ['customer_name', 'cluster', 'x', 'y']]
customer_clusters.head()
# Attach the cluster and coordinates to every transaction; the join uses the
# column the two frames share (`customer_name`).
df = df_transactions.merge(customer_clusters)
df.head()
# Then attach the offer details; the join uses the shared `offer_id` column.
df = df_offers.merge(df)
df.head()
%%R -i df -w 800 -h 600 -u px
# Scatter plot of the rows of `df` at their PCA coordinates (x, y).
# Shape 21 is a filled circle: the fill shows the cluster (as a factor) and the
# outline is a fixed grey.
ggplot(df, aes(x=x, y=y)) +
  geom_point(aes(fill=factor(cluster)), shape=21, size=3, col="#7f7f7f") +
  theme_bw(base_family="Helvetica") +
  labs(title="Customers Grouped by Cluster")
# Project the fitted k-means centroids through the same PCA transform used for
# the customers, so they can be drawn on the same axes.
centers_2d = pca.transform(cluster.cluster_centers_)
cluster_centers = pd.DataFrame(centers_2d, columns=['x', 'y'])
# Number the rows 0..k-1 in order; this assumes row i of `cluster_centers_`
# is the centroid of cluster label i.
cluster_centers['cluster'] = range(len(cluster_centers))
cluster_centers.head()
%%R -i df,cluster_centers -w 800 -h 600 -u px
# Same scatter plot as before, with the cluster centers drawn on top.
# `-i df,cluster_centers` passes both Python data frames into the R session.
# Layers, in draw order:
#   1. rows of `df`, filled by cluster, with a grey outline
#   2. a small red dot at each cluster center
#   3. a large, nearly transparent red disc around each center (a halo);
#      show.legend=FALSE keeps this layer out of the legend. The original
#      `legend=FALSE` is not a layer parameter in ggplot2 >= 2.0 and is ignored
#      with a warning.
#   4. the cluster number printed just below each center
ggplot(df) +
  geom_point(aes(x=x, y=y, fill=factor(cluster)), size=3, col="#7f7f7f", shape=21) +
  geom_point(data=cluster_centers, aes(x=x, y=y), color="red") +
  geom_point(data=cluster_centers, aes(x=x, y=y), color="red", size=100, alpha=.1, show.legend=FALSE) +
  geom_text(data=cluster_centers, aes(x=x, y=y, label=cluster), size=5, vjust=1.5, colour="black") +
  theme_bw(base_family="Helvetica") +
  ggtitle("Customers Grouped by Cluster")
# Flag the rows whose customer was assigned to cluster 2, then compare that
# group against everyone else.
df['is_2'] = df['cluster'].eq(2)
# Varietal frequencies inside and outside cluster 2.
df.groupby("is_2")['varietal'].value_counts()
# Mean minimum quantity and mean discount inside and outside cluster 2.
df.groupby("is_2")[['min_qty', 'discount']].mean()