import numpy as np
import pandas as pd
from pandas.api.types import is_string_dtype, is_numeric_dtype
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from mpl_toolkits.mplot3d import Axes3D
from yellowbrick.cluster import KElbowVisualizer
from yellowbrick.cluster import SilhouetteVisualizer

import warnings
warnings.filterwarnings('ignore')

sns.set_style('darkgrid')


# Load the mall-customer data set from the current working directory.
df = pd.read_csv('Mall_Customers.csv')


# Preview the first rows of the raw data.
df.head()


# (rows, columns) of the raw data.
df.shape

(200, 5)


# Column dtypes, non-null counts and memory usage of the raw data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   CustomerID              200 non-null    int64 
 1   Gender                  200 non-null    object
 2   Age                     200 non-null    int64 
 3   Annual Income (k$)      200 non-null    int64 
 4   Spending Score (1-100)  200 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 7.9+ KB


# Per-column flag: does the column contain at least one missing value?
df.isnull().any()
# df.isnull().sum()
# sns.heatmap(df.isnull(),cmap = 'magma',cbar = False);
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

CustomerID                False
Gender                    False
Age                       False
Annual Income (k$)        False
Spending Score (1-100)    False
dtype: bool

0


# Summary statistics, transposed so that each column of df becomes a row.
df.describe().T


# Tidy the column labels (trim whitespace, spaces -> underscores), shorten the
# two verbose names, and discard the identifier column, which is not a feature.
tidy_names = [label.strip().replace(' ', '_') for label in df.columns]
df.columns = tidy_names
short_names = {
    'Annual_Income_(k$)': 'Annual_Income',
    'Spending_Score_(1-100)': 'Spending_Score',
}
df = df.rename(columns=short_names).drop(columns='CustomerID')


# Check the column labels after renaming and dropping CustomerID.
df.columns

Index(['Gender', 'Age', 'Annual_Income', 'Spending_Score'], dtype='object')


# Partition the columns by dtype: numeric columns first, then string-like
# columns among whatever is not numeric. Columns matching neither are ignored.
numerical_features = [name for name in df if is_numeric_dtype(df[name])]
categorical_features = [
    name
    for name in df
    if not is_numeric_dtype(df[name]) and is_string_dtype(df[name])
]

print('Categorical Features :', categorical_features)
print('Numerical Features :', numerical_features)

Categorical Features : ['Gender']
Numerical Features : ['Age', 'Annual_Income', 'Spending_Score']


# Make a function to create numeric plots
def create_numeric_plot(columns):
    """Draw a box plot and a distribution plot for each numeric column.

    One subplot row is created per entry of ``columns``: the left panel is a
    box plot titled with the column's mean, median and standard deviation,
    the right panel is the distribution plot. Data is read from the
    module-level ``df``.

    Parameters
    ----------
    columns : sequence of str
        Names of numeric columns of ``df`` to plot. An empty sequence is a
        no-op.
    """
    if len(columns) == 0:
        # plt.subplots(0, 2) would raise; there is simply nothing to draw.
        return

    # squeeze=False keeps ``axs`` two-dimensional, so a single column still
    # yields rows of (left, right) axes instead of a flat 1-D array.
    fig, axs = plt.subplots(len(columns), 2, figsize=(9, 8), squeeze=False)
    for (box_ax, dist_ax), col in zip(axs, columns):
        values = df[col]
        # Pass the data as ``x=`` explicitly: the meaning of the first
        # positional argument differs between seaborn releases.
        sns.boxplot(x=values, ax=box_ax)
        # NOTE: distplot is deprecated in recent seaborn releases; it is kept
        # so the rendered figure stays the same as before.
        sns.distplot(values, ax=dist_ax)
        box_ax.set_title('mean = %.2f\n median = %.2f\n std = %.2f'%(values.mean(), values.median(), values.std()))
    # The former plt.setp(axs) call set nothing; with no properties given it
    # only printed the list of settable Axes properties, so it is removed.
    plt.tight_layout()
    plt.show()

# Call create_numeric_plot function
create_numeric_plot(numerical_features)

  adjustable: {'box', 'datalim'}
  agg_filter: a filter function, which takes a (m, n, 3) float array and a dpi value, and returns a (m, n, 3) array
  alpha: scalar or None
  anchor: (float, float) or {'C', 'SW', 'S', 'SE', 'E', 'NE', ...}
  animated: bool
  aspect: {'auto', 'equal'} or float
  autoscale_on: bool
  autoscalex_on: bool
  autoscaley_on: bool
  axes_locator: Callable[[Axes, Renderer], Bbox]
  axisbelow: bool or 'line'
  box_aspect: float or None
  clip_box: `.Bbox`
  clip_on: bool
  clip_path: Patch or (Path, Transform) or None
  facecolor or fc: color
  figure: `.Figure`
  frame_on: bool
  gid: str
  in_layout: bool
  label: object
  navigate: bool
  navigate_mode: unknown
  path_effects: `.AbstractPathEffect`
  picker: None or bool or float or callable
  position: [left, bottom, width, height] or `~matplotlib.transforms.Bbox`
  prop_cycle: unknown
  rasterization_zorder: float or None
  rasterized: bool
  sketch_params: (scale: float, length: float, randomness: float)
  snap: bool or None
  subplotspec: unknown
  title: str
  transform: `.Transform`
  url: str
  visible: bool
  xbound: unknown
  xlabel: str
  xlim: (bottom: float, top: float)
  xmargin: float greater than -0.5
  xscale: {"linear", "log", "symlog", "logit", ...} or `.ScaleBase`
  xticklabels: unknown
  xticks: unknown
  ybound: unknown
  ylabel: str
  ylim: (bottom: float, top: float)
  ymargin: float greater than -0.5
  yscale: {"linear", "log", "symlog", "logit", ...} or `.ScaleBase`
  yticklabels: unknown
  yticks: unknown
  zorder: float


# Remove outliers with the 1.5 * IQR rule.
# All bounds are computed once on the unfiltered data and applied as a single
# row mask, so the result does not depend on the order of the columns
# (filtering column by column would recompute later quartiles on data that
# earlier columns had already trimmed).
df.shape
Q1 = df[numerical_features].quantile(0.25)
Q3 = df[numerical_features].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
# axis=1 aligns the per-column bounds with the columns of the frame.
within_bounds = (
    df[numerical_features].ge(lower_bound, axis=1)
    & df[numerical_features].le(upper_bound, axis=1)
)
# Keep only rows that are inside the bounds for every numeric column.
df = df[within_bounds.all(axis=1)].reset_index(drop=True)
df.shape
df.head()

(200, 4)

(198, 4)


# Number of customers per Gender value after outlier removal.
df['Gender'].value_counts()

Female    112
Male       86
Name: Gender, dtype: int64


# Same counts expressed as fractions of the total.
df['Gender'].value_counts(normalize=True)

Female    0.565657
Male      0.434343
Name: Gender, dtype: float64


# Make a function to create categorical plots
def create_categorical_plot(columns):
    """Draw an annotated count plot for each categorical column.

    One subplot row is created per entry of ``columns``. Each plot shows at
    most the ten most frequent values of the column, with the count written
    above every bar and the y-axis hidden. Data is read from the module-level
    ``df``.

    Parameters
    ----------
    columns : sequence of str
        Names of categorical columns of ``df`` to plot. An empty sequence is
        a no-op.
    """
    if len(columns) == 0:
        # plt.subplots(0, 1) would raise; there is simply nothing to draw.
        return

    # squeeze=False always returns a 2-D array of axes, so the same code works
    # for one column and for several (previously only one column worked).
    # The figure grows by 5 inches per column; one column gives the old 5x5.
    fig, axs = plt.subplots(
        len(columns), 1, figsize=(5, 5 * len(columns)), squeeze=False
    )
    for ax, col in zip(axs[:, 0], columns):
        most_frequent = df[col].value_counts().head(10).index
        sns.countplot(x=df[col], order=most_frequent, ax=ax)
        ax.set_title('Countplot '+ col, fontsize = 20)
        ax.tick_params(axis='x', labelrotation=0)

        # Write each bar's height just above the bar.
        for bar in ax.patches:
            ax.annotate(format(bar.get_height(), '.0f'),
                        (bar.get_x() + bar.get_width() / 2., bar.get_height()),
                        ha = 'center',
                        va = 'center',
                        xytext = (0, 10),
                        textcoords = 'offset points')

        # Counts are annotated on the bars, so the y-axis is redundant.
        ax.yaxis.set_visible(False)

    # Style and show the figure once, after every subplot has been drawn.
    # The former plt.setp(axs) call set nothing; with no properties given it
    # only printed the list of settable Axes properties, so it is removed.
    sns.despine(fig=fig, right=True, top=True, left=True)
    plt.tight_layout()
    plt.show()

# Call create_categorical_plot function
create_categorical_plot(categorical_features)

  adjustable: {'box', 'datalim'}
  agg_filter: a filter function, which takes a (m, n, 3) float array and a dpi value, and returns a (m, n, 3) array
  alpha: scalar or None
  anchor: (float, float) or {'C', 'SW', 'S', 'SE', 'E', 'NE', ...}
  animated: bool
  aspect: {'auto', 'equal'} or float
  autoscale_on: bool
  autoscalex_on: bool
  autoscaley_on: bool
  axes_locator: Callable[[Axes, Renderer], Bbox]
  axisbelow: bool or 'line'
  box_aspect: float or None
  clip_box: `.Bbox`
  clip_on: bool
  clip_path: Patch or (Path, Transform) or None
  facecolor or fc: color
  figure: `.Figure`
  frame_on: bool
  gid: str
  in_layout: bool
  label: object
  navigate: bool
  navigate_mode: unknown
  path_effects: `.AbstractPathEffect`
  picker: None or bool or float or callable
  position: [left, bottom, width, height] or `~matplotlib.transforms.Bbox`
  prop_cycle: unknown
  rasterization_zorder: float or None
  rasterized: bool
  sketch_params: (scale: float, length: float, randomness: float)
  snap: bool or None
  subplotspec: unknown
  title: str
  transform: `.Transform`
  url: str
  visible: bool
  xbound: unknown
  xlabel: str
  xlim: (bottom: float, top: float)
  xmargin: float greater than -0.5
  xscale: {"linear", "log", "symlog", "logit", ...} or `.ScaleBase`
  xticklabels: unknown
  xticks: unknown
  ybound: unknown
  ylabel: str
  ylim: (bottom: float, top: float)
  ymargin: float greater than -0.5
  yscale: {"linear", "log", "symlog", "logit", ...} or `.ScaleBase`
  yticklabels: unknown
  yticks: unknown
  zorder: float


# Pairwise scatter plots of the numeric features, coloured by Gender, with
# KDE curves on the diagonal.
# pairplot builds its own figure, so no plt.figure() call precedes it: the
# earlier one only produced an empty "<Figure ... with 0 Axes>". Use the
# ``height`` argument of pairplot if a different size is wanted.
sns.pairplot(data=df, hue='Gender', diag_kind='kde')
plt.show()

<Figure size 720x720 with 0 Axes>


# Pairwise correlations of the numeric columns only; the string column
# 'Gender' cannot be correlated and makes DataFrame.corr() fail on recent
# pandas versions.
corr = df.select_dtypes(include='number').corr()

# Boolean masks derived from the matrix shape rather than from its values, so
# a correlation that happens to be exactly 0 is still masked correctly.
# Both masks include the diagonal.
ut = np.triu(np.ones_like(corr, dtype=bool))
lt = np.tril(np.ones_like(corr, dtype=bool))

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

# Hiding the upper triangle leaves the lower triangle visible.
sns.heatmap(corr, cmap='magma', annot=True, cbar=True, mask=ut, ax=ax[0])
ax[0].set_title('Correlation Matrix : Lower Triangular Format')

# Hiding the lower triangle leaves the upper triangle visible.
sns.heatmap(corr, cmap='magma', annot=True, cbar=True, mask=lt, ax=ax[1])
ax[1].set_title('Correlation Matrix : Upper Triangular Format')


# Feature matrices to cluster, one list entry per experiment.
# Only the three-feature experiment (Spending vs. Age vs. Annual Income) is
# active; a two-feature variant (Spending vs. Age) could be appended the same
# way using df[["Age", "Spending_Score"]].
X = [df[["Age", "Annual_Income", "Spending_Score"]].values]

# Column names belonging to each entry of X, in the same order.
N = [list(df[["Age", "Annual_Income", "Spending_Score"]].columns)]


# For every feature matrix: find the number of clusters with the elbow method,
# remember it in K, then draw the silhouette plot for that number of clusters.
K = []
for features in X:
    model = KMeans(random_state=42)
    # A (2, 20) tuple is treated as a range of k values to try.
    visualizer = KElbowVisualizer(model, k=(2, 20))
    visualizer.fit(features)
    visualizer.show()

    best_k = visualizer.elbow_value_
    if best_k is None:
        # yellowbrick sets elbow_value_ to None when it cannot locate an
        # elbow; fail with a clear message instead of passing None to KMeans.
        raise ValueError(
            'KElbowVisualizer found no elbow for k=(2, 20); '
            'choose the number of clusters manually.'
        )
    K.append(best_k)

    model = KMeans(n_clusters=best_k, random_state=42)
    sil_visualizer = SilhouetteVisualizer(model)
    sil_visualizer.fit(features)
    # show() already renders the figure, so no extra plt.show() is needed.
    sil_visualizer.show()

KElbowVisualizer(ax=<AxesSubplot:>,
                 estimator=KMeans(n_clusters=19, random_state=42), k=(2, 20))

<AxesSubplot:title={'center':'Distortion Score Elbow for KMeans Clustering'}, xlabel='k', ylabel='distortion score'>

SilhouetteVisualizer(ax=<AxesSubplot:>,
                     estimator=KMeans(n_clusters=6, random_state=42))

<AxesSubplot:title={'center':'Silhouette Plot of KMeans Clustering for 198 Samples in 6 Centers'}, xlabel='silhouette coefficient values', ylabel='cluster label'>


# https://stackoverflow.com/questions/65325834/using-a-variable-for-the-group-by-method-in-by-python-pandas
def create_all_summary(df, features, column_to_aggregate, agg_method):
    """Group ``df`` and aggregate the selected column(s).

    Parameters
    ----------
    df : pandas.DataFrame
        Data to summarise.
    features : label or list of labels
        Column(s) to group by.
    column_to_aggregate : label or list of labels
        Column(s) picked from each group.
    agg_method : str, callable or list of these
        Aggregation(s) handed to ``.agg``.

    Returns
    -------
    The result of the grouped aggregation, as returned by pandas.
    """
    grouped = df.groupby(features)
    selected = grouped[column_to_aggregate]
    return selected.agg(agg_method)

# Numerical Data Cluster Visualization
def cluster_num_plot():
    """Show one box plot per numeric feature, split by cluster.

    Reads the module-level ``df`` (which must contain a ``cluster`` column)
    and ``numerical_features``; each feature gets its own 6x4 figure.
    """
    for feature in numerical_features:
        heading = '\nBox Plot {}\n'.format(feature)
        plt.figure(figsize=(6, 4))
        sns.boxplot(data=df, x='cluster', y=feature)
        plt.title(heading, fontsize=15)
        plt.show()

def _draw_cluster_countplot(feature, hue=None):
    """Draw one annotated count plot of ``df['cluster']``.

    ``feature`` is only used in the title; ``hue`` optionally splits every
    cluster bar by that column and adds a legend.
    """
    plt.figure(figsize=(9,7))
    ax = sns.countplot(data = df, x = 'cluster', hue = hue)
    plt.title('\nCount Plot {}\n'.format(feature), fontsize=15)
    if hue is not None:
        ax.legend(loc="upper center")

    # Write each bar's height just above the bar.
    for p in ax.patches:
        ax.annotate(format(p.get_height(), '.0f'),
                    (p.get_x() + p.get_width() / 2., p.get_height()),
                    ha = 'center',
                    va = 'center',
                    xytext = (0, 10),
                    textcoords = 'offset points')

    # Counts are annotated on the bars, so the y-axis is redundant.
    sns.despine(right=True,top = True, left = True)
    ax.axes.yaxis.set_visible(False)
    plt.show()


def cluster_cat_plot():
    """Show cluster-size count plots for every categorical feature.

    Reads the module-level ``df`` (which must contain a ``cluster`` column)
    and ``categorical_features``. For each feature a plain cluster count plot
    is drawn first; afterwards, for each feature, a second plot splits every
    cluster by that feature's values.
    """
    # Overall cluster sizes (the feature only appears in the title).
    for feature in categorical_features:
        _draw_cluster_countplot(feature)

    # Cluster sizes broken down by the categorical feature.
    for feature in categorical_features:
        _draw_cluster_countplot(feature, hue=feature)

# Fit the final KMeans model for every feature matrix, label the rows of df,
# plot the clusters and print per-cluster summary statistics.
for features, n_clusters, feature_names in zip(X, K, N):
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42)
    kmeans.fit(features)
    df['cluster'] = kmeans.labels_
    cluster_num_plot()
    cluster_cat_plot()
    # Expression results inside a loop are not displayed automatically, so the
    # summaries are printed explicitly instead of being silently discarded.
    summary_stats = ['count', 'min', 'mean', 'max']
    print(create_all_summary(df, ['cluster'], feature_names, summary_stats))
    print(create_all_summary(df, ['cluster', 'Gender'], feature_names, summary_stats))

KMeans(n_clusters=6, random_state=42)

	count	mean	std	min	25%	50%	75%	max
CustomerID	200.0	100.50	57.879185	1.0	50.75	100.5	150.25	200.0
Age	200.0	38.85	13.969007	18.0	28.75	36.0	49.00	70.0
Annual Income (k$)	200.0	60.56	26.264721	15.0	41.50	61.5	78.00	137.0
Spending Score (1-100)	200.0	50.20	25.823522	1.0	34.75	50.0	73.00	99.0

	Age				Annual_Income				Spending_Score
	count	min	mean	max	count	min	mean	max	count	min	mean	max
cluster
0	38	27	32.763158	40	38	69	85.210526	126	38	63	82.105263	97
1	21	19	44.142857	67	21	15	25.142857	39	21	3	19.523810	40
2	34	19	41.970588	59	34	71	86.794118	126	34	1	17.264706	39
3	38	18	27.000000	40	38	39	56.657895	76	38	29	49.131579	61
4	22	18	25.272727	35	22	15	25.727273	39	22	61	79.363636	99
5	45	43	56.155556	70	45	38	53.377778	67	45	35	49.088889	60

		Age				Annual_Income				Spending_Score
		count	min	mean	max	count	min	mean	max	count	min	mean	max
cluster	Gender
0	Female	21	27	32.190476	38	21	70	86.047619	120	21	69	81.666667	95
0	Male	17	27	33.470588	40	17	69	84.176471	126	17	63	82.647059	97
1	Female	13	20	41.538462	58	13	16	26.538462	39	13	5	20.692308	40
1	Male	8	19	48.375000	67	8	15	22.875000	33	8	3	17.625000	39
2	Female	15	34	44.600000	57	15	73	92.333333	126	15	5	21.600000	39
2	Male	19	19	39.894737	59	19	71	82.421053	113	19	1	13.842105	36
3	Female	25	18	27.960000	40	25	39	57.360000	76	25	29	47.120000	61
3	Male	13	18	25.153846	40	13	42	55.307692	67	13	41	53.000000	60
4	Female	13	20	25.461538	35	13	16	25.692308	39	13	65	80.538462	99
4	Male	9	18	25.000000	35	9	15	25.777778	38	9	61	77.666667	92
5	Female	25	43	54.080000	68	25	38	53.240000	67	25	35	49.520000	59
5	Male	20	47	58.750000	70	20	39	53.550000	63	20	36	48.550000	60

Let's start clustering ...

★ Mall Customer Segmentation Dataset ★

Description:

Gender has no effect on the distribution of the other features, so it is excluded from our feature set.

Persona