Let's start with binary classification ...

Titanic

★ Titanic Dataset ★

Description:

The sinking of the Titanic is one of the most infamous shipwrecks in history.

On April 15, 1912, during her maiden voyage, the widely considered “unsinkable” RMS Titanic sank after colliding with an iceberg. Unfortunately, there weren’t enough lifeboats for everyone onboard, resulting in the death of 1502 out of 2224 passengers and crew.

While there was some element of luck involved in surviving, it seems some groups of people were more likely to survive than others.

Meet and Greet Data

  1. The Survived variable is our outcome, or dependent, variable. It is a binary nominal datatype: 1 for survived and 0 for did not survive. All other variables are potential predictor, or independent, variables. It is important to note that more predictor variables do not make a better model; the right variables do.
  2. The PassengerId and Ticket variables are assumed to be random unique identifiers with no impact on the outcome variable, so they are excluded from the analysis.
  3. The Pclass variable is an ordinal datatype for the ticket class, a proxy for socio-economic status (SES): 1 = upper class, 2 = middle class, and 3 = lower class.
  4. The Name variable is a nominal datatype. It could be used in feature engineering to derive gender from the title, family size from the surname, and SES from titles such as doctor or master. Since these signals are largely covered by existing variables, we only note that a title such as Master could make a difference (a short sketch of such derived features follows this list).
  5. The Sex and Embarked variables are nominal datatypes. They will be converted to dummy variables for the mathematical calculations.
  6. The Age and Fare variables are continuous quantitative datatypes.
  7. SibSp represents the number of related siblings/spouses aboard and Parch represents the number of related parents/children aboard. Both are discrete quantitative datatypes and can be used in feature engineering to create family-size and is-alone variables.
  8. The Cabin variable is a nominal datatype that could be used in feature engineering to approximate a passenger's position on the ship at the time of the incident and their SES from the deck level. However, since it has many null values, it does not add value and is excluded from the analysis.
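
For reference, here is a minimal sketch of how such derived features could be built from the raw columns before any are dropped. The column names are those of the standard Titanic CSV loaded below; Title, FamilySize and IsAlone are illustrative names and are not used later in this notebook.

import pandas as pd

raw = pd.read_csv('./tested.csv')

# Title from the Name column, e.g. "Braund, Mr. Owen Harris" -> "Mr"
raw['Title'] = raw['Name'].str.extract(r',\s*([^.]+)\.', expand=False).str.strip()

# Family size and is-alone flag from SibSp and Parch
raw['FamilySize'] = raw['SibSp'] + raw['Parch'] + 1
raw['IsAlone'] = (raw['FamilySize'] == 1).astype(int)

print(raw[['Title', 'FamilySize', 'IsAlone']].head())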

1. Data Exploration

In [ ]:
import os
import math
import scipy
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import tree
from scipy.stats import randint
from scipy.stats import loguniform
from IPython.display import display

from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import RFE
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold

from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

from scikitplot.metrics import plot_roc_curve as auc_roc
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, \
f1_score, roc_auc_score, roc_curve, precision_score, recall_score

from keras.models import Sequential
from keras.layers import Dense
# from keras.optimizers import SGD,Adam
from tensorflow.keras.optimizers import SGD, Adam

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [10,6]

import warnings 
warnings.filterwarnings('ignore')

sns.set_style('darkgrid')

import tensorflow as tf
def set_seed(seed=1234):
    np.random.seed(seed)
    tf.random.set_seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    os.environ['TF_DETERMINISTIC_OPS'] = '1'
set_seed()
In [ ]:
import pandas as pd
import numpy as np

# Load the Titanic data ('tested.csv'), list its columns, and drop the
# identifier / high-null columns flagged in the data review above
# (PassengerId, Name, Ticket, Cabin).

df = pd.read_csv('./tested.csv')
df.columns.to_list()
df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
df.head()
df.tail()
Out[ ]:
['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']
Out[ ]:
Survived Pclass Sex Age SibSp Parch Fare Embarked
0 0 3 male 34.5 0 0 7.8292 Q
1 1 3 female 47.0 1 0 7.0000 S
2 0 2 male 62.0 0 0 9.6875 Q
3 0 3 male 27.0 0 0 8.6625 S
4 1 3 female 22.0 1 1 12.2875 S
Out[ ]:
Survived Pclass Sex Age SibSp Parch Fare Embarked
413 0 3 male NaN 0 0 8.0500 S
414 1 1 female 39.0 0 0 108.9000 C
415 0 3 male 38.5 0 0 7.2500 S
416 0 3 male NaN 0 0 8.0500 S
417 0 3 male NaN 1 1 22.3583 C
In [ ]:
target = 'Survived'
labels = ['Not-Survived', 'Survived']
features = [i for i in df.columns.values if i not in [target]]
print(features)
['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
In [ ]:
original_df = df.copy(deep=True)
original_df.head()
Out[ ]:
Survived Pclass Sex Age SibSp Parch Fare Embarked
0 0 3 male 34.5 0 0 7.8292 Q
1 1 3 female 47.0 1 0 7.0000 S
2 0 2 male 62.0 0 0 9.6875 Q
3 0 3 male 27.0 0 0 8.6625 S
4 1 3 female 22.0 1 1 12.2875 S
In [ ]:
print('\n\033[1mInference:\033[0m The dataset consists of {} columns & {} samples.'.format(df.shape[1], df.shape[0]))
Inference: The dataset consists of 8 columns & 418 samples.
In [ ]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  418 non-null    int64  
 1   Pclass    418 non-null    int64  
 2   Sex       418 non-null    object 
 3   Age       332 non-null    float64
 4   SibSp     418 non-null    int64  
 5   Parch     418 non-null    int64  
 6   Fare      417 non-null    float64
 7   Embarked  418 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 26.2+ KB
In [ ]:
# Checking the number of unique values in each feature
df.nunique().sort_values()
Out[ ]:
Survived      2
Sex           2
Pclass        3
Embarked      3
SibSp         7
Parch         8
Age          79
Fare        169
dtype: int64
In [ ]:
# Checking the number of unique values in each feature

nu = df[features].nunique().sort_values()
nf = []; cf = []; #nnf = 0; ncf = 0; #numerical & categorical features

for i in range(df[features].shape[1]):
    if nu.values[i]<=15:cf.append(nu.index[i])
    else: nf.append(nu.index[i])

print('\n\033[1mInference:\033[0m The dataset has {} numerical & {} categorical features.'.format(len(nf),len(cf)))
Inference: The dataset has 2 numerical & 5 categorical features.
In [ ]:
# Checking the stats of all the columns
df.describe().T
Out[ ]:
count mean std min 25% 50% 75% max
Survived 418.0 0.363636 0.481622 0.00 0.0000 0.0000 1.0 1.0000
Pclass 418.0 2.265550 0.841838 1.00 1.0000 3.0000 3.0 3.0000
Age 332.0 30.272590 14.181209 0.17 21.0000 27.0000 39.0 76.0000
SibSp 418.0 0.447368 0.896760 0.00 0.0000 0.0000 1.0 8.0000
Parch 418.0 0.392344 0.981429 0.00 0.0000 0.0000 0.0 9.0000
Fare 417.0 35.627188 55.907576 0.00 7.8958 14.4542 31.5 512.3292
In [ ]:
df.value_counts(target)
Out[ ]:
Survived
0    266
1    152
dtype: int64
In [ ]:
df.groupby(target).size()
Out[ ]:
Survived
0    266
1    152
dtype: int64
In [ ]:
# Correlation with target  
#df[features].apply(lambda x: x.corr(df[target]))
In [ ]:
# Correlation with target  
df[features].corrwith(df[target])
Out[ ]:
Pclass   -0.108615
Age      -0.000013
SibSp     0.099943
Parch     0.159120
Fare      0.191514
dtype: float64
In [ ]:
# type(df[features].corrwith(df[target]))
# dir(df[features].corrwith(df[target]))
In [ ]:
df[features].corrwith(df[target]).index
df[features].corrwith(df[target]).values
Out[ ]:
Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')
Out[ ]:
array([-1.08614521e-01, -1.26823571e-05,  9.99433703e-02,  1.59120113e-01,
        1.91513743e-01])
In [ ]:
# x = df[features].corrwith(df[target]).index.tolist()
# y = df[features].corrwith(df[target]).values.round(2).tolist()
 
# # creating the bar plot
# plt.bar(x, y, color ='salmon', width = 0.4)
# plt.title("Correlation with target", fontweight='bold', size=20)
# for i in range(len(y)):
#     plt.annotate(str(y[i]), xy=(x[i],y[i]), ha='center', va='bottom')
# plt.show()
In [ ]:
x = df[features].corrwith(df[target]).index.tolist()
y = df[features].corrwith(df[target]).values.round(2).tolist()

ax = sns.barplot(x=x, y=y)
for i in ax.containers:
    ax.bar_label(i,)
plt.title('Correlation with Target')
plt.show()
Out[ ]:
[Text(0, 0, '-0.11'),
 Text(0, 0, '0'),
 Text(0, 0, '0.1'),
 Text(0, 0, '0.16'),
 Text(0, 0, '0.19')]
Out[ ]:
Text(0.5, 1.0, 'Correlation with Target')
In [ ]:
# plt.figure(figsize=(8,6))
sns.heatmap(df[features].corr(), annot=True)
Out[ ]:
<AxesSubplot:>
In [ ]:
sns.pairplot(df, hue=target, height=2)
Out[ ]:
<seaborn.axisgrid.PairGrid at 0x21539f08ac0>
In [ ]:
#fig, axes = plt.subplots(2, 2)
  
#axes[0,0].set_title("Sepal Length")
#axes[0,0].hist(df['sepal length (cm)'])
  
#axes[0,1].set_title("Sepal Width")
#axes[0,1].hist(df['sepal width (cm)']);
  
#axes[1,0].set_title("Petal Length")
#axes[1,0].hist(df['petal length (cm)']);
  
#axes[1,1].set_title("Petal Width")
#axes[1,1].hist(df['petal width (cm)']);

2. Exploratory Data Analysis (EDA)

In [ ]:
#Let us first analyze the distribution of the target variable

MAP={}
for e, i in enumerate(df[target].unique()):
    MAP[i]=labels[e]
#MAP={0:'Not-Survived',1:'Survived'}
df1 = df.copy()
df1[target]=df1[target].map(MAP)
explode=np.zeros(len(labels))
explode[-1]=0.1
print('\033[1mTarget Variable Distribution'.center(55))
plt.pie(df1[target].value_counts(), labels=df1[target].value_counts().index, counterclock=False, shadow=True, 
        explode=explode, autopct='%1.1f%%', radius=1, startangle=0)
plt.show()
            Target Variable Distribution           
Out[ ]:
([<matplotlib.patches.Wedge at 0x2153ce95370>,
  <matplotlib.patches.Wedge at 0x2153ce95d30>],
 [Text(-0.45695648023571717, -1.000595210447554, 'Not-Survived'),
  Text(0.4984979784389643, 1.0915584113973316, 'Survived')],
 [Text(-0.24924898921948208, -0.5457792056986657, '63.6%'),
  Text(0.29079048742272917, 0.6367424066484433, '36.4%')])
In [ ]:
# # distplot is deprecated
# plot = sns.FacetGrid(df, hue="Species")
# plot.map(sns.distplot, "sepal length (cm)").add_legend()
  
# plot = sns.FacetGrid(df, hue="Species")
# plot.map(sns.distplot, "sepal width (cm)").add_legend()
  
# plot = sns.FacetGrid(df, hue="Species")
# plot.map(sns.distplot, "petal length (cm)").add_legend()
  
# plot = sns.FacetGrid(df, hue="Species")
# plot.map(sns.distplot, "petal width (cm)").add_legend()
  
# plt.show()
In [ ]:
# # https://stackoverflow.com/questions/63895392/seaborn-is-not-plotting-within-defined-subplots
# #fig, axes = plt.subplots(2,2)
# #sns.set(rc={"figure.figsize": (8, 4)});
# #subplot(2,2,1)
# sns.displot(df, x="sepal length (cm)", kind="kde", hue="Species", ax=axes[0,0])
# #subplot(2,2,2)
# sns.displot(df, x="sepal width (cm)", kind="kde", hue="Species", ax=axes[0,1])
# #subplot(2,2,3)
# sns.displot(df, x="petal length (cm)", kind="kde", hue="Species", ax=axes[1,0])
# #subplot(2,2,4)
# sns.displot(df, x="petal width (cm)", kind="kde", hue="Species", ax=axes[1,1])  
# plt.show()
In [ ]:
# https://stackoverflow.com/questions/63895392/seaborn-is-not-plotting-within-defined-subplots
# plot = sns.FacetGrid(df, hue="Species")
# plot.map(displot, "sepal length (cm)").add_legend()
# plot.map(displot, "sepal width (cm)").add_legend()
# plot.map(displot, "petal length (cm)").add_legend()
# plot.map(displot, "petal width (cm)").add_legend()  
# plt.show()
In [ ]:
df1['Sex'] = df1['Sex'].map({'male':1.0, 'female':0.0})
df_melt = df1[[target]+nf].melt(id_vars=target, var_name='attribute', value_name= 'measurement')

df_melt.head()

min_x, max_x = -100, 300

sns.displot(
    data=df_melt, 
    x='measurement', 
    #hue='Species', 
    #kind='kde',
    kde =True, 
    #fill=True,
    col='attribute',
    #bins=10,
    binrange=(min_x, max_x),
    #kde_kws={'clip': (min_x, max_x)},
    facet_kws=dict(sharey=False, sharex=False) # different scale on x-axis, y-axis
)
Out[ ]:
Survived attribute measurement
0 Not-Survived Age 34.5
1 Survived Age 47.0
2 Not-Survived Age 62.0
3 Not-Survived Age 27.0
4 Survived Age 22.0
Out[ ]:
<seaborn.axisgrid.FacetGrid at 0x2153cc4c430>
In [ ]:
sns.boxplot(
    data=df_melt, 
    y='attribute', 
    x='measurement',
    #hue='Species'
    orient="h"
)
Out[ ]:
<AxesSubplot:xlabel='measurement', ylabel='attribute'>
In [ ]:
sns.boxplot(
    data=df_melt, 
    x='attribute', 
    y='measurement',
    #hue='Species',
    orient="v"
)
Out[ ]:
<AxesSubplot:xlabel='attribute', ylabel='measurement'>
In [ ]:
# Visualising the categorical features 

print('\033[1mVisualising Categorical Features:'.center(100))

n=3
plt.figure(figsize=[15,3*math.ceil(len(cf)/n)])

for i in range(len(cf)):
    if df[cf[i]].nunique()<=15:
        plt.subplot(math.ceil(len(cf)/n),n,i+1)
        ax = sns.countplot(x=df[cf[i]])
        ax.bar_label(container=ax.containers[0])
    #else:
    #    plt.subplot(2,2,i)
    #    sns.countplot(df[cf[i]])
plt.tight_layout()
plt.show()
                               Visualising Categorical Features:                                
Out[ ]:
<Figure size 1080x432 with 0 Axes>
Out[ ]:
<AxesSubplot:>
Out[ ]:
[Text(0, 0, '266'), Text(0, 0, '152')]
Out[ ]:
<AxesSubplot:>
Out[ ]:
[Text(0, 0, '107'), Text(0, 0, '93'), Text(0, 0, '218')]
Out[ ]:
<AxesSubplot:>
Out[ ]:
[Text(0, 0, '46'), Text(0, 0, '270'), Text(0, 0, '102')]
Out[ ]:
<AxesSubplot:>
Out[ ]:
[Text(0, 0, '283'),
 Text(0, 0, '110'),
 Text(0, 0, '14'),
 Text(0, 0, '4'),
 Text(0, 0, '4'),
 Text(0, 0, '1'),
 Text(0, 0, '2')]
Out[ ]:
<AxesSubplot:>
Out[ ]:
[Text(0, 0, '324'),
 Text(0, 0, '52'),
 Text(0, 0, '33'),
 Text(0, 0, '3'),
 Text(0, 0, '2'),
 Text(0, 0, '1'),
 Text(0, 0, '1'),
 Text(0, 0, '2')]
In [ ]:
# Understanding the Numerical feature set

print('\033[1mFeatures Distribution'.center(100))

n=4
nf = [i for i in features if i not in cf]

plt.figure(figsize=[15,3*math.ceil(len(features)/n)])
for c in range(len(nf)):
    plt.subplot(math.ceil(len(features)/n),n,c+1)
    sns.distplot(df[nf[c]])  # deprecated in newer seaborn; sns.histplot(..., kde=True) is the replacement
plt.tight_layout()
plt.show()

plt.figure(figsize=[15,3*math.ceil(len(features)/n)])
for c in range(len(nf)):
    plt.subplot(math.ceil(len(features)/n),n,c+1)
    df.boxplot(nf[c])
plt.tight_layout()
plt.show()
                                     Features Distribution                                      
Out[ ]:
<Figure size 1080x432 with 0 Axes>
Out[ ]:
<AxesSubplot:>
Out[ ]:
<AxesSubplot:xlabel='Age', ylabel='Density'>
Out[ ]:
<AxesSubplot:>
Out[ ]:
<AxesSubplot:xlabel='Fare', ylabel='Density'>
Out[ ]:
<Figure size 1080x432 with 0 Axes>
Out[ ]:
<AxesSubplot:>
Out[ ]:
<AxesSubplot:>
Out[ ]:
<AxesSubplot:>
Out[ ]:
<AxesSubplot:>
In [ ]:
# Understanding the relationship between all the features

ppc=[i for i in df.columns if i not in cf]
g = sns.pairplot(df[ppc], hue=target, height=4)  # 'size' was renamed to 'height' in newer seaborn
#g.map_upper(sns.kdeplot, levels=1, color=".2")
plt.show()
In [ ]:
fig, axes = plt.subplots(1,2)
sns.boxplot(x=target, y="Age", data=df, ax=axes[0])
sns.boxplot(x=target, y="Fare", data=df, ax=axes[1])
Out[ ]:
<AxesSubplot:xlabel='Survived', ylabel='Age'>
Out[ ]:
<AxesSubplot:xlabel='Survived', ylabel='Fare'>
In [ ]:
fig, axes = plt.subplots(1,2)
sns.boxplot("Age", data=df, ax=axes[0])
sns.boxplot("Fare", data=df, ax=axes[1])
Out[ ]:
<AxesSubplot:xlabel='Age'>
Out[ ]:
<AxesSubplot:xlabel='Fare'>
In [ ]:
#sns.catplot(data=df_melt, x="measurement", y="attribute", orient="h", kind="box")
#sns.catplot(data=df_melt, y="measurement", x="attribute", orient="v", kind="box")

3. Data Preprocessing

In [ ]:
# Removal of duplicate rows (if any)
duplicate = df[df.duplicated(keep=False)]
print(duplicate)

r, c = df.shape

df1 = df.copy()
df1.drop_duplicates(inplace=True)
df1.reset_index(drop=True, inplace=True)

df1.shape
df1.head()

if df1.shape == (r, c):
    print('\n\033[1mInference:\033[0m The dataset doesn\'t have any duplicates')
else: 
    print(f'\n\033[1mInference:\033[0m Number of duplicates dropped ---> {r-df1.shape[0]}')
     Survived  Pclass     Sex   Age  SibSp  Parch     Fare Embarked
3           0       3    male  27.0      0      0   8.6625        S
10          0       3    male   NaN      0      0   7.8958        S
29          0       3    male   NaN      2      0  21.6792        C
36          1       3  female   NaN      0      0   8.0500        S
41          0       1    male   NaN      0      0  26.5500        S
47          0       3    male   NaN      0      0   7.7500        Q
70          1       3  female  24.0      0      0   7.7500        Q
76          0       3    male   NaN      0      0   8.0500        S
78          0       2    male  30.0      0      0  13.0000        S
79          1       3  female  24.0      0      0   7.7500        Q
83          0       3    male   NaN      0      0   7.8958        S
88          1       3  female   NaN      0      0   7.7500        Q
93          0       3    male   NaN      0      0   8.0500        S
102         0       3    male   NaN      0      0   7.7500        Q
103         0       3    male  26.0      0      0   7.7750        S
107         0       3    male   NaN      0      0   7.7500        Q
119         1       2  female  29.0      1      0  26.0000        S
124         0       3    male   NaN      0      0   7.7500        Q
137         0       2    male  26.0      0      0  13.0000        S
144         0       1    male  42.0      0      0  26.5500        S
148         0       1    male   NaN      0      0  26.5500        S
158         0       1    male  42.0      0      0  26.5500        S
170         0       3    male   NaN      0      0   7.5500        S
173         0       3    male   NaN      0      0   7.2292        C
180         0       2    male  30.0      0      0  13.0000        S
183         0       3    male   NaN      0      0   7.7500        Q
204         0       2    male  25.0      0      0  10.5000        S
219         0       3    male   NaN      0      0   8.0500        S
227         1       3  female   NaN      0      0   7.7500        Q
248         1       2  female  29.0      1      0  26.0000        S
255         0       3    male   NaN      0      0   7.5500        S
256         0       3    male   NaN      0      0   7.7500        Q
265         0       3    male   NaN      0      0   7.8958        S
267         0       3    male   NaN      0      0   7.5500        S
268         1       3  female   NaN      0      0   8.0500        S
271         0       3    male   NaN      0      0   7.7500        Q
274         0       3    male   NaN      0      0   7.2250        C
282         1       3  female   NaN      0      0   7.7500        Q
288         0       3    male   NaN      0      0   7.2292        C
289         0       3    male   NaN      0      0   8.0500        S
292         0       3    male   NaN      0      0   7.2292        C
297         0       3    male   NaN      2      0  21.6792        C
304         1       3  female   NaN      0      0   7.7500        Q
320         0       3    male  26.0      0      0   7.7750        S
322         0       2    male  26.0      0      0  13.0000        S
332         0       3    male   NaN      0      0   7.2250        C
339         0       3    male   NaN      0      0   7.2292        C
346         0       2    male  26.0      0      0  13.0000        S
349         1       2  female  31.0      0      0  21.0000        S
351         0       2    male  25.0      0      0  10.5000        S
358         0       3    male   NaN      0      0   7.7500        Q
362         1       2  female  31.0      0      0  21.0000        S
363         0       3    male  27.0      0      0   8.6625        S
380         0       3    male   NaN      0      0   7.7500        Q
410         1       3  female   NaN      0      0   7.7500        Q
413         0       3    male   NaN      0      0   8.0500        S
416         0       3    male   NaN      0      0   8.0500        S
Out[ ]:
(380, 8)
Out[ ]:
Survived Pclass Sex Age SibSp Parch Fare Embarked
0 0 3 male 34.5 0 0 7.8292 Q
1 1 3 female 47.0 1 0 7.0000 S
2 0 2 male 62.0 0 0 9.6875 Q
3 0 3 male 27.0 0 0 8.6625 S
4 1 3 female 22.0 1 1 12.2875 S
Inference: Number of duplicates dropped ---> 38
In [ ]:
# Check for empty elements
nvc = pd.DataFrame(df1.isnull().sum().sort_values(), columns=['Total Null Values'])
nvc['Percentage'] = round(nvc['Total Null Values']/df1.shape[0], 3) * 100
print(nvc)
          Total Null Values  Percentage
Survived                  0         0.0
Pclass                    0         0.0
Sex                       0         0.0
SibSp                     0         0.0
Parch                     0         0.0
Embarked                  0         0.0
Fare                      1         0.3
Age                      58        15.3
In [ ]:
# Converting categorical Columns to Numeric
ecc = nvc[nvc['Percentage']!=0].index.values
dcc = [i for i in df.columns if i not in ecc]
print(dcc)

df1.head()

# Target Variable
MAP={}
for i, e in enumerate(df1[target].unique()):
    MAP[e]=i
df1[target] = df1[target].map(MAP)
print('Mapping Target Variable --->', MAP)
df1.head()

df3 = df1[dcc].copy()  # copy so the encoding below does not modify a view of df1
fcc = [i for i in cf if i not in ecc]
print(fcc)
df3.head()

# One-Hot Binary Encoding
oh=True
dm=True
for i in fcc:
    #print(i)
    if df3[i].nunique()==2:
        if oh==True: print("\033[1m\nOne-Hot Encoding on features:\033[0m")
        print(i);oh=False
        df3[i]=pd.get_dummies(df3[i], drop_first=True, prefix=str(i))
    if (df3[i].nunique()>2 and df3[i].nunique()<17):
        if dm==True: print("\n\033[1mDummy Encoding on features:\033[0m")
        print(i);dm=False
        df3 = pd.concat([df3.drop([i], axis=1), pd.DataFrame(pd.get_dummies(df3[i], drop_first=True, prefix=str(i)))],axis=1)

df3.shape
df3.head()
['Survived', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']
Out[ ]:
Survived Pclass Sex Age SibSp Parch Fare Embarked
0 0 3 male 34.5 0 0 7.8292 Q
1 1 3 female 47.0 1 0 7.0000 S
2 0 2 male 62.0 0 0 9.6875 Q
3 0 3 male 27.0 0 0 8.6625 S
4 1 3 female 22.0 1 1 12.2875 S
Mapping Target Variable ---> {0: 0, 1: 1}
Out[ ]:
Survived Pclass Sex Age SibSp Parch Fare Embarked
0 0 3 male 34.5 0 0 7.8292 Q
1 1 3 female 47.0 1 0 7.0000 S
2 0 2 male 62.0 0 0 9.6875 Q
3 0 3 male 27.0 0 0 8.6625 S
4 1 3 female 22.0 1 1 12.2875 S
['Sex', 'Pclass', 'Embarked', 'SibSp', 'Parch']
Out[ ]:
Survived Pclass Sex SibSp Parch Embarked
0 0 3 male 0 0 Q
1 1 3 female 1 0 S
2 0 2 male 0 0 Q
3 0 3 male 0 0 S
4 1 3 female 1 1 S

One-Hot Encoding on features:
Sex

Dummy Encoding on features:
Pclass
Embarked
SibSp
Parch
Out[ ]:
(380, 19)
Out[ ]:
Survived Sex Pclass_2 Pclass_3 Embarked_Q Embarked_S SibSp_1 SibSp_2 SibSp_3 SibSp_4 SibSp_5 SibSp_8 Parch_1 Parch_2 Parch_3 Parch_4 Parch_5 Parch_6 Parch_9
0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1 1 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0
2 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
4 1 0 0 1 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0
In [ ]:
# Fixing Empty Categorical Columns

for x in [i for i in ecc if i in cf]:
    a = df1[x]
    b=[]; c=[]

    for i,e in enumerate(a):
        if e!=e:
            b.append(i)
        else:
            c.append(i)

    RF = RandomForestClassifier()
    RF.fit(df3.loc[c],a[c])
    d = RF.predict(df3.loc[b])

    df3[x] = a
    f=0
    for i,e in enumerate(df3[x]):
        if e!=e:
            df3.loc[i,x] = d[f]
            f+=1
    df3 = pd.concat([df3.drop([x], axis=1), pd.DataFrame(pd.get_dummies(df3[x], drop_first=True, prefix=str(x)))],axis=1)   
df3
Out[ ]:
Survived Sex Pclass_2 Pclass_3 Embarked_Q Embarked_S SibSp_1 SibSp_2 SibSp_3 SibSp_4 SibSp_5 SibSp_8 Parch_1 Parch_2 Parch_3 Parch_4 Parch_5 Parch_6 Parch_9
0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1 1 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0
2 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
4 1 0 0 1 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
375 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0
376 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
377 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
378 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
379 0 1 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0

380 rows × 19 columns

In [ ]:
# Fixing Empty Numerical Columns

for x in [i for i in ecc if i not in cf]:
    a = df1[x]
    b=[]; c=[]

    for i,e in enumerate(a):
        if e!=e:
            b.append(i)
        else:
            c.append(i)

    LR = LinearRegression()
    LR.fit(df3.loc[c],a[c])
    d = LR.predict(df3.loc[b])

    df3[x] = a
    f=0
    for i,e in enumerate(df3[x]):
        if e!=e:
            df3.loc[i,x] = d[f]
            f+=1
    #df3 = pd.concat([df3.drop([x], axis=1), pd.DataFrame(pd.get_dummies(df3[x], drop_first=True, prefix=str(x)))],axis=1)   
df3
Out[ ]:
LinearRegression()
Out[ ]:
LinearRegression()
Out[ ]:
Survived Sex Pclass_2 Pclass_3 Embarked_Q Embarked_S SibSp_1 SibSp_2 SibSp_3 SibSp_4 ... SibSp_8 Parch_1 Parch_2 Parch_3 Parch_4 Parch_5 Parch_6 Parch_9 Fare Age
0 0 1 0 1 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 7.8292 34.500000
1 1 0 0 1 0 1 1 0 0 0 ... 0 0 0 0 0 0 0 0 7.0000 47.000000
2 0 1 1 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 9.6875 62.000000
3 0 1 0 1 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 8.6625 27.000000
4 1 0 0 1 0 1 1 0 0 0 ... 0 1 0 0 0 0 0 0 12.2875 22.000000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
375 1 0 0 0 1 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 90.0000 37.000000
376 1 0 0 1 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 7.7750 28.000000
377 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 108.9000 39.000000
378 0 1 0 1 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 7.2500 38.500000
379 0 1 0 1 0 0 1 0 0 0 ... 0 1 0 0 0 0 0 0 22.3583 19.357866

380 rows × 21 columns

In [ ]:
# Removal of outliers:

df4 = df3.copy()

for i in [i for i in df4.columns]:
    if df4[i].nunique()>=12:
        Q1 = df4[i].quantile(0.20)  # note: uses the 20th/80th percentiles rather than the conventional 25th/75th
        Q3 = df4[i].quantile(0.80)
        IQR = Q3 - Q1
        df4 = df4[df4[i] <= (Q3+(1.5*IQR))]
        df4 = df4[df4[i] >= (Q1-(1.5*IQR))]
df4 = df4.reset_index(drop=True)
df4.head()
print('\n\033[1mInference:\033[0m Before removal of outliers, the dataset had {} samples.'.format(df.shape[0]))
print('\033[1mInference:\033[0m After removal of outliers, the dataset now has {} samples.'.format(df4.shape[0]))
Out[ ]:
Survived Sex Pclass_2 Pclass_3 Embarked_Q Embarked_S SibSp_1 SibSp_2 SibSp_3 SibSp_4 ... SibSp_8 Parch_1 Parch_2 Parch_3 Parch_4 Parch_5 Parch_6 Parch_9 Fare Age
0 0 1 0 1 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 7.8292 34.5
1 1 0 0 1 0 1 1 0 0 0 ... 0 0 0 0 0 0 0 0 7.0000 47.0
2 0 1 1 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 9.6875 62.0
3 0 1 0 1 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 8.6625 27.0
4 1 0 0 1 0 1 1 0 0 0 ... 0 1 0 0 0 0 0 0 12.2875 22.0

5 rows × 21 columns

Inference: Before removal of outliers, the dataset had 418 samples.
Inference: After removal of outliers, the dataset now has 350 samples.
In [ ]:
# Fixing the class imbalance using the SMOTE technique
df5 = df4.copy()
print('Original class distribution:')
print(df5[target].value_counts())

xf = df5.columns
X = df5.drop([target], axis=1)
Y = df5[target]

smote = SMOTE()
X, Y = smote.fit_resample(X, Y)

df5 = pd.DataFrame(X, columns=xf)
df5[target] = Y

print('\nClass distribution after applying SMOTE Technique:',)
print(Y.value_counts())
Original class distribution:
0    224
1    126
Name: Survived, dtype: int64

Class distribution after applying SMOTE Technique:
0    224
1    224
Name: Survived, dtype: int64
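
For intuition, SMOTE builds each synthetic minority-class row by interpolating between an existing minority sample and one of its k nearest minority neighbours. Below is a minimal sketch of that idea with NumPy; it is illustrative only and not imblearn's exact implementation.

import numpy as np

def smote_one_sample(X_minority, k=5, rng=None):
    # X_minority: 2-D array containing only the minority-class rows
    if rng is None:
        rng = np.random.default_rng(0)
    i = rng.integers(len(X_minority))                        # pick a random minority row
    d = np.linalg.norm(X_minority - X_minority[i], axis=1)   # distances to all minority rows
    j = rng.choice(np.argsort(d)[1:k + 1])                   # one of its k nearest neighbours (excluding itself)
    lam = rng.random()                                       # interpolation factor in [0, 1)
    return X_minority[i] + lam * (X_minority[j] - X_minority[i])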
In [ ]:
df = df5.copy()
plt.title('Final Dataset Samples')
plt.pie([df.shape[0], original_df.shape[0]-df4.shape[0], df5.shape[0]-df4.shape[0]], radius = 1, shadow=True,
        labels=['Retained','Dropped','Augmented'], counterclock=False, autopct='%1.1f%%', pctdistance=0.9, explode=[0,0.1,0.1])
plt.pie([df.shape[0]], labels=['100%'], labeldistance=-0, radius=0.78, shadow=True, colors=['powderblue'])
plt.show()

print('\n\033[1mInference:\033[0m The final dataset after cleanup has {} samples & {} columns.'.format(df.shape[0], df.shape[1]))
Out[ ]:
Text(0.5, 1.0, 'Final Dataset Samples')
Out[ ]:
([<matplotlib.patches.Wedge at 0x215406117c0>,
  <matplotlib.patches.Wedge at 0x2154061f0d0>,
  <matplotlib.patches.Wedge at 0x2154061fa60>],
 [Text(-0.7265137443323456, -0.8259405422281892, 'Retained'),
  Text(0.2618915957590218, 1.171073350422933, 'Dropped'),
  Text(1.052277065616864, 0.576812774803716, 'Augmented')],
 [Text(-0.5944203362719191, -0.6757695345503366, '73.0%'),
  Text(0.21824299646585155, 0.9758944586857773, '11.1%'),
  Text(0.87689755468072, 0.4806773123364299, '16.0%')])
Out[ ]:
([<matplotlib.patches.Wedge at 0x2154062c550>], [Text(0.0, 0.0, '100%')])
Inference: The final dataset after cleanup has 448 samples & 21 columns.

4. Data Manipulation

In [ ]:
# Splitting the data into training & testing sets

df = df5.copy()

X = df.drop([target],axis=1)
Y = df[target]
# X = df
# Y = X.pop(target)
Train_X, Test_X, Train_Y, Test_Y = train_test_split(X, Y, train_size=0.8, test_size=0.2, random_state=0)

print('Original set  ---> ',X.shape,Y.shape,'\nTraining set  ---> ',Train_X.shape,Train_Y.shape,'\nTesting set   ---> ', Test_X.shape,'', Test_Y.shape)
Original set  --->  (448, 20) (448,) 
Training set  --->  (358, 20) (358,) 
Testing set   --->  (90, 20)  (90,)
In [ ]:
# Feature Scaling (Standardization)

std = StandardScaler()

print('\033[1mStandardization on Training set'.center(100))
Train_X_std = std.fit_transform(Train_X)
Train_X_std = pd.DataFrame(Train_X_std, columns=X.columns)
display(Train_X_std.describe())

print('\n','\033[1mStandardization on Testing set'.center(100))
Test_X_std = std.transform(Test_X)
Test_X_std = pd.DataFrame(Test_X_std, columns=X.columns)
display(Test_X_std.describe())
                               Standardization on Training set                               
Sex Pclass_2 Pclass_3 Embarked_Q Embarked_S SibSp_1 SibSp_2 SibSp_3 SibSp_4 SibSp_5 SibSp_8 Parch_1 Parch_2 Parch_3 Parch_4 Parch_5 Parch_6 Parch_9 Fare Age
count 3.580000e+02 3.580000e+02 3.580000e+02 3.580000e+02 3.580000e+02 3.580000e+02 3.580000e+02 3.580000e+02 3.580000e+02 358.0 358.000000 3.580000e+02 3.580000e+02 358.0 3.580000e+02 3.580000e+02 3.580000e+02 3.580000e+02 3.580000e+02 3.580000e+02
mean -7.939025e-17 -1.141235e-16 -7.939025e-17 -3.473323e-17 -4.961891e-17 4.961891e-18 4.093560e-17 -2.480945e-17 -2.977134e-17 0.0 0.000000 5.458080e-17 -2.604993e-17 0.0 -9.923781e-18 -9.923781e-18 1.488567e-17 -1.984756e-17 6.946647e-17 2.183232e-16
std 1.001400e+00 1.001400e+00 1.001400e+00 1.001400e+00 1.001400e+00 1.001400e+00 1.001400e+00 1.001400e+00 1.001400e+00 0.0 1.001400 1.001400e+00 1.001400e+00 0.0 1.001400e+00 1.001400e+00 1.001400e+00 1.001400e+00 1.001400e+00 1.001400e+00
min -1.063462e+00 -5.234705e-01 -1.011236e+00 -3.395067e-01 -1.233340e+00 -5.450701e-01 -1.862313e-01 -7.495317e-02 -7.495317e-02 0.0 -0.052926 -3.496528e-01 -2.367429e-01 0.0 -5.292561e-02 -5.292561e-02 -5.292561e-02 -7.495317e-02 -1.124477e+00 -2.303518e+00
25% -1.063462e+00 -5.234705e-01 -1.011236e+00 -3.395067e-01 -1.233340e+00 -5.450701e-01 -1.862313e-01 -7.495317e-02 -7.495317e-02 0.0 -0.052926 -3.496528e-01 -2.367429e-01 0.0 -5.292561e-02 -5.292561e-02 -5.292561e-02 -7.495317e-02 -7.355564e-01 -6.381262e-01
50% 9.403247e-01 -5.234705e-01 9.888885e-01 -3.395067e-01 8.108066e-01 -5.450701e-01 -1.862313e-01 -7.495317e-02 -7.495317e-02 0.0 -0.052926 -3.496528e-01 -2.367429e-01 0.0 -5.292561e-02 -5.292561e-02 -5.292561e-02 -7.495317e-02 -4.406071e-01 -2.026184e-01
75% 9.403247e-01 -5.234705e-01 9.888885e-01 -3.395067e-01 8.108066e-01 -5.450701e-01 -1.862313e-01 -7.495317e-02 -7.495317e-02 0.0 -0.052926 -3.496528e-01 -2.367429e-01 0.0 -5.292561e-02 -5.292561e-02 -5.292561e-02 -7.495317e-02 1.807455e-01 6.587269e-01
max 9.403247e-01 1.910327e+00 9.888885e-01 2.945450e+00 8.108066e-01 1.834626e+00 5.369668e+00 1.334166e+01 1.334166e+01 0.0 18.894444 2.859980e+00 4.223992e+00 0.0 1.889444e+01 1.889444e+01 1.889444e+01 1.334166e+01 3.775379e+00 2.694634e+00
                                Standardization on Testing set                                
Sex Pclass_2 Pclass_3 Embarked_Q Embarked_S SibSp_1 SibSp_2 SibSp_3 SibSp_4 SibSp_5 SibSp_8 Parch_1 Parch_2 Parch_3 Parch_4 Parch_5 Parch_6 Parch_9 Fare Age
count 90.000000 90.000000 90.000000 90.000000 90.000000 90.000000 9.000000e+01 90.000000 90.000000 90.000000 90.000000 90.000000 90.000000 90.000000 9.000000e+01 9.000000e+01 9.000000e+01 90.000000 90.000000 90.000000
mean -0.306476 -0.144880 0.166615 -0.047511 -0.006852 0.115957 -1.862313e-01 0.074120 0.223194 0.011111 0.157601 0.042636 0.358022 0.011111 -5.292561e-02 -5.292561e-02 -5.292561e-02 -0.074953 0.063130 -0.134010
std 0.976942 0.887032 0.989645 0.940081 1.007033 1.071845 2.791107e-17 1.414236 1.988763 0.105409 1.997228 1.057180 1.524853 0.105409 6.977768e-18 6.977768e-18 1.395554e-17 0.000000 1.114787 1.049540
min -1.063462 -0.523470 -1.011236 -0.339507 -1.233340 -0.545070 -1.862313e-01 -0.074953 -0.074953 0.000000 -0.052926 -0.349653 -0.236743 0.000000 -5.292561e-02 -5.292561e-02 -5.292561e-02 -0.074953 -1.088190 -2.258101
25% -1.063462 -0.523470 -1.011236 -0.339507 -1.233340 -0.545070 -1.862313e-01 -0.074953 -0.074953 0.000000 -0.052926 -0.349653 -0.236743 0.000000 -5.292561e-02 -5.292561e-02 -5.292561e-02 -0.074953 -0.736112 -0.822046
50% -1.063462 -0.523470 0.988889 -0.339507 0.810807 -0.545070 -1.862313e-01 -0.074953 -0.074953 0.000000 -0.052926 -0.349653 -0.236743 0.000000 -5.292561e-02 -5.292561e-02 -5.292561e-02 -0.074953 -0.428871 -0.280922
75% 0.940325 -0.523470 0.988889 -0.339507 0.810807 1.834626 -1.862313e-01 -0.074953 -0.074953 0.000000 -0.052926 -0.349653 -0.236743 0.000000 -5.292561e-02 -5.292561e-02 -5.292561e-02 -0.074953 0.514160 0.502119
max 0.940325 1.910327 0.988889 2.945450 0.810807 1.834626 -1.862313e-01 13.341664 13.341664 1.000000 18.894444 2.859980 4.223992 1.000000 -5.292561e-02 -5.292561e-02 -5.292561e-02 -0.074953 3.749491 2.694634

5. Feature Selection/Extraction

In [ ]:
#Checking the correlation

features = df.columns
plt.figure(figsize=[24,20])
plt.title('Features Correlation-Plot')
sns.heatmap(df[features].corr(), vmin=-1, vmax=1, center=0, annot=True)
plt.show()
Out[ ]:
<Figure size 1728x1440 with 0 Axes>
Out[ ]:
Text(0.5, 1.0, 'Features Correlation-Plot')
Out[ ]:
<AxesSubplot:title={'center':'Features Correlation-Plot'}>

5a. Manual Method - VIF
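
As a refresher, the VIF of a feature equals 1 / (1 - R²), where R² comes from regressing that feature on all the other features; values well above 1 indicate multicollinearity. A minimal sketch of that definition follows; the vif_manual helper is hypothetical and may differ slightly from statsmodels' variance_inflation_factor, which expects an explicit constant column in the design matrix.

from sklearn.linear_model import LinearRegression

def vif_manual(X_df, col):
    # Regress one feature on the remaining features and convert its R^2 into a VIF.
    others = X_df.drop(columns=[col])
    r2 = LinearRegression().fit(others, X_df[col]).score(others, X_df[col])
    return 1.0 / (1.0 - r2)

# e.g. vif_manual(X, 'Fare'), where X is the feature frame defined earlier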

In [ ]:
# Calculate the VIFs to remove multicollinearity

DROP=[]; scores1=[]; scores2=[]; scores3=[]
#scores.append(f1_score(Test_Y,LogisticRegression().fit(Train_X_std, Train_Y).predict(Test_X_std)))
scores1.append(f1_score(Test_Y,LogisticRegression().fit(Train_X_std.drop(DROP,axis=1), Train_Y).predict(Test_X_std.drop(DROP,axis=1)),average='weighted')*100)
scores2.append(f1_score(Test_Y,RandomForestClassifier().fit(Train_X_std.drop(DROP,axis=1), Train_Y).predict(Test_X_std.drop(DROP,axis=1)),average='weighted')*100)
scores3.append(f1_score(Test_Y,XGBClassifier().fit(Train_X_std.drop(DROP,axis=1), Train_Y, eval_metric='logloss').predict(Test_X_std.drop(DROP,axis=1)),average='weighted')*100)
        
for i in range(len(X.columns.values)-1):
    vif = pd.DataFrame()
    Xs = X.drop(DROP,axis=1)
    #print(DROP)
    vif['Features'] = Xs.columns
    vif['VIF'] = [variance_inflation_factor(Xs.values, i) for i in range(Xs.shape[1])]
    vif['VIF'] = round(vif['VIF'], 2)
    vif = vif.sort_values(by = "VIF", ascending = False)
    vif.reset_index(drop=True, inplace=True)
    DROP.append(vif.Features[0])
    if vif.VIF[0]>1:
        scores1.append(f1_score(Test_Y,LogisticRegression().fit(Train_X_std.drop(DROP,axis=1), Train_Y).predict(Test_X_std.drop(DROP,axis=1)),average='weighted')*100)
        scores2.append(f1_score(Test_Y,RandomForestClassifier().fit(Train_X_std.drop(DROP,axis=1), Train_Y).predict(Test_X_std.drop(DROP,axis=1)),average='weighted')*100)
        scores3.append(f1_score(Test_Y,XGBClassifier().fit(Train_X_std.drop(DROP,axis=1), Train_Y, eval_metric='logloss').predict(Test_X_std.drop(DROP,axis=1)),average='weighted')*100)
    #print(scores)
    
plt.plot(scores1, label='LR')
plt.plot(scores2, label='RF')
plt.plot(scores3, label='XG')
#plt.ylim([0.7,0.85])
plt.legend()
plt.grid()
plt.show()
Out[ ]:
[<matplotlib.lines.Line2D at 0x21541347df0>]
Out[ ]:
[<matplotlib.lines.Line2D at 0x21541363220>]
Out[ ]:
[<matplotlib.lines.Line2D at 0x215413634f0>]
Out[ ]:
<matplotlib.legend.Legend at 0x215412c1400>

5b. Automatic Method - RFE

In [ ]:
# Applying Recursive Feature Elimination

# Running RFE with the output number of the variable equal to 10
LR = LogisticRegression()#.fit(Train_X_std, Train_Y)
scores1=[]; scores2=[]; scores3=[]
# scores1.append(f1_score(Test_Y,LogisticRegression().fit(Train_X_std, Train_Y).predict(Test_X_std),average='weighted')*100)
# scores2.append(f1_score(Test_Y,RandomForestClassifier().fit(Train_X_std, Train_Y).predict(Test_X_std),average='weighted')*100)
# scores3.append(f1_score(Test_Y,XGBClassifier().fit(Train_X_std, Train_Y, eval_metric='logloss').predict(Test_X_std),average='weighted')*100)

for i in range(len(X.columns.values)):
    rfe = RFE(LR,n_features_to_select=len(Train_X_std.columns)-i)   
    rfe = rfe.fit(Train_X_std, Train_Y)
    scores1.append(f1_score(Test_Y,LogisticRegression().fit(Train_X_std[Train_X_std.columns[rfe.support_]], Train_Y).predict(Test_X_std[Train_X_std.columns[rfe.support_]]),average='weighted')*100)
    scores2.append(f1_score(Test_Y,RandomForestClassifier().fit(Train_X_std[Train_X_std.columns[rfe.support_]], Train_Y).predict(Test_X_std[Train_X_std.columns[rfe.support_]]),average='weighted')*100)
    scores3.append(f1_score(Test_Y,XGBClassifier().fit(Train_X_std[Train_X_std.columns[rfe.support_]], Train_Y, eval_metric='logloss').predict(Test_X_std[Train_X_std.columns[rfe.support_]]),average='weighted')*100)
    
plt.plot(scores1, label='LR')
plt.plot(scores2, label='RF')
plt.plot(scores3, label='XG')
#plt.ylim([0.80,0.84])
plt.legend()
plt.grid()
plt.show()
Out[ ]:
[<matplotlib.lines.Line2D at 0x215406ba370>]
Out[ ]:
[<matplotlib.lines.Line2D at 0x215406ba640>]
Out[ ]:
[<matplotlib.lines.Line2D at 0x215406ba910>]
Out[ ]:
<matplotlib.legend.Legend at 0x215405f9550>

5c. PCA

In [ ]:
pca = PCA().fit(Train_X_std)

fig, ax = plt.subplots(figsize=(14,6))
x_values = range(1, pca.n_components_+1)
ax.bar(x_values, pca.explained_variance_ratio_, lw=2, label='Explained Variance')
ax.plot(x_values, np.cumsum(pca.explained_variance_ratio_), lw=2, label='Cumulative Explained Variance', color='red')
plt.plot([0,pca.n_components_+1],[0.90,0.90],'g--')
plt.plot([2,2],[0,1], 'g--')
ax.set_title('Explained variance of components')
ax.set_xlabel('Principal Component')
ax.set_ylabel('Explained Variance')
plt.grid()
plt.legend()
plt.show()
Out[ ]:
<BarContainer object of 20 artists>
Out[ ]:
[<matplotlib.lines.Line2D at 0x21540705430>]
Out[ ]:
[<matplotlib.lines.Line2D at 0x21540705850>]
Out[ ]:
[<matplotlib.lines.Line2D at 0x21540705b20>]
Out[ ]:
Text(0.5, 1.0, 'Explained variance of components')
Out[ ]:
Text(0.5, 0, 'Principal Component')
Out[ ]:
Text(0, 0.5, 'Explained Variance')
Out[ ]:
<matplotlib.legend.Legend at 0x21540841790>
In [ ]:
#Applying PCA Transformations

# scores1.append(f1_score(Test_Y,LogisticRegression().fit(Train_X_std, Train_Y).predict(Test_X_std),average='weighted')*100)
# scores2.append(f1_score(Test_Y,RandomForestClassifier().fit(Train_X_std, Train_Y).predict(Test_X_std),average='weighted')*100)
# scores3.append(f1_score(Test_Y,XGBClassifier().fit(Train_X_std, Train_Y, eval_metric='logloss').predict(Test_X_std),average='weighted')*100)

scores1=[]; scores2=[]; scores3=[]
for i in range(len(X.columns.values)):
    pca = PCA(n_components=Train_X_std.shape[1]-i)
    Train_X_std_pca = pca.fit_transform(Train_X_std)
    #print('The shape of final transformed training feature set:')
    #print(Train_X_std_pca.shape)
    Train_X_std_pca = pd.DataFrame(Train_X_std_pca)

    Test_X_std_pca = pca.transform(Test_X_std)
    #print('\nThe shape of final transformed testing feature set:')
    #print(Test_X_std_pca.shape)
    Test_X_std_pca = pd.DataFrame(Test_X_std_pca)
    
    scores1.append(f1_score(Test_Y,LogisticRegression().fit(Train_X_std_pca, Train_Y).predict(Test_X_std_pca),average='weighted')*100)
    scores2.append(f1_score(Test_Y,RandomForestClassifier().fit(Train_X_std_pca, Train_Y).predict(Test_X_std_pca),average='weighted')*100)
    scores3.append(f1_score(Test_Y,XGBClassifier().fit(Train_X_std_pca, Train_Y, eval_metric='logloss').predict(Test_X_std_pca),average='weighted')*100)

    
plt.plot(scores1, label='LR')
plt.plot(scores2, label='RF')
plt.plot(scores3, label='XG')
#plt.ylim([0.80,0.84])
plt.legend()
plt.grid()
plt.show()
Out[ ]:
[<matplotlib.lines.Line2D at 0x215407736d0>]
Out[ ]:
[<matplotlib.lines.Line2D at 0x215407739a0>]
Out[ ]:
[<matplotlib.lines.Line2D at 0x21540773cd0>]
Out[ ]:
<matplotlib.legend.Legend at 0x2154073c760>
In [ ]:
#Finalising the shortlisted features

rfe = RFE(LR,n_features_to_select=len(Train_X_std.columns)-0)   
rfe = rfe.fit(Train_X_std, Train_Y)

print(f1_score(Test_Y,LogisticRegression().fit(Train_X_std[Train_X_std.columns[rfe.support_]], Train_Y).predict(Test_X_std[Train_X_std.columns[rfe.support_]]),average='weighted')*100)
print(f1_score(Test_Y,RandomForestClassifier().fit(Train_X_std[Train_X_std.columns[rfe.support_]], Train_Y).predict(Test_X_std[Train_X_std.columns[rfe.support_]]),average='weighted')*100)
print(f1_score(Test_Y,XGBClassifier().fit(Train_X_std[Train_X_std.columns[rfe.support_]], Train_Y, eval_metric='logloss').predict(Test_X_std[Train_X_std.columns[rfe.support_]]),average='weighted')*100)
    
Train_X_std = Train_X_std[Train_X_std.columns[rfe.support_]]
Test_X_std = Test_X_std[Test_X_std.columns[rfe.support_]]

print(Train_X_std.shape)
print(Test_X_std.shape)
100.0
100.0
100.0
(358, 20)
(90, 20)
In [ ]:
print(rfe.support_)
X.columns[rfe.support_]
[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True]
Out[ ]:
Index(['Sex', 'Pclass_2', 'Pclass_3', 'Embarked_Q', 'Embarked_S', 'SibSp_1',
       'SibSp_2', 'SibSp_3', 'SibSp_4', 'SibSp_5', 'SibSp_8', 'Parch_1',
       'Parch_2', 'Parch_3', 'Parch_4', 'Parch_5', 'Parch_6', 'Parch_9',
       'Fare', 'Age'],
      dtype='object')

6. Predictive Modeling

In [ ]:
# Let us first create a table to store the results of the various models

Evaluation_Results = pd.DataFrame(np.zeros((9,5)), columns=['Accuracy', 'Precision','Recall','F1-score','AUC-ROC score'])
Evaluation_Results.index=['Logistic Regression (LR)','Decision Tree Classifier (DT)','Random Forest Classifier (RF)','Naïve Bayes Classifier (NB)',
                         'Support Vector Machine (SVM)','K Nearest Neighbours (KNN)','Extreme Gradient Boosting (XGB)', 'Deep Learning (DL)', 'DUMMY (DM)']
Evaluation_Results
Out[ ]:
Accuracy Precision Recall F1-score AUC-ROC score
Logistic Regression (LR) 0.0 0.0 0.0 0.0 0.0
Decision Tree Classifier (DT) 0.0 0.0 0.0 0.0 0.0
Random Forest Classifier (RF) 0.0 0.0 0.0 0.0 0.0
Naïve Bayes Classifier (NB) 0.0 0.0 0.0 0.0 0.0
Support Vector Machine (SVM) 0.0 0.0 0.0 0.0 0.0
K Nearest Neighbours (KNN) 0.0 0.0 0.0 0.0 0.0
Extreme Gradient Boosting (XGB) 0.0 0.0 0.0 0.0 0.0
Deep Learning (DL) 0.0 0.0 0.0 0.0 0.0
DUMMY (DM) 0.0 0.0 0.0 0.0 0.0
In [ ]:
# Let us define functions to summarise the prediction scores.

#Classification Summary Function
def Classification_Summary(pred,pred_prob,i):
    # use .loc so the assignment writes into Evaluation_Results (chained .iloc[i][col] assignment can hit a copy)
    Evaluation_Results.loc[Evaluation_Results.index[i], 'Accuracy'] = round(accuracy_score(Test_Y, pred),3)*100
    Evaluation_Results.loc[Evaluation_Results.index[i], 'Precision'] = round(precision_score(Test_Y, pred, average='weighted'),3)*100
    Evaluation_Results.loc[Evaluation_Results.index[i], 'Recall'] = round(recall_score(Test_Y, pred, average='weighted'),3)*100
    Evaluation_Results.loc[Evaluation_Results.index[i], 'F1-score'] = round(f1_score(Test_Y, pred, average='weighted'),3)*100
    Evaluation_Results.loc[Evaluation_Results.index[i], 'AUC-ROC score'] = round(roc_auc_score(Test_Y, pred_prob[:,1]),3)*100
    print('{}{}\033[1m Evaluating {} \033[0m{}{}\n'.format('<'*3,'-'*35,Evaluation_Results.index[i], '-'*35,'>'*3))
    print('Accuracy = {}%'.format(round(accuracy_score(Test_Y, pred),3)*100))
    print('F1 Score = {}%'.format(round(f1_score(Test_Y, pred, average='weighted'),3)*100)) #
    print('\n \033[1mConfusion Matrix:\033[0m\n',confusion_matrix(Test_Y, pred))
    print('\n\033[1mClassification Report:\033[0m\n',classification_report(Test_Y, pred))
    
    auc_roc(Test_Y, pred_prob, curves=['each_class'])
    plt.show()

#Visualising Function
def AUC_ROC_plot(Test_Y, pred):    
    ref = [0 for _ in range(len(Test_Y))]
    ref_auc = roc_auc_score(Test_Y, ref)
    lr_auc = roc_auc_score(Test_Y, pred)

    ns_fpr, ns_tpr, _ = roc_curve(Test_Y, ref)
    lr_fpr, lr_tpr, _ = roc_curve(Test_Y, pred)

    plt.plot(ns_fpr, ns_tpr, linestyle='--')
    plt.plot(lr_fpr, lr_tpr, marker='.', label='AUC = {}'.format(round(roc_auc_score(Test_Y, pred)*100,2))) 
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.show()

1. Logistic Regression:

In [ ]:
# Building Logistic Regression Classifier

LR_model = LogisticRegression()

space = dict()
space['solver'] = ['newton-cg', 'lbfgs', 'liblinear']
space['penalty'] = ['l2'] #'none', 'l1', 'l2', 'elasticnet'
space['C'] = loguniform(1e-5, 100)

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

RCV = RandomizedSearchCV(LR_model, space, n_iter=50, scoring='accuracy', n_jobs=-1, cv=cv, random_state=1)

LR = RCV.fit(Train_X_std, Train_Y).best_estimator_
pred = LR.predict(Test_X_std)
pred_prob = LR.predict_proba(Test_X_std)
Classification_Summary(pred,pred_prob,0)

print('\n\033[1mInterpreting the Output of Logistic Regression:\n\033[0m')

print('intercept ', LR.intercept_[0])
print('classes', LR.classes_)
display(pd.DataFrame({'coeff': LR.coef_[0]}, index=Train_X_std.columns))
<<<----------------------------------- Evaluating Logistic Regression (LR) ----------------------------------->>>

Accuracy = 100.0%
F1 Score = 100.0%

 Confusion Matrix:
 [[34  0]
 [ 0 56]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        34
           1       1.00      1.00      1.00        56

    accuracy                           1.00        90
   macro avg       1.00      1.00      1.00        90
weighted avg       1.00      1.00      1.00        90

Interpreting the Output of Logistic Regression:

intercept  -0.14426344485989095
classes [0 1]
coeff
Sex -0.849003
Pclass_2 -0.095318
Pclass_3 -0.018685
Embarked_Q 0.048915
Embarked_S -0.101063
SibSp_1 0.021590
SibSp_2 0.010019
SibSp_3 -0.040841
SibSp_4 -0.049564
SibSp_5 0.000000
SibSp_8 -0.033787
Parch_1 0.047975
Parch_2 0.055455
Parch_3 0.000000
Parch_4 0.034152
Parch_5 -0.027952
Parch_6 -0.028518
Parch_9 0.000933
Fare 0.050779
Age -0.031927

2. Decision Tree Classifier:

In [ ]:
# Building Decision Tree Classifier

DT_model = DecisionTreeClassifier()

n=20
param_dist = {"max_depth": [3, None],
              "max_features": randint(1, n),
              "min_samples_leaf": randint(1, 9),
              "criterion": ["gini", "entropy"]}

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

RCV = RandomizedSearchCV(DT_model, param_dist, n_iter=50, scoring='accuracy', n_jobs=-1, cv=cv, random_state=1)

DT = RCV.fit(Train_X_std, Train_Y).best_estimator_
pred = DT.predict(Test_X_std)
pred_prob = DT.predict_proba(Test_X_std)
Classification_Summary(pred,pred_prob,1)

print('\n\033[1mInterpreting the output of Decision Tree:\n\033[0m')
tree.plot_tree(DT)
plt.show()
<<<----------------------------------- Evaluating Decision Tree Classifier (DT) ----------------------------------->>>

Accuracy = 100.0%
F1 Score = 100.0%

 Confusion Matrix:
 [[34  0]
 [ 0 56]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        34
           1       1.00      1.00      1.00        56

    accuracy                           1.00        90
   macro avg       1.00      1.00      1.00        90
weighted avg       1.00      1.00      1.00        90

Interpreting the output of Decision Tree:

Out[ ]:
[Text(0.5, 0.75, 'X[0] <= -0.062\nentropy = 0.997\nsamples = 358\nvalue = [190, 168]'),
 Text(0.25, 0.25, 'entropy = 0.0\nsamples = 168\nvalue = [0, 168]'),
 Text(0.75, 0.25, 'entropy = 0.0\nsamples = 190\nvalue = [190, 0]')]

3. Random Forest Classifier:

In [ ]:
RF_model = RandomForestClassifier()

param_dist={'bootstrap': [True, False],
            'max_depth': [10, 20, 50, 100, None],
            'max_features': ['auto', 'sqrt'],
            'min_samples_leaf': [1, 2, 4],
            'min_samples_split': [2, 5, 10],
            'n_estimators': [50, 100]}

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

RCV = RandomizedSearchCV(RF_model, param_dist, n_iter=50, scoring='accuracy', n_jobs=-1, cv=cv, random_state=1)

RF = RCV.fit(Train_X_std, Train_Y).best_estimator_
pred = RF.predict(Test_X_std)
pred_prob = RF.predict_proba(Test_X_std)
Classification_Summary(pred,pred_prob,2)

print('\n\033[1mInterpreting the output of Random Forest:\n\033[0m')
rfi=pd.Series(RF.feature_importances_, index=Train_X_std.columns).sort_values(ascending=False)
plt.barh(rfi.index,rfi.values)
plt.show()
<<<----------------------------------- Evaluating Random Forest Classifier (RF) ----------------------------------->>>

Accuracy = 100.0%
F1 Score = 100.0%

 Confusion Matrix:
 [[34  0]
 [ 0 56]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        34
           1       1.00      1.00      1.00        56

    accuracy                           1.00        90
   macro avg       1.00      1.00      1.00        90
weighted avg       1.00      1.00      1.00        90

Interpreting the output of Random Forest:

Out[ ]:
<BarContainer object of 20 artists>

4. Naive Bayes Classifier:

In [ ]:
# Building Naive Bayes Classifier

NB_model = BernoulliNB()

params = {'alpha': [0.01, 0.1, 0.5, 1.0, 10.0]}
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

RCV = RandomizedSearchCV(NB_model, params, n_iter=50, scoring='accuracy', n_jobs=-1, cv=cv, random_state=1)

NB = RCV.fit(Train_X_std, Train_Y).best_estimator_
pred = NB.predict(Test_X_std)
pred_prob = NB.predict_proba(Test_X_std)
Classification_Summary(pred,pred_prob,3)
<<<----------------------------------- Evaluating Naïve Bayes Classifier (NB) ----------------------------------->>>

Accuracy = 100.0%
F1 Score = 100.0%

 Confusion Matrix:
 [[34  0]
 [ 0 56]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        34
           1       1.00      1.00      1.00        56

    accuracy                           1.00        90
   macro avg       1.00      1.00      1.00        90
weighted avg       1.00      1.00      1.00        90

5. Support Vector Machine Classifier:

In [ ]:
# Building Support Vector Machine Classifier

SVM_model = SVC(probability=True)  # no need to pre-fit here; RandomizedSearchCV below handles the fitting

svm_param = {'C': [0.1, 1, 10, 100, 1000], 
             'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
             "kernel": ["rbf"],
             "random_state": [1]}

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

RCV = RandomizedSearchCV(SVM_model, svm_param, n_iter=50, scoring='accuracy', n_jobs=-1, cv=cv, random_state=1)

SVM = RCV.fit(Train_X_std, Train_Y).best_estimator_
pred = SVM.predict(Test_X_std)
pred_prob = SVM.predict_proba(Test_X_std)
Classification_Summary(pred,pred_prob,4)
<<<----------------------------------- Evaluating Support Vector Machine (SVM) ----------------------------------->>>

Accuracy = 100.0%
F1 Score = 100.0%

 Confusion Matrix:
 [[34  0]
 [ 0 56]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        34
           1       1.00      1.00      1.00        56

    accuracy                           1.00        90
   macro avg       1.00      1.00      1.00        90
weighted avg       1.00      1.00      1.00        90

6. K-Nearest Neighbours Classifier:

In [ ]:
# Building K-Nearest Neighbours Classifier

KNN_model = KNeighborsClassifier()

knn_param = {"n_neighbors": [i for i in range(1,30,5)],
             "weights": ["uniform", "distance"],
             "algorithm": ["ball_tree", "kd_tree", "brute"],
             "leaf_size": [1, 10, 30],
             "p": [1,2]}

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

RCV = RandomizedSearchCV(KNN_model, knn_param, n_iter=50, scoring='accuracy', n_jobs=-1, cv=cv, random_state=1)

KNN = RCV.fit(Train_X_std, Train_Y).best_estimator_
pred = KNN.predict(Test_X_std)
pred_prob = KNN.predict_proba(Test_X_std)
Classification_Summary(pred,pred_prob,5)
<<<----------------------------------- Evaluating K Nearest Neighbours (KNN) ----------------------------------->>>

Accuracy = 100.0%
F1 Score = 100.0%

 Confusion Matrix:
 [[34  0]
 [ 0 56]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        34
           1       1.00      1.00      1.00        56

    accuracy                           1.00        90
   macro avg       1.00      1.00      1.00        90
weighted avg       1.00      1.00      1.00        90

7. Extreme Gradient Boosting Classifier:

In [ ]:
# Building Extreme Gradient Boosting Classifier

XGB_model = XGBClassifier(eval_metric='mlogloss')

param_dist = {
 "learning_rate" : [0.05,0.10,0.15,0.20,0.25,0.30],
 "max_depth" : [ 3, 4, 5, 6, 8, 10, 12, 15],
 "min_child_weight" : [ 1, 3, 5, 7 ],
 "gamma": [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
 "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ]
}

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

RCV = RandomizedSearchCV(XGB_model, param_dist, n_iter=50, scoring='accuracy', n_jobs=-1, cv=cv, random_state=1)

XGB = RCV.fit(Train_X_std, Train_Y).best_estimator_
pred = XGB.predict(Test_X_std)
pred_prob = XGB.predict_proba(Test_X_std)
Classification_Summary(pred,pred_prob,6)

plt.bar( Train_X_std.columns,XGB.feature_importances_,)
plt.show()
<<<----------------------------------- Evaluating Extreme Gradient Boosting (XGB) ----------------------------------->>>

Accuracy = 100.0%
F1 Score = 100.0%

 Confusion Matrix:
 [[34  0]
 [ 0 56]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        34
           1       1.00      1.00      1.00        56

    accuracy                           1.00        90
   macro avg       1.00      1.00      1.00        90
weighted avg       1.00      1.00      1.00        90

Out[ ]:
<BarContainer object of 20 artists>

8. Deep Learning:

In [ ]:
# Building Deep Learning Classifier
# https://stackoverflow.com/questions/63375201/tensorflow-valueerror-logits-and-labels-must-have-the-same-shape-none-2-vs

def set_seed(seed=4):
    np.random.seed(seed)
    tf.random.set_seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    os.environ['TF_DETERMINISTIC_OPS'] = '1'
set_seed()

DL = Sequential()

DL.add(Dense(16,input_shape=(20,),activation='relu'))
DL.add(Dense(8,activation='relu'))
DL.add(Dense(4,activation='relu'))
DL.add(Dense(2,activation='sigmoid'))  # two one-hot outputs with sigmoid; softmax + categorical_crossentropy would be the more conventional pairing
DL.compile('Adam','binary_crossentropy',metrics=['accuracy'])

DL.summary()
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 dense (Dense)               (None, 16)                336       
                                                                 
 dense_1 (Dense)             (None, 8)                 136       
                                                                 
 dense_2 (Dense)             (None, 4)                 36        
                                                                 
 dense_3 (Dense)             (None, 2)                 10        
                                                                 
=================================================================
Total params: 518
Trainable params: 518
Non-trainable params: 0
_________________________________________________________________
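
The two-unit sigmoid output paired with one-hot labels works; a common alternative, shown only as a sketch and not used in the results below, is a single sigmoid unit trained directly on the integer 0/1 labels, which avoids the one-hot step:

In [ ]:
# Alternative output head (sketch): one sigmoid unit + integer 0/1 labels.
# DL_alt is a hypothetical name; the results below use the DL model defined above.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

DL_alt = Sequential()
DL_alt.add(Dense(16, input_shape=(20,), activation='relu'))
DL_alt.add(Dense(8, activation='relu'))
DL_alt.add(Dense(1, activation='sigmoid'))
DL_alt.compile(optimizer='Adam', loss='binary_crossentropy', metrics=['accuracy'])
# DL_alt.fit(Train_X_std.to_numpy(), Train_Y.to_numpy(), epochs=100)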
In [ ]:
# One-hot encode the target to match the two-unit output layer
encoder = LabelEncoder()
y = pd.get_dummies(encoder.fit_transform(Train_Y)).values

DL.fit(Train_X_std.to_numpy(), y, epochs=100)
pred_prob = DL.predict(Test_X_std.to_numpy())

# Collapse the two class probabilities back to a single 0/1 prediction
pred = np.argmax(pred_prob, axis=1)
Classification_Summary(pred,pred_prob,7)
Epoch 1/100
12/12 [==============================] - 2s 9ms/step - loss: 0.6977 - accuracy: 0.4721
Epoch 2/100
12/12 [==============================] - 0s 2ms/step - loss: 0.6757 - accuracy: 0.4777
Epoch 3/100
12/12 [==============================] - 0s 3ms/step - loss: 0.6539 - accuracy: 0.5000
... (epochs 4-99 abridged; the loss falls steadily and training accuracy reaches 100% from epoch 49 onward) ...
Epoch 100/100
12/12 [==============================] - 0s 2ms/step - loss: 0.1523 - accuracy: 1.0000
Out[ ]:
<keras.callbacks.History at 0x21540f93af0>
<<<----------------------------------- Evaluating Deep Learning (DL) ----------------------------------->>>

Accuracy = 100.0%
F1 Score = 100.0%

 Confusion Matrix:
 [[34  0]
 [ 0 56]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        34
           1       1.00      1.00      1.00        56

    accuracy                           1.00        90
   macro avg       1.00      1.00      1.00        90
weighted avg       1.00      1.00      1.00        90

In [ ]:
#Plotting Confusion-Matrix of all the predictive Models

def plot_cm(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred, labels=np.unique(y_true))
    cm_sum = np.sum(cm, axis=1, keepdims=True)
    cm_perc = cm / cm_sum.astype(float) * 100
    annot = np.empty_like(cm).astype(str)
    nrows, ncols = cm.shape
    for i in range(nrows):
        for j in range(ncols):
            c = cm[i, j]
            p = cm_perc[i, j]
            if i == j:
                s = cm_sum[i]
                annot[i, j] = '%.1f%%\n%d/%d' % (p, c, s)
            elif c == 0:
                annot[i, j] = ''
            else:
                annot[i, j] = '%.1f%%\n%d' % (p, c)
    cm = pd.DataFrame(cm, index=np.unique(y_true), columns=np.unique(y_true))
    cm.columns=labels
    cm.index=labels
    cm.index.name = 'Actual'
    cm.columns.name = 'Predicted'
    #fig, ax = plt.subplots()
    sns.heatmap(cm, annot=annot, fmt='')# cmap= "GnBu"
    
def conf_mat_plot(all_models):
    plt.figure(figsize=[20, 3.5*math.ceil(len(all_models)*len(labels)/14)])

    for i in range(len(all_models)):
        if len(labels) <= 4:
            plt.subplot(2, 4, i+1)
        else:
            plt.subplot(math.ceil(len(all_models)/3), 3, i+1)
        # The Keras model needs a NumPy array and an argmax over its class probabilities
        if all_models[i] is DL:
            pred_prob = all_models[i].predict(Test_X_std.to_numpy())
            pred = np.argmax(pred_prob, axis=1)
        else:
            pred = all_models[i].predict(Test_X_std)
        #plot_cm(Test_Y, pred)
        sns.heatmap(confusion_matrix(Test_Y, pred), annot=True, cmap='Blues', fmt='.0f') #vmin=0,vmax=5
        plt.title(Evaluation_Results.index[i])
    plt.tight_layout()
    plt.show()

conf_mat_plot([LR,DT,RF,NB,SVM,KNN,XGB,DL])
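
A companion view (a sketch, assuming the fitted models and the Test_X_std / Test_Y split from above) is to overlay ROC curves for a few of the tuned models on the test split:

In [ ]:
# ROC overlay for a subset of the tuned models (sketch)
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

plt.figure(figsize=[7, 6])
for name, model in [('LR', LR), ('RF', RF), ('KNN', KNN), ('XGB', XGB)]:
    prob = model.predict_proba(Test_X_std)[:, 1]
    fpr, tpr, _ = roc_curve(Test_Y, prob)
    plt.plot(fpr, tpr, label='{} (AUC = {:.2f})'.format(name, auc(fpr, tpr)))
plt.plot([0, 1], [0, 1], 'k--', label='Chance')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()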
In [ ]:
from sklearn.dummy import DummyClassifier

# Majority-class baseline; the randomized search is kept only for symmetry with
# the other models, since a DummyClassifier has nothing to tune
dummy_model = DummyClassifier(strategy='most_frequent')

dummy_param = {}

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

RCV = RandomizedSearchCV(dummy_model, dummy_param, n_iter=50, scoring='accuracy', n_jobs=-1, cv=cv, random_state=1)

DUMMY = RCV.fit(Train_X_std, Train_Y).best_estimator_
pred = DUMMY.predict(Test_X_std)
pred_prob = DUMMY.predict_proba(Test_X_std)
Classification_Summary(pred,pred_prob,8)
<<<----------------------------------- Evaluating DUMMY (DM) ----------------------------------->>>

Accuracy = 37.8%
F1 Score = 20.7%

 Confusion Matrix:
 [[34  0]
 [56  0]]

Classification Report:
               precision    recall  f1-score   support

           0       0.38      1.00      0.55        34
           1       0.00      0.00      0.00        56

    accuracy                           0.38        90
   macro avg       0.19      0.50      0.27        90
weighted avg       0.14      0.38      0.21        90
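
Since the majority-class baseline has no hyperparameters, the randomized search above is not strictly needed; a minimal direct fit (a sketch using the same splits) gives the same baseline:

In [ ]:
# Equivalent baseline without the search machinery (sketch)
from sklearn.dummy import DummyClassifier

baseline = DummyClassifier(strategy='most_frequent').fit(Train_X_std, Train_Y)
print('Baseline test accuracy: {:.1f}%'.format(baseline.score(Test_X_std, Test_Y) * 100))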

In [ ]:
# Class balance of the training and test splits
display(Train_Y.value_counts())
display(Test_Y.value_counts())
Out[ ]:
0    190
1    168
Name: Survived, dtype: int64
Out[ ]:
1    56
0    34
Name: Survived, dtype: int64
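
Class 0 is the majority class in the training split (190 of 358), so the most-frequent baseline predicts 0 for every test passenger, and its 37.8% accuracy is simply the share of class 0 in the test split:

In [ ]:
# The baseline accuracy equals the test-set share of the training majority class
print('{:.1f}%'.format(34 / 90 * 100))   # 34 class-0 passengers out of 90 -> 37.8%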
In [ ]:
# Comparing all the model scores

print('\033[1mML Algorithms Comparison\033[0m'.center(130))
plt.figure(figsize=[12,8])
sns.heatmap(Evaluation_Results, annot=True, vmin=50, vmax=100, cmap='Blues', fmt='.1f')
plt.show()
                                                   ML Algorithms Comparison                                                   
Out[ ]:
<Figure size 864x576 with 0 Axes>
Out[ ]:
<AxesSubplot:>
In [ ]:
Evaluation_Results
Out[ ]:
                                  Accuracy  Precision  Recall  F1-score  AUC-ROC score
Logistic Regression (LR)             100.0      100.0   100.0     100.0          100.0
Decision Tree Classifier (DT)        100.0      100.0   100.0     100.0          100.0
Random Forest Classifier (RF)        100.0      100.0   100.0     100.0          100.0
Naïve Bayes Classifier (NB)          100.0      100.0   100.0     100.0          100.0
Support Vector Machine (SVM)         100.0      100.0   100.0     100.0          100.0
K Nearest Neighbours (KNN)           100.0      100.0   100.0     100.0          100.0
Extreme Gradient Boosting (XGB)      100.0      100.0   100.0     100.0          100.0
Deep Learning (DL)                   100.0      100.0   100.0     100.0          100.0
DUMMY (DM)                            37.8       14.3    37.8      20.7           50.0
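
Every model, including the comparatively simple ones, scores a perfect 100% on the held-out split, far above the 37.8% majority-class baseline. Uniformly perfect scores are unusual for this dataset and are worth double-checking for leakage: in particular, any scaling or oversampling (e.g. SMOTE) should be fit on the training split only. A leakage-safe ordering looks roughly like this (a sketch; X and Y are illustrative names, not the notebook's actual variables):

In [ ]:
# Leakage-safe preprocessing order (sketch; X and Y are illustrative names)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

X_tr, X_te, y_tr, y_te = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=1)

scaler = StandardScaler().fit(X_tr)          # fit the scaler on the training split only
X_tr_std = scaler.transform(X_tr)
X_te_std = scaler.transform(X_te)

# Resample the training split only; the test split stays untouched
X_tr_res, y_tr_res = SMOTE(random_state=1).fit_resample(X_tr_std, y_tr)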