The sinking of the Titanic is one of the most infamous shipwrecks in history.
On April 15, 1912, during her maiden voyage, the RMS Titanic, widely considered “unsinkable”, sank after colliding with an iceberg. Unfortunately, there weren’t enough lifeboats for everyone on board, resulting in the deaths of 1502 of the 2224 passengers and crew.
While there was some element of luck involved in surviving, it seems some groups of people were more likely to survive than others.
import os
import math
import scipy
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import tree
from scipy.stats import randint
from scipy.stats import loguniform
from IPython.display import display
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import RFE
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from scikitplot.metrics import plot_roc_curve as auc_roc
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, \
f1_score, roc_auc_score, roc_curve, precision_score, recall_score
from keras.models import Sequential
from keras.layers import Dense
# from keras.optimizers import SGD,Adam
from tensorflow.keras.optimizers import SGD, Adam
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [10,6]
import warnings
warnings.filterwarnings('ignore')
sns.set_style('darkgrid')
import tensorflow as tf
def set_seed(seed=1234):
np.random.seed(seed)
tf.random.set_seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)
os.environ['TF_DETERMINISTIC_OPS'] = '1'
set_seed()
df = pd.read_csv('./tested.csv')
df.columns.to_list()
df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
df.head()
df.tail()
['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
Survived | Pclass | Sex | Age | SibSp | Parch | Fare | Embarked | |
---|---|---|---|---|---|---|---|---|
0 | 0 | 3 | male | 34.5 | 0 | 0 | 7.8292 | Q |
1 | 1 | 3 | female | 47.0 | 1 | 0 | 7.0000 | S |
2 | 0 | 2 | male | 62.0 | 0 | 0 | 9.6875 | Q |
3 | 0 | 3 | male | 27.0 | 0 | 0 | 8.6625 | S |
4 | 1 | 3 | female | 22.0 | 1 | 1 | 12.2875 | S |
Survived | Pclass | Sex | Age | SibSp | Parch | Fare | Embarked | |
---|---|---|---|---|---|---|---|---|
413 | 0 | 3 | male | NaN | 0 | 0 | 8.0500 | S |
414 | 1 | 1 | female | 39.0 | 0 | 0 | 108.9000 | C |
415 | 0 | 3 | male | 38.5 | 0 | 0 | 7.2500 | S |
416 | 0 | 3 | male | NaN | 0 | 0 | 8.0500 | S |
417 | 0 | 3 | male | NaN | 1 | 1 | 22.3583 | C |
target = 'Survived'
labels = ['Not-Survived', 'Survived']
features = [i for i in df.columns.values if i not in [target]]
print(features)
['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
original_df = df.copy(deep=True)
original_df.head()
Survived | Pclass | Sex | Age | SibSp | Parch | Fare | Embarked | |
---|---|---|---|---|---|---|---|---|
0 | 0 | 3 | male | 34.5 | 0 | 0 | 7.8292 | Q |
1 | 1 | 3 | female | 47.0 | 1 | 0 | 7.0000 | S |
2 | 0 | 2 | male | 62.0 | 0 | 0 | 9.6875 | Q |
3 | 0 | 3 | male | 27.0 | 0 | 0 | 8.6625 | S |
4 | 1 | 3 | female | 22.0 | 1 | 1 | 12.2875 | S |
print('\n\033[1mInference:\033[0m The Dataset consists of {} columns ({} features plus the target) & {} samples.'.format(df.shape[1], df.shape[1]-1, df.shape[0]))
Inference: The Dataset consists of 8 columns (7 features plus the target) & 418 samples.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   Survived  418 non-null    int64
 1   Pclass    418 non-null    int64
 2   Sex       418 non-null    object
 3   Age       332 non-null    float64
 4   SibSp     418 non-null    int64
 5   Parch     418 non-null    int64
 6   Fare      417 non-null    float64
 7   Embarked  418 non-null    object
dtypes: float64(2), int64(4), object(2)
memory usage: 26.2+ KB
# Checking number of unique rows in each feature
df.nunique().sort_values()
Survived 2 Sex 2 Pclass 3 Embarked 3 SibSp 7 Parch 8 Age 79 Fare 169 dtype: int64
# Checking number of unique rows in each feature
nu = df[features].nunique().sort_values()
nf = []; cf = []; #nnf = 0; ncf = 0; #numerical & categorical features
for i in range(df[features].shape[1]):
if nu.values[i]<=15:cf.append(nu.index[i])
else: nf.append(nu.index[i])
print('\n\033[1mInference:\033[0m The Dataset has {} numerical & {} categorical features.'.format(len(nf),len(cf)))
Inference: The Dataset has 2 numerical & 5 categorical features.
# Checking the stats of all the columns
df.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
Survived | 418.0 | 0.363636 | 0.481622 | 0.00 | 0.0000 | 0.0000 | 1.0 | 1.0000 |
Pclass | 418.0 | 2.265550 | 0.841838 | 1.00 | 1.0000 | 3.0000 | 3.0 | 3.0000 |
Age | 332.0 | 30.272590 | 14.181209 | 0.17 | 21.0000 | 27.0000 | 39.0 | 76.0000 |
SibSp | 418.0 | 0.447368 | 0.896760 | 0.00 | 0.0000 | 0.0000 | 1.0 | 8.0000 |
Parch | 418.0 | 0.392344 | 0.981429 | 0.00 | 0.0000 | 0.0000 | 0.0 | 9.0000 |
Fare | 417.0 | 35.627188 | 55.907576 | 0.00 | 7.8958 | 14.4542 | 31.5 | 512.3292 |
df.value_counts(target)
Survived 0 266 1 152 dtype: int64
df.groupby(target).size()
Survived 0 266 1 152 dtype: int64
# Correlation with target
#df[features].apply(lambda x: x.corr(df[target]))
# Correlation with target
df[features].corrwith(df[target])
Pclass -0.108615 Age -0.000013 SibSp 0.099943 Parch 0.159120 Fare 0.191514 dtype: float64
# type(df[features].corrwith(df[target]))
# dir(df[features].corrwith(df[target]))
df[features].corrwith(df[target]).index
df[features].corrwith(df[target]).values
Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')
array([-1.08614521e-01, -1.26823571e-05, 9.99433703e-02, 1.59120113e-01, 1.91513743e-01])
# x = df[features].corrwith(df[target]).index.tolist()
# y = df[features].corrwith(df[target]).values.round(2).tolist()
# # creating the bar plot
# plt.bar(x, y, color ='salmon', width = 0.4)
# plt.title("Correlation with target", fontweight='bold', size=20)
# for i in range(len(y)):
# plt.annotate(str(y[i]), xy=(x[i],y[i]), ha='center', va='bottom')
# plt.show()
x = df[features].corrwith(df[target]).index.tolist()
y = df[features].corrwith(df[target]).values.round(2).tolist()
ax = sns.barplot(x=x, y=y)
for i in ax.containers:
ax.bar_label(i,)
plt.title('Correlation with Target')
plt.show()
# plt.figure(figsize=(8,6))
sns.heatmap(df[features].corr(), annot=True)
sns.pairplot(df, hue=target, height=2)
#Let us first analyze the distribution of the target variable
MAP={}
for e, i in enumerate(df[target].unique()):
MAP[i]=labels[e]
#MAP={0:'Not-Survived',1:'Survived'}
df1 = df.copy()
df1[target]=df1[target].map(MAP)
explode=np.zeros(len(labels))
explode[-1]=0.1
print('\033[1mTarget Variable Distribution'.center(55))
plt.pie(df1[target].value_counts(), labels=df1[target].value_counts().index, counterclock=False, shadow=True,
explode=explode, autopct='%1.1f%%', radius=1, startangle=0)
plt.show()
Target Variable Distribution
[Pie chart: Not-Survived 63.6%, Survived 36.4%]
df1['Sex'] = df1['Sex'].map({'male':1.0, 'female':0.0})
df_melt = df1[[target]+nf].melt(id_vars=target, var_name='attribute', value_name= 'measurement')
df_melt.head()
min_x, max_x = -100, 300
sns.displot(
data=df_melt,
x='measurement',
#hue='Species',
#kind='kde',
kde =True,
#fill=True,
col='attribute',
#bins=10,
binrange=(min_x, max_x),
#kde_kws={'clip': (min_x, max_x)},
facet_kws=dict(sharey=False, sharex=False) # different scale on x-axis, y-axis
)
Survived | attribute | measurement | |
---|---|---|---|
0 | Not-Survived | Age | 34.5 |
1 | Survived | Age | 47.0 |
2 | Not-Survived | Age | 62.0 |
3 | Not-Survived | Age | 27.0 |
4 | Survived | Age | 22.0 |
sns.boxplot(
data=df_melt,
y='attribute',
x='measurement',
#hue='Species'
orient="h"
)
sns.boxplot(
data=df_melt,
x='attribute',
y='measurement',
#hue='Species',
orient="v"
)
# Visualising the categorical features
print('\033[1mVisualising Categorical Features:'.center(100))
n=3
plt.figure(figsize=[15,3*math.ceil(len(cf)/n)])
for i in range(len(cf)):
if df[cf[i]].nunique()<=15:
plt.subplot(math.ceil(len(cf)/n),n,i+1)
ax = sns.countplot(x=df[cf[i]])
ax.bar_label(container=ax.containers[0])
#else:
# plt.subplot(2,2,i)
# sns.countplot(df[cf[i]])
plt.tight_layout()
plt.show()
Visualising Categorical Features:
[Count plots with bar labels. Sex: 266/152; Pclass: 107/93/218; Embarked: 46/270/102; SibSp: 283/110/14/4/4/1/2; Parch: 324/52/33/3/2/1/1/2]
# Understanding the Numerical feature set
print('\033[1mFeatures Distribution'.center(100))
n=4
nf = [i for i in features if i not in cf]
plt.figure(figsize=[15,3*math.ceil(len(features)/n)])
for c in range(len(nf)):
plt.subplot(math.ceil(len(features)/n),n,c+1)
sns.histplot(df[nf[c]], kde=True)  # distplot is deprecated; histplot with kde=True is the modern equivalent
plt.tight_layout()
plt.show()
plt.figure(figsize=[15,3*math.ceil(len(features)/n)])
for c in range(len(nf)):
plt.subplot(math.ceil(len(features)/n),n,c+1)
df.boxplot(nf[c])
plt.tight_layout()
plt.show()
Features Distribution
[Histogram/density plots and box plots for the numerical features Age and Fare]
# Understanding the relationship between all the features
ppc=[i for i in df.columns if i not in cf]
g=sns.pairplot(df[ppc], hue=target, height=4)
#g.map_upper(sns.kdeplot, levels=1, color=".2")
plt.show()
fig, axes = plt.subplots(1,2)
sns.boxplot(x=target, y="Age", data=df, ax=axes[0])
sns.boxplot(x=target, y="Fare", data=df, ax=axes[1])
fig, axes = plt.subplots(1,2)
sns.boxplot("Age", data=df, ax=axes[0])
sns.boxplot("Fare", data=df, ax=axes[1])
#sns.catplot(data=df_melt, x="measurement", y="attribute", orient="h", kind="box")
#sns.catplot(data=df_melt, y="measurement", x="attribute", orient="v", kind="box")
# Removal of duplicate rows (if any)
duplicate = df[df.duplicated(keep=False)]
print(duplicate)
r, c = df.shape
df1 = df.copy()
df1.drop_duplicates(inplace=True)
df1.reset_index(drop=True, inplace=True)
df1.shape
df1.head()
if df1.shape == (r, c):
print('\n\033[1mInference:\033[0m The dataset doesn\'t have any duplicates')
else:
print(f'\n\033[1mInference:\033[0m Number of duplicates dropped ---> {r-df1.shape[0]}')
Survived Pclass Sex Age SibSp Parch Fare Embarked 3 0 3 male 27.0 0 0 8.6625 S 10 0 3 male NaN 0 0 7.8958 S 29 0 3 male NaN 2 0 21.6792 C 36 1 3 female NaN 0 0 8.0500 S 41 0 1 male NaN 0 0 26.5500 S 47 0 3 male NaN 0 0 7.7500 Q 70 1 3 female 24.0 0 0 7.7500 Q 76 0 3 male NaN 0 0 8.0500 S 78 0 2 male 30.0 0 0 13.0000 S 79 1 3 female 24.0 0 0 7.7500 Q 83 0 3 male NaN 0 0 7.8958 S 88 1 3 female NaN 0 0 7.7500 Q 93 0 3 male NaN 0 0 8.0500 S 102 0 3 male NaN 0 0 7.7500 Q 103 0 3 male 26.0 0 0 7.7750 S 107 0 3 male NaN 0 0 7.7500 Q 119 1 2 female 29.0 1 0 26.0000 S 124 0 3 male NaN 0 0 7.7500 Q 137 0 2 male 26.0 0 0 13.0000 S 144 0 1 male 42.0 0 0 26.5500 S 148 0 1 male NaN 0 0 26.5500 S 158 0 1 male 42.0 0 0 26.5500 S 170 0 3 male NaN 0 0 7.5500 S 173 0 3 male NaN 0 0 7.2292 C 180 0 2 male 30.0 0 0 13.0000 S 183 0 3 male NaN 0 0 7.7500 Q 204 0 2 male 25.0 0 0 10.5000 S 219 0 3 male NaN 0 0 8.0500 S 227 1 3 female NaN 0 0 7.7500 Q 248 1 2 female 29.0 1 0 26.0000 S 255 0 3 male NaN 0 0 7.5500 S 256 0 3 male NaN 0 0 7.7500 Q 265 0 3 male NaN 0 0 7.8958 S 267 0 3 male NaN 0 0 7.5500 S 268 1 3 female NaN 0 0 8.0500 S 271 0 3 male NaN 0 0 7.7500 Q 274 0 3 male NaN 0 0 7.2250 C 282 1 3 female NaN 0 0 7.7500 Q 288 0 3 male NaN 0 0 7.2292 C 289 0 3 male NaN 0 0 8.0500 S 292 0 3 male NaN 0 0 7.2292 C 297 0 3 male NaN 2 0 21.6792 C 304 1 3 female NaN 0 0 7.7500 Q 320 0 3 male 26.0 0 0 7.7750 S 322 0 2 male 26.0 0 0 13.0000 S 332 0 3 male NaN 0 0 7.2250 C 339 0 3 male NaN 0 0 7.2292 C 346 0 2 male 26.0 0 0 13.0000 S 349 1 2 female 31.0 0 0 21.0000 S 351 0 2 male 25.0 0 0 10.5000 S 358 0 3 male NaN 0 0 7.7500 Q 362 1 2 female 31.0 0 0 21.0000 S 363 0 3 male 27.0 0 0 8.6625 S 380 0 3 male NaN 0 0 7.7500 Q 410 1 3 female NaN 0 0 7.7500 Q 413 0 3 male NaN 0 0 8.0500 S 416 0 3 male NaN 0 0 8.0500 S
(380, 8)
Survived | Pclass | Sex | Age | SibSp | Parch | Fare | Embarked | |
---|---|---|---|---|---|---|---|---|
0 | 0 | 3 | male | 34.5 | 0 | 0 | 7.8292 | Q |
1 | 1 | 3 | female | 47.0 | 1 | 0 | 7.0000 | S |
2 | 0 | 2 | male | 62.0 | 0 | 0 | 9.6875 | Q |
3 | 0 | 3 | male | 27.0 | 0 | 0 | 8.6625 | S |
4 | 1 | 3 | female | 22.0 | 1 | 1 | 12.2875 | S |
Inference: Number of duplicates dropped ---> 38
# Check for empty elements
nvc = pd.DataFrame(df1.isnull().sum().sort_values(), columns=['Total Null Values'])
nvc['Percentage'] = round(nvc['Total Null Values']/df1.shape[0], 3) * 100
print(nvc)
Total Null Values Percentage Survived 0 0.0 Pclass 0 0.0 Sex 0 0.0 SibSp 0 0.0 Parch 0 0.0 Embarked 0 0.0 Fare 1 0.3 Age 58 15.3
# Converting categorical Columns to Numeric
ecc = nvc[nvc['Percentage']!=0].index.values
dcc = [i for i in df.columns if i not in ecc]
print(dcc)
df1.head()
# Target Variable
MAP={}
for i, e in enumerate(df1[target].unique()):
MAP[e]=i
df1[target] = df1[target].map(MAP)
print('Mapping Target Variable --->', MAP)
df1.head()
df3 = df1[dcc]
fcc = [i for i in cf if i not in ecc]
print(fcc)
df3.head()
# One-Hot Binary Encoding
oh=True
dm=True
for i in fcc:
#print(i)
if df3[i].nunique()==2:
if oh==True: print("\033[1m\nOne-Hot Encoding on features:\033[0m")
print(i);oh=False
df3[i]=pd.get_dummies(df3[i], drop_first=True, prefix=str(i))
if (df3[i].nunique()>2 and df3[i].nunique()<17):
if dm==True: print("\n\033[1mDummy Encoding on features:\033[0m")
print(i);dm=False
df3 = pd.concat([df3.drop([i], axis=1), pd.DataFrame(pd.get_dummies(df3[i], drop_first=True, prefix=str(i)))],axis=1)
df3.shape
df3.head()
['Survived', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']
Survived | Pclass | Sex | Age | SibSp | Parch | Fare | Embarked | |
---|---|---|---|---|---|---|---|---|
0 | 0 | 3 | male | 34.5 | 0 | 0 | 7.8292 | Q |
1 | 1 | 3 | female | 47.0 | 1 | 0 | 7.0000 | S |
2 | 0 | 2 | male | 62.0 | 0 | 0 | 9.6875 | Q |
3 | 0 | 3 | male | 27.0 | 0 | 0 | 8.6625 | S |
4 | 1 | 3 | female | 22.0 | 1 | 1 | 12.2875 | S |
Mapping Target Variable ---> {0: 0, 1: 1}
Survived | Pclass | Sex | Age | SibSp | Parch | Fare | Embarked | |
---|---|---|---|---|---|---|---|---|
0 | 0 | 3 | male | 34.5 | 0 | 0 | 7.8292 | Q |
1 | 1 | 3 | female | 47.0 | 1 | 0 | 7.0000 | S |
2 | 0 | 2 | male | 62.0 | 0 | 0 | 9.6875 | Q |
3 | 0 | 3 | male | 27.0 | 0 | 0 | 8.6625 | S |
4 | 1 | 3 | female | 22.0 | 1 | 1 | 12.2875 | S |
['Sex', 'Pclass', 'Embarked', 'SibSp', 'Parch']
Survived | Pclass | Sex | SibSp | Parch | Embarked | |
---|---|---|---|---|---|---|
0 | 0 | 3 | male | 0 | 0 | Q |
1 | 1 | 3 | female | 1 | 0 | S |
2 | 0 | 2 | male | 0 | 0 | Q |
3 | 0 | 3 | male | 0 | 0 | S |
4 | 1 | 3 | female | 1 | 1 | S |
One-Hot Encoding on features:
Sex

Dummy Encoding on features:
Pclass
Embarked
SibSp
Parch
(380, 19)
Survived | Sex | Pclass_2 | Pclass_3 | Embarked_Q | Embarked_S | SibSp_1 | SibSp_2 | SibSp_3 | SibSp_4 | SibSp_5 | SibSp_8 | Parch_1 | Parch_2 | Parch_3 | Parch_4 | Parch_5 | Parch_6 | Parch_9 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
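For reference, the encoding loop above can be collapsed into a single pandas call; a compact sketch (not what this notebook runs; the binary Sex column comes out named Sex_male instead of Sex, and df3_alt is just an illustrative name):

# Sketch: one-shot dummy encoding of the shortlisted categorical features, starting
# from the pre-encoding frame df1[dcc]; drop_first=True keeps k-1 indicators per feature.
df3_alt = pd.get_dummies(df1[dcc], columns=fcc, drop_first=True)
print(df3_alt.shape)  # expected (380, 19), matching df3 above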
# Fixing Empty Categorical Columns
for x in [i for i in ecc if i in cf]:
a = df1[x]
b=[]; c=[]
for i,e in enumerate(a):
if e!=e:
b.append(i)
else:
c.append(i)
RF = RandomForestClassifier()
RF.fit(df3.loc[c],a[c])
d = RF.predict(df3.loc[b])
df3[x] = a
f=0
for i,e in enumerate(df3[x]):
if e!=e:
df3.loc[i,x] = d[f]
f+=1
df3 = pd.concat([df3.drop([x], axis=1), pd.DataFrame(pd.get_dummies(df3[x], drop_first=True, prefix=str(x)))],axis=1)
df3
Survived | Sex | Pclass_2 | Pclass_3 | Embarked_Q | Embarked_S | SibSp_1 | SibSp_2 | SibSp_3 | SibSp_4 | SibSp_5 | SibSp_8 | Parch_1 | Parch_2 | Parch_3 | Parch_4 | Parch_5 | Parch_6 | Parch_9 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
375 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
376 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
377 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
378 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
379 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
380 rows × 19 columns
# Fixing Empty Numerical Columns
for x in [i for i in ecc if i not in cf]:
a = df1[x]
b=[]; c=[]
for i,e in enumerate(a):
if e!=e:
b.append(i)
else:
c.append(i)
LR = LinearRegression()
LR.fit(df3.loc[c],a[c])
d = LR.predict(df3.loc[b])
df3[x] = a
f=0
for i,e in enumerate(df3[x]):
if e!=e:
df3.loc[i,x] = d[f]
f+=1
#df3 = pd.concat([df3.drop([x], axis=1), pd.DataFrame(pd.get_dummies(df3[x], drop_first=True, prefix=str(x)))],axis=1)
df3
Survived | Sex | Pclass_2 | Pclass_3 | Embarked_Q | Embarked_S | SibSp_1 | SibSp_2 | SibSp_3 | SibSp_4 | ... | SibSp_8 | Parch_1 | Parch_2 | Parch_3 | Parch_4 | Parch_5 | Parch_6 | Parch_9 | Fare | Age | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7.8292 | 34.500000 |
1 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7.0000 | 47.000000 |
2 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9.6875 | 62.000000 |
3 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8.6625 | 27.000000 |
4 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 12.2875 | 22.000000 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
375 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 90.0000 | 37.000000 |
376 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7.7750 | 28.000000 |
377 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 108.9000 | 39.000000 |
378 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7.2500 | 38.500000 |
379 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 22.3583 | 19.357866 |
380 rows × 21 columns
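Model-based imputation like the two loops above is also available off the shelf in scikit-learn; a hedged sketch using IterativeImputer, where df_with_na is a hypothetical frame holding the encoded features together with the still-missing Age and Fare columns (i.e. the state of the data before these loops):

# Sketch: IterativeImputer regresses each column containing NaNs on the remaining
# columns, similar in spirit to the manual RandomForest / LinearRegression imputation above.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401 (activates the import below)
from sklearn.impute import IterativeImputer

imputer = IterativeImputer(random_state=0)
df_imputed = pd.DataFrame(imputer.fit_transform(df_with_na), columns=df_with_na.columns)
print(df_imputed.isnull().sum().sum())  # expected: 0 missing values remain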
# Removal of outlier:
df4 = df3.copy()
for i in [i for i in df4.columns]:
if df4[i].nunique()>=12:
Q1 = df4[i].quantile(0.20)  # note: 20th percentile used as the lower fence base (not the conventional 25th)
Q3 = df4[i].quantile(0.80)  # 80th percentile used as the upper fence base (not the conventional 75th)
IQR = Q3 - Q1
df4 = df4[df4[i] <= (Q3+(1.5*IQR))]
df4 = df4[df4[i] >= (Q1-(1.5*IQR))]
df4 = df4.reset_index(drop=True)
df4.head()
print('\n\033[1mInference:\033[0m Before removal of outliers, the dataset had {} samples.'.format(df.shape[0]))
print('\033[1mInference:\033[0m After removal of outliers, the dataset now has {} samples.'.format(df4.shape[0]))
Survived | Sex | Pclass_2 | Pclass_3 | Embarked_Q | Embarked_S | SibSp_1 | SibSp_2 | SibSp_3 | SibSp_4 | ... | SibSp_8 | Parch_1 | Parch_2 | Parch_3 | Parch_4 | Parch_5 | Parch_6 | Parch_9 | Fare | Age | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7.8292 | 34.5 |
1 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7.0000 | 47.0 |
2 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9.6875 | 62.0 |
3 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8.6625 | 27.0 |
4 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 12.2875 | 22.0 |
5 rows × 21 columns
Inference: Before removal of outliers, the dataset had 418 samples.
Inference: After removal of outliers, the dataset now has 350 samples.
# Fixing the imbalance using SMOTE Technique
df5 = df4.copy()
print('Original class distribution:')
print(df5[target].value_counts())
X = df5.drop([target], axis=1)
Y = df5[target]
xf = X.columns   # feature columns only; the target is appended back after resampling
smote = SMOTE()
X, Y = smote.fit_resample(X, Y)
df5 = pd.DataFrame(X, columns=xf)
df5[target] = Y
print('\nClass distribution after applying SMOTE Technique:',)
print(Y.value_counts())
Original class distribution: 0 224 1 126 Name: Survived, dtype: int64 Class distribution after applying SMOTE Technique: 0 224 1 224 Name: Survived, dtype: int64
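SMOTE synthesizes new minority-class rows by interpolating between a minority sample and one of its nearest minority-class neighbours; a minimal illustration of that core step (x_i and x_nn are made-up vectors, not rows from this dataset):

# Illustrative sketch of one SMOTE interpolation: the synthetic point lies on the
# segment between a minority sample and one of its k nearest minority neighbours.
rng = np.random.default_rng(0)
x_i  = np.array([1.0, 0.0, 7.83, 34.5])   # hypothetical minority-class feature vector
x_nn = np.array([1.0, 0.0, 8.05, 27.0])   # hypothetical nearest minority-class neighbour
lam = rng.uniform(0, 1)                    # random interpolation factor
x_synthetic = x_i + lam * (x_nn - x_i)
print(x_synthetic)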
df = df5.copy()
plt.title('Final Dataset Samples')
plt.pie([df.shape[0], original_df.shape[0]-df4.shape[0], df5.shape[0]-df4.shape[0]], radius = 1, shadow=True,
labels=['Retained','Dropped','Augmented'], counterclock=False, autopct='%1.1f%%', pctdistance=0.9, explode=[0,0.1,0.1])
plt.pie([df.shape[0]], labels=['100%'], labeldistance=-0, radius=0.78, shadow=True, colors=['powderblue'])
plt.show()
print('\n\033[1mInference:\033[0mThe final dataset after cleanup has {} samples & {} columns.'.format(df.shape[0], df.shape[1]))
[Nested pie chart: Retained 73.0%, Dropped 11.1%, Augmented 16.0%]
Inference:The final dataset after cleanup has 448 samples & 21 columns.
#Splitting the data into training & testing sets
df = df5.copy()
X = df.drop([target],axis=1)
Y = df[target]
# X = df
# Y = X.pop(target)
Train_X, Test_X, Train_Y, Test_Y = train_test_split(X, Y, train_size=0.8, test_size=0.2, random_state=0)
print('Original set ---> ',X.shape,Y.shape,'\nTraining set ---> ',Train_X.shape,Train_Y.shape,'\nTesting set ---> ', Test_X.shape,'', Test_Y.shape)
Original set ---> (448, 20) (448,) Training set ---> (358, 20) (358,) Testing set ---> (90, 20) (90,)
# Feature Scaling (Standardization)
std = StandardScaler()
print('\033[1mStandardization on Training set'.center(100))
Train_X_std = std.fit_transform(Train_X)
Train_X_std = pd.DataFrame(Train_X_std, columns=X.columns)
display(Train_X_std.describe())
print('\n','\033[1mStandardization on Testing set'.center(100))
Test_X_std = std.transform(Test_X)
Test_X_std = pd.DataFrame(Test_X_std, columns=X.columns)
display(Test_X_std.describe())
Standardization on Training set
Sex | Pclass_2 | Pclass_3 | Embarked_Q | Embarked_S | SibSp_1 | SibSp_2 | SibSp_3 | SibSp_4 | SibSp_5 | SibSp_8 | Parch_1 | Parch_2 | Parch_3 | Parch_4 | Parch_5 | Parch_6 | Parch_9 | Fare | Age | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 3.580000e+02 | 3.580000e+02 | 3.580000e+02 | 3.580000e+02 | 3.580000e+02 | 3.580000e+02 | 3.580000e+02 | 3.580000e+02 | 3.580000e+02 | 358.0 | 358.000000 | 3.580000e+02 | 3.580000e+02 | 358.0 | 3.580000e+02 | 3.580000e+02 | 3.580000e+02 | 3.580000e+02 | 3.580000e+02 | 3.580000e+02 |
mean | -7.939025e-17 | -1.141235e-16 | -7.939025e-17 | -3.473323e-17 | -4.961891e-17 | 4.961891e-18 | 4.093560e-17 | -2.480945e-17 | -2.977134e-17 | 0.0 | 0.000000 | 5.458080e-17 | -2.604993e-17 | 0.0 | -9.923781e-18 | -9.923781e-18 | 1.488567e-17 | -1.984756e-17 | 6.946647e-17 | 2.183232e-16 |
std | 1.001400e+00 | 1.001400e+00 | 1.001400e+00 | 1.001400e+00 | 1.001400e+00 | 1.001400e+00 | 1.001400e+00 | 1.001400e+00 | 1.001400e+00 | 0.0 | 1.001400 | 1.001400e+00 | 1.001400e+00 | 0.0 | 1.001400e+00 | 1.001400e+00 | 1.001400e+00 | 1.001400e+00 | 1.001400e+00 | 1.001400e+00 |
min | -1.063462e+00 | -5.234705e-01 | -1.011236e+00 | -3.395067e-01 | -1.233340e+00 | -5.450701e-01 | -1.862313e-01 | -7.495317e-02 | -7.495317e-02 | 0.0 | -0.052926 | -3.496528e-01 | -2.367429e-01 | 0.0 | -5.292561e-02 | -5.292561e-02 | -5.292561e-02 | -7.495317e-02 | -1.124477e+00 | -2.303518e+00 |
25% | -1.063462e+00 | -5.234705e-01 | -1.011236e+00 | -3.395067e-01 | -1.233340e+00 | -5.450701e-01 | -1.862313e-01 | -7.495317e-02 | -7.495317e-02 | 0.0 | -0.052926 | -3.496528e-01 | -2.367429e-01 | 0.0 | -5.292561e-02 | -5.292561e-02 | -5.292561e-02 | -7.495317e-02 | -7.355564e-01 | -6.381262e-01 |
50% | 9.403247e-01 | -5.234705e-01 | 9.888885e-01 | -3.395067e-01 | 8.108066e-01 | -5.450701e-01 | -1.862313e-01 | -7.495317e-02 | -7.495317e-02 | 0.0 | -0.052926 | -3.496528e-01 | -2.367429e-01 | 0.0 | -5.292561e-02 | -5.292561e-02 | -5.292561e-02 | -7.495317e-02 | -4.406071e-01 | -2.026184e-01 |
75% | 9.403247e-01 | -5.234705e-01 | 9.888885e-01 | -3.395067e-01 | 8.108066e-01 | -5.450701e-01 | -1.862313e-01 | -7.495317e-02 | -7.495317e-02 | 0.0 | -0.052926 | -3.496528e-01 | -2.367429e-01 | 0.0 | -5.292561e-02 | -5.292561e-02 | -5.292561e-02 | -7.495317e-02 | 1.807455e-01 | 6.587269e-01 |
max | 9.403247e-01 | 1.910327e+00 | 9.888885e-01 | 2.945450e+00 | 8.108066e-01 | 1.834626e+00 | 5.369668e+00 | 1.334166e+01 | 1.334166e+01 | 0.0 | 18.894444 | 2.859980e+00 | 4.223992e+00 | 0.0 | 1.889444e+01 | 1.889444e+01 | 1.889444e+01 | 1.334166e+01 | 3.775379e+00 | 2.694634e+00 |
Standardization on Testing set
Sex | Pclass_2 | Pclass_3 | Embarked_Q | Embarked_S | SibSp_1 | SibSp_2 | SibSp_3 | SibSp_4 | SibSp_5 | SibSp_8 | Parch_1 | Parch_2 | Parch_3 | Parch_4 | Parch_5 | Parch_6 | Parch_9 | Fare | Age | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 90.000000 | 90.000000 | 90.000000 | 90.000000 | 90.000000 | 90.000000 | 9.000000e+01 | 90.000000 | 90.000000 | 90.000000 | 90.000000 | 90.000000 | 90.000000 | 90.000000 | 9.000000e+01 | 9.000000e+01 | 9.000000e+01 | 90.000000 | 90.000000 | 90.000000 |
mean | -0.306476 | -0.144880 | 0.166615 | -0.047511 | -0.006852 | 0.115957 | -1.862313e-01 | 0.074120 | 0.223194 | 0.011111 | 0.157601 | 0.042636 | 0.358022 | 0.011111 | -5.292561e-02 | -5.292561e-02 | -5.292561e-02 | -0.074953 | 0.063130 | -0.134010 |
std | 0.976942 | 0.887032 | 0.989645 | 0.940081 | 1.007033 | 1.071845 | 2.791107e-17 | 1.414236 | 1.988763 | 0.105409 | 1.997228 | 1.057180 | 1.524853 | 0.105409 | 6.977768e-18 | 6.977768e-18 | 1.395554e-17 | 0.000000 | 1.114787 | 1.049540 |
min | -1.063462 | -0.523470 | -1.011236 | -0.339507 | -1.233340 | -0.545070 | -1.862313e-01 | -0.074953 | -0.074953 | 0.000000 | -0.052926 | -0.349653 | -0.236743 | 0.000000 | -5.292561e-02 | -5.292561e-02 | -5.292561e-02 | -0.074953 | -1.088190 | -2.258101 |
25% | -1.063462 | -0.523470 | -1.011236 | -0.339507 | -1.233340 | -0.545070 | -1.862313e-01 | -0.074953 | -0.074953 | 0.000000 | -0.052926 | -0.349653 | -0.236743 | 0.000000 | -5.292561e-02 | -5.292561e-02 | -5.292561e-02 | -0.074953 | -0.736112 | -0.822046 |
50% | -1.063462 | -0.523470 | 0.988889 | -0.339507 | 0.810807 | -0.545070 | -1.862313e-01 | -0.074953 | -0.074953 | 0.000000 | -0.052926 | -0.349653 | -0.236743 | 0.000000 | -5.292561e-02 | -5.292561e-02 | -5.292561e-02 | -0.074953 | -0.428871 | -0.280922 |
75% | 0.940325 | -0.523470 | 0.988889 | -0.339507 | 0.810807 | 1.834626 | -1.862313e-01 | -0.074953 | -0.074953 | 0.000000 | -0.052926 | -0.349653 | -0.236743 | 0.000000 | -5.292561e-02 | -5.292561e-02 | -5.292561e-02 | -0.074953 | 0.514160 | 0.502119 |
max | 0.940325 | 1.910327 | 0.988889 | 2.945450 | 0.810807 | 1.834626 | -1.862313e-01 | 13.341664 | 13.341664 | 1.000000 | 18.894444 | 2.859980 | 4.223992 | 1.000000 | -5.292561e-02 | -5.292561e-02 | -5.292561e-02 | -0.074953 | 3.749491 | 2.694634 |
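StandardScaler only subtracts each column's training-set mean and divides by its training-set standard deviation; a quick sanity check on the fitted std object above (a sketch, nothing new is fitted here):

# Sanity check: the standardized training set equals (x - mean_) / scale_,
# where mean_ and scale_ were learned from the training split only
# (zero-variance columns such as SibSp_5 get scale_ = 1, so they map to 0).
assert np.allclose(Train_X_std.values, (Train_X.values - std.mean_) / std.scale_)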
#Checking the correlation
features = df.columns
plt.figure(figsize=[24,20])
plt.title('Features Correlation-Plot')
sns.heatmap(df[features].corr(), vmin=-1, vmax=1, center=0, annot=True)
plt.show()
# Calculate the VIFs to remove multicollinearity
DROP=[]; scores1=[]; scores2=[]; scores3=[]
#scores.append(f1_score(Test_Y,LogisticRegression().fit(Train_X_std, Train_Y).predict(Test_X_std)))
scores1.append(f1_score(Test_Y,LogisticRegression().fit(Train_X_std.drop(DROP,axis=1), Train_Y).predict(Test_X_std.drop(DROP,axis=1)),average='weighted')*100)
scores2.append(f1_score(Test_Y,RandomForestClassifier().fit(Train_X_std.drop(DROP,axis=1), Train_Y).predict(Test_X_std.drop(DROP,axis=1)),average='weighted')*100)
scores3.append(f1_score(Test_Y,XGBClassifier().fit(Train_X_std.drop(DROP,axis=1), Train_Y, eval_metric='logloss').predict(Test_X_std.drop(DROP,axis=1)),average='weighted')*100)
for i in range(len(X.columns.values)-1):
vif = pd.DataFrame()
Xs = X.drop(DROP,axis=1)
#print(DROP)
vif['Features'] = Xs.columns
vif['VIF'] = [variance_inflation_factor(Xs.values, i) for i in range(Xs.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif.reset_index(drop=True, inplace=True)
DROP.append(vif.Features[0])
if vif.VIF[0]>1:
scores1.append(f1_score(Test_Y,LogisticRegression().fit(Train_X_std.drop(DROP,axis=1), Train_Y).predict(Test_X_std.drop(DROP,axis=1)),average='weighted')*100)
scores2.append(f1_score(Test_Y,RandomForestClassifier().fit(Train_X_std.drop(DROP,axis=1), Train_Y).predict(Test_X_std.drop(DROP,axis=1)),average='weighted')*100)
scores3.append(f1_score(Test_Y,XGBClassifier().fit(Train_X_std.drop(DROP,axis=1), Train_Y, eval_metric='logloss').predict(Test_X_std.drop(DROP,axis=1)),average='weighted')*100)
#print(scores)
plt.plot(scores1, label='LR')
plt.plot(scores2, label='RF')
plt.plot(scores3, label='XG')
#plt.ylim([0.7,0.85])
plt.legend()
plt.grid()
plt.show()
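For reference, the VIF of a feature is 1 / (1 - R²), where R² comes from regressing that feature on all the others; a small sketch of the same idea (vif_for_column is a hypothetical helper, and it fits an intercept, so its numbers can differ slightly from statsmodels' variance_inflation_factor):

# Sketch: VIF of one column from the R^2 of regressing it on the remaining columns.
def vif_for_column(frame, col):
    others = frame.drop(columns=[col])
    r2 = LinearRegression().fit(others, frame[col]).score(others, frame[col])
    return float('inf') if r2 >= 1.0 else 1.0 / (1.0 - r2)

print({c: round(vif_for_column(X, c), 2) for c in X.columns[:3]})  # first few features only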
# Applying Recursive Feature Elimination (RFE)
# Running RFE while varying the number of features to select from all 20 down to 1
LR = LogisticRegression()#.fit(Train_X_std, Train_Y)
scores1=[]; scores2=[]; scores3=[]
# scores1.append(f1_score(Test_Y,LogisticRegression().fit(Train_X_std, Train_Y).predict(Test_X_std),average='weighted')*100)
# scores2.append(f1_score(Test_Y,RandomForestClassifier().fit(Train_X_std, Train_Y).predict(Test_X_std),average='weighted')*100)
# scores3.append(f1_score(Test_Y,XGBClassifier().fit(Train_X_std, Train_Y, eval_metric='logloss').predict(Test_X_std),average='weighted')*100)
for i in range(len(X.columns.values)):
rfe = RFE(LR,n_features_to_select=len(Train_X_std.columns)-i)
rfe = rfe.fit(Train_X_std, Train_Y)
scores1.append(f1_score(Test_Y,LogisticRegression().fit(Train_X_std[Train_X_std.columns[rfe.support_]], Train_Y).predict(Test_X_std[Train_X_std.columns[rfe.support_]]),average='weighted')*100)
scores2.append(f1_score(Test_Y,RandomForestClassifier().fit(Train_X_std[Train_X_std.columns[rfe.support_]], Train_Y).predict(Test_X_std[Train_X_std.columns[rfe.support_]]),average='weighted')*100)
scores3.append(f1_score(Test_Y,XGBClassifier().fit(Train_X_std[Train_X_std.columns[rfe.support_]], Train_Y, eval_metric='logloss').predict(Test_X_std[Train_X_std.columns[rfe.support_]]),average='weighted')*100)
plt.plot(scores1, label='LR')
plt.plot(scores2, label='RF')
plt.plot(scores3, label='XG')
#plt.ylim([0.80,0.84])
plt.legend()
plt.grid()
plt.show()
pca = PCA().fit(Train_X_std)
fig, ax = plt.subplots(figsize=(14,6))
x_values = range(1, pca.n_components_+1)
ax.bar(x_values, pca.explained_variance_ratio_, lw=2, label='Explained Variance')
ax.plot(x_values, np.cumsum(pca.explained_variance_ratio_), lw=2, label='Cumulative Explained Variance', color='red')
plt.plot([0,pca.n_components_+1],[0.90,0.90],'g--')
plt.plot([2,2],[0,1], 'g--')
ax.set_title('Explained variance of components')
ax.set_xlabel('Principal Component')
ax.set_ylabel('Explained Variance')
plt.grid()
plt.legend()
plt.show()
#Applying PCA Transformations
# scores1.append(f1_score(Test_Y,LogisticRegression().fit(Train_X_std, Train_Y).predict(Test_X_std),average='weighted')*100)
# scores2.append(f1_score(Test_Y,RandomForestClassifier().fit(Train_X_std, Train_Y).predict(Test_X_std),average='weighted')*100)
# scores3.append(f1_score(Test_Y,XGBClassifier().fit(Train_X_std, Train_Y, eval_metric='logloss').predict(Test_X_std),average='weighted')*100)
scores1=[]; scores2=[]; scores3=[]
for i in range(len(X.columns.values)):
pca = PCA(n_components=Train_X_std.shape[1]-i)
Train_X_std_pca = pca.fit_transform(Train_X_std)
#print('The shape of final transformed training feature set:')
#print(Train_X_std_pca.shape)
Train_X_std_pca = pd.DataFrame(Train_X_std_pca)
Test_X_std_pca = pca.transform(Test_X_std)
#print('\nThe shape of final transformed testing feature set:')
#print(Test_X_std_pca.shape)
Test_X_std_pca = pd.DataFrame(Test_X_std_pca)
scores1.append(f1_score(Test_Y,LogisticRegression().fit(Train_X_std_pca, Train_Y).predict(Test_X_std_pca),average='weighted')*100)
scores2.append(f1_score(Test_Y,RandomForestClassifier().fit(Train_X_std_pca, Train_Y).predict(Test_X_std_pca),average='weighted')*100)
scores3.append(f1_score(Test_Y,XGBClassifier().fit(Train_X_std_pca, Train_Y, eval_metric='logloss').predict(Test_X_std_pca),average='weighted')*100)
plt.plot(scores1, label='LR')
plt.plot(scores2, label='RF')
plt.plot(scores3, label='XG')
#plt.ylim([0.80,0.84])
plt.legend()
plt.grid()
plt.show()
#Finalising the shortlisted features
rfe = RFE(LR,n_features_to_select=len(Train_X_std.columns)-0)
rfe = rfe.fit(Train_X_std, Train_Y)
print(f1_score(Test_Y,LogisticRegression().fit(Train_X_std[Train_X_std.columns[rfe.support_]], Train_Y).predict(Test_X_std[Train_X_std.columns[rfe.support_]]),average='weighted')*100)
print(f1_score(Test_Y,RandomForestClassifier().fit(Train_X_std[Train_X_std.columns[rfe.support_]], Train_Y).predict(Test_X_std[Train_X_std.columns[rfe.support_]]),average='weighted')*100)
print(f1_score(Test_Y,XGBClassifier().fit(Train_X_std[Train_X_std.columns[rfe.support_]], Train_Y, eval_metric='logloss').predict(Test_X_std[Train_X_std.columns[rfe.support_]]),average='weighted')*100)
Train_X_std = Train_X_std[Train_X_std.columns[rfe.support_]]
Test_X_std = Test_X_std[Test_X_std.columns[rfe.support_]]
print(Train_X_std.shape)
print(Test_X_std.shape)
100.0 100.0 100.0 (358, 20) (90, 20)
print(rfe.support_)
X.columns[rfe.support_]
[ True True True True True True True True True True True True True True True True True True True True]
Index(['Sex', 'Pclass_2', 'Pclass_3', 'Embarked_Q', 'Embarked_S', 'SibSp_1', 'SibSp_2', 'SibSp_3', 'SibSp_4', 'SibSp_5', 'SibSp_8', 'Parch_1', 'Parch_2', 'Parch_3', 'Parch_4', 'Parch_5', 'Parch_6', 'Parch_9', 'Fare', 'Age'], dtype='object')
# Let us first create a table to store the results of the various models
Evaluation_Results = pd.DataFrame(np.zeros((9,5)), columns=['Accuracy', 'Precision','Recall','F1-score','AUC-ROC score'])
Evaluation_Results.index=['Logistic Regression (LR)','Decision Tree Classifier (DT)','Random Forest Classifier (RF)','Naïve Bayes Classifier (NB)',
'Support Vector Machine (SVM)','K Nearest Neighbours (KNN)','Extreme Gradient Boosting (XGB)', 'Deep Learning (DL)', 'DUMMY (DM)']
Evaluation_Results
Accuracy | Precision | Recall | F1-score | AUC-ROC score | |
---|---|---|---|---|---|
Logistic Regression (LR) | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
Decision Tree Classifier (DT) | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
Random Forest Classifier (RF) | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
Naïve Bayes Classifier (NB) | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
Support Vector Machine (SVM) | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
K Nearest Neighbours (KNN) | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
Extreme Gradient Boosting (XGB) | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
Deep Learning (DL) | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
DUMMY (DM) | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
#Let us define functions to summarise the prediction scores.
#Classification Summary Function
def Classification_Summary(pred,pred_prob,i):
Evaluation_Results.loc[Evaluation_Results.index[i], 'Accuracy'] = round(accuracy_score(Test_Y, pred),3)*100
Evaluation_Results.loc[Evaluation_Results.index[i], 'Precision'] = round(precision_score(Test_Y, pred, average='weighted'),3)*100
Evaluation_Results.loc[Evaluation_Results.index[i], 'Recall'] = round(recall_score(Test_Y, pred, average='weighted'),3)*100
Evaluation_Results.loc[Evaluation_Results.index[i], 'F1-score'] = round(f1_score(Test_Y, pred, average='weighted'),3)*100
Evaluation_Results.loc[Evaluation_Results.index[i], 'AUC-ROC score'] = round(roc_auc_score(Test_Y, pred_prob[:,1], multi_class='ovr'),3)*100
print('{}{}\033[1m Evaluating {} \033[0m{}{}\n'.format('<'*3,'-'*35,Evaluation_Results.index[i], '-'*35,'>'*3))
print('Accuracy = {}%'.format(round(accuracy_score(Test_Y, pred),3)*100))
print('F1 Score = {}%'.format(round(f1_score(Test_Y, pred, average='weighted'),3)*100)) #
print('\n \033[1mConfusion Matrix:\033[0m\n',confusion_matrix(Test_Y, pred))
print('\n\033[1mClassification Report:\033[0m\n',classification_report(Test_Y, pred))
auc_roc(Test_Y, pred_prob, curves=['each_class'])
plt.show()
#Visualising Function
def AUC_ROC_plot(Test_Y, pred):
ref = [0 for _ in range(len(Test_Y))]
ref_auc = roc_auc_score(Test_Y, ref)
lr_auc = roc_auc_score(Test_Y, pred)
ns_fpr, ns_tpr, _ = roc_curve(Test_Y, ref)
lr_fpr, lr_tpr, _ = roc_curve(Test_Y, pred)
plt.plot(ns_fpr, ns_tpr, linestyle='--')
plt.plot(lr_fpr, lr_tpr, marker='.', label='AUC = {}'.format(round(roc_auc_score(Test_Y, pred)*100,2)))
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()
# Building Logistic Regression Classifier
LR_model = LogisticRegression()
space = dict()
space['solver'] = ['newton-cg', 'lbfgs', 'liblinear']
space['penalty'] = ['l2'] #'none', 'l1', 'l2', 'elasticnet'
space['C'] = loguniform(1e-5, 100)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
RCV = RandomizedSearchCV(LR_model, space, n_iter=50, scoring='accuracy', n_jobs=-1, cv=cv, random_state=1)
LR = RCV.fit(Train_X_std, Train_Y).best_estimator_
pred = LR.predict(Test_X_std)
pred_prob = LR.predict_proba(Test_X_std)
Classification_Summary(pred,pred_prob,0)
print('\n\033[1mInterpreting the Output of Logistic Regression:\n\033[0m')
print('intercept ', LR.intercept_[0])
print('classes', LR.classes_)
display(pd.DataFrame({'coeff': LR.coef_[0]}, index=Train_X_std.columns))
<<<----------------------------------- Evaluating Logistic Regression (LR) ----------------------------------->>> Accuracy = 100.0% F1 Score = 100.0% Confusiton Matrix: [[34 0] [ 0 56]] Classification Report: precision recall f1-score support 0 1.00 1.00 1.00 34 1 1.00 1.00 1.00 56 accuracy 1.00 90 macro avg 1.00 1.00 1.00 90 weighted avg 1.00 1.00 1.00 90
Interpreting the Output of Logistic Regression:
intercept -0.14426344485989095
classes [0 1]
coeff | |
---|---|
Sex | -0.849003 |
Pclass_2 | -0.095318 |
Pclass_3 | -0.018685 |
Embarked_Q | 0.048915 |
Embarked_S | -0.101063 |
SibSp_1 | 0.021590 |
SibSp_2 | 0.010019 |
SibSp_3 | -0.040841 |
SibSp_4 | -0.049564 |
SibSp_5 | 0.000000 |
SibSp_8 | -0.033787 |
Parch_1 | 0.047975 |
Parch_2 | 0.055455 |
Parch_3 | 0.000000 |
Parch_4 | 0.034152 |
Parch_5 | -0.027952 |
Parch_6 | -0.028518 |
Parch_9 | 0.000933 |
Fare | 0.050779 |
Age | -0.031927 |
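Since the inputs are standardized, each coefficient above is the change in the log-odds of survival per one standard deviation of that feature; exponentiating gives odds ratios, which are often easier to read (a small sketch using the fitted LR from above):

# Sketch: convert the standardized log-odds coefficients into odds ratios.
# e.g. exp(-0.849) ~= 0.43 for Sex: one standard deviation towards male multiplies
# the odds of survival by roughly 0.43 in this model.
odds_ratios = pd.DataFrame({'coeff': LR.coef_[0], 'odds_ratio': np.exp(LR.coef_[0])},
                           index=Train_X_std.columns)
display(odds_ratios.sort_values('odds_ratio'))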
# Building Decision Tree Classifier
DT_model = DecisionTreeClassifier()
n=20
param_dist = {"max_depth": [3, None],
"max_features": randint(1, n),
"min_samples_leaf": randint(1, 9),
"criterion": ["gini", "entropy"]}
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
RCV = RandomizedSearchCV(DT_model, param_dist, n_iter=50, scoring='accuracy', n_jobs=-1, cv=cv, random_state=1)
DT = RCV.fit(Train_X_std, Train_Y).best_estimator_
pred = DT.predict(Test_X_std)
pred_prob = DT.predict_proba(Test_X_std)
Classification_Summary(pred,pred_prob,1)
print('\n\033[1mInterpreting the output of Decision Tree:\n\033[0m')
tree.plot_tree(DT)
plt.show()
<<<----------------------------------- Evaluating Decision Tree Classifier (DT) ----------------------------------->>> Accuracy = 100.0% F1 Score = 100.0% Confusiton Matrix: [[34 0] [ 0 56]] Classification Report: precision recall f1-score support 0 1.00 1.00 1.00 34 1 1.00 1.00 1.00 56 accuracy 1.00 90 macro avg 1.00 1.00 1.00 90 weighted avg 1.00 1.00 1.00 90
Interpreting the output of Decision Tree:
[Decision tree plot: a single split on X[0] (the Sex indicator) at -0.062 separates the 358 training samples into two pure leaves, 168 samples of class 1 and 190 samples of class 0]
RF_model = RandomForestClassifier()
param_dist={'bootstrap': [True, False],
'max_depth': [10, 20, 50, 100, None],
'max_features': ['auto', 'sqrt'],
'min_samples_leaf': [1, 2, 4],
'min_samples_split': [2, 5, 10],
'n_estimators': [50, 100]}
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
RCV = RandomizedSearchCV(RF_model, param_dist, n_iter=50, scoring='accuracy', n_jobs=-1, cv=cv, random_state=1)
RF = RCV.fit(Train_X_std, Train_Y).best_estimator_
pred = RF.predict(Test_X_std)
pred_prob = RF.predict_proba(Test_X_std)
Classification_Summary(pred,pred_prob,2)
print('\n\033[1mInterpreting the output of Random Forest:\n\033[0m')
rfi=pd.Series(RF.feature_importances_, index=Train_X_std.columns).sort_values(ascending=False)
plt.barh(rfi.index,rfi.values)
plt.show()
<<<----------------------------------- Evaluating Random Forest Classifier (RF) ----------------------------------->>> Accuracy = 100.0% F1 Score = 100.0% Confusiton Matrix: [[34 0] [ 0 56]] Classification Report: precision recall f1-score support 0 1.00 1.00 1.00 34 1 1.00 1.00 1.00 56 accuracy 1.00 90 macro avg 1.00 1.00 1.00 90 weighted avg 1.00 1.00 1.00 90
Interpreting the output of Random Forest:
# Building Naive Bayes Classifier
NB_model = BernoulliNB()
params = {'alpha': [0.01, 0.1, 0.5, 1.0, 10.0]}
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
RCV = RandomizedSearchCV(NB_model, params, n_iter=50, scoring='accuracy', n_jobs=-1, cv=cv, random_state=1)
NB = RCV.fit(Train_X_std, Train_Y).best_estimator_
pred = NB.predict(Test_X_std)
pred_prob = NB.predict_proba(Test_X_std)
Classification_Summary(pred,pred_prob,3)
<<<----------------------------------- Evaluating Naïve Bayes Classifier (NB) ----------------------------------->>> Accuracy = 100.0% F1 Score = 100.0% Confusiton Matrix: [[34 0] [ 0 56]] Classification Report: precision recall f1-score support 0 1.00 1.00 1.00 34 1 1.00 1.00 1.00 56 accuracy 1.00 90 macro avg 1.00 1.00 1.00 90 weighted avg 1.00 1.00 1.00 90
# Building Support Vector Machine Classifier
SVM_model = SVC(probability=True).fit(Train_X_std, Train_Y)
svm_param = {'C': [0.1, 1, 10, 100, 1000],
'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
"kernel": ["rbf"],
"random_state": [1]}
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
RCV = RandomizedSearchCV(SVM_model, svm_param, n_iter=50, scoring='accuracy', n_jobs=-1, cv=cv, random_state=1)
SVM = RCV.fit(Train_X_std, Train_Y).best_estimator_
pred = SVM.predict(Test_X_std)
pred_prob = SVM.predict_proba(Test_X_std)
Classification_Summary(pred,pred_prob,4)
<<<----------------------------------- Evaluating Support Vector Machine (SVM) ----------------------------------->>> Accuracy = 100.0% F1 Score = 100.0% Confusiton Matrix: [[34 0] [ 0 56]] Classification Report: precision recall f1-score support 0 1.00 1.00 1.00 34 1 1.00 1.00 1.00 56 accuracy 1.00 90 macro avg 1.00 1.00 1.00 90 weighted avg 1.00 1.00 1.00 90
# Building K-Nearest Neighbours Classifier
KNN_model = KNeighborsClassifier()
knn_param = {"n_neighbors": [i for i in range(1,30,5)],
"weights": ["uniform", "distance"],
"algorithm": ["ball_tree", "kd_tree", "brute"],
"leaf_size": [1, 10, 30],
"p": [1,2]}
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
RCV = RandomizedSearchCV(KNN_model, knn_param, n_iter=50, scoring='accuracy', n_jobs=-1, cv=cv, random_state=1)
KNN = RCV.fit(Train_X_std, Train_Y).best_estimator_
pred = KNN.predict(Test_X_std)
pred_prob = KNN.predict_proba(Test_X_std)
Classification_Summary(pred,pred_prob,5)
<<<----------------------------------- Evaluating K Nearest Neighbours (KNN) ----------------------------------->>> Accuracy = 100.0% F1 Score = 100.0% Confusiton Matrix: [[34 0] [ 0 56]] Classification Report: precision recall f1-score support 0 1.00 1.00 1.00 34 1 1.00 1.00 1.00 56 accuracy 1.00 90 macro avg 1.00 1.00 1.00 90 weighted avg 1.00 1.00 1.00 90
# Building Extreme Gradient Boosting Classifier
XGB_model = XGBClassifier(eval_metric='mlogloss')
param_dist = {
"learning_rate" : [0.05,0.10,0.15,0.20,0.25,0.30],
"max_depth" : [ 3, 4, 5, 6, 8, 10, 12, 15],
"min_child_weight" : [ 1, 3, 5, 7 ],
"gamma": [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
"colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ]
}
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
RCV = RandomizedSearchCV(XGB_model, param_dist, n_iter=50, scoring='accuracy', n_jobs=-1, cv=cv, random_state=1)
XGB = RCV.fit(Train_X_std, Train_Y).best_estimator_
pred = XGB.predict(Test_X_std)
pred_prob = XGB.predict_proba(Test_X_std)
Classification_Summary(pred,pred_prob,6)
plt.bar( Train_X_std.columns,XGB.feature_importances_,)
plt.show()
<<<----------------------------------- Evaluating Extreme Gradient Boosting (XGB) ----------------------------------->>> Accuracy = 100.0% F1 Score = 100.0% Confusiton Matrix: [[34 0] [ 0 56]] Classification Report: precision recall f1-score support 0 1.00 1.00 1.00 34 1 1.00 1.00 1.00 56 accuracy 1.00 90 macro avg 1.00 1.00 1.00 90 weighted avg 1.00 1.00 1.00 90
# Building Deep Learning Classifier
# https://stackoverflow.com/questions/63375201/tensorflow-valueerror-logits-and-labels-must-have-the-same-shape-none-2-vs
def set_seed(seed=4):
np.random.seed(seed)
tf.random.set_seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)
os.environ['TF_DETERMINISTIC_OPS'] = '1'
set_seed()
DL = Sequential()
DL.add(Dense(16,input_shape=(20,),activation='relu'))
DL.add(Dense(8,activation='relu'))
DL.add(Dense(4,activation='relu'))
DL.add(Dense(2,activation='sigmoid'))
DL.compile('Adam','binary_crossentropy',metrics=['accuracy'])
DL.summary()
Model: "sequential" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= dense (Dense) (None, 16) 336 dense_1 (Dense) (None, 8) 136 dense_2 (Dense) (None, 4) 36 dense_3 (Dense) (None, 2) 10 ================================================================= Total params: 518 Trainable params: 518 Non-trainable params: 0 _________________________________________________________________
encoder = LabelEncoder()
y = pd.get_dummies(encoder.fit_transform(Train_Y)).values
DL.fit(Train_X_std.to_numpy(), y,epochs=100)
pred_prob = DL.predict(Test_X_std.to_numpy())
pred = np.argmax(pred_prob, axis=1)
Classification_Summary(pred,pred_prob,7)
[Training log, 100 epochs: loss falls from 0.6977 to 0.1523; training accuracy rises from 0.4721, reaching 1.0000 from epoch 49 onward]
<keras.callbacks.History at 0x21540f93af0>
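The fit call returns the Keras History object shown in the repr above. If that call is captured in a variable, the per-epoch metrics can be plotted instead of read from the raw log; a minimal sketch, assuming the call was written as history = DL.fit(...) and the model was compiled with metrics=['accuracy']:
# Sketch: plot the training curves stored in the Keras History object
# (assumes history = DL.fit(Train_X_std, Train_Y, epochs=100, ...) was captured).
def plot_history(history):
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
    ax1.plot(history.history['loss'], label='train loss')
    ax2.plot(history.history['accuracy'], label='train accuracy')
    ax1.set_xlabel('epoch'); ax1.set_ylabel('loss'); ax1.legend()
    ax2.set_xlabel('epoch'); ax2.set_ylabel('accuracy'); ax2.legend()
    plt.tight_layout()
    plt.show()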
<<<----------------------------------- Evaluating Deep Learning (DL) ----------------------------------->>>
Accuracy = 100.0%
F1 Score = 100.0%
Confusion Matrix:
[[34  0]
 [ 0 56]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        34
           1       1.00      1.00      1.00        56

    accuracy                           1.00        90
   macro avg       1.00      1.00      1.00        90
weighted avg       1.00      1.00      1.00        90
# Plotting the confusion matrix of all the predictive models
def plot_cm(y_true, y_pred):
    # Raw confusion matrix and per-row (per-actual-class) percentages
    cm = confusion_matrix(y_true, y_pred, labels=np.unique(y_true))
    cm_sum = np.sum(cm, axis=1, keepdims=True)
    cm_perc = cm / cm_sum.astype(float) * 100
    # Build the cell annotations: percentage plus raw count(s)
    annot = np.empty_like(cm).astype(str)
    nrows, ncols = cm.shape
    for i in range(nrows):
        for j in range(ncols):
            c = cm[i, j]
            p = cm_perc[i, j]
            if i == j:
                s = cm_sum[i][0]
                annot[i, j] = '%.1f%%\n%d/%d' % (p, c, s)
            elif c == 0:
                annot[i, j] = ''
            else:
                annot[i, j] = '%.1f%%\n%d' % (p, c)
    # Label the axes with the class names ('labels' is the globally defined class list)
    cm = pd.DataFrame(cm, index=np.unique(y_true), columns=np.unique(y_true))
    cm.columns = labels
    cm.index = labels
    cm.index.name = 'Actual'
    cm.columns.name = 'Predicted'
    sns.heatmap(cm, annot=annot, fmt='')  # cmap="GnBu"
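For reference, plot_cm can also be called directly on a single model's test-set predictions (in the notebook it is only referenced via the commented-out line further down); a minimal sketch, assuming RF is the fitted random forest and labels is the class-label list used above:
# Hypothetical direct use of plot_cm for one model's predictions
rf_pred = RF.predict(Test_X_std)
plt.figure(figsize=(5, 4))
plot_cm(Test_Y, rf_pred)
plt.title('Random Forest Classifier (RF)')
plt.show()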
def conf_mat_plot(all_models):
    plt.figure(figsize=[20, 3.5*math.ceil(len(all_models)*len(labels)/14)])
    for i in range(len(all_models)):
        # Grid layout: up to 4 plots per row for small label sets
        if len(labels) <= 4:
            plt.subplot(2, 4, i+1)
        else:
            plt.subplot(math.ceil(len(all_models)/3), 3, i+1)
        if all_models[i] is DL:
            # Keras model: predict() returns class probabilities, so take the argmax
            pred_prob = all_models[i].predict(Test_X_std.to_numpy())
            pred = np.argmax(pred_prob, axis=1)
        else:
            # scikit-learn / XGBoost models return class labels directly
            pred = all_models[i].predict(Test_X_std)
        # plot_cm(Test_Y, pred)
        sns.heatmap(confusion_matrix(Test_Y, pred), annot=True, cmap='Blues', fmt='.0f')  # vmin=0, vmax=5
        plt.title(Evaluation_Results.index[i])
    plt.tight_layout()
    plt.show()
conf_mat_plot([LR,DT,RF,NB,SVM,KNN,XGB,DL])
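One caveat about the DL branch in conf_mat_plot: taking np.argmax of the predicted probabilities only yields sensible labels when the Keras model ends in a two-unit softmax head. If the network instead ends in a single sigmoid unit, the output has to be thresholded. A small, hypothetical helper covering both cases:
# Hypothetical helper: convert Keras probability outputs to class labels,
# whether the final layer is a 2-unit softmax or a 1-unit sigmoid.
def keras_to_labels(model, X, threshold=0.5):
    prob = model.predict(X)
    if prob.ndim == 2 and prob.shape[1] > 1:
        return np.argmax(prob, axis=1)                 # softmax head
    return (prob.ravel() >= threshold).astype(int)     # sigmoid head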
# Majority-class baseline for comparison. DummyClassifier has no real
# hyperparameters, so the randomized search below is only a formality kept
# for consistency with the pipeline used for the other models.
from sklearn.dummy import DummyClassifier
dummy_model = DummyClassifier(strategy='most_frequent')
dummy_param = {}
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
RCV = RandomizedSearchCV(dummy_model, dummy_param, n_iter=50, scoring='accuracy', n_jobs=-1, cv=cv, random_state=1)
DUMMY = RCV.fit(Train_X_std, Train_Y).best_estimator_
pred = DUMMY.predict(Test_X_std)
pred_prob = DUMMY.predict_proba(Test_X_std)
Classification_Summary(pred, pred_prob, 8)
<<<----------------------------------- Evaluating DUMMY (DM) ----------------------------------->>>
Accuracy = 37.8%
F1 Score = 20.7%
Confusion Matrix:
[[34  0]
 [56  0]]
Classification Report:
              precision    recall  f1-score   support

           0       0.38      1.00      0.55        34
           1       0.00      0.00      0.00        56

    accuracy                           0.38        90
   macro avg       0.19      0.50      0.27        90
weighted avg       0.14      0.38      0.21        90
Train_Y.value_counts()
Test_Y.value_counts()
0    190
1    168
Name: Survived, dtype: int64

1    56
0    34
Name: Survived, dtype: int64
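The two distributions explain the baseline's 37.8% accuracy: the majority class in the training labels is 0 (190 vs 168), and class 0 accounts for 34 of the 90 test rows, so a most-frequent classifier scores 34/90 ≈ 0.378. A quick check:
# Sanity check on the most-frequent baseline: its test accuracy equals the
# share of the training-set majority class in the test labels (34/90 here).
majority_class = Train_Y.value_counts().idxmax()
print(majority_class, round((Test_Y == majority_class).mean(), 3))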
# Comparing all the models Scores
print('\033[1mML Algorithms Comparison'.center(130))
plt.figure(figsize=[12,8])
sns.heatmap(Evaluation_Results, annot=True, vmin=50, vmax=100, cmap='Blues', fmt='.1f')
plt.show()
ML Algorithms Comparison
[Heatmap of Evaluation_Results: Accuracy, Precision, Recall, F1-score and AUC-ROC per model, colour-scaled from 50 to 100]
Evaluation_Results
Model | Accuracy | Precision | Recall | F1-score | AUC-ROC score
---|---|---|---|---|---
Logistic Regression (LR) | 100.0 | 100.0 | 100.0 | 100.0 | 100.0
Decision Tree Classifier (DT) | 100.0 | 100.0 | 100.0 | 100.0 | 100.0
Random Forest Classifier (RF) | 100.0 | 100.0 | 100.0 | 100.0 | 100.0
Naïve Bayes Classifier (NB) | 100.0 | 100.0 | 100.0 | 100.0 | 100.0
Support Vector Machine (SVM) | 100.0 | 100.0 | 100.0 | 100.0 | 100.0
K Nearest Neighbours (KNN) | 100.0 | 100.0 | 100.0 | 100.0 | 100.0
Extreme Gradient Boosting (XGB) | 100.0 | 100.0 | 100.0 | 100.0 | 100.0
Deep Learning (DL) | 100.0 | 100.0 | 100.0 | 100.0 | 100.0
DUMMY (DM) | 37.8 | 14.3 | 37.8 | 20.7 | 50.0
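Every trained model scoring a perfect 100 on every metric, while only the baseline sits near chance level, is usually a sign of leakage or trivially separable labels rather than genuine skill. A minimal sanity check is to see whether a single raw feature already determines the target; a sketch, assuming the original CSV is re-read so the untransformed Sex column is available:
# Sketch: check whether one raw feature alone already determines the label.
raw = pd.read_csv('./tested.csv')
print(pd.crosstab(raw['Sex'], raw['Survived']))
# If, for each sex, one of the two Survived columns is (near-)zero, any model
# that sees Sex can reach near-perfect scores on this particular file.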