import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, RobustScaler
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer as Imputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn import set_config
# Show estimators as diagrams (HTML repr) when they are displayed in the notebook.
set_config(display='diagram')
# if desired, set display back to the default
# set_config(display='text')
import multiprocessing
# Report how many CPUs are available; the ColumnTransformer further down is
# configured with n_jobs=-1.
print("Multiprocessors: ", multiprocessing.cpu_count())
def set_seed(seed=1234):
    """Seed the random number generators used in this notebook.

    Seeds Python's built-in ``random`` module and NumPy's global generator,
    and exports ``PYTHONHASHSEED`` to the process environment.

    Args:
        seed: Integer seed value. Defaults to 1234.
    """
    import random  # local import so the function does not depend on other cells

    random.seed(seed)
    np.random.seed(seed)
    # tf.random.set_seed(seed)
    # NOTE: hash randomization is fixed when the interpreter starts, so this
    # assignment does not change str hashing in the running process; it is
    # only picked up by processes launched afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    # os.environ['TF_DETERMINISTIC_OPS'] = '1'
# Seed the random number generators with the default seed.
set_seed()
# Cardinality threshold: object columns with at most this many distinct values
# are one-hot encoded; columns above it are treated as high-cardinality.
MAX_OH_CARDINALITY = 15
# Prefix for data file paths; it is concatenated directly with the file name,
# so it must end with a path separator.
BASE_PATH = "./"
Multiprocessors: 4
# Load the training data and preview the first rows.
df = pd.read_csv(f"{BASE_PATH}train.csv")
df.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
# Remove the PassengerId column from df in place, then preview the result.
df.drop(['PassengerId'], axis=1, inplace=True)
df.head()
Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
# Print column dtypes and non-null counts.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 891 entries, 0 to 890 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Survived 891 non-null int64 1 Pclass 891 non-null int64 2 Name 891 non-null object 3 Sex 891 non-null object 4 Age 714 non-null float64 5 SibSp 891 non-null int64 6 Parch 891 non-null int64 7 Ticket 891 non-null object 8 Fare 891 non-null float64 9 Cabin 204 non-null object 10 Embarked 889 non-null object dtypes: float64(2), int64(4), object(5) memory usage: 76.7+ KB
# Count missing values per column once, then show only the columns that have any.
missing_per_column = df.isnull().sum()
above_0_missing = missing_per_column > 0
missing_per_column[above_0_missing]
Age 177 Cabin 687 Embarked 2 dtype: int64
# Number of fully duplicated rows.
df.duplicated().sum()
0
# Store passenger class as object dtype so it is picked up with the
# categorical (non-numeric) columns below.
df = df.astype({'Pclass': object})

# Separate the features from the target column.
X = df.drop("Survived", axis=1)
y = df.Survived

# Hold out 20% of the rows for evaluation. random_state is pinned so that
# re-running this cell always produces the same split, instead of depending on
# how much of the global NumPy random state earlier cells have consumed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1234
)

# Numeric feature names.
num_cols = X.select_dtypes(include='number').columns.tolist()
print(num_cols,'\n')
print(f'No of numeric_features: {len(num_cols)} \n')
print(', '.join(num_cols),'\n')

# Everything that is not numeric is treated as categorical.
cat_cols = X.select_dtypes(exclude='number').columns.tolist()
print(cat_cols,'\n')
print(f'No of categorical_features: {len(cat_cols)} \n')
print(', '.join(cat_cols),'\n')
['Age', 'SibSp', 'Parch', 'Fare'] No of numeric_features: 4 Age, SibSp, Parch, Fare ['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'] No of categorical_features: 6 Pclass, Name, Sex, Ticket, Cabin, Embarked
# Distinct-value count of every categorical column, smallest first.
print(df[cat_cols].nunique().sort_values())
Sex 2 Pclass 3 Embarked 3 Cabin 147 Ticket 681 Name 891 dtype: int64
from itertools import compress
# Keep the categorical columns whose distinct-value count is below the threshold.
# NOTE(review): this uses a strict `<`, while select_oh_cat_cols uses `<=`;
# the results only differ for a column with exactly MAX_OH_CARDINALITY values.
list(compress(cat_cols, df[cat_cols].nunique().values < MAX_OH_CARDINALITY))
['Pclass', 'Sex', 'Embarked']
# use loop
# Select the low-cardinality columns once, then print each column's distinct values.
low_card_cols = list(compress(cat_cols, df[cat_cols].nunique().values < MAX_OH_CARDINALITY))
for col in low_card_cols:
    print(col, ":", df[col].unique())
Pclass : [3 1 2] Sex : ['male' 'female'] Embarked : ['S' 'C' 'Q' nan]
# no loop, use apply
# apply() with axis=0 passes each selected column to the lambda, which returns
# that column's distinct values.
df[list(compress(cat_cols, df[cat_cols].nunique().values < MAX_OH_CARDINALITY))].apply(lambda col: col.unique(), axis=0)
Pclass [3, 1, 2] Sex [male, female] Embarked [S, C, Q, nan] dtype: object
def select_oh_cat_cols(df, max_cardinality=None):
    """Return the object-dtype columns of *df* suitable for one-hot encoding.

    A column qualifies when its number of distinct non-null values is at most
    ``max_cardinality``.

    Args:
        df: DataFrame to inspect.
        max_cardinality: Inclusive upper bound on distinct values. When
            omitted, the module-level ``MAX_OH_CARDINALITY`` is used.

    Returns:
        List of qualifying column names, in the order they appear in *df*.
    """
    if max_cardinality is None:
        max_cardinality = MAX_OH_CARDINALITY
    unique_counts = df.select_dtypes(['object']).nunique()
    return unique_counts[unique_counts <= max_cardinality].index.tolist()
# Low-cardinality categorical columns; these are the ones passed to the
# one-hot encoding step of the ColumnTransformer.
oh_cat_cols = select_oh_cat_cols(df[cat_cols])
print(oh_cat_cols,'\n')
print(f'No of one-hot cat columns: {len(oh_cat_cols)} \n')
print(', '.join(oh_cat_cols))
['Pclass', 'Sex', 'Embarked'] No of one-hot cat columns: 3 Pclass, Sex, Embarked
def select_hc_cat_cols(df, max_cardinality=None):
    """Return the object-dtype columns of *df* with high cardinality.

    A column qualifies when its number of distinct non-null values is strictly
    greater than ``max_cardinality``.

    Args:
        df: DataFrame to inspect.
        max_cardinality: Exclusive lower bound on distinct values. When
            omitted, the module-level ``MAX_OH_CARDINALITY`` is used.

    Returns:
        List of qualifying column names, in the order they appear in *df*.
    """
    if max_cardinality is None:
        max_cardinality = MAX_OH_CARDINALITY
    unique_counts = df.select_dtypes(['object']).nunique()
    return unique_counts[unique_counts > max_cardinality].index.tolist()
# Categorical columns with more than MAX_OH_CARDINALITY distinct values.
hc_cat_cols = select_hc_cat_cols(df[cat_cols])
print(hc_cat_cols,'\n')
print(f'No of high-cardinality cat cols: {len(hc_cat_cols)} \n')
print(', '.join(hc_cat_cols))
['Name', 'Ticket', 'Cabin'] No of high-cardinality cat cols: 3 Name, Ticket, Cabin
# When the Pipeline's fit method runs, each transformer's 'fit' and then 'transform' methods are called in sequence.
# When the Pipeline's predict method runs, only each transformer's 'transform' method is called.
class CabinFeatureTransformer(BaseEstimator, TransformerMixin):
    """One-hot encode the deck letter taken from the ``Cabin`` column.

    Every cabin value is reduced to its first character (the deck letter);
    missing or empty cabins become ``'U'`` (unknown). The deck letters seen
    during ``fit`` define the output columns, so ``transform`` always returns
    the same ``Cabin_<letter>`` columns in the same order. A deck letter that
    was not seen during ``fit`` is encoded as a row of zeros.

    Neither ``fit`` nor ``transform`` modifies the frame passed in.

    Attributes set by ``fit``:
        cabin_categories_: sorted list of the deck letters seen while fitting.
        cabin_columns: ``pandas.Index`` with the names of the dummy columns.
    """

    def __init__(self):
        pass

    @staticmethod
    def _deck_letters(x):
        """Return a new Series holding the deck letter for each row of ``x['Cabin']``."""
        # replacing missing cabins with U (for Unknown)
        cabins = x['Cabin'].fillna('U')
        # str() tolerates non-string entries; ``or 'U'`` covers empty strings,
        # for which indexing with [0] would raise.
        return cabins.map(lambda c: str(c)[:1] or 'U')

    def fit(self, x, y=None):
        """Record the deck letters present in ``x['Cabin']``.

        Args:
            x: DataFrame containing a ``Cabin`` column.
            y: Ignored; accepted for scikit-learn API compatibility.

        Returns:
            self
        """
        self.cabin_categories_ = sorted(self._deck_letters(x).unique())
        self.cabin_columns = pd.Index(
            [f'Cabin_{deck}' for deck in self.cabin_categories_]
        )
        return self

    def transform(self, x):
        """Replace ``Cabin`` with one dummy column per deck letter learned in ``fit``.

        Args:
            x: DataFrame containing a ``Cabin`` column.

        Returns:
            A new DataFrame: the columns of ``x`` other than ``Cabin``,
            followed by the ``Cabin_<letter>`` dummy columns.
        """
        # A categorical with the fitted categories makes get_dummies emit every
        # fitted column (even if absent from this batch) with a single dtype,
        # and turns unseen letters into NaN, i.e. an all-zero row.
        decks = self._deck_letters(x).astype(
            pd.CategoricalDtype(categories=self.cabin_categories_)
        )
        cabin_dummies = pd.get_dummies(decks, prefix='Cabin')
        return pd.concat([x.drop(columns='Cabin'), cabin_dummies], axis=1)
# When the Pipeline's fit method runs, each transformer's 'fit' and then 'transform' methods are called in sequence.
# When the Pipeline's predict method runs, only each transformer's 'transform' method is called.
class NameFeatureTransformer(BaseEstimator, TransformerMixin):
    """One-hot encode the title (honorific) extracted from the ``Name`` column.

    The title is the text between the first comma and the following period,
    e.g. ``"Braund, Mr. Owen Harris"`` gives ``"Mr"``. Raw titles are grouped
    through ``TITLE_DICTIONARY``, and the output always contains one column
    per entry of ``TITLE_CATEGORIES``, in that order. The number of output
    features therefore does not depend on which titles occur in a given
    batch. Names that cannot be parsed, or whose title is not in the
    dictionary, are encoded as a row of zeros.

    ``transform`` does not modify the frame passed in.
    """

    # Raw title -> consolidated title group.
    TITLE_DICTIONARY = {
        "Capt": "Officer", "Col": "Officer", "Major": "Officer", "Jonkheer": "Royalty",
        "Don": "Royalty", "Sir": "Royalty", "Dr": "Officer", "Rev": "Officer",
        "the Countess": "Royalty", "Mme": "Mrs", "Mlle": "Miss", "Ms": "Mrs",
        "Mr": "Mr", "Mrs": "Mrs", "Miss": "Miss", "Master": "Master",
        "Lady": "Royalty",
    }
    # Fixed output categories, sorted so the column order matches what
    # get_dummies produces when every group is present.
    TITLE_CATEGORIES = sorted(set(TITLE_DICTIONARY.values()))

    def __init__(self):
        pass

    def fit(self, x, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    @staticmethod
    def _extract_title(name):
        """Return the raw title in *name*, or ``None`` if it cannot be parsed."""
        if not isinstance(name, str):
            return None
        parts = name.split(',')
        if len(parts) < 2:
            return None
        return parts[1].split('.')[0].strip()

    def transform(self, x):
        """Replace ``Name`` with ``Title_<group>`` dummy columns.

        Args:
            x: DataFrame containing a ``Name`` column.

        Returns:
            NumPy array: the columns of ``x`` other than ``Name``, followed by
            one dummy column per entry of ``TITLE_CATEGORIES``.
        """
        titles = x['Name'].map(self._extract_title).map(self.TITLE_DICTIONARY)
        # Fixed categories guarantee the same columns for every batch.
        titles = titles.astype(pd.CategoricalDtype(categories=self.TITLE_CATEGORIES))
        titles_dummies = pd.get_dummies(titles, prefix='Title')
        remaining = x.drop(columns='Name')
        return pd.concat([remaining, titles_dummies], axis=1).values
# Numeric columns: fill gaps with the column median, then standardize.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Low-cardinality categorical columns: replace missing values with the
# constant 'missing', then one-hot encode. Categories not seen during fit are
# ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each group of columns to its own preprocessing branch.
column_branches = [
    ('numeric_data_preprocessing', numeric_transformer, num_cols),
    ('categorical_data_preprocessing', categorical_transformer, oh_cat_cols),
    ('cabin_data_preprocessing', CabinFeatureTransformer(), ['Cabin']),
    ('name_data_preprocessing', NameFeatureTransformer(), ['Name']),
]
transformer = ColumnTransformer(
    transformers=column_branches,
    n_jobs=-1,
    remainder="drop",  # drop ['Ticket']
    verbose=False,
)

# Preprocessing followed by a random forest with default hyper-parameters.
pipeline_steps = [
    ('transformer', transformer),
    ('rf_estimator', RandomForestClassifier()),
]
final_pipeline = Pipeline(steps=pipeline_steps)
final_pipeline.fit(X_train, y_train)
Pipeline(steps=[('transformer', ColumnTransformer(n_jobs=-1, transformers=[('numeric_data_preprocessing', Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())]), ['Age', 'SibSp', 'Parch', 'Fare']), ('categorical_data_preprocessing', Pipeline(steps=[('imputer', SimpleImputer(fill_value='missing', strategy='constant')), ('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['Pclass', 'Sex', 'Embarked']), ('cabin_data_preprocessing', CabinFeatureTransformer(), ['Cabin']), ('name_data_preprocessing', NameFeatureTransformer(), ['Name'])])), ('rf_estimator', RandomForestClassifier())])Please rerun this cell to show the HTML repr or trust the notebook.
Pipeline(steps=[('transformer', ColumnTransformer(n_jobs=-1, transformers=[('numeric_data_preprocessing', Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())]), ['Age', 'SibSp', 'Parch', 'Fare']), ('categorical_data_preprocessing', Pipeline(steps=[('imputer', SimpleImputer(fill_value='missing', strategy='constant')), ('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['Pclass', 'Sex', 'Embarked']), ('cabin_data_preprocessing', CabinFeatureTransformer(), ['Cabin']), ('name_data_preprocessing', NameFeatureTransformer(), ['Name'])])), ('rf_estimator', RandomForestClassifier())])
ColumnTransformer(n_jobs=-1, transformers=[('numeric_data_preprocessing', Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())]), ['Age', 'SibSp', 'Parch', 'Fare']), ('categorical_data_preprocessing', Pipeline(steps=[('imputer', SimpleImputer(fill_value='missing', strategy='constant')), ('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['Pclass', 'Sex', 'Embarked']), ('cabin_data_preprocessing', CabinFeatureTransformer(), ['Cabin']), ('name_data_preprocessing', NameFeatureTransformer(), ['Name'])])
['Age', 'SibSp', 'Parch', 'Fare']
SimpleImputer(strategy='median')
StandardScaler()
['Pclass', 'Sex', 'Embarked']
SimpleImputer(fill_value='missing', strategy='constant')
OneHotEncoder(handle_unknown='ignore')
['Cabin']
CabinFeatureTransformer()
['Name']
NameFeatureTransformer()
RandomForestClassifier()
# Score the fitted pipeline on the held-out rows.
y_pred = final_pipeline.predict(X_test)
print("Accuracy Score: ", accuracy_score(y_test, y_pred))
# The remaining metrics are averaged over classes, weighted by class support.
weighted_metrics = (
    ("F1 Score: ", f1_score),
    ("Precision Score: ", precision_score),
    ("Recall Score: ", recall_score),
)
for label, metric in weighted_metrics:
    print(label, metric(y_test, y_pred, average='weighted'))
Accuracy Score: 0.8100558659217877 F1 Score: 0.8083472799609296 Precision Score: 0.8085529511780423 Recall Score: 0.8100558659217877
import pickle

# Persist the fitted pipeline (preprocessing + model) to disk.
with open('Titanic_pipeline.pkl', 'wb') as f:
    pickle.dump(final_pipeline, f)

# Load it back. The with-block closes the file handle once loading finishes.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open('Titanic_pipeline.pkl', 'rb') as f:
    rf_pickle = pickle.load(f)

# Re-score with the reloaded pipeline to confirm the round trip.
y_pred = rf_pickle.predict(X_test)
print("Accuracy Score: ", accuracy_score(y_test, y_pred))
print("F1 Score: ", f1_score(y_test, y_pred, average='weighted'))
print("Precision Score: ", precision_score(y_test, y_pred, average='weighted'))
print("Recall Score: ", recall_score(y_test, y_pred, average='weighted'))
Accuracy Score: 0.8100558659217877 F1 Score: 0.8083472799609296 Precision Score: 0.8085529511780423 Recall Score: 0.8100558659217877