Let's start pipelines ...¶

drawing

First we import all the necessary libraries and set a base file path to the data sets.¶

In [ ]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, RobustScaler

from sklearn import metrics
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer as Imputer
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn import set_config
set_config(display='diagram')
# if desired, set display back to the default
# set_config(display='text')

import multiprocessing
print("Multiprocessors: ", multiprocessing.cpu_count())

def set_seed(seed=1234):
    np.random.seed(seed)
    # tf.random.set_seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    # os.environ['TF_DETERMINISTIC_OPS'] = '1'
set_seed()

MAX_OH_CARDINALITY = 15
BASE_PATH = "./"
Multiprocessors:  4

Load data¶

In [ ]:
df = pd.read_csv(f"{BASE_PATH}train.csv")
df.head()
Out[ ]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S

Data cleaning¶

In [ ]:
df.drop(['PassengerId'], axis=1, inplace=True)
df.head()
Out[ ]:
Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
In [ ]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 76.7+ KB
In [ ]:
above_0_missing = df.isnull().sum() > 0
df.isnull().sum()[above_0_missing]
Out[ ]:
Age         177
Cabin       687
Embarked      2
dtype: int64
In [ ]:
df.duplicated().sum()
Out[ ]:
0

Construct train data and test data sets¶

In [ ]:
df = df.astype({'Pclass':object})

X = df.drop("Survived", axis=1)
y = df.Survived
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

Seperate numeric and categorical data¶

In [ ]:
num_cols = X.select_dtypes(include='number').columns.tolist()
print(num_cols,'\n')
print(f'No of numeric_features: {len(num_cols)} \n')
print(', '.join(num_cols),'\n')
cat_cols = X.select_dtypes(exclude='number').columns.tolist()
print(cat_cols,'\n')
print(f'No of categorical_features: {len(cat_cols)} \n')
print(', '.join(cat_cols),'\n')
['Age', 'SibSp', 'Parch', 'Fare'] 

No of numeric_features: 4 

Age, SibSp, Parch, Fare 

['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'] 

No of categorical_features: 6 

Pclass, Name, Sex, Ticket, Cabin, Embarked 

Check the cardinality of categorical data¶

In [ ]:
print(df[cat_cols].nunique().sort_values())
Sex           2
Pclass        3
Embarked      3
Cabin       147
Ticket      681
Name        891
dtype: int64
In [ ]:
from itertools import compress
list(compress(cat_cols, df[cat_cols].nunique().values < MAX_OH_CARDINALITY))
Out[ ]:
['Pclass', 'Sex', 'Embarked']
In [ ]:
# use loop
for i in range(0, len(list(compress(cat_cols, df[cat_cols].nunique().values < MAX_OH_CARDINALITY)))):
    print(list(compress(cat_cols, df[cat_cols].nunique().values < MAX_OH_CARDINALITY))[i], ":", df[list(compress(cat_cols, df[cat_cols].nunique().values < MAX_OH_CARDINALITY))[i]].unique())
Pclass : [3 1 2]
Sex : ['male' 'female']
Embarked : ['S' 'C' 'Q' nan]
In [ ]:
# no loop, use apply
df[list(compress(cat_cols, df[cat_cols].nunique().values < MAX_OH_CARDINALITY))].apply(lambda col: col.unique(), axis=0)
Out[ ]:
Pclass           [3, 1, 2]
Sex         [male, female]
Embarked    [S, C, Q, nan]
dtype: object

Handle low cardinality of categorical data¶

In [ ]:
def select_oh_cat_cols(df):
    
    oh_cat_cols =\
        df\
        .select_dtypes(['object'])\
        .apply(lambda col: col.nunique())\
        .loc[lambda x: x <= MAX_OH_CARDINALITY]\
        .index\
        .tolist()
        
    return oh_cat_cols

oh_cat_cols = select_oh_cat_cols(df[cat_cols])
print(oh_cat_cols,'\n')

print(f'No of one-hot cat columns: {len(oh_cat_cols)} \n')
print(', '.join(oh_cat_cols))
['Pclass', 'Sex', 'Embarked'] 

No of one-hot cat columns: 3 

Pclass, Sex, Embarked

Handle high cardinaltiy of categorical data¶

In [ ]:
def select_hc_cat_cols(df):
    
    hc_cat_cols =\
        df\
        .select_dtypes(['object'])\
        .apply(lambda col: col.nunique())\
        .loc[lambda x: x > MAX_OH_CARDINALITY]\
        .index\
        .tolist()
        
    return hc_cat_cols


hc_cat_cols = select_hc_cat_cols(df[cat_cols])
print(hc_cat_cols,'\n')

print(f'No of high-cardinality cat cols: {len(hc_cat_cols)} \n')
print(', '.join(hc_cat_cols))
['Name', 'Ticket', 'Cabin'] 

No of high-cardinality cat cols: 3 

Name, Ticket, Cabin

Construct customer transformers for some features¶

In [ ]:
# On execution of Pipeline’s fit method, Transformer’s ‘fit’ and ‘transform’ method will be called sequentially.
# Where, on execution of Pipeline’s predict method, only Transformer’s transform method will be called.
class CabinFeatureTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        # print('in the CabinFeatureTransformer init method: ')
        pass
        
    def fit(self, x, y=None):
        x.Cabin.fillna('U', inplace=True)
        x['Cabin'] = x['Cabin'].map(lambda c: c[0])
        
        cabin_dummies = pd.get_dummies(x['Cabin'], prefix='Cabin')    
        self.cabin_columns=  cabin_dummies.columns
        return self

    def transform(self, x):
        # replacing missing cabins with U (for Uknown)
        x.Cabin.fillna('U', inplace=True)
    
        # mapping each Cabin value with the cabin letter
        x['Cabin'] = x['Cabin'].map(lambda c: c[0])
        
        cabin_dummies = pd.get_dummies(x['Cabin'], prefix='Cabin')
        # https://www.geeksforgeeks.org/reindexing-in-pandas-dataframe/ 
        cabin_dummies = cabin_dummies.reindex(columns = self.cabin_columns, fill_value=0)
        
        x = pd.concat([x, cabin_dummies], axis=1)

        x.drop('Cabin', axis=1, inplace=True)
    
        return x
In [ ]:
# On execution of Pipeline’s fit method, Transformer’s ‘fit’ and ‘transform’ method will be called sequentially.
# Where, on execution of Pipeline’s predict method, only Transformer’s transform method will be called.
class NameFeatureTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        # print('in the NameFeatureTransformer Init method: ')
        pass
        
    def fit(self, x, y=None):
        return self

    def transform(self, x):
        Title_Dictionary = {
                "Capt": "Officer", "Col": "Officer", "Major": "Officer","Jonkheer": "Royalty",
                "Don": "Royalty","Sir" : "Royalty","Dr": "Officer","Rev": "Officer","the Countess":"Royalty",
                "Mme": "Mrs", "Mlle": "Miss", "Ms": "Mrs", "Mr" : "Mr", "Mrs" : "Mrs", "Miss" : "Miss",
                "Master" : "Master", "Lady" : "Royalty"}
        
        x['Title'] = x['Name'].map(lambda name:name.split(',')[1].split('.')[0].strip())
        x['Title'] = x.Title.map(Title_Dictionary)
        
        x.drop('Name', axis=1, inplace=True)
    
        titles_dummies = pd.get_dummies(x['Title'], prefix='Title')
        x = pd.concat([x, titles_dummies], axis=1)
    
        x.drop('Title', axis=1, inplace=True)
        return x.values

Construct transformers for numeric and categorical data¶

In [ ]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])
In [ ]:
transformer = ColumnTransformer(
    transformers=[
        ('numeric_data_preprocessing', numeric_transformer, num_cols),
        ('categorical_data_preprocessing', categorical_transformer, oh_cat_cols),
        ('cabin_data_preprocessing', CabinFeatureTransformer(), ['Cabin']),
        ('name_data_preprocessing', NameFeatureTransformer(), ['Name'])
    ],
    n_jobs=-1,
    remainder="drop", # drop ['Ticket']
    verbose=False
    )

Construct pipelines with transformers and estimator¶

In [ ]:
final_pipeline = Pipeline(steps=[('transformer', transformer),
                      ('rf_estimator', RandomForestClassifier())])

Train the model with the final pipeline¶

In [ ]:
final_pipeline.fit(X_train, y_train)
Out[ ]:
Pipeline(steps=[('transformer',
                 ColumnTransformer(n_jobs=-1,
                                   transformers=[('numeric_data_preprocessing',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['Age', 'SibSp', 'Parch',
                                                   'Fare']),
                                                 ('categorical_data_preprocessing',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['Pclass', 'Sex',
                                                   'Embarked']),
                                                 ('cabin_data_preprocessing',
                                                  CabinFeatureTransformer(),
                                                  ['Cabin']),
                                                 ('name_data_preprocessing',
                                                  NameFeatureTransformer(),
                                                  ['Name'])])),
                ('rf_estimator', RandomForestClassifier())])
Please rerun this cell to show the HTML repr or trust the notebook.
Pipeline(steps=[('transformer',
                 ColumnTransformer(n_jobs=-1,
                                   transformers=[('numeric_data_preprocessing',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['Age', 'SibSp', 'Parch',
                                                   'Fare']),
                                                 ('categorical_data_preprocessing',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['Pclass', 'Sex',
                                                   'Embarked']),
                                                 ('cabin_data_preprocessing',
                                                  CabinFeatureTransformer(),
                                                  ['Cabin']),
                                                 ('name_data_preprocessing',
                                                  NameFeatureTransformer(),
                                                  ['Name'])])),
                ('rf_estimator', RandomForestClassifier())])
ColumnTransformer(n_jobs=-1,
                  transformers=[('numeric_data_preprocessing',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('scaler', StandardScaler())]),
                                 ['Age', 'SibSp', 'Parch', 'Fare']),
                                ('categorical_data_preprocessing',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(fill_value='missing',
                                                                strategy='constant')),
                                                 ('onehot',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 ['Pclass', 'Sex', 'Embarked']),
                                ('cabin_data_preprocessing',
                                 CabinFeatureTransformer(), ['Cabin']),
                                ('name_data_preprocessing',
                                 NameFeatureTransformer(), ['Name'])])
['Age', 'SibSp', 'Parch', 'Fare']
SimpleImputer(strategy='median')
StandardScaler()
['Pclass', 'Sex', 'Embarked']
SimpleImputer(fill_value='missing', strategy='constant')
OneHotEncoder(handle_unknown='ignore')
['Cabin']
CabinFeatureTransformer()
['Name']
NameFeatureTransformer()
RandomForestClassifier()

Make predictions based on the pipelined model¶

In [ ]:
y_pred = final_pipeline.predict(X_test)
In [ ]:
print("Accuracy Score: ", accuracy_score(y_test, y_pred))
print("F1 Score: ", f1_score(y_test, y_pred, average='weighted'))
print("Precision Score: ", precision_score(y_test, y_pred, average='weighted'))
print("Recall Score: ", recall_score(y_test, y_pred, average='weighted'))
Accuracy Score:  0.8100558659217877
F1 Score:  0.8083472799609296
Precision Score:  0.8085529511780423
Recall Score:  0.8100558659217877

Store the pipelined model for future use¶

In [ ]:
import pickle
with open('Titanic_pipeline.pkl', 'wb') as f:
    pickle.dump(final_pipeline, f)

Reuse the stored model for coming data sets¶

In [ ]:
rf_pickle = pickle.load(open('Titanic_pipeline.pkl','rb'))
y_pred = rf_pickle.predict(X_test)

print("Accuracy Score: ", accuracy_score(y_test, y_pred))
print("F1 Score: ", f1_score(y_test, y_pred, average='weighted'))
print("Precision Score: ", precision_score(y_test, y_pred, average='weighted'))
print("Recall Score: ", recall_score(y_test, y_pred, average='weighted'))
Accuracy Score:  0.8100558659217877
F1 Score:  0.8083472799609296
Precision Score:  0.8085529511780423
Recall Score:  0.8100558659217877