Let's start regression ...¶


★ House Price Dataset ★

Description:¶

The Ames Housing Data Set is an interesting and modern alternative to the famous Boston Housing Data Set from 1978. It was compiled in 2011 by Dean De Cock, Professor of Statistics at Truman State University.

The dataset contains data on individual residential properties sold in Ames, Iowa from 2006 to 2010. The full dataset consists of 2930 samples with 80 features.
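
To get a first impression of the data, a quick sanity check on the Kaggle competition split can confirm the sample count and the mix of numeric and categorical columns (a throwaway sketch, assuming train.csv lies in the working directory):

In [ ]:
import pandas as pd

# a quick look at the training split (assumption: train.csv is in the working directory)
peek = pd.read_csv("./train.csv")
print(peek.shape)                      # number of rows and columns in the training split
print(peek.dtypes.value_counts())      # numeric vs. object (categorical) columns
print(peek["SalePrice"].describe())    # distribution of the target variable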

Meet and Greet Data

1. 80 variables in total: 23 nominal, 23 ordinal, 14 discrete and 20 continuous
2. 20 continuous: relate to various area dimensions (e.g. lot size, total dwelling square footage etc.)
3. 14 discrete: quantify number of items occurring in the house (e.g. kitchens, baths, etc.)
4. 46 categorical ranging from 2 to 28 classes (e.g. smallest STREET, largest NEIGHBORHOOD)

First, we import all the necessary libraries and set a base file path to the datasets.¶

In [ ]:
import numpy as np 
import pandas as pd 
import os

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.svm import LinearSVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

import xgboost as xgb
import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')

import tensorflow as tf
# use tf.keras consistently: mixing the standalone keras package with tensorflow.keras can cause subtle errors
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from tensorflow_addons.metrics import RSquare
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import set_config
set_config(display='diagram')
# if desired, set display back to the default
# set_config(display='text')

def set_seed(seed=1234):
    np.random.seed(seed)
    tf.random.set_seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    os.environ['TF_DETERMINISTIC_OPS'] = '1'
set_seed()

BASE_PATH = "./"

Build custom regressor and assemble regressors¶

In [ ]:
def build_model():
    model = Sequential()
    model.add(Dense(1024,input_shape=(288,),activation='relu'))
    model.add(layers.Dropout(rate=0.3))# apply 30% dropout to the next layer
    model.add(Dense(8,activation='relu'))
    model.add(Dense(4,activation='relu'))
    model.add(Dense(1))
    model.compile(loss='mse', optimizer='adam',metrics=[RSquare(), tf.keras.metrics.RootMeanSquaredError()])
    return model
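
Note that input_shape=(288,) is hard-coded to the number of columns the preprocessing pipeline happens to produce (numeric columns plus all one-hot-encoded categories). If the feature engineering changes, that number changes too. A small sketch of a more flexible variant (the name build_model_flexible is only illustrative and not used below):

In [ ]:
# a sketch, not used below: take the input dimension as a parameter instead of hard-coding 288
def build_model_flexible(n_features):
    model = Sequential()
    model.add(Dense(1024, input_shape=(n_features,), activation='relu'))
    model.add(layers.Dropout(rate=0.3))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(4, activation='relu'))
    model.add(Dense(1))
    model.compile(loss='mse', optimizer='adam',
                  metrics=[RSquare(), tf.keras.metrics.RootMeanSquaredError()])
    return model

# after fitting the preprocessor built in get_pipeline below, the dimension can be read off, e.g.:
# n_features = preprocessor.fit_transform(X).shape[1]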
In [ ]:
# comment out any regressors that you don't want to use
# and remove the corresponding entries from clf_names (see the sanity check after this cell)
classifiers = [
               DummyRegressor(strategy='median'),
               LinearRegression(n_jobs=-1), 
               Ridge(alpha=0.003, max_iter=30), 
               Lasso(alpha=.0005), 
               ElasticNet(alpha=0.0005, l1_ratio=.9),
               KernelRidge(alpha=0.6, kernel="polynomial", degree=2, coef0=2.5),
               SGDRegressor(),
               SVR(kernel="linear"),
               LinearSVR(),
               RandomForestRegressor(n_jobs=-1, n_estimators=350, 
                                     max_depth=12, random_state=1),
               GradientBoostingRegressor(n_estimators=500, max_depth=2),
               lgb.LGBMRegressor(n_jobs=-1, max_depth=2, n_estimators=1000, 
                                 learning_rate=0.05),
               xgb.XGBRegressor(objective="reg:squarederror", n_jobs=-1, 
                                max_depth=2, n_estimators=1500, learning_rate=0.075),
               KerasRegressor(build_fn=build_model, epochs=500, verbose=0),
]

clf_names = [
            "dummy", 
            "linear", 
            "ridge",
            "lasso",
            "elastic",
            "kernlrdg",
            "sgdreg",
            "svr",
            "linearsvr",
            "randomforest", 
            "gbm", 
            "lgbm", 
            "xgboost",
            "dl"
]
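
Since the two lists are matched purely by position, it is easy to let them drift apart while commenting entries in and out. A small sanity check helps (a sketch; the regressors dict is only an illustration of an alternative):

In [ ]:
# the lists are matched positionally, so they must always have the same length
assert len(classifiers) == len(clf_names), "classifiers and clf_names are out of sync"

# an alternative would be a single dict as the source of truth, e.g.:
# regressors = dict(zip(clf_names, classifiers))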

Encapsulate all our feature cleaning and engineering¶

In [ ]:
def clean_data(data, is_train_data=True):
    # add your code for data cleaning and feature engineering here
    # e.g. create a new feature from existing ones
    data['TotalSF'] = data['TotalBsmtSF'] + data['1stFlrSF'] + data['2ndFlrSF']

    # add here the code that should only be applied to the training data and not to the test set
    # e.g. removing outliers from the training data works...
    # ...but you cannot remove samples from your test set
    if is_train_data:
        data = data[data.GrLivArea < 4000]
        
    return data
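
As another example of a feature that could be engineered here (a sketch only, not used in the results shown below), the four bathroom columns of the Ames data can be combined into a single count:

In [ ]:
# a sketch of an additional engineered feature (not used in the results below);
# half baths are weighted with 0.5, and all four columns exist in the Ames/Kaggle data
def add_total_bath(data):
    data['TotalBath'] = (data['FullBath'] + 0.5 * data['HalfBath'] +
                         data['BsmtFullBath'] + 0.5 * data['BsmtHalfBath'])
    return data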

Prepare our data for the pipeline¶

In [ ]:
def prepare_data(df, is_train_data=True):
    
    # split data into numerical & categorical columns in order to process them separately in the pipeline
    numerical   = df.select_dtypes("number").copy()
    categorical = df.select_dtypes("object").copy()
    
    # for training data only:
    # convert SalePrice to log values and drop the "Id" and "SalePrice" columns
    if is_train_data:
        SalePrice = numerical.SalePrice
        y = np.log1p(SalePrice)
        numerical.drop(["Id", "SalePrice"], axis=1, inplace=True)
        
    # for the test data: just drop "Id" and set "y" to None
    else:
        numerical.drop(["Id"], axis=1, inplace=True)
        y = None
    
    # concatenate numerical and categorical data to X (our final training data)
    X = pd.concat([numerical, categorical], axis=1)
    
    # in addition to X and y, return the separated column lists so the pipeline can process them separately
    return X, y, numerical.columns, categorical.columns

Create the pipeline¶

In [ ]:
class DLTransformer(BaseEstimator, TransformerMixin):
    """Densify the sparse matrix produced by the OneHotEncoder step,
    since the Keras model cannot consume scipy sparse matrices directly."""

    def fit(self, X, y=None):
        # stateless transformer: there is nothing to learn
        return self

    def transform(self, X, y=None):
        # copy to avoid changing the original data, then convert to a dense array
        X_ = X.copy()
        return X_.toarray()
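
The same effect can be achieved without a custom transformer class, either by letting the encoder emit dense output directly or by densifying inside the pipeline with a FunctionTransformer. A sketch (note that the dense-output keyword of OneHotEncoder depends on the scikit-learn version):

In [ ]:
# two alternatives to DLTransformer (sketch only)
from sklearn.preprocessing import FunctionTransformer

# 1) let the encoder return a dense array in the first place
#    (the keyword is sparse=False in older scikit-learn versions, sparse_output=False from 1.2 on)
# onehot_dense = OneHotEncoder(handle_unknown='ignore', sparse=False)

# 2) densify inside the pipeline with a FunctionTransformer
to_dense = FunctionTransformer(lambda x: x.toarray(), accept_sparse=True)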
In [ ]:
def get_pipeline(clf_name, classifier, num_cols, cat_cols):
    # the numeric transformer gets the numerical data according to num_cols
    # the first step is the imputer, which replaces all missing values with the mean
    # in the second step all numerical data gets scaled (RobustScaler for the Keras model, StandardScaler for the rest)
    if clf_name == 'dl':
        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', RobustScaler())])      
    else:      
        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())])
    
    # the categorical transformer gets all categorical data according to cat_cols
    # again, the first step imputes missing values and the second one-hot encodes the categoricals
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])
    
    # the column transformer applies the numerical and categorical sub-pipelines to their respective columns
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, num_cols),
            ('cat', categorical_transformer, cat_cols)])
    
    # return the whole pipeline with the regressor provided in the function call;
    # the Keras model additionally needs the DLTransformer step to densify the sparse encoder output
    if clf_name == 'dl':
        return Pipeline(steps=[('preprocessor', preprocessor),
                               ('dltransform', DLTransformer()),
                               ('classifier', classifier)])
    return Pipeline(steps=[('preprocessor', preprocessor), ('classifier', classifier)])
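
Because set_config(display='diagram') is active, displaying one of these pipelines in a notebook renders it as an interactive diagram. A small usage sketch (the column names are real Ames columns but chosen arbitrarily here):

In [ ]:
# build and display a single pipeline just to inspect its structure
demo_pipe = get_pipeline('ridge', Ridge(alpha=0.003),
                         num_cols=['LotArea', 'GrLivArea'], cat_cols=['Neighborhood'])
display(demo_pipe)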

Score the models with cross-validation¶

In [ ]:
def score_models(df):
    # retrieve X, y and the separate column names
    X, y, num_cols, cat_cols = prepare_data(df)
    
    # since we converted SalePrice to log values, we score with neg_mean_squared_error
    # (rather than neg_mean_squared_log_error) and additionally report R2
    scoring_metric = ("r2", "neg_mean_squared_error")
    scores = []
    
    for clf_name, classifier in zip(clf_names, classifiers):
        # create a pipeline for each classifier
        clf = get_pipeline(clf_name, classifier, num_cols, cat_cols)
        # set up a KFold with 3 splits to get more robust scores;
        # increase to 5 or 10 for more precise estimates of the model scores
        kfold = KFold(n_splits=3, shuffle=True, random_state=1)  
        if clf_name == 'dl':
            # note: the Keras model is fitted once on the full training data and scored in-sample,
            # so unlike the other models its scores below are NOT cross-validated (and thus optimistic)
            clf.fit(X, y)  # Keras callbacks could be passed to fit() as a list here
            metric = RSquare()
            metric.update_state(y, clf.predict(X))
            k1 = metric.result().numpy()
            metric = tf.keras.metrics.RootMeanSquaredError()
            metric.update_state(y, clf.predict(X))
            k2 = metric.result().numpy()
            scores.append([clf_name, k1, k2])
        else:
            # cross-validate and collect R2 and RMSE (the square root of the negated MSE)
            results = cross_validate(clf, X, y, cv=kfold, scoring=scoring_metric)
            scores.append([clf_name,
                           results["test_r2"].mean(),
                           np.sqrt(-results["test_neg_mean_squared_error"]).mean()])

    scores = pd.DataFrame(scores, columns=["classifier", "R2", "rmse"]).sort_values("rmse", ascending=False)
    # just for good measure: add the mean of all scores to the dataframe
    scores.loc[len(scores) + 1, :] = ["mean_all", scores.R2.mean(), scores.rmse.mean()]
    return scores.reset_index(drop=True)
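
Note that in the 'dl' branch the Keras model is fitted once on the full training data and scored in-sample, so its numbers are not directly comparable to the cross-validated scores of the other models. Since KerasRegressor is a scikit-learn compatible estimator, the deep learning pipeline could in principle be cross-validated the same way; a sketch (the name score_dl_with_cv is only illustrative, and this fits the network once per fold, so it is slow):

In [ ]:
# a sketch: cross-validate the Keras pipeline like the other models (one fit per fold)
def score_dl_with_cv(df, n_splits=3):
    X, y, num_cols, cat_cols = prepare_data(df)
    dl_pipe = get_pipeline('dl', KerasRegressor(build_fn=build_model, epochs=500, verbose=0),
                           num_cols, cat_cols)
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)
    results = cross_validate(dl_pipe, X, y, cv=kfold, scoring=("r2", "neg_mean_squared_error"))
    return results["test_r2"].mean(), np.sqrt(-results["test_neg_mean_squared_error"]).mean()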
    

Finally: Train the models¶

For each classifier we create and fit a pipeline.

In [ ]:
def train_models(df): 
    X, y, num_cols, cat_cols = prepare_data(df)
    pipelines = []
    
    for clf_name, classifier in zip(clf_names, classifiers):
        clf = get_pipeline(clf_name, classifier, num_cols, cat_cols)
        clf.fit(X, y)
        pipelines.append(clf)
    
    return pipelines

Make predictions with trained models¶

For each fitted pipeline we retrieve predictions for SalePrice.

In [ ]:
def predict_from_models(df_test, pipelines):
    X_test, _ , _, _ = prepare_data(df_test, is_train_data=False)
    predictions = []
    
    for pipeline in pipelines:
        preds = pipeline.predict(X_test)
        # invert the log1p transform of y with expm1 to bring predictions back to the original SalePrice scale
        predictions.append(np.expm1(preds))
    
    return predictions
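
As a quick check that expm1 really is the inverse of the log1p transform applied to SalePrice in prepare_data (a throwaway sketch):

In [ ]:
# round-trip check: expm1 undoes log1p (up to floating point error)
price = np.array([120000.0, 250000.0])
assert np.allclose(np.expm1(np.log1p(price)), price)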

And now: Let's use our pipeline...¶

In [ ]:
df = pd.read_csv(f"{BASE_PATH}train.csv")
df_test = pd.read_csv(f"{BASE_PATH}test.csv")

# We clean the data
df = clean_data(df)
df_test = clean_data(df_test, is_train_data=False)
In [ ]:
# We score the models on the preprocessed training data
set_seed()
my_scores = score_models(df)
display(my_scores)
    classifier          R2      rmse
0   dummy        -0.005590  0.396491
1   sgdreg        0.608009  0.247571
2   randomforest  0.877905  0.138248
3   linearsvr     0.879780  0.136673
4   linear        0.890839  0.130701
5   svr           0.892614  0.129446
6   ridge         0.905993  0.121218
7   xgboost       0.907387  0.120122
8   lgbm          0.907761  0.120102
9   gbm           0.907569  0.120036
10  elastic       0.916960  0.113857
11  lasso         0.917301  0.113624
12  kernlrdg      0.917450  0.113385
13  dl            0.952052  0.086704
14  mean_all      0.819716  0.149156
In [ ]:
# We train the models on the whole training set and predict on the test data
set_seed()
models = train_models(df)
predictions = predict_from_models(df_test, models)
# We average over the predictions of 12 of the 14 models (simple ensembling)
# predictions[2:] skips the first two entries of the classifiers list (DummyRegressor and LinearRegression)
prediction_final = pd.DataFrame(predictions[2:]).mean().T.values

submission = pd.DataFrame({'Id': df_test.Id.values, 'SalePrice': prediction_final})
# submission.to_csv(f"submission.csv", index=False)
submission.head()
Out[ ]:
      Id      SalePrice
0   1461  119125.038558
1   1462  154007.285550
2   1463  185168.508907
3   1464  197395.771079
4   1465  191741.367469
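
If the intention is to drop specific models by name (for instance the two weakest in the score table, dummy and sgdreg) rather than by list position, the predictions can be filtered through clf_names. A sketch (this would of course yield slightly different SalePrice values than the ones shown above):

In [ ]:
# a sketch: exclude models by name instead of by list position
excluded = {"dummy", "sgdreg"}
kept = [preds for name, preds in zip(clf_names, predictions) if name not in excluded]
prediction_by_name = pd.DataFrame(kept).mean().values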