The Ames Housing Data Set is an interesting and modern alternative to the famous Boston Housing Data Set from 1978. It was compiled by Dean De Cock, Professor of Statistics at Truman State University, and published in 2011.
The dataset describes individual residential property sales in Ames, Iowa from 2006 to 2010. The full dataset consists of 2,930 samples with 80 explanatory variables:
1. 80 variables in total: 23 nominal, 23 ordinal, 14 discrete and 20 continuous
2. the 20 continuous variables relate to various area dimensions (e.g. lot size, total dwelling square footage)
3. the 14 discrete variables quantify the number of items in the house (e.g. kitchens, baths)
4. the 46 categorical variables (nominal plus ordinal) range from 2 classes (smallest, e.g. STREET) to 28 classes (largest, e.g. NEIGHBORHOOD)
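The type split above comes from the data documentation; pandas can at least confirm the split between numeric and categorical (object) columns. A minimal sketch, assuming the Kaggle train.csv used later in this notebook is available in the working directory:

import pandas as pd

df_check = pd.read_csv("train.csv")
# object columns correspond to the nominal/ordinal variables,
# numeric columns to the discrete/continuous ones (plus Id and SalePrice)
print(df_check.shape)
print(df_check.select_dtypes("object").shape[1], "categorical columns")
print(df_check.select_dtypes("number").shape[1], "numeric columns")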
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.svm import LinearSVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from tensorflow_addons.metrics import RSquare
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import set_config
set_config(display='diagram')
# if desired, set display back to the default
# set_config(display='text')
def set_seed(seed=1234):
    np.random.seed(seed)
    tf.random.set_seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    os.environ['TF_DETERMINISTIC_OPS'] = '1'

set_seed()
BASE_PATH = "./"
def build_model():
    model = Sequential()
    # the input shape is hard-coded to the number of features after preprocessing (one-hot encoding)
    model.add(Dense(1024, input_shape=(288,), activation='relu'))
    model.add(Dropout(rate=0.3))  # randomly drop 30% of the previous layer's outputs during training
    model.add(Dense(8, activation='relu'))
    model.add(Dense(4, activation='relu'))
    model.add(Dense(1))
    model.compile(loss='mse', optimizer='adam',
                  metrics=[RSquare(), tf.keras.metrics.RootMeanSquaredError()])
    return model
# comment out any classifiers that you don't want to use
# and remove the corresponding names from clf_names
classifiers = [
DummyRegressor(strategy='median'),
LinearRegression(n_jobs=-1),
Ridge(alpha=0.003, max_iter=30),
Lasso(alpha=.0005),
ElasticNet(alpha=0.0005, l1_ratio=.9),
KernelRidge(alpha=0.6, kernel="polynomial", degree=2, coef0=2.5),
SGDRegressor(),
SVR(kernel="linear"),
LinearSVR(),
RandomForestRegressor(n_jobs=-1, n_estimators=350,
max_depth=12, random_state=1),
GradientBoostingRegressor(n_estimators=500, max_depth=2),
lgb.LGBMRegressor(n_jobs=-1, max_depth=2, n_estimators=1000,
learning_rate=0.05),
xgb.XGBRegressor(objective="reg:squarederror", n_jobs=-1,
max_depth=2, n_estimators=1500, learning_rate=0.075),
KerasRegressor(build_fn=build_model, epochs=500, verbose=0),
]
clf_names = [
"dummy",
"linear",
"ridge",
"lasso",
"elastic",
"kernlrdg",
"sgdreg",
"svr",
"linearsvr",
"randomforest",
"gbm",
"lgbm",
"xgboost",
"dl"
]
def clean_data(data, is_train_data=True):
    # add your code for data cleaning and feature engineering here,
    # e.g. create a new feature from existing ones
    data['TotalSF'] = data['TotalBsmtSF'] + data['1stFlrSF'] + data['2ndFlrSF']
    # add here the code that you only want to apply to your training data and not to the test set,
    # e.g. removing outliers from the training data works...
    # ...but you cannot remove samples from your test set
    if is_train_data:
        data = data[data.GrLivArea < 4000]
    return data
def prepare_data(df, is_train_data=True):
    # split the data into numerical & categorical columns in order to process them separately in the pipeline
    numerical = df.select_dtypes("number").copy()
    categorical = df.select_dtypes("object").copy()
    # for training data only...
    # ...convert SalePrice to log values and drop the "Id" and "SalePrice" columns
    if is_train_data:
        SalePrice = numerical.SalePrice
        y = np.log1p(SalePrice)
        numerical.drop(["Id", "SalePrice"], axis=1, inplace=True)
    # for the test data: just drop "Id" and set "y" to None
    else:
        numerical.drop(["Id"], axis=1, inplace=True)
        y = None
    # concatenate numerical and categorical data to X (our final training data)
    X = pd.concat([numerical, categorical], axis=1)
    # in addition to X and y, return the separated column names to use them in the pipeline
    return X, y, numerical.columns, categorical.columns
class DLTransformer(BaseEstimator, TransformerMixin):
    """Converts the sparse matrix produced by the OneHotEncoder into a dense
    numpy array, which the Keras model expects as input."""

    def fit(self, X, y=None):
        # stateless transformer: nothing to learn
        return self

    def transform(self, X, y=None):
        X_ = X.copy()  # create a copy to avoid changes to the original data
        X_ = X_.toarray()  # densify the sparse matrix for Keras
        return X_
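DLTransformer is only needed because the OneHotEncoder outputs a sparse matrix, while the Keras model expects a dense array. The same step could be written without a custom class; a minimal alternative sketch (not used in this notebook) with scikit-learn's FunctionTransformer:

from sklearn.preprocessing import FunctionTransformer

# equivalent stateless step: densify the sparse one-hot output for Keras
to_dense = FunctionTransformer(lambda X: X.toarray())

One reason to prefer the explicit class is that a lambda cannot be pickled, which matters if you want to persist the fitted pipeline.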
def get_pipeline(clf_name, classifier, num_cols, cat_cols):
    # the numeric transformer gets the numerical data according to num_cols
    # the first step is the imputer, which replaces all missing values with the mean
    # in the second step all numerical data gets scaled
    # (RobustScaler for the Keras model, StandardScaler for everything else)
    if clf_name == 'dl':
        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', RobustScaler())])
    else:
        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())])
    # the categorical transformer gets all categorical data according to cat_cols
    # again: the first step imputes missing values, the second one-hot encodes the categoricals
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])
    # the column transformer combines the numerical and categorical pipelines
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, num_cols),
            ('cat', categorical_transformer, cat_cols)])
    # return the whole pipeline with the classifier provided in the function call;
    # the Keras model additionally needs the sparse one-hot output converted to a dense array
    if clf_name == 'dl':
        return Pipeline(steps=[('preprocessor', preprocessor),
                               ('dltransform', DLTransformer()),
                               ('classifier', classifier)])
    return Pipeline(steps=[('preprocessor', preprocessor), ('classifier', classifier)])
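With set_config(display='diagram') enabled above, an assembled pipeline can be rendered and inspected in the notebook before any scoring. A small usage sketch, assuming df has already been loaded with pd.read_csv as done further below (the Ridge parameters are just the ones from the classifiers list):

X, y, num_cols, cat_cols = prepare_data(df)
ridge_pipe = get_pipeline("ridge", Ridge(alpha=0.003, max_iter=30), num_cols, cat_cols)
display(ridge_pipe)  # renders the preprocessor/classifier steps as a diagram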
def score_models(df):
    # retrieve X, y and the separate column names
    X, y, num_cols, cat_cols = prepare_data(df)
    # since we converted SalePrice to log values, we use neg_mean_squared_error...
    # ...rather than neg_mean_squared_log_error
    scoring_metric = ("r2", "neg_mean_squared_error")
    scores = []
    for clf_name, classifier in zip(clf_names, classifiers):
        # create a pipeline for each classifier
        clf = get_pipeline(clf_name, classifier, num_cols, cat_cols)
        # set up a kfold with 3 splits to get more robust scores;
        # increase to 5 or 10 to get more precise estimates of the model scores
        kfold = KFold(n_splits=3, shuffle=True, random_state=1)
        if clf_name == 'dl':
            # the Keras model is fitted once on the full training data and scored on it as well,
            # so these are in-sample scores, not cross-validated ones
            clf.fit(X, y)
            metric = RSquare()
            metric.update_state(y, clf.predict(X))
            k1 = metric.result().numpy()
            metric = tf.keras.metrics.RootMeanSquaredError()
            metric.update_state(y, clf.predict(X))
            k2 = metric.result().numpy()
            scores.append([clf_name, k1, k2])
        else:
            # cross-validate and take the square root of the (negated) MSE to get the RMSE
            results = cross_validate(clf, X, y, cv=kfold, scoring=scoring_metric)
            scores.append([clf_name,
                           results["test_r2"].mean(),
                           np.sqrt(-results["test_neg_mean_squared_error"]).mean()])
    scores = pd.DataFrame(scores, columns=["classifier", "R2", "rmse"]).sort_values("rmse", ascending=False)
    # just for good measure: add the mean of all scores to the dataframe
    scores.loc[len(scores) + 1, :] = ["mean_all", scores.R2.mean(), scores.rmse.mean()]
    return scores.reset_index(drop=True)
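GridSearchCV is imported above but not used in this walkthrough; it plugs directly into these pipelines because every hyperparameter is reachable via the step name. A minimal sketch, assuming df has already been loaded as further below, that tunes the Ridge alpha through the classifier__alpha parameter:

X, y, num_cols, cat_cols = prepare_data(df)
ridge_pipe = get_pipeline("ridge", Ridge(), num_cols, cat_cols)
param_grid = {"classifier__alpha": [0.001, 0.003, 0.01, 0.1, 1.0]}
search = GridSearchCV(ridge_pipe, param_grid,
                      scoring="neg_mean_squared_error",
                      cv=KFold(n_splits=3, shuffle=True, random_state=1),
                      n_jobs=-1)
search.fit(X, y)
print(search.best_params_, np.sqrt(-search.best_score_))  # best alpha and its CV RMSE (on log prices)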
For each classifier we create and fit a pipeline.
def train_models(df):
    X, y, num_cols, cat_cols = prepare_data(df)
    pipelines = []
    for clf_name, classifier in zip(clf_names, classifiers):
        clf = get_pipeline(clf_name, classifier, num_cols, cat_cols)
        clf.fit(X, y)
        pipelines.append(clf)
    return pipelines
For each fitted pipeline we retrieve predictions for SalePrice.
def predict_from_models(df_test, pipelines):
    X_test, _, _, _ = prepare_data(df_test, is_train_data=False)
    predictions = []
    for pipeline in pipelines:
        preds = pipeline.predict(X_test)
        # apply expm1 (the inverse of log1p) since we log-transformed y for training
        predictions.append(np.expm1(preds))
    return predictions
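As a quick check that expm1 is the exact inverse of the log1p transform applied in prepare_data:

price = 200000.0
assert np.isclose(np.expm1(np.log1p(price)), price)  # expm1 undoes log1p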
df = pd.read_csv(f"{BASE_PATH}train.csv")
df_test = pd.read_csv(f"{BASE_PATH}test.csv")
# We clean the data
df = clean_data(df)
df_test = clean_data(df_test, is_train_data=False)
# We score the models on the preprocessed training data
set_seed()
my_scores = score_models(df)
display(my_scores)
| | classifier | R2 | rmse |
|---|---|---|---|
| 0 | dummy | -0.005590 | 0.396491 |
| 1 | sgdreg | 0.608009 | 0.247571 |
| 2 | randomforest | 0.877905 | 0.138248 |
| 3 | linearsvr | 0.879780 | 0.136673 |
| 4 | linear | 0.890839 | 0.130701 |
| 5 | svr | 0.892614 | 0.129446 |
| 6 | ridge | 0.905993 | 0.121218 |
| 7 | xgboost | 0.907387 | 0.120122 |
| 8 | lgbm | 0.907761 | 0.120102 |
| 9 | gbm | 0.907569 | 0.120036 |
| 10 | elastic | 0.916960 | 0.113857 |
| 11 | lasso | 0.917301 | 0.113624 |
| 12 | kernlrdg | 0.917450 | 0.113385 |
| 13 | dl | 0.952052 | 0.086704 |
| 14 | mean_all | 0.819716 | 0.149156 |
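Note that the dl score is computed in-sample: the Keras model is fitted once on the full training data and then scored on the same data, so its R2 and rmse are not directly comparable to the cross-validated scores of the other models.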
# We train the models on the whole training set and predict on the test data
set_seed()
models = train_models(df)
predictions = predict_from_models(df_test, models)
# We average over the results of 12 of the 14 regressors (simple ensembling)
# and exclude the DummyRegressor and the SGDRegressor, since they perform worst
prediction_final = pd.DataFrame([p for name, p in zip(clf_names, predictions)
                                 if name not in ("dummy", "sgdreg")]).mean().T.values
submission = pd.DataFrame({'Id': df_test.Id.values, 'SalePrice': prediction_final})
# submission.to_csv(f"submission.csv", index=False)
submission.head()
| | Id | SalePrice |
|---|---|---|
| 0 | 1461 | 119125.038558 |
| 1 | 1462 | 154007.285550 |
| 2 | 1463 | 185168.508907 |
| 3 | 1464 | 197395.771079 |
| 4 | 1465 | 191741.367469 |