In [1]:
# File Name: Churn_Fun.ipynb
In [2]:
# load modules
from __future__ import division
import pandas as pd
pd.set_option('precision', 3)
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve
import seaborn as sns
from scipy import interp
import bisect
from scipy.stats import mstats
# Load in the r magic
%load_ext rpy2.ipython
# We need ggplot2
%R require(ggplot2)
# local modules
from churn_measurements import calibration, discrimination
In [3]:
print "~ Importing Data ~"
df = pd.read_csv("data/churn.csv")
~ Importing Data ~
In [4]:
print "~ Preprocessing Data ~"
col_names = df.columns.tolist()
print "Column names:"
print col_names
~ Preprocessing Data ~
Column names:
['State', 'Account Length', 'Area Code', 'Phone', "Int'l Plan", 'VMail Plan', 'VMail Message', 'Day Mins', 'Day Calls', 'Day Charge', 'Eve Mins', 'Eve Calls', 'Eve Charge', 'Night Mins', 'Night Calls', 'Night Charge', 'Intl Mins', 'Intl Calls', 'Intl Charge', 'CustServ Calls', 'Churn?']
In [5]:
to_show = col_names[:6] + col_names[-6:]
print '\n Sample data:'
df[to_show].head(6)
 Sample data:
Out[5]:
State Account Length Area Code Phone Int'l Plan VMail Plan Night Charge Intl Mins Intl Calls Intl Charge CustServ Calls Churn?
0 KS 128 415 382-4657 no yes 11.01 10.0 3 2.70 1 False.
1 OH 107 415 371-7191 no yes 11.45 13.7 3 3.70 1 False.
2 NJ 137 415 358-1921 no no 7.32 12.2 5 3.29 0 False.
3 OH 84 408 375-9999 yes no 8.86 6.6 7 1.78 2 False.
4 OK 75 415 330-6626 yes no 8.41 10.1 3 2.73 3 False.
5 AL 118 510 391-8027 yes no 9.18 6.3 6 1.70 0 False.
In [6]:
print df.shape
print df.dtypes
(3333, 21)
State              object
Account Length      int64
Area Code           int64
Phone              object
Int'l Plan         object
VMail Plan         object
VMail Message       int64
Day Mins          float64
Day Calls           int64
Day Charge        float64
Eve Mins          float64
Eve Calls           int64
Eve Charge        float64
Night Mins        float64
Night Calls         int64
Night Charge      float64
Intl Mins         float64
Intl Calls          int64
Intl Charge       float64
CustServ Calls      int64
Churn?             object
dtype: object
In [7]:
# The number of numeric data
len(df.select_dtypes(include=['int64','float64']).columns)
Out[7]:
16
In [8]:
# The number of categorical data
len(df.select_dtypes(include=['category','object']).columns)
Out[8]:
5
In [9]:
# Check there is any missing data
for i in df.columns.tolist():
    k = sum(pd.isnull(df[i]))
    print (i, k)
('State', 0)
('Account Length', 0)
('Area Code', 0)
('Phone', 0)
("Int'l Plan", 0)
('VMail Plan', 0)
('VMail Message', 0)
('Day Mins', 0)
('Day Calls', 0)
('Day Charge', 0)
('Eve Mins', 0)
('Eve Calls', 0)
('Eve Charge', 0)
('Night Mins', 0)
('Night Calls', 0)
('Night Charge', 0)
('Intl Mins', 0)
('Intl Calls', 0)
('Intl Charge', 0)
('CustServ Calls', 0)
('Churn?', 0)
In [10]:
print "~ Exploring Data ~"
# numeric data
#print df.describe()
# same as above
print df.describe(include=['int64','float64'])
~ Exploring Data ~
       Account Length  Area Code  VMail Message  Day Mins  Day Calls  \
count        3333.000   3333.000       3333.000  3333.000   3333.000   
mean          101.065    437.182          8.099   179.775    100.436   
std            39.822     42.371         13.688    54.467     20.069   
min             1.000    408.000          0.000     0.000      0.000   
25%            74.000    408.000          0.000   143.700     87.000   
50%           101.000    415.000          0.000   179.400    101.000   
75%           127.000    510.000         20.000   216.400    114.000   
max           243.000    510.000         51.000   350.800    165.000   

       Day Charge  Eve Mins  Eve Calls  Eve Charge  Night Mins  Night Calls  \
count    3333.000  3333.000   3333.000    3333.000    3333.000     3333.000   
mean       30.562   200.980    100.114      17.084     200.872      100.108   
std         9.259    50.714     19.923       4.311      50.574       19.569   
min         0.000     0.000      0.000       0.000      23.200       33.000   
25%        24.430   166.600     87.000      14.160     167.000       87.000   
50%        30.500   201.400    100.000      17.120     201.200      100.000   
75%        36.790   235.300    114.000      20.000     235.300      113.000   
max        59.640   363.700    170.000      30.910     395.000      175.000   

       Night Charge  Intl Mins  Intl Calls  Intl Charge  CustServ Calls  
count      3333.000   3333.000    3333.000     3333.000        3333.000  
mean          9.039     10.237       4.479        2.765           1.563  
std           2.276      2.792       2.461        0.754           1.315  
min           1.040      0.000       0.000        0.000           0.000  
25%           7.520      8.500       3.000        2.300           1.000  
50%           9.050     10.300       4.000        2.780           1.000  
75%          10.590     12.100       6.000        3.270           2.000  
max          17.770     20.000      20.000        5.400           9.000  
In [11]:
# categorical and object data
print df.describe(include=['category','object'])
       State     Phone Int'l Plan VMail Plan  Churn?
count   3333      3333       3333       3333    3333
unique    51      3333          2          2       2
top       WV  385-6952         no         no  False.
freq     106         1       3010       2411    2850
In [12]:
print df['Churn?'].value_counts()
False.    2850
True.      483
Name: Churn?, dtype: int64
In [13]:
print df.groupby(df['Churn?']).mean()
        Account Length  Area Code  VMail Message  Day Mins  Day Calls  \
Churn?                                                                  
False.         100.794    437.075          8.605   175.176    100.283   
True.          102.665    437.818          5.116   206.914    101.335   

        Day Charge  Eve Mins  Eve Calls  Eve Charge  Night Mins  Night Calls  \
Churn?                                                                         
False.      29.780   199.043    100.039      16.919     200.133      100.058   
True.       35.176   212.410    100.561      18.055     205.232      100.400   

        Night Charge  Intl Mins  Intl Calls  Intl Charge  CustServ Calls  
Churn?                                                                    
False.         9.006     10.159       4.533        2.743            1.45  
True.          9.236     10.700       4.164        2.890            2.23  
In [14]:
# Histogram
df.hist(sharex=False, sharey=False, xlabelsize=1, ylabelsize=1)
plt.show()
In [15]:
df.plot(kind='density', subplots=True, layout=(5,4), sharex=False, 
              legend=False, fontsize=1)
plt.show()
In [16]:
# Boxplots
df.plot(kind='box', subplots=True, layout=(5,4), sharex=False, 
              sharey=False, fontsize=1)
plt.show()
In [17]:
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(df.corr(), vmin=-1, vmax=1, interpolation='none')
fig.colorbar(cax)
ticks = np.arange(0, len(df._get_numeric_data().columns), 1)
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(col_names)
ax.set_yticklabels(col_names)
plt.show()
In [18]:
print "~ Preparing Target and Features Data ~"

# Isolate target data
y = np.where(df['Churn?'] == 'True.', 1, 0)

# We don't need these columns
to_drop = ['State','Area Code','Phone','Churn?']
df = df.drop(to_drop, axis=1)

# 'yes'/'no' has to be converted to boolean values
# NumPy converts these from boolean to 1. and 0. later
# yes_no_cols will be re-used for later scoring
yes_no_cols = ["Int'l Plan","VMail Plan"]
df[yes_no_cols] = df[yes_no_cols] == 'yes'

# Pull out fesatures for later scoring
features = df.columns

# feature variables
X = df.as_matrix().astype(np.float)
~ Preparing Target and Features Data ~
In [19]:
# Importances of features
train_index,test_index = train_test_split(df.index, random_state=4)

forest = RF()
forest_fit = forest.fit(X[train_index], y[train_index])
forest_predictions = forest_fit.predict(X[test_index])


importances = forest_fit.feature_importances_[:10]
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(10):
    print("%d. %s (%f)" % (f + 1, df.columns[f], importances[indices[f]]))

# Plot the feature importances of the forest
#import pylab as pl
plt.figure()
plt.title("Feature importances")
plt.bar(range(10), importances[indices], yerr=std[indices], color="r", align="center")
plt.xticks(range(10), indices)
plt.xlim([-1, 10])
plt.show()
Feature ranking:
1. Account Length (0.144792)
2. Int'l Plan (0.112537)
3. VMail Plan (0.074275)
4. VMail Message (0.070857)
5. Day Mins (0.067214)
6. Day Calls (0.043385)
7. Day Charge (0.038278)
8. Eve Mins (0.037361)
9. Eve Calls (0.035272)
10. Eve Charge (0.017569)
In [20]:
print "~ Transforming Data ~"
scaler = StandardScaler()
X = scaler.fit_transform(X)
print "Feature space holds %d observations and %d features" % X.shape
print "Unique target labels:", np.unique(y)
~ Transforming Data ~
Feature space holds 3333 observations and 17 features
Unique target labels: [0 1]
In [21]:
print "~ Building K-Fold Cross-Validations ~"
def run_cv(X, y, clf):
    # construct a K-Fold object
    kf = KFold(n_splits=5, shuffle=True, random_state=4)
    y_pred = y.copy()
    
    # iterate through folds
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        clf.fit(X_train, y_train)
        y_pred[test_index] = clf.predict(X_test)
    return y_pred
~ Building K-Fold Cross-Validations ~
In [22]:
print "~ Evaluating Models ~"    
def accuracy(y_true,y_pred):
    return np.mean(y_true == y_pred)

print "Logistic Regression:"
print "Accuracy = %.3f" % accuracy(y, run_cv(X,y,LR()))
print "Gradient Boosting Classifier:"
print "Accuracy = %.3f" % accuracy(y, run_cv(X,y,GBC()))
print "Support vector machines:"
print "Accuracy = %.3f" % accuracy(y, run_cv(X,y,SVC()))
print "Random forest:"
print "Accuracy = %.3f" % accuracy(y, run_cv(X,y,RF()))
print "K-nearest-neighbors:"
print "Accuracy = %.3f" % accuracy(y, run_cv(X,y,KNN()))
~ Evaluating Models ~
Logistic Regression:
Accuracy = 0.860
Gradient Boosting Classifier:
Accuracy = 0.952
Support vector machines:
Accuracy = 0.920
Random forest:
Accuracy = 0.946
K-nearest-neighbors:
Accuracy = 0.893
In [23]:
# F1-Scores and Confusion Matrices
def draw_confusion_matrices(confusion_matricies,class_names):
    class_names = class_names.tolist()
    for cm in confusion_matrices:
        classifier, cm = cm[0], cm[1]
        print(cm)
        
        fig = plt.figure()
        ax = fig.add_subplot(111)
        cax = ax.matshow(cm)
        plt.title('Confusion matrix for %s' % classifier)
        fig.colorbar(cax)
        ax.set_xticklabels([''] + class_names)
        ax.set_yticklabels([''] + class_names)
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.show()
    
y = np.array(y)
class_names = np.unique(y)


confusion_matrices = {
                1: {
                    'matrix': confusion_matrix(y,run_cv(X,y,LR())),
                    'title': 'Logistic Regression',
                   },
                2: {
                    'matrix': confusion_matrix(y,run_cv(X,y,GBC())),
                    'title': 'Gradient Boosting Classifier',
                   },
                3: {
                    'matrix': confusion_matrix(y,run_cv(X,y,SVC())),
                    'title': 'Support Vector Machine',
                   },
                4: {
                    'matrix': confusion_matrix(y,run_cv(X,y,RF())),
                    'title': 'Random Forest',
                   },
                5: {
                    'matrix': confusion_matrix(y,run_cv(X,y,KNN())),
                    'title': 'K Nearest Neighbors',
                   },
}



fix, ax = plt.subplots(figsize=(16, 12))
plt.suptitle('Confusion Matrix of Various Classifiers')
for ii, values in confusion_matrices.items():
    matrix = values['matrix']
    title = values['title']
    plt.subplot(3, 3, ii) # starts from 1
    plt.title(title);
    sns.heatmap(matrix, annot=True,  fmt='');
    
print  "Logisitic Regression F1 Score" ,f1_score(y,run_cv(X,y,LR()))
print  "Gradient Boosting Classifier F1 Score" ,f1_score(y,run_cv(X,y,GBC()))
print  "Support Vector Machines F1 Score" ,f1_score(y,run_cv(X,y,SVC()))
print  "Random Forest F1 Score" ,f1_score(y,run_cv(X,y,RF()))
print  "K-Nearest-Neighbors F1 Score" ,f1_score(y,run_cv(X,y,KNN()))        
Logisitic Regression F1 Score 0.301943198804
Gradient Boosting Classifier F1 Score 0.815154994259
Support Vector Machines F1 Score 0.651773981603
Random Forest F1 Score 0.791418355185
K-Nearest-Neighbors F1 Score 0.490028490028
In [24]:
# ROC plots
def plot_roc(X, y, clf_class):
    kf = KFold(n_splits=5,shuffle=True, random_state=4)
    y_prob = np.zeros((len(y),2))
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
#    all_tpr = []
    i = 0
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        clf = clf_class
        clf.fit(X_train,y_train)
        # Predict probabilities, not classes
        y_prob[test_index] = clf.predict_proba(X_test)
        fpr, tpr, thresholds = roc_curve(y[test_index], y_prob[test_index, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))
        i = i + 1
    mean_tpr /= kf.get_n_splits(X)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, 'k--',label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)
    
    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Random')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.show()
    
print "Logisitic Regression:"
plot_roc(X,y,LR())    
      
print "Gradient Boosting Classifier:"
plot_roc(X,y,GBC())

print "Support vector machines:"
plot_roc(X,y,SVC(probability=True))

print "Random forests:"
plot_roc(X,y,RF(n_estimators=18))

print "K-nearest-neighbors:"
plot_roc(X,y,KNN())
Logisitic Regression:
Gradient Boosting Classifier:
Support vector machines:
Random forests:
K-nearest-neighbors:
In [25]:
print "~ Building K-Fold Cross-Validations with Probabilities ~"
def run_prob_cv(X, y, clf):
    kf = KFold(n_splits=5,shuffle=True, random_state=4)
    y_prob = np.zeros((len(y),2))
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        clf.fit(X_train,y_train)
        # Predict probabilities, not classes
        y_prob[test_index] = clf.predict_proba(X_test)
    return y_prob
~ Building K-Fold Cross-Validations with Probabilities ~
In [26]:
print "~ Calculating Calibration and Discrimination ~"
# Take on RF
pred_prob = run_prob_cv(X, y, RF(n_estimators=10, random_state=4))

# Use 10 estimators so predictions are all multiples of 0.1
pred_churn = pred_prob[:,1].round(1)
is_churn = y == 1

# Number of times a predicted probability is assigned to an observation
counts = pd.value_counts(pred_churn)
counts[:]
~ Calculating Calibration and Discrimination ~
Out[26]:
0.0    1699
0.1     762
0.2     267
0.3     124
0.8      77
0.6      76
0.4      75
0.7      74
0.9      68
0.5      62
1.0      49
dtype: int64
In [27]:
# calculate true probabilities
true_prob = {}
for prob in counts.index:
    true_prob[prob] = np.mean(is_churn[pred_churn == prob])
    true_prob = pd.Series(true_prob)

counts = pd.concat([counts,true_prob], axis=1).reset_index()
counts.columns = ['pred_prob', 'count', 'true_prob']
counts.sort_values('pred_prob').reset_index().drop(['index'], axis=1)
Out[27]:
pred_prob count true_prob
0 0.0 1699 0.026
1 0.1 762 0.024
2 0.2 267 0.086
3 0.3 124 0.194
4 0.4 75 0.280
5 0.5 62 0.565
6 0.6 76 0.816
7 0.7 74 0.892
8 0.8 77 0.961
9 0.9 68 0.985
10 1.0 49 1.000
In [28]:
baseline = np.mean(is_churn)
In [29]:
%%R -i counts,baseline -w 800 -h 600 -u px

ggplot(counts,aes(x=pred_prob,y=true_prob,size=count)) + 
    geom_point(color='blue') + 
    stat_function(fun = function(x){x}, color='red') + 
    stat_function(fun = function(x){baseline}, color='green') + 
    xlim(-0.05,  1.05) + ylim(-0.05,1.05) + 
    ggtitle("RF") + 
    xlab("Predicted probability") + ylab("Relative  of outcome")
In [30]:
def print_measurements(pred_prob):
    churn_prob, is_churn = pred_prob[:,1], y == 1
    print "  %-20s %.4f" % ("Calibration Error", calibration(churn_prob, is_churn))
    print "  %-20s %.4f" % ("Discrimination", discrimination(churn_prob,is_churn))
In [31]:
print "Note -- Lower calibration is better, higher discrimination is better"
print "Logistic Regression:"
print_measurements(run_prob_cv(X,y,LR()))

print "Gradient Boosting Classifier:"
print_measurements(run_prob_cv(X,y,GBC()))

print "Support vector machines:"
print_measurements(run_prob_cv(X,y,SVC(probability=True)))

print "Random forests:"
print_measurements(run_prob_cv(X,y,RF(n_estimators=18)))

print "K-nearest-neighbors:"
print_measurements(run_prob_cv(X,y,KNN()))
Note -- Lower calibration is better, higher discrimination is better
Logistic Regression:
  Calibration Error    0.0014
  Discrimination       0.0247
Gradient Boosting Classifier:
  Calibration Error    0.0018
  Discrimination       0.0843
Support vector machines:
  Calibration Error    0.0013
  Discrimination       0.0668
Random forests:
  Calibration Error    0.0068
  Discrimination       0.0832
K-nearest-neighbors:
  Calibration Error    0.0020
  Discrimination       0.0428
In [32]:
print '~ Profit Curves ~'
def confusion_rates(cm): 

    tn = cm[0][0]
    fp = cm[0][1] 
    fn = cm[1][0]
    tp = cm[1][1]
    
    N = fp + tn
    P = tp + fn
    
    tpr = tp / P
    fpr = fp / P
    fnr = fn / N
    tnr = tn / N
    
    rates = np.array([[tpr, fpr], [fnr, tnr]])
    
    return rates


def profit_curve(classifiers):
    for clf_class in classifiers:
        name, clf_class = clf_class[0], clf_class[1]
        clf = clf_class
        fit = clf.fit(X[train_index], y[train_index])
        probabilities = np.array([prob[0] for prob in fit.predict_proba(X[test_index])])
        profit = []
        
        indicies = np.argsort(probabilities)[::1]
    
        for idx in xrange(len(indicies)): 
            pred_true = indicies[:idx]
            ctr = np.arange(indicies.shape[0])
            masked_prediction = np.in1d(ctr, pred_true)
            cm = confusion_matrix(y_test.astype(int), masked_prediction.astype(int))
            rates = confusion_rates(cm)
     
            profit.append(np.sum(np.multiply(rates,cb)))
        
        plt.plot((np.arange(len(indicies)) / len(indicies) * 100), profit, label=name)
    plt.legend(loc="lower right")
    plt.title("Profits of classifiers")
    plt.xlabel("Percentage of test instances (decreasing by score)")
    plt.ylabel("Profit")
    plt.show()
~ Profit Curves ~
In [33]:
y_test = y[test_index].astype(float)

# Cost-Benefit Matrix
cb = np.array([[4, -5],
               [0, 0]])

# Define classifiers for comparison
classifiers = [("Logistic Regression",LR()),
               ("Gradient Boosting Classifier", GBC()),
               ("Support Vector Machines",SVC(probability=True)),
               ("Random Forest", RF(n_estimators=18)),
               ("K-nearest-neighbors:", KNN())
              ]
               
# Plot profit curves
profit_curve(classifiers)
In [34]:
forest = RF(n_estimators=18, random_state=4)
forest_fit = forest.fit(X[train_index], y[train_index])
predictions = forest_fit.predict(X[test_index])

print confusion_matrix(y[test_index],predictions)
print accuracy_score(y[test_index],predictions)
print classification_report(y[test_index],predictions)
[[706   9]
 [ 24  95]]
0.960431654676
             precision    recall  f1-score   support

          0       0.97      0.99      0.98       715
          1       0.91      0.80      0.85       119

avg / total       0.96      0.96      0.96       834

In [35]:
# Grid Search and Hyper Parameters
rfc = RF(n_jobs=-1,max_features= 'sqrt' ,n_estimators=50, oob_score = True, random_state = 4) 

param_grid = { 
    'n_estimators': [5, 10, 20, 40, 80, 160, 200],
    #'min_samples_leaf': [1, 5, 10, 50, 100, 200],
    'max_features': ['auto', 'sqrt', 'log2']
}

CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(X[train_index], y[train_index])

means = CV_rfc.cv_results_['mean_test_score']
stds = CV_rfc.cv_results_['std_test_score']

print("Best: %f using %s with %s" % (CV_rfc.best_score_, CV_rfc.best_params_, CV_rfc.best_estimator_))
for params, mean_score, scores in zip(CV_rfc.cv_results_['params'],means,stds):
    print("%f (%f) with: %r" % (scores.mean(), scores.std(), params))
Best: 0.949180 using {'max_features': 'auto', 'n_estimators': 200} with RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=200, n_jobs=-1, oob_score=True, random_state=4,
            verbose=0, warm_start=False)
0.010515 (0.000000) with: {'max_features': 'auto', 'n_estimators': 5}
0.007910 (0.000000) with: {'max_features': 'auto', 'n_estimators': 10}
0.004662 (0.000000) with: {'max_features': 'auto', 'n_estimators': 20}
0.003245 (0.000000) with: {'max_features': 'auto', 'n_estimators': 40}
0.003866 (0.000000) with: {'max_features': 'auto', 'n_estimators': 80}
0.005106 (0.000000) with: {'max_features': 'auto', 'n_estimators': 160}
0.006393 (0.000000) with: {'max_features': 'auto', 'n_estimators': 200}
0.010515 (0.000000) with: {'max_features': 'sqrt', 'n_estimators': 5}
0.007910 (0.000000) with: {'max_features': 'sqrt', 'n_estimators': 10}
0.004662 (0.000000) with: {'max_features': 'sqrt', 'n_estimators': 20}
0.003245 (0.000000) with: {'max_features': 'sqrt', 'n_estimators': 40}
0.003866 (0.000000) with: {'max_features': 'sqrt', 'n_estimators': 80}
0.005106 (0.000000) with: {'max_features': 'sqrt', 'n_estimators': 160}
0.006393 (0.000000) with: {'max_features': 'sqrt', 'n_estimators': 200}
0.010515 (0.000000) with: {'max_features': 'log2', 'n_estimators': 5}
0.007910 (0.000000) with: {'max_features': 'log2', 'n_estimators': 10}
0.004662 (0.000000) with: {'max_features': 'log2', 'n_estimators': 20}
0.003245 (0.000000) with: {'max_features': 'log2', 'n_estimators': 40}
0.003866 (0.000000) with: {'max_features': 'log2', 'n_estimators': 80}
0.005106 (0.000000) with: {'max_features': 'log2', 'n_estimators': 160}
0.006393 (0.000000) with: {'max_features': 'log2', 'n_estimators': 200}
In [36]:
forest = CV_rfc.best_estimator_
forest_fit = forest.fit(X[train_index], y[train_index])
predictions = forest_fit.predict(X[test_index])

print confusion_matrix(y[test_index],predictions)
print accuracy_score(y[test_index],predictions)
print classification_report(y[test_index],predictions)

print "Random forests senstivity analysis Train Data:"
plot_roc(X[train_index],y[train_index],CV_rfc.best_estimator_)
print "Random forests senstivity analysis Test Data:"
plot_roc(X[test_index],y[test_index],CV_rfc.best_estimator_)
[[706   9]
 [ 23  96]]
0.961630695444
             precision    recall  f1-score   support

          0       0.97      0.99      0.98       715
          1       0.91      0.81      0.86       119

avg / total       0.96      0.96      0.96       834

Random forests senstivity analysis Train Data:
Random forests senstivity analysis Test Data:
In [37]:
print "~ Threshold Value ~"
clf = CV_rfc.best_estimator_

n_trials = 50
test_size_percent = 0.1

signals = X
labels = y

plot_data = []

train_signals, test_signals, train_labels, test_labels = train_test_split(signals, labels, test_size=test_size_percent)
clf.fit(train_signals, train_labels)
predictions = clf.predict_proba(test_signals)[:,1]

precision, recall, thresholds = precision_recall_curve(test_labels, predictions)
thresholds = np.append(thresholds, 1)

queue_rate = []
for threshold in thresholds:
    queue_rate.append((predictions >= threshold).mean())
    
plt.plot(thresholds, precision, color=sns.color_palette()[0])
plt.plot(thresholds, recall, color=sns.color_palette()[1])
plt.plot(thresholds, queue_rate, color=sns.color_palette()[2])

leg = plt.legend(('precision', 'recall', 'queue_rate'), frameon=True)
leg.get_frame().set_edgecolor('k')
plt.xlabel('threshold')
plt.ylabel('%')
~ Threshold Value ~
Out[37]:
<matplotlib.text.Text at 0x10a2f668>
In [38]:
clf = CV_rfc.best_estimator_

n_trials = 50
test_size_percent = 0.1

signals = X
labels = y

plot_data = []

for trial in range(n_trials):
    train_signals, test_signals, train_labels, test_labels = train_test_split(signals, labels, test_size=test_size_percent)
    clf.fit(train_signals, train_labels)
    predictions = clf.predict_proba(test_signals)[:,1]
    
    precision, recall, thresholds = precision_recall_curve(test_labels, predictions)
    thresholds = np.append(thresholds, 1)
    
    queue_rate = []
    for threshold in thresholds:
        queue_rate.append((predictions >= threshold).mean())
            
    plot_data.append({
            'thresholds': thresholds
        ,   'precision': precision
        ,   'recall': recall
        ,   'queue_rate': queue_rate
    })
#%%
for p in plot_data:
    plt.plot(p['thresholds'], p['precision'], color=sns.color_palette()[0], alpha=0.5)
    plt.plot(p['thresholds'], p['recall'], color=sns.color_palette()[1], alpha=0.5)
    plt.plot(p['thresholds'], p['queue_rate'], color=sns.color_palette()[2], alpha=0.5)
    
leg = plt.legend(('precision', 'recall', 'queue_rate'), frameon=True)
leg.get_frame().set_edgecolor('k')
plt.xlabel('threshold')
Out[38]:
<matplotlib.text.Text at 0x138df978>
In [39]:
uniform_thresholds = np.linspace(0, 1, 101)

uniform_precision_plots = []
uniform_recall_plots= []
uniform_queue_rate_plots= []

for p in plot_data:
    uniform_precision = []
    uniform_recall = []
    uniform_queue_rate = []
    for ut in uniform_thresholds:
        index = bisect.bisect_left(p['thresholds'], ut)
        uniform_precision.append(p['precision'][index])
        uniform_recall.append(p['recall'][index])
        uniform_queue_rate.append(p['queue_rate'][index])
        
    uniform_precision_plots.append(uniform_precision)
    uniform_recall_plots.append(uniform_recall)
    uniform_queue_rate_plots.append(uniform_queue_rate)
#%%
quantiles = [0.1, 0.5, 0.9]
lower_precision, median_precision, upper_precision = mstats.mquantiles(uniform_precision_plots, quantiles, axis=0)
lower_recall, median_recall, upper_recall = mstats.mquantiles(uniform_recall_plots, quantiles, axis=0)
lower_queue_rate, median_queue_rate, upper_queue_rate = mstats.mquantiles(uniform_queue_rate_plots, quantiles, axis=0)

plt.plot(uniform_thresholds, median_precision)
plt.plot(uniform_thresholds, median_recall)
plt.plot(uniform_thresholds, median_queue_rate)

plt.fill_between(uniform_thresholds, upper_precision, lower_precision, alpha=0.5, linewidth=0, color=sns.color_palette()[0])
plt.fill_between(uniform_thresholds, upper_recall, lower_recall, alpha=0.5, linewidth=0, color=sns.color_palette()[1])
plt.fill_between(uniform_thresholds, upper_queue_rate, lower_queue_rate, alpha=0.5, linewidth=0, color=sns.color_palette()[2])

leg = plt.legend(('precision', 'recall', 'queue_rate'), frameon=True)
leg.get_frame().set_edgecolor('k')
plt.xlabel('threshold')
plt.ylabel('%')
Out[39]:
<matplotlib.text.Text at 0xc9a5748>
In [40]:
uniform_thresholds = np.linspace(0, 1, 101)

uniform_payout_plots = []

n = 10000
success_payoff = 4
case_cost = 5

for p in plot_data:
    uniform_payout = []
    for ut in uniform_thresholds:
        index = bisect.bisect_left(p['thresholds'], ut)
        precision = p['precision'][index]
        queue_rate = p['queue_rate'][index]
        
        payout = n*queue_rate*(precision*100 - case_cost)
        uniform_payout.append(payout)

    uniform_payout_plots.append(uniform_payout)

quantiles = [0.1, 0.5, 0.9]
lower_payout, median_payout, upper_payout = mstats.mquantiles(uniform_payout_plots, quantiles, axis=0)

plt.plot(uniform_thresholds, median_payout, color=sns.color_palette()[4])
plt.fill_between(uniform_thresholds, upper_payout, lower_payout, alpha=0.5, linewidth=0, color=sns.color_palette()[4])

max_ap = uniform_thresholds[np.argmax(median_payout)]
plt.vlines([max_ap], -100000, 150000, linestyles='--')
plt.ylim(-100000, 150000)

leg = plt.legend(('payout ($)', 'median argmax = {:.2f}'.format(max_ap)), frameon=True)
leg.get_frame().set_edgecolor('k')
plt.xlabel('threshold')
plt.title("Payout as a Function of Threshold")
plt.ylabel('$')
Out[40]:
<matplotlib.text.Text at 0x10525400>
In [41]:
print np.max(median_payout)
113922.155689
In [42]:
# Scoring Model
def ChurnModel(df, clf):
    # Convert yes no columns to bool
    # yes_no_cols already known, stored as a global variable
    df[yes_no_cols] = df[yes_no_cols] == 'yes'
    # features already known, stored as a global variable
    X = df[features].as_matrix().astype(np.float)
    X = scaler.transform(X)
    
    """
    Calculates probability of churn and expected loss, 
    and gathers customer's contact info
    """     
    # Collect customer meta data
    response = df[['Area Code','Phone']]
    charges = ['Day Charge','Eve Charge','Night Charge','Intl Charge']
    response['customer_worth'] = df[charges].sum(axis=1)    
    
    # Make prediction
    churn_prob = clf.predict_proba(X)
    response['churn_prob'] = churn_prob[:,1]
    
    # Calculate expected loss
    response['expected_loss'] = response['churn_prob'] * response['customer_worth']
    response = response.sort('expected_loss', ascending=False)
    
    # Return response DataFrame
    return response
In [43]:
# Simulated new data 
df = pd.read_csv("data/churn.csv")    
train_index,test_index = train_test_split(df.index, random_state = 4)
test_df = df.ix[test_index]

# Apply new data to the model
ChurnModel(test_df, CV_rfc.best_estimator_)
Out[43]:
Area Code Phone customer_worth churn_prob expected_loss
3205 408 345-3787 89.76 0.990 88.862
3132 415 394-5489 87.35 0.990 86.477
289 510 352-6976 89.31 0.965 86.184
2113 408 335-2967 85.09 1.000 85.090
1701 415 374-1981 84.08 0.990 83.239
832 408 335-1874 83.14 0.995 82.724
3272 510 373-7974 82.96 0.995 82.545
1350 408 357-6039 82.58 0.980 80.928
306 415 419-1714 81.00 0.995 80.595
2028 510 408-4836 84.68 0.950 80.446
2103 408 413-2194 80.07 0.995 79.670
2747 408 376-4292 80.71 0.980 79.096
2908 415 406-7844 79.39 0.985 78.199
331 415 387-5453 78.93 0.990 78.141
76 415 374-5353 81.75 0.940 76.845
2210 415 367-3220 78.22 0.975 76.264
2186 415 327-8495 80.39 0.940 75.567
454 408 405-6189 79.89 0.940 75.097
301 415 416-1676 78.74 0.950 74.803
415 415 352-4418 77.88 0.960 74.765
2595 408 337-4600 78.67 0.950 74.736
2573 415 344-1970 79.39 0.940 74.627
3045 415 348-5728 76.34 0.975 74.431
546 510 418-6455 78.67 0.930 73.163
859 408 374-9203 79.37 0.905 71.830
574 510 419-1674 77.54 0.925 71.725
3168 415 415-5476 78.15 0.915 71.507
430 510 365-5979 77.07 0.925 71.290
1984 415 339-6477 77.14 0.920 70.969
1614 415 417-4810 76.98 0.915 70.437
... ... ... ... ... ...
2778 415 396-6390 56.69 0.000 0.000
2352 408 404-2877 57.70 0.000 0.000
308 510 341-3464 60.56 0.000 0.000
28 415 353-2630 64.12 0.000 0.000
3296 510 380-3186 61.48 0.000 0.000
658 415 392-9342 47.91 0.000 0.000
1031 415 411-6663 48.44 0.000 0.000
1613 415 390-3565 61.70 0.000 0.000
1215 510 383-2017 62.74 0.000 0.000
79 408 333-1967 56.46 0.000 0.000
2171 510 355-2293 60.11 0.000 0.000
637 408 333-9253 54.17 0.000 0.000
459 415 361-8239 51.40 0.000 0.000
102 415 354-3783 46.85 0.000 0.000
664 408 358-8729 62.01 0.000 0.000
1749 415 358-5274 51.90 0.000 0.000
3325 408 368-8555 56.28 0.000 0.000
1572 510 352-4541 57.07 0.000 0.000
262 415 340-3182 53.76 0.000 0.000
1994 510 369-2899 49.50 0.000 0.000
3195 510 399-7029 64.19 0.000 0.000
1131 408 333-3447 65.99 0.000 0.000
182 408 405-2888 50.17 0.000 0.000
1055 415 362-4685 51.22 0.000 0.000
2334 415 404-8765 40.57 0.000 0.000
3196 510 337-3868 58.15 0.000 0.000
1681 510 345-8350 56.20 0.000 0.000
1546 415 331-5919 49.48 0.000 0.000
1657 408 411-5078 47.23 0.000 0.000
2398 408 333-9133 54.20 0.000 0.000

834 rows × 5 columns