# File Name: Churn_Fun.ipynb
# load modules
from __future__ import division
import pandas as pd
pd.set_option('precision', 3)
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve
import seaborn as sns
from scipy import interp
import bisect
from scipy.stats import mstats
# Load in the r magic
%load_ext rpy2.ipython
# We need ggplot2
%R require(ggplot2)
# local modules
from churn_measurements import calibration, discrimination
print "~ Importing Data ~"
df = pd.read_csv("data/churn.csv")
print "~ Preprocessing Data ~"
col_names = df.columns.tolist()
print "Column names:"
print col_names
to_show = col_names[:6] + col_names[-6:]
print '\n Sample data:'
df[to_show].head(6)
print df.shape
print df.dtypes
# The number of numeric columns
len(df.select_dtypes(include=['int64','float64']).columns)
# The number of categorical columns
len(df.select_dtypes(include=['category','object']).columns)
# Check whether any column has missing data
for i in df.columns.tolist():
    k = sum(pd.isnull(df[i]))
    print (i, k)
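# (equivalently, pandas can report the per-column null counts in one call)
print df.isnull().sum()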
print "~ Exploring Data ~"
# numeric data
#print df.describe()  # same output as the line below
print df.describe(include=['int64','float64'])
# categorical and object data
print df.describe(include=['category','object'])
print df['Churn?'].value_counts()
print df.groupby(df['Churn?']).mean()
# Histogram
df.hist(sharex=False, sharey=False, xlabelsize=1, ylabelsize=1)
plt.show()
df.plot(kind='density', subplots=True, layout=(5,4), sharex=False,
legend=False, fontsize=1)
plt.show()
# Boxplots
df.plot(kind='box', subplots=True, layout=(5,4), sharex=False,
sharey=False, fontsize=1)
plt.show()
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(df.corr(), vmin=-1, vmax=1, interpolation='none')
fig.colorbar(cax)
# label ticks with the numeric columns only -- df.corr() silently drops the
# non-numeric ones, so using col_names here would misalign the labels
num_cols = df._get_numeric_data().columns
ticks = np.arange(0, len(num_cols), 1)
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(num_cols, rotation=90)
ax.set_yticklabels(num_cols)
plt.show()
print "~ Preparing Target and Features Data ~"
# Isolate target data
y = np.where(df['Churn?'] == 'True.', 1, 0)
# We don't need these columns
to_drop = ['State','Area Code','Phone','Churn?']
df = df.drop(to_drop, axis=1)
# 'yes'/'no' has to be converted to boolean values
# NumPy converts these from boolean to 1. and 0. later
# yes_no_cols will be re-used for later scoring
yes_no_cols = ["Int'l Plan","VMail Plan"]
df[yes_no_cols] = df[yes_no_cols] == 'yes'
# Pull out features for later scoring
features = df.columns
# feature variables
X = df.as_matrix().astype(np.float)
# Importances of features
train_index,test_index = train_test_split(df.index, random_state=4)
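# NB: train_index/test_index are module-level and re-used further down
# (the profit curves, final RF evaluation, and grid search all score
# against this same split)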
forest = RF()
forest_fit = forest.fit(X[train_index], y[train_index])
forest_predictions = forest_fit.predict(X[test_index])
importances = forest_fit.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
# rank all features by importance and keep the top 10
indices = np.argsort(importances)[::-1][:10]
# Print the feature ranking
print("Feature ranking:")
for f in range(10):
    print("%d. %s (%f)" % (f + 1, features[indices[f]], importances[indices[f]]))
# Plot the feature importances of the forest
#import pylab as pl
plt.figure()
plt.title("Feature importances")
plt.bar(range(10), importances[indices], yerr=std[indices], color="r", align="center")
plt.xticks(range(10), features[indices], rotation=90)
plt.xlim([-1, 10])
plt.show()
print "~ Transforming Data ~"
scaler = StandardScaler()
X = scaler.fit_transform(X)
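# The fitted scaler is kept around on purpose: ChurnModel() at the end of
# the notebook calls scaler.transform() to put new data on the same scale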
print "Feature space holds %d observations and %d features" % X.shape
print "Unique target labels:", np.unique(y)
print "~ Building K-Fold Cross-Validations ~"
def run_cv(X, y, clf):
    # construct a K-Fold object
    kf = KFold(n_splits=5, shuffle=True, random_state=4)
    y_pred = y.copy()
    # iterate through folds
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        clf.fit(X_train, y_train)
        y_pred[test_index] = clf.predict(X_test)
    return y_pred
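# run_cv yields an out-of-fold prediction for every observation: each row is
# predicted by a model fitted on the other folds, so the scores below are
# cross-validated rather than in-sample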
print "~ Evaluating Models ~"
def accuracy(y_true, y_pred):
    return np.mean(y_true == y_pred)
print "Logistic Regression:"
print "Accuracy = %.3f" % accuracy(y, run_cv(X,y,LR()))
print "Gradient Boosting Classifier:"
print "Accuracy = %.3f" % accuracy(y, run_cv(X,y,GBC()))
print "Support vector machines:"
print "Accuracy = %.3f" % accuracy(y, run_cv(X,y,SVC()))
print "Random forest:"
print "Accuracy = %.3f" % accuracy(y, run_cv(X,y,RF()))
print "K-nearest-neighbors:"
print "Accuracy = %.3f" % accuracy(y, run_cv(X,y,KNN()))
# F1-Scores and Confusion Matrices
def draw_confusion_matrices(confusion_matrices, class_names):
    class_names = class_names.tolist()
    for cm in confusion_matrices:
        classifier, cm = cm[0], cm[1]
        print(cm)
        fig = plt.figure()
        ax = fig.add_subplot(111)
        cax = ax.matshow(cm)
        plt.title('Confusion matrix for %s' % classifier)
        fig.colorbar(cax)
        ax.set_xticklabels([''] + class_names)
        ax.set_yticklabels([''] + class_names)
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.show()
y = np.array(y)
class_names = np.unique(y)
confusion_matrices = {
    1: {
        'matrix': confusion_matrix(y, run_cv(X, y, LR())),
        'title': 'Logistic Regression',
    },
    2: {
        'matrix': confusion_matrix(y, run_cv(X, y, GBC())),
        'title': 'Gradient Boosting Classifier',
    },
    3: {
        'matrix': confusion_matrix(y, run_cv(X, y, SVC())),
        'title': 'Support Vector Machine',
    },
    4: {
        'matrix': confusion_matrix(y, run_cv(X, y, RF())),
        'title': 'Random Forest',
    },
    5: {
        'matrix': confusion_matrix(y, run_cv(X, y, KNN())),
        'title': 'K Nearest Neighbors',
    },
}
fig, ax = plt.subplots(figsize=(16, 12))
plt.suptitle('Confusion Matrix of Various Classifiers')
for ii, values in confusion_matrices.items():
    matrix = values['matrix']
    title = values['title']
    plt.subplot(3, 3, ii)  # subplot numbering starts from 1
    plt.title(title)
    sns.heatmap(matrix, annot=True, fmt='')
print "Logisitic Regression F1 Score" ,f1_score(y,run_cv(X,y,LR()))
print "Gradient Boosting Classifier F1 Score" ,f1_score(y,run_cv(X,y,GBC()))
print "Support Vector Machines F1 Score" ,f1_score(y,run_cv(X,y,SVC()))
print "Random Forest F1 Score" ,f1_score(y,run_cv(X,y,RF()))
print "K-Nearest-Neighbors F1 Score" ,f1_score(y,run_cv(X,y,KNN()))
# ROC plots
def plot_roc(X, y, clf):
    kf = KFold(n_splits=5, shuffle=True, random_state=4)
    y_prob = np.zeros((len(y), 2))
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    i = 0
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        clf.fit(X_train, y_train)
        # Predict probabilities, not classes
        y_prob[test_index] = clf.predict_proba(X_test)
        fpr, tpr, thresholds = roc_curve(y[test_index], y_prob[test_index, 1])
        # interpolate each fold's curve onto a common FPR grid
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))
        i = i + 1
    mean_tpr /= kf.get_n_splits(X)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, 'k--', label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)
    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Random')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.show()
print "Logisitic Regression:"
plot_roc(X,y,LR())
print "Gradient Boosting Classifier:"
plot_roc(X,y,GBC())
print "Support vector machines:"
plot_roc(X,y,SVC(probability=True))
print "Random forests:"
plot_roc(X,y,RF(n_estimators=18))
print "K-nearest-neighbors:"
plot_roc(X,y,KNN())
print "~ Building K-Fold Cross-Validations with Probabilities ~"
def run_prob_cv(X, y, clf):
    kf = KFold(n_splits=5, shuffle=True, random_state=4)
    y_prob = np.zeros((len(y), 2))
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        clf.fit(X_train, y_train)
        # Predict probabilities, not classes
        y_prob[test_index] = clf.predict_proba(X_test)
    return y_prob
print "~ Calculating Calibration and Discrimination ~"
# Take random forest (RF) as an example
pred_prob = run_prob_cv(X, y, RF(n_estimators=10, random_state=4))
# Use 10 estimators so predictions are all multiples of 0.1
pred_churn = pred_prob[:,1].round(1)
is_churn = y == 1
# Number of times a predicted probability is assigned to an observation
counts = pd.value_counts(pred_churn)
counts[:]
# calculate the empirical (true) churn rate for each predicted probability
true_prob = {}
for prob in counts.index:
    true_prob[prob] = np.mean(is_churn[pred_churn == prob])
true_prob = pd.Series(true_prob)
counts = pd.concat([counts,true_prob], axis=1).reset_index()
counts.columns = ['pred_prob', 'count', 'true_prob']
counts.sort_values('pred_prob').reset_index().drop(['index'], axis=1)
baseline = np.mean(is_churn)
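# Reading the calibration plot below: a well-calibrated model hugs the red
# identity line (true_prob ~ pred_prob); the green line is the overall
# churn-rate baseline, and point size shows how many predictions fall in
# each probability bucket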
%%R -i counts,baseline -w 800 -h 600 -u px
ggplot(counts,aes(x=pred_prob,y=true_prob,size=count)) +
geom_point(color='blue') +
stat_function(fun = function(x){x}, color='red') +
stat_function(fun = function(x){baseline}, color='green') +
xlim(-0.05, 1.05) + ylim(-0.05,1.05) +
ggtitle("RF") +
xlab("Predicted probability") + ylab("Relative of outcome")
def print_measurements(pred_prob):
    churn_prob, is_churn = pred_prob[:, 1], y == 1
    print " %-20s %.4f" % ("Calibration Error", calibration(churn_prob, is_churn))
    print " %-20s %.4f" % ("Discrimination", discrimination(churn_prob, is_churn))
    print "Note -- lower calibration error is better, higher discrimination is better"
print "Logistic Regression:"
print_measurements(run_prob_cv(X,y,LR()))
print "Gradient Boosting Classifier:"
print_measurements(run_prob_cv(X,y,GBC()))
print "Support vector machines:"
print_measurements(run_prob_cv(X,y,SVC(probability=True)))
print "Random forests:"
print_measurements(run_prob_cv(X,y,RF(n_estimators=18)))
print "K-nearest-neighbors:"
print_measurements(run_prob_cv(X,y,KNN()))
print '~ Profit Curves ~'
def confusion_rates(cm):
    tn = cm[0][0]
    fp = cm[0][1]
    fn = cm[1][0]
    tp = cm[1][1]
    N = fp + tn  # actual negatives
    P = tp + fn  # actual positives
    tpr = tp / P
    fpr = fp / N
    fnr = fn / P
    tnr = tn / N
    rates = np.array([[tpr, fpr], [fnr, tnr]])
    return rates
def profit_curve(classifiers):
    for name, clf in classifiers:
        fit = clf.fit(X[train_index], y[train_index])
        # class-0 probabilities sorted ascending == churn scores descending
        probabilities = np.array([prob[0] for prob in fit.predict_proba(X[test_index])])
        profit = []
        indices = np.argsort(probabilities)
        for idx in xrange(len(indices)):
            # flag the idx highest-scoring instances as predicted churners
            pred_true = indices[:idx]
            ctr = np.arange(indices.shape[0])
            masked_prediction = np.in1d(ctr, pred_true)
            cm = confusion_matrix(y_test.astype(int), masked_prediction.astype(int))
            rates = confusion_rates(cm)
            profit.append(np.sum(np.multiply(rates, cb)))
        plt.plot(np.arange(len(indices)) / len(indices) * 100, profit, label=name)
    plt.legend(loc="lower right")
    plt.title("Profits of classifiers")
    plt.xlabel("Percentage of test instances (decreasing by score)")
    plt.ylabel("Profit")
    plt.show()
y_test = y[test_index].astype(float)
# Cost-Benefit Matrix
cb = np.array([[4, -5],
[0, 0]])
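# Quick sanity check of the profit arithmetic: under this cost-benefit
# matrix, expected profit per test instance is tpr*4 + fpr*(-5). For
# illustration, a hypothetical classifier with tpr=0.6 and fpr=0.1 yields:
example_rates = np.array([[0.6, 0.1],
                          [0.4, 0.9]])
print np.sum(example_rates * cb)  # 0.6*4 - 0.1*5 = 1.9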
# Define classifiers for comparison
classifiers = [("Logistic Regression",LR()),
("Gradient Boosting Classifier", GBC()),
("Support Vector Machines",SVC(probability=True)),
("Random Forest", RF(n_estimators=18)),
("K-nearest-neighbors:", KNN())
]
# Plot profit curves
profit_curve(classifiers)
forest = RF(n_estimators=18, random_state=4)
forest_fit = forest.fit(X[train_index], y[train_index])
predictions = forest_fit.predict(X[test_index])
print confusion_matrix(y[test_index],predictions)
print accuracy_score(y[test_index],predictions)
print classification_report(y[test_index],predictions)
# Grid Search and Hyper Parameters
rfc = RF(n_jobs=-1, max_features='sqrt', n_estimators=50, oob_score=True, random_state=4)
param_grid = {
    'n_estimators': [5, 10, 20, 40, 80, 160, 200],
    #'min_samples_leaf': [1, 5, 10, 50, 100, 200],
    'max_features': ['auto', 'sqrt', 'log2']  # 'auto' is equivalent to 'sqrt' for classifiers
}
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5)
CV_rfc.fit(X[train_index], y[train_index])
means = CV_rfc.cv_results_['mean_test_score']
stds = CV_rfc.cv_results_['std_test_score']
print("Best: %f using %s with %s" % (CV_rfc.best_score_, CV_rfc.best_params_, CV_rfc.best_estimator_))
for params, mean_score, std_score in zip(CV_rfc.cv_results_['params'], means, stds):
    print("%f (%f) with: %r" % (mean_score, std_score, params))
forest = CV_rfc.best_estimator_
forest_fit = forest.fit(X[train_index], y[train_index])
predictions = forest_fit.predict(X[test_index])
print confusion_matrix(y[test_index],predictions)
print accuracy_score(y[test_index],predictions)
print classification_report(y[test_index],predictions)
print "Random forests senstivity analysis Train Data:"
plot_roc(X[train_index],y[train_index],CV_rfc.best_estimator_)
print "Random forests senstivity analysis Test Data:"
plot_roc(X[test_index],y[test_index],CV_rfc.best_estimator_)
print "~ Threshold Value ~"
clf = CV_rfc.best_estimator_
n_trials = 50
test_size_percent = 0.1
signals = X
labels = y
plot_data = []
train_signals, test_signals, train_labels, test_labels = train_test_split(signals, labels, test_size=test_size_percent)
clf.fit(train_signals, train_labels)
predictions = clf.predict_proba(test_signals)[:,1]
precision, recall, thresholds = precision_recall_curve(test_labels, predictions)
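# precision_recall_curve returns one fewer threshold than it does
# precision/recall points, so append a final threshold of 1 to make the
# arrays the same length for plotting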
thresholds = np.append(thresholds, 1)
# queue_rate: the share of customers scored at or above each threshold,
# i.e. the fraction of the base we would contact
queue_rate = []
for threshold in thresholds:
    queue_rate.append((predictions >= threshold).mean())
plt.plot(thresholds, precision, color=sns.color_palette()[0])
plt.plot(thresholds, recall, color=sns.color_palette()[1])
plt.plot(thresholds, queue_rate, color=sns.color_palette()[2])
leg = plt.legend(('precision', 'recall', 'queue_rate'), frameon=True)
leg.get_frame().set_edgecolor('k')
plt.xlabel('threshold')
plt.ylabel('%')
clf = CV_rfc.best_estimator_
n_trials = 50
test_size_percent = 0.1
signals = X
labels = y
plot_data = []
for trial in range(n_trials):
    train_signals, test_signals, train_labels, test_labels = train_test_split(signals, labels, test_size=test_size_percent)
    clf.fit(train_signals, train_labels)
    predictions = clf.predict_proba(test_signals)[:, 1]
    precision, recall, thresholds = precision_recall_curve(test_labels, predictions)
    thresholds = np.append(thresholds, 1)
    queue_rate = []
    for threshold in thresholds:
        queue_rate.append((predictions >= threshold).mean())
    plot_data.append({
        'thresholds': thresholds,
        'precision': precision,
        'recall': recall,
        'queue_rate': queue_rate
    })
for p in plot_data:
    plt.plot(p['thresholds'], p['precision'], color=sns.color_palette()[0], alpha=0.5)
    plt.plot(p['thresholds'], p['recall'], color=sns.color_palette()[1], alpha=0.5)
    plt.plot(p['thresholds'], p['queue_rate'], color=sns.color_palette()[2], alpha=0.5)
leg = plt.legend(('precision', 'recall', 'queue_rate'), frameon=True)
leg.get_frame().set_edgecolor('k')
plt.xlabel('threshold')
uniform_thresholds = np.linspace(0, 1, 101)
uniform_precision_plots = []
uniform_recall_plots = []
uniform_queue_rate_plots = []
for p in plot_data:
    uniform_precision = []
    uniform_recall = []
    uniform_queue_rate = []
    # sample each trial's curves onto the shared threshold grid
    for ut in uniform_thresholds:
        index = bisect.bisect_left(p['thresholds'], ut)
        uniform_precision.append(p['precision'][index])
        uniform_recall.append(p['recall'][index])
        uniform_queue_rate.append(p['queue_rate'][index])
    uniform_precision_plots.append(uniform_precision)
    uniform_recall_plots.append(uniform_recall)
    uniform_queue_rate_plots.append(uniform_queue_rate)
quantiles = [0.1, 0.5, 0.9]
lower_precision, median_precision, upper_precision = mstats.mquantiles(uniform_precision_plots, quantiles, axis=0)
lower_recall, median_recall, upper_recall = mstats.mquantiles(uniform_recall_plots, quantiles, axis=0)
lower_queue_rate, median_queue_rate, upper_queue_rate = mstats.mquantiles(uniform_queue_rate_plots, quantiles, axis=0)
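# The median curves plus shaded 10th-90th percentile bands below show how
# sensitive precision, recall, and queue rate are to the random train/test
# split across the 50 trials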
plt.plot(uniform_thresholds, median_precision)
plt.plot(uniform_thresholds, median_recall)
plt.plot(uniform_thresholds, median_queue_rate)
plt.fill_between(uniform_thresholds, upper_precision, lower_precision, alpha=0.5, linewidth=0, color=sns.color_palette()[0])
plt.fill_between(uniform_thresholds, upper_recall, lower_recall, alpha=0.5, linewidth=0, color=sns.color_palette()[1])
plt.fill_between(uniform_thresholds, upper_queue_rate, lower_queue_rate, alpha=0.5, linewidth=0, color=sns.color_palette()[2])
leg = plt.legend(('precision', 'recall', 'queue_rate'), frameon=True)
leg.get_frame().set_edgecolor('k')
plt.xlabel('threshold')
plt.ylabel('%')
uniform_thresholds = np.linspace(0, 1, 101)
uniform_payout_plots = []
n = 10000
success_payoff = 4
case_cost = 5
for p in plot_data:
    uniform_payout = []
    for ut in uniform_thresholds:
        index = bisect.bisect_left(p['thresholds'], ut)
        precision = p['precision'][index]
        queue_rate = p['queue_rate'][index]
        # n*queue_rate customers get contacted; each is credited
        # precision*100 and charged the case_cost
        payout = n * queue_rate * (precision * 100 - case_cost)
        uniform_payout.append(payout)
    uniform_payout_plots.append(uniform_payout)
quantiles = [0.1, 0.5, 0.9]
lower_payout, median_payout, upper_payout = mstats.mquantiles(uniform_payout_plots, quantiles, axis=0)
plt.plot(uniform_thresholds, median_payout, color=sns.color_palette()[4])
plt.fill_between(uniform_thresholds, upper_payout, lower_payout, alpha=0.5, linewidth=0, color=sns.color_palette()[4])
max_ap = uniform_thresholds[np.argmax(median_payout)]
plt.vlines([max_ap], -100000, 150000, linestyles='--')
plt.ylim(-100000, 150000)
leg = plt.legend(('payout ($)', 'median argmax = {:.2f}'.format(max_ap)), frameon=True)
leg.get_frame().set_edgecolor('k')
plt.xlabel('threshold')
plt.title("Payout as a Function of Threshold")
plt.ylabel('$')
print np.max(median_payout)
# Scoring Model
def ChurnModel(df, clf):
    """
    Calculates probability of churn and expected loss,
    and gathers customer's contact info
    """
    # Convert yes/no columns to bool
    # yes_no_cols already known, stored as a global variable
    df[yes_no_cols] = df[yes_no_cols] == 'yes'
    # features already known, stored as a global variable
    X = df[features].as_matrix().astype(np.float)
    # re-use the scaler fitted on the training features
    X = scaler.transform(X)
    # Collect customer meta data (copy so we don't write into a slice of df)
    response = df[['Area Code', 'Phone']].copy()
    charges = ['Day Charge', 'Eve Charge', 'Night Charge', 'Intl Charge']
    response['customer_worth'] = df[charges].sum(axis=1)
    # Make prediction
    churn_prob = clf.predict_proba(X)
    response['churn_prob'] = churn_prob[:, 1]
    # Calculate expected loss
    response['expected_loss'] = response['churn_prob'] * response['customer_worth']
    response = response.sort_values('expected_loss', ascending=False)
    # Return response DataFrame
    return response
# Simulated new data
df = pd.read_csv("data/churn.csv")
train_index, test_index = train_test_split(df.index, random_state=4)
test_df = df.loc[test_index]
# Apply new data to the model
ChurnModel(test_df, CV_rfc.best_estimator_)