from __future__ import division
import pandas as pd
import numpy as np
# Load the churn data set.
# NOTE(review): the path is a hard-coded absolute Windows location; adjust it
# for the machine this runs on.
churn_df = pd.read_csv("D:\\test\\machineLearning\\churn.csv")

# Show every column name so the feature-selection steps can be followed.
col_names = churn_df.columns.tolist()
# Single-argument print(...) emits the same text under Python 2 and Python 3.
print("Column_names:")
print(col_names)

# Preview only the first six and last six columns to keep the sample narrow.
to_show = col_names[:6] + col_names[-6:]
print("\nSample_data:")
# Bare expression: it is rendered only by an interactive/notebook session and
# has no visible effect when the file is run as a plain script.
churn_df[to_show].head(3)
Column_names:
['State', 'Account Length', 'Area Code', 'Phone', "Int'l Plan", 'VMail Plan', 'VMail Message', 'Day Mins', 'Day Calls', 'Day Charge', 'Eve Mins', 'Eve Calls', 'Eve Charge', 'Night Mins', 'Night Calls', 'Night Charge', 'Intl Mins', 'Intl Calls', 'Intl Charge', 'CustServ Calls', 'Churn?']

Sample_data:
|   | State | Account Length | Area Code | Phone | Int'l Plan | VMail Plan | Night Charge | Intl Mins | Intl Calls | Intl Charge | CustServ Calls | Churn? |
|---|-------|----------------|-----------|-------|------------|------------|--------------|-----------|------------|-------------|----------------|--------|
| 0 | KS | 128 | 415 | 382-4657 | no | yes | 11.01 | 10.0 | 3 | 2.70 | 1 | False. |
| 1 | OH | 107 | 415 | 371-7191 | no | yes | 11.45 | 13.7 | 3 | 3.70 | 1 | False. |
| 2 | NJ | 137 | 415 | 358-1921 | no | no | 7.32 | 12.2 | 5 | 3.29 | 0 | False. |
# Binary target: 1 where the raw "Churn?" label equals 'True.', otherwise 0.
churn_result = churn_df["Churn?"]
y = np.where(churn_result == 'True.', 1, 0)

# Remove the columns that are not used as features, including the label.
to_drop = ['State', 'Area Code', 'Phone', 'Churn?']
churn_feat_space = churn_df.drop(to_drop, axis=1)

# Replace the 'yes'/'no' strings with booleans so the frame casts to float.
yes_no_cols = ["Int'l Plan", "VMail Plan"]
churn_feat_space[yes_no_cols] = churn_feat_space[yes_no_cols] == 'yes'

# NOTE(review): `feaures` looks like a typo for `features`; the name is kept
# unchanged because code outside this section may read it.
feaures = churn_feat_space.columns

# `.values` and the builtin `float` are drop-in equivalents of the removed
# `DataFrame.as_matrix()` and `np.float`, and work on old and new releases.
X = churn_feat_space.values.astype(float)

# Standardise every feature column with StandardScaler.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Single-argument print(...) emits the same text under Python 2 and Python 3;
# "%s" renders the label array exactly as the old comma-separated print did.
print("Feature space holds %d observations and %d features" % X.shape)
print("Unique target labels: %s" % (np.unique(y),))
print(X[0])
# Number of observations in the negative (label 0) class.
print(len(y[y == 0]))
Feature space holds 3333 observations and 17 features
Unique target labels: [0 1]
[ 0.67648946 -0.32758048 1.6170861 1.23488274 1.56676695 0.47664315 1.56703625 -0.07060962 -0.05594035 -0.07042665 0.86674322 -0.46549436 0.86602851 -0.08500823 -0.60119509 -0.0856905 -0.42793202]
2850
from sklearn.cross_validation import KFold
def run_cv(X, y, clf_class, **kwargs):
    """Return out-of-fold predictions from shuffled 5-fold cross-validation.

    For each fold a fresh ``clf_class(**kwargs)`` is fitted on the training
    rows and used to predict the held-out rows, so every entry of the result
    comes from a model that was not fitted on that row.

    Args:
        X: Feature array; rows are selected with integer index arrays.
        y: Label array aligned with the rows of ``X``; must provide
            ``copy()`` and integer-array indexing (e.g. a NumPy array).
        clf_class: Estimator class whose instances expose ``fit`` and
            ``predict``.
        **kwargs: Forwarded unchanged to the ``clf_class`` constructor.

    Returns:
        A copy of ``y`` in which each entry has been overwritten with the
        prediction made for it while it was in the held-out fold.
    """
    # `sklearn.cross_validation` no longer exists in current scikit-learn;
    # prefer `model_selection` and fall back to the legacy module so the
    # function runs on both old and new installations.
    try:
        from sklearn.model_selection import KFold
    except ImportError:
        from sklearn.cross_validation import KFold
        folds = KFold(len(y), n_folds=5, shuffle=True)
    else:
        folds = KFold(n_splits=5, shuffle=True).split(X)

    # Start from a copy of y so the output has the same dtype and length.
    y_pred = y.copy()
    for train_index, test_index in folds:
        clf = clf_class(**kwargs)
        clf.fit(X[train_index], y[train_index])
        y_pred[test_index] = clf.predict(X[test_index])
    return y_pred
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.neighbors import KNeighborsClassifier as KNN
def accuracy(y_true, y_pred):
    """Return the fraction of positions at which the two label sequences agree.

    Args:
        y_true: Reference labels (NumPy array or any array-like).
        y_pred: Predicted labels, aligned element-for-element with ``y_true``.

    Returns:
        The mean of the element-wise equality test, a float in [0, 1].
    """
    # Coerce to arrays so `==` is element-wise even for plain lists; on two
    # lists it would yield one bool and the mean would be only 0.0 or 1.0.
    return np.mean(np.asarray(y_true) == np.asarray(y_pred))
# Print the out-of-fold accuracy of each classifier, three decimals each.
# Single-argument print(...) emits the same text under Python 2 and Python 3.
for label, clf_class in (
    ("Support vector machines:", SVC),
    ("Random forest:", RF),
    ("K-nearest-neighbors:", KNN),
):
    print(label)
    print("%.3f" % accuracy(y, run_cv(X, y, clf_class)))
Support vector machines:
0.913
Random forest:
0.942
K-nearest-neighbors:
0.897
from sklearn.cross_validation import KFold
def run_prob_cv(X, y, clf_class, **kwargs):
    """Return out-of-fold class probabilities from shuffled 5-fold CV.

    For each fold a fresh ``clf_class(**kwargs)`` is fitted on the training
    rows and its ``predict_proba`` output is stored for the held-out rows.

    Args:
        X: Feature array; rows are selected with integer index arrays.
        y: Label array aligned with the rows of ``X``.
        clf_class: Estimator class whose instances expose ``fit`` and
            ``predict_proba``.
        **kwargs: Forwarded unchanged to the ``clf_class`` constructor.

    Returns:
        A float array of shape ``(len(y), n_classes)`` holding, for every
        row, the probabilities predicted while that row was held out. The
        column order is whatever ``predict_proba`` returns.
    """
    # `sklearn.cross_validation` no longer exists in current scikit-learn;
    # prefer `model_selection` and fall back to the legacy module so the
    # function runs on both old and new installations.
    try:
        from sklearn.model_selection import KFold
    except ImportError:
        from sklearn.cross_validation import KFold
        folds = KFold(len(y), n_folds=5, shuffle=True)
    else:
        folds = KFold(n_splits=5, shuffle=True).split(X)

    # One column per distinct label instead of a hard-coded 2; for a binary
    # target this is the same (len(y), 2) array as before.
    n_classes = len(np.unique(y))
    y_prob = np.zeros((len(y), n_classes))
    for train_index, test_index in folds:
        clf = clf_class(**kwargs)
        clf.fit(X[train_index], y[train_index])
        y_prob[test_index] = clf.predict_proba(X[test_index])
    return y_prob
import warnings
# NOTE(review): this is a blanket filter; it hides every warning raised from
# here on, not only the ones produced by the cross-validation below.
warnings.filterwarnings('ignore')

# Out-of-fold probabilities from a 10-tree random forest.
pred_prob = run_prob_cv(X, y, RF, n_estimators=10)
# Second probability column; with labels {0, 1} this is presumably the
# probability of class 1 (churn) -- confirm against the estimator's classes_.
pred_churn = pred_prob[:, 1]
is_churn = y == 1

# How many observations received each distinct predicted probability.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
counts = pd.Series(pred_churn).value_counts()

# Observed churn rate among the observations sharing each predicted value.
# It is built on the same index as `counts`, so the concat aligns row by row.
true_prob = pd.Series(
    [np.mean(is_churn[pred_churn == prob]) for prob in counts.index],
    index=counts.index,
)

counts = pd.concat([counts, true_prob], axis=1).reset_index()
counts.columns = ["pred_prob", "count", "true_prob"]
# Bare expression: rendered only by an interactive/notebook session.
counts
|    | pred_prob | count | true_prob |
|----|-----------|-------|-----------|
| 0  | 0.0 | 1779 | 0.029230 |
| 1  | 0.1 | 696 | 0.020115 |
| 2  | 0.2 | 265 | 0.060377 |
| 3  | 0.3 | 126 | 0.142857 |
| 4  | 0.8 | 91 | 0.978022 |
| 5  | 0.9 | 75 | 0.960000 |
| 6  | 0.4 | 73 | 0.438356 |
| 7  | 0.7 | 65 | 0.953846 |
| 8  | 0.5 | 57 | 0.561404 |
| 9  | 1.0 | 56 | 0.982143 |
| 10 | 0.6 | 50 | 0.820000 |