机器学习项目实战之用户流失预警

from __future__ import division
import pandas as pd
import numpy as np

churn_df = pd.read_csv("D:\\test\\machineLearning\\churn.csv")
col_names = churn_df.columns.tolist()

print "Column_names:"
print col_names

to_show = col_names[:6]+col_names[-6:]
print "\nSample_data:"
churn_df[to_show].head(3)
Column_names: [‘State’, ‘Account Length’, ‘Area Code’, ‘Phone’, “Int’l Plan”, ‘VMail Plan’, ‘VMail Message’, ‘Day Mins’, ‘Day Calls’, ‘Day Charge’, ‘Eve Mins’, ‘Eve Calls’, ‘Eve Charge’, ‘Night Mins’, ‘Night Calls’, ‘Night Charge’, ‘Intl Mins’, ‘Intl Calls’, ‘Intl Charge’, ‘CustServ Calls’, ‘Churn?’] Sample_data:
State Account Length Area Code Phone Int’l Plan VMail Plan Night Charge Intl Mins Intl Calls Intl Charge CustServ Calls Churn?
0 KS 128 415 382-4657 no yes 11.01 10.0 3 2.70 1 False.
1 OH 107 415 371-7191 no yes 11.45 13.7 3 3.70 1 False.
2 NJ 137 415 358-1921 no no 7.32 12.2 5 3.29 0 False.
#将字符改变成数值,便于分析
#Churn是客户量流失的意思
churn_result = churn_df["Churn?"]
y = np.where(churn_result == 'True.',1,0)

#去掉一些特征
to_drop = ['State','Area Code','Phone','Churn?']
churn_feat_space = churn_df.drop(to_drop,axis=1)

#将这些yes和no转化为布尔值
yes_no_cols = ["Int'l Plan","VMail Plan"]
churn_feat_space[yes_no_cols] = churn_feat_space[yes_no_cols] == 'yes'

feaures = churn_feat_space.columns

X = churn_feat_space.as_matrix().astype(np.float)

#重点:不同的特征项有不同的值,如1-2,3万到4万,不同特征间的数值上的巨大差异会影响我们的分析
#例如作图的时候,所以我们需要统一将这些数据压缩到一定的区间上
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)

print "Feature space holds %d observations and %d features"% X.shape
print "Unique target labels:",np.unique(y)
print X[0]
print len(y[y == 0])
Feature space holds 3333 observations and 17 features Unique target labels: [0 1] [ 0.67648946 -0.32758048 1.6170861 1.23488274 1.56676695 0.47664315 1.56703625 -0.07060962 -0.05594035 -0.07042665 0.86674322 -0.46549436 0.86602851 -0.08500823 -0.60119509 -0.0856905 -0.42793202] 2850
from sklearn.cross_validation import KFold

#交叉验证函数:X是特征数据,y是label,clf_class是你选择的分类器,kwargs指定的参数
def run_cv(X,y,clf_class,**kwargs):
    # Construct a kfolds object
    kf = KFold(len(y),n_folds=5,shuffle=True)
    y_pred = y.copy()

    # Iterate through folds
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        # Initialize a classifier with key word arguments
        clf = clf_class(**kwargs)
        clf.fit(X_train,y_train)
        y_pred[test_index] = clf.predict(X_test)
    return y_pred
from sklearn.svm import SVC  #支持向量机
from sklearn.ensemble import RandomForestClassifier as RF  #随机森林
from sklearn.neighbors import KNeighborsClassifier as KNN  #k最近邻

def accuracy(y_true,y_pred):
    # NumPy interprets True and False as 1. and 0.
    return np.mean(y_true == y_pred)

#尝试使用多种分类器来验证效果
print "Support vector machines:"
print "%.3f" % accuracy(y, run_cv(X,y,SVC))
print "Random forest:"
print "%.3f" % accuracy(y, run_cv(X,y,RF))
print "K-nearest-neighbors:"
print "%.3f" % accuracy(y, run_cv(X,y,KNN))
Support vector machines: 0.913 Random forest: 0.942 K-nearest-neighbors: 0.897
#以上的准确率的意义并不大,对于客户来说,重要的是ROC指标FN,即我预测错了,认为客户不会流失,但是客户流失了

from sklearn.cross_validation import KFold

#交叉验证函数:X是特征数据,y是label,clf_class是你选择的分类器,kwargs指定的参数
def run_prob_cv(X,y,clf_class,**kwargs):
    # Construct a kfolds object
    kf = KFold(len(y),n_folds=5,shuffle=True)
    y_prob = np.zeros((len(y),2))

    # Iterate through folds
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        # Initialize a classifier with key word arguments
        clf = clf_class(**kwargs)
        clf.fit(X_train,y_train)
        y_prob[test_index] = clf.predict_proba(X_test)
    return y_prob
import warnings
warnings.filterwarnings('ignore')

pred_prob = run_prob_cv(X,y,RF,n_estimators=10)

pred_churn = pred_prob[:,1]
is_churn = y == 1

counts = pd.value_counts(pred_churn)

true_prob = {}
for prob in counts.index:
    true_prob[prob] = np.mean(is_churn[pred_churn == prob])
    true_prob = pd.Series(true_prob)

counts = pd.concat([counts,true_prob],axis=1).reset_index()
counts.columns = ["pred_prob","count","true_prob"]
counts
#通过观测以下数据进行预警,当实际的可能性是百分之30或者40时,对应的真实的用户流失情况,由用户选择阈值进行预警
pred_prob count true_prob
0 0.0 1779 0.029230
1 0.1 696 0.020115
2 0.2 265 0.060377
3 0.3 126 0.142857
4 0.8 91 0.978022
5 0.9 75 0.960000
6 0.4 73 0.438356
7 0.7 65 0.953846
8 0.5 57 0.561404
9 1.0 56 0.982143
10 0.6 50 0.820000

你可能感兴趣的:(python机器学习项目实战)