# 用户流失预测(KNN SVC RF) — customer churn prediction with KNN, SVC and random forest


import pandas as pda
import numpy as np
import missingno
import matplotlib.pyplot as plt

# Load the raw churn table; one row per customer.
userData = pda.read_csv("churn.csv")
print(userData.shape)
# Missing-value check (none were found in this dataset):
# missingno.bar(userData, color="g")
# plt.show()

# Binary target: 1 where the "Churn?" column equals "True.", 0 otherwise.
data = userData["Churn?"]
y = np.where(data == "True.", 1, 0)

# Remove the columns that are not used as features, including the label.
drop_column = ["State", "Area Code", "Phone", "Churn?"]
userData = userData.drop(drop_column, axis=1)

# Encode the yes/no plan flags as 1/0. Any other value is left untouched so
# that the float conversion below still fails loudly on unexpected input.
# `Series.map` is used instead of `DataFrame.ix`, which was removed in
# pandas 1.0.
plan_codes = {"yes": 1, "no": 0}
for plan_column in ("Int'l Plan", "VMail Plan"):
    userData[plan_column] = userData[plan_column].map(
        lambda value: plan_codes.get(value, value)
    )

features = userData.columns
# `.values` and the builtin `float` replace `as_matrix()` (removed in
# pandas 1.0) and the `np.float` alias (removed in NumPy 1.24).
X = userData.values.astype(float)
from sklearn.preprocessing import StandardScaler
# Standardise every feature column before fitting the classifiers.
X = StandardScaler().fit_transform(X)
print(X[0])
print(X.shape)
# Number of samples labelled 0 (not churned).
print(len(y[y == 0]))

try:
    # Legacy location of KFold; the module was removed in scikit-learn 0.20.
    from sklearn.cross_validation import KFold
except ImportError:
    # Keep the module importable on current scikit-learn. Note that this
    # class takes different constructor arguments than the legacy one.
    from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.svm import SVC

def run_cv(X, y, model, **kwargs):
    """Return out-of-fold class predictions from shuffled 5-fold CV.

    Args:
        X: Feature matrix that can be indexed with integer index arrays
            (e.g. a 2-D NumPy array).
        y: Label array; must support ``copy()`` and integer-array indexing.
        model: Estimator class (not an instance). A fresh estimator is
            created for every fold.
        **kwargs: Keyword arguments forwarded to ``model`` on construction.

    Returns:
        A copy of ``y`` in which each entry is replaced by the prediction of
        the estimator trained on the folds that do not contain that sample.
    """
    # Resolve the splitter here rather than relying on the module-level
    # ``KFold`` name, because the legacy and current classes have different
    # constructors and iteration protocols.
    try:
        from sklearn.model_selection import KFold as _KFold
    except ImportError:
        # scikit-learn < 0.18 only ships the legacy splitter, which takes
        # the sample count up front and is iterated directly.
        from sklearn.cross_validation import KFold as _LegacyKFold
        folds = _LegacyKFold(len(y), n_folds=5, shuffle=True)
    else:
        folds = _KFold(n_splits=5, shuffle=True).split(X)

    y_pred = y.copy()
    for train_index, test_index in folds:
        clf = model(**kwargs)
        clf.fit(X[train_index], y[train_index])
        y_pred[test_index] = clf.predict(X[test_index])
    return y_pred

def accuracy(y_true, y_pred):
    """Return the fraction of positions where ``y_pred`` equals ``y_true``.

    Both arguments are converted with ``np.asarray`` first so that plain
    lists and tuples are compared element by element. Without the
    conversion ``list == list`` yields a single bool and the result could
    only ever be 0.0 or 1.0.

    Args:
        y_true: Array-like of reference labels.
        y_pred: Array-like of predicted labels.

    Returns:
        The mean of the element-wise equality mask.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    return np.mean(y_pred == y_true)

# Report the cross-validated accuracy of each classifier with its default
# settings, in the order SVC, random forest, k-nearest neighbours.
for label, estimator in (("SVC", SVC), ("RF", RF), ("KNN", KNN)):
    print(label, accuracy(y, run_cv(X, y, estimator)))

def run_prob_cv(X, y, model, **kwargs):
    """Return out-of-fold class probabilities from shuffled 5-fold CV.

    Args:
        X: Feature matrix that can be indexed with integer index arrays
            (e.g. a 2-D NumPy array).
        y: Label array supporting ``len()`` and integer-array indexing.
        model: Estimator class (not an instance) whose instances provide
            ``fit`` and ``predict_proba``. A fresh estimator is created for
            every fold.
        **kwargs: Keyword arguments forwarded to ``model`` on construction.

    Returns:
        A float array of shape ``(len(y), 2)``; each row holds the
        ``predict_proba`` output for that sample from the estimator trained
        on the folds that do not contain it. The fixed width of 2 means the
        function is written for two-class problems.
    """
    # Resolve the splitter here rather than relying on the module-level
    # ``KFold`` name, because the legacy and current classes have different
    # constructors and iteration protocols.
    try:
        from sklearn.model_selection import KFold as _KFold
    except ImportError:
        # scikit-learn < 0.18 only ships the legacy splitter, which takes
        # the sample count up front and is iterated directly.
        from sklearn.cross_validation import KFold as _LegacyKFold
        folds = _LegacyKFold(len(y), n_folds=5, shuffle=True)
    else:
        folds = _KFold(n_splits=5, shuffle=True).split(X)

    y_prob = np.zeros((len(y), 2))
    for train_index, test_index in folds:
        clf = model(**kwargs)
        clf.fit(X[train_index], y[train_index])
        y_prob[test_index] = clf.predict_proba(X[test_index])
    return y_prob
#
# Out-of-fold class probabilities from a 10-tree random forest.
pred_prob = run_prob_cv(X, y, RF, n_estimators=10)

# Second probability column; with the 0/1 labels built above this should be
# the predicted probability of churn (label 1).
pred_p = pred_prob[:, 1]
print(pred_p)
is_lost = y == 1

# How many samples received each distinct predicted probability.
# `Series.value_counts` replaces the deprecated top-level `pda.value_counts`.
counts = pda.Series(pred_p).value_counts()
print("counts:")
print(counts)
true_prob = {}
print("===========================================")
# For every distinct predicted probability, the observed share of churners
# among the samples that were given that probability.
for prob in counts.index:
    true_prob[prob] = np.mean(is_lost[pred_p == prob])
    print(true_prob[prob])
# Convert once, after the loop; doing it inside the loop rebuilt the Series
# on every iteration.
true_prob = pda.Series(true_prob)

# Side-by-side table: predicted probability, sample count, observed rate.
counts = pda.concat([counts, true_prob], axis=1).reset_index()
counts.columns = ["pred_prob", "count", "true_prob"]
print(counts)
# Builtin `float` replaces the `np.float` alias removed in NumPy 1.24.
counts["pred_prob"] = counts["pred_prob"].astype(float)
sortcounts = counts.sort_values(by=['pred_prob'], ascending=[True])
print(sortcounts)
#

# 你可能感兴趣的:(Python)