This case study builds a model on an imbalanced dataset of roughly 280,000 records to detect fraudulent users. Logistic regression is used, so the sigmoid function maps the model's output to a fraud probability; model selection combines cross-validation with L1 regularization, comparing the model's recall under different penalty parameters C. Confusion matrices of predicted versus actual labels are also plotted to make the different prediction outcomes easy to see, and the model's precision and recall are compared at different thresholds on the sigmoid output.
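For quick reference, these are the quantities compared throughout: the model's linear score z is squashed through the sigmoid to get a fraud probability, a threshold turns that probability into a 0/1 prediction, and the confusion matrix counts TP/FP/FN/TN, from which recall and precision follow:

    sigmoid(z) = 1 / (1 + e^(-z))      (predicted probability of fraud)
    Recall     = TP / (TP + FN)        (share of true fraud cases that are caught)
    Precision  = TP / (TP + FP)        (share of flagged cases that are truly fraud)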
Below is a sample of the data format; the comments in the code explain in detail how each step works and why it is done. Some library versions may raise errors.
Data format: (the sample rows are omitted here; as the code shows, each row of creditcard.csv has a Time column, anonymized feature columns, an Amount column, and the Class label, where Class = 1 marks fraud.)
Python source code:
#!/usr/bin/env python
# encoding: utf-8
"""
@Company: Institute of Fusion and Plasma Research, School of Electrical Engineering, Huazhong University of Science and Technology
@version: V1.0
@author: Victor
@contact: [email protected] or [email protected] 2018--2020
@software: PyCharm
@file: LG.py
@time: 2018/11/16 16:32
@Desc:
"""
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

data = pd.read_csv("creditcard.csv")
data.head()

## In this case study 'Class' is the label: 1 marks a fraudulent user, 0 a normal user.
count_classes = data['Class'].value_counts(sort=True).sort_index()  ## count each distinct value of Class, sorted
print(count_classes)
plt.figure(1)
count_classes.plot(kind='bar')  ### pandas' own plot method draws the bar chart directly
plt.title("Fraud class histogram")
plt.xlabel("Class")
plt.ylabel("Frequency")

from sklearn.preprocessing import StandardScaler
## Standardize Amount: otherwise the model would treat its raw magnitude as comparable
## to the other feature values, which distorts the fit.
data['normAmount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1)).ravel()
data = data.drop(['Time', 'Amount'], axis=1)
data.head(3)

X = data.loc[:, data.columns != 'Class']  ## all columns except Class
y = data.loc[:, data.columns == 'Class']
# X.head()
# y.head()

number_records_fraud = len(data[data.Class == 1])  ## number of samples with Class == 1
fraud_indices = np.array(data[data.Class == 1].index)  ### row indices of all fraud samples in the source file
## print(fraud_indices)
normal_indices = data[data.Class == 0].index

## Undersampling: keep only as many normal samples as there are fraud samples.
random_normal_indices = np.random.choice(normal_indices, number_records_fraud, replace=False)
# print(random_normal_indices)
random_normal_indices = np.array(random_normal_indices)  ## convert to an array for convenience
# print(random_normal_indices)

## Merge the (equally sized) index sets of Class 0 and Class 1 samples.
under_sample_indices = np.concatenate([fraud_indices, random_normal_indices])
# print(under_sample_indices)
## Fetch the actual rows by index.
under_sample_data = data.iloc[under_sample_indices, :]
under_sample_data.head()

### Split the undersampled data into features and labels.
X_undersample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
y_undersample = under_sample_data.loc[:, under_sample_data.columns == 'Class']

# Showing ratio
print("Percentage of normal transactions: ", len(under_sample_data[under_sample_data.Class == 0]) / len(under_sample_data))
print("Percentage of fraud transactions: ", len(under_sample_data[under_sample_data.Class == 1]) / len(under_sample_data))
print("Total number of transactions in resampled data: ", len(under_sample_data))  ## total sample count

## Start building the model: split off a test set.
from sklearn.model_selection import train_test_split

## Whole dataset: 30% held out for testing, 70% used to build the model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print("Number transactions train dataset: ", len(X_train))
print("Number transactions test dataset: ", len(X_test))
print("Total number of transactions: ", len(X_train) + len(X_test))

## Undersampled dataset.
X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = train_test_split(
    X_undersample, y_undersample, test_size=0.3, random_state=0)
print("==============================================")
print("Number transactions train dataset: ", len(X_train_undersample))
print("Number transactions test dataset: ", len(X_test_undersample))
print("Total number of transactions: ", len(X_train_undersample) + len(X_test_undersample))
## Recall = TP / (TP + FN)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import confusion_matrix, recall_score, classification_report

### Cross-validation
def printing_Kfold_socres(x_train_data, y_train_data):
    ## Split the training data into 5 folds; each fold in turn serves as the validation set,
    ## and the results are averaged.
    fold = KFold(n_splits=5, shuffle=False)

    ## Candidate penalty parameters: the chosen regularization norm makes the parameters
    ## fluctuate a lot, so C is needed to control how strongly they are penalized.
    c_param_range = [0.01, 0.1, 1, 10, 100]

    results_table = pd.DataFrame(index=range(len(c_param_range)),
                                 columns=['C_parameter', 'Mean recall score'])  ## store the results
    results_table['C_parameter'] = c_param_range
    # print(results_table)

    ### each k-fold split yields 2 index lists: train_indices = indices[0], test_indices = indices[1]
    j = 0  ## marks which C parameter we are on
    ## Cross-validate under each penalty parameter in turn.
    for c_param in c_param_range:
        print("=====================")
        print("current c parameter:", c_param)
        print("=====================")
        print("\n")

        recall_accs = []  ## recall of each fold under this penalty parameter
        ##### enumerate the folds starting from 1 (the default is 0)
        ##### e.g. list(enumerate(['Spring', 'Summer'], start=1)) == [(1, 'Spring'), (2, 'Summer')]
        for iteration, indices in enumerate(fold.split(x_train_data), start=1):
            ### iteration is the fold number; indices holds the (train, validation) row positions
            ## Instantiate logistic regression with this penalty parameter; penalty selects the
            ## regularization norm (liblinear is a solver that supports l1).
            lr = LogisticRegression(C=c_param, penalty='l1', solver='liblinear')

            ## Train the model on the training folds.
            lr.fit(x_train_data.iloc[indices[0], :], y_train_data.iloc[indices[0], :].values.ravel())

            ## Predict on the validation fold with the fitted model.
            y_pred_undersample = lr.predict(x_train_data.iloc[indices[1], :].values)

            ## recall_score computes the recall; collect it in the list.
            recall_acc = recall_score(y_train_data.iloc[indices[1], :].values.ravel(), y_pred_undersample)
            recall_accs.append(recall_acc)
            print("iteration:", iteration, " recall score:", recall_acc)

        ## Mean recall under this penalty parameter, stored in row j of results_table.
        results_table.loc[j, 'Mean recall score'] = np.mean(recall_accs)
        j += 1
        print('=======')
        print("Mean recall score:", np.mean(recall_accs))
        print("=============================")

    ## The best penalty parameter is the one with the highest mean recall in this table.
    return results_table

############ Cross-validation on the undersampled data #############
results_table = printing_Kfold_socres(X_train_undersample, y_train_undersample)
print(results_table)
# print("**********************************")
############ Cross-validation on the original dataset ########
### the recall here is very, very low
# results_table1 = printing_Kfold_socres(X_train, y_train)
# print(results_table1)
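## --- Illustrative addition (an assumption, not part of the original script): one way to
## read the best C out of results_table instead of hard-coding C = 0.01 further below.
## 'Mean recall score' is stored as object dtype, hence the cast before idxmax.
best_c_param = results_table.loc[results_table['Mean recall score'].astype('float64').idxmax(), 'C_parameter']
print("C with the highest mean recall:", best_c_param)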
############## Build a confusion matrix from the model's predictions ##############
import itertools

def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

### Fit the model and draw the confusion matrix of its predictions.
lr = LogisticRegression(C=0.01, penalty='l1', solver='liblinear')
lr.fit(X_train_undersample, y_train_undersample.values.ravel())
y_pred_undersample = lr.predict(X_test_undersample.values)  ### predict returns the class labels 0/1 directly

# Compute the confusion matrix from the true and predicted labels.
cnf_matrix = confusion_matrix(y_test_undersample, y_pred_undersample)
np.set_printoptions(precision=2)

print("Recall metric in the testing dataset: ", cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))

# Plot non-normalized confusion matrix
class_names = [0, 1]
plt.figure(2)
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
# plt.show()

##### The decision threshold on the sigmoid output strongly affects the predictions,
##### so inspect recall and precision at each threshold via the confusion matrix
##### to find the most suitable one.
lr = LogisticRegression(C=0.01, penalty='l1', solver='liblinear')
lr.fit(X_train_undersample, y_train_undersample.values.ravel())
y_pred_undersample_proba = lr.predict_proba(X_test_undersample.values)

thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
plt.figure(figsize=(10, 10))
j = 1
for i in thresholds:
    y_test_predictions_high_recall = y_pred_undersample_proba[:, 1] > i
    plt.subplot(3, 3, j)
    j += 1

    # Compute confusion matrix
    cnf_matrix = confusion_matrix(y_test_undersample, y_test_predictions_high_recall)
    np.set_printoptions(precision=2)
    print("Recall metric in the testing dataset: ", cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))

    # Plot non-normalized confusion matrix
    class_names = [0, 1]
    plot_confusion_matrix(cnf_matrix, classes=class_names, title='Threshold >= %s' % i)
plt.show()

'''Oversampling: make the imbalanced classes equally large, usually with the SMOTE
algorithm (pick a point among the minority-class samples, compute its distances to the
other minority points, and interpolate along them to generate new sample points).
Oversampling lowers recall but raises precision and lowers the false-alarm rate.'''
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

credit_cards = pd.read_csv('creditcard.csv')
columns = credit_cards.columns
# The labels are in the last column ('Class'). Simply remove it to obtain the feature columns.
features_columns = columns.delete(len(columns) - 1)
features = credit_cards[features_columns]
labels = credit_cards['Class']

features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.2, random_state=0)

oversampler = SMOTE(random_state=0)
os_features, os_labels = oversampler.fit_resample(features_train, labels_train)
os_features = pd.DataFrame(os_features)
os_labels = pd.DataFrame(os_labels)

results_table_os = printing_Kfold_socres(os_features, os_labels)  ## cross-validate C on the oversampled data

lr = LogisticRegression(C=0.01, penalty='l1', solver='liblinear')
lr.fit(os_features, os_labels.values.ravel())
y_pred = lr.predict(features_test.values)

# Compute confusion matrix
cnf_matrix = confusion_matrix(labels_test, y_pred)
np.set_printoptions(precision=2)
print("Recall metric in the testing dataset: ", cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))

# Plot non-normalized confusion matrix
class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
plt.show()
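The triple-quoted note above describes SMOTE's interpolation rule in words. The sketch below is a minimal, illustrative version of that rule only (smote_sketch is a hypothetical helper, not how imblearn implements SMOTE): a synthetic minority sample is x_new = x_i + u * (x_nn - x_i), where x_nn is one of the k nearest minority neighbours of x_i and u is uniform on [0, 1].

import numpy as np

def smote_sketch(X_minority, n_new, k=5, seed=0):
    """Minimal SMOTE illustration (not the imblearn implementation).
    Assumes X_minority is a 2D array with more than k rows."""
    rng = np.random.RandomState(seed)
    n = len(X_minority)
    new_samples = []
    for _ in range(n_new):
        i = rng.randint(n)
        # Euclidean distances from x_i to every minority sample
        d = np.linalg.norm(X_minority - X_minority[i], axis=1)
        # indices of the k nearest neighbours (position 0 is x_i itself, so skip it)
        nn = np.argsort(d)[1:k + 1]
        j = rng.choice(nn)
        u = rng.rand()
        # interpolate between x_i and the chosen neighbour
        new_samples.append(X_minority[i] + u * (X_minority[j] - X_minority[i]))
    return np.vstack(new_samples)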
The result figures are numerous, so they are not shown here.