Logistic Regression 信用卡交易异常检测

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline
data = pd.read_scv("creditcard.csv")
data.head()
data
help(pd.value_counts) # 计算非空值计数的直方图

value_counts(values, sort=True, ascending=False, normalize=False, bins=None, dropna=True) Compute a histogram of the counts of non-null values.

# Count the transactions in each class, then order the counts by class label
# (sort_index sorts by the row index, i.e. the class value).
# Series.value_counts is used because the top-level pd.value_counts helper
# is deprecated in recent pandas releases.
count_classes = data['Class'].value_counts(sort=True).sort_index()
count_classes.plot(kind='bar')
plt.title("Fraud class histogram")
plt.xlabel("Class")
plt.ylabel("Frequency")
交易正常与异常的数量

针对不平衡样本处理方法:过采样和欠采样

# Boolean mask over the columns: True only for the 'Class' label column.
is_label = data.columns == 'Class'
X = data.loc[:, ~is_label]  # every column except the label
y = data.loc[:, is_label]   # the label column, kept as a one-column frame

# Rows flagged as fraud (Class == 1); their count sets the undersample size.
fraud_rows = data[data['Class'] == 1]
number_records_fraud = len(fraud_rows)
# Index labels of the fraud rows, as a NumPy array.
fraud_indices = np.array(fraud_rows.index)
# Index labels of the normal rows (Class == 0).
normal_indices = data[data['Class'] == 0].index
欠采样
# Randomly draw, without replacement, as many normal-transaction index labels
# as there are fraud records, so both classes end up the same size.
# np.random.choice already returns an ndarray, so no extra conversion is needed.
random_normal_indices = np.random.choice(normal_indices, number_records_fraud, replace=False)

# Join the fraud labels and the sampled normal labels into one array.
under_sample_indices = np.concatenate([fraud_indices, random_normal_indices])

# Build the undersampled dataset. The values above were read from `.index`,
# so they are labels and must be looked up with .loc; .iloc (positional) only
# gives the same rows when the frame happens to have a default 0..n-1 index.
under_sample_data = data.loc[under_sample_indices, :]

# Split the undersampled frame into features and label.
X_undersample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
y_undersample = under_sample_data.loc[:, under_sample_data.columns == 'Class']

# Show the share of normal and fraud transactions in the resampled data.
print("Percentage of normal transactions: ", len(under_sample_data[under_sample_data.Class == 0])/len(under_sample_data))
print("Percentage of fraud transactions: ", len(under_sample_data[under_sample_data.Class == 1])/len(under_sample_data))
print("Total number of transactions in resampled data: ", len(under_sample_data))
欠采样样本

loc 是完全基于标签的索引器,所谓标签就是定义的行索引(行标题)和列名。

iloc 是完全基于整数位置的索引器,所谓位置就是第 0、1、2…… 行(列)。

from sklearn.model_selection import train_test_split

# Whole dataset: hold out 30% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print("Number transactions train dataset:", len(X_train))
print("Number transactions test dataset:", len(X_test))
print("Total number of transactions:", len(X_train) + len(X_test))

# Undersampled dataset: same 70/30 split and the same random_state.
X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = train_test_split(
    X_undersample, y_undersample, test_size=0.3, random_state=0)
print("")
print("Number transactions train dataset:", len(X_train_undersample))
# Report the size of the test split (previously the train split was printed here).
print("Number transactions test dataset:", len(X_test_undersample))
print("Total number of transactions:", len(X_train_undersample) + len(X_test_undersample))
切分训练集和测试集

几个评价指标

评价指标

错误率

准确度

精确度

召回率

F1_score

ROC曲线

以FPR为横坐标,TPR为纵坐标,那么ROC曲线就是改变各种阈值后得到的所有坐标点 (FPR,TPR) 的连线,画出来如下。红线是随机乱猜情况下的ROC,曲线越靠左上角,分类器越佳。


ROC

AUC被定义为ROC曲线下的面积
在比较不同的分类模型时,可以将每个模型的ROC曲线都画出来,比较曲线下面积做为模型优劣的指标。
ROC曲线下方的面积(英语:Area under the Curve of ROC (AUC ROC)),其意义是:
因为是在1x1的方格里求面积,AUC必在0~1之间。
假设阈值以上是阳性,以下是阴性;
若随机抽取一个阳性样本和一个阴性样本,分类器正确判断阳性样本的值高于阴性样本之机率 = AUC。

简单说:AUC值越大的分类器,正确率越高。

#Recall = TP/(TP+FN)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, recall_score, classification_report
# 分类报告sklearn中的classification_report函数用于显示主要分类指标的文本报告.在报告中显示每个类的精确度,召回率,F1值等信息。 
# 主要参数: 
# y_true:1维数组,或标签指示器数组/稀疏矩阵,目标值。 
# y_pred:1维数组,或标签指示器数组/稀疏矩阵,分类器返回的估计值。 
# labels:array,shape = [n_labels],报表中包含的标签索引的可选列表。 
# target_names:字符串列表,与标签匹配的可选显示名称(相同顺序)。 
# sample_weight:类似于shape = [n_samples]的数组,可选项,样本权重。 
# digits:int,输出浮点值的位数.
def printing_Kfold_scores(x_train_data, y_train_data):
    """Choose the C value that gives the best mean recall under 5-fold CV.

    For every candidate C an L1-penalised logistic regression is fitted on the
    training part of each fold and its recall is measured on the held-out
    part. Per-fold recalls and their mean are printed for each candidate.

    Args:
        x_train_data: DataFrame of features; rows are selected positionally.
        y_train_data: DataFrame of labels aligned row-for-row with
            ``x_train_data`` (expected to hold a single label column).

    Returns:
        The candidate C with the highest mean recall. On a tie the first
        (smallest) candidate wins.
    """
    # 5 consecutive folds; the data is not shuffled before splitting.
    fold = KFold(5, shuffle=False)

    # Candidate values for the inverse regularisation strength.
    c_param_range = [0.01, 0.1, 1, 10, 100]

    # One row per candidate C. The index used to be
    # range(len(c_param_range), 2), which is an empty range.
    results_table = pd.DataFrame(index=range(len(c_param_range)),
                                 columns=['C_parameter', 'Mean recall score'])
    results_table['C_parameter'] = c_param_range

    for j, c_param in enumerate(c_param_range):
        print('-------------------------------------------')
        print('C parameter: ', c_param)
        print('-------------------------------------------')
        print('')

        recall_accs = []  # recall of each fold for the current C
        # fold.split yields pairs: indices[0] are the training positions,
        # indices[1] the held-out validation positions.
        for iteration, indices in enumerate(fold.split(x_train_data)):
            train_idx, valid_idx = indices

            # liblinear is requested explicitly: the L1 penalty is not
            # supported by scikit-learn's current default solver (lbfgs).
            lr = LogisticRegression(C=c_param, penalty='l1', solver='liblinear')

            # Fit on the training part of the fold. ravel() flattens the
            # one-column label frame into the 1-D array that fit() expects.
            lr.fit(x_train_data.iloc[train_idx, :],
                   y_train_data.iloc[train_idx, :].values.ravel())

            # Predict on the held-out part, passing a DataFrame just like in
            # fit() so both calls see the same kind of input.
            y_pred_undersample = lr.predict(x_train_data.iloc[valid_idx, :])

            # Recall on the held-out part for this fold.
            recall_acc = recall_score(y_train_data.iloc[valid_idx, :].values.ravel(),
                                      y_pred_undersample)
            recall_accs.append(recall_acc)
            print('Iteration ', iteration,': recall score = ', recall_acc)

        # The mean recall over the folds is the score stored for this C.
        mean_recall = np.mean(recall_accs)
        results_table.loc[j, 'Mean recall score'] = mean_recall
        print('')
        print('Mean recall score ', mean_recall)
        print('')

    # The column holds Python objects, so cast to float before idxmax.
    best_row = results_table['Mean recall score'].astype('float64').idxmax()
    best_c = results_table.loc[best_row, 'C_parameter']

    # Finally, report which of the candidate C values performed best.
    print('*********************************************************************************')
    print('Best model to choose from cross validation is with C parameter = ', best_c)
    print('*********************************************************************************')

    return best_c

enumerate() 函数用于将一个可遍历的数据对象(如列表、元组或字符串)组合为一个索引序列,同时列出数据和数据下标,一般用在 for 循环当中。
enumerate
import warnings

# Suppress all warnings so the cross-validation printout stays readable.
warnings.simplefilter("ignore")

# Search for the best C on the undersampled training split.
best_c = printing_Kfold_scores(X_train_undersample, y_train_undersample)
不同罚参数

不同罚参数

不同罚参数
def plot_confusion_matrix(cm, classes, title="Confusion matrix", camp=plt.cm.Blues, cmap=None):
    """Draw a confusion matrix as an annotated heat map on the current axes.

    Args:
        cm: 2-D array of counts; it is indexed as ``cm[i, j]`` and must
            provide ``shape`` and ``max()``.
        classes: Sequence of class labels used as tick labels on both axes.
        title: Title placed above the plot.
        camp: Colormap for the heat map. The name is a historical
            misspelling of ``cmap`` and is kept so existing calls still work.
        cmap: Correctly spelled alias for ``camp``; takes precedence when
            it is given.
    """
    # Imported here so the function does not depend on a later cell having
    # already imported itertools.
    import itertools

    colormap = camp if cmap is None else cmap

    plt.imshow(cm, interpolation='nearest', cmap=colormap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    # Cells above half of the largest count get white text, the others black,
    # so the numbers stay readable against the cell colour.
    thresh = cm.max() / 2
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel("Predicted label")
import itertools

# Refit on the whole undersampled training split with the C chosen by
# cross-validation. liblinear is requested explicitly because the L1 penalty
# is not supported by scikit-learn's current default solver (lbfgs).
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr.fit(X_train_undersample, y_train_undersample.values.ravel())
# Predict from a DataFrame, matching the input type used in fit().
y_pred_undersample = lr.predict(X_test_undersample)

# Compute the confusion matrix on the undersampled test split.
cnf_matrix = confusion_matrix(y_test_undersample, y_pred_undersample)
np.set_printoptions(precision=2)
# Recall = TP / (FN + TP), read from the row of the positive class.
print("Recall metric in the testing dataset: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))

# Plot the non-normalized confusion matrix.
class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix,
                      classes=class_names,
                      title='Confusion matrix')
plt.show()
Confusion matrix
# Repeat the cross-validated search for C, this time on the training split of
# the full dataset instead of the undersampled one.
best_c = printing_Kfold_scores(X_train,y_train)

C parameter: 0.01

Iteration 0 : recall score = 0.4925373134328358
Iteration 1 : recall score = 0.6027397260273972
Iteration 2 : recall score = 0.6833333333333333
Iteration 3 : recall score = 0.5692307692307692
Iteration 4 : recall score = 0.45

Mean recall score 0.5595682284048672


C parameter: 0.1

Iteration 0 : recall score = 0.5671641791044776
Iteration 1 : recall score = 0.6164383561643836
Iteration 2 : recall score = 0.6833333333333333
Iteration 3 : recall score = 0.5846153846153846
Iteration 4 : recall score = 0.525

Mean recall score 0.5953102506435158


C parameter: 1

Iteration 0 : recall score = 0.5522388059701493
Iteration 1 : recall score = 0.6164383561643836
Iteration 2 : recall score = 0.7166666666666667
Iteration 3 : recall score = 0.6153846153846154
Iteration 4 : recall score = 0.5625

Mean recall score 0.612645688837163


C parameter: 10

Iteration 0 : recall score = 0.5522388059701493
Iteration 1 : recall score = 0.6164383561643836
Iteration 2 : recall score = 0.7333333333333333
Iteration 3 : recall score = 0.6153846153846154
Iteration 4 : recall score = 0.575

Mean recall score 0.6184790221704963


C parameter: 100

Iteration 0 : recall score = 0.5522388059701493
Iteration 1 : recall score = 0.6164383561643836
Iteration 2 : recall score = 0.7333333333333333
Iteration 3 : recall score = 0.6153846153846154
Iteration 4 : recall score = 0.575

Mean recall score 0.6184790221704963


Best model to choose from cross validation is with C parameter = 10.0


# Fit on the training split of the full dataset with the selected C.
# liblinear is requested explicitly because the L1 penalty is not supported
# by scikit-learn's current default solver (lbfgs).
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr.fit(X_train, y_train.values.ravel())
# NOTE: despite its name, this holds predictions for the full test split.
# Predict from a DataFrame, matching the input type used in fit().
y_pred_undersample = lr.predict(X_test)

# Compute the confusion matrix on the full test split.
cnf_matrix = confusion_matrix(y_test, y_pred_undersample)
np.set_printoptions(precision=2)

# Recall = TP / (FN + TP), read from the row of the positive class.
print("Recall metric in the testing dataset: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))

# Plot the non-normalized confusion matrix.
class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix,
                      classes=class_names,
                      title='Confusion matrix')
plt.show()

Recall metric in the testing dataset: 0.6190476190476191


image.png
# Fit once with a fixed C, then see how the decision threshold applied to the
# predicted fraud probability changes recall. liblinear is requested
# explicitly because the L1 penalty is not supported by scikit-learn's
# current default solver (lbfgs).
lr = LogisticRegression(C=0.01, penalty='l1', solver='liblinear')
lr.fit(X_train_undersample, y_train_undersample.values.ravel())
# Predict from a DataFrame, matching the input type used in fit().
y_pred_undersample_proba = lr.predict_proba(X_test_undersample)

thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

plt.figure(figsize=(10, 10))

# One subplot per threshold in a 3x3 grid; subplot positions start at 1.
for j, threshold in enumerate(thresholds, start=1):
    # Column 1 of predict_proba is the probability of the second class (1).
    y_test_predictions_high_recall = y_pred_undersample_proba[:, 1] > threshold

    plt.subplot(3, 3, j)

    # Compute the confusion matrix for this threshold.
    cnf_matrix = confusion_matrix(y_test_undersample, y_test_predictions_high_recall)
    np.set_printoptions(precision=2)

    print("Recall metric in the testing dataset: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))

    # Plot the non-normalized confusion matrix. The title says ">" so that it
    # matches the strict comparison used above (it used to read ">=").
    class_names = [0, 1]
    plot_confusion_matrix(cnf_matrix,
                          classes=class_names,
                          title='Threshold > %s' % threshold)

Recall metric in the testing dataset: 1.0
Recall metric in the testing dataset: 1.0
Recall metric in the testing dataset: 1.0
Recall metric in the testing dataset: 0.9727891156462585
Recall metric in the testing dataset: 0.9387755102040817
Recall metric in the testing dataset: 0.8979591836734694
Recall metric in the testing dataset: 0.8367346938775511
Recall metric in the testing dataset: 0.782312925170068
Recall metric in the testing dataset: 0.5986394557823129


untitled.png

你可能感兴趣的:(Logistic Regression 信用卡交易异常检测)