Logistic Regression Modeling Steps in Python --- Hands-On: Training Data and Building a Model with sklearn's Logistic Regression

This case study builds a model on an imbalanced dataset of roughly 280,000 records to predict fraudulent users. The logistic regression model maps its output through the sigmoid function to a probability, and training uses cross-validation together with L1 regularization. Recall is compared across different penalty parameters, and confusion matrices of predicted versus actual values give a direct view of each kind of prediction outcome.

It also compares the model's precision and recall at different thresholds applied to the sigmoid output.

Below is a sample of the data format; the comments in the code explain what each step does and why. Some steps may raise errors under different library versions, so version-specific fixes are noted inline.

Data format:

(Figure 1: screenshot of the first few rows of creditcard.csv)

Python source code:

#!/usr/bin/env python

# encoding: utf-8

"""

@Company: Institute of Fusion and Plasma Research, School of Electrical Engineering, Huazhong University of Science and Technology

@version: V1.0

@author: Victor

@contact: [email protected] or [email protected] 2018--2020

@software: PyCharm

@file: LG.py

@time: 2018/11/16 16:32

@Desc:

"""

import pandas as pd

import matplotlib.pyplot as plt

import numpy as np

data = pd.read_csv("creditcard.csv")

data.head()

##In this case the Class column is the label: 1 marks a fraudulent user, 0 a good user

count_classes = data['Class'].value_counts().sort_index()  ##count the occurrences of each value in Class and sort by label (pd.value_counts is deprecated in newer pandas)

print(count_classes)

plt.figure(1)

count_classes.plot(kind='bar')  ###plot a bar chart directly via pandas' plot

plt.title("Fraud class histogram")

plt.xlabel("Class")

plt.ylabel("Frequency")

from sklearn.preprocessing import StandardScaler

data['normAmount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))  ##standardize Amount so its raw scale does not dominate the other features (.values is needed; a pandas Series has no reshape)

data = data.drop(['Time','Amount'],axis=1)

data.head(3)

X = data.loc[:, data.columns != 'Class']  ##all columns except Class (.ix was removed from pandas; use .loc)

y = data.loc[:, data.columns == 'Class']  ##the label column; note the capitalized name 'Class'

#X.head()

#y.head()

number_records_fraud = len(data[data.Class == 1])  ##number of rows with Class == 1

fraud_indices = np.array(data[data.Class == 1].index)  ###the indices of all Class == 1 rows in the source file

##print(fraud_indices)

normal_indices = data[data.Class == 0].index

##Undersampling: shrink the majority class so both classes have equally few samples

random_normal_indices = np.random.choice(normal_indices,number_records_fraud,replace=False)

#print(random_normal_indices)

random_normal_indices = np.array(random_normal_indices)  ##convert to an array for easier handling

#print(random_normal_indices)

##combine the indices of the equal-sized Class 0 and Class 1 samples

under_sample_indices = np.concatenate([fraud_indices,random_normal_indices])

#print(under_sample_indices)

##fetch the actual rows by their index numbers

under_sample_data = data.iloc[under_sample_indices,:]

under_sample_data.head()

###process and analyze the sampled data

X_undersample = under_sample_data.loc[:, under_sample_data.columns != 'Class']

y_undersample = under_sample_data.loc[:, under_sample_data.columns == 'Class']

# Showing ratio

print("Percentage of normal transactions: ", len(under_sample_data[under_sample_data.Class == 0])/len(under_sample_data))

print("Percentage of fraud transactions: ", len(under_sample_data[under_sample_data.Class == 1])/len(under_sample_data))

print("Total number of transactions in resampled data: ", len(under_sample_data))##输入总的样本数

##start training the data and building the model

from sklearn.model_selection import train_test_split  ##sklearn.cross_validation was removed in 0.20; train_test_split now lives in model_selection

##whole dataset

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)  ##split the original data: 30% for testing, 70% for building the model

print("Number transactions train dataset: ", len(X_train))

print("Number transactions test dataset: ", len(X_test))

print("Total number of transactions: ", len(X_train)+len(X_test))

##the undersampled dataset

X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = train_test_split(
    X_undersample, y_undersample, test_size=0.3, random_state=0)

print("==============================================")

print("Number transactions train dataset: ", len(X_train_undersample))

print("Number transactions test dataset: ", len(X_test_undersample))

print("Total number of transactions: ", len(X_train_undersample)+len(X_test_undersample))

##Recall=TP/(TP+FN)

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import KFold, cross_val_score  ##the old cross_validation module was removed; use model_selection

from sklearn.metrics import confusion_matrix,recall_score,classification_report
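##Quick sanity check of the recall formula above on made-up toy labels
##(hypothetical values, not from the dataset): TP = 3, FN = 1, so recall = 3/4
print("toy recall check:", recall_score([1, 0, 1, 1, 0, 1], [1, 0, 0, 1, 0, 1]))  ##prints 0.75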

###cross-validation
def printing_Kfold_scores(x_train_data, y_train_data):
    ##split the training set into 5 folds; each fold takes a turn as the validation set, then average the results
    fold = KFold(n_splits=5, shuffle=False)  ##newer sklearn: KFold takes n_splits, and the data goes to .split() below

    ##candidate penalty parameters: the regularization mode strongly affects how much the
    ##weights fluctuate, so a parameter C is needed to control the penalty strength
    c_param_range = [0.01, 0.1, 1, 10, 100]

    results_table = pd.DataFrame(index=range(len(c_param_range)),
                                 columns=['C_parameter', 'Mean recall score'])  ##store the results
    results_table['C_parameter'] = c_param_range
    # print(results_table)

    ### each k-fold split yields 2 index lists: train_indices = indices[0], test_indices = indices[1]
    j = 0  ##row counter for the results table

    ##loop over the cross-validation runs for each penalty parameter
    for c_param in c_param_range:
        print("=====================")
        print("current c parameter:", c_param)
        print("=====================")
        print("\n")

        recall_accs = []  ##store the recall of each fold for this penalty parameter

        #####enumerate the folds starting from 1 (the default start is 0)
        for iteration, indices in enumerate(fold.split(x_train_data), start=1):
            ###iteration is the cross-validation round; indices holds the row positions
            # >>> seasons = ['Spring', 'Summer', 'Fall', 'Winter']
            # >>> list(enumerate(seasons))
            # [(0, 'Spring'), (1, 'Summer'), (2, 'Fall'), (3, 'Winter')]
            # >>> list(enumerate(seasons, start=1))  # index starts at 1
            # [(1, 'Spring'), (2, 'Summer'), (3, 'Fall'), (4, 'Winter')]

            ##instantiate logistic regression with the current penalty parameter
            ##lr is the model instance; penalty selects the regularization mode ('l1' requires the liblinear solver in newer sklearn)
            lr = LogisticRegression(C=c_param, penalty='l1', solver='liblinear')

            ##train the model
            lr.fit(x_train_data.iloc[indices[0], :], y_train_data.iloc[indices[0], :].values.ravel())

            ##score the validation fold with the fitted model
            y_pred_undersample = lr.predict(x_train_data.iloc[indices[1], :].values)

            ##recall_score computes the recall; collect the result
            recall_acc = recall_score(y_train_data.iloc[indices[1], :].values, y_pred_undersample)
            recall_accs.append(recall_acc)
            print("iteration:", iteration, " recall score:", recall_acc)

        ##mean recall for this penalty parameter
        results_table.loc[j, 'Mean recall score'] = np.mean(recall_accs)  ##store in row j, column 'Mean recall score' (.ix was removed; use .loc)
        j += 1
        print('=======')
        print("Mean recall score:", np.mean(recall_accs))
        print("=============================")

    ##the caller can then pick the penalty parameter with the highest mean recall
    return results_table

############cross-validation on the undersampled data#############

results_table = printing_Kfold_scores(X_train_undersample, y_train_undersample)

print(results_table)
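##To pull the best C out of the returned table: the mean recall column is stored
##as objects, so cast to float before idxmax (best_c_param is a hypothetical name)
best_c_param = results_table.loc[
    results_table['Mean recall score'].astype(float).idxmax(), 'C_parameter']
print("best C by mean recall:", best_c_param)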

#print("**********************************")

############cross-validation on the original dataset########

###the recall there is very, very low

#results_table1 = printing_Kfold_scores(X_train, y_train)

#print(results_table1)

##############build a confusion matrix from the model's predictions##############

import itertools

def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j], horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
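##Side note: scikit-learn 1.0+ ships a built-in equivalent of the helper above;
##a minimal sketch, assuming that newer version is installed:
#from sklearn.metrics import ConfusionMatrixDisplay
#ConfusionMatrixDisplay.from_predictions(y_true, y_pred)  ##any pair of label arrays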

###fit the model, predict, and plot a confusion matrix from the predictions

lr = LogisticRegression(C=0.01, penalty='l1', solver='liblinear')

lr.fit(X_train_undersample,y_train_undersample.values.ravel())

y_pred_undersample = lr.predict(X_test_undersample.values)  ###predict returns the class labels 0/1 directly

# compute the confusion matrix

cnf_matrix = confusion_matrix(y_test_undersample, y_pred_undersample)  ##confusion matrix from true values and predictions

np.set_printoptions(precision=2)

print("Recall metric in the testing dataset: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))

# Plot non-normalized confusion matrix

class_names = [0,1]

plt.figure(2)

plot_confusion_matrix(cnf_matrix,
                      classes=class_names,
                      title='Confusion matrix')

#plt.show()

#####the threshold applied to the sigmoid output (the predicted probability) strongly affects the predictions

#####so inspect recall and precision at each threshold to find the most suitable one

#####again by looking at the confusion matrices

lr = LogisticRegression(C=0.01, penalty='l1', solver='liblinear')

lr.fit(X_train_undersample, y_train_undersample.values.ravel())

y_pred_undersample_proba = lr.predict_proba(X_test_undersample.values)  ##predict_proba returns per-class probabilities; column 1 is P(Class = 1)

thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

plt.figure(figsize=(10, 10))

j = 1

for i in thresholds:
    y_test_predictions_high_recall = y_pred_undersample_proba[:, 1] > i

    plt.subplot(3, 3, j)
    j += 1

    # Compute confusion matrix
    cnf_matrix = confusion_matrix(y_test_undersample, y_test_predictions_high_recall)
    np.set_printoptions(precision=2)
    print("Recall metric in the testing dataset: ", cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))

    # Plot non-normalized confusion matrix
    class_names = [0, 1]
    plot_confusion_matrix(cnf_matrix,
                          classes=class_names,
                          title='Threshold >= %s' % i)

plt.show()
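##Instead of looping over a hand-picked threshold list, sklearn can sweep every
##candidate threshold at once; a minimal sketch reusing the probabilities above:
from sklearn.metrics import precision_recall_curve
precision, recall, pr_thresholds = precision_recall_curve(
    y_test_undersample.values.ravel(), y_pred_undersample_proba[:, 1])  ##precision/recall per threshold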

'''Oversampling: make the imbalanced classes equally large, usually via the SMOTE algorithm
(pick a minority-class sample, compute its distances to the other minority-class samples,
then interpolate along those directions to generate new synthetic points).
Oversampling can trade a little recall away for higher precision and a lower false-alarm rate.'''
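##To make the SMOTE idea above concrete, a hand-rolled sketch of the interpolation
##step (toy values for illustration only; not the imblearn implementation):
x_i = np.array([1.0, 2.0])                     ##a minority-class sample
x_nn = np.array([1.4, 2.6])                    ##one of its minority-class nearest neighbors
x_new = x_i + np.random.rand() * (x_nn - x_i)  ##synthetic point on the segment between them
print("synthetic SMOTE-style point:", x_new)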

import pandas as pd

from imblearn.over_sampling import SMOTE

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix

from sklearn.model_selection import train_test_split

credit_cards=pd.read_csv('creditcard.csv')

columns=credit_cards.columns

# The labels are in the last column ('Class'). Simply remove it to obtain features columns

features_columns=columns.delete(len(columns)-1)

features=credit_cards[features_columns]

labels=credit_cards['Class']

features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.2, random_state=0)

oversampler=SMOTE(random_state=0)

os_features, os_labels = oversampler.fit_resample(features_train, labels_train)  ##fit_sample was renamed to fit_resample in newer imblearn

os_features = pd.DataFrame(os_features)

os_labels = pd.DataFrame(os_labels)

results_table_os = printing_Kfold_scores(os_features, os_labels)  ##cross-validate on the oversampled data (returns the per-C mean recall table)

lr = LogisticRegression(C=0.01, penalty='l1', solver='liblinear')

lr.fit(os_features,os_labels.values.ravel())

y_pred = lr.predict(features_test.values)

# Compute confusion matrix

cnf_matrix = confusion_matrix(labels_test,y_pred)

np.set_printoptions(precision=2)

print("Recall metric in the testing dataset: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))

# Plot non-normalized confusion matrix

class_names = [0,1]

plt.figure()

plot_confusion_matrix(cnf_matrix,
                      classes=class_names,
                      title='Confusion matrix')

plt.show()

Since the run produces many result figures, they are not shown here.
