【学习用】
原题目地址:Synthetic Financial Datasets For Fraud Detection
原代码地址:Predicting Fraud in Financial Payment Services
目录
1 准备工作
1.1 各种引入
1.1.1 Import Packages
1.1.2 Import Dataset
1.2 数据探索
目录
1 准备工作
1.1 各种引入
1.1.1 Import Packages
1.1.2 Import Dataset
1.2 数据探索(Exploratory Data Analysis, EDA)
1.2.1 交易类型
1.2.2 商人账户
1.3 数据清洗
1.3.1 选取有用数据
1.3.2 缺失数据处理
2 深入分析
2.1 特征工程
2.2 可视化
3 建模求解
4 结论
1.3 数据清洗
2 深入分析
2.1 特征工程
2.2 可视化
3 建模求解
4 结论
# 基本
import pandas as pd
import numpy as np
# 可视化
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
# 建模
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.metrics import average_precision_score
from xgboost.sklearn import XGBClassifier
from xgboost import plot_importance, to_graphviz
# 警告忽略
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
# Load the raw transaction table from disk.
df = pd.read_csv('data/raw_data.csv')
# Rename the four balance columns so they share one camelCase spelling
# (purely cosmetic; the data itself is untouched).
column_renames = {
    'oldbalanceOrg': 'oldBalanceOrig',
    'newbalanceOrig': 'newBalanceOrig',
    'oldbalanceDest': 'oldBalanceDest',
    'newbalanceDest': 'newBalanceDest',
}
df = df.rename(columns=column_renames)
# Preview the first five rows.
print(df.head())
# Quick check: True if any cell in the table is missing.
# Left as a bare expression so a notebook displays it as the cell result.
df.isna().values.any()
在这部分中对数据进行简单分析
共有5种交易类型(type):CASH_IN, CASH_OUT, DEBIT, PAYMENT, TRANSFER.
# Find out which transaction types contain fraudulent rows, then count the
# fraudulent rows for the two types of interest.
is_fraud = df.isFraud == 1
fraud_types = df.loc[is_fraud].type.drop_duplicates().values
print('\n The types of fraudulent transactions are {}'.format(fraud_types))

# Fraudulent rows split by transaction type.
dfFraudTransfer = df.loc[is_fraud & (df.type == 'TRANSFER')]
dfFraudCashout = df.loc[is_fraud & (df.type == 'CASH_OUT')]

n_fraud_transfer = len(dfFraudTransfer)
n_fraud_cashout = len(dfFraudCashout)
print('\n The number of fraudulent TRANSFERs = {}'.format(n_fraud_transfer))
print('\n The number of fraudulent CASH_OUTs = {}'.format(n_fraud_cashout))
The types of fraudulent transactions are ['TRANSFER', 'CASH_OUT']
The number of fraudulent TRANSFERs = 4097
The number of fraudulent CASH_OUTs = 4116
【待补充】说明如何判定 isFlaggedFraud 这一特征不显著(insignificant)、可以舍弃的方法
判断商户账户出现在交易的哪一方(发起方 nameOrig 或接收方 nameDest)。已知商户账户名以字母 M 开头。
# Merchant account names contain the letter 'M'; the checks below look for that
# marker on each side of the transaction.

# Do merchants appear among the originator (nameOrig) accounts of CASH_IN transactions?
cash_in_orig_has_merchant = df.loc[df.type == 'CASH_IN'].nameOrig.str.contains('M').any()
print('\nAre there any merchants among originator accounts for CASH_IN transactions? {}'.format(cash_in_orig_has_merchant)) # False

# Do merchants appear among the destination (nameDest) accounts of CASH_OUT transactions?
# Fix: this check previously re-tested nameOrig (and said "originator"), which
# duplicated the all-originators check below and never looked at the destination side.
cash_out_dest_has_merchant = df.loc[df.type == 'CASH_OUT'].nameDest.str.contains('M').any()
print('\nAre there any merchants among destination accounts for CASH_OUT transactions? {}'.format(cash_out_dest_has_merchant)) # expected False, consistent with the last check in this cell

# Do merchants appear among the originator accounts of any transaction?
any_orig_has_merchant = df.nameOrig.str.contains('M').any()
print('\nAre there merchants among any originator accounts? {}'.format(any_orig_has_merchant)) # False

# Do merchants appear among the destination accounts of any non-PAYMENT transaction?
merchant_dest_non_payment = (df.loc[df.nameDest.str.contains('M')].type != 'PAYMENT').any()
print('\nAre there any transactions having merchants among destination accounts other than the PAYMENT type? {}'.format(merchant_dest_non_payment)) # False
# Keep only the rows for the two transaction types being studied.
is_transfer_or_cash_out = df.type.isin(['TRANSFER', 'CASH_OUT'])
X = df.loc[is_transfer_or_cash_out]

# The fraud label becomes the target; it is removed from the feature table
# together with the account-name columns and the isFlaggedFraud column.
Y = X['isFraud']
X = X.drop(columns=['isFraud', 'nameOrig', 'nameDest', 'isFlaggedFraud'])

# Encode the transaction type as an integer: TRANSFER -> 0, CASH_OUT -> 1.
# Only these two values remain after the row filter above.
X['type'] = (X['type'] == 'CASH_OUT').astype(int)

# Split the feature table into fraudulent and genuine transactions.
fraud_rows = Y == 1
genuine_rows = Y == 0
Xfraud = X.loc[fraud_rows]
XnonFraud = X.loc[genuine_rows]
# Share of transactions whose destination balances are both zero even though a
# non-zero amount was transacted, reported separately for fraudulent and
# genuine transactions.
# The amount test is written explicitly as `!= 0` (the same form used when the
# rows are later marked) instead of relying on pandas coercing the float
# column to booleans inside the `&` expression.

# Fraudulent transactions.
fraud_dest_missing = (
    (Xfraud.oldBalanceDest == 0)
    & (Xfraud.newBalanceDest == 0)
    & (Xfraud.amount != 0)
)
print('\nThe fraction of fraudulent transactions with \'oldBalanceDest\' = '
      '\'newBalanceDest\' = 0 although the transacted \'amount\' is non-zero is: {}'
      .format(len(Xfraud.loc[fraud_dest_missing]) / float(len(Xfraud))))

# Genuine transactions.
# Fix: the message was missing the opening quote before newBalanceDest.
genuine_dest_missing = (
    (XnonFraud.oldBalanceDest == 0)
    & (XnonFraud.newBalanceDest == 0)
    & (XnonFraud.amount != 0)
)
print('\nThe fraction of genuine transactions with \'oldBalanceDest\' = '
      '\'newBalanceDest\' = 0 although the transacted \'amount\' is non-zero is: {}'
      .format(len(XnonFraud.loc[genuine_dest_missing]) / float(len(XnonFraud))))
# Mark rows whose balances are both zero although a non-zero amount was moved.
# Destination balances get the sentinel -1; originator balances get NaN.
amount_is_nonzero = X.amount != 0
dest_balances_zero = (X.oldBalanceDest == 0) & (X.newBalanceDest == 0)
orig_balances_zero = (X.oldBalanceOrig == 0) & (X.newBalanceOrig == 0)

X.loc[dest_balances_zero & amount_is_nonzero, ['oldBalanceDest', 'newBalanceDest']] = -1
X.loc[orig_balances_zero & amount_is_nonzero, ['oldBalanceOrig', 'newBalanceOrig']] = np.nan

# Engineered features: mismatch between the recorded balances and the amount,
# computed from the (already marked) balance columns.
X['errorBalanceOrig'] = X.newBalanceOrig + X.amount - X.oldBalanceOrig
X['errorBalanceDest'] = X.oldBalanceDest + X.amount - X.newBalanceDest

# Number of rows passed to the plots; currently the whole table.
limit = len(X)
def plotStrip(x, y, hue, figsize = (14, 9)):
    """Draw a jittered seaborn strip plot of ``y`` grouped by ``x``.

    The x tick labels are hard-coded to 'genuine' / 'fraudulent' and the
    legend entries to 'Transfer' / 'Cash out', so callers are presumably
    expected to pass the 0/1 fraud label as ``x`` and the 0/1 encoded
    transaction type as ``hue`` (confirm against the call sites).

    Args:
        x: Values for the categorical (horizontal) axis.
        y: Values plotted on the vertical axis.
        hue: Values used to colour the points.
        figsize: Figure size in inches, forwarded to ``plt.figure``.

    Returns:
        The Axes object returned by ``sns.stripplot``.
    """
    # A new figure is created only for its side effect of becoming current.
    plt.figure(figsize = figsize)
    colours = plt.cm.tab10(np.linspace(0, 1, 9))
    with sns.axes_style('ticks'):
        # x and y are passed by keyword: seaborn deprecated positional data
        # vectors in 0.11 and rejects them from 0.12 onwards.
        ax = sns.stripplot(x = x, y = y, hue = hue, jitter = 0.4, marker = '.', size = 4, palette = colours)
        ax.set_xlabel('')
        ax.set_xticklabels(['genuine', 'fraudulent'], size = 16)
        # Thicken the frame around the plot.
        for axis in ['top', 'bottom', 'left', 'right']:
            ax.spines[axis].set_linewidth(2)
        # Reuse seaborn's legend handles but replace the labels with readable names.
        handles, _ = ax.get_legend_handles_labels()
        plt.legend(handles, ['Transfer', 'Cash out'], bbox_to_anchor=(1, 1), loc=2, borderaxespad=0, fontsize = 16)
    return ax
# Strip plots of genuine vs. fraudulent transactions, coloured by transaction type.
# The trailing semicolons keep a notebook from echoing the returned Text object.

# Time step of each transaction.
ax = plotStrip(Y[:limit], X.step[:limit], X.type[:limit])
ax.set_ylabel('time [hour]', size = 16)
ax.set_title('Striped vs. homogenous fingerprints of genuine and fraudulent transactions over time', size = 20);

# Transaction amount.
ax = plotStrip(Y[:limit], X.amount[:limit], X.type[:limit], figsize = (14, 9))
ax.set_ylabel('amount', size = 16)
ax.set_title('Same-signed fingerprints of genuine and fraudulent transactions over amount', size = 18);

# Negated destination balance error.
ax = plotStrip(Y[:limit], - X.errorBalanceDest[:limit], X.type[:limit], figsize = (14, 9))
ax.set_ylabel('- errorBalanceDest', size = 16)
# Fix: the title literal was split across two physical lines without a
# continuation (a syntax error); it is joined via adjacent string literals.
ax.set_title('Opposite polarity fingerprints over the error in '
             'destination account balances', size = 18);