Fintech Application: Fraud Risk Identification

https://www.dcic-china.com/competitions/10060

The first script below reads the account transaction log, account static information, and training labels, builds per-account aggregate features, trains 5-fold LightGBM models, and tunes the decision threshold that maximizes macro-F1 on the out-of-fold predictions.

import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


def train_lgb_kfold(X_train, y_train, X_test, n_fold=5, cate_feats=None):
    '''train lightgbm with k-fold split'''
    gbms = []
    kfold = StratifiedKFold(n_splits=n_fold, random_state=1024, shuffle=True)
    oof_preds = np.zeros((X_train.shape[0],))
    test_preds = np.zeros((X_test.shape[0],))

    for fold, (train_index, val_index) in enumerate(kfold.split(X_train, y_train)):
        logging.info(f'############ fold {fold} ###########')
        X_tr, X_val, y_tr, y_val = X_train.iloc[train_index], X_train.iloc[val_index], y_train[train_index], y_train[val_index]
        dtrain = lgb.Dataset(X_tr, y_tr, categorical_feature=cate_feats)
        dvalid = lgb.Dataset(X_val, y_val, categorical_feature=cate_feats, reference=dtrain)

        params = {
            'objective': 'binary',
            'metric': 'auc',
            'num_leaves': 31,
            # 'n_estimators': 1500,
            'learning_rate': 0.05,
            'min_data_in_leaf': 50,
            'feature_fraction': 0.8,
            'bagging_fraction': 0.8,
            'n_jobs': -1,
            'seed': 1024
        }

        # verbose_eval/early_stopping_rounds were removed from lgb.train in
        # LightGBM 4.x; the callback API is the portable equivalent.
        gbm = lgb.train(params,
                        dtrain,
                        num_boost_round=300,
                        valid_sets=[dtrain, dvalid],
                        callbacks=[lgb.log_evaluation(50),
                                   lgb.early_stopping(50)])

        oof_preds[val_index] = gbm.predict(X_val, num_iteration=gbm.best_iteration)
        test_preds += gbm.predict(X_test, num_iteration=gbm.best_iteration) / kfold.n_splits
        gbms.append(gbm)

    return gbms, oof_preds, test_preds



def gen_thres_new(df_train, oof_preds):
    df_train['oof_preds'] = oof_preds
    # Anchor the search at the threshold whose flagged rate matches the label
    # prior: if the positive rate is p, the (1 - p)-quantile of the OOF scores
    # flags roughly a fraction p of the accounts.
    quantile_point = df_train['black_flag'].mean()
    thres = df_train['oof_preds'].quantile(1 - quantile_point)

    # Grid-search +/- 0.2 around that anchor for the macro-F1 maximizer.
    _thresh = []
    for thres_item in np.arange(thres - 0.2, thres + 0.2, 0.001):
        _thresh.append(
            [thres_item, f1_score(df_train['black_flag'], np.where(oof_preds > thres_item, 1, 0), average='macro')])

    _thresh = np.array(_thresh)
    best_id = _thresh[:, 1].argmax()
    best_thresh = _thresh[best_id][0]

    print("threshold: {}\ntrain macro-F1: {}".format(best_thresh, _thresh[best_id][1]))
    return best_thresh
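
# To see the threshold search in isolation, a minimal synthetic sketch
# (the labels and scores below are made up purely for illustration):
_demo = pd.DataFrame({'black_flag': np.r_[np.ones(100), np.zeros(900)]})
_demo_scores = np.clip(_demo['black_flag'] * 0.6 + np.random.rand(1000) * 0.4, 0, 1)
gen_thres_new(_demo, _demo_scores)  # any threshold between the negative (<0.4) and positive (>0.6) clusters is optimal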



data = pd.read_csv('账户交易信息.csv')
data_static = pd.read_csv('账户静态信息.csv')
data_label = pd.read_csv('训练集标签.csv')
# Convert transaction date+time and account-opening date to Unix seconds.
data['jyrq'] = pd.to_datetime(data['jyrq'] + " " + data['jysj']).astype('int64') // 10**9
data_static['khrq'] = pd.to_datetime(data_static['khrq'] + " 00:00:00").astype('int64') // 10**9

d = {}
users = data.zhdh.unique().tolist()
for user_id in tqdm(users):
    lt = data[data.zhdh == user_id].reset_index(drop=True)
    lt = lt.sort_values(['jyrq']).reset_index(drop=True)

    all_cnt = lt.shape[0]  # number of transactions
    in_cnt = lt.jdbj.sum()  # inbound count, outbound count, inbound share
    out_cnt = all_cnt - in_cnt
    in_ratio = in_cnt / all_cnt

    in_amt = lt[lt.jdbj == 1].jyje.sum()  # inbound/outbound amounts, inbound amount share, amount stds
    out_amt = lt[lt.jdbj == 0].jyje.sum()
    in_amt_ratio = in_amt / (in_amt + out_amt)
    in_amt_std = lt[lt.jdbj == 1].jyje.std()
    out_amt_std = lt[lt.jdbj == 0].jyje.std()
    zhengshu = lt['jyje'].apply(lambda x: 1 if x >= 1000 else 0).sum()  # count of transactions of 1000 or more
    # ratio = (lt.jyje / lt.zhye).mean()  # transaction amount relative to balance
    # inc = 0  # count of consecutive balance increases
    # dec = 0  # count of consecutive balance decreases
    # ratio = 0
    #     # if i + 1 < all_cnt and lt.iloc[i]['zhye'] <= lt.iloc[i + 1]['zhye']:
    #     #     inc += 1
    #     # if i + 1 < all_cnt and lt.iloc[i]['zhye'] >= lt.iloc[i + 1]['zhye']:
    #     #     dec += 1
    #     if lt.iloc[i].jdbj == 1:
    #         ratio += lt.iloc[i].jyje / (lt.iloc[i].zhye - lt.iloc[i].jyje)
    #     else:
    #         ratio += lt.iloc[i].jyje / (lt.iloc[i].zhye + lt.iloc[i].jyje)
    # ratio /= all_cnt

    in_user_cnt = lt[lt.jdbj == 1].dfzh.nunique()  # distinct inbound/outbound counterparties, inbound share
    out_user_cnt = lt[lt.jdbj == 0].dfzh.nunique()
    in_user_ratio = in_user_cnt / (in_user_cnt + out_user_cnt)

    in_jyqd = lt[lt.jdbj == 1].jyqd.nunique()  # distinct inbound/outbound channels, inbound share
    out_jyqd = lt[lt.jdbj == 0].jyqd.nunique()
    in_jyqd_ratio = in_jyqd / (in_jyqd + out_jyqd)

    # in_dfhh = lt[lt.jdbj == 1].dfhh.nunique() / in_cnt  # distinct inbound/outbound counterparty banks, inbound share
    # out_dfhh = lt[lt.jdbj == 0].dfhh.nunique() / out_cnt
    # in_dfhh_ratio = in_dfhh / (in_dfhh + out_dfhh)

    name_len = lt.dfmccd.mean()  # mean counterparty-name length
    date_cnt = lt.jyrq.nunique()  # number of distinct transaction timestamps
    date_std = lt.jyrq.std()  # std of transaction timestamps
    date_sum = lt.jyrq.sum()  # sum of transaction timestamps (offset by opening date below)
    d[user_id] = [all_cnt, in_cnt, out_cnt, in_ratio,
                in_amt, out_amt, in_amt_ratio, in_amt_std, out_amt_std,
                # inc, dec,
                # ratio
                zhengshu,
                in_user_cnt, out_user_cnt, in_user_ratio,
                in_jyqd, out_jyqd, in_jyqd_ratio,
                # in_dfhh, out_dfhh, in_dfhh_ratio,
                name_len,
                date_cnt, date_std, date_sum]

data_df = pd.DataFrame.from_dict(d).T.reset_index()
data_df.columns = ['zhdh', 'all_cnt', 'in_cnt', 'out_cnt', 'in_ratio', 
                    'in_amt', 'out_amt', 'in_amt_ratio', 'in_amt_std', 'out_amt_std',
                    # 'inc', 'dec', 
                    # 'ratio'
                    'zhengshu',
                    'in_user_cnt', 'out_user_cnt', 'in_user_ratio',
                    'in_jyqd', 'out_jyqd', 'in_jyqd_ratio',
                    # 'in_dfhh', 'out_dfhh', 'in_dfhh_ratio',
                    'name_len',
                    'date_cnt', 'date_std', 'date_sum']
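
# The per-account loop above rescans the full transaction frame once per
# account. As a cross-check, a vectorized sketch of a few of the same counts
# via groupby (my own rewrite, not part of the original pipeline):
grp = data.groupby('zhdh')
check = pd.DataFrame({'all_cnt': grp.size(), 'in_cnt': grp['jdbj'].sum()})
check['out_cnt'] = check['all_cnt'] - check['in_cnt']
check['in_ratio'] = check['in_cnt'] / check['all_cnt']
# check.loc[uid] should match the corresponding columns of data_df (up to dtype).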


data_static.columns = ['zhdh', 'khrq', 'khjgdh', 'xb', 'age']
df_feats = pd.merge(left=data_df, right=data_static[['zhdh', 'xb', 'age', 'khrq']], on='zhdh', how='left')
df_final = pd.merge(left=df_feats, right=data_label, on='zhdh', how='left')

# Total gap between transaction times and the account-opening date:
# sum(t_i) - all_cnt * khrq == sum(t_i - khrq).
df_final['date_sum'] = df_final[['date_sum', 'khrq', 'all_cnt']].apply(lambda x: x['date_sum'] - x['khrq'] * x['all_cnt'], axis=1)
print(df_final.head(5))

df_train = df_final[df_final.black_flag.notnull()].reset_index(drop=True)
df_test = df_final[df_final.black_flag.isnull()].reset_index(drop=True)

feats = df_train.columns[1:-1].tolist()
X_train = df_train[feats]
y_train = df_train['black_flag']
X_test = df_test[feats]


gbms, oof_preds, test_preds = train_lgb_kfold(X_train, y_train, X_test, n_fold=5)
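# Overall cross-validation score from the out-of-fold predictions
# (an added check, not in the original script):
logging.info(f'oof auc: {roc_auc_score(y_train, oof_preds):.4f}')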
best_thresh = gen_thres_new(df_train, oof_preds)

df_test['black_flag'] = np.where(test_preds > best_thresh, 1, 0)
df_test[['zhdh', 'black_flag']].to_csv('res.csv', index=False)
A second, simpler take on the same task, using XGBoost on a reduced feature set:

import warnings
warnings.filterwarnings('ignore')
import os
import pandas as pd
import numpy as np
import torch
import xgboost as xgb
import lightgbm as lgb  # kept for the commented-out LGBMClassifier baseline below
from tqdm import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


def gen_thres_new(df_train, oof_preds):
    df_train['oof_preds'] = oof_preds
    quantile_point = df_train['black_flag'].mean()
    thres = df_train['oof_preds'].quantile(1 - quantile_point)

    _thresh = []
    for thres_item in np.arange(thres - 0.2, thres + 0.2, 0.01):
        _thresh.append(
            [thres_item, f1_score(df_train['black_flag'], np.where(oof_preds > thres_item, 1, 0), average='macro')])

    _thresh = np.array(_thresh)
    best_id = _thresh[:, 1].argmax()
    best_thresh = _thresh[best_id][0]

    print("阈值: {}\n训练集的f1: {}".format(best_thresh, _thresh[best_id][1]))
    return best_thresh


def train_xgb_kfold(X_train, y_train, X_test, n_fold=10):
    kfold = StratifiedKFold(n_splits=n_fold, random_state=1024, shuffle=True)
    oof_preds = np.zeros((X_train.shape[0],))
    test_preds = np.zeros((X_test.shape[0],))

    for fold, (train_index, val_index) in enumerate(kfold.split(X_train, y_train)):
        logging.info(f'############ fold {fold} ###########')
        X_tr, X_val, y_tr, y_val = X_train.iloc[train_index], X_train.iloc[val_index], y_train[train_index], y_train[val_index]
        dtrain = xgb.DMatrix(X_tr, y_tr)
        dvalid = xgb.DMatrix(X_val, y_val)
        dtest = xgb.DMatrix(X_test)

        # params={
        #     'booster':'gbtree',
        #     'objective': 'binary:logistic',
        #     'eval_metric': ['logloss', 'auc'],
        #     'max_depth': 8,
        #     'subsample':0.9,
        #     'min_child_weight': 10,
        #     'colsample_bytree':0.85,
        #     'lambda': 10,
        #     'eta': 0.02,
        #     'seed': 1024
        # }
        params = {'booster': 'gbtree',
            'objective': 'binary:logistic',
            'eval_metric': 'auc',
            'gamma': 1,
            'min_child_weight': 1.5,
            'max_depth': 5,
            'lambda': 10,
            'subsample': 0.7,
            'colsample_bytree': 0.7,
            'colsample_bylevel': 0.7,
            'eta': 0.05,
            'tree_method': 'exact',
            'seed': 2020,
            'nthread': 8
        }
        watchlist = [(dtrain, 'train'), (dvalid, 'test')]

        # Note: xgb.train has no eval_metric keyword (the metric is already set
        # in params); a custom metric would go through the custom_metric kwarg.
        gbm = xgb.train(params,
                        dtrain,
                        num_boost_round=10000,
                        evals=watchlist,
                        verbose_eval=1000,
                        early_stopping_rounds=500)

        # iteration_range is exclusive at the upper end, so add 1 to include
        # the best iteration found by early stopping.
        oof_preds[val_index] = gbm.predict(dvalid, iteration_range=(0, gbm.best_iteration + 1))
        test_preds += gbm.predict(dtest, iteration_range=(0, gbm.best_iteration + 1)) / kfold.n_splits

    return oof_preds, test_preds



warnings.filterwarnings("ignore")
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUFA_VISIBLE_DEVICES"] = "4"
device = torch.device("cuda:4" if torch.cuda.is_available() else "cpu")

data = pd.read_csv("账户交易信息.csv")
data_static = pd.read_csv("账户静态信息.csv")
data_label = pd.read_csv("训练集标签.csv")
test = pd.read_csv("test_dataset.csv")

# data = data.merge(fea, on='zhdh')
# data = data.merge(act, on='zhdh')

# data['year0'] = data['khrq'].str[:4]
# data['mon0'] = data['khrq'].str[5:7]
# data['day0'] = data['khrq'].str[8:10]
# data['year'] = data['jyrq'].str[:4]
# data['mon'] = data['jyrq'].str[5:7]
# data['day'] = data['jyrq'].str[8:10]
# data['hour'] = data['jysj'].str[:2]
# data['min'] = data['jysj'].str[3:5]
# data['sec'] = data['jysj'].str[6:]
# del data['khrq']
# del data['jyrq']
# del data['jysj']
# del data['jylsxh']

# tmp = pd.concat([data['dfzh'], data['zhdh']], ignore_index=True).unique()
# mapp = {tmp[i]: i for i in range(len(tmp))}
# data['zhdh'] = data['zhdh'].map(lambda x: mapp[x])
# data['dfzh'] = data['dfzh'].map(lambda x: mapp[x])

# tmp = data['zydh'].unique()
# mapp = {tmp[i]: i for i in range(len(tmp))}
# data['zydh'] = data['zydh'].map(lambda x: mapp[x])

# tmp = data['jyqd'].unique()
# mapp = {tmp[i]: i for i in range(len(tmp))}
# data['jyqd'] = data['jyqd'].map(lambda x: mapp[x])

# tmp = pd.concat([data['khjgdh'], data['dfhh']], ignore_index=True).unique()
# mapp = {tmp[i]: i for i in range(len(tmp))}
# data['khjgdh'] = data['khjgdh'].map(lambda x: mapp[x])
# data['dfhh'] = data['dfhh'].map(lambda x: mapp[x])

d = {}
users = data.zhdh.unique().tolist()
for user in tqdm(users):
    lt = data[data.zhdh == user].reset_index(drop=True)

    all_cnt = lt.shape[0]  # number of transactions
    in_cnt = lt.jdbj.sum()  # inbound count, outbound count, inbound share
    out_cnt = all_cnt - in_cnt
    in_ratio = in_cnt / all_cnt

    in_amt = lt[lt.jdbj == 1].jyje.sum()  # inbound/outbound amounts, inbound amount share
    out_amt = lt[lt.jdbj == 0].jyje.sum()
    in_amt_ratio = in_amt / (in_amt + out_amt)

    in_user_cnt = lt[lt.jdbj == 1].dfzh.nunique()  # distinct inbound/outbound counterparties, inbound share
    out_user_cnt = lt[lt.jdbj == 0].dfzh.nunique()
    in_user_ratio = in_user_cnt / (in_user_cnt + out_user_cnt)

    date_cnt = lt.jyrq.nunique()  # number of distinct transaction dates
    d[user] = [all_cnt, in_cnt, out_cnt, in_ratio, in_amt, out_amt, in_amt_ratio, in_user_cnt, out_user_cnt, in_user_ratio, date_cnt]


data_df = pd.DataFrame.from_dict(d).T.reset_index()
data_df.columns = ['zhdh', 'all_cnt', 'in_cnt', 'out_cnt', 'in_ratio', 'in_amt', 'out_amt', 'in_amt_ratio', 'in_user_cnt', 'out_user_cnt', 'in_user_ratio', 'date_cnt']

data_static.columns = ['zhdh', 'khrq', 'khjgdh', 'xb', 'age']
feats = pd.merge(left=data_df, right=data_static[['zhdh', 'xb', 'age']], on='zhdh', how='left')
final = pd.merge(left=feats, right=data_label, on='zhdh', how='left')

train = final[final.black_flag.notnull()].reset_index(drop=True)
test = final[final.black_flag.isnull()].reset_index(drop=True)

# feat = ['zhdh', 'khjgdh', 'xb', 'nl', 'dfzh', 'jdbj', 'jyje', 'zhye', 'dfhh', 'jyqd', 'zydh', 'dfmccd', 'year0', 'mon0', 'day0', 'year', 'mon', 'day', 'hour', 'min', 'sec', 'in_num', 'out_num', 'in_mon', 'out_mon', 'cnt']
# Exclude zhdh (first column) and black_flag (last column) from the features;
# train.columns[1:] would leak the label into the model.
feat = train.columns[1:-1].tolist()

# model = lgb.LGBMClassifier(
#     boosting_type="gbdt", num_leaves=128, reg_alpha=5, reg_lambda=5,
#     max_depth=-1, n_estimators=2000, subsample=0.8, colsample_bytree=0.8, n_jobs=-1,
#     subsample_freq=1, min_child_samples=50, learning_rate=0.1, random_state=2023
# )
# model.fit(train[feat], train['black_flag'])
# pred = model.predict_proba(test[feat])

X_train = train[feat]
y_train = train['black_flag']
X_test = test[feat]

oof_preds, test_preds = train_xgb_kfold(X_train, y_train, X_test)
best_thresh = gen_thres_new(train, oof_preds)

test['black_flag'] = np.where(test_preds > best_thresh, 1, 0)
test[['zhdh', 'black_flag']].to_csv('ans.csv', index=False)
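
# Sanity check on the submission file (an added check, not in the original script):
sub = pd.read_csv('ans.csv')
print(sub.shape, sub['black_flag'].mean())  # the flagged rate should sit near the train prior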
