###### 【风控建模】
说明:利用平台数据和第三方数据,建立基于用户通信信息的反欺诈规则,评估通信信息及通话记录对客户潜在逾期的预警作用。
手机号
开户时间
注册时间
归属地址
、拨号地址
消费账单
通话记录
通讯录详单
紧急联系人电话号
、身份关系
表单 | 信息 | 价值 |
---|---|---|
手机号 | 手机号开户时间 | ①手机号真实性 ②判断手机号使用时长 ③用户粘性 |
通话记录 | ①通话地址(范围) ②通话群体 ③通话时长 | ①手机号价值 ②手机号粘性 ③用户活跃度 |
通讯录 | ①通讯录大小 ②判断通讯录名单联系频率 | ①客户社交范围 ②通讯录价值 |
紧急联系人 | ①是否在通讯录中 ②是否有近期通话记录 | ①联系人真实性 ②潜在欺诈风险 |
逾期账户 | ①是否逾期 | ①逾期客户通信信息情况反馈 |
函数式编程
、数据流程
、代码复用
、实例化
、debug处理
多表查询join
、多表合并union
、时间戳gettime
、筛选条件where
、调用数据库connect
、游标cursor
、数据匹配fetchall
、事务提交commit
读取文件read_excel
、计数value_counts
、重置索引reset_index
、关联匹配merge
、去除重复值drop_duplicates
、最大值max
、去除空值dropna
、替换replace
、字符串str
、矩阵形状shape
、整形int
建立一个for循环,判断字段里是否有某个字符串
、建立循环,批量填充
xlsxwriter.Workbook
、workbook.add_worksheet
、time.sleep(0)
、worksheet.insert_image
、set_column
、zip
、plt
、echarts
"""
pro:AntiFraudRule.py
@author: sunyaowu
"""
import numpy as np
import pandas as pd
import time
import os
import pymysql
import sys
import matplotlib.pyplot as plt
import seaborn as sns
import pyecharts
import xlwt
import xlsxwriter
# Project root folder; the exported .xlsx files are read from / written to its
# "data" sub-folder and the DataBaseSql module is imported from the root.
_path = r'C:\Users\A3\Desktop\2:项目\项目\项目22: Python数据库模块搭建,支持增删查改调用'
# Raw string: the plain literal '\data' contains the invalid escape '\d'.
os.chdir(_path + r'\data')
# Make modules stored in the project root importable.
sys.path.append(_path)
import DataBaseSql
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 20 10:27:06 2018
pro:DataBaseSql.py
@author: sunyaowu
"""
import pymysql
import pandas as pd
class DataBaseSql():
    """Small helper that runs a SQL statement through pymysql."""

    def __init__(self):
        pass

    @staticmethod
    def sql_Select(sql, config):
        """Execute ``sql`` and return all fetched rows as a DataFrame.

        Declared as a static method because it takes no ``self`` and is
        called through the class (``DataBaseSql.sql_Select(sql, config)``).

        Args:
            sql: SQL statement handed to ``cursor.execute``.
            config: keyword arguments forwarded to ``pymysql.connect``.

        Returns:
            ``pandas.DataFrame`` built from ``cursor.fetchall()``.

        Raises:
            Exception: any error raised while connecting or querying is
                propagated; the transaction is rolled back and the
                connection closed before it leaves this function.
        """
        # Connect outside the try block: if this fails there is no
        # connection to roll back or close.
        conn = pymysql.connect(**config)
        try:
            # The context manager closes the cursor (the original called the
            # misspelled ``cur.colse()``, which always raised).
            with conn.cursor() as cur:
                cur.execute(sql)
                rows = cur.fetchall()
            conn.commit()
            return pd.DataFrame(rows)
        except Exception:
            conn.rollback()
            raise
        finally:
            conn.close()
def sql_query(self,num):
    """Build the five SELECT statements used by the pipeline.

    Every statement is restricted to ``user_id < num``; ``num`` is
    substituted with ``%i`` formatting.

    Args:
        num: exclusive upper bound for ``user_id``.

    Returns:
        A 5-tuple of SQL strings, in this order: user base info joined with
        emergency contacts and operator basics, phone-book entries, call
        records, repayment penalties, emergency-contact phones.
    """
    # User base info joined with emergency contacts and operator basics.
    emer_relation_tpl = '''select
distinct
a.user_id,
gmt_modified,
real_name,
basic_phone_num,
b.phone as emer_phone,
a.phone as user_phone,
relation
from
cl_user_base_info a
left join cl_user_emer_contacts b on b.user_id = a.user_id
left join cl_operator_basic c on c.user_id = b.user_id
where a.user_id < %i '''
    # Phone book: union of the three cl_user_contacts_* tables.
    phonebook_tpl = '''select
user_id,phone
from
cl_user_contacts_1
where user_id < %i
union
select
user_id,phone
from
cl_user_contacts_2
where user_id < %i
union
select
user_id,phone
from
cl_user_contacts_3
where user_id < %i '''
    # Call records: union of the three cl_operator_voices_* tables.
    call_records_tpl = '''select
user_id,voice_to_number,voice_place,voice_date,voice_duration
from
cl_operator_voices_1
where user_id < %i
union
select
user_id,voice_to_number,voice_place,voice_date,voice_duration
from
cl_operator_voices_2
where user_id < %i
union
select
user_id,voice_to_number,voice_place,voice_date,voice_duration
from
cl_operator_voices_3
where user_id < %i '''
    # Repayment penalties.
    repay_tpl = '''select
user_id,penalty_amout,penalty_day
from
cl_borrow_repay
where user_id < %i '''
    # Emergency-contact phone numbers.
    emer_phone_tpl = '''select
distinct
user_id,phone
from
cl_user_emer_contacts
where user_id < %i '''

    # The union templates carry three placeholders each.
    thrice = (num,) * 3
    return (
        emer_relation_tpl % num,
        phonebook_tpl % thrice,
        call_records_tpl % thrice,
        repay_tpl % num,
        emer_phone_tpl % num,
    )
def get_data(self, name, config=None):
    """Run every query from ``sql_query`` and save each result as an Excel file.

    The k-th query result is written to ``<_path>\\data\\<name[k]>.xlsx``.
    A query that fails is reported and skipped; nothing is written for it
    (the original went on to write a stale or undefined frame).

    Args:
        name: sequence of output file stems, indexed by query position.
        config: optional keyword arguments for ``pymysql.connect``. When
            omitted, the built-in (redacted) connection settings are used.
    """
    if config is None:
        config = {
            'host': 'XXXXXXXXXXX',
            # NOTE(review): the source shows the redacted token 3XX6, which
            # is not a valid literal; 3306 is assumed here - confirm.
            'port': 3306,
            'db': 'pXXXXX_loan',
            'user': 'cash_XXXXXyw_r',
            'password': 'IxCZIXXXXXXXXXXXXXXlext5G',
            'charset': 'utf8mb4',
            'cursorclass': pymysql.cursors.DictCursor,
        }
    # ``number`` is the module-level user_id bound set by the script entry point.
    for n, sql in enumerate(self.sql_query(number)):
        try:
            data = DataBaseSql.DataBaseSql.sql_Select(sql, config)
        except Exception as err:
            print('Bingo,error!')
            print(repr(err))
            continue
        print('Bingo,get:%i!' % (n + 1))
        # ``_path`` (the original referenced an undefined ``path``); raw
        # string avoids the invalid '\d' escape. The ``encoding`` argument
        # is omitted because recent pandas no longer accepts it.
        data.to_excel(_path + r"\data\%s.xlsx" % name[n], index=False)
def data_pro(self,name):
    """Merge the exported query results into one feature table.

    Reads ``name[0]`` .. ``name[4]`` (``.xlsx`` files in the current working
    directory), derives per-user counters, and writes three workbooks into
    the ``data`` folder under ``_path``.

    Columns added to the ``name[0]`` table (all via left merges on
    ``user_id``):
        phone_counts   rows per user in ``name[1]``.
        voice_counts   rows per user in ``name[2]``.
        tel_5_counts   per user, rows left after keeping one row per
                       ``voice_to_number`` and only numbers that occur more
                       than 5 times in ``name[2]``.
        max_tel_count  largest such occurrence count for the user.
        emer_in_voice  1 when a called number equals the user's
                       emergency-contact phone from ``name[4]``.
        plus the columns of ``name[3]``.

    Args:
        name: sequence of file stems; indices 0-4 are inputs, 5-7 outputs.

    Returns:
        Tuple ``(data_mix, emer_in_voice, emer_in_repay)``: the full table,
        its rows with ``emer_in_voice == 1``, and its rows with
        ``penalty_day >= 0``.
    """
    def count_by(series, key, count_col):
        # Frequency table of ``series`` as a two-column frame; columns are
        # assigned by position so the result does not depend on the names
        # ``reset_index`` produces.
        counts = series.value_counts().reset_index()
        counts.columns = [key, count_col]
        return counts

    def out_path(stem):
        # Raw string: '\d' and '\%' are invalid escapes in a plain literal.
        return _path + r'\data\%s.xlsx' % stem

    base = pd.read_excel('%s.xlsx' % name[0])
    contacts = pd.read_excel('%s.xlsx' % name[1])
    data_mix = pd.merge(base, count_by(contacts['user_id'], 'user_id', 'phone_counts'),
                        on='user_id', how='left')

    calls = pd.read_excel('%s.xlsx' % name[2])
    data_mix = pd.merge(data_mix, count_by(calls['user_id'], 'user_id', 'voice_counts'),
                        on='user_id', how='left')

    # Called numbers that appear more than 5 times, one row per number.
    tel_counts = count_by(calls['voice_to_number'], 'voice_to_number', 'tel_counts')
    frequent = (pd.merge(calls.dropna(), tel_counts, on='voice_to_number', how='left')
                .drop_duplicates(subset=['voice_to_number'])
                .astype({'tel_counts': 'int'}))
    frequent = frequent[frequent['tel_counts'] > 5]
    per_user = count_by(frequent['user_id'], 'user_id', 'tel_5_counts')
    max_per_user = frequent[['user_id', 'tel_counts']].groupby('user_id').max().reset_index()
    per_user = pd.merge(per_user, max_per_user, on='user_id', how='left')
    per_user = per_user.rename(columns={'tel_counts': 'max_tel_count'})
    data_mix = pd.merge(data_mix, per_user, on='user_id', how='left')

    # Flag call records whose called number is the user's emergency contact.
    emer_phones = pd.read_excel('%s.xlsx' % name[4])
    calls_emer = (pd.merge(calls, emer_phones, on='user_id', how='left')
                  .drop_duplicates(subset=['voice_to_number'])
                  .dropna()
                  .reset_index())
    # Compare as strings, element by element, exactly like the original row
    # loop - but vectorised and without chained assignment, which silently
    # writes to a copy under pandas copy-on-write.
    is_emer_call = calls_emer['voice_to_number'].map(str) == calls_emer['phone'].map(str)
    calls_emer['emer_in_voice'] = is_emer_call.astype(int)
    calls_emer = calls_emer[calls_emer['emer_in_voice'] == 1]
    data_mix = pd.merge(data_mix, calls_emer[['user_id', 'emer_in_voice']],
                        on='user_id', how='left')

    repay = pd.read_excel('%s.xlsx' % name[3])
    data_mix = pd.merge(data_mix, repay, on='user_id', how='left')

    # Users missing from a source table get 0 instead of NaN.
    count_cols = ['phone_counts', 'voice_counts', 'tel_5_counts', 'max_tel_count', 'emer_in_voice']
    data_mix[count_cols] = data_mix[count_cols].fillna(0)

    data_mix.to_excel(out_path(name[5]))
    emer_in_voice = data_mix[data_mix['emer_in_voice'] == 1]
    emer_in_voice.to_excel(out_path(name[6]))
    emer_in_repay = data_mix[data_mix['penalty_day'] >= 0]
    emer_in_repay.to_excel(out_path(name[7]))
    return data_mix,emer_in_voice,emer_in_repay
# Description, analysis and visual report for all fetched data.
def data_query_description(self,num,name):
    """Summarise the merged table, print the summary and write it to Excel.

    Reads the ``name[5]`` workbook from the ``data`` folder under ``_path``,
    counts the rows with a positive value in four columns, writes one text
    line per column into a new ``query_data_report`` worksheet of the
    module-level ``workbook``, adds two ratio lines, inserts the histograms
    through ``data_query_show`` and finally closes the workbook.

    Args:
        num: number quoted in the summary text as the amount of fetched data.
        name: sequence of file stems; ``name[5]`` is read here and the whole
            sequence is forwarded to ``data_query_show``.

    Returns:
        The DataFrame read from ``name[5]``.
    """
    def percent(part, whole):
        # An empty table or a zero count yields 0 % instead of ZeroDivisionError.
        return part / whole * 100 if whole else 0.0

    # Raw string: '\d' and '\%' are invalid escapes in a plain literal.
    data = pd.read_excel(_path + r'\data\%s.xlsx' % name[5])
    worksheet1 = workbook.add_worksheet('query_data_report')
    worksheet1.set_column('A:B',50)
    a = ['通讯录信息','运营商信息','通话记录','通话记录']
    b = ['phone_counts','voice_counts','tel_5_counts','emer_in_voice']
    c = ['开通通讯录权限','开通运营商信息权限','有通话记录','与紧急联系人存在通话记录']
    total = len(data)
    data_count = []
    for row, (source, column, meaning) in enumerate(zip(a, b, c)):
        valid = int((data[column] > 0).sum())
        data_count.append(valid)
        # Placeholder order fixed: the label fills %s and ``num`` fills %i
        # (the original passed them the other way round).
        d = '爬取%s数据共%i条,其中有效数据共%i条' % (source, num, valid)
        e = '说明用户%s的比例为:%.2f%%' % (meaning, percent(valid, total))
        print(d,e)
        worksheet1.write(row,0,d)  # write to the Excel report
        worksheet1.write(row,1,e)
        print('-----next-----')
    d = '开通运营商权限的%i个用户中,有过通话记录的为%i,占比%.2f%%' % (
        data_count[1], data_count[2], percent(data_count[2], data_count[1]))
    print(d)
    e = '有过通话记录的%i个用户中,与紧急联系人通话的为%i,占比%.2f%%' % (
        data_count[2], data_count[3], percent(data_count[3], data_count[2]))
    print('-----next-----')
    print(e)
    print('Bingo!')
    # One blank row separates the per-column lines from the two ratio lines.
    next_row = len(data_count)
    worksheet1.write(next_row + 1,0,d)
    worksheet1.write(next_row + 2,0,e)
    self.data_query_show(num,data,name,worksheet1)
    workbook.close()  # finalise the Excel file
    return data
def data_query_show(self,num,data,name,worksheet=None):
    """Plot histograms of four counters and optionally embed them in a worksheet.

    For each of ``phone_counts``, ``voice_counts``, ``tel_5_counts`` and
    ``max_tel_count`` the rows with a positive value are passed to
    ``hist_show`` (100 requested bins), which also saves a PNG. When a
    worksheet is given, the four PNGs are inserted at cells A8, B8, A18 and
    B18 at half scale.

    Args:
        num: unused here; kept for interface compatibility.
        data: DataFrame holding the four counter columns.
        name: unused here; kept for interface compatibility.
        worksheet: xlsxwriter worksheet to receive the images. Optional,
            because sibling methods call this method with three arguments.
    """
    box = ['phone_counts','voice_counts','tel_5_counts','max_tel_count']
    for item in box:
        self.hist_show(data[data[item] > 0],item,100)
    if worksheet is None:
        return
    for column, row, letter in zip(box,[8,8,18,18],['A','B','A','B']):
        worksheet.insert_image('%s%i' % (letter, row),
                               _path + r'\report+image\%s.png' % column,
                               {'x_scale': 0.5, 'y_scale': 0.5})
# Description, analysis and visual report for data with emergency contacts.
def data_emer_description(self,num,name):
    """Print a summary of the merged table and embed histograms in a worksheet.

    Reads the ``name[5]`` workbook from the ``data`` folder under ``_path``,
    prints per-column valid-row counts and ratios, and passes a new
    ``emer_data_report`` worksheet of the module-level ``workbook`` to
    ``data_query_show``.

    Args:
        num: number quoted in the printed text as the amount of fetched data.
        name: sequence of file stems; ``name[5]`` is read here.

    Returns:
        The DataFrame read from ``name[5]``.
    """
    def percent(part, whole):
        # An empty table or a zero count yields 0 % instead of ZeroDivisionError.
        return part / whole * 100 if whole else 0.0

    # NOTE(review): reads name[5] although data_emer_show reads name[6] - confirm.
    # Raw string: '\d' and '\%' are invalid escapes in a plain literal.
    data = pd.read_excel(_path + r'\data\%s.xlsx' % name[5])
    worksheet2 = workbook.add_worksheet('emer_data_report')
    a = ['通讯录信息','运营商信息','通话记录','通话记录']
    b = ['phone_counts','voice_counts','tel_5_counts','emer_in_voice']
    c = ['开通通讯录权限','开通运营商信息权限','有通话记录','与紧急联系人存在通话记录']
    total = len(data)
    data_count = []
    for source, column, meaning in zip(a, b, c):
        valid = int((data[column] > 0).sum())
        data_count.append(valid)
        print('爬取客户数据共%i条,其中有效%s数据共%i条' % (num, source, valid))
        print('说明用户%s的比例为:%.2f%%' % (meaning, percent(valid, total)))
        print('-----next-----')
    print('开通运营商权限的%i个用户中,有过通话记录的为%i,占比%.2f%%' % (
        data_count[1], data_count[2], percent(data_count[2], data_count[1])))
    print('-----next-----')
    print('有过通话记录的%i个用户中,与紧急联系人通话的为%i,占比%.2f%%' % (
        data_count[2], data_count[3], percent(data_count[3], data_count[2])))
    print('Bingo!')
    # Pass the worksheet: data_query_show takes it as its fourth argument and
    # the original three-argument call raised TypeError.
    self.data_query_show(num,data,name,worksheet2)
    return data
def data_emer_show(self,name):
    """Plot histograms for the rows stored in the ``name[6]`` workbook.

    For ``phone_counts``, ``voice_counts`` and ``tel_5_counts`` the rows with
    a value above 0 and strictly below the column maximum are passed to
    ``hist_show`` with 20 requested bins.

    Args:
        name: sequence of file stems; ``name[6]`` is read from the ``data``
            folder under ``_path``.

    Returns:
        The DataFrame read from ``name[6]``.
    """
    # Raw string: '\d' and '\%' are invalid escapes in a plain literal.
    data = pd.read_excel(_path + r'\data\%s.xlsx' % name[6])
    for item in ['phone_counts','voice_counts','tel_5_counts']:
        # The original referenced an undefined ``data1``; the frame loaded
        # above is the only one in scope.
        in_range = (data[item] > 0) & (data[item] < data[item].max())
        self.hist_show(data[in_range],item,20)
    return data
# Description, analysis and visual report for borrower repayment data.
def data_repay_description(self,num,name):
    """Print a summary of the merged table and embed histograms in a worksheet.

    Reads the ``name[5]`` workbook from the ``data`` folder under ``_path``,
    prints per-column valid-row counts and ratios, and passes a new
    ``repay_data_report`` worksheet of the module-level ``workbook`` to
    ``data_query_show``.

    Args:
        num: number quoted in the printed text as the amount of fetched data.
        name: sequence of file stems; ``name[5]`` is read here.

    Returns:
        The DataFrame read from ``name[5]``.
    """
    def percent(part, whole):
        # An empty table or a zero count yields 0 % instead of ZeroDivisionError.
        return part / whole * 100 if whole else 0.0

    # NOTE(review): reads name[5] although data_repay_show reads name[7] - confirm.
    # Raw string: '\d' and '\%' are invalid escapes in a plain literal.
    data = pd.read_excel(_path + r'\data\%s.xlsx' % name[5])
    worksheet3 = workbook.add_worksheet('repay_data_report')
    a = ['通讯录信息','运营商信息','通话记录','通话记录']
    b = ['phone_counts','voice_counts','tel_5_counts','emer_in_voice']
    c = ['开通通讯录权限','开通运营商信息权限','有通话记录','与紧急联系人存在通话记录']
    total = len(data)
    data_count = []
    for source, column, meaning in zip(a, b, c):
        valid = int((data[column] > 0).sum())
        data_count.append(valid)
        print('爬取%s数据共%i条,其中有效数据共%i条' % (source, num, valid))
        print('说明用户%s的比例为:%.2f%%' % (meaning, percent(valid, total)))
        print('-----next-----')
    print('开通运营商权限的%i个用户中,有过通话记录的为%i,占比%.2f%%' % (
        data_count[1], data_count[2], percent(data_count[2], data_count[1])))
    print('-----next-----')
    print('有过通话记录的%i个用户中,与紧急联系人通话的为%i,占比%.2f%%' % (
        data_count[2], data_count[3], percent(data_count[3], data_count[2])))
    print('Bingo!')
    # Pass the worksheet: data_query_show takes it as its fourth argument and
    # the original three-argument call raised TypeError.
    self.data_query_show(num,data,name,worksheet3)
    return data
def data_repay_show(self,name):
    """Plot histograms and box plots for the rows in the ``name[7]`` workbook.

    Histograms (20 requested bins) cover the positive values of
    ``phone_counts``, ``voice_counts`` and ``tel_5_counts``; box plots show
    ``penalty_day`` against ``relation``, ``phone_counts`` and
    ``voice_counts``.

    Args:
        name: sequence of file stems; ``name[7]`` is read from the ``data``
            folder under ``_path``.

    Returns:
        The DataFrame read from ``name[7]``.
    """
    # Raw string: '\d' and '\%' are invalid escapes in a plain literal.
    data = pd.read_excel(_path + r'\data\%s.xlsx' % name[7])
    # The original referenced undefined ``data1`` / ``data3``; the frame
    # loaded above is the only one in scope.
    for item in ['phone_counts','voice_counts','tel_5_counts']:
        self.hist_show(data[data[item] > 0],item,20)
    for item in ['relation','phone_counts','voice_counts']:
        self.boxplot_show(data,item,'penalty_day')
    return data
# =============================================================================
# #可视化及数据报告功能
# =============================================================================
# Frequency-distribution histogram.
def hist_show(self,data,field,bin):
    """Draw a histogram of ``data[field]``, save it as a PNG and show it.

    The image is written to ``<_path>\\report+image\\<field>.png`` at 100 dpi.

    Args:
        data: DataFrame containing the column ``field``.
        field: column name; also used as plot title and file stem.
        bin: number of bins. The original ignored this argument and always
            used 40.
    """
    # Start from a fresh figure so bars from an earlier call cannot leak in.
    plt.figure()
    # ``density=False`` replaces ``normed=0``, which current matplotlib rejects.
    plt.hist(data[field], bins=bin, density=False, facecolor="blue", edgecolor="black", alpha=0.7)
    plt.title(field)
    # Save before show(): showing may leave the current figure empty.
    plt.savefig(_path + r'\report+image\%s.png' % field, dpi=100)
    plt.show()
    plt.close()
# Bar chart (no implementation follows this label).
# Box plot (no implementation follows this label).
# Two-variable box plot.
def boxplot_show(self,data,field1,field2):
    """Show a seaborn box plot of ``field2`` grouped by ``field1``.

    Args:
        data: DataFrame containing both columns.
        field1: column used for the x axis and as the plot title.
        field2: column used for the y axis.
    """
    selected = [data[column] for column in (field1, field2)]
    frame = pd.concat(selected, axis=1)
    sns.boxplot(x=field1, y=field2, data=frame)
    plt.title(field1)
    plt.show()
# ECharts chart rendered to a local file.
def echart_show(self,data):
    """Build a pyecharts ``Bar`` chart from ``data`` and render it.

    Args:
        data: values passed to ``Bar.add`` for the series named '联系人'.
    """
    # NOTE(review): this uses the ``from pyecharts import Bar`` import style;
    # confirm it matches the installed pyecharts version.
    from pyecharts import Bar, Line
    from pyecharts.engine import create_default_environment
    chart = Bar("紧急联系人分布", "副标题")
    chart.add('联系人',data)
    # chart.print_echarts_options() prints the options, handy when debugging.
    chart.render()  # writes a local HTML file
# Write the Excel file and generate the report.
def data_report(self=None):
    """Placeholder for report generation; currently does nothing.

    ``self`` defaults to ``None`` so the function can be invoked both as a
    bound method (``self.data_report()``, which the original signature made
    impossible) and without any argument, as before.
    """
    pass
def __model_result__(self):
    """Placeholder for the model-result step; currently does nothing.

    The name is restored from ``**model_result**``, a markdown-bold rendering
    of the double underscores that is not valid Python.
    """
    pass
if __name__ == "__main__":
    # Script entry point: runs the enabled pipeline stages and reports the
    # elapsed time. ``number`` and ``workbook`` are module-level names that
    # the AntiFraudRule methods read.
    AFR = AntiFraudRule()
    # Exclusive upper bound for user_id; an interactive prompt was used
    # previously and is disabled.
    number = 10000
    # File stems: indices 0-4 are query exports, 5-7 are derived tables.
    name = [
        'emer_relation_data',
        'phonebook_data',
        'tel_records_data',
        'repay_data',
        'emer_real_data',
        'data_mix',
        'emer_in_voice',
        'emer_in_repay',
    ]
    _started_at = time.time()

    # 1. Data acquisition (disabled).
    print("---------- 1.get_data ----------")
    # AFR.get_data(name)

    # 2. Data processing (disabled).
    print("---------- 2.data_pro ----------")
    # data_mix, emer_in_voice, emer_in_repay = AFR.data_pro(name)

    # 3. Data analysis.
    print("---------- 3.data_show ----------")
    workbook = xlsxwriter.Workbook('report.xlsx')
    data_query = AFR.data_query_description(number, name)
    # data_emer = AFR.data_emer_description(number, name)
    # data_repay = AFR.data_repay_description(number, name)

    # 4. Feature mining (not implemented).
    # 5. Visual report (not implemented).

    _finished_at = time.time()
    _elapsed = _finished_at - _started_at
    print('You have finished!\nfanilly use time: {x:.2f}s'.format(x=_elapsed))