任务:预测客户是否会购买银行的产品。赛题给出了训练集和测试集,具体如下:
天池【教学赛】金融数据分析赛题1:银行客户认购产品预测
1、首先进行数据预处理
1)针对客户的job、education,进行了背景综合评分
# Lookup tables are built once at import time rather than on every call;
# background_combination() is invoked once per data row.
_JOB_SCORES = {
    'admin.': 4, 'blue-collar': 3, 'entrepreneur': 6,
    'housemaid': 3, 'management': 5, 'retired': 3,
    'self-employed': 4, 'services': 2, 'student': 1,
    'technician': 3, 'unemployed': 1, 'unknown': 3,
}
_EDUCATION_SCORES = {
    'basic.4y': 1, 'basic.6y': 1, 'basic.9y': 2,
    'high.school': 3, 'illiterate': 0, 'professional.course': 4,
    'university.degree': 4, 'unknown': 2,
}


def background_combination(job, education):
    """Return a combined background score for one customer.

    The score is the sum of a job score and an education score, each looked
    up in a fixed table. The literal category ``'unknown'`` has its own
    entry in both tables.

    :param job: job category string (a key of the job table)
    :param education: education category string (a key of the education table)
    :return: job score + education score, as an int
    :raises KeyError: if either category is not present in its table
    """
    return _JOB_SCORES[job] + _EDUCATION_SCORES[education]
2)针对客户的default、housing、loan,进行了经济水平评级
# Lookup tables are built once at import time rather than on every call;
# economic_level() is invoked once per data row.
_DEFAULT_SCORES = {'yes': 1, 'no': 3, 'unknown': 2}
_HOUSING_SCORES = {'yes': 3, 'no': 1, 'unknown': 2}
_LOAN_SCORES = {'yes': 1, 'no': 3, 'unknown': 2}


def economic_level(default, housing, loan):
    """Return an economic-level score for one customer.

    Each of the three answers is mapped to a score from 1 to 3 and the three
    scores are added. ``'unknown'`` maps to the middle value 2 in every
    table.

    :param default: ``'yes'``, ``'no'`` or ``'unknown'``
    :param housing: ``'yes'``, ``'no'`` or ``'unknown'``
    :param loan: ``'yes'``, ``'no'`` or ``'unknown'``
    :return: the summed score, an int between 3 and 9
    :raises KeyError: if any argument is not one of the three accepted values
    """
    return _DEFAULT_SCORES[default] + _HOUSING_SCORES[housing] + _LOAN_SCORES[loan]
3)然后将marital、contact、poutcome映射为整数编码,为后续的one-hot编码做准备
# Integer codes for the three categorical columns that are one-hot encoded
# later. Built once at import time; the function is called three times per
# data row. The key sets of the three tables do not overlap.
_MARITAL_CODES = {'divorced': 1, 'single': 2, 'married': 3, 'unknown': 2}
_CONTACT_CODES = {'cellular': 1, 'telephone': 2}
_POUTCOME_CODES = {'failure': 2, 'nonexistent': 1, 'success': 3}


def one_hot_data_preparation(input):
    """Map a marital / contact / poutcome category to its integer code.

    The value is looked up in the marital table first, then the contact
    table, then the poutcome table; the first table containing it wins.

    :param input: category string from the marital, contact or poutcome column
    :return: the integer code, or ``None`` when the value is in none of the
        three tables
    """
    for codes in (_MARITAL_CODES, _CONTACT_CODES, _POUTCOME_CODES):
        if input in codes:
            return codes[input]
    # Unrecognised category: keep the historical behaviour of returning None.
    return None
def data_transfer_process(dataset):
    """Turn the raw categorical columns into a numeric feature DataFrame.

    Columns are read by position, so ``dataset`` must hold, in this order:
    job, education, default, housing, loan, marital, contact, poutcome.

    :param dataset: DataFrame with at least the eight columns listed above
    :return: DataFrame with columns ``background``, ``economic_level``,
        ``marital``, ``contact`` and ``poutcome``; the last three hold the
        integer codes that are one-hot encoded by the callers
    """
    name = ['background', 'economic_level', 'marital', 'contact', 'poutcome']
    columns = {column: [] for column in name}
    for row in dataset.values:
        columns['background'].append(background_combination(row[0], row[1]))
        columns['economic_level'].append(economic_level(row[2], row[3], row[4]))
        columns['marital'].append(one_hot_data_preparation(row[5]))
        columns['contact'].append(one_hot_data_preparation(row[6]))
        columns['poutcome'].append(one_hot_data_preparation(row[7]))
    # Build the frame column-wise; no manual transpose of row lists is needed.
    return pd.DataFrame(columns, columns=name)
def train_data_preparation():
    """Load ``./train.csv`` and return the encoded training table.

    The result has 14 columns, in this order: ``background``,
    ``economic_level``, the integer-coded ``marital``, ``contact`` and
    ``poutcome``, the indicator columns ``marital_1..3``, ``contact_1..2``
    and ``poutcome_1..3``, and finally ``label``.

    The integer-coded columns are deliberately kept next to their indicator
    columns: the earlier ``drop`` calls discarded their results and never
    took effect, and the driver script slices this table by position.

    :return: DataFrame with one row per row of ``./train.csv``
    """
    raw = pd.read_csv('./train.csv')
    processed_train_data = data_transfer_process(
        raw[['job', 'education',
             'default', 'housing', 'loan',
             'marital', 'contact', 'poutcome']])

    frames = [processed_train_data]
    # One indicator column per known code. Unlike fitting an encoder on the
    # data, this always yields the same columns even when a code is absent,
    # and it does not depend on the installed scikit-learn version.
    for column, codes in (('marital', (1, 2, 3)),
                          ('contact', (1, 2)),
                          ('poutcome', (1, 2, 3))):
        indicators = pd.DataFrame(index=processed_train_data.index)
        for code in codes:
            indicators['{}_{}'.format(column, code)] = (
                processed_train_data[column] == code).astype(int)
        frames.append(indicators)

    # judge() maps 'yes' to 0 and every other value to 1.
    label_train = pd.DataFrame(
        {'label': [judge(value) for value in raw['subscribe']]})
    frames.append(label_train)
    return pd.concat(frames, axis=1)
2、MLP模型的编写
import numpy as np
class MLP:
    """Multi-class multilayer perceptron trained by per-sample gradient descent.

    Every layer, including the output layer, uses a sigmoid activation. The
    weights are kept in a dict ``{layer index: 2-D array}`` with indices
    starting at 1; row 0 of each array holds the bias weights.
    """

    def predict(self, feature, parameter_dict):
        """Predict a class index for every sample.

        Features are scaled with the statistics remembered by the last
        ``train`` call on this instance, so a single sample is scaled the
        same way as the training data. If the instance has not been trained
        (or the feature count differs), the batch is scaled with its own
        statistics via ``normalize``.

        :param feature: m x n feature matrix (a 1-D input is one sample)
        :param parameter_dict: weights as returned by ``train``
        :return: list of m predicted class indices
        """
        feature = np.atleast_2d(np.asarray(feature, dtype=float))
        stats = getattr(self, '_train_stats', None)
        if stats is not None and stats[0].shape[0] == feature.shape[1]:
            mean, deviation = stats
            activation = (feature - mean) / deviation
        else:
            activation = self.normalize(feature)
        # Forward pass for the whole batch at once.
        for layer_index in range(1, len(parameter_dict) + 1):
            weights = np.asarray(parameter_dict[layer_index], dtype=float)
            activation = self.sigmoid(self._with_bias(activation) @ weights)
        return np.argmax(activation, axis=1).tolist()

    def train(self, feature, label, hidden, learning_rate, iteration_num):
        """Fit the network and return its weight matrices.

        :param feature: m x n feature matrix (m samples, n features)
        :param label: m integer class labels, given as 1 x m, m x 1 or 1-D;
            labels are used as column indices, so they must be 0..K-1 where
            K is the number of distinct labels
        :param hidden: dict ``{layer number: neuron count}``, numbered from 1
        :param learning_rate: gradient-descent step size
        :param iteration_num: number of passes over the training samples
        :return: parameter_dict ``{layer index: weight array}``
        :raises ValueError: if the label count differs from the sample count
        """
        feature = np.atleast_2d(np.asarray(feature, dtype=float))
        sample_num, feature_num = feature.shape
        label = np.asarray(label).astype(int).reshape(-1)
        if label.shape[0] != sample_num:
            raise ValueError(
                'expected %d labels, got %d' % (sample_num, label.shape[0]))

        # Remember the training statistics so predict() can reuse them.
        self._train_stats = self._column_stats(feature)
        feature = self.normalize(feature)

        label_categories_num = len(set(label.tolist()))
        layer_sizes = [feature_num]
        layer_sizes += [int(hidden[index]) for index in range(1, len(hidden) + 1)]
        layer_sizes.append(label_categories_num)

        # Uniform [0, 1) initialisation; the extra row is the bias weight.
        parameter_dict = {}
        for layer_index in range(1, len(layer_sizes)):
            parameter_dict[layer_index] = np.random.rand(
                layer_sizes[layer_index - 1] + 1, layer_sizes[layer_index])

        # One-hot encode the labels.
        label_matrix = np.zeros((sample_num, label_categories_num))
        label_matrix[np.arange(sample_num), label] = 1

        return self.gradient_descent(
            feature, label_matrix, parameter_dict, learning_rate, iteration_num)

    def gradient_descent(self, feature, label, parameter_dict, learning_rate, iteration_num):
        """Update the weights with per-sample backpropagation.

        The arrays stored in ``parameter_dict`` are updated in place and the
        same dict is returned.

        :param feature: m x n matrix of already-normalised features
        :param label: m x K one-hot label matrix
        :param parameter_dict: ``{layer index: weight array}``, keys 1..L
        :param learning_rate: gradient-descent step size
        :param iteration_num: number of passes over the samples
        :return: the updated parameter_dict
        """
        feature = np.atleast_2d(np.asarray(feature, dtype=float))
        label = np.atleast_2d(np.asarray(label, dtype=float))
        for key in list(parameter_dict):
            parameter_dict[key] = np.asarray(parameter_dict[key], dtype=float)
        layer_count = len(parameter_dict)

        for _ in range(iteration_num):
            for sample, target in zip(feature, label):
                # Forward pass: keep each layer's biased input and its
                # pre-activation value for the backward pass.
                biased_input = {}
                pre_activation = {}
                current = sample.reshape(1, -1)
                for layer in range(1, layer_count + 1):
                    biased_input[layer] = self._with_bias(current)
                    pre_activation[layer] = biased_input[layer] @ parameter_dict[layer]
                    current = self.sigmoid(pre_activation[layer])

                # Output-layer error for sigmoid outputs with cross-entropy loss.
                deviation = {layer_count: current - target}
                for layer in range(layer_count - 1, 0, -1):
                    hidden_output = self.sigmoid(pre_activation[layer])
                    back = deviation[layer + 1] @ parameter_dict[layer + 1].T
                    # Column 0 belongs to the bias unit, which has no
                    # incoming weights, so its error is discarded.
                    deviation[layer] = back[:, 1:] * hidden_output * (1 - hidden_output)

                # All errors are computed before any weight is changed.
                for layer in range(layer_count, 0, -1):
                    parameter_dict[layer] -= (
                        (learning_rate * biased_input[layer].T) @ deviation[layer])
        return parameter_dict

    def sigmoid(self, z):
        """Return the element-wise logistic function ``1 / (1 + exp(-z))``."""
        return 1 / (1 + np.exp(-z))

    def normalize(self, feature):
        """Standardise each column to zero mean and unit deviation.

        Columns with zero deviation are only centred. Input with at most one
        row is returned unchanged (as a float copy).

        :param feature: m x n feature matrix
        :return: float ndarray of the same shape
        """
        feature = np.asarray(feature, dtype=float)
        mean, deviation = self._column_stats(feature)
        return (feature - mean) / deviation

    def _column_stats(self, feature):
        """Return ``(mean, deviation)`` per column as used by ``normalize``."""
        feature = np.asarray(feature, dtype=float)
        if feature.shape[0] > 1:
            mean = np.mean(feature, axis=0)
            deviation = np.std(feature, axis=0)
            # Avoid dividing by zero for constant columns.
            deviation = np.where(deviation == 0, 1.0, deviation)
        else:
            # Nothing to estimate from a single row: identity scaling.
            mean = np.zeros(feature.shape[1:])
            deviation = np.ones(feature.shape[1:])
        return mean, deviation

    def _with_bias(self, values):
        """Return ``values`` with a leading column of ones (the bias input)."""
        return np.hstack([np.ones((values.shape[0], 1)), values])
3、模型训练及最终完整版代码
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
'''
1、离散特征与连续特征的问题
无序离散特征通常用one-hot编码,有序离散特征通常用label编码
2、unknown数据的处理: background与economic取平均值
'''
def judge(str):
    """Encode a subscription answer as an integer label.

    :param str: answer value; compared against the literal ``'yes'``
    :return: 0 when the value equals ``'yes'``, otherwise 1
    """
    return 0 if str == 'yes' else 1
def transpose(matrix):
    """Return the transpose of a list-of-lists matrix.

    :param matrix: sequence of equally long rows
    :return: new list of lists in which row ``i`` holds element ``i`` of
        every input row; an empty ``matrix`` gives ``[]``
    """
    return [list(column) for column in zip(*matrix)]
# Too much data is missing
def drop_bias_data(dataset):
    """Return only the rows without placeholder values.

    A row is discarded when ``job``, ``marital``, ``education``,
    ``default``, ``housing`` or ``loan`` equals ``'unknown'``, or when
    ``poutcome`` equals ``'nonexistent'``.

    :param dataset: DataFrame containing the seven columns named above
    :return: the filtered DataFrame (original index labels are kept)
    """
    discard = dataset['poutcome'].isin(['nonexistent'])
    for column in ('job', 'marital', 'education', 'default', 'housing', 'loan'):
        discard = discard | dataset[column].isin(['unknown'])
    return dataset[~discard]
# Lookup tables are built once at import time rather than on every call;
# background_combination() is invoked once per data row.
_JOB_SCORES = {
    'admin.': 4, 'blue-collar': 3, 'entrepreneur': 6,
    'housemaid': 3, 'management': 5, 'retired': 3,
    'self-employed': 4, 'services': 2, 'student': 1,
    'technician': 3, 'unemployed': 1, 'unknown': 3,
}
_EDUCATION_SCORES = {
    'basic.4y': 1, 'basic.6y': 1, 'basic.9y': 2,
    'high.school': 3, 'illiterate': 0, 'professional.course': 4,
    'university.degree': 4, 'unknown': 2,
}


def background_combination(job, education):
    """Return a combined background score for one customer.

    The score is the sum of a job score and an education score, each looked
    up in a fixed table. The literal category ``'unknown'`` has its own
    entry in both tables.

    :param job: job category string (a key of the job table)
    :param education: education category string (a key of the education table)
    :return: job score + education score, as an int
    :raises KeyError: if either category is not present in its table
    """
    return _JOB_SCORES[job] + _EDUCATION_SCORES[education]
# Lookup tables are built once at import time rather than on every call;
# economic_level() is invoked once per data row.
_DEFAULT_SCORES = {'yes': 1, 'no': 3, 'unknown': 2}
_HOUSING_SCORES = {'yes': 3, 'no': 1, 'unknown': 2}
_LOAN_SCORES = {'yes': 1, 'no': 3, 'unknown': 2}


def economic_level(default, housing, loan):
    """Return an economic-level score for one customer.

    Each of the three answers is mapped to a score from 1 to 3 and the three
    scores are added. ``'unknown'`` maps to the middle value 2 in every
    table.

    :param default: ``'yes'``, ``'no'`` or ``'unknown'``
    :param housing: ``'yes'``, ``'no'`` or ``'unknown'``
    :param loan: ``'yes'``, ``'no'`` or ``'unknown'``
    :return: the summed score, an int between 3 and 9
    :raises KeyError: if any argument is not one of the three accepted values
    """
    return _DEFAULT_SCORES[default] + _HOUSING_SCORES[housing] + _LOAN_SCORES[loan]
# Integer codes for the three categorical columns that are one-hot encoded
# later. Built once at import time; the function is called three times per
# data row. The key sets of the three tables do not overlap.
_MARITAL_CODES = {'divorced': 1, 'single': 2, 'married': 3, 'unknown': 2}
_CONTACT_CODES = {'cellular': 1, 'telephone': 2}
_POUTCOME_CODES = {'failure': 2, 'nonexistent': 1, 'success': 3}


def one_hot_data_preparation(input):
    """Map a marital / contact / poutcome category to its integer code.

    The value is looked up in the marital table first, then the contact
    table, then the poutcome table; the first table containing it wins.

    :param input: category string from the marital, contact or poutcome column
    :return: the integer code, or ``None`` when the value is in none of the
        three tables
    """
    for codes in (_MARITAL_CODES, _CONTACT_CODES, _POUTCOME_CODES):
        if input in codes:
            return codes[input]
    # Unrecognised category: keep the historical behaviour of returning None.
    return None
def data_transfer_process(dataset):
    """Turn the raw categorical columns into a numeric feature DataFrame.

    Columns are read by position, so ``dataset`` must hold, in this order:
    job, education, default, housing, loan, marital, contact, poutcome.

    :param dataset: DataFrame with at least the eight columns listed above
    :return: DataFrame with columns ``background``, ``economic_level``,
        ``marital``, ``contact`` and ``poutcome``; the last three hold the
        integer codes that are one-hot encoded by the callers
    """
    name = ['background', 'economic_level', 'marital', 'contact', 'poutcome']
    columns = {column: [] for column in name}
    for row in dataset.values:
        columns['background'].append(background_combination(row[0], row[1]))
        columns['economic_level'].append(economic_level(row[2], row[3], row[4]))
        columns['marital'].append(one_hot_data_preparation(row[5]))
        columns['contact'].append(one_hot_data_preparation(row[6]))
        columns['poutcome'].append(one_hot_data_preparation(row[7]))
    # Build the frame column-wise; no manual transpose of row lists is needed.
    return pd.DataFrame(columns, columns=name)
def train_data_preparation():
    """Load ``./train.csv`` and return the encoded training table.

    The result has 14 columns, in this order: ``background``,
    ``economic_level``, the integer-coded ``marital``, ``contact`` and
    ``poutcome``, the indicator columns ``marital_1..3``, ``contact_1..2``
    and ``poutcome_1..3``, and finally ``label``.

    The integer-coded columns are deliberately kept next to their indicator
    columns: the earlier ``drop`` calls discarded their results and never
    took effect, and the driver script slices this table by position.

    :return: DataFrame with one row per row of ``./train.csv``
    """
    raw = pd.read_csv('./train.csv')
    processed_train_data = data_transfer_process(
        raw[['job', 'education',
             'default', 'housing', 'loan',
             'marital', 'contact', 'poutcome']])

    frames = [processed_train_data]
    # One indicator column per known code. Unlike fitting an encoder on the
    # data, this always yields the same columns even when a code is absent,
    # and it does not depend on the installed scikit-learn version.
    for column, codes in (('marital', (1, 2, 3)),
                          ('contact', (1, 2)),
                          ('poutcome', (1, 2, 3))):
        indicators = pd.DataFrame(index=processed_train_data.index)
        for code in codes:
            indicators['{}_{}'.format(column, code)] = (
                processed_train_data[column] == code).astype(int)
        frames.append(indicators)

    # judge() maps 'yes' to 0 and every other value to 1.
    label_train = pd.DataFrame(
        {'label': [judge(value) for value in raw['subscribe']]})
    frames.append(label_train)
    return pd.concat(frames, axis=1)
def tes_data_preparation():
    """Load ``./test.csv`` and return the encoded test table.

    The result has 13 columns, in this order: ``background``,
    ``economic_level``, the integer-coded ``marital``, ``contact`` and
    ``poutcome``, then the indicator columns ``marital_1..3``,
    ``contact_1..2`` and ``poutcome_1..3``.

    The integer-coded columns are deliberately kept next to their indicator
    columns: the earlier ``drop`` calls discarded their results and never
    took effect, and the driver script slices this table by position.

    :return: DataFrame with one row per row of ``./test.csv``
    """
    raw = pd.read_csv('./test.csv')
    processed_test_data = data_transfer_process(
        raw[['job', 'education',
             'default', 'housing', 'loan',
             'marital', 'contact', 'poutcome']])

    frames = [processed_test_data]
    # One indicator column per known code, so the test table always has the
    # same columns as the training table even when a code is absent here,
    # independent of the installed scikit-learn version.
    for column, codes in (('marital', (1, 2, 3)),
                          ('contact', (1, 2)),
                          ('poutcome', (1, 2, 3))):
        indicators = pd.DataFrame(index=processed_test_data.index)
        for code in codes:
            indicators['{}_{}'.format(column, code)] = (
                processed_test_data[column] == code).astype(int)
        frames.append(indicators)
    return pd.concat(frames, axis=1)
import pandas as pd
from sklearn.preprocessing import Normalizer
from MLP import *
import matplotlib.pyplot as plt
import data_preparation
if __name__ == '__main__':
    # Build the encoded training and test tables.
    train_data = np.array(data_preparation.train_data_preparation())
    test_data = np.array(data_preparation.tes_data_preparation())

    # Columns 1..12 are the features and column 13 is the label.
    # NOTE(review): this leaves out column 0 ('background') -- confirm that
    # skipping it is intended.
    feature_train = train_data[:, 1:13]
    label_train = np.array(train_data[:, [13]].T)
    feature_test = test_data[:, 1:13]

    # NOTE(review): the second column of ./submission.csv is used as the
    # reference labels -- confirm that file holds real ground truth.
    label_test = np.array(pd.read_csv('./submission.csv'))
    test_label_list = [data_preparation.judge(value) for value in label_test[:, 1]]

    # Multilayer perceptron with one hidden layer of 5 neurons.
    MLP_test = MLP()
    parameter_dict = MLP_test.train(feature=feature_train, label=label_train, hidden={1: 5}, learning_rate=0.001, iteration_num=1)
    result1 = MLP_test.predict(feature_test, parameter_dict)

    # Labels are 0/1, so the absolute difference counts the mismatches.
    mismatches = 0
    for predicted, expected in zip(result1, test_label_list):
        mismatches += abs(predicted - expected)
    error_rate = mismatches / len(result1)
    accuracy_rate = 1 - error_rate
    # The author reports roughly 85% accuracy with the parameters above.
    print("error_rate:", error_rate)
    print("accuracy_rate:", accuracy_rate)