该项目通过美国人口普查数据训练一个模型来预测美国人口收入水平。数据集上包含199523个训练数据和99762个测试数据,各包含了41个属性。经分析,该数据包含了人口统计信息、年龄、贷款信息、国籍、种族等信息。属性数据中有包含空值和有偏分布等问题,处理思路如下:
1.读取数据,观察特征及其分布
2.分析缺失情况,处理缺失值
3.异常值处理
4.对分类变量进行哑编码
5.用随机森林进行重要特征筛选
6.重采样对不均衡数据进行处理
7.构建XGBOOST模型,并进行建模分析预测
import numpy as np
import pandas as pd
train_df=pd.read_csv('train.csv')
test_df=pd.read_csv('test.csv')
print ('train_df:%s,%s'%train_df.shape)
print ('test_df:%s,%s'%test_df.shape)
##检查因变量
train_df.income_level.unique()
test_df.income_level.unique()
#为了便于分析,将变量编码为0,1
train_df.loc[train_df['income_level']==-50000,'income_level']=0
train_df.loc[train_df['income_level']== 50000,'income_level']=1
test_df.loc[test_df['income_level']=='-50000','income_level']=0
test_df.loc[test_df['income_level']=='50000+.','income_level']=1
##检查样本不均衡程度
a=train_df['income_level'].sum()*100.0/train_df['income_level'].count()
b=test_df['income_level'].sum()*100.0/test_df['income_level'].count()
print ('train_df (1,0):(%s,%s)'%(a,100-a))
print ('test_df (1,0):(%s,%s)'%(b,100-b))
train_df.info()
import matplotlib.pyplot as plt
def num_tr(filed,n):
fig=plt.figure(figsize=(10,5))
train_df[filed].hist(bins=n)
plt.title('%s'%filed)
plt.show()
num_tr('age',100)
我猜测20岁以下和步入工作不久的人,比较不可能>50K,但是也不一定
现在将其分组,0-22,22-35,35-60,60-90,对应编码为:0,1,2,3(22岁为本科毕业平均年龄,35为工作初期(前10年),60岁为退休年龄)
'''
#创建年龄分组字段
labels=[0,1,2,3,4,5,6,7,8,9]
train_df['age_class']=pd.cut(train_df['age'],bins=[-1,10,20,30,40,50,60,70,80,90,100],labels=labels)
test_df['age_class']=pd.cut(test_df['age'],bins=[-1,10,20,30,40,50,60,70,80,90,100],labels=labels)
'''
收入水平为1的主要集中在30-50岁之间,并且可以看出,收入水平为1的人群年龄分布是接近正态的,均值为50.
'''
fig=plt.figure(figsize=(12,6))
train_df.groupby(['age_class','income_level'])['income_level'].count().unstack().plot(kind='bar')
plt.title('income_level wrt age')
plt.show()
'''
#查看收入水平的人群的年龄分布
fig=plt.figure(figsize=(12,6))
train_df.age[train_df.income_level==0].plot(kind='kde')
train_df.age[train_df.income_level==1].plot(kind='kde')
plt.legend(('0','1'))
plt.show()
右偏数据。后续有待进一步分析。
fig=plt.figure(figsize=(8,4))
plt.subplot2grid((1,2),(0,0))
train_df.capital_gains.plot(kind='box')
plt.subplot2grid((1,2),(0,1))
train_df.capital_losses.plot(kind='box')
plt.show()
水平为0的主要集中在0,50,而水平位1的则主要为50. 这里可以看出,水平为1的该变量几乎没有取值为1的。
#查看收入水平的人群的周工作时长分布
fig=plt.figure(figsize=(8,4))
plt.subplot2grid((1,2),(0,0))
train_df.weeks_worked_in_year[train_df.income_level==0].hist(bins=20)
plt.subplot2grid((1,2),(0,1))
train_df.weeks_worked_in_year[train_df.income_level==1].hist(bins=20,color='r')
plt.show()
右偏数据。后续有待进一步分析。
fig=plt.figure(figsize=(12,6))
train_df.dividend_from_Stocks[train_df.income_level==0].hist(bins=100)
train_df.dividend_from_Stocks[train_df.income_level==1].hist(bins=100)
plt.legend(('0','1'))
plt.show()
level为0的主要为0,level为1的则主要是6.
fig=plt.figure(figsize=(12,6))
#train_df.num_person_Worked_employer[train_df.income_level==0].hist(bins=100)
#train_df.num_person_Worked_employer[train_df.income_level==1].hist(bins=100)
train_df.groupby(['num_person_Worked_employer','income_level'])['income_level'].count().unstack().plot(kind='bar')
plt.legend(('0','1'))
plt.show()
虽然没有提供有关Not in universe类别的具体信息。我们假设这个答案是由填写人口普查数据而感到沮丧的人(由于任何原因)给出的。
这个变量看起来不平衡,即只有两个类别似乎占主导地位。在这种情况下,一个好的做法是将总分类频率的频率小于5%的水平组合起来。后续处理。
fig=plt.figure(figsize=(18,12))
train_df.groupby(['class_of_worker','income_level'])['income_level'].count().unstack().plot(kind='bar')
#plt.xticks(rotation=120)
plt.legend(('0','1'))
plt.show()
Bachelors degree学士学位有最多level为1的
fig=plt.figure(figsize=(18,8))
train_df.groupby(['education','income_level'])['income_level'].count().unstack().plot(kind='bar')
#plt.xticks(rotation=30)
plt.legend(('0','1'))
plt.show()
Married-civilian spouse present已婚且配偶在场的人中level=1最多
fig=plt.figure(figsize=(18,8))
train_df.groupby(['marital_status','income_level'])['income_level'].count().unstack().plot(kind='bar')
#plt.xticks(rotation=30)
plt.legend(('0','1'))
plt.show()
白人占统样本的多数,并且,白人的level=1也最多。
fig=plt.figure(figsize=(18,8))
train_df.groupby(['race','income_level'])['income_level'].count().unstack().plot(kind='bar')
#plt.xticks(rotation=30)
plt.legend(('0','1'))
plt.show()
样本中,整体女性人数占比较大,但是level为1的主要为男性。
fig=plt.figure(figsize=(18,8))
train_df.groupby(['sex','income_level'])['income_level'].count().unstack().plot(kind='bar')
#plt.xticks(rotation=30)
plt.legend(('0','1'))
plt.show()
两类都集中在Not in universe中
fig=plt.figure(figsize=(18,8))
train_df.groupby(['member_of_labor_union','income_level'])['income_level'].count().unstack().plot(kind='bar')
#plt.xticks(rotation=30)
plt.legend(('0','1'))
plt.show()
fig=plt.figure(figsize=(18,8))
train_df.groupby(['full_parttime_employment_stat','income_level'])['income_level'].count().unstack().plot(kind='bar')
#plt.xticks(rotation=30)
plt.legend(('0','1'))
plt.show()
fig=plt.figure(figsize=(18,8))
train_df.groupby(['tax_filer_status','income_level'])['income_level'].count().unstack().plot(kind='bar')
#plt.xticks(rotation=30)
plt.legend(('0','1'))
plt.show()
fig=plt.figure(figsize=(18,8))
train_df.groupby(['business_or_self_employed','income_level'])['income_level'].count().unstack().plot(kind='bar')
#plt.xticks(rotation=30)
plt.legend(('0','1'))
plt.show()
test数据没有缺失值,但是有?,首先将?置为空
s=pd.Series(train_df.isnull().sum())
print s
ss=pd.Series(test_df.isnull().sum())
print ss
##检查样本缺失比例
m=train_df.shape[0]
for i,j in s.iteritems():
if j>0:
print i,j*100.0/m
print '----------------------------'
#检查测试样本缺失比例
n=test_df.shape[0]
for i,j in ss.iteritems():
if j>0:
print i,j*100.0/n
del train_df['migration_msa']
del train_df['migration_reg']
del train_df['migration_within_reg']
del train_df['migration_sunbelt']
del test_df['migration_msa']
del test_df['migration_reg']
del test_df['migration_within_reg']
del test_df['migration_sunbelt']
train_df['hispanic_origin']= train_df['hispanic_origin'].fillna('others')
train_df['state_of_previous_residence']= train_df['state_of_previous_residence'].fillna('others')
train_df['country_father']= train_df['country_father'].fillna('others')
train_df['country_mother']= train_df['country_mother'].fillna('others')
train_df['country_self']= train_df['country_self'].fillna('others')
test_df['hispanic_origin']= test_df['hispanic_origin'].fillna('others')
test_df['state_of_previous_residence']= test_df['state_of_previous_residence'].fillna('others')
test_df['country_father']= test_df['country_father'].fillna('others')
test_df['country_mother']= test_df['country_mother'].fillna('others')
test_df['country_self']= test_df['country_self'].fillna('others')
def outliner(df,filed):
df[filed]=np.log(df[filed]+1)
df[filed].plot(kind='kde')
#训练数据
fig=plt.figure(figsize=(15,5))
plt.subplot2grid((2,2),(0,0))
outliner(train_df,'capital_losses') #.capital_losses&capital_gains
plt.subplot2grid((2,2),(0,1))
outliner(train_df,'capital_gains')
plt.subplot2grid((2,2),(1,0))
outliner(train_df,'dividend_from_Stocks')
plt.show()
##测试数据
fig=plt.figure(figsize=(15,5))
plt.subplot2grid((2,2),(0,0))
outliner(test_df,'capital_losses') #.capital_losses&capital_gains
plt.subplot2grid((2,2),(0,1))
outliner(test_df,'capital_gains')
plt.subplot2grid((2,2),(1,0))
outliner(test_df,'dividend_from_Stocks')
plt.show()
def dummy_encode(df,filed,a):
dummies=pd.get_dummies(df[filed],prefix=a)
len=dummies.shape[1]-1
a= dummies.iloc[:,0:len]
b=pd.concat([df, a], axis=1)
del b[filed]
return b
print train_df.shape,test_df.shape
df_all=pd.concat([train_df,test_df])
#对训练和测试数据进行哑编码
df_all=dummy_encode(df_all,'fill_questionnaire_veteran_admin','fill_questionnaire_veteran_admin')
df_all=dummy_encode(df_all,'citizenship','citizenship')
df_all=dummy_encode(df_all,'country_self','country_self')
df_all=dummy_encode(df_all,'country_mother','country_mother')
df_all=dummy_encode(df_all,'country_father','country_father')
df_all=dummy_encode(df_all,'family_members_under_18','family_members_under_18')
df_all=dummy_encode(df_all,'live_1_year_ago','live_1_year_ago')
df_all=dummy_encode(df_all,'d_household_summary','d_household_summary')
df_all=dummy_encode(df_all,'class_of_worker','class_of_worker')
df_all=dummy_encode(df_all,'education','education')
df_all=dummy_encode(df_all,'enrolled_in_edu_inst_lastwk','enrolled_in_edu_inst_lastwk')
df_all=dummy_encode(df_all,'marital_status','marital_status')
df_all=dummy_encode(df_all,'major_industry_code','major_industry_code')
df_all=dummy_encode(df_all,'major_occupation_code','major_occupation_code')
df_all=dummy_encode(df_all,'race','race')
df_all=dummy_encode(df_all,'hispanic_origin','hispanic_origin')
df_all=dummy_encode(df_all,'sex','sex')
df_all=dummy_encode(df_all,'member_of_labor_union','member_of_labor_union')
df_all=dummy_encode(df_all,'reason_for_unemployment','reason_for_unemployment')
df_all=dummy_encode(df_all,'full_parttime_employment_stat','full_parttime_employment_stat')
df_all=dummy_encode(df_all,'tax_filer_status','tax_filer_status')
df_all=dummy_encode(df_all,'region_of_previous_residence','region_of_previous_residence')
df_all=dummy_encode(df_all,'state_of_previous_residence','state_of_previous_residence')
df_all=dummy_encode(df_all,'d_household_family_stat','d_household_family_stat')
train_df=df_all.iloc[0:199523,:]
test_df=df_all.iloc[199523:,:]
print train_df.shape,test_df.shape
#test_df.to_csv('testooooo.csv')
#train_df.to_csv('trainooooo.csv')
随机森林做特征选择
##将目标变量提到最后一列
Y=train_df['income_level']
del train_df['income_level']
train_df['income_level']=Y
YT=test_df['income_level']
del test_df['income_level']
test_df['income_level']=YT
y=Y
X=train_df.iloc[:,0:351]
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
selected_feat_names=set()
for i in range(10): #这里我们进行十次循环取交集
tmp = set()
rfc = RandomForestClassifier(n_jobs=-1)
rfc.fit(X, y)
#print("training finished")
importances = rfc.feature_importances_
indices = np.argsort(importances)[::-1] # 降序排列
S={}
for f in range(X.shape[1]):
if importances[indices[f]] >=0.0001:
tmp.add(X.columns[indices[f]])
S[X.columns[indices[f]]]=importances[indices[f]]
#print("%2d) %-*s %f" % (f + 1, 30, X.columns[indices[f]], importances[indices[f]]))
selected_feat_names |= tmp
imp_fea=pd.Series(S)
print(len(selected_feat_names), "features are selected")
train_new=train_df[['income_level']]
test_new=test_df[['income_level']]
for i in selected_feat_names:
train_new[i]=train_df[i]
try :
test_new[i]=test_df[i]
except Exception :
print '----------------'
print i
del train_new[i]
print train_new.shape,test_new.shape
##将目标变量提到最后一列
Y=train_new['income_level']
del train_new['income_level']
train_new['income_level']=Y
YT=test_new['income_level']
del test_new['income_level']
test_new['income_level']=YT
#train_new.to_csv('train_new.csv')
#test_new.to_csv('test_new.csv')
首先:不均衡技术处理(欠采样和过采样技术)
其次:模型选择与训练(xgboost)
最后:调参,调参我希望在保证精确度》0.94的前提下,AUC越大越好
train_df (1,0):(6.20580083499,93.794199165)
test_df (1,0):(6.20075780357,93.7992421964)
正例:12382 负例:187141
抽样比例:25%
发生率为:21%
def down_sample(df):
df1=df[df['income_level']==1]#正例
df2=df[df['income_level']==0]##负例
df3=df2.sample(frac=0.25)##抽负例
return pd.concat([df1,df3],ignore_index=True)
down_train_df=down_sample(train_df)
down_train_new=down_sample(train_new)
train_df (1,0):(6.20580083499,93.794199165)
test_df (1,0):(6.20075780357,93.7992421964)
正例:12382 负例:187141
正例复制5次
发生率为:25%
def up_sample(df):
df1=df[df['income_level']==1]#正例
df2=df[df['income_level']==0]##负例
df3=pd.concat([df1,df1,df1,df1,df1],ignore_index=True)
return pd.concat([df2,df3],ignore_index=True)
up_train_df=up_sample(train_df)
up_train_new=up_sample(train_new)
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import metrics
from sklearn.cross_validation import train_test_split
#记录程序运行时间
import time
##定义模型参数
param = {}
# use logistic regression loss
param['objective'] = 'binary:logistic'
# scale weight of positive examples
param['scale_pos_weight'] = 1
param['bst:eta'] = 0.2
param['bst:max_depth'] = 6
param['eval_metric'] = 'logloss'
param['silent'] = 1
param['nthread'] = 10
Threshold=0.5
def xgb_model(train,tests,list,pam,Threshold):
train_xy,val = train_test_split(train, test_size = 0.3,random_state=1)
#random_state is of big influence for val-auc
y = train_xy['income_level']
X = train_xy.drop(['income_level'],axis=1)
val_y = val['income_level']
val_X = val.drop(['income_level'],axis=1)
weight1 = np.ones(len(y))
weight2 = np.ones(len(val_y))
xgb_val = xgb.DMatrix(val_X,label=val_y,weight = weight2)
xgb_train = xgb.DMatrix(X, label=y,weight = weight1)#,weight = weight1
test_y = tests['income_level']
test_X = tests.drop(['income_level'],axis=1)
xgb_test = xgb.DMatrix(test_X)##不要label
watchlist = [(xgb_train, 'train'),(xgb_val, 'val')]
num_round = 100 # 迭代次数
# training model
print ("training xgboost")
threads = list
##调参
for i in threads:
param[pam] = i
tmp = time.time()
plst = param.items()+[('eval_metric', '[email protected]')]
model = xgb.train( plst, xgb_train, num_round, watchlist,verbose_eval=False )
preds = model.predict(xgb_test,ntree_limit=model.best_ntree_limit)
print pam,i
print ("XGBoost with %d thread costs: %s seconds" % (i, str(time.time() - tmp)))
for j in range(len(preds)):
if preds[j]>=Threshold :
preds[j]=1
else :
preds[j]=0
print 'AUC: %.4f' % metrics.roc_auc_score(test_y,preds)
print 'ACC: %.4f' % metrics.accuracy_score(test_y,preds)
return model
**分别在 特征筛选后的train_new,以及在此基础上的上采样和下采样的数据集上进行调参,以此选出最佳模型
list=[6,7,8,9]
上采样时:树深为9,10可以继续调参 因为跨过了ACC为0.94
下采样时,选择树深为4
### 未经过重采样的训练集
list1=[4,6,8,9,10]
pam='max_depth'
xgb_model(train_new,test_new,list1,pam,Threshold)
### 下采样的训练集
xgb_model(down_train_new,test_new,list1,pam,Threshold)
### 上采样的训练集
xgb_model(up_train_new,test_new,list1,pam,Threshold)
list=[0.2,0.4,0.5,0.6,0.8,1]
经过上个步骤,上下采样的AUC都有不错的表现,但是ACC不足
而未经重采样的数据集的AUC的峰值大约在0.74左右,需要舍弃这个数据集
下采样则是ACC达不到0.94
下面着重在树深为9,10 的基础上调整上采样的ACC
结果:上采样选择scale_pos_weight=1.0,树深为9,AUC:0.8485,ACC:0.9398
### 上采样
param['max_depth'] = 10
list2=[0.8,0.9,1.0,1.1,1.2]
pam='scale_pos_weight'
xgb_model(up_train_new,test_new,list2,pam,Threshold)
### 上采样
param['max_depth'] = 9
list2=[0.8,0.9,1.0,1.1,1.2]
pam='scale_pos_weight'
Threshold=0.5
xgb_model(up_train_new,test_new,list2,pam,Threshold)
Threshold 0.45-0.55,步进0.01
最终选择:Threshold= 0.51
此时:
AUC: 0.8466
ACC: 0.9409
for m in np.arange(0.45,0.55,0.01):
param['scale_pos_weight'] = 1.0
list3=[9]
pam='max_depth'
Threshold=m
print 'Threshold=',m
xgb_model(up_train_new,test_new,list3,pam,Threshold)
from xgboost import plot_importance
import matplotlib.pyplot as plt
from graphviz import Digraph
import pydot
param['scale_pos_weight'] = 1.0
list3=[9]
pam='max_depth'
Threshold=0.51
model=xgb_model(up_train_new,test_new,list3,pam,Threshold)
imp_feat=imp_fea.sort_values()[::-1]
feat_imp=imp_feat[:30]
feat_imp.plot(kind='bar')