The task is to predict happiness from 139 features, covering individual variables (gender, age, region, occupation, health, marital status, political affiliation, etc.), family variables (parents, spouse, children, family capital) and social attitudes (fairness, trust, public services).
Data description
Evaluation metric
The metric is the mean squared error (MSE):
$$Score = \frac{1}{n}\sum_{i=1}^{n}(y_i - y^*)^2$$
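As a reference, here is a minimal sketch of computing this score locally; the names `y_true` and `y_pred` are placeholders rather than variables from the notebook:

```python
import numpy as np

def mse_score(y_true, y_pred):
    """Mean squared error, matching the competition metric."""
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean((y_true - y_pred) ** 2)

# Example: mse_score([4, 5, 3], [4.2, 4.8, 3.5]) -> 0.11
```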
import pandas as pd
import numpy as np
train = pd.read_csv("train.csv", parse_dates=['survey_time'], encoding='latin-1')
# parse_dates: parse the survey_time column as dates
# latin-1 encoding is backward compatible with ASCII
test = pd.read_csv("test.csv", parse_dates=['survey_time'], encoding='latin-1')
# happiness contains out-of-range values coded -8, which need to be removed
# Question: how would you handle this if several different codes had to be removed? (see the sketch after this cell)
train = train[train["happiness"]!=-8].reset_index(drop=True)
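To answer the question above: when several invalid codes have to be dropped, `isin` handles them in one pass. This is a sketch of the alternative, not part of the pipeline; the list of codes below is purely illustrative:

```python
# Hypothetical: drop rows whose happiness matches any of several invalid codes
invalid_codes = [-1, -2, -3, -8]
train = train[~train["happiness"].isin(invalid_codes)].reset_index(drop=True)
```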
train_data_copy = train.copy()
target_col = "happiness" # target column
target = train_data_copy[target_col]
del train_data_copy[target_col] # drop the target column, keeping only the feature columns
data = pd.concat([train_data_copy, test], axis=0, ignore_index=True)
# concatenate the training and test sets
train.happiness.describe()
# summary statistics of the target
count 7988.000000
mean 3.867927
std 0.818717
min 1.000000
25% 4.000000
50% 4.000000
75% 4.000000
max 5.000000
Name: happiness, dtype: float64
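Beyond describe(), the distribution of the target labels is worth a quick look, since happiness is an ordinal label from 1 to 5 and the folds used later are stratified on it. This optional check is not part of the original pipeline:

```python
# Share of each happiness label in the training set
print(train["happiness"].value_counts(normalize=True).sort_index())
```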
# inspect detailed column information
pd.set_option("display.max_info_columns", 100) # raise to 200 to show info for every column
data.info()
RangeIndex: 10956 entries, 0 to 10955
Columns: 139 entries, id to public_service_9
dtypes: datetime64[ns](1), float64(26), int64(109), object(3)
memory usage: 11.6+ MB
After this filtering, all happiness values lie in the valid range 1-5.
First, handle the negative values scattered throughout the data. The only negative codes are -1, -2, -3 and -8, so each can be treated separately.
# negative codes are treated as problematic answers rather than deleted
def getres1(row):
    return len([x for x in row.values if type(x)==int and x<0])
def getres2(row):
    return len([x for x in row.values if type(x)==int and x==-1])
def getres3(row):
    return len([x for x in row.values if type(x)==int and x==-2])
def getres4(row):
    return len([x for x in row.values if type(x)==int and x==-3])
def getres5(row):
    return len([x for x in row.values if type(x)==int and x==-8])
# for each row, count how many values are negative, and how many equal -1, -2, -3 and -8
data['neg1'] = data[data.columns].apply(lambda row:getres1(row), axis=1)
data.loc[data['neg1']>20,'neg1'] = 20 # cap at 20 to smooth out extreme rows
data['neg2'] = data[data.columns].apply(lambda row:getres2(row), axis=1)
data['neg3'] = data[data.columns].apply(lambda row:getres3(row), axis=1)
data['neg4'] = data[data.columns].apply(lambda row:getres4(row), axis=1)
data['neg5'] = data[data.columns].apply(lambda row:getres5(row), axis=1)
Missing values are imputed with fillna(value), where value depends on the field: most fields are filled with 0, hukou_loc with 1, and family_income with its mean of about 66365.
family_income_mean = data['family_income'].mean()
family_income_mean
66365.63760839798
data['work_status'] = data['work_status'].fillna(0)
data['work_yr'] = data['work_yr'].fillna(0)
data['work_manage'] = data['work_manage'].fillna(0)
data['work_type'] = data['work_type'].fillna(0)
data['edu_yr'] = data['edu_yr'].fillna(0)
data['edu_status'] = data['edu_status'].fillna(0)
data['s_work_type'] = data['s_work_type'].fillna(0)
data['s_work_status'] = data['s_work_status'].fillna(0)
data['s_political'] = data['s_political'].fillna(0)
data['s_hukou'] = data['s_hukou'].fillna(0)
data['s_income'] = data['s_income'].fillna(0)
data['s_birth'] = data['s_birth'].fillna(0)
data['s_edu'] = data['s_edu'].fillna(0)
data['s_work_exper'] = data['s_work_exper'].fillna(0)
data['minor_child'] = data['minor_child'].fillna(0)
data['marital_now'] = data['marital_now'].fillna(0)
data['marital_1st'] = data['marital_1st'].fillna(0)
data['social_neighbor']=data['social_neighbor'].fillna(0)
data['social_friend']=data['social_friend'].fillna(0)
data['hukou_loc']=data['hukou_loc'].fillna(1) # the minimum valid code is 1 (household registration location)
data['family_income']=data['family_income'].fillna(66365)
data.info()
RangeIndex: 10956 entries, 0 to 10955
Columns: 144 entries, id to neg5
dtypes: datetime64[ns](1), float64(26), int64(114), object(3)
memory usage: 12.0+ MB
Fields with a special format are handled separately: first compute each respondent's age, then bin the continuous age into 6 intervals.
# 145. survey_time: extract the survey year
data['survey_time'] = pd.to_datetime(data['survey_time'], format='%Y-%m-%d', errors='coerce')
# errors='coerce' avoids failures when date formats differ
data['survey_time'] = data['survey_time'].dt.year # keep only the survey year
data['age'] = data['survey_time']-data['birth']
data[['age','survey_time','birth']]
| | age | survey_time | birth |
|---|---|---|---|
| 0 | 56 | 2015 | 1959 |
| 1 | 23 | 2015 | 1992 |
| 2 | 48 | 2015 | 1967 |
| 3 | 72 | 2015 | 1943 |
| 4 | 21 | 2015 | 1994 |
| ... | ... | ... | ... |
| 10951 | 69 | 2015 | 1946 |
| 10952 | 38 | 2015 | 1977 |
| 10953 | 47 | 2015 | 1968 |
| 10954 | 65 | 2015 | 1950 |
| 10955 | 74 | 2015 | 1941 |

10956 rows × 3 columns
# 146. bin age into intervals
bins = [0, 17, 26, 34, 50, 63, 100]
data['age_bin'] = pd.cut(data['age'], bins, labels=[0,1,2,3,4,5])
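An optional sanity check of the binning, counting respondents per interval:

```python
# Number of respondents in each of the 6 age intervals
print(data['age_bin'].value_counts().sort_index())
```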
Handling missing and invalid values in the remaining fields
# religion: negative codes are treated as "not religious"; negative religion_freq is set to 1 ("never participates in religious activities")
data.loc[data['religion']<0, 'religion'] = 1
data.loc[data['religion_freq']<0, 'religion_freq'] = 1
# education
data.loc[data['edu']<0, 'edu'] = 4 # junior high school
data.loc[data['edu_status']<0,'edu_status'] = 0
data.loc[data['edu_yr']<0,'edu_yr'] = 0
# personal income
data.loc[data['income']<0,'income'] = 0 # treat as no income
# political affiliation
data.loc[data['political']<0,'political'] = 1 # treat as an ordinary citizen (no party membership)
# weight (recorded in jin, i.e. half-kilograms)
data.loc[(data['weight_jin']<=80)&(data['height_cm']>=160),'weight_jin']= data['weight_jin']*2
data.loc[data['weight_jin']<=60,'weight_jin']= data['weight_jin']*2 # heuristic: no adult weighs 60 jin (30 kg), so such values are assumed to be half the true weight
# height
data.loc[data['height_cm']<150,'height_cm'] = 150 # a realistic lower bound for adults
# health
data.loc[data['health']<0,'health'] = 4 # treat as fairly healthy
data.loc[data['health_problem']<0,'health_problem'] = 4
# depression
data.loc[data['depression']<0,'depression'] = 4 # "rarely", which is typical
# media usage
data.loc[data['media_1']<0,'media_1'] = 1 # treat as "never"
data.loc[data['media_2']<0,'media_2'] = 1
data.loc[data['media_3']<0,'media_3'] = 1
data.loc[data['media_4']<0,'media_4'] = 1
data.loc[data['media_5']<0,'media_5'] = 1
data.loc[data['media_6']<0,'media_6'] = 1
# leisure activities
data.loc[data['leisure_1']<0,'leisure_1'] = 1 # fill values chosen by judgment
data.loc[data['leisure_2']<0,'leisure_2'] = 5
data.loc[data['leisure_3']<0,'leisure_3'] = 3
Use the mode (mode()) to correct abnormal values.
# mode() returns a Series, so take its first element before assigning
data.loc[data['leisure_4']<0,'leisure_4'] = data['leisure_4'].mode()[0]
data.loc[data['leisure_5']<0,'leisure_5'] = data['leisure_5'].mode()[0]
data.loc[data['leisure_6']<0,'leisure_6'] = data['leisure_6'].mode()[0]
data.loc[data['leisure_7']<0,'leisure_7'] = data['leisure_7'].mode()[0]
data.loc[data['leisure_8']<0,'leisure_8'] = data['leisure_8'].mode()[0]
data.loc[data['leisure_9']<0,'leisure_9'] = data['leisure_9'].mode()[0]
data.loc[data['leisure_10']<0,'leisure_10'] = data['leisure_10'].mode()[0]
data.loc[data['leisure_11']<0,'leisure_11'] = data['leisure_11'].mode()[0]
data.loc[data['leisure_12']<0,'leisure_12'] = data['leisure_12'].mode()[0]
data.loc[data['socialize']<0,'socialize'] = 2 # "rarely"
data.loc[data['relax']<0,'relax'] = 4 # "often"
data.loc[data['learn']<0,'learn'] = 1 # "never"
# socializing
data.loc[data['social_neighbor']<0,'social_neighbor'] = 0
data.loc[data['social_friend']<0,'social_friend'] = 0
data.loc[data['socia_outing']<0,'socia_outing'] = 1
data.loc[data['neighbor_familiarity']<0,'neighbor_familiarity'] = 4 # familiarity with neighbors
# perceived social fairness
data.loc[data['equity']<0,'equity'] = 4
# perceived social class
data.loc[data['class_10_before']<0,'class_10_before'] = 3
data.loc[data['class']<0,'class'] = 5
data.loc[data['class_10_after']<0,'class_10_after'] = 5
data.loc[data['class_14']<0,'class_14'] = 2
# work situation
data.loc[data['work_status']<0,'work_status'] = 0
data.loc[data['work_yr']<0,'work_yr'] = 0
data.loc[data['work_manage']<0,'work_manage'] = 0
data.loc[data['work_type']<0,'work_type'] = 0
# social insurance
data.loc[data['insur_1']<0,'insur_1'] = 1
data.loc[data['insur_2']<0,'insur_2'] = 1
data.loc[data['insur_3']<0,'insur_3'] = 1
data.loc[data['insur_4']<0,'insur_4'] = 1
data.loc[data['insur_1']==0,'insur_1'] = 0
data.loc[data['insur_2']==0,'insur_2'] = 0
data.loc[data['insur_3']==0,'insur_3'] = 0
data.loc[data['insur_4']==0,'insur_4'] = 0
Use the mean (mean()) to impute family income.
# family situation
data.loc[data['family_income']<0,'family_income'] = family_income_mean
data.loc[data['family_m']<0,'family_m'] = 2
data.loc[data['family_status']<0,'family_status'] = 3
data.loc[data['house']<0,'house'] = 1
data.loc[data['car']<0,'car'] = 0
data.loc[data['car']==2,'car'] = 0
data.loc[data['son']<0,'son'] = 1
data.loc[data['daughter']<0,'daughter'] = 0
data.loc[data['minor_child']<0,'minor_child'] = 0
# marriage
data.loc[data['marital_1st']<0,'marital_1st'] = 0
data.loc[data['marital_now']<0,'marital_now'] = 0
# spouse
data.loc[data['s_birth']<0,'s_birth'] = 0
data.loc[data['s_edu']<0,'s_edu'] = 0
data.loc[data['s_political']<0,'s_political'] = 0
data.loc[data['s_hukou']<0,'s_hukou'] = 0
data.loc[data['s_income']<0,'s_income'] = 0
data.loc[data['s_work_type']<0,'s_work_type'] = 0
data.loc[data['s_work_status']<0,'s_work_status'] = 0
data.loc[data['s_work_exper']<0,'s_work_exper'] = 0
# parents
data.loc[data['f_birth']<0,'f_birth'] = 1945
data.loc[data['f_edu']<0,'f_edu'] = 1
data.loc[data['f_political']<0,'f_political'] = 1
data.loc[data['f_work_14']<0,'f_work_14'] = 2
data.loc[data['m_birth']<0,'m_birth'] = 1940
data.loc[data['m_edu']<0,'m_edu'] = 1
data.loc[data['m_political']<0,'m_political'] = 1
data.loc[data['m_work_14']<0,'m_work_14'] = 2
# socio-economic status compared with peers
data.loc[data['status_peer']<0,'status_peer'] = 2
# socio-economic status compared with 3 years ago
data.loc[data['status_3_before']<0,'status_3_before'] = 2
# views
data.loc[data['view']<0,'view'] = 4
# expected annual income
data.loc[data['inc_ability']<=0,'inc_ability']= 2
inc_exp_mean = data['inc_exp'].mean()
data.loc[data['inc_exp']<=0,'inc_exp']= inc_exp_mean # fill with the mean
# for the remaining survey blocks, fill negative codes with the mode
for i in range(1,10):
    data.loc[data['public_service_'+str(i)]<0,'public_service_'+str(i)] = data['public_service_'+str(i)].dropna().mode()[0]
for i in range(1,14):
    data.loc[data['trust_'+str(i)]<0,'trust_'+str(i)] = data['trust_'+str(i)].dropna().mode()[0]
data.shape
(10956, 146)
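Before augmenting the features, an optional sanity check can list any numeric columns that still contain negative codes (the neg1-neg5 counters added earlier are non-negative by construction, so they will not show up):

```python
# Columns that still contain negative values after cleaning
num_cols = data.select_dtypes(include=[np.number]).columns
neg_left = (data[num_cols] < 0).sum()
print(neg_left[neg_left > 0])
```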
Next, analyse the relationships between features and augment the data with new ones, as listed below.
In addition, normalize within the same province, city and county: e.g. the mean income within a region, and each individual's indicators relative to others in the same province, city or county.
Comparisons with people of the same age are also added.
# add new features
# 147. age at first marriage
data['marital_1stbir'] = data['marital_1st'] - data['birth']
# 148. age at current marriage
data['marital_nowtbir'] = data['marital_now'] - data['birth']
# 149. whether remarried (gap between the two)
data['mar'] = data['marital_nowtbir'] - data['marital_1stbir']
# 150. spouse's age
data['marital_sbir'] = data['marital_now']-data['s_birth']
# 151. age difference with spouse
data['age_'] = data['marital_nowtbir'] - data['marital_sbir']
# 152-158. income ratios
data['income/s_income'] = data['income']/(data['s_income']+1)
data['income+s_income'] = data['income']+(data['s_income']+1)
data['income/family_income'] = data['income']/(data['family_income']+1)
data['all_income/family_income'] = (data['income']+data['s_income'])/(data['family_income']+1)
data['income/inc_exp'] = data['income']/(data['inc_exp']+1)
data['family_income/m'] = data['family_income']/(data['family_m']+0.01)
data['income/m'] = data['income']/(data['family_m']+0.01)
# 159-162. income / floor-area ratios
data['income/floor_area'] = data['income']/(data['floor_area']+0.01)
data['all_income/floor_area'] = (data['income']+data['s_income'])/(data['floor_area']+0.01)
data['family_income/floor_area'] = data['family_income']/(data['floor_area']+0.01)
data['floor_area/m'] = data['floor_area']/(data['family_m']+0.01)
# 163-165.class
data['class_10_diff'] = (data['class_10_after'] - data['class'])
data['class_diff'] = data['class'] - data['class_10_before']
data['class_14_diff'] = data['class'] - data['class_14']
# 166. leisure index
leisure_fea_lis = ['leisure_'+str(i) for i in range(1,13)]
data['leisure_sum'] = data[leisure_fea_lis].sum(axis=1)
# 167. public-service satisfaction index
public_service_fea_lis = ['public_service_'+str(i) for i in range(1,10)]
data['public_service_sum'] = data[public_service_fea_lis].sum(axis=1)
# 168. trust index
trust_fea_lis = ['trust_'+str(i) for i in range(1,14)]
data['trust_sum'] = data[trust_fea_lis].sum(axis=1) #skew
# 169-181.province mean
data['province_income_mean'] = data.groupby(['province'])['income'].transform('mean').values
data['province_family_income_mean'] = data.groupby(['province'])['family_income'].transform('mean').values
data['province_equity_mean'] = data.groupby(['province'])['equity'].transform('mean').values
data['province_depression_mean'] = data.groupby(['province'])['depression'].transform('mean').values
data['province_floor_area_mean'] = data.groupby(['province'])['floor_area'].transform('mean').values
data['province_health_mean'] = data.groupby(['province'])['health'].transform('mean').values
data['province_class_10_diff_mean'] = data.groupby(['province'])['class_10_diff'].transform('mean').values
data['province_class_mean'] = data.groupby(['province'])['class'].transform('mean').values
data['province_health_problem_mean'] = data.groupby(['province'])['health_problem'].transform('mean').values
data['province_family_status_mean'] = data.groupby(['province'])['family_status'].transform('mean').values
data['province_leisure_sum_mean'] = data.groupby(['province'])['leisure_sum'].transform('mean').values
data['province_public_service_sum_mean'] = data.groupby(['province'])['public_service_sum'].transform('mean').values
data['province_trust_sum_mean'] = data.groupby(['province'])['trust_sum'].transform('mean').values
# 182-194. city mean
data['city_income_mean'] = data.groupby(['city'])['income'].transform('mean').values
data['city_family_income_mean'] = data.groupby(['city'])['family_income'].transform('mean').values
data['city_equity_mean'] = data.groupby(['city'])['equity'].transform('mean').values
data['city_depression_mean'] = data.groupby(['city'])['depression'].transform('mean').values
data['city_floor_area_mean'] = data.groupby(['city'])['floor_area'].transform('mean').values
data['city_health_mean'] = data.groupby(['city'])['health'].transform('mean').values
data['city_class_10_diff_mean'] = data.groupby(['city'])['class_10_diff'].transform('mean').values
data['city_class_mean'] = data.groupby(['city'])['class'].transform('mean').values
data['city_health_problem_mean'] = data.groupby(['city'])['health_problem'].transform('mean').values
data['city_family_status_mean'] = data.groupby(['city'])['family_status'].transform('mean').values
data['city_leisure_sum_mean'] = data.groupby(['city'])['leisure_sum'].transform('mean').values
data['city_public_service_sum_mean'] = data.groupby(['city'])['public_service_sum'].transform('mean').values
data['city_trust_sum_mean'] = data.groupby(['city'])['trust_sum'].transform('mean').values
# 195-207. county mean
data['county_income_mean'] = data.groupby(['county'])['income'].transform('mean').values
data['county_family_income_mean'] = data.groupby(['county'])['family_income'].transform('mean').values
data['county_equity_mean'] = data.groupby(['county'])['equity'].transform('mean').values
data['county_depression_mean'] = data.groupby(['county'])['depression'].transform('mean').values
data['county_floor_area_mean'] = data.groupby(['county'])['floor_area'].transform('mean').values
data['county_health_mean'] = data.groupby(['county'])['health'].transform('mean').values
data['county_class_10_diff_mean'] = data.groupby(['county'])['class_10_diff'].transform('mean').values
data['county_class_mean'] = data.groupby(['county'])['class'].transform('mean').values
data['county_health_problem_mean'] = data.groupby(['county'])['health_problem'].transform('mean').values
data['county_family_status_mean'] = data.groupby(['county'])['family_status'].transform('mean').values
data['county_leisure_sum_mean'] = data.groupby(['county'])['leisure_sum'].transform('mean').values
data['county_public_service_sum_mean'] = data.groupby(['county'])['public_service_sum'].transform('mean').values
data['county_trust_sum_mean'] = data.groupby(['county'])['trust_sum'].transform('mean').values
# 208-220. ratios relative to the same province
data['income/province'] = data['income']/(data['province_income_mean'])
data['family_income/province'] = data['family_income']/(data['province_family_income_mean'])
data['equity/province'] = data['equity']/(data['province_equity_mean'])
data['depression/province'] = data['depression']/(data['province_depression_mean'])
data['floor_area/province'] = data['floor_area']/(data['province_floor_area_mean'])
data['health/province'] = data['health']/(data['province_health_mean'])
data['class_10_diff/province'] = data['class_10_diff']/(data['province_class_10_diff_mean'])
data['class/province'] = data['class']/(data['province_class_mean'])
data['health_problem/province'] = data['health_problem']/(data['province_health_problem_mean'])
data['family_status/province'] = data['family_status']/(data['province_family_status_mean'])
data['leisure_sum/province'] = data['leisure_sum']/(data['province_leisure_sum_mean'])
data['public_service_sum/province'] = data['public_service_sum']/(data['province_public_service_sum_mean'])
data['trust_sum/province'] = data['trust_sum']/(data['province_trust_sum_mean']+1)
# 221-233. ratios relative to the same city
data['income/city'] = data['income']/(data['city_income_mean'])
data['family_income/city'] = data['family_income']/(data['city_family_income_mean'])
data['equity/city'] = data['equity']/(data['city_equity_mean'])
data['depression/city'] = data['depression']/(data['city_depression_mean'])
data['floor_area/city'] = data['floor_area']/(data['city_floor_area_mean'])
data['health/city'] = data['health']/(data['city_health_mean'])
data['class_10_diff/city'] = data['class_10_diff']/(data['city_class_10_diff_mean'])
data['class/city'] = data['class']/(data['city_class_mean'])
data['health_problem/city'] = data['health_problem']/(data['city_health_problem_mean'])
data['family_status/city'] = data['family_status']/(data['city_family_status_mean'])
data['leisure_sum/city'] = data['leisure_sum']/(data['city_leisure_sum_mean'])
data['public_service_sum/city'] = data['public_service_sum']/(data['city_public_service_sum_mean'])
data['trust_sum/city'] = data['trust_sum']/(data['city_trust_sum_mean'])
# 234-246. ratios relative to the same county
data['income/county'] = data['income']/(data['county_income_mean'])
data['family_income/county'] = data['family_income']/(data['county_family_income_mean'])
data['equity/county'] = data['equity']/(data['county_equity_mean'])
data['depression/county'] = data['depression']/(data['county_depression_mean'])
data['floor_area/county'] = data['floor_area']/(data['county_floor_area_mean'])
data['health/county'] = data['health']/(data['county_health_mean'])
data['class_10_diff/county'] = data['class_10_diff']/(data['county_class_10_diff_mean'])
data['class/county'] = data['class']/(data['county_class_mean'])
data['health_problem/county'] = data['health_problem']/(data['county_health_problem_mean'])
data['family_status/county'] = data['family_status']/(data['county_family_status_mean'])
data['leisure_sum/county'] = data['leisure_sum']/(data['county_leisure_sum_mean'])
data['public_service_sum/county'] = data['public_service_sum']/(data['county_public_service_sum_mean'])
data['trust_sum/county'] = data['trust_sum']/(data['county_trust_sum_mean'])
# 247-259. age mean
data['age_income_mean'] = data.groupby(['age'])['income'].transform('mean').values
data['age_family_income_mean'] = data.groupby(['age'])['family_income'].transform('mean').values
data['age_equity_mean'] = data.groupby(['age'])['equity'].transform('mean').values
data['age_depression_mean'] = data.groupby(['age'])['depression'].transform('mean').values
data['age_floor_area_mean'] = data.groupby(['age'])['floor_area'].transform('mean').values
data['age_health_mean'] = data.groupby(['age'])['health'].transform('mean').values
data['age_class_10_diff_mean'] = data.groupby(['age'])['class_10_diff'].transform('mean').values
data['age_class_mean'] = data.groupby(['age'])['class'].transform('mean').values
data['age_health_problem_mean'] = data.groupby(['age'])['health_problem'].transform('mean').values
data['age_family_status_mean'] = data.groupby(['age'])['family_status'].transform('mean').values
data['age_leisure_sum_mean'] = data.groupby(['age'])['leisure_sum'].transform('mean').values
data['age_public_service_sum_mean'] = data.groupby(['age'])['public_service_sum'].transform('mean').values
data['age_trust_sum_mean'] = data.groupby(['age'])['trust_sum'].transform('mean').values
# 260-272. ratios relative to people of the same age
data['income/age'] = data['income']/(data['age_income_mean'])
data['family_income/age'] = data['family_income']/(data['age_family_income_mean'])
data['equity/age'] = data['equity']/(data['age_equity_mean'])
data['depression/age'] = data['depression']/(data['age_depression_mean'])
data['floor_area/age'] = data['floor_area']/(data['age_floor_area_mean'])
data['health/age'] = data['health']/(data['age_health_mean'])
data['class_10_diff/age'] = data['class_10_diff']/(data['age_class_10_diff_mean'])
data['class/age'] = data['class']/(data['age_class_mean'])
data['health_problem/age'] = data['health_problem']/(data['age_health_problem_mean'])
data['family_status/age'] = data['family_status']/(data['age_family_status_mean'])
data['leisure_sum/age'] = data['leisure_sum']/(data['age_leisure_sum_mean'])
data['public_service_sum/age'] = data['public_service_sum']/(data['age_public_service_sum_mean'])
data['trust_sum/age'] = data['trust_sum']/(data['age_trust_sum_mean'])
After augmentation, the feature set grows from 146 to 272 dimensions.
print('shape', data.shape)
data.head(1)
shape (10956, 272)
(Output of data.head(1): a single row spanning all 272 columns, from id, survey_type, province, ... through the engineered ratio features such as trust_sum/age; omitted here for readability.)
Delete features with very few valid samples, e.g. columns with too many negative codes or missing values, as well as original columns that have already been decomposed into new features.
# drop 9 columns, going from 272 to 263 features
del_list = ['id','survey_time','edu_other','invest_other','property_other','join_party','province','city','county']
use_feature = [col for col in data.columns if col not in del_list]
data.fillna(0, inplace=True) # fill any remaining NaNs with 0
train_shape = train.shape[0] # number of training samples
features = data[use_feature].columns # all remaining features after deletion
X_train_263 = data[:train_shape][use_feature].values
y_train = target
X_test_263 = data[train_shape:][use_feature].values
X_train_263.shape
(7988, 263)
Select the 49 most important features as a second feature set.
imp_fea_49 = ['equity','depression','health','class','family_status','health_problem','class_10_after',
'equity/province','equity/city','equity/county',
'depression/province','depression/city','depression/county',
'health/province','health/city','health/county',
'class/province','class/city','class/county',
'family_status/province','family_status/city','family_status/county',
'family_income/province','family_income/city','family_income/county',
'floor_area/province','floor_area/city','floor_area/county',
'leisure_sum/province','leisure_sum/city','leisure_sum/county',
'public_service_sum/province','public_service_sum/city','public_service_sum/county',
'trust_sum/province','trust_sum/city','trust_sum/county',
'income/m','public_service_sum','class_diff','status_3_before','age_income_mean','age_floor_area_mean',
'weight_jin','height_cm',
'health/age','depression/age','equity/age','leisure_sum/age'
]
train_shape = train.shape[0]
X_train_49 = data[:train_shape][imp_fea_49].values
X_test_49 = data[train_shape:][imp_fea_49].values
X_train_49.shape
(7988, 49)
Select the categorical variables that need one-hot encoding, encode them, and combine the result with the numeric features into a third feature set of 383 dimensions.
from sklearn import preprocessing
cat_fea = ['survey_type','gender','nationality','edu_status','political','hukou','hukou_loc','work_exper','work_status','work_type',
'work_manage','marital','s_political','s_hukou','s_work_exper','s_work_status','s_work_type','f_political','f_work_14',
'm_political','m_work_14']
noc_fea = [col for col in use_feature if col not in cat_fea] # non-categorical (numeric) features
onehot_data = data[cat_fea].values
enc = preprocessing.OneHotEncoder(categories = 'auto')
oh_data=enc.fit_transform(onehot_data).toarray()
oh_data.shape # shape after one-hot encoding
X_train_oh = oh_data[:train_shape,:]
X_test_oh = oh_data[train_shape:,:]
X_train_oh.shape # the training part
X_train_383 = np.column_stack([data[:train_shape][noc_fea].values,X_train_oh]) # numeric features first, then the one-hot encoded categorical ones
X_test_383 = np.column_stack([data[train_shape:][noc_fea].values,X_test_oh])
X_train_383.shape
(7988, 383)
The three feature-engineered training sets are now complete.
Train LightGBM on the original 263-dimensional features with 5-fold cross-validation.
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
lgb_263_param = {
    'num_leaves': 7, # number of leaves
    'min_data_in_leaf': 20, # minimum number of samples per leaf
    'objective': 'regression', # objective function
    'max_depth': -1,
    'learning_rate': 0.003,
    'boosting': 'gbdt', # use the GBDT algorithm
    'feature_fraction': 0.18, # randomly select 18% of the features at each iteration
    'bagging_freq': 1, # perform bagging every k iterations
    'bagging_fraction': 0.55, # fraction of data used in each bagging iteration
    'bagging_seed': 14,
    'metric': 'mse', # evaluation metric
    'lambda_l1': 0.1005,
    'lambda_l2': 0.1996,
    'verbosity': -1
}
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=4) # 5-fold stratified CV
oof_lgb_263 = np.zeros(len(X_train_263))
predictions_lgb_263 = np.zeros(len(X_test_263))
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_263, y_train)):
    print("fold n°{}".format(fold_+1))
    trn_data = lgb.Dataset(X_train_263[trn_idx], y_train[trn_idx])
    val_data = lgb.Dataset(X_train_263[val_idx], y_train[val_idx]) # train:val = 4:1
    num_round = 10000
    lgb_263 = lgb.train(lgb_263_param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=500, early_stopping_rounds = 800)
    oof_lgb_263[val_idx] = lgb_263.predict(X_train_263[val_idx], num_iteration=lgb_263.best_iteration)
    predictions_lgb_263 += lgb_263.predict(X_test_263, num_iteration=lgb_263.best_iteration) / folds.n_splits
print("CV score: {:<8.8f}".format(mean_squared_error(oof_lgb_263, target)))
fold n°1
Training until validation scores don't improve for 800 rounds
[500] training's l2: 0.500101 valid_1's l2: 0.533754
[1000] training's l2: 0.451915 valid_1's l2: 0.501188
[1500] training's l2: 0.425965 valid_1's l2: 0.488273
[2000] training's l2: 0.408014 valid_1's l2: 0.482152
[2500] training's l2: 0.393607 valid_1's l2: 0.478549
[3000] training's l2: 0.381367 valid_1's l2: 0.476341
[3500] training's l2: 0.370543 valid_1's l2: 0.475054
[4000] training's l2: 0.360716 valid_1's l2: 0.473788
[4500] training's l2: 0.351673 valid_1's l2: 0.472955
[5000] training's l2: 0.343329 valid_1's l2: 0.472481
[5500] training's l2: 0.335486 valid_1's l2: 0.472065
[6000] training's l2: 0.327869 valid_1's l2: 0.472095
[6500] training's l2: 0.320536 valid_1's l2: 0.471812
[7000] training's l2: 0.313706 valid_1's l2: 0.471653
[7500] training's l2: 0.307024 valid_1's l2: 0.471513
[8000] training's l2: 0.300623 valid_1's l2: 0.471502
[8500] training's l2: 0.294494 valid_1's l2: 0.471714
Early stopping, best iteration is:
[7884] training's l2: 0.302099 valid_1's l2: 0.471375
fold n°2
Training until validation scores don't improve for 800 rounds
[500] training's l2: 0.504846 valid_1's l2: 0.514311
[1000] training's l2: 0.455741 valid_1's l2: 0.479932
[1500] training's l2: 0.429699 valid_1's l2: 0.46678
[2000] training's l2: 0.411894 valid_1's l2: 0.459892
[2500] training's l2: 0.3978 valid_1's l2: 0.455978
[3000] training's l2: 0.385654 valid_1's l2: 0.453208
[3500] training's l2: 0.375002 valid_1's l2: 0.451663
[4000] training's l2: 0.365202 valid_1's l2: 0.450395
[4500] training's l2: 0.356064 valid_1's l2: 0.449422
[5000] training's l2: 0.347611 valid_1's l2: 0.448576
[5500] training's l2: 0.33964 valid_1's l2: 0.448066
[6000] training's l2: 0.331981 valid_1's l2: 0.447876
[6500] training's l2: 0.324735 valid_1's l2: 0.447835
[7000] training's l2: 0.317708 valid_1's l2: 0.447551
[7500] training's l2: 0.310942 valid_1's l2: 0.44739
[8000] training's l2: 0.304521 valid_1's l2: 0.447248
[8500] training's l2: 0.29834 valid_1's l2: 0.446924
[9000] training's l2: 0.292332 valid_1's l2: 0.446925
[9500] training's l2: 0.286584 valid_1's l2: 0.44676
[10000] training's l2: 0.280976 valid_1's l2: 0.44697
Did not meet early stopping. Best iteration is:
[10000] training's l2: 0.280976 valid_1's l2: 0.44697
fold n°3
Training until validation scores don't improve for 800 rounds
[500] training's l2: 0.503936 valid_1's l2: 0.518085
[1000] training's l2: 0.455875 valid_1's l2: 0.481564
[1500] training's l2: 0.430458 valid_1's l2: 0.465397
[2000] training's l2: 0.412991 valid_1's l2: 0.456098
[2500] training's l2: 0.398852 valid_1's l2: 0.450544
[3000] training's l2: 0.386818 valid_1's l2: 0.447112
[3500] training's l2: 0.375987 valid_1's l2: 0.445001
[4000] training's l2: 0.36615 valid_1's l2: 0.443158
[4500] training's l2: 0.357182 valid_1's l2: 0.442228
[5000] training's l2: 0.348605 valid_1's l2: 0.44152
[5500] training's l2: 0.340452 valid_1's l2: 0.440958
[6000] training's l2: 0.332771 valid_1's l2: 0.440471
[6500] training's l2: 0.325337 valid_1's l2: 0.440145
[7000] training's l2: 0.318312 valid_1's l2: 0.439753
[7500] training's l2: 0.311662 valid_1's l2: 0.439775
[8000] training's l2: 0.305313 valid_1's l2: 0.439678
[8500] training's l2: 0.299159 valid_1's l2: 0.439626
[9000] training's l2: 0.293289 valid_1's l2: 0.439485
[9500] training's l2: 0.287524 valid_1's l2: 0.439431
[10000] training's l2: 0.281877 valid_1's l2: 0.439972
Did not meet early stopping. Best iteration is:
[10000] training's l2: 0.281877 valid_1's l2: 0.439972
fold n°4
Training until validation scores don't improve for 800 rounds
[500] training's l2: 0.505012 valid_1's l2: 0.51259
[1000] training's l2: 0.456326 valid_1's l2: 0.47774
[1500] training's l2: 0.429825 valid_1's l2: 0.465935
[2000] training's l2: 0.411736 valid_1's l2: 0.460181
[2500] training's l2: 0.397521 valid_1's l2: 0.45707
[3000] training's l2: 0.385412 valid_1's l2: 0.455253
[3500] training's l2: 0.374708 valid_1's l2: 0.454153
[4000] training's l2: 0.364887 valid_1's l2: 0.453365
[4500] training's l2: 0.355802 valid_1's l2: 0.45265
[5000] training's l2: 0.347222 valid_1's l2: 0.452064
[5500] training's l2: 0.33926 valid_1's l2: 0.451795
[6000] training's l2: 0.331692 valid_1's l2: 0.451627
[6500] training's l2: 0.324336 valid_1's l2: 0.451448
[7000] training's l2: 0.317485 valid_1's l2: 0.451158
[7500] training's l2: 0.310856 valid_1's l2: 0.451105
[8000] training's l2: 0.304333 valid_1's l2: 0.451129
[8500] training's l2: 0.29824 valid_1's l2: 0.451094
[9000] training's l2: 0.292221 valid_1's l2: 0.451218
Early stopping, best iteration is:
[8384] training's l2: 0.299655 valid_1's l2: 0.450968
fold n°5
Training until validation scores don't improve for 800 rounds
[500] training's l2: 0.503608 valid_1's l2: 0.520349
[1000] training's l2: 0.455513 valid_1's l2: 0.485606
[1500] training's l2: 0.429792 valid_1's l2: 0.47215
[2000] training's l2: 0.411754 valid_1's l2: 0.465636
[2500] training's l2: 0.397228 valid_1's l2: 0.461877
[3000] training's l2: 0.38493 valid_1's l2: 0.459667
[3500] training's l2: 0.373942 valid_1's l2: 0.458256
[4000] training's l2: 0.364025 valid_1's l2: 0.457571
[4500] training's l2: 0.354782 valid_1's l2: 0.457254
[5000] training's l2: 0.346061 valid_1's l2: 0.457078
[5500] training's l2: 0.337927 valid_1's l2: 0.45671
[6000] training's l2: 0.330158 valid_1's l2: 0.456812
[6500] training's l2: 0.322753 valid_1's l2: 0.456942
Early stopping, best iteration is:
[5771] training's l2: 0.333658 valid_1's l2: 0.456643
CV score: 0.45318547
Use the trained LightGBM model to assess and visualize feature importance.
import matplotlib.pyplot as plt
import seaborn as sns
# feature importance
pd.set_option('display.max_columns', None)
# set the maximum displayed column width to 100 (default 50)
pd.set_option('max_colwidth', 100)
df = pd.DataFrame(data[use_feature].columns.tolist(), columns=['feature'])
df['importance']=list(lgb_263.feature_importance())
df = df.sort_values(by='importance',ascending=False)
plt.figure(figsize=(14,28))
sns.barplot(x="importance", y="feature", data=df.head(50))
plt.title('Features importance (averaged/folds)')
plt.tight_layout()
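Note that `lgb_263` at this point is only the booster from the last fold, while the plot title speaks of an average over folds. If each fold's booster were kept in a list during the CV loop (a hypothetical `lgb_models`, not present in the code above), the importances could be averaged like this:

```python
def averaged_importance(models, feature_names):
    # average split-based feature importance over the fold models
    imp = np.mean([m.feature_importance() for m in models], axis=0)
    return (pd.DataFrame({'feature': feature_names, 'importance': imp})
            .sort_values('importance', ascending=False))

# usage, assuming each fold appended its booster via lgb_models.append(lgb_263):
# df_avg = averaged_importance(lgb_models, data[use_feature].columns.tolist())
```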
from sklearn.model_selection import KFold
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore') # suppress warnings
xgb_263_params = {
    'eta': 0.02, # learning rate
    'max_depth': 6,
    'min_child_weight': 3, # minimum sum of instance weights in a leaf
    'gamma': 0, # minimum loss reduction required to split a node
    'subsample': 0.7, # row subsampling ratio per tree
    'colsample_bytree': 0.3, # column (feature) subsampling ratio per tree
    'lambda': 2,
    'objective': 'reg:squarederror', # reg:linear is the older alias
    'eval_metric': 'rmse',
    'nthread': -1
}
folds = KFold(n_splits=5, shuffle=True, random_state=2019)
oof_xgb_263 = np.zeros(len(X_train_263))
predictions_xgb_263 = np.zeros(len(X_test_263))
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_263, y_train)):
    print("fold n°{}".format(fold_+1))
    trn_data = xgb.DMatrix(X_train_263[trn_idx], y_train[trn_idx])
    val_data = xgb.DMatrix(X_train_263[val_idx], y_train[val_idx])
    watchlist = [(trn_data, 'train'), (val_data, 'valis_data')]
    xgb_263 = xgb.train(dtrain=trn_data, num_boost_round=3000, evals=watchlist, early_stopping_rounds=600, verbose_eval=500, params=xgb_263_params)
    oof_xgb_263[val_idx] = xgb_263.predict(xgb.DMatrix(X_train_263[val_idx]), ntree_limit=xgb_263.best_ntree_limit)
    # ntree_limit is deprecated; use `iteration_range` or model slicing instead
    predictions_xgb_263 += xgb_263.predict(xgb.DMatrix(X_test_263), ntree_limit=xgb_263.best_ntree_limit)/folds.n_splits
print("CV score:{:<8.8f}".format(mean_squared_error(oof_xgb_263,target)))
fold n°1
[0] train-rmse:3.40425 valis_data-rmse:3.38315
[500] train-rmse:0.40176 valis_data-rmse:0.70383
[1000] train-rmse:0.26477 valis_data-rmse:0.70696
[1117] train-rmse:0.24002 valis_data-rmse:0.70781
fold n°2
[0] train-rmse:3.39812 valis_data-rmse:3.40796
[500] train-rmse:0.40489 valis_data-rmse:0.69430
[1000] train-rmse:0.27176 valis_data-rmse:0.69486
[1230] train-rmse:0.22589 valis_data-rmse:0.69626
fold n°3
[0] train-rmse:3.40186 valis_data-rmse:3.39314
[500] train-rmse:0.40740 valis_data-rmse:0.66197
[1000] train-rmse:0.27269 valis_data-rmse:0.66321
[1083] train-rmse:0.25490 valis_data-rmse:0.66377
fold n°4
[0] train-rmse:3.40238 valis_data-rmse:3.39016
[500] train-rmse:0.41076 valis_data-rmse:0.66292
[1000] train-rmse:0.27113 valis_data-rmse:0.66357
[1192] train-rmse:0.23261 valis_data-rmse:0.66431
fold n°5
[0] train-rmse:3.39341 valis_data-rmse:3.42638
[500] train-rmse:0.41569 valis_data-rmse:0.65207
[1000] train-rmse:0.27688 valis_data-rmse:0.65138
[1493] train-rmse:0.18607 valis_data-rmse:0.65397
CV score:0.45517551
from sklearn.ensemble import RandomForestRegressor as rfr
folds = KFold(n_splits=5, shuffle=True, random_state=2019)
oof_rfr_263 = np.zeros(len(X_train_263))
predictions_rfr_263 = np.zeros(len(X_test_263))
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_263, y_train)):
    print("fold n°{}".format(fold_+1))
    tr_x = X_train_263[trn_idx]
    tr_y = y_train[trn_idx]
    rfr_263 = rfr(n_estimators=1600, max_depth=9, min_samples_leaf=9, min_weight_fraction_leaf=0.0,
                  max_features=0.25, verbose=1, n_jobs=-1)
    rfr_263.fit(tr_x, tr_y)
    oof_rfr_263[val_idx] = rfr_263.predict(X_train_263[val_idx])
    predictions_rfr_263 += rfr_263.predict(X_test_263) / folds.n_splits
print("CV score: {:<8.8f}".format(mean_squared_error(oof_rfr_263, target)))
fold n°1
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done 18 tasks | elapsed: 0.2s
[Parallel(n_jobs=-1)]: Done 168 tasks | elapsed: 1.7s
[Parallel(n_jobs=-1)]: Done 418 tasks | elapsed: 4.0s
[Parallel(n_jobs=-1)]: Done 768 tasks | elapsed: 7.3s
[Parallel(n_jobs=-1)]: Done 1218 tasks | elapsed: 11.5s
[Parallel(n_jobs=-1)]: Done 1600 out of 1600 | elapsed: 15.3s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done 18 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 168 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 418 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 768 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 1218 tasks | elapsed: 0.1s
[Parallel(n_jobs=16)]: Done 1600 out of 1600 | elapsed: 0.1s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done 18 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 168 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 418 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 768 tasks | elapsed: 0.1s
[Parallel(n_jobs=16)]: Done 1218 tasks | elapsed: 0.1s
[Parallel(n_jobs=16)]: Done 1600 out of 1600 | elapsed: 0.2s finished
fold n°2
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done 18 tasks | elapsed: 0.2s
[Parallel(n_jobs=-1)]: Done 168 tasks | elapsed: 1.7s
[Parallel(n_jobs=-1)]: Done 418 tasks | elapsed: 4.0s
[Parallel(n_jobs=-1)]: Done 768 tasks | elapsed: 7.4s
[Parallel(n_jobs=-1)]: Done 1218 tasks | elapsed: 11.5s
[Parallel(n_jobs=-1)]: Done 1600 out of 1600 | elapsed: 15.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done 18 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 168 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 418 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 768 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 1218 tasks | elapsed: 0.1s
[Parallel(n_jobs=16)]: Done 1600 out of 1600 | elapsed: 0.1s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done 18 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 168 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 418 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 768 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 1218 tasks | elapsed: 0.1s
[Parallel(n_jobs=16)]: Done 1600 out of 1600 | elapsed: 0.2s finished
fold n°3
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done 18 tasks | elapsed: 0.2s
[Parallel(n_jobs=-1)]: Done 168 tasks | elapsed: 1.6s
[Parallel(n_jobs=-1)]: Done 418 tasks | elapsed: 4.0s
[Parallel(n_jobs=-1)]: Done 768 tasks | elapsed: 7.7s
[Parallel(n_jobs=-1)]: Done 1218 tasks | elapsed: 12.1s
[Parallel(n_jobs=-1)]: Done 1600 out of 1600 | elapsed: 15.8s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done 18 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 168 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 418 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 768 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 1218 tasks | elapsed: 0.1s
[Parallel(n_jobs=16)]: Done 1600 out of 1600 | elapsed: 0.2s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done 18 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 168 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 418 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 768 tasks | elapsed: 0.1s
[Parallel(n_jobs=16)]: Done 1218 tasks | elapsed: 0.2s
[Parallel(n_jobs=16)]: Done 1600 out of 1600 | elapsed: 0.2s finished
fold n°4
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done 18 tasks | elapsed: 0.2s
[Parallel(n_jobs=-1)]: Done 168 tasks | elapsed: 1.6s
[Parallel(n_jobs=-1)]: Done 418 tasks | elapsed: 4.2s
[Parallel(n_jobs=-1)]: Done 768 tasks | elapsed: 7.9s
[Parallel(n_jobs=-1)]: Done 1218 tasks | elapsed: 12.2s
[Parallel(n_jobs=-1)]: Done 1600 out of 1600 | elapsed: 15.7s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done 18 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 168 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 418 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 768 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 1218 tasks | elapsed: 0.1s
[Parallel(n_jobs=16)]: Done 1600 out of 1600 | elapsed: 0.1s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done 18 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 168 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 418 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 768 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 1218 tasks | elapsed: 0.1s
[Parallel(n_jobs=16)]: Done 1600 out of 1600 | elapsed: 0.2s finished
fold n°5
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done 18 tasks | elapsed: 0.2s
[Parallel(n_jobs=-1)]: Done 168 tasks | elapsed: 1.6s
[Parallel(n_jobs=-1)]: Done 418 tasks | elapsed: 3.9s
[Parallel(n_jobs=-1)]: Done 768 tasks | elapsed: 6.2s
[Parallel(n_jobs=-1)]: Done 1218 tasks | elapsed: 9.6s
[Parallel(n_jobs=-1)]: Done 1600 out of 1600 | elapsed: 12.5s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done 18 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 168 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 418 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 768 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 1218 tasks | elapsed: 0.1s
[Parallel(n_jobs=16)]: Done 1600 out of 1600 | elapsed: 0.1s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done 18 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 168 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 418 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 768 tasks | elapsed: 0.0s
CV score: 0.47894802
[Parallel(n_jobs=16)]: Done 1218 tasks | elapsed: 0.1s
[Parallel(n_jobs=16)]: Done 1600 out of 1600 | elapsed: 0.2s finished
from sklearn.ensemble import GradientBoostingRegressor as gbr
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=2018)
oof_gbr_263 = np.zeros(train_shape)
predictions_gbr_263 = np.zeros(len(X_test_263))
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_263, y_train)):
    print("fold n°{}".format(fold_+1))
    tr_x = X_train_263[trn_idx]
    tr_y = y_train[trn_idx]
    gbr_263 = gbr(n_estimators=400, learning_rate=0.01, subsample=0.65, max_depth=7, min_samples_leaf=20,
                  max_features=0.22, verbose=1)
    gbr_263.fit(tr_x, tr_y)
    oof_gbr_263[val_idx] = gbr_263.predict(X_train_263[val_idx])
    predictions_gbr_263 += gbr_263.predict(X_test_263) / folds.n_splits
print("CV score: {:<8.8f}".format(mean_squared_error(oof_gbr_263, target)))
fold n°1
Iter Train Loss OOB Improve Remaining Time
1 0.6611 0.0034 19.96s
2 0.6680 0.0031 20.50s
3 0.6376 0.0034 20.25s
4 0.6626 0.0030 19.80s
5 0.6674 0.0028 19.88s
6 0.6162 0.0033 19.74s
7 0.6364 0.0031 19.53s
8 0.6457 0.0025 19.30s
9 0.6501 0.0027 19.28s
10 0.6282 0.0027 19.22s
20 0.5878 0.0023 18.26s
30 0.5426 0.0022 17.65s
40 0.5348 0.0018 17.08s
50 0.5074 0.0016 16.53s
60 0.4956 0.0011 16.23s
70 0.4719 0.0009 15.97s
80 0.4578 0.0008 15.51s
90 0.4454 0.0006 14.97s
100 0.4303 0.0006 14.46s
200 0.3391 0.0001 9.46s
300 0.3095 0.0000 4.71s
400 0.2625 -0.0000 0.00s
fold n°2
Iter Train Loss OOB Improve Remaining Time
1 0.6579 0.0033 19.15s
2 0.6754 0.0030 18.71s
3 0.6582 0.0031 18.93s
4 0.6541 0.0029 18.98s
5 0.6611 0.0032 18.86s
6 0.6421 0.0031 18.63s
7 0.6277 0.0026 18.57s
8 0.6245 0.0027 18.56s
9 0.6464 0.0028 18.50s
10 0.6195 0.0028 18.44s
20 0.6027 0.0022 18.09s
30 0.5615 0.0022 17.53s
40 0.5342 0.0017 17.02s
50 0.5192 0.0012 16.51s
60 0.5033 0.0010 16.01s
70 0.4854 0.0009 15.62s
80 0.4503 0.0009 15.11s
90 0.4494 0.0006 14.62s
100 0.4335 0.0006 14.13s
200 0.3382 0.0001 9.35s
300 0.3040 -0.0000 4.66s
400 0.2677 -0.0001 0.00s
fold n°3
Iter Train Loss OOB Improve Remaining Time
1 0.6595 0.0038 19.16s
2 0.6670 0.0029 19.31s
3 0.6599 0.0031 19.59s
4 0.6534 0.0034 19.80s
5 0.6380 0.0029 19.52s
6 0.6398 0.0029 19.41s
7 0.6399 0.0031 19.29s
8 0.6186 0.0031 19.63s
9 0.6338 0.0029 19.62s
10 0.6261 0.0029 19.49s
20 0.5893 0.0023 18.38s
30 0.5611 0.0022 17.76s
40 0.5284 0.0019 17.18s
50 0.5106 0.0014 16.68s
60 0.5028 0.0010 16.13s
70 0.4722 0.0011 15.62s
80 0.4717 0.0009 15.13s
90 0.4482 0.0007 14.69s
100 0.4147 0.0006 14.22s
200 0.3505 0.0001 9.39s
300 0.2985 -0.0000 4.70s
400 0.2542 -0.0000 0.00s
fold n°4
Iter Train Loss OOB Improve Remaining Time
1 0.6721 0.0033 19.20s
2 0.6540 0.0035 18.93s
3 0.6589 0.0033 19.07s
4 0.6581 0.0030 19.22s
5 0.6434 0.0032 18.97s
6 0.6484 0.0031 18.79s
7 0.6385 0.0031 18.71s
8 0.6413 0.0028 18.76s
9 0.6388 0.0031 18.67s
10 0.6320 0.0029 18.60s
20 0.6068 0.0022 17.93s
30 0.5558 0.0022 17.48s
40 0.5443 0.0015 16.95s
50 0.5190 0.0014 16.43s
60 0.4727 0.0013 15.97s
70 0.4716 0.0010 15.59s
80 0.4419 0.0007 15.08s
90 0.4485 0.0006 14.64s
100 0.4257 0.0005 14.16s
200 0.3421 0.0001 9.34s
300 0.2924 0.0000 4.67s
400 0.2596 -0.0000 0.00s
fold n°5
Iter Train Loss OOB Improve Remaining Time
1 0.6612 0.0033 18.76s
2 0.6440 0.0037 18.72s
3 0.6575 0.0033 19.33s
4 0.6715 0.0030 19.31s
5 0.6417 0.0033 19.28s
6 0.6246 0.0031 18.98s
7 0.6422 0.0030 18.87s
8 0.6439 0.0028 18.77s
9 0.6359 0.0028 18.77s
10 0.6324 0.0028 18.73s
20 0.5933 0.0024 17.98s
30 0.5764 0.0019 17.49s
40 0.5285 0.0017 17.20s
50 0.4985 0.0014 16.66s
60 0.4958 0.0010 16.15s
70 0.4831 0.0009 15.70s
80 0.4542 0.0009 15.19s
90 0.4443 0.0005 14.69s
100 0.4217 0.0006 14.17s
200 0.3333 0.0001 9.38s
300 0.2974 0.0001 4.68s
400 0.2667 -0.0000 0.00s
CV score: 0.45721342
from sklearn.ensemble import ExtraTreesRegressor as etr
folds = KFold(n_splits=5, shuffle=True, random_state=13)
oof_etr_263 = np.zeros(train_shape)
predictions_etr_263 = np.zeros(len(X_test_263))
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_263, y_train)):
    print("fold n°{}".format(fold_+1))
    tr_x = X_train_263[trn_idx]
    tr_y = y_train[trn_idx]
    etr_263 = etr(n_estimators=1000, max_depth=8, min_samples_leaf=12, min_weight_fraction_leaf=0.0,
                  max_features=0.4, verbose=1, n_jobs=-1)
    etr_263.fit(tr_x, tr_y)
    oof_etr_263[val_idx] = etr_263.predict(X_train_263[val_idx])
    predictions_etr_263 += etr_263.predict(X_test_263) / folds.n_splits
print("CV score: {:<8.8f}".format(mean_squared_error(oof_etr_263, target)))
fold n°1
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done 18 tasks | elapsed: 0.1s
[Parallel(n_jobs=-1)]: Done 168 tasks | elapsed: 1.2s
[Parallel(n_jobs=-1)]: Done 418 tasks | elapsed: 3.1s
[Parallel(n_jobs=-1)]: Done 768 tasks | elapsed: 5.8s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 7.5s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done 18 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 168 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 418 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 768 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 1000 out of 1000 | elapsed: 0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done 18 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 168 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 418 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 768 tasks | elapsed: 0.1s
[Parallel(n_jobs=16)]: Done 1000 out of 1000 | elapsed: 0.1s finished
fold n°2
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done 18 tasks | elapsed: 0.1s
[Parallel(n_jobs=-1)]: Done 168 tasks | elapsed: 1.2s
[Parallel(n_jobs=-1)]: Done 418 tasks | elapsed: 3.2s
[Parallel(n_jobs=-1)]: Done 768 tasks | elapsed: 5.8s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 7.5s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done 18 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 168 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 418 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 768 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 1000 out of 1000 | elapsed: 0.1s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done 18 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 168 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 418 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 768 tasks | elapsed: 0.1s
[Parallel(n_jobs=16)]: Done 1000 out of 1000 | elapsed: 0.1s finished
fold n°3
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done 18 tasks | elapsed: 0.1s
[Parallel(n_jobs=-1)]: Done 168 tasks | elapsed: 1.2s
[Parallel(n_jobs=-1)]: Done 418 tasks | elapsed: 3.4s
[Parallel(n_jobs=-1)]: Done 768 tasks | elapsed: 6.1s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 7.9s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done 18 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 168 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 418 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 768 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 1000 out of 1000 | elapsed: 0.1s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done 18 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 168 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 418 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 768 tasks | elapsed: 0.1s
[Parallel(n_jobs=16)]: Done 1000 out of 1000 | elapsed: 0.1s finished
fold n°4
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done 18 tasks | elapsed: 0.1s
[Parallel(n_jobs=-1)]: Done 168 tasks | elapsed: 1.2s
[Parallel(n_jobs=-1)]: Done 418 tasks | elapsed: 3.1s
[Parallel(n_jobs=-1)]: Done 768 tasks | elapsed: 5.9s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 7.6s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done 18 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 168 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 418 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 768 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 1000 out of 1000 | elapsed: 0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done 18 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 168 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 418 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 768 tasks | elapsed: 0.1s
[Parallel(n_jobs=16)]: Done 1000 out of 1000 | elapsed: 0.1s finished
fold n°5
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done 18 tasks | elapsed: 0.1s
[Parallel(n_jobs=-1)]: Done 168 tasks | elapsed: 1.3s
[Parallel(n_jobs=-1)]: Done 418 tasks | elapsed: 3.2s
[Parallel(n_jobs=-1)]: Done 768 tasks | elapsed: 5.8s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 7.5s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done 18 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 168 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 418 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 768 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 1000 out of 1000 | elapsed: 0.1s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done 18 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 168 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 418 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 768 tasks | elapsed: 0.1s
[Parallel(n_jobs=16)]: Done 1000 out of 1000 | elapsed: 0.1s finished
CV score: 0.48568712
Above, we obtained the predictions, model architectures and parameters of the five models trained on the 263 features. We now stack their out-of-fold predictions with Kernel Ridge Regression, using 5-fold cross-validation repeated twice, to get the stacked result for this feature set.
from sklearn.model_selection import RepeatedKFold
from sklearn.kernel_ridge import KernelRidge as kr
train_stack2 = np.vstack([oof_lgb_263, oof_xgb_263, oof_gbr_263, oof_rfr_263, oof_etr_263]).transpose()
# transpose() turns the stacked (5, n_samples) array into (n_samples, 5): one column of OOF predictions per base model
test_stack2 = np.vstack([predictions_lgb_263, predictions_xgb_263, predictions_gbr_263, predictions_rfr_263, predictions_etr_263]).transpose()
# Cross-validation for the stacking layer: 5 folds, repeated twice (10 fits in total)
folds_stack = RepeatedKFold(n_splits=5, n_repeats=2, random_state=7)
oof_stack2 = np.zeros(train_stack2.shape[0])
predictions_lr2 = np.zeros(test_stack2.shape[0])
for fold_, (trn_idx, val_idx) in enumerate(folds_stack.split(train_stack2,target)):
print("fold {}".format(fold_))
trn_data, trn_y = train_stack2[trn_idx], target.iloc[trn_idx].values
val_data, val_y = train_stack2[val_idx], target.iloc[val_idx].values
#Kernel Ridge Regression
lr2 = kr()
lr2.fit(trn_data, trn_y)
oof_stack2[val_idx] = lr2.predict(val_data)
predictions_lr2 += lr2.predict(test_stack2) / 10
mean_squared_error(target.values, oof_stack2)
fold 0
fold 1
fold 2
fold 3
fold 4
fold 5
fold 6
fold 7
fold 8
fold 9
0.4490918192270783
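A note on the division by 10 in the loop above: RepeatedKFold with 5 splits and 2 repeats produces 10 fits in total, so each fit contributes one tenth of the final test prediction. A small sketch (not in the original code) of how the accumulation line inside the loop could avoid the hard-coded constant by asking the splitter itself:
n_fits = folds_stack.get_n_splits()  # 5 splits x 2 repeats = 10 fits for RepeatedKFold
predictions_lr2 += lr2.predict(test_stack2) / n_fits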
lgb_49_param = {
    'num_leaves': 7,            # number of leaves per tree
    'min_data_in_leaf': 20,     # minimum number of records a leaf may hold
    'objective': 'regression',  # objective function
    'max_depth': -1,
    'learning_rate': 0.003,
    'boosting': 'gbdt',         # use the GBDT algorithm
    'feature_fraction': 0.18,   # randomly select 18% of the features for each tree
    'bagging_freq': 1,          # perform bagging every k iterations
    'bagging_fraction': 0.55,   # fraction of the data used in each iteration
    'bagging_seed': 14,
    'metric': 'mse',            # evaluation metric
    'lambda_l1': 0.1005,
    'lambda_l2': 0.1996,
    'verbosity': -1
}
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)
oof_lgb_49 = np.zeros(len(X_train_49))
predictions_lgb_49 = np.zeros(len(X_test_49))
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_49, y_train)):
print("fold n°{}".format(fold_+1))
trn_data = lgb.Dataset(X_train_49[trn_idx], y_train[trn_idx])
val_data = lgb.Dataset(X_train_49[val_idx], y_train[val_idx])#train:val=4:1
num_round = 10000
lgb_49 = lgb.train(lgb_49_param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=500, early_stopping_rounds = 800)
oof_lgb_49[val_idx] = lgb_49.predict(X_train_49[val_idx], num_iteration=lgb_49.best_iteration)
predictions_lgb_49 += lgb_49.predict(X_test_49, num_iteration=lgb_49.best_iteration) / folds.n_splits
print("CV score: {:<8.8f}".format(mean_squared_error(oof_lgb_49, target)))
fold n°1
Training until validation scores don't improve for 800 rounds
[500] training's l2: 0.502295 valid_1's l2: 0.533139
[1000] training's l2: 0.460948 valid_1's l2: 0.505619
[1500] training's l2: 0.441858 valid_1's l2: 0.497812
[2000] training's l2: 0.429016 valid_1's l2: 0.494794
[2500] training's l2: 0.418466 valid_1's l2: 0.492613
[3000] training's l2: 0.40904 valid_1's l2: 0.492049
[3500] training's l2: 0.400556 valid_1's l2: 0.491392
[4000] training's l2: 0.392757 valid_1's l2: 0.49076
[4500] training's l2: 0.385307 valid_1's l2: 0.490212
[5000] training's l2: 0.378388 valid_1's l2: 0.489739
[5500] training's l2: 0.371885 valid_1's l2: 0.489324
[6000] training's l2: 0.36558 valid_1's l2: 0.489706
Early stopping, best iteration is:
[5412] training's l2: 0.373031 valid_1's l2: 0.489245
fold n°2
Training until validation scores don't improve for 800 rounds
[500] training's l2: 0.506969 valid_1's l2: 0.514896
[1000] training's l2: 0.464639 valid_1's l2: 0.484927
[1500] training's l2: 0.445377 valid_1's l2: 0.476706
[2000] training's l2: 0.432164 valid_1's l2: 0.473702
[2500] training's l2: 0.421427 valid_1's l2: 0.472132
[3000] training's l2: 0.411905 valid_1's l2: 0.470944
[3500] training's l2: 0.40327 valid_1's l2: 0.470226
[4000] training's l2: 0.395332 valid_1's l2: 0.469434
[4500] training's l2: 0.38797 valid_1's l2: 0.469221
[5000] training's l2: 0.381019 valid_1's l2: 0.469257
Early stopping, best iteration is:
[4426] training's l2: 0.389028 valid_1's l2: 0.469127
fold n°3
Training until validation scores don't improve for 800 rounds
[500] training's l2: 0.506714 valid_1's l2: 0.51902
[1000] training's l2: 0.465845 valid_1's l2: 0.486289
[1500] training's l2: 0.447115 valid_1's l2: 0.475431
[2000] training's l2: 0.434599 valid_1's l2: 0.470281
[2500] training's l2: 0.424123 valid_1's l2: 0.467064
[3000] training's l2: 0.414723 valid_1's l2: 0.465137
[3500] training's l2: 0.406371 valid_1's l2: 0.464074
[4000] training's l2: 0.398581 valid_1's l2: 0.463142
[4500] training's l2: 0.391413 valid_1's l2: 0.462511
[5000] training's l2: 0.384559 valid_1's l2: 0.462095
[5500] training's l2: 0.377974 valid_1's l2: 0.461616
[6000] training's l2: 0.371705 valid_1's l2: 0.461122
[6500] training's l2: 0.365677 valid_1's l2: 0.460701
[7000] training's l2: 0.359938 valid_1's l2: 0.46033
[7500] training's l2: 0.354438 valid_1's l2: 0.4601
[8000] training's l2: 0.349115 valid_1's l2: 0.45991
[8500] training's l2: 0.343907 valid_1's l2: 0.459985
[9000] training's l2: 0.338773 valid_1's l2: 0.460035
Early stopping, best iteration is:
[8286] training's l2: 0.346138 valid_1's l2: 0.459696
fold n°4
Training until validation scores don't improve for 800 rounds
[500] training's l2: 0.507414 valid_1's l2: 0.513
[1000] training's l2: 0.464811 valid_1's l2: 0.482987
[1500] training's l2: 0.445041 valid_1's l2: 0.476178
[2000] training's l2: 0.431887 valid_1's l2: 0.474967
[2500] training's l2: 0.421084 valid_1's l2: 0.474746
[3000] training's l2: 0.41172 valid_1's l2: 0.474576
[3500] training's l2: 0.403175 valid_1's l2: 0.4749
Early stopping, best iteration is:
[2993] training's l2: 0.411837 valid_1's l2: 0.474545
fold n°5
Training until validation scores don't improve for 800 rounds
[500] training's l2: 0.505559 valid_1's l2: 0.520516
[1000] training's l2: 0.464158 valid_1's l2: 0.488744
[1500] training's l2: 0.445294 valid_1's l2: 0.479389
[2000] training's l2: 0.432409 valid_1's l2: 0.475736
[2500] training's l2: 0.421628 valid_1's l2: 0.473822
[3000] training's l2: 0.41213 valid_1's l2: 0.472862
[3500] training's l2: 0.403666 valid_1's l2: 0.472348
[4000] training's l2: 0.395805 valid_1's l2: 0.471916
[4500] training's l2: 0.388455 valid_1's l2: 0.471829
Early stopping, best iteration is:
[4131] training's l2: 0.393835 valid_1's l2: 0.471742
CV score: 0.47287107
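One caveat on the lgb.train call above: the verbose_eval and early_stopping_rounds keyword arguments were removed in LightGBM 4.x. If you are on a recent LightGBM, the equivalent call inside the loop passes callbacks instead (a sketch under that assumption):
lgb_49 = lgb.train(lgb_49_param, trn_data, num_round,
                   valid_sets=[trn_data, val_data],
                   callbacks=[lgb.early_stopping(stopping_rounds=800),   # stop after 800 rounds without improvement
                              lgb.log_evaluation(period=500)])           # log metrics every 500 rounds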
xgb_49_params = {
    'eta': 0.02,                      # learning rate
    'max_depth': 6,
    'min_child_weight': 3,            # minimum sum of instance weights in a child node
    'gamma': 0,                       # minimum loss reduction required to split a node
    'subsample': 0.7,                 # fraction of rows sampled for each tree
    'colsample_bytree': 0.3,          # fraction of columns (features) sampled for each tree
    'lambda': 2,
    'objective': 'reg:squarederror',  # replaces the deprecated reg:linear
    'eval_metric': 'rmse',
    'nthread': -1
}
folds = KFold(n_splits=5, shuffle=True, random_state=2019)
oof_xgb_49 = np.zeros(len(X_train_49))
predictions_xgb_49 = np.zeros(len(X_test_49))
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_49, y_train)):
print("fold n°{}".format(fold_+1))
trn_data = xgb.DMatrix(X_train_49[trn_idx], y_train[trn_idx])
val_data = xgb.DMatrix(X_train_49[val_idx], y_train[val_idx])
    watchlist = [(trn_data, 'train'), (val_data, 'valid_data')]
xgb_49 = xgb.train(dtrain=trn_data, num_boost_round=3000, evals=watchlist, early_stopping_rounds=600, verbose_eval=500, params=xgb_49_params)
oof_xgb_49[val_idx] = xgb_49.predict(xgb.DMatrix(X_train_49[val_idx]), ntree_limit=xgb_49.best_ntree_limit)
#ntree_limit is deprecated, use `iteration_range` or model slicing instead
predictions_xgb_49 += xgb_49.predict(xgb.DMatrix(X_test_49), ntree_limit=xgb_49.best_ntree_limit)/folds.n_splits
print("CV score:{:<8.8f}".format(mean_squared_error(oof_xgb_49,target)))
fold n°1
[0] train-rmse:3.40426 valid_data-rmse:3.38346
[500] train-rmse:0.47044 valid_data-rmse:0.71654
[933] train-rmse:0.36469 valid_data-rmse:0.72034
fold n°2
[0] train-rmse:3.39825 valid_data-rmse:3.40777
[500] train-rmse:0.47145 valid_data-rmse:0.70880
[1000] train-rmse:0.35111 valid_data-rmse:0.70913
[1336] train-rmse:0.28864 valid_data-rmse:0.71138
fold n°3
[0] train-rmse:3.40190 valid_data-rmse:3.39291
[500] train-rmse:0.47726 valid_data-rmse:0.66701
[1000] train-rmse:0.35647 valid_data-rmse:0.67142
[1008] train-rmse:0.35484 valid_data-rmse:0.67140
fold n°4
[0] train-rmse:3.40240 valid_data-rmse:3.39008
[500] train-rmse:0.47199 valid_data-rmse:0.68019
[1000] train-rmse:0.35145 valid_data-rmse:0.68241
[1076] train-rmse:0.33659 valid_data-rmse:0.68315
fold n°5
[0] train-rmse:3.39341 valid_data-rmse:3.42628
[500] train-rmse:0.47965 valid_data-rmse:0.66327
[1000] train-rmse:0.35526 valid_data-rmse:0.66568
[1148] train-rmse:0.32655 valid_data-rmse:0.66713
CV score:0.47155437
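As the comment in the loop above notes, ntree_limit is deprecated in recent XGBoost releases. On XGBoost >= 1.4 the two predict calls inside the fold loop can be written with iteration_range instead (a sketch, assuming early stopping has set best_iteration):
best_it = xgb_49.best_iteration  # 0-based index of the best boosting round
oof_xgb_49[val_idx] = xgb_49.predict(xgb.DMatrix(X_train_49[val_idx]),
                                     iteration_range=(0, best_it + 1))
predictions_xgb_49 += xgb_49.predict(xgb.DMatrix(X_test_49),
                                     iteration_range=(0, best_it + 1)) / folds.n_splits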
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=2018)
oof_gbr_49 = np.zeros(train_shape)
predictions_gbr_49 = np.zeros(len(X_test_49))
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_49, y_train)):
print("fold n°{}".format(fold_+1))
tr_x = X_train_49[trn_idx]
tr_y = y_train[trn_idx]
gbr_49 = gbr(n_estimators=400, learning_rate=0.01,subsample=0.65,max_depth=7, min_samples_leaf=20,
max_features=0.22,verbose=1)
gbr_49.fit(tr_x,tr_y)
oof_gbr_49[val_idx] = gbr_49.predict(X_train_49[val_idx])
predictions_gbr_49 += gbr_49.predict(X_test_49) / folds.n_splits
print("CV score: {:<8.8f}".format(mean_squared_error(oof_gbr_49, target)))
fold n°1
Iter Train Loss OOB Improve Remaining Time
1 0.6772 0.0030 5.59s
2 0.6625 0.0033 5.57s
3 0.6527 0.0030 5.56s
4 0.6570 0.0031 5.55s
5 0.6600 0.0029 5.69s
6 0.6470 0.0026 5.78s
7 0.6590 0.0028 5.78s
8 0.6450 0.0031 5.78s
9 0.6406 0.0028 5.78s
10 0.6363 0.0029 5.73s
20 0.6099 0.0021 5.53s
30 0.5622 0.0018 5.24s
40 0.5550 0.0015 5.02s
50 0.5186 0.0014 4.90s
60 0.5017 0.0011 4.80s
70 0.4800 0.0009 4.61s
80 0.4796 0.0007 4.46s
90 0.4601 0.0007 4.30s
100 0.4595 0.0006 4.15s
200 0.3961 -0.0000 2.70s
300 0.3501 -0.0000 1.35s
400 0.3237 -0.0000 0.00s
fold n°2
Iter Train Loss OOB Improve Remaining Time
1 0.6840 0.0034 5.59s
2 0.6573 0.0030 5.40s
3 0.6478 0.0033 5.44s
4 0.6465 0.0032 5.46s
5 0.6695 0.0030 5.46s
6 0.6487 0.0028 5.46s
7 0.6435 0.0032 5.40s
8 0.6365 0.0028 5.40s
9 0.6376 0.0029 5.39s
10 0.6332 0.0029 5.43s
20 0.5964 0.0021 5.36s
30 0.5667 0.0018 5.18s
40 0.5280 0.0016 5.01s
50 0.5188 0.0013 4.86s
60 0.5098 0.0011 4.69s
70 0.4784 0.0009 4.56s
80 0.4675 0.0008 4.41s
90 0.4593 0.0006 4.25s
100 0.4380 0.0006 4.11s
200 0.3671 0.0001 2.69s
300 0.3473 0.0000 1.34s
400 0.3196 -0.0000 0.00s
fold n°3
Iter Train Loss OOB Improve Remaining Time
1 0.6524 0.0034 5.59s
2 0.6491 0.0035 5.57s
3 0.6701 0.0033 5.56s
4 0.6527 0.0032 5.55s
5 0.6422 0.0032 5.45s
6 0.6404 0.0030 5.45s
7 0.6437 0.0034 5.45s
8 0.6376 0.0031 5.39s
9 0.6448 0.0029 5.35s
10 0.6196 0.0027 5.38s
20 0.6062 0.0022 5.19s
30 0.5579 0.0023 5.01s
40 0.5366 0.0017 4.84s
50 0.5239 0.0013 4.73s
60 0.5018 0.0012 4.59s
70 0.4957 0.0009 4.46s
80 0.4674 0.0008 4.33s
90 0.4715 0.0006 4.20s
100 0.4525 0.0005 4.05s
200 0.3853 0.0001 2.68s
300 0.3328 -0.0000 1.34s
400 0.3165 -0.0000 0.00s
fold n°4
Iter Train Loss OOB Improve Remaining Time
1 0.6530 0.0035 5.58s
2 0.6592 0.0032 5.57s
3 0.6638 0.0035 5.43s
4 0.6599 0.0032 5.45s
5 0.6547 0.0034 5.37s
6 0.6507 0.0034 5.39s
7 0.6482 0.0029 5.39s
8 0.6254 0.0031 5.34s
9 0.6372 0.0030 5.34s
10 0.6290 0.0029 5.34s
20 0.6041 0.0022 5.21s
30 0.5588 0.0019 5.03s
40 0.5376 0.0015 4.85s
50 0.5184 0.0012 4.71s
60 0.5176 0.0009 4.55s
70 0.4737 0.0011 4.43s
80 0.4653 0.0006 4.29s
90 0.4651 0.0005 4.16s
100 0.4464 0.0005 4.01s
200 0.3702 0.0000 2.67s
300 0.3386 -0.0000 1.34s
400 0.3287 -0.0000 0.00s
fold n°5
Iter Train Loss OOB Improve Remaining Time
1 0.6776 0.0034 5.99s
2 0.6535 0.0033 5.97s
3 0.6647 0.0031 5.82s
4 0.6441 0.0033 5.74s
5 0.6504 0.0026 5.69s
6 0.6382 0.0030 5.65s
7 0.6635 0.0028 5.56s
8 0.6471 0.0029 5.54s
9 0.6225 0.0031 5.52s
10 0.6414 0.0027 5.50s
20 0.6069 0.0021 5.28s
30 0.5507 0.0020 5.12s
40 0.5461 0.0017 4.96s
50 0.5308 0.0013 4.84s
60 0.4986 0.0012 4.67s
70 0.4929 0.0009 4.52s
80 0.4797 0.0007 4.36s
90 0.4760 0.0006 4.22s
100 0.4420 0.0004 4.08s
200 0.3865 0.0000 2.69s
300 0.3504 -0.0000 1.34s
400 0.3310 -0.0001 0.00s
CV score: 0.47296863
We have now obtained the predictions, model architectures and parameters of the three models trained on the 49 features. As before, we stack their out-of-fold predictions with Kernel Ridge Regression, using 5-fold cross-validation repeated twice.
train_stack3 = np.vstack([oof_lgb_49,oof_xgb_49,oof_gbr_49]).transpose()
test_stack3 = np.vstack([predictions_lgb_49, predictions_xgb_49,predictions_gbr_49]).transpose()
folds_stack = RepeatedKFold(n_splits=5, n_repeats=2, random_state=7)
oof_stack3 = np.zeros(train_stack3.shape[0])
predictions_lr3 = np.zeros(test_stack3.shape[0])
for fold_, (trn_idx, val_idx) in enumerate(folds_stack.split(train_stack3,target)):
print("fold {}".format(fold_))
trn_data, trn_y = train_stack3[trn_idx], target.iloc[trn_idx].values
val_data, val_y = train_stack3[val_idx], target.iloc[val_idx].values
#Kernel Ridge Regression
lr3 = kr()
lr3.fit(trn_data, trn_y)
oof_stack3[val_idx] = lr3.predict(val_data)
predictions_lr3 += lr3.predict(test_stack3) / 10
mean_squared_error(target.values, oof_stack3)
fold 0
fold 1
fold 2
fold 3
fold 4
fold 5
fold 6
fold 7
fold 8
fold 9
0.4688927368397409
Since the 49 dimensions are the most important features, we add more models built on the 49-feature data.
from sklearn.kernel_ridge import KernelRidge as kr
folds = KFold(n_splits=5, shuffle=True, random_state=13)
oof_kr_49 = np.zeros(train_shape)
predictions_kr_49 = np.zeros(len(X_test_49))
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_49, y_train)):
print("fold n°{}".format(fold_+1))
tr_x = X_train_49[trn_idx]
tr_y = y_train[trn_idx]
kr_49 = kr()
kr_49.fit(tr_x,tr_y)
oof_kr_49[val_idx] = kr_49.predict(X_train_49[val_idx])
predictions_kr_49 += kr_49.predict(X_test_49) / folds.n_splits
print("CV score: {:<8.8f}".format(mean_squared_error(oof_kr_49, target)))
fold n°1
fold n°2
fold n°3
fold n°4
fold n°5
CV score: 0.50446766
from sklearn.linear_model import Ridge
folds = KFold(n_splits=5, shuffle=True, random_state=13)
oof_ridge_49 = np.zeros(train_shape)
predictions_ridge_49 = np.zeros(len(X_test_49))
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_49, y_train)):
print("fold n°{}".format(fold_+1))
tr_x = X_train_49[trn_idx]
tr_y = y_train[trn_idx]
ridge_49 = Ridge(alpha=6)
ridge_49.fit(tr_x,tr_y)
oof_ridge_49[val_idx] = ridge_49.predict(X_train_49[val_idx])
predictions_ridge_49 += ridge_49.predict(X_test_49) / folds.n_splits
print("CV score: {:<8.8f}".format(mean_squared_error(oof_ridge_49, target)))
fold n°1
fold n°2
fold n°3
fold n°4
fold n°5
CV score: 0.49587680
from sklearn.linear_model import BayesianRidge as br
folds = KFold(n_splits=5, shuffle=True, random_state=13)
oof_br_49 = np.zeros(train_shape)
predictions_br_49 = np.zeros(len(X_test_49))
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_49, y_train)):
print("fold n°{}".format(fold_+1))
tr_x = X_train_49[trn_idx]
tr_y = y_train[trn_idx]
br_49 = br()
br_49.fit(tr_x,tr_y)
oof_br_49[val_idx] = br_49.predict(X_train_49[val_idx])
predictions_br_49 += br_49.predict(X_test_49) / folds.n_splits
print("CV score: {:<8.8f}".format(mean_squared_error(oof_br_49, target)))
fold n°1
fold n°2
fold n°3
fold n°4
fold n°5
CV score: 0.49684965
from sklearn.linear_model import ElasticNet as en
folds = KFold(n_splits=5, shuffle=True, random_state=13)
oof_en_49 = np.zeros(train_shape)
predictions_en_49 = np.zeros(len(X_test_49))
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_49, y_train)):
print("fold n°{}".format(fold_+1))
tr_x = X_train_49[trn_idx]
tr_y = y_train[trn_idx]
en_49 = en(alpha=1.0,l1_ratio=0.05)
en_49.fit(tr_x,tr_y)
oof_en_49[val_idx] = en_49.predict(X_train_49[val_idx])
predictions_en_49 += en_49.predict(X_test_49) / folds.n_splits
print("CV score: {:<8.8f}".format(mean_squared_error(oof_en_49, target)))
fold n°1
fold n°2
fold n°3
fold n°4
fold n°5
CV score: 0.53979354
Based on the models built in 4.6-4.7, we stack their out-of-fold predictions with plain LinearRegression, using 5-fold cross-validation repeated twice.
from sklearn.linear_model import LinearRegression as lr
train_stack4 = np.vstack([oof_br_49,oof_kr_49,oof_en_49,oof_ridge_49]).transpose()
test_stack4 = np.vstack([predictions_br_49, predictions_kr_49,predictions_en_49,predictions_ridge_49]).transpose()
folds_stack = RepeatedKFold(n_splits=5, n_repeats=2, random_state=7)
oof_stack4 = np.zeros(train_stack4.shape[0])
predictions_lr4 = np.zeros(test_stack4.shape[0])
for fold_, (trn_idx, val_idx) in enumerate(folds_stack.split(train_stack4,target)):
print("fold {}".format(fold_))
trn_data, trn_y = train_stack4[trn_idx], target.iloc[trn_idx].values
val_data, val_y = train_stack4[val_idx], target.iloc[val_idx].values
#LinearRegression
lr4 = lr()
lr4.fit(trn_data, trn_y)
oof_stack4[val_idx] = lr4.predict(val_data)
    predictions_lr4 += lr4.predict(test_stack4) / 10
mean_squared_error(target.values, oof_stack4)
fold 0
fold 1
fold 2
fold 3
fold 4
fold 5
fold 6
fold 7
fold 8
fold 9
0.4962263802341183
folds = KFold(n_splits=5, shuffle=True, random_state=13)
oof_kr_383 = np.zeros(train_shape)
predictions_kr_383 = np.zeros(len(X_test_383))
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_383, y_train)):
print("fold n°{}".format(fold_+1))
tr_x = X_train_383[trn_idx]
tr_y = y_train[trn_idx]
    # Kernel Ridge Regression
kr_383 = kr()
kr_383.fit(tr_x,tr_y)
oof_kr_383[val_idx] = kr_383.predict(X_train_383[val_idx])
predictions_kr_383 += kr_383.predict(X_test_383) / folds.n_splits
print("CV score: {:<8.8f}".format(mean_squared_error(oof_kr_383, target)))
fold n°1
fold n°2
fold n°3
fold n°4
fold n°5
CV score: 0.50864146
folds = KFold(n_splits=5, shuffle=True, random_state=13)
oof_ridge_383 = np.zeros(train_shape)
predictions_ridge_383 = np.zeros(len(X_test_383))
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_383, y_train)):
print("fold n°{}".format(fold_+1))
tr_x = X_train_383[trn_idx]
tr_y = y_train[trn_idx]
    # Ridge regression
ridge_383 = Ridge(alpha=1200)
ridge_383.fit(tr_x,tr_y)
oof_ridge_383[val_idx] = ridge_383.predict(X_train_383[val_idx])
predictions_ridge_383 += ridge_383.predict(X_test_383) / folds.n_splits
print("CV score: {:<8.8f}".format(mean_squared_error(oof_ridge_383, target)))
fold n°1
fold n°2
fold n°3
fold n°4
fold n°5
CV score: 0.48782923
folds = KFold(n_splits=5, shuffle=True, random_state=13)
oof_en_383 = np.zeros(train_shape)
predictions_en_383 = np.zeros(len(X_test_383))
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_383, y_train)):
print("fold n°{}".format(fold_+1))
tr_x = X_train_383[trn_idx]
tr_y = y_train[trn_idx]
    # ElasticNet (elastic net regression)
en_383 = en(alpha=1.0,l1_ratio=0.06)
en_383.fit(tr_x,tr_y)
oof_en_383[val_idx] = en_383.predict(X_train_383[val_idx])
predictions_en_383 += en_383.predict(X_test_383) / folds.n_splits
print("CV score: {:<8.8f}".format(mean_squared_error(oof_en_383, target)))
fold n°1
fold n°2
fold n°3
fold n°4
fold n°5
CV score: 0.53650465
folds = KFold(n_splits=5, shuffle=True, random_state=13)
oof_br_383 = np.zeros(train_shape)
predictions_br_383 = np.zeros(len(X_test_383))
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_383, y_train)):
print("fold n°{}".format(fold_+1))
tr_x = X_train_383[trn_idx]
tr_y = y_train[trn_idx]
    # BayesianRidge (Bayesian ridge regression)
br_383 = br()
br_383.fit(tr_x,tr_y)
oof_br_383[val_idx] = br_383.predict(X_train_383[val_idx])
predictions_br_383 += br_383.predict(X_test_383) / folds.n_splits
print("CV score: {:<8.8f}".format(mean_squared_error(oof_br_383, target)))
fold n°1
fold n°2
fold n°3
fold n°4
fold n°5
CV score: 0.48828537
We have now obtained the predictions, model architectures and parameters of the four models trained on the 383 features. Again, their out-of-fold predictions are stacked with plain LinearRegression, using 5-fold cross-validation repeated twice.
from sklearn.linear_model import LinearRegression as lr
train_stack1 = np.vstack([oof_br_383,oof_kr_383,oof_en_383,oof_ridge_383]).transpose()
test_stack1 = np.vstack([predictions_br_383, predictions_kr_383,predictions_en_383,predictions_ridge_383]).transpose()
folds_stack = RepeatedKFold(n_splits=5, n_repeats=2, random_state=7)
oof_stack1 = np.zeros(train_stack1.shape[0])
predictions_lr1 = np.zeros(test_stack1.shape[0])
for fold_, (trn_idx, val_idx) in enumerate(folds_stack.split(train_stack1,target)):
print("fold {}".format(fold_))
trn_data, trn_y = train_stack1[trn_idx], target.iloc[trn_idx].values
val_data, val_y = train_stack1[val_idx], target.iloc[val_idx].values
    # LinearRegression: plain linear regression as the meta-learner
lr1 = lr()
lr1.fit(trn_data, trn_y)
oof_stack1[val_idx] = lr1.predict(val_data)
predictions_lr1 += lr1.predict(test_stack1) / 10
mean_squared_error(target.values, oof_stack1)
fold 0
fold 1
fold 2
fold 3
fold 4
fold 5
fold 6
fold 7
fold 8
fold 9
0.4888697167657431
Finally, we combine the predictions of the four stacked models by a weighted sum (the weights are picked by hand, so this is a rather crude way of blending the results).
mean_squared_error(target.values, 0.7*(0.6*oof_stack2 + 0.4*oof_stack3)+0.3*(0.55*oof_stack1+0.45*oof_stack4))
0.4541253212559639
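The line above only evaluates the weighted blend on the out-of-fold predictions. If this blend were used for submission, the corresponding test-set prediction would apply the same hand-picked weights to the stacked test predictions (a sketch for reference; the notebook instead trains a second-level stack below):
blend_test = 0.7 * (0.6 * predictions_lr2 + 0.4 * predictions_lr3) \
           + 0.3 * (0.55 * predictions_lr1 + 0.45 * predictions_lr4)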
train_stack5 = np.vstack([oof_stack1,oof_stack2,oof_stack3,oof_stack4]).transpose()
test_stack5 = np.vstack([predictions_lr1, predictions_lr2,predictions_lr3,predictions_lr4]).transpose()
folds_stack = RepeatedKFold(n_splits=5, n_repeats=2, random_state=7)
oof_stack5 = np.zeros(train_stack5.shape[0])
predictions_lr5= np.zeros(test_stack5.shape[0])
for fold_, (trn_idx, val_idx) in enumerate(folds_stack.split(train_stack5,target)):
print("fold {}".format(fold_))
trn_data, trn_y = train_stack5[trn_idx], target.iloc[trn_idx].values
val_data, val_y = train_stack5[val_idx], target.iloc[val_idx].values
#LinearRegression
lr5 = lr()
lr5.fit(trn_data, trn_y)
oof_stack5[val_idx] = lr5.predict(val_data)
predictions_lr5 += lr5.predict(test_stack5) / 10
mean_squared_error(target.values, oof_stack5)
fold 0
fold 1
fold 2
fold 3
fold 4
fold 5
fold 6
fold 7
fold 8
fold 9
0.449254925253198
submit_example = pd.read_csv('submit_example.csv',sep=',',encoding='latin-1')
submit_example['happiness'] = predictions_lr5
submit_example.happiness.describe()
count 2968.000000
mean 3.880761
std 0.460840
min 1.583454
25% 3.668695
50% 3.954693
75% 4.187084
max 5.013256
Name: happiness, dtype: float64
The predictions are continuous values between 1 and 5. As a further refinement, predictions that fall very close to an integer score are snapped to that integer.
submit_example.loc[submit_example['happiness']>4.96,'happiness']= 5
submit_example.loc[submit_example['happiness']<=1.04,'happiness']= 1
submit_example.loc[(submit_example['happiness']>1.96)&(submit_example['happiness']<2.04),'happiness']= 2
submit_example.to_csv("submission.csv", index=False)
submit_example.happiness.describe()
count 2968.000000
mean 3.880808
std 0.460679
min 1.583454
25% 3.668695
50% 3.954693
75% 4.187084
max 5.000000
Name: happiness, dtype: float64
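The three assignments above only snap values near 5, 1 and 2. A more general variant of the same idea (a small sketch, not part of the original pipeline) first clips predictions to the valid range and then snaps anything within 0.04 of an integer score:
# Clip to the valid 1-5 range, then snap near-integer predictions to the integer
submit_example['happiness'] = submit_example['happiness'].clip(1, 5)
for k in range(1, 6):
    near_k = (submit_example['happiness'] - k).abs() < 0.04
    submit_example.loc[near_k, 'happiness'] = k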