Final complete code for the Alibaba Tianchi beginner competition "快来一起挖掘幸福感" (Come Mine Happiness Together).
Competition link: https://tianchi.aliyun.com/competition/entrance/231702/introduction?spm=5176.12281973.1005.8.3dd53eafwScooV
1: Fill missing values using the interpolation approach introduced earlier at https://mp.csdn.net/console/editor/html/106105652
2: Drop features with a large number of missing values
3: For income, fill with group means via pandas.groupby.transform, then bucket with pandas.cut (a toy sketch of this pattern follows below)
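To make step 3 concrete, here is a minimal, self-contained sketch of the transform-then-cut pattern on toy data (the column names and values are illustrative, not the competition's; note that, as in the full code below, the group mean still includes the negative codes):
import pandas as pd
df=pd.DataFrame({"gender":[1,1,2,2],
                 "income":[-1,30000.0,80000.0,-8]})
#Fill negative codes with the per-group mean via groupby.transform
group_mean=df.groupby("gender")["income"].transform("mean")
df.loc[df["income"]<0,"income"]=group_mean
#Bucket the filled values with pandas.cut (labels=False yields integer bin ids)
df["income"]=pd.cut(df["income"],bins=[0,5e4,1e5],labels=False)
print(df)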
The complete code is as follows:
import warnings
import itertools
import numpy as np
import pandas as pd
import scipy.interpolate as si
train_file=r"D:\AI\tianchi\data\happiness\happiness_train_complete.csv"
test_file=r"D:\AI\tianchi\data\happiness\happiness_test_complete.csv"
def fill_with_corr(df,col):
    #Take the 5 features with the highest Pearson correlation to col
    class_corr_index=df.corr(method="pearson")[col].sort_values().\
        tail(5).dropna().index
    #Drop happiness because it is the target to be predicted
    if "happiness" in class_corr_index:
        class_corr_index=class_corr_index.drop(["happiness"])
    #Negative values and NaNs are both treated as missing and filled by interpolation
    print("Null/negative counts before filling:")
    print(((df[class_corr_index]<0) | (df[class_corr_index].isnull())).sum())
    #Outer loop: handle samples missing exactly 1, 2, 3 or 4 values
    #Middle loop: enumerate combinations of 1-4 features and find samples where
    #exactly that combination is missing while the remaining features are present
    #Inner loop: fill each missing feature of the combination
    for i in range(1,len(class_corr_index)):
        for j in itertools.combinations(class_corr_index,r=i):
            j=list(j)  #tuple -> list so it can be used as a column indexer
            #Mask: the feature combination is missing
            tmp1=((df.loc[:,j].isnull()) | (df.loc[:,j]<0)).sum(axis=1)==i
            #The remaining features
            tmp_index=class_corr_index[~class_corr_index.isin(j)]
            #Mask: none of the remaining features is missing
            tmp2=((df.loc[:,tmp_index].isnull()) | (df.loc[:,tmp_index]<0)).sum(axis=1)==0
            #Samples where the combination is missing and the rest is present
            index_for=tmp1&tmp2
            x=df.loc[index_for,tmp_index]
            if x.shape[0]==0:
                continue
            for k in j:
                #k is one feature of the missing combination; together with the
                #remaining features it forms the column set used to select fitting samples
                index_points=tmp_index.insert(0,k)
                #Samples with no missing value in any of these columns
                tmp3=((df.loc[:,index_points].isnull()) | (df.loc[:,index_points]<0)).sum(axis=1)==0
                points_v=df.loc[tmp3,index_points]
                #Coordinates and values used for the interpolation
                points=points_v[tmp_index]
                values=points_v[k]
                #Nearest-neighbour interpolation over the feature space
                y=si.griddata(points.values,values.values,x.values,method="nearest")
                #Write the interpolated values back into the original data
                df.loc[index_for,k]=y
    print("Null/negative counts after filling:")
    print(((df[class_corr_index]<0) | (df[class_corr_index].isnull())).sum())
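For clarity, here is a minimal standalone illustration of the scipy.interpolate.griddata call at the heart of the function, on toy data (the real call uses the correlated survey features as coordinates):
import numpy as np
import scipy.interpolate as si
points=np.array([[1.0,2.0],[3.0,4.0],[5.0,6.0]])  #known coordinates
values=np.array([10.0,20.0,30.0])                 #known values at those points
x=np.array([[2.9,4.2]])                           #point whose value is missing
#method="nearest" simply copies the value of the closest known point
print(si.griddata(points,values,x,method="nearest"))  #-> [20.]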
pd.set_option("display.max_columns",None)
pd.set_option("display.max_rows",None)
df_train_o=pd.read_csv(train_file,encoding="gbk")
df_test_o=pd.read_csv(test_file,encoding="gbk")
#Concatenate the training and test sets so the features are processed together
df_o=pd.concat([df_train_o,df_test_o],ignore_index=True)
#Drop free-text columns and the survey timestamp
df_o.drop(['edu_other','property_other','invest_other',"survey_time"],axis=1,inplace=True)
#Convert birth year to age (the survey year is 2015)
df_o["birth"]=2015-df_o["birth"]
#Replace negative codes (missing/refused) with reasonable defaults
df_o.loc[df_o["f_political"]<0,"f_political"]=1
df_o.loc[df_o["f_edu"]<0,"f_edu"]=df_o["f_edu"].quantile(q=0.5)
df_o.loc[df_o["f_birth"]<0,"f_birth"]=df_o["birth"]+26
df_o.loc[df_o["daughter"]<0,"daughter"]=0
df_o.loc[df_o["son"]<0,"son"]=0
df_o.loc[df_o["f_work_14"]<0,"f_work_14"]=17
df_o.loc[df_o["car"]<0,"car"]=3
df_o.loc[df_o["house"]==-1,"house"]=1
df_o.loc[df_o["house"]<0,"house"]=3
df_o.loc[df_o["insur_1"]<0,"insur_1"]=1
df_o.loc[df_o["insur_2"]<0,"insur_2"]=1
df_o.loc[df_o["insur_3"]<0,"insur_3"]=1
df_o.loc[df_o["insur_4"]<0,"insur_4"]=1
#Fill negative public-service ratings with the column mean
for i in range(1,9):
    c="public_service_%d"%i
    df_o.loc[df_o[c]<0,c]=df_o[c].mean()
df_o.loc[df_o["neighbor_familiarity"]<0,"neighbor_familiarity"]=3
df_o.loc[df_o["m_birth"]<0,"m_birth"]=df_o["f_birth"]-1
df_o.loc[df_o["m_edu"]<0,"m_edu"]=df_o["m_edu"].quantile(q=0.5)
df_o.loc[df_o["m_political"]<0,"m_political"]=1
trust=["trust_%d"%(i+1) for i in range(13)]
#Replace negative trust codes with the neutral value 3
df_o[trust]=df_o[trust].mask(df_o[trust]<0,3)
#Scale height and weight by their means (mean divided by value)
df_o["height_cm"]=df_o["height_cm"].mean()/df_o["height_cm"]
df_o["weight_jin"]=df_o["weight_jin"].mean()/df_o["weight_jin"]
#Add a combined feature: weight-to-height ratio
df_o["weight_height_ratio"]=df_o["weight_jin"]/df_o["height_cm"]
fill_with_corr(df_o,"class")
#Fill missing and negative income with the mean income grouped by
#survey type (urban/rural), city and gender
df_o.loc[df_o["income"]<0,"income"]= \
    df_o.groupby(["survey_type","city","gender"])["income"].transform("mean")
income_min=df_o["income"].min()-1
income_max=df_o["income"].max()+1
bins=[income_min,1e4,5e4,1e5,3e5,5e5,1e6,income_max]
df_o["income"]=pd.cut(df_o["income"],bins,labels=False)
df_o.loc[(df_o["inc_exp"]<0) | (df_o["inc_exp"].isnull()),"inc_exp"]= \
    df_o.groupby(["survey_type","city","gender"])["inc_exp"].transform("mean")
inc_exp_min=df_o["inc_exp"].min()-1
inc_exp_max=df_o["inc_exp"].max()+1
bins=[inc_exp_min,1e4,5e4,1e5,3e5,5e5,1e6,inc_exp_max]
df_o["inc_exp"]=pd.cut(df_o["inc_exp"],bins,labels=False)
df_o.loc[(df_o["family_income"]<0) | (df_o["family_income"].isnull()),"family_income"]= \
    df_o.groupby(["survey_type","city","gender"])["family_income"].transform("mean")
family_income_min=df_o["family_income"].min()-1
family_income_max=df_o["family_income"].max()+1
bins=[family_income_min,2e4,1e5,2e5,6e5,1e6,2e6,family_income_max]
df_o["family_income"]=pd.cut(df_o["family_income"],bins,labels=False)
df_o.loc[(df_o["s_income"]<0) | (df_o["s_income"].isnull()),"s_income"]= \
    df_o.groupby(["survey_type","city","gender"])["s_income"].transform("mean")
s_income_min=df_o["s_income"].min()-1
s_income_max=df_o["s_income"].max()+1
bins=[s_income_min,1e4,5e4,1e5,3e5,5e5,1e6,s_income_max]
df_o["s_income"]=pd.cut(df_o["s_income"],bins,labels=False)
fill_with_corr(df_o,"depression")
fill_with_corr(df_o,"status_peer")
fill_with_corr(df_o,"equity")
fill_with_corr(df_o,"family_status")
df_o["inc_ability"][df_o["inc_ability"]>=2]=df_o["inc_ability"]+1
df_o["inc_ability"][df_o["inc_ability"]<0]=2
df_o["view"][df_o["view"]<0]=3
4: Inspect the remaining missing and negative values with the code below; these still need special handling
null_neg=((df_o.isnull()) | (df_o<0)).sum().sort_values().rename("null_neg")
null_neg.to_csv("./null_neg.csv")
print(null_neg)
5: Drop some of the features with many missing values; fill the missing values of the remaining features with the median
#Columns whose missing values will be filled with the median
cols=["class_10_before",\
"media_1",\
"class",\
"depression",\
"health_problem",\
"family_status",\
"health",\
"class_10_after",\
"hukou_loc",\
"media_4",\
"media_2",\
"media_3",\
"socialize",\
"leisure_1",\
"class_14",\
"media_5",\
"edu",\
"nationality",\
"leisure_6",\
"media_6",\
"relax",\
"religion_freq",\
"leisure_2",\
"leisure_8",\
"leisure_9",\
"learn",\
"family_m",\
"leisure_12",\
"leisure_7",\
"leisure_10",\
"political",\
"leisure_11",\
"socia_outing",\
"leisure_5",\
"religion",\
"public_service_9",\
"social_neighbor",\
"social_friend",\
"edu_status",\
"work_status",\
"work_manage",\
"work_type",\
"s_work_status",\
"s_work_exper",
"m_work_14",\
"s_edu",\
"s_work_type"]
def func(x):
    #Treat negative codes as missing, then fill with the median
    x[x<0]=np.nan
    x.fillna(x.quantile(q=0.5),inplace=True)
    return x
#Apply the median fill to the selected columns
dd=df_o.loc[:,cols].apply(func)
df_o["minor_child"][df_o["minor_child"]<0]=np.nan
df_o["minor_child"].fillna(0,inplace=True)
if "marital_1st" in df_o.columns:
df_o.drop('marital_1st',axis=1,inplace=True)
if "s_birth" in df_o.columns:
df_o.drop('s_birth',axis=1,inplace=True)
if "s_hukou" in df_o.columns:
df_o.drop('s_hukou',axis=1,inplace=True)
if "s_political" in df_o.columns:
df_o.drop('s_political',axis=1,inplace=True)
if "marital_now" in df_o.columns:
df_o.drop('marital_now',axis=1,inplace=True)
if "edu_yr" in df_o.columns:
df_o.drop('edu_yr',axis=1,inplace=True)
if "work_yr" in df_o.columns:
df_o.drop('work_yr',axis=1,inplace=True)
if "join_party" in df_o.columns:
df_o.drop('join_party',axis=1,inplace=True)
#Write the median-filled columns back into the full frame
df_o.loc[:,cols]=dd
6: Split the combined data back into the training and test sets
def get_train_test(df):
    #The first df_train_o.shape[0] rows are the training set, the rest the test set
    df_train=df.iloc[:df_train_o.shape[0]]
    df_test=df.iloc[-(df_test_o.shape[0]):]
    train_y=df_train["happiness"]
    #Drop the id column (column 0) and the happiness target from the feature matrices
    train_x=df_train.drop("happiness",axis=1).iloc[:,1:]
    test_x=df_test.drop("happiness",axis=1).iloc[:,1:]
    return train_x,train_y,test_x
train_x,train_y,test_x=get_train_test(df_o)
Part 2: Model training and fusion
1: Three models are used: xgb, lgb and gbdt. Both a weighted average of the three models and linear fusion were tried, with roughly the same results.
The xgb training code is below; lgb and gbdt are similar (a sketch of the lgb loop follows the xgb block).
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
#Note: eta is an alias of learning_rate in xgboost, so only one of the two takes effect
xgb_params = {"learning_rate":0.02,'eta': 0.3, 'n_estimators': 1000, 'gamma': 5, 'max_depth': 2, 'min_child_weight': 4,
              'colsample_bytree': 0.6, 'colsample_bylevel': 0.7, 'subsample': 0.8,
              'reg_lambda': 0.2, 'reg_alpha': 0.9,
              'seed': 1}
fold=KFold(n_splits=4,random_state=0,shuffle=True)
xgb = XGBRegressor(**xgb_params)
df_xg=pd.DataFrame(columns=train_x.columns)
xg_val_predict=np.zeros(train_y.shape)
xg_test_predict=np.zeros(test_x.shape[0])
for i,(fit_index,val_index) in enumerate(fold.split(train_x,train_y)):
    clf=xgb.fit(train_x.iloc[fit_index],train_y.iloc[fit_index])
    print(clf.feature_importances_.shape)
    #Collect per-fold feature importances
    tmp=pd.DataFrame([clf.feature_importances_],columns=train_x.columns)
    df_xg=pd.concat([df_xg,tmp])
    #Out-of-fold predictions for fusion
    pre=clf.predict(train_x.iloc[val_index])
    xg_val_predict[val_index]=pre
    #Average the test predictions over the folds
    xg_test_predict=xg_test_predict+clf.predict(test_x)/fold.n_splits
print(mean_squared_error(train_y,xg_val_predict))
print(xg_test_predict.shape)
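As noted above, the lgb and gbdt loops mirror the xgb one. A minimal sketch of the lgb version follows; the LGBMRegressor hyperparameters here are placeholders, not the settings actually used for the reported scores:
from lightgbm import LGBMRegressor
lgb_model=LGBMRegressor(learning_rate=0.02,n_estimators=1000,  #hypothetical params
                        subsample=0.8,colsample_bytree=0.6,random_state=1)
lg_val_predict=np.zeros(train_y.shape)
lg_test_predict=np.zeros(test_x.shape[0])
for fit_index,val_index in fold.split(train_x,train_y):
    clf=lgb_model.fit(train_x.iloc[fit_index],train_y.iloc[fit_index])
    #Out-of-fold predictions for fusion, plus fold-averaged test predictions
    lg_val_predict[val_index]=clf.predict(train_x.iloc[val_index])
    lg_test_predict+=clf.predict(test_x)/fold.n_splits
print(mean_squared_error(train_y,lg_val_predict))
Reusing the same fold object (fixed random_state, shuffle=True) makes the splits identical to the xgb loop, so the out-of-fold predictions line up for fusion.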
2: The cross-validation fusion code on the training data is below; the cross-validation result is about 0.46
from sklearn.linear_model import Lasso
lso=Lasso(alpha=0.01,tol=0.001)
#Stack the three models' out-of-fold predictions as fusion features
mm=np.vstack([lg_val_predict,xg_val_predict,gg_val_predict]).T
lso.fit(mm,train_y)
val_predict1=lso.predict(mm)
#val_predict was not defined in the original post; assumed here to be the simple
#average of the three models' out-of-fold predictions (the weighted-average variant from step 1)
val_predict=(lg_val_predict+xg_val_predict+gg_val_predict)/3
print(mean_squared_error(val_predict,train_y))
print(mean_squared_error(val_predict,val_predict1))
3: Prediction on the test data. The online score on the test data is 0.478, which is not yet enough to place in the rankings; there is still plenty of room for improvement.
#test_predict=lg_test_predict/2+xg_test_predict/2
#Fuse the three models' test predictions with the fitted Lasso
mm=np.vstack([lg_test_predict,xg_test_predict,gg_test_predict]).T
test_predict=lso.predict(mm)
print(test_predict[:5])
submit_file=r"D:\AI\tianchi\data\happiness\happiness_submit.csv"
from datetime import datetime
submit_example = pd.read_csv(submit_file)
submit_example["happiness"]=test_predict
now = datetime.now().strftime('%m-%d-%H-%M')
submit_example.to_csv("./baseline_%s.csv" % now, index=False)