The material follows the textbook 《阿里云天池大赛赛题解析——机器学习篇》 (Alibaba Cloud Tianchi Competition Solutions: Machine Learning).
2.2 Data Exploration
2.2.2 Reading the Data
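The snippets in this chapter assume the usual scientific-Python imports are already in place; a minimal set (not shown in the book excerpts) would be:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats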
train_data_file = "./zhengqi_train.txt"
test_data_file = "./zhengqi_test.txt"
train_data = pd.read_csv(train_data_file, sep='\t', encoding='utf-8')
test_data = pd.read_csv(test_data_file, sep='\t', encoding='utf-8')
train_data.describe()
2.2.4 Visualizing the Data Distribution
fig = plt.figure(figsize=(4,6))
sns.boxplot(train_data['V0'], orient='v', width=0.5)
column = train_data.columns.tolist()[:39]
fig = plt.figure(figsize=(80,60), dpi=75)
for i in range(38):
    plt.subplot(7, 8, i + 1)
    sns.boxplot(train_data[column[i]], orient='v', width=0.5)
    plt.ylabel(column[i], fontsize=36)
plt.show()
Detecting and plotting outliers (Ridge regression)
# function to detect outliers based on the predictions of a model
def find_outliers(model, X, y, sigma=3):  # X: features, y: label
    # predict y values using the model
    try:
        y_pred = pd.Series(model.predict(X), index=y.index)
    # if predicting failed, try fitting the model first
    except:
        model.fit(X, y)
        y_pred = pd.Series(model.predict(X), index=y.index)
    # calculate residuals between the model prediction and the true y values
    resid = y - y_pred
    mean_resid = resid.mean()
    std_resid = resid.std()
    # calculate a z statistic; define outliers to be where |z| > sigma
    z = (resid - mean_resid) / std_resid
    outliers = z[abs(z) > sigma].index
    # print and plot the results
    print('R2=', model.score(X, y))
    print('mse=', mean_squared_error(y, y_pred))
    print('-----------------------------------')
    print('mean of residuals:', mean_resid)
    print('std of residuals:', std_resid)
    print('-----------------------------------')
    print(len(outliers), 'outliers:')
    print(outliers.tolist())
    plt.figure(figsize=(15, 5))
    ax_131 = plt.subplot(1, 3, 1)
    plt.plot(y, y_pred, '.')
    plt.plot(y.loc[outliers], y_pred.loc[outliers], 'ro')
    plt.legend(['Accepted', 'Outliers'])
    plt.xlabel('y')
    plt.ylabel('y_pred')
    ax_132 = plt.subplot(1, 3, 2)
    plt.plot(y, y - y_pred, '.')
    plt.plot(y.loc[outliers], y.loc[outliers] - y_pred.loc[outliers], 'ro')
    plt.legend(['Accepted', 'Outliers'])
    plt.xlabel('y')
    plt.ylabel('y - y_pred')
    ax_133 = plt.subplot(1, 3, 3)
    z.plot.hist(bins=50, ax=ax_133)
    z.loc[outliers].plot.hist(color='r', bins=50, ax=ax_133)
    plt.legend(['Accepted', 'Outliers'])
    plt.xlabel('z')
    plt.savefig('outliers.png')
    return outliers
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
X_train = train_data.iloc[:,0:-1]
y_train = train_data.iloc[:,-1]
outliers = find_outliers(Ridge(), X_train, y_train)
output
R2= 0.8890858938210386
mse= 0.10734857773123632
-----------------------------------
mean of residuals: 3.3291313688552134e-17
std of residuals: 0.32769766731934985
-----------------------------------
31 outliers:
[321, 348, 376, 777, 884, 1145, 1164, 1310, 1458, 1466, 1484, 1523, 1704, 1874, 1879, 1979, 2002, 2279, 2528, 2620, 2645, 2647, 2667, 2668, 2669, 2696, 2767, 2769, 2807, 2842, 2863]
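The function only detects the outliers; a minimal sketch of dropping them before further modelling (the new variable names are illustrative):
X_train_clean = X_train.drop(outliers)
y_train_clean = y_train.drop(outliers)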
Histograms and Q-Q plots
First, plot a histogram of each feature to inspect its distribution on the training set, and draw a Q-Q plot to check whether that distribution is approximately normal.
train_cols = 8
train_rows = len(train_data.columns)
plt.figure(figsize=(4*train_cols,4* train_rows))
i = 0
for col in train_data.columns:
    i += 1
    ax = plt.subplot(train_rows, train_cols, i)
    sns.distplot(train_data[col], fit=stats.norm)
    i += 1
    ax = plt.subplot(train_rows, train_cols, i)
    res = stats.probplot(train_data[col], plot=plt)
plt.tight_layout()
plt.show()
KDE (kernel density estimation) plots
Compare the distribution of each feature in the training set with its distribution in the test set.
dist_cols = 8
dist_rows = len(test_data.columns)
plt.figure(figsize=(4*dist_cols,4*dist_rows))
i=1
for col in test_data.columns:
    ax = plt.subplot(dist_rows, dist_cols, i)
    ax = sns.kdeplot(train_data[col], color='red', shade=True)
    ax = sns.kdeplot(test_data[col], color='Blue', shade=True)
    ax.set_xlabel(col)
    ax.set_ylabel("Frequency")
    ax = ax.legend(["train", "test"])
    i += 1
plt.show()
Based on the plots above, drop the features whose training and test distributions are inconsistent: V5, V9, V11, V17, V22, V28.
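A minimal sketch of that removal (the new variable names are illustrative; the same drop is applied again in 2.2.5 and 3.4.3):
drop_col = ['V5', 'V9', 'V11', 'V17', 'V22', 'V28']
train_data_dropped = train_data.drop(columns=drop_col)
test_data_dropped = test_data.drop(columns=drop_col)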
Linear-regression relationship plots
These plots are used to inspect the linear relationship between each feature and the target.
fcols = 8
frows = len(test_data.columns)
plt.figure(figsize=(5*fcols,4*frows))
i=0
for col in test_data.columns:
    i += 1
    ax = plt.subplot(frows, fcols, i)
    sns.regplot(x=col, y='target', data=train_data, ax=ax,
                scatter_kws={'marker': '.', 's': 3, 'alpha': 0.3},
                line_kws={'color': 'k'})
    ax.set_xlabel(col)
    ax.set_ylabel("Target")
    i += 1
    ax = plt.subplot(frows, fcols, i)
    sns.distplot(train_data[col].dropna())
    plt.xlabel(col)
plt.show()
2.2.5 Examining Feature Correlations
1. Computing the correlation coefficients
After dropping the features whose training and test distributions differ (V5, V9, V11, V17, V22, V28), compute the correlation coefficients between the remaining features and the target variable.
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 10)
data_train1 = train_data.drop(['V5','V9','V11','V17','V22','V28'],axis=1)
train_corr = data_train1.corr()
train_corr
2. Plotting the correlation heatmap
fig, ax = plt.subplots(figsize=(20, 16))
ax = sns.heatmap(train_corr, vmax=.8, square=True, annot=True)
3. Selecting features by correlation coefficient
Find the features whose absolute correlation coefficient with the target exceeds 0.5.
threshold = 0.5
corrmat = train_data.corr()
top_corr_features = corrmat.index[abs(corrmat["target"]) > threshold]
plt.figure(figsize=(10,10))
g = sns.heatmap(train_data[top_corr_features].corr(), annot=True, cmap="RdYlGn")
4. Box-Cox transformation
Linear regression assumes approximately normally distributed errors, so before the statistical analysis the data are transformed to bring them closer to a normal distribution.
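For reference, the one-parameter Box-Cox transform applied below via scipy.stats.boxcox is

y^{(\lambda)} =
\begin{cases}
\dfrac{y^{\lambda} - 1}{\lambda}, & \lambda \neq 0, \\
\ln y, & \lambda = 0,
\end{cases}
\qquad y > 0,

which is why the code later adds 1 to the min-max-scaled columns before transforming, keeping every input strictly positive.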
Before applying Box-Cox, the data need min-max normalization; concatenating the training and test sets first keeps the two sets on a consistent scale.
drop_columns = ['V5','V9','V11','V17','V22','V28']
train_x = train_data.drop(['target'],axis=1)
# merge the training and test data
data_all =pd.concat([train_x,test_data])
data_all.drop(drop_columns,axis=1, inplace =True)
data_all.head()
Normalize each column of the merged data to the [0, 1] range.
cols_numeric = list(data_all.columns)
def scale_minmax(col):
    return (col - col.min()) / (col.max() - col.min())
data_all[cols_numeric] = data_all[cols_numeric].apply(scale_minmax,axis=0)
data_all[cols_numeric].describe()
After applying the Box-Cox transform to each feature, compute the quantiles and plot them against a normal distribution, and show the linear relationship between the feature and the target variable.
train_data_process = train_data[cols_numeric]
train_data_process = train_data_process[cols_numeric].apply(scale_minmax,axis=0)
cols_numeric_left = cols_numeric[0:13]
cols_numeric_right = cols_numeric[13:]
train_data_process = pd.concat([train_data_process, train_data['target']],axis=1)
fcols = 6
frows = len(cols_numeric_left)
plt.figure(figsize=(4*fcols,4*frows))
i = 0
for var in cols_numeric_left:
    dat = train_data_process[[var, 'target']].dropna()
    i += 1
    plt.subplot(frows, fcols, i)
    sns.distplot(dat[var], fit=stats.norm)
    plt.title(var + ' Original')
    plt.xlabel('')
    i += 1
    plt.subplot(frows, fcols, i)
    _ = stats.probplot(dat[var], plot=plt)
    plt.title('skew=' + '{:.4f}'.format(stats.skew(dat[var])))
    plt.xlabel('')
    plt.ylabel('')
    i += 1
    plt.subplot(frows, fcols, i)
    plt.plot(dat[var], dat['target'], '.', alpha=0.5)
    plt.title('corr=' + '{:.2f}'.format(np.corrcoef(dat[var], dat['target'])[0][1]))
    i += 1
    plt.subplot(frows, fcols, i)
    trans_var, lambda_var = stats.boxcox(dat[var].dropna() + 1)
    trans_var = scale_minmax(trans_var)
    sns.distplot(trans_var, fit=stats.norm)
    plt.title(var + ' Transformed')
    plt.xlabel('')
    i += 1
    plt.subplot(frows, fcols, i)
    _ = stats.probplot(trans_var, plot=plt)
    plt.title('skew=' + '{:.4f}'.format(stats.skew(trans_var)))
    plt.xlabel('')
    plt.ylabel('')
    i += 1
    plt.subplot(frows, fcols, i)
    plt.plot(trans_var, dat['target'], '.', alpha=0.5)
    plt.title('corr=' + '{:.2f}'.format(np.corrcoef(trans_var, dat['target'])[0][1]))
As the plots show, the transformed variables are much closer to a normal distribution.
3.4 Feature Engineering
3.4.1 Outlier Analysis
Draw a box plot for each feature:
plt.figure(figsize=(18,10))
plt.boxplot(x = train_data.values, labels = train_data.columns)
plt.hlines([-7.5,7.5],0,40,colors='r')
plt.show()
Remove the outlier rows from the training and test sets.
train_data = train_data[train_data['V9']>-7.5]
test_data = test_data[test_data['V9']>-7.5]
display(train_data.describe())
display(test_data.describe())
3.4.2 Min-Max Normalization
Normalize the data with min-max scaling.
from sklearn import preprocessing
features_columns = [col for col in train_data.columns if col not in ['target']]
min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaler = min_max_scaler.fit(train_data[features_columns])
train_data_scaler = min_max_scaler.transform(train_data[features_columns])
test_data_scaler = min_max_scaler.transform(test_data[features_columns])
train_data_scaler = pd.DataFrame(train_data_scaler)
train_data_scaler.columns = features_columns
test_data_scaler = pd.DataFrame(test_data_scaler)
test_data_scaler.columns = features_columns
train_data_scaler['target'] = train_data['target']
display(train_data_scaler.describe())
display(test_data_scaler.describe())
3.4.3 Checking the Data Distribution
Drop the features whose training and test distributions are inconsistent.
train_data_scaler = train_data_scaler.drop(['V5','V9','V11','V17','V22','V28'],axis=1)
test_data_scaler = test_data_scaler.drop(['V5','V9','V11','V17','V22','V28'],axis=1)
3.4.4 Feature Correlation
Compute the feature correlations and visualize them as a heatmap.
import seaborn as sns
plt.figure(figsize=(20,16))
column = train_data_scaler.columns.tolist()
mcorr = train_data_scaler[column].corr(method="spearman")
mask = np.zeros_like(mcorr, dtype=bool)  # np.bool is removed in recent NumPy versions
mask[np.triu_indices_from(mask)] = True
cmap = sns.diverging_palette(220,10,as_cmap = True)
g = sns.heatmap(mcorr, mask = mask, cmap = cmap, square = True, annot = True, fmt='0.2f')
plt.show()
3.4.5 Feature Dimensionality Reduction
Compute the correlation coefficients and keep the features whose absolute correlation with the target exceeds 0.1.
mcorr = mcorr.abs()
numerical_corr = mcorr[mcorr['target']>0.1]['target']
print(numerical_corr.sort_values(ascending=False))
output
target 1.000000
V0 0.866709
V1 0.832457
V8 0.799280
V27 0.765133
V31 0.749034
V2 0.630160
V4 0.574775
V12 0.542429
V16 0.510025
V3 0.501114
V37 0.497162
V20 0.420424
V10 0.371067
V24 0.296056
V36 0.287696
V6 0.264778
V15 0.213490
V29 0.198244
V13 0.177317
V19 0.171120
V7 0.164981
V18 0.162710
V30 0.123423
Name: target, dtype: float64
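The listing above only prints the correlations; a minimal sketch (variable names are illustrative) of actually selecting those columns:
index0 = numerical_corr[numerical_corr > 0.1].index           # includes 'target' itself
selected_features = [col for col in index0 if col != 'target']
train_data_selected = train_data_scaler[selected_features + ['target']]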
3.4.7 PCA Processing
Use PCA to remove multicollinearity from the data and reduce its dimensionality. Keep 90% of the variance after PCA and compare the result with the data before the transformation:
from sklearn.decomposition import PCA
pca = PCA(n_components = 0.9)
new_train_pca_90 = pca.fit_transform(train_data_scaler.iloc[:,0:-1])
new_test_pca_90 = pca.transform(test_data_scaler)
new_train_pca_90 = pd.DataFrame(new_train_pca_90)
new_test_pca_90 = pd.DataFrame(new_test_pca_90)
new_train_pca_90['target'] = train_data_scaler['target']
display(new_train_pca_90.describe())
display(train_data_scaler.describe())
Keep 16 principal components with PCA:
pca = PCA(n_components = 16)
new_train_pca_16 = pca.fit_transform(train_data_scaler.iloc[:,0:-1])
new_test_pca_16 = pca.transform(test_data_scaler)
new_train_pca_16 = pd.DataFrame(new_train_pca_16)
new_test_pca_16 = pd.DataFrame(new_test_pca_16)
new_train_pca_16['target'] = train_data_scaler['target']
display(new_train_pca_16.describe())
display(train_data_scaler.describe())
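A quick sanity check (not in the book) of how much variance the 16 retained components explain:
print(pca.explained_variance_ratio_.sum())      # total fraction of variance kept
print(pca.explained_variance_ratio_.cumsum())   # cumulative fraction per component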
4.2 Model Training
4.2.1 Importing the Required Libraries
First, import the Python packages used for model fitting and performance evaluation.
from sklearn.linear_model import LinearRegression  # linear regression
from sklearn.neighbors import KNeighborsRegressor  # K-nearest-neighbors regression
from sklearn.tree import DecisionTreeRegressor  # decision tree regression
from sklearn.ensemble import RandomForestRegressor  # random forest regression
from sklearn.svm import SVR  # support vector regression
import lightgbm as lgb  # LightGBM model
from sklearn.model_selection import train_test_split  # data splitting
from sklearn.metrics import mean_squared_error  # evaluation metric
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
4.2.2 Splitting the Data
Split the training set into 80% for training and 20% for validation.
new_train_pca_16 = new_train_pca_16.fillna(0)
train = new_train_pca_16[new_test_pca_16.columns]  # feature columns only; the test frame has no 'target'
target = new_train_pca_16['target']
train_data, test_data, train_target, test_target = train_test_split(train, target, test_size=0.2, random_state=0)
4.2.3 Multiple Linear Regression
Advantages: simple model, easy to deploy, the regression weights can be used for analysis; fast to train.
Disadvantages: low accuracy; sensitive to collinearity among the features.
Tips: the data need normalization; do some feature selection and avoid keeping highly correlated features together.
Result on this problem: mediocre accuracy, but well suited to analysis.
clf = LinearRegression()
clf.fit(train_data, train_target)
score = mean_squared_error(test_target, clf.predict(test_data))
print("LinearRegression:", score)
LinearRegression: 0.26582236740768944
4.2.4 K-Nearest-Neighbors Regression
Advantages: simple and easy to understand, quick and convenient on small datasets, easy to visualize.
Disadvantages: computationally expensive, so unsuitable for large datasets; requires parameter tuning.
Tips: the features must be normalized; important features can be given extra weight.
Result on this problem: mediocre.
clf = KNeighborsRegressor(n_neighbors=8)
clf.fit(train_data, train_target)
score = mean_squared_error(test_target, clf.predict(test_data))
print("KNeighborsRegressor:", score)
KNeighborsRegressor: 0.27912407507028547
4.2.5 Random Forest Regression
Advantages: easy to use, features need little transformation; relatively high accuracy; the trees train quickly in parallel.
Disadvantages: the results are hard to interpret.
Tips: tune the parameters to improve accuracy.
Result on this problem: a good fit.
clf = RandomForestRegressor(n_estimators=200)
clf.fit(train_data, train_target)
score = mean_squared_error(test_target, clf.predict(test_data))
print("RandomForestRegressor:", score)
RandomForestRegressor: 0.24562419837569202
4.2.6 LightGBM Regression
Advantages: high accuracy.
Disadvantages: long training time, complex model.
Tips: use a proper validation set to prevent overfitting (see the sketch after the code below); search the parameter space.
Result on this problem: well suited.
clf = lgb.LGBMRegressor(
learning_rate = 0.01,
max_depth = -1,
n_estimators = 5000,
boosting_type = 'gbdt',
random_state = 2019,
objective='regression',
)
clf.fit(X=train_data, y=train_target, eval_metric='MSE', verbose=50)
score = mean_squared_error(test_target, clf.predict(test_data))
print("lightGBM:", score)
lightGBM: 0.2309519095788757
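The fit above does not pass a validation set, so eval_metric and verbose have no effect there. A hedged sketch of using the held-out split for early stopping (newer LightGBM versions take callbacks; older ones accept early_stopping_rounds and verbose directly in fit, as in Section 6.2):
clf = lgb.LGBMRegressor(learning_rate=0.01, n_estimators=5000,
                        objective='regression', random_state=2019)
clf.fit(train_data, train_target,
        eval_set=[(test_data, test_target)],
        eval_metric='l2',
        callbacks=[lgb.early_stopping(100), lgb.log_evaluation(50)])
score = mean_squared_error(test_target, clf.predict(test_data, num_iteration=clf.best_iteration_))
print("lightGBM (early stopping):", score)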
5.3 Model Validation
5.3.1 Underfitting and Overfitting
An underfitting example; the code and results are as follows:
from scipy import stats
import warnings
warnings.filterwarnings("ignore")
from sklearn.linear_model import SGDRegressor
clf = SGDRegressor(max_iter=500, tol=1e-2)
clf.fit(train_data, train_target)
score_train = mean_squared_error(train_target, clf.predict(train_data))
score_test = mean_squared_error(test_target, clf.predict(test_data))
print("SGDRegressor train MSE:", score_train)
print("SGDRegressor test MSE:", score_test)
SGDRegressor train MSE: 0.37776579423967643
SGDRegressor test MSE: 0.30333099831254734
An overfitting example; the code and results are as follows:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(5)
train_data_poly = poly.fit_transform(train_data)
test_data_poly = poly.fit_transform(test_data)
clf = SGDRegressor(max_iter = 1000, tol = 1e-3)
clf.fit(train_data_poly, train_target)
score_train = mean_squared_error(train_target, clf.predict(train_data_poly))
score_test = mean_squared_error(test_target, clf.predict(test_data_poly))
print("SGDRegressor train MSE:", score_train)
print("SGDRegressor test MSE:", score_test)
SGDRegressor train MSE: 0.3088568266233788
SGDRegressor test MSE: 0.26365730216068484
A well-fitted example; the code and results are as follows:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(3)
train_data_poly = poly.fit_transform(train_data)
test_data_poly = poly.fit_transform(test_data)
clf = SGDRegressor(max_iter = 1000, tol = 1e-3)
clf.fit(train_data_poly, train_target)
score_train = mean_squared_error(train_target, clf.predict(train_data_poly))
score_test = mean_squared_error(test_target, clf.predict(test_data_poly))
print("SGDRegressor train MSE:", score_train)
print("SGDRegressor test MSE:", score_test)
SGDRegressor train MSE: 0.31246590877931896
SGDRegressor test MSE: 0.26532038403344865
5.3.2 Model Regularization
Regularize the model with an L2 (ridge) penalty:
poly = PolynomialFeatures(3)
train_data_poly = poly.fit_transform(train_data)
test_data_poly = poly.fit_transform(test_data)
clf = SGDRegressor(max_iter=1000, tol=1e-3, penalty='l2', alpha=0.0001)
clf.fit(train_data_poly, train_target)
score_train = mean_squared_error(train_target, clf.predict(train_data_poly))
score_test = mean_squared_error(test_target, clf.predict(test_data_poly))
print("SGDRegressor train MSE:", score_train)
print("SGDRegressor test MSE:", score_test)
SGDRegressor train MSE: 0.31444724414117475
SGDRegressor test MSE: 0.26588746232300303
Regularize the model with an L1 (lasso) penalty:
poly = PolynomialFeatures(3)
train_data_poly = poly.fit_transform(train_data)
test_data_poly = poly.fit_transform(test_data)
clf = SGDRegressor(max_iter=1000, tol=1e-3, penalty='l1', alpha=0.00001)
clf.fit(train_data_poly, train_target)
score_train = mean_squared_error(train_target, clf.predict(train_data_poly))
score_test = mean_squared_error(test_target, clf.predict(test_data_poly))
print("SGDRegressor train MSE:", score_train)
print("SGDRegressor test MSE:", score_test)
SGDRegressor train MSE: 0.3143081286245166
SGDRegressor test MSE: 0.26633882737907394
Regularize the model with an ElasticNet penalty:
poly = PolynomialFeatures(3)
train_data_poly = poly.fit_transform(train_data)
test_data_poly = poly.fit_transform(test_data)
clf = SGDRegressor(max_iter = 1000, tol = 1e-3, penalty='elasticnet', l1_ratio = 0.9, alpha=0.00001)
clf.fit(train_data_poly, train_target)
score_train = mean_squared_error(train_target, clf.predict(train_data_poly))
score_test = mean_squared_error(test_target, clf.predict(test_data_poly))
print("SGDRegressor train MSE:", score_train)
print("SGDRegressor test MSE:", score_test)
SGDRegressor train MSE: 0.3145747479972795
SGDRegressor test MSE: 0.2665089998958983
5.3.3 Cross-Validation
Simple hold-out validation: split the data into a training set and a validation set at a ratio of 8:2.
# simple hold-out validation
from sklearn.model_selection import train_test_split  # data splitting
# split the data: 80% training, 20% validation
train_data, test_data, train_target, test_target = train_test_split(train, target, test_size = 0.2, random_state=0)
clf = SGDRegressor(max_iter = 1000, tol = 1e-3)
clf.fit(train_data, train_target)
score_train = mean_squared_error(train_target, clf.predict(train_data))
score_test = mean_squared_error(test_target, clf.predict(test_data))
print("SGDRegressor train MSE:", score_train)
print("SGDRegressor test MSE:", score_test)
SGDRegressor train MSE: 0.35653456761449837
SGDRegressor test MSE: 0.27929212422296323
K-fold cross-validation with K = 5:
from sklearn.model_selection import KFold
kf = KFold(n_splits = 5)
for k, (train_index, test_index) in enumerate(kf.split(train)):
    train_data, test_data, train_target, test_target = train.values[train_index], train.values[test_index], target[train_index], target[test_index]
    clf = SGDRegressor(max_iter=1000, tol=1e-3)
    clf.fit(train_data, train_target)
    score_train = mean_squared_error(train_target, clf.predict(train_data))
    score_test = mean_squared_error(test_target, clf.predict(test_data))
    print(k, "fold", "SGDRegressor train MSE:", score_train)
    print(k, "fold", "SGDRegressor test MSE:", score_test, '\n')
0 fold SGDRegressor train MSE: 0.3705300016301921
0 fold SGDRegressor test MSE: 0.21367230031317322
1 fold SGDRegressor train MSE: 0.3465093570084947
1 fold SGDRegressor test MSE: 0.3071764501574828
2 fold SGDRegressor train MSE: 0.34798441935505325
2 fold SGDRegressor test MSE: 0.33916831324981905
3 fold SGDRegressor train MSE: 0.3062791836060481
3 fold SGDRegressor test MSE: 0.5373752495243562
4 fold SGDRegressor train MSE: 0.30784894549380654
4 fold SGDRegressor test MSE: 0.49677933559823606
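For comparison, sklearn's cross_val_score wraps the same K-fold loop in a single call; a minimal sketch:
from sklearn.model_selection import cross_val_score
mse_folds = -cross_val_score(SGDRegressor(max_iter=1000, tol=1e-3),
                             train, target, cv=5,
                             scoring='neg_mean_squared_error')
print("5-fold MSE per fold:", mse_folds, "mean:", mse_folds.mean())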
Leave-one-out cross-validation:
from sklearn.model_selection import LeaveOneOut
loo = LeaveOneOut()
num = 100
for k, (train_index, test_index) in enumerate(loo.split(train)):
    train_data, test_data, train_target, test_target = train.values[train_index], train.values[test_index], target[train_index], target[test_index]
    clf = SGDRegressor(max_iter=1000, tol=1e-3)
    clf.fit(train_data, train_target)
    score_train = mean_squared_error(train_target, clf.predict(train_data))
    score_test = mean_squared_error(test_target, clf.predict(test_data))
    print(k, "sample", "SGDRegressor train MSE:", score_train)
    print(k, "sample", "SGDRegressor test MSE:", score_test, '\n')
    if k >= 9:
        break
Leave-P-out cross-validation (P = 10):
from sklearn.model_selection import LeavePOut
lpo = LeavePOut(p=10)
num = 100
for k, (train_index, test_index) in enumerate(lpo.split(train)):
    train_data, test_data, train_target, test_target = train.values[train_index], train.values[test_index], target[train_index], target[test_index]
    clf = SGDRegressor(max_iter=1000, tol=1e-3)
    clf.fit(train_data, train_target)
    score_train = mean_squared_error(train_target, clf.predict(train_data))
    score_test = mean_squared_error(test_target, clf.predict(test_data))
    print(k, "(p=10)", "SGDRegressor train MSE:", score_train)
    print(k, "(p=10)", "SGDRegressor test MSE:", score_test, '\n')
    if k >= 9:
        break
5.3.4 Hyperparameter Space and Tuning
Train a random forest model and tune its hyperparameters with grid search:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split # 切分数据
# split the data: 80% training, 20% validation
train_data,test_data,train_target,test_target=train_test_split(train,target,test_size=0.2,random_state=0)
randomForestRegressor = RandomForestRegressor()
parameters = {'n_estimators':[50, 100, 200],'max_depth':[1, 2, 3]}
clf = GridSearchCV(randomForestRegressor, parameters, cv=5)
clf.fit(train_data, train_target)
score_test = mean_squared_error(test_target, clf.predict(test_data))
print("RandomForestRegressor GridSearchCV test MSE: ", score_test)
sorted(clf.cv_results_.keys())
output
RandomForestRegressor GridSearchCV test MSE: 0.34479414952956544
['mean_fit_time',
'mean_score_time',
'mean_test_score',
'mean_train_score',
'param_max_depth',
'param_n_estimators',
'params',
'rank_test_score',
'split0_test_score',
'split0_train_score',
'split1_test_score',
'split1_train_score',
'split2_test_score',
'split2_train_score',
'split3_test_score',
'split3_train_score',
'split4_test_score',
'split4_train_score',
'std_fit_time',
'std_score_time',
'std_test_score',
'std_train_score']
Train a random forest model and tune its hyperparameters with randomized search:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split # 切分数据
# split the data: 80% training, 20% validation
train_data,test_data,train_target,test_target=train_test_split(train,target,test_size=0.2,random_state=0)
randomForestRegressor = RandomForestRegressor()
parameters = {'n_estimators':[50, 100, 200, 300],'max_depth':[1, 2, 3, 4, 5]}
clf = RandomizedSearchCV(randomForestRegressor, parameters, cv=5)
clf.fit(train_data, train_target)
score_test = mean_squared_error(test_target, clf.predict(test_data))
print("RandomForestRegressor RandomizedSearchCV test MSE: ", score_test)
sorted(clf.cv_results_.keys())
output
RandomForestRegressor RandomizedSearchCV test MSE: 0.2824045787313695
['mean_fit_time',
'mean_score_time',
'mean_test_score',
'mean_train_score',
'param_max_depth',
'param_n_estimators',
'params',
'rank_test_score',
'split0_test_score',
'split0_train_score',
'split1_test_score',
'split1_train_score',
'split2_test_score',
'split2_train_score',
'split3_test_score',
'split3_train_score',
'split4_test_score',
'split4_train_score',
'std_fit_time',
'std_score_time',
'std_test_score',
'std_train_score']
Train a LightGBM model and tune its hyperparameters with grid search:
clf = lgb.LGBMRegressor(num_leaves=31)
parameters = {'learning_rate': [0.01, 0.1, 1],'n_estimators': [20, 40]}
clf = GridSearchCV(clf, parameters, cv=5)
clf.fit(train_data, train_target)
print('Best parameters found by grid search are:', clf.best_params_)
score_test = mean_squared_error(test_target, clf.predict(test_data))
print("LGBMRegressor RandomizedSearchCV test MSE: ", score_test)
5.3.5 Learning Curves and Validation Curves
Plot the learning curve of an SGDRegressor model on the training data.
print(__doc__)
import numpy as np
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import learning_curve
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
    return plt
train_data2 = pd.read_csv('./zhengqi_train.txt',sep='\t')
test_data2 = pd.read_csv('./zhengqi_test.txt',sep='\t')
X = train_data2[test_data2.columns].values
y = train_data2['target'].values
title = "LinearRegression"
# Cross validation with 100 iterations to get smoother mean test and train
# score curves, each time with 20% data randomly selected as a validation set.
cv = model_selection.ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
estimator = SGDRegressor()
plot_learning_curve(estimator, title, X, y, ylim=(0.7, 1.01), cv=cv, n_jobs=-1)
Plot the validation curve of an SGDRegressor model on the training data.
print(__doc__)
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import validation_curve
X = train_data2[test_data2.columns].values
y = train_data2['target'].values
param_range = [0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001]
train_scores, test_scores = validation_curve(
SGDRegressor(max_iter=1000, tol=1e-3, penalty='l1'), X, y, param_name="alpha", param_range=param_range,
cv=10, scoring='r2', n_jobs=1)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
plt.title("Validation Curve with SGDRegressor")
plt.xlabel("alpha")
plt.ylabel("Score")
plt.ylim(0.0, 1.1)
plt.semilogx(param_range, train_scores_mean, label="Training score", color="r")
plt.fill_between(param_range, train_scores_mean - train_scores_std,
train_scores_mean + train_scores_std, alpha=0.2, color="r")
plt.semilogx(param_range, test_scores_mean, label="Cross-validation score",
color="g")
plt.fill_between(param_range, test_scores_mean - test_scores_std,
test_scores_mean + test_scores_std, alpha=0.2, color="g")
plt.legend(loc="best")
plt.show()
6.2 Feature Optimization
import pandas as pd
train_data_file = "./zhengqi_train.txt"
test_data_file = "./zhengqi_test.txt"
train_data = pd.read_csv(train_data_file, sep='\t', encoding='utf-8')
test_data = pd.read_csv(test_data_file, sep='\t', encoding='utf-8')
epsilon=1e-5
# pairwise cross features; more can be defined as needed, e.g. x*x/y, log(x)/y, etc.
func_dict = {
'add': lambda x,y: x+y,
'mins': lambda x,y: x-y,
'div': lambda x,y: x/(y+epsilon),
'multi': lambda x,y: x*y
}
def auto_features_make(train_data, test_data, func_dict, col_list):
    train_data, test_data = train_data.copy(), test_data.copy()
    for col_i in col_list:
        for col_j in col_list:
            for func_name, func in func_dict.items():
                for data in [train_data, test_data]:
                    func_features = func(data[col_i], data[col_j])
                    col_func_features = '-'.join([col_i, func_name, col_j])
                    data[col_func_features] = func_features
    return train_data, test_data
train_data2, test_data2 = auto_features_make(train_data,test_data,func_dict,col_list=test_data.columns)
from sklearn.decomposition import PCA  # principal component analysis
# dimensionality reduction with PCA
pca = PCA(n_components=500)
train_data2_pca = pca.fit_transform(train_data2.iloc[:,0:-1])
test_data2_pca = pca.transform(test_data2)
train_data2_pca = pd.DataFrame(train_data2_pca)
test_data2_pca = pd.DataFrame(test_data2_pca)
train_data2_pca['target'] = train_data2['target']
X_train2 = train_data2[test_data2.columns].values
y_train = train_data2['target']
# LightGBM cross-validation on the expanded feature set
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import numpy as np
# 5-fold cross-validation
Folds = 5
kf = KFold(n_splits=Folds, random_state=2019, shuffle=True)
# record the training and validation MSE
MSE_DICT = {
    'train_mse': [],
    'test_mse': []
}
# offline training and validation
for i, (train_index, test_index) in enumerate(kf.split(X_train2)):
    # LightGBM tree model
    lgb_reg = lgb.LGBMRegressor(
        learning_rate=0.01,
        max_depth=-1,
        n_estimators=5000,
        boosting_type='gbdt',
        random_state=2019,
        objective='regression',
    )
    # split into training and validation folds
    X_train_KFold, X_test_KFold = X_train2[train_index], X_train2[test_index]
    y_train_KFold, y_test_KFold = y_train[train_index], y_train[test_index]
    # train the model
    lgb_reg.fit(
        X=X_train_KFold, y=y_train_KFold,
        eval_set=[(X_train_KFold, y_train_KFold), (X_test_KFold, y_test_KFold)],
        eval_names=['Train', 'Test'],
        early_stopping_rounds=100,
        eval_metric='MSE',
        verbose=50
    )
    # predictions on the training and validation folds
    y_train_KFold_predict = lgb_reg.predict(X_train_KFold, num_iteration=lgb_reg.best_iteration_)
    y_test_KFold_predict = lgb_reg.predict(X_test_KFold, num_iteration=lgb_reg.best_iteration_)
    print('Fold {}: train MSE and validation MSE'.format(i))
    train_mse = mean_squared_error(y_train_KFold_predict, y_train_KFold)
    print('------\n', 'train MSE\n', train_mse, '\n------')
    test_mse = mean_squared_error(y_test_KFold_predict, y_test_KFold)
    print('------\n', 'validation MSE\n', test_mse, '\n------\n')
    MSE_DICT['train_mse'].append(train_mse)
    MSE_DICT['test_mse'].append(test_mse)
print('------\n', 'train MSE\n', MSE_DICT['train_mse'], '\n', np.mean(MSE_DICT['train_mse']), '\n------')
print('------\n', 'validation MSE\n', MSE_DICT['test_mse'], '\n', np.mean(MSE_DICT['test_mse']), '\n------')
output
Training until validation scores don't improve for 100 rounds
[50] Train's l2: 0.413128 Test's l2: 0.455026
[100] Train's l2: 0.198017 Test's l2: 0.245523
[150] Train's l2: 0.108839 Test's l2: 0.164432
[200] Train's l2: 0.0683439 Test's l2: 0.132008
[250] Train's l2: 0.0478354 Test's l2: 0.117217
[300] Train's l2: 0.035836 Test's l2: 0.110572
[350] Train's l2: 0.0279916 Test's l2: 0.10673
[400] Train's l2: 0.0225218 Test's l2: 0.104686
[450] Train's l2: 0.0183733 Test's l2: 0.103133
[500] Train's l2: 0.0151476 Test's l2: 0.102168
[550] Train's l2: 0.012598 Test's l2: 0.101216
[600] Train's l2: 0.0105448 Test's l2: 0.100722
[650] Train's l2: 0.00886925 Test's l2: 0.100606
[700] Train's l2: 0.00751108 Test's l2: 0.100288
[750] Train's l2: 0.00639588 Test's l2: 0.100224
[800] Train's l2: 0.00547284 Test's l2: 0.100142
[850] Train's l2: 0.00469886 Test's l2: 0.0999705
[900] Train's l2: 0.00405206 Test's l2: 0.0997473
[950] Train's l2: 0.00350702 Test's l2: 0.0997148
[1000] Train's l2: 0.00304687 Test's l2: 0.0996172
[1050] Train's l2: 0.00265095 Test's l2: 0.0995466
[1100] Train's l2: 0.00231359 Test's l2: 0.0993888
[1150] Train's l2: 0.00202587 Test's l2: 0.0992934
[1200] Train's l2: 0.00178175 Test's l2: 0.0992655
[1250] Train's l2: 0.0015751 Test's l2: 0.0992336
[1300] Train's l2: 0.00139401 Test's l2: 0.0992164
[1350] Train's l2: 0.00123563 Test's l2: 0.0991488
[1400] Train's l2: 0.00110108 Test's l2: 0.0991156
[1450] Train's l2: 0.000984264 Test's l2: 0.0990345
[1500] Train's l2: 0.000882609 Test's l2: 0.0989801
[1550] Train's l2: 0.000793992 Test's l2: 0.0989664
[1600] Train's l2: 0.000717061 Test's l2: 0.0989519
[1650] Train's l2: 0.000649073 Test's l2: 0.0989394
[1700] Train's l2: 0.00058903 Test's l2: 0.0989571
[1750] Train's l2: 0.000536084 Test's l2: 0.0989557
Early stopping, best iteration is:
[1666] Train's l2: 0.000628941 Test's l2: 0.0989285
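The five fold models can also be averaged on the test set to produce a submission file. A hedged sketch (variable names are illustrative; it reuses X_train2, y_train and the cross-feature frame test_data2 from above, and assumes a LightGBM version that supports callbacks):
X_test2 = test_data2.values  # same column order as X_train2, which was built from test_data2.columns
test_pred = np.zeros(X_test2.shape[0])
kf = KFold(n_splits=5, shuffle=True, random_state=2019)
for train_index, valid_index in kf.split(X_train2):
    reg = lgb.LGBMRegressor(learning_rate=0.01, n_estimators=5000, objective='regression')
    reg.fit(X_train2[train_index], y_train[train_index],
            eval_set=[(X_train2[valid_index], y_train[valid_index])],
            callbacks=[lgb.early_stopping(100), lgb.log_evaluation(0)])
    # average each fold's best-iteration predictions on the test set
    test_pred += reg.predict(X_test2, num_iteration=reg.best_iteration_) / kf.n_splits
np.savetxt('submission.txt', test_pred)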
7.2 Model Fusion
pip install xgboost --user
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
plt.rcParams.update({'figure.max_open_warning': 0})
import seaborn as sns
# modelling
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RepeatedKFold, cross_val_score,cross_val_predict,KFold
from sklearn.metrics import make_scorer,mean_squared_error
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.svm import LinearSVR, SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor,AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import PolynomialFeatures,MinMaxScaler,StandardScaler
#load_dataset
with open("./zhengqi_train.txt") as fr:
data_train=pd.read_table(fr,sep="\t")
with open("./zhengqi_test.txt") as fr_test:
data_test=pd.read_table(fr_test,sep="\t")
#merge train_set and test_set
data_train["oringin"]="train"
data_test["oringin"]="test"
data_all=pd.concat([data_train,data_test],axis=0,ignore_index=True)
data_all.drop(["V5","V9","V11","V17","V22","V28"],axis=1,inplace=True)
# normalise numeric columns
cols_numeric=list(data_all.columns)
cols_numeric.remove("oringin")
def scale_minmax(col):
    return (col - col.min()) / (col.max() - col.min())
scale_cols = [col for col in cols_numeric if col!='target']
data_all[scale_cols] = data_all[scale_cols].apply(scale_minmax,axis=0)
#Box-Cox
cols_transform=data_all.columns[0:-2]
for col in cols_transform:
    # transform the column
    data_all.loc[:, col], _ = stats.boxcox(data_all.loc[:, col] + 1)
# power-transform the target (target1 = 1.5 ** target) to improve normality
sp = data_train.target
data_train.target1 =np.power(1.5,sp)
# function to get training samples
def get_training_data():
    # extract training samples
    from sklearn.model_selection import train_test_split
    df_train = data_all[data_all["oringin"] == "train"]
    df_train["label"] = data_train.target1
    # split the target and the features
    y = df_train.target
    X = df_train.drop(["oringin", "target", "label"], axis=1)
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=100)
    return X_train, X_valid, y_train, y_valid
# extract the test data (without the target)
def get_test_data():
    df_test = data_all[data_all["oringin"] == "test"].reset_index(drop=True)
    return df_test.drop(["oringin", "target"], axis=1)
from sklearn.metrics import make_scorer
# metric for evaluation
def rmse(y_true, y_pred):
    diff = y_pred - y_true
    sum_sq = sum(diff ** 2)
    n = len(y_pred)
    return np.sqrt(sum_sq / n)

def mse(y_true, y_pred):
    return mean_squared_error(y_true, y_pred)
# scorer to be used in sklearn model fitting
rmse_scorer = make_scorer(rmse, greater_is_better=False)
mse_scorer = make_scorer(mse, greater_is_better=False)
def find_outliers(model, X, y, sigma=3):
    # predict y values using the model
    try:
        y_pred = pd.Series(model.predict(X), index=y.index)
    # if predicting fails, try fitting the model first
    except:
        model.fit(X, y)
        y_pred = pd.Series(model.predict(X), index=y.index)
    # calculate residuals between the model prediction and the true y values
    resid = y - y_pred
    mean_resid = resid.mean()
    std_resid = resid.std()
    # calculate the z statistic; define outliers to be where |z| > sigma
    z = (resid - mean_resid) / std_resid
    outliers = z[abs(z) > sigma].index
    return outliers
from sklearn.linear_model import Ridge
X_train, X_valid,y_train,y_valid = get_training_data()
test=get_test_data()
# find and remove outliers using a Ridge model
outliers = find_outliers(Ridge(), X_train, y_train)
# permanently remove these outliers from the data
X_outliers=X_train.loc[outliers]
y_outliers=y_train.loc[outliers]
X_t=X_train.drop(outliers)
y_t=y_train.drop(outliers)
def get_trainning_data_omitoutliers():
    y1 = y_t.copy()
    X1 = X_t.copy()
    return X1, y1
from sklearn.preprocessing import StandardScaler
def train_model(model, param_grid=[], X=[], y=[],
                splits=5, repeats=5):
    # get unmodified training data, unless data to use already specified
    if len(y) == 0:
        X, y = get_trainning_data_omitoutliers()
        #poly_trans=PolynomialFeatures(degree=2)
        #X=poly_trans.fit_transform(X)
        #X=MinMaxScaler().fit_transform(X)
    # create cross-validation method
    rkfold = RepeatedKFold(n_splits=splits, n_repeats=repeats)
    # perform a grid search if param_grid given
    if len(param_grid) > 0:
        # setup grid search parameters
        gsearch = GridSearchCV(model, param_grid, cv=rkfold,
                               scoring="neg_mean_squared_error",
                               verbose=1, return_train_score=True)
        # search the grid
        gsearch.fit(X, y)
        # extract best model from the grid
        model = gsearch.best_estimator_
        best_idx = gsearch.best_index_
        # get cv-scores for best model
        grid_results = pd.DataFrame(gsearch.cv_results_)
        cv_mean = abs(grid_results.loc[best_idx, 'mean_test_score'])
        cv_std = grid_results.loc[best_idx, 'std_test_score']
    # no grid search, just cross-val score for given model
    else:
        grid_results = []
        cv_results = cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=rkfold)
        cv_mean = abs(np.mean(cv_results))
        cv_std = np.std(cv_results)
        # fit on the full training data so the model can be scored below
        model.fit(X, y)
    # combine mean and std cv-score into a pandas series
    cv_score = pd.Series({'mean': cv_mean, 'std': cv_std})
    # predict y using the fitted model
    y_pred = model.predict(X)
    # print stats on model performance
    print('----------------------')
    print(model)
    print('----------------------')
    print('score=', model.score(X, y))
    print('rmse=', rmse(y, y_pred))
    print('mse=', mse(y, y_pred))
    print('cross_val: mean=', cv_mean, ', std=', cv_std)
    # residual plots
    y_pred = pd.Series(y_pred, index=y.index)
    resid = y - y_pred
    mean_resid = resid.mean()
    std_resid = resid.std()
    z = (resid - mean_resid) / std_resid
    n_outliers = sum(abs(z) > 3)
    plt.figure(figsize=(15, 5))
    ax_131 = plt.subplot(1, 3, 1)
    plt.plot(y, y_pred, '.')
    plt.xlabel('y')
    plt.ylabel('y_pred')
    plt.title('corr = {:.3f}'.format(np.corrcoef(y, y_pred)[0][1]))
    ax_132 = plt.subplot(1, 3, 2)
    plt.plot(y, y - y_pred, '.')
    plt.xlabel('y')
    plt.ylabel('y - y_pred')
    plt.title('std resid = {:.3f}'.format(std_resid))
    ax_133 = plt.subplot(1, 3, 3)
    z.plot.hist(bins=50, ax=ax_133)
    plt.xlabel('z')
    plt.title('{:.0f} samples with z>3'.format(n_outliers))
    return model, cv_score, grid_results
# places to store optimal models and scores
opt_models = dict()
score_models = pd.DataFrame(columns=['mean','std'])
# no. k-fold splits
splits=5
# no. k-fold iterations
repeats=5
7.2.5 Prediction with Single Models
Ridge regression
model = 'Ridge'
opt_models[model] = Ridge()
alph_range = np.arange(0.25,6,0.25)
param_grid = {'alpha': alph_range}
opt_models[model],cv_score,grid_results = train_model(opt_models[model], param_grid=param_grid,
splits=splits, repeats=repeats)
cv_score.name = model
score_models = score_models.append(cv_score)
plt.figure()
plt.errorbar(alph_range, abs(grid_results['mean_test_score']),
abs(grid_results['std_test_score'])/np.sqrt(splits*repeats))
plt.xlabel('alpha')
plt.ylabel('score')
output
Fitting 25 folds for each of 23 candidates, totalling 575 fits
[Parallel(n_jobs=1)]: Done 575 out of 575 | elapsed: 43.7s finished
----------------------
Ridge(alpha=0.25, copy_X=True, fit_intercept=True, max_iter=None,
normalize=False, random_state=None, solver='auto', tol=0.001)
----------------------
score= 0.8962818810610906
rmse= 0.31906002611988776
mse= 0.10179930026762334
cross_val: mean= 0.10638412641872626 , std= 0.007268366114488213
Text(0, 0.5, 'score')
Lasso regression
model = 'Lasso'
opt_models[model] = Lasso()
alph_range = np.arange(1e-4,1e-3,4e-5)
param_grid = {'alpha': alph_range}
opt_models[model], cv_score, grid_results = train_model(opt_models[model], param_grid=param_grid,
splits=splits, repeats=repeats)
cv_score.name = model
score_models = score_models.append(cv_score)
plt.figure()
plt.errorbar(alph_range, abs(grid_results['mean_test_score']),abs(grid_results['std_test_score'])/np.sqrt(splits*repeats))
plt.xlabel('alpha')
plt.ylabel('score')
output
Fitting 25 folds for each of 23 candidates, totalling 575 fits
[Parallel(n_jobs=1)]: Done 575 out of 575 | elapsed: 1.5min finished
----------------------
Lasso(alpha=0.0001, copy_X=True, fit_intercept=True, max_iter=1000,
normalize=False, positive=False, precompute=False, random_state=None,
selection='cyclic', tol=0.0001, warm_start=False)
----------------------
score= 0.8964039177441461
rmse= 0.31887226486884535
mse= 0.10167952130258699
cross_val: mean= 0.10608347958156857 , std= 0.006409844698430518
Text(0, 0.5, 'score')
7.2.6 Model Fusion: Boosting
GBDT model
model = 'GradientBoosting'
opt_models[model] = GradientBoostingRegressor()
param_grid = {'n_estimators':[150,250,350],
'max_depth':[1,2,3],
'min_samples_split':[5,6,7]}
opt_models[model], cv_score, grid_results = train_model(opt_models[model], param_grid=param_grid,
splits=splits, repeats=1)
cv_score.name = model
score_models = score_models.append(cv_score)
output
Fitting 5 folds for each of 27 candidates, totalling 135 fits
[Parallel(n_jobs=1)]: Done 135 out of 135 | elapsed: 1.1min finished
----------------------
GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
max_leaf_nodes=None, min_impurity_decrease=0.0,
min_impurity_split=None, min_samples_leaf=1,
min_samples_split=5, min_weight_fraction_leaf=0.0,
n_estimators=250, presort='auto', random_state=None,
subsample=1.0, verbose=0, warm_start=False)
----------------------
score= 0.9687667265046709
rmse= 0.17508697226339623
mse= 0.03065544785636322
cross_val: mean= 0.0975195734183274 , std= 0.005170355408187927
XGBoost model
model = 'XGB'
opt_models[model] = XGBRegressor()
param_grid = {'n_estimators':[100,200,300,400,500],
'max_depth':[1,2,3],
}
opt_models[model], cv_score,grid_results = train_model(opt_models[model], param_grid=param_grid,
splits=splits, repeats=1)
cv_score.name = model
score_models = score_models.append(cv_score)
output
Fitting 5 folds for each of 15 candidates, totalling 75 fits
[Parallel(n_jobs=1)]: Done 75 out of 75 | elapsed: 3.2min finished
----------------------
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
importance_type='gain', interaction_constraints='',
learning_rate=0.300000012, max_delta_step=0, max_depth=3,
min_child_weight=1, missing=nan, monotone_constraints='()',
n_estimators=100, n_jobs=0, num_parallel_tree=1,
objective='reg:squarederror', random_state=0, reg_alpha=0,
reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
validate_parameters=1, verbosity=None)
----------------------
score= 0.97027238602052
rmse= 0.17081464670059526
mse= 0.0291776435274492
cross_val: mean= 0.10242834745407967 , std= 0.010701020260954689
7.2.7 Multi-Model Prediction: Bagging
def model_predict(test_data, test_y=[], stack=False):
    #poly_trans=PolynomialFeatures(degree=2)
    #test_data1=poly_trans.fit_transform(test_data)
    #test_data=MinMaxScaler().fit_transform(test_data)
    i = 0
    y_predict_total = np.zeros((test_data.shape[0],))
    for model in opt_models.keys():
        if model != "LinearSVR" and model != "KNeighbors":
            y_predict = opt_models[model].predict(test_data)
            y_predict_total += y_predict
            i += 1
            if len(test_y) > 0:
                print("{}_mse:".format(model), mean_squared_error(y_predict, test_y))
    y_predict_mean = np.round(y_predict_total / i, 3)
    if len(test_y) > 0:
        print("mean_mse:", mean_squared_error(y_predict_mean, test_y))
    else:
        y_predict_mean = pd.Series(y_predict_mean)
        return y_predict_mean
model_predict(X_valid,y_valid)
output
Ridge_mse: 0.13825688728309005
Lasso_mse: 0.13841411408178492
XGB_mse: 0.1413474866753928
mean_mse: 0.1289751753171857
7.2.8 Multi-Model Fusion: Stacking
A simple model-fusion example (a soft-voting ensemble built with mlxtend):
pip install mlxtend --user
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import itertools
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
## mlxtend is installed with: pip install mlxtend
from mlxtend.classifier import EnsembleVoteClassifier
from mlxtend.data import iris_data
from mlxtend.plotting import plot_decision_regions
%matplotlib inline
# Initializing Classifiers
clf1 = LogisticRegression(random_state=0)
clf2 = RandomForestClassifier(random_state=0)
clf3 = SVC(random_state=0, probability=True)
eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3], weights=[2, 1, 1], voting='soft')
# Loading some example data
X, y = iris_data()
X = X[:,[0, 2]]
# Plotting Decision Regions
gs = gridspec.GridSpec(2, 2)
fig = plt.figure(figsize=(10, 8))
for clf, lab, grd in zip([clf1, clf2, clf3, eclf],
                         ['Logistic Regression', 'Random Forest', 'RBF kernel SVM', 'Ensemble'],
                         itertools.product([0, 1], repeat=2)):
    clf.fit(X, y)
    ax = plt.subplot(gs[grd[0], grd[1]])
    fig = plot_decision_regions(X=X, y=y, clf=clf, legend=2)
    plt.title(lab)
plt.show()
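The mlxtend demo above is a classification voting example. For the steam regression task itself, a hedged sketch (not from the book) of true stacking with sklearn's StackingRegressor, reusing the outlier-free training data X_t, y_t from Section 7.2:
from sklearn.ensemble import StackingRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import cross_val_score

base_models = [
    ('ridge', Ridge(alpha=0.25)),
    ('lasso', Lasso(alpha=1e-4)),
    ('gbdt', GradientBoostingRegressor(n_estimators=250, max_depth=3)),
    ('rf', RandomForestRegressor(n_estimators=200)),
]
# a linear meta-learner combines the base models' out-of-fold predictions
stack = StackingRegressor(estimators=base_models, final_estimator=Ridge(), cv=5)
cv_mse = -cross_val_score(stack, X_t, y_t, scoring='neg_mean_squared_error', cv=5)
print('stacking CV MSE:', cv_mse.mean())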