集成算法基本算法主要分为Bagging算法与Boosting算法
Bagging的算法过程
Boosting的算法过程(三个臭皮匠顶个诸葛亮)
Bagging和Boosting的主要区别
典型集成学习算法
本演练针对波士顿房价数据集进行建模和预测。将使用多种模型,包括:线性回归、Adaboost、随机森林、GBDT、XGBoost、LightGBM。
波士顿房价数据集位于【boston_house.csv】, 一共506条数据,每条数据包括14个字段:
链接:https://pan.baidu.com/s/1uuVxPq40281V_3R-i2zwKw?pwd=6688
提取码:6688
# --- Load the Boston housing dataset and split it into train/test sets ---
import pandas as pd
import numpy as np  # fixed typo: was "import numpy as nnp" (alias was never used)
from sklearn.model_selection import train_test_split

data_path = './boston_house.csv'
data = pd.read_csv(data_path)
print(data.head())

X = data.iloc[:, :-1]  # first 13 columns form the feature matrix
y = data.iloc[:, -1]   # last column is the regression target

# Split 8:2 into training and test sets
random_state = 100  # random seed for reproducibility
test_ratio = 0.2
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_ratio, random_state=random_state)
print("训练数据集维度:", X_train.shape)
print("测试数据集维度:", X_test.shape)
# Baseline model: ordinary linear regression on the raw (unscaled) features.
from sklearn.metrics import mean_squared_error  # MSE metric
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report the mean squared error on the held-out test set.
error = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("测试数据集误差:", error)
# Standardize the features (zero mean, unit variance) and refit linear regression.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# Fit the scaler on the training set only, then reuse its statistics
# for the test set so no information leaks from test to train.
X_train_norm = scaler.fit_transform(X_train)
X_test_norm = scaler.transform(X_test)
print(X_train_norm[:5])  # show the first 5 standardized rows

model = LinearRegression()
model.fit(X_train_norm, y_train)
y_pred = model.predict(X_test_norm)

# MSE on the test set after standardization.
error = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("测试数据集误差:", error)
# Random forest: a bagging ensemble of decision trees.
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(random_state=random_state)
model.fit(X_train_norm, y_train)

y_pred = model.predict(X_test_norm)
error = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("测试数据集误差:", error)
# AdaBoost with a linear support-vector regressor as the weak learner.
# NOTE: the `base_estimator` keyword was deprecated in scikit-learn 1.2 and
# removed in 1.4 — the current keyword is `estimator`.
from sklearn.ensemble import AdaBoostRegressor
from sklearn.svm import LinearSVR  # linear support-vector regressor

estimator = LinearSVR()
model = AdaBoostRegressor(estimator=estimator, loss='square', random_state=random_state)
model.fit(X_train_norm, y_train)
y_pred = model.predict(X_test_norm)
error = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("测试数据集误差:", error)
# AdaBoost with a decision tree as the weak learner.
# `estimator` replaces the `base_estimator` keyword that was deprecated in
# scikit-learn 1.2 and removed in 1.4.
from sklearn.tree import DecisionTreeRegressor

estimator = DecisionTreeRegressor()
model = AdaBoostRegressor(estimator=estimator, loss='square', random_state=random_state)
model.fit(X_train_norm, y_train)
y_pred = model.predict(X_test_norm)
error = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("测试数据集误差:", error)
# Gradient-boosted decision trees (GBDT).
from sklearn.ensemble import GradientBoostingRegressor

model = GradientBoostingRegressor(random_state=random_state)
model.fit(X_train_norm, y_train)

y_pred = model.predict(X_test_norm)
error = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("测试数据集误差:", error)
# XGBoost regressor. Install the package first if it is missing:
#   pip install xgboost     (or: conda install xgboost)
from xgboost import XGBRegressor

random_state = 100  # re-seed for reproducibility
model = XGBRegressor(random_state=random_state)
model.fit(X_train_norm, y_train)

y_pred = model.predict(X_test_norm)
error = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("测试数据集误差:", error)
# LightGBM via its native training API (lgb.Dataset + lgb.train).
import lightgbm as lgb

lgb_params = {
    'boosting_type': 'gbdt',
    'objective': 'regression'
}
train_set = lgb.Dataset(X_train_norm, label=y_train)
model = lgb.train(params=lgb_params, train_set=train_set)

y_pred = model.predict(X_test_norm)
error = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("测试数据集误差:", error)
# Feature selection with RFECV (recursive feature elimination with
# cross-validation), then retrain GBDT on the selected feature subset.
import numpy as np
from sklearn.feature_selection import RFECV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor

random_state = 100
tree = DecisionTreeRegressor(random_state=random_state)
rfecv = RFECV(estimator=tree, step=1, cv=5)  # drop one feature per step, 5-fold CV
rfecv.fit(X_train_norm, y_train)
print("选中的特征名称: ", X_train.columns[rfecv.support_])

# Project both sets onto the selected features.
X_train_rfecv = rfecv.transform(X_train_norm)
X_test_rfecv = rfecv.transform(X_test_norm)

model = GradientBoostingRegressor(random_state=random_state)
model.fit(X_train_rfecv, y_train)
y_pred = model.predict(X_test_rfecv)
error = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("测试数据集误差:", error)