from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, random_state=14)
Tree-based models do not need standardization. Standardization rescales the data toward an N(0,1) distribution; many ML algorithms require the training inputs to have zero mean and variances of the same order, e.g., SVM with an RBF kernel and linear regression with L1/L2 regularization.
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
x_train = ss.fit_transform(x_train)  # fit the scaler on the training set, then transform it
x_test = ss.transform(x_test)        # reuse the training-set statistics on the test set
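A quick sanity check of the zero-mean, unit-variance claim above; this is an illustrative snippet assuming x_train is a NumPy array that has just been scaled as shown.
import numpy as np
# each scaled training column should have mean ~0 and std ~1
print(np.round(x_train.mean(axis=0), 6))
print(np.round(x_train.std(axis=0), 6))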
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(x_train, y_train)
lr_y_test_hat = lr.predict(x_test)
lr_score = lr.score(x_test, y_test)  # R^2 on the test set
print("lr:", lr_score)
import numpy as np
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_squared_error
lasso = LassoCV(alphas=np.logspace(-3, 1, 20))  # cross-validate alpha over a log-spaced grid
lasso.fit(x_train, y_train)
lasso_y_test_hat = lasso.predict(x_test)
lasso_score = lasso.score(x_test, y_test)
print("lasso:", lasso_score)
print('lasso_MSE:', mean_squared_error(y_test, lasso_y_test_hat))
from sklearn.linear_model import RidgeCV
ridge = RidgeCV(alphas=np.logspace(-3, 1, 20))
ridge.fit(x_train, y_train)
ridge_y_test_hat = ridge.predict(x_test)
ridge_score = ridge.score(x_test, y_test)
print("ridge:", ridge_score)
print('ridge_MSE:', mean_squared_error(y_test, ridge_y_test_hat))
from sklearn.linear_model import LogisticRegressionCV
lr = LogisticRegressionCV(Cs=np.logspace(-4, 1, 50), fit_intercept=True, penalty='l2', solver='lbfgs', tol=0.01, multi_class='ovr')
lr.fit(X_train, Y_train)
lr_y_predict = lr.predict(X_test)
lr_r = lr.score(X_train, Y_train)  # accuracy on the training set
y1 = lr.predict_proba(X_test)
print("Logistic accuracy:", lr_r)
print("Logistic coefficients:", lr.coef_)
print("Logistic probabilities:", y1.ravel())
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5, algorithm='kd_tree', weights='distance')
knn.fit(X_train, Y_train)
knn_y_predict = knn.predict(X_test)
print('knn score:', knn.score(X_test, Y_test))
print(knn_y_predict)
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(criterion='gini', random_state=1)  # criterion='entropy' is the other option
tree.fit(X_train, Y_train)
Y_test_hat = tree.predict(X_test)
print("tree Score:", tree.score(X_test, Y_test))
Random forest: each tree samples both the rows and the features; rows are drawn at random with replacement (bootstrap), a random subset of features is considered, and the m trees then decide by majority vote, as in the sketch below.
Pros: random sampling copes well with high-dimensional data, and the trees can be trained in parallel. Cons: less interpretable than a single tree, and it can overfit noisy data.
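To make the bootstrap-plus-voting mechanism concrete, here is a minimal from-scratch sketch (not sklearn's implementation); n_trees, forest, and rng are illustrative names, and X_train, Y_train, X_test are assumed to be NumPy arrays.
import numpy as np
from collections import Counter
from sklearn.tree import DecisionTreeClassifier

rng = np.random.default_rng(0)
n_trees = 50
forest = []
for i in range(n_trees):
    idx = rng.integers(0, len(X_train), size=len(X_train))           # bootstrap: sample rows with replacement
    t = DecisionTreeClassifier(max_features='sqrt', random_state=i)  # random feature subset at each split
    t.fit(X_train[idx], Y_train[idx])
    forest.append(t)
votes = np.stack([t.predict(X_test) for t in forest])                # shape: (n_trees, n_test_samples)
y_vote = np.array([Counter(col).most_common(1)[0][0] for col in votes.T])  # majority vote per sample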
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
rf = RandomForestClassifier(n_estimators=50, criterion='gini', max_depth=1, random_state=0)  # max_depth=1 builds stumps; increase for stronger trees
rf.fit(X_train, Y_train)
rf_y_predict = rf.predict(X_test)
print('forest score:',rf.score(X_test, Y_test))
from sklearn.ensemble import BaggingRegressor
bg = BaggingRegressor(LinearRegression(), n_estimators=100, max_samples=0.7, max_features=0.8, random_state=28)
bg.fit(x_train, y_train)
bg_y_test_hat = bg.predict(x_test)
bg_score = bg.score(x_test, y_test)
print("Bagging:", bg_score)
from sklearn.ensemble import AdaBoostRegressor
abr = AdaBoostRegressor(LinearRegression(), n_estimators=100, learning_rate=0.0001, random_state=28)
abr.fit(x_train, y_train)
abr_y_test_hat = abr.predict(x_test)
abr_score = abr.score(x_test, y_test)
print ("Bagging:", abr_score)
GBDT uses CART as the base learner: the residual left by the trees so far becomes the fitting target for the next tree, so each new tree fits the residual and the final prediction is the accumulated sum over all trees (see the sketch below).
Pros: captures strong nonlinearities, so little heavy feature engineering is needed. Cons: trees must be trained sequentially (hard to parallelize), it is more complex, and it is not well suited to high-dimensional sparse data.
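A minimal sketch of the residual-fitting loop described above, using squared loss and a constant initial prediction; n_rounds and lr_rate are illustrative names, and x_train, y_train, x_test are assumed to be NumPy arrays.
import numpy as np
from sklearn.tree import DecisionTreeRegressor

n_rounds, lr_rate = 50, 0.1
f = np.full(len(y_train), np.mean(y_train))   # start from the mean prediction
stages = []
for _ in range(n_rounds):
    residual = y_train - f                    # negative gradient of squared loss
    t = DecisionTreeRegressor(max_depth=3, random_state=0)
    t.fit(x_train, residual)                  # the next tree fits the current residual
    f += lr_rate * t.predict(x_train)         # accumulate the shrunken update
    stages.append(t)
# final prediction = initial constant + accumulated residual trees
y_hat = np.mean(y_train) + lr_rate * sum(t.predict(x_test) for t in stages)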
from sklearn.ensemble import GradientBoostingRegressor
gbdt = GradientBoostingRegressor(n_estimators=50, learning_rate=0.1, random_state=28)
gbdt.fit(x_train, y_train)
gbdt_y_test_hat = gbdt.predict(x_test)
gbdt_score = gbdt.score(x_test, y_test)
print ("Bagging:", gbdt_score)
LightGBM is a fast, distributed, high-performance gradient boosting framework based on decision trees, developed by Microsoft.
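A minimal usage sketch via LightGBM's scikit-learn wrapper, assuming the lightgbm package is installed; the hyperparameter values are illustrative, not from the original notes.
import lightgbm as lgb

# LGBMRegressor follows the familiar fit/predict/score API
lgbm = lgb.LGBMRegressor(n_estimators=100, learning_rate=0.1, random_state=28)
lgbm.fit(x_train, y_train)
print('lightgbm:', lgbm.score(x_test, y_test))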