# 1. Load the data
from sklearn.datasets import load_iris
iris=load_iris()
# 2. Simple statistics and visualization of the data
print(iris.keys())#['data', 'target', 'target_names', 'DESCR', 'feature_names']
print("iris data:",iris.data)
print("iris data:",type(iris.data))# <class 'numpy.ndarray'>
print("iris target:",iris.target)
print("iris targetname:",iris.target_names)
print("iris DESCR:",iris.DESCR)
print("iris features_names:",iris.feature_names)
#['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
# import pandas as pd
# df=pd.DataFrame(iris.data, columns=iris.feature_names)
# print(df)
# 2.1 Plotting
import seaborn as sns
import matplotlib.pyplot as plt
# iris_df=pd.DataFrame(iris.data, columns=iris.feature_names)
# iris_df["target"]=iris.target
# sns.pairplot(iris_df, hue="target")  # pairplot needs a DataFrame, not a Bunch
# plt.show()
# 3. Define features and labels (X and y)
X=iris.data
y=iris.target
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=22)
# 4. Feature engineering: scaling
from sklearn.preprocessing import StandardScaler,MinMaxScaler
sc=MinMaxScaler() # scales each feature to feature_range, default (0, 1)
X_train_std=sc.fit_transform(X_train) # fit on the training set, then transform it
X_test_std=sc.transform(X_test) # reuse the training-set min/max on the test set
# 4.1 Dimensionality reduction with PCA (disabled in this run)
# from sklearn.decomposition import PCA
# pca=PCA(n_components=2)
# pca.fit(X_train_std)
# # print(pca.explained_variance_)#[0.24301994 0.03386828 0.01034326 0.00170887]
# 5. Build the model
from sklearn.tree import DecisionTreeClassifier
dtc=DecisionTreeClassifier(criterion="entropy")
dtc.fit(X_train_std,y_train)
# 6. Predict with the model
y_pred=dtc.predict(X_test_std)
print(y_pred)
# 7. Evaluate the model
print("model in trainset score is:",dtc.score(X_train_std,y_train))
print("model in testset score is:",dtc.score(X_test_std,y_test))
# model in trainset score is: 1.0
# model in testset score is: 0.8666666666666667
# 7b. Save the model
# import joblib  # sklearn.externals.joblib is deprecated; use the joblib package
# joblib.dump(dtc,"dtctree.pkl")
# 8. Visualize the tree
# from sklearn.tree import export_graphviz
# export_graphviz(dtc,filled=True)
# 1. Load the data
from sklearn.datasets import load_iris
iris=load_iris()
# 2. Simple statistics and visualization of the data
print(iris.keys())#['data', 'target', 'target_names', 'DESCR', 'feature_names']
print("iris data:",iris.data)
print("iris data:",type(iris.data))# <class 'numpy.ndarray'>
print("iris target:",iris.target)
print("iris targetname:",iris.target_names)
print("iris DESCR:",iris.DESCR)
print("iris features_names:",iris.feature_names)
#['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
# import pandas as pd
# df=pd.DataFrame(iris.data, columns=iris.feature_names)
# print(df)
# 2.1 Plotting
import seaborn as sns
import matplotlib.pyplot as plt
# iris_df=pd.DataFrame(iris.data, columns=iris.feature_names)
# iris_df["target"]=iris.target
# sns.pairplot(iris_df, hue="target")  # pairplot needs a DataFrame, not a Bunch
# plt.show()
# 3. Define features and labels (X and y)
X=iris.data
y=iris.target
# 4.1 Dimensionality reduction with PCA
# (note: fitting PCA before the train/test split leaks test data into the fit)
from sklearn.decomposition import PCA
pca=PCA(n_components=2)
X=pca.fit_transform(X) # fit the PCA model, then project X onto 2 components
print(":"*50) # separator
print(X)
print(":"*50) # separator
# print(pca.explained_variance_)#[0.24301994 0.03386828 0.01034326 0.00170887]
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=22)
# 4. Feature engineering: scaling
from sklearn.preprocessing import StandardScaler,MinMaxScaler
sc=MinMaxScaler() # scales each feature to feature_range, default (0, 1)
X_train_std=sc.fit_transform(X_train) # fit on the training set, then transform it
X_test_std=sc.transform(X_test) # reuse the training-set min/max on the test set
# 5. Build the model
from sklearn.tree import DecisionTreeClassifier
dtc=DecisionTreeClassifier(criterion="entropy")
dtc.fit(X_train_std,y_train)
# 6. Predict with the model
y_pred=dtc.predict(X_test_std)
print(y_pred)
# 7. Evaluate the model
print("model in trainset score is:",dtc.score(X_train_std,y_train))
print("model in testset score is:",dtc.score(X_test_std,y_test))
# model in trainset score is: 1.0
# model in testset score is: 1.0
# 7b. Save the model
# import joblib  # sklearn.externals.joblib is deprecated; use the joblib package
# joblib.dump(dtc,"dtctree.pkl")
# 8. Visualize the tree
# from sklearn.tree import export_graphviz
# export_graphviz(dtc,filled=True)
Code:
# 1. Load the data
import pandas as pd
import os
datapath=os.path.join(".","tantanic.txt")
titanic=pd.read_csv(datapath)
# Helper that prints basic dataset info and a survival plot
def show_data_info():
    print(titanic.shape) # (1313, 11)
    print(titanic.info())
    import seaborn as sns
    import matplotlib.pyplot as plt
    sns.catplot(x="sex", y="survived", hue="pclass", kind="bar", data=titanic)
    plt.show()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 1313 entries, 0 to 1312
# Data columns (total 11 columns):
# row.names 1313 non-null int64
# pclass 1313 non-null object
# survived 1313 non-null int64
# name 1313 non-null object
# age 633 non-null float64
# embarked 821 non-null object
# home.dest 754 non-null object
# room 77 non-null object
# ticket 69 non-null object
# boat 347 non-null object
# sex 1313 non-null object
# Call the info/plot helper
show_data_info()
# 2. Feature engineering
# 2.1 Select the features
X=titanic[["age","pclass","sex"]].copy()
print(X)
y=titanic["survived"]
# 2.2 Impute missing values in the age column with the mean
X["age"]=X["age"].fillna(X["age"].mean()) # mean age is about 31.194181
print(X)
# 2.3 Split the data
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=22)
# 2.4 Encode categorical variables: DictVectorizer combines label encoding and one-hot encoding
from sklearn.feature_extraction import DictVectorizer
dv=DictVectorizer(sparse=False)
X_train_dv=dv.fit_transform(X_train.to_dict(orient="records"))
X_test_dv=dv.transform(X_test.to_dict(orient="records"))
print(X_train_dv)
print(dv.feature_names_)
# [[45. 0. 0. 1. 1. 0. ]
# [31.19418104 0. 0. 1. 0. 1. ]
# [31.19418104 1. 0. 0. 1. 0. ]
# ...
# [31.19418104 0. 0. 1. 0. 1. ]
# [36. 1. 0. 0. 1. 0. ]
# [31.19418104 0. 0. 1. 0. 1. ]]
# ['age', 'pclass=1st', 'pclass=2nd', 'pclass=3rd', 'sex=female', 'sex=male']
# 3. Build the model
from sklearn.tree import DecisionTreeClassifier
dtc=DecisionTreeClassifier(criterion="gini")
print(dtc.fit(X_train_dv,y_train))
# 4. Predict with the model
y_pred=dtc.predict(X_test_dv)
print(y_pred)
# 5. Evaluate the model
print("model in trainset:",dtc.score(X_train_dv,y_train))
print("model in testset:",dtc.score(X_test_dv,y_test))
# model in trainset: 0.8657142857142858
# model in testset: 0.779467680608365
from sklearn.metrics import confusion_matrix,classification_report
print(confusion_matrix(y_test,y_pred))
# [[152 14]
# [ 44 53]]
print(classification_report(y_test,y_pred))
# 6. Visualize the tree
from sklearn.tree import export_graphviz
export_graphviz(dtc,filled=True,class_names=["no","yes"],
feature_names=["age","pclass_1","pclass_2","pclass_3","sex_1","sex_2"])
Linear regression: supervised learning for continuous-value prediction.
Regression comes in several kinds:
Simple linear regression (a single feature)
Multiple linear regression (several features)
How do we solve for the optimal parameters?
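For ordinary least squares the optimum has a closed form, the normal equations; below is a minimal NumPy sketch (the toy data and names are illustrative, loosely mirroring the miles/deliveries example that follows):
import numpy as np
# Toy data: 10 samples, 2 features
rng = np.random.default_rng(0)
X = rng.uniform(0, 100, size=(10, 2))
y = 0.06 * X[:, 0] + 0.9 * X[:, 1] - 0.7 + rng.normal(0, 0.3, size=10)
# Prepend a column of ones so the intercept w0 is learned together with w1, w2
Xb = np.hstack([np.ones((10, 1)), X])
# Normal equations: w = (X^T X)^{-1} X^T y minimizes ||Xb w - y||^2
w = np.linalg.solve(Xb.T @ Xb, Xb.T @ y)
print("intercept:", w[0], "coefficients:", w[1:])
sklearn's LinearRegression performs this fit for us, as in the example below.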
import pandas as pd
cardata=pd.read_csv("./car.csv")
print(cardata)
# Miles Deliveries Travel Time
# 0 100 4 9.3
# 1 50 3 4.8
# 2 100 4 8.9
# 3 100 2 6.5
# 4 50 2 4.2
# 5 80 2 6.2
# 6 75 3 7.4
# 7 65 4 6.0
# 8 90 3 7.6
# 9 90 2 6.1
X=cardata.drop(labels="Travel_Time",axis=1)
y=cardata.Travel_Time
print(X)
print(type(X)) # <class 'pandas.core.frame.DataFrame'>
print(type(y)) # <class 'pandas.core.series.Series'>
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=22,test_size=0.2)
# print(X_train)
# print(X_test)
# print(y_train)
# print(y_test)
from sklearn.linear_model import LinearRegression
lr=LinearRegression()
lr.fit(X_train,y_train)
# Coefficients
print(lr.coef_) # [0.05909188 0.94214744] ===> w1,w2
# Intercept
print(lr.intercept_) # -0.7296207264957264 ===> w0
# Predict
y_pred=lr.predict(X_test)
print(y_pred)
# Evaluate
print("model in trainset score is:",lr.score(X_train,y_train))
print("model in testset score is:",lr.score(X_test,y_test))
# model in trainset score is: 0.8537581648463358
# model in testset score is: 0.9922035806481854
Code demonstration:
# Load the data
from sklearn.datasets import load_boston
boston=load_boston()
print(boston.keys())#dict_keys(['data', 'target', 'feature_names', 'DESCR'])
print(boston.DESCR) # Number of Instances: 506, Number of Attributes: 13 numeric/categorical predictive
print(boston.data)
print(boston.target)
print(boston.feature_names)
# ['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
# 'B' 'LSTAT']
# Prepare the data
X=boston.data
y=boston.target
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=22,test_size=0.2)
# print(X_train)
# print(y_train)
# Build a GBDT model (any of the regressors below can be swapped in instead)
from sklearn.ensemble import GradientBoostingRegressor
reg=GradientBoostingRegressor()
# Linear regression alternative
# from sklearn.linear_model import LinearRegression
# reg=LinearRegression()
# Decision-tree regressor alternative
# from sklearn.tree import DecisionTreeRegressor
# reg=DecisionTreeRegressor()
reg.fit(X_train,y_train)
# Predict
y_pred=reg.predict(X_test)
print(y_pred)
# Evaluate
print("model in train set score is:",reg.score(X_train,y_train))
print("model in test set score is:",reg.score(X_test,y_test))
# model in train set score is: 0.9782530327891484
# model in test set score is: 0.8442713715157044
# MAE / MSE / R2
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
print("mae:",mean_absolute_error(y_test,y_pred))
print("mse",mean_squared_error(y_test,y_pred))
# mae: 3.4240698961589606
# mse 20.765767538052017
print("r2",r2_score(y_test,y_pred)) #r2 0.7658020514461032
How CART differs from ID3 and C4.5
CART summary:
When growing a classification tree, CART recursively chooses, at each node, the split of the current data that yields the lowest Gini index. ID3 and C4.5 extract as much information as possible from the training samples, but the trees they generate have many branches and grow large; CART's binary splits keep the tree compact and make tree construction more efficient.
For continuous features, CART uses the same thresholding treatment as C4.5. To avoid overfitting, a CART tree is pruned (post-pruning). Prediction is then straightforward: follow the fitted tree by matching feature values down to a leaf, whose class is the predicted label.
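To make the splitting criterion concrete, here is a small illustrative sketch (plain NumPy, hypothetical helper names) of the Gini index for a candidate binary split:
import numpy as np

def gini(labels):
    # Gini impurity of one node: 1 - sum_k p_k^2 over the class proportions p_k
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return 1.0 - np.sum(p ** 2)

def gini_of_split(feature, labels, threshold):
    # Weighted Gini index of the two children produced by a binary split
    left, right = labels[feature <= threshold], labels[feature > threshold]
    n = len(labels)
    return len(left) / n * gini(left) + len(right) / n * gini(right)

y = np.array([0, 0, 0, 1, 1, 1])
x = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
print(gini(y))                  # 0.5 for a 50/50 node
print(gini_of_split(x, y, 3.0)) # 0.0: this threshold separates the classes perfectly
CART scans all features and thresholds and keeps the split with the lowest weighted Gini index.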
The code:
# Grid search selects the best hyperparameters from a given grid; the winning values can then be used in the final model
from sklearn import tree, datasets
from sklearn.model_selection import GridSearchCV
iris = datasets.load_iris()
parameters = {'criterion':('gini', 'entropy'), 'splitter':["random", "best"],
'max_depth': [1,2,3,10,12],
'min_samples_split': (2,3),
'max_features': (1,2,3)}
dtc = tree.DecisionTreeClassifier()
# criterion="gini",
# splitter="best",
# max_depth=None,
# min_samples_split=2,
# min_samples_leaf=1,
# min_weight_fraction_leaf=0.,
# max_features=None,
# random_state=None,
# max_leaf_nodes=None,
# min_impurity_decrease=0.,
clf = GridSearchCV(dtc, parameters, cv=5)
clf.fit(iris.data, iris.target)
print(clf.best_params_)
# {'criterion': 'gini', 'max_depth': 12, 'max_features': 3, 'min_samples_split': 3, 'splitter': 'best'}
print(sorted(clf.cv_results_.keys()))
# cv_results_ holds timing and score statistics such as 'mean_fit_time',
# 'mean_score_time', 'mean_test_score', 'params', 'rank_test_score',
# 'split0_test_score', ..., 'std_test_score', plus one 'param_<name>'
# entry per grid key (here e.g. 'param_criterion', 'param_max_depth').
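Because refit=True by default, the fitted search object doubles as the best model retrained on the full data; a short usage sketch continuing the code above:
print(clf.best_score_)                  # mean cross-validated score of the best setting
best_tree = clf.best_estimator_         # the refitted DecisionTreeClassifier
print(best_tree.predict(iris.data[:5])) # clf.predict(...) proxies to the same model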
CART regression tree in practice: Boston housing code
# 1. Load the data
# 1.1 Import the Boston housing loader from sklearn.datasets
from sklearn.datasets import load_boston
# 1.2 Read the housing data into the variable boston
boston = load_boston()
# 1.3 Print the dataset description and contents
print(boston.DESCR)
print(boston.keys())
print(boston.data)
print(boston.target)
print(boston.feature_names)
# ['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
# 'B' 'LSTAT']
# 2. Import the data splitter from sklearn.model_selection
from sklearn.model_selection import train_test_split
# 2.1 Import numpy as np
import numpy as np
# 2.2 Prepare the data
X = boston.data
y=boston.target
# 2.3 Hold out a random 20% of the data for testing; train on the rest
X_train,X_test,y_train,y_test = train_test_split(X, y, random_state=22, test_size=0.2)
# 2.4 Inspect the spread of the regression target
print("The max target value is : ",np.max(boston.target))
# The max target value is : 50.0
print("The min target value is : ",np.min(boston.target))
# The min target value is : 5.0
print("The average target value is : ",np.mean(boston.target))
# The average target value is : 22.532806324110677
# 3. Feature engineering
# Import the standardization module from sklearn.preprocessing
from sklearn.preprocessing import StandardScaler
# 3.1 One scaler for the features and one for the target
ss_X = StandardScaler()
ss_y = StandardScaler()
# 3.2 Standardize the features and the target for both splits
X_train = ss_X.fit_transform(X_train)
X_test = ss_X.transform(X_test)
y_train = ss_y.fit_transform(y_train.reshape(-1, 1))
y_test = ss_y.transform(y_test.reshape(-1, 1))
# 4. Decision-tree regressor
# 4.1 Import DecisionTreeRegressor from sklearn.tree
from sklearn.tree import DecisionTreeRegressor
# 4.2 Initialize DecisionTreeRegressor with the default configuration
dtr = DecisionTreeRegressor()
# 4.3 Fit the regression tree on the Boston housing training data
dtr.fit(X_train,y_train)
# 4.4 Predict on the test data and store the result in dtr_y_pred
dtr_y_pred = dtr.predict(X_test)
# 4.5 Evaluate the regression tree on the test set with R-squared, MSE and MAE
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
print("R-squared value of DecisionTreeRegressor is : %.2f%%"%(dtr.score(X_test,y_test)*100))
# R-squared value of DecisionTreeRegressor is : 79.60%
# predict() returns a 1-D array, so reshape before undoing the target scaling
print("The mean squared error value of DecisionTreeRegressor is : ",mean_squared_error(ss_y.inverse_transform(y_test),ss_y.inverse_transform(dtr_y_pred.reshape(-1, 1))))
# The mean squared error value of DecisionTreeRegressor is : 18.08392156862745
print("The mean absolute error value of DecisionTreeRegressor is : ",mean_absolute_error(ss_y.inverse_transform(y_test),ss_y.inverse_transform(dtr_y_pred.reshape(-1, 1))))
# The mean absolute error value of DecisionTreeRegressor is : 3.1490196078431376
Random forests in sklearn
# 1. Load the data
from sklearn.datasets import load_iris
iris=load_iris()
# 2. Simple statistics and visualization of the data
print(iris.keys())#['data', 'target', 'target_names', 'DESCR', 'feature_names']
print("iris data:",iris.data)
print("iris data:",type(iris.data))# <class 'numpy.ndarray'>
print("iris target:",iris.target)
print("iris targetname:",iris.target_names)
print("iris DESCR:",iris.DESCR)
print("iris features_names:",iris.feature_names)
#['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
# import pandas as pd
# df=pd.DataFrame(iris.data, columns=iris.feature_names)
# print(df)
# 2.1 Plotting
import seaborn as sns
import matplotlib.pyplot as plt
# iris_df=pd.DataFrame(iris.data, columns=iris.feature_names)
# iris_df["target"]=iris.target
# sns.pairplot(iris_df, hue="target")  # pairplot needs a DataFrame, not a Bunch
# plt.show()
# 3. Define features and labels (X and y)
X=iris.data
y=iris.target
# 4.1 Dimensionality reduction with PCA
# (note: fitting PCA before the train/test split leaks test data into the fit)
from sklearn.decomposition import PCA
pca=PCA(n_components=2)
X=pca.fit_transform(X) # fit the PCA model, then project X onto 2 components
print(":"*50) # separator
print(X)
print(":"*50) # separator
# print(pca.explained_variance_)#[0.24301994 0.03386828 0.01034326 0.00170887]
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=22)
# 4. Feature engineering: scaling
from sklearn.preprocessing import StandardScaler,MinMaxScaler
sc=MinMaxScaler() # scales each feature to feature_range, default (0, 1)
X_train_std=sc.fit_transform(X_train) # fit on the training set, then transform it
X_test_std=sc.transform(X_test) # reuse the training-set min/max on the test set
# 5. Build the model
from sklearn.ensemble import RandomForestClassifier
dtc=RandomForestClassifier()
dtc.fit(X_train_std,y_train)
# 6. Predict with the model
y_pred=dtc.predict(X_test_std)
print(y_pred)
# 7. Evaluate the model
print("model in trainset score is:",dtc.score(X_train_std,y_train))
print("model in testset score is:",dtc.score(X_test_std,y_test))
# model in trainset score is: 1.0
# model in testset score is: 1.0
# 7b. Save the model
# import joblib  # sklearn.externals.joblib is deprecated; use the joblib package
# joblib.dump(dtc,"dtctree.pkl")
# 8. Visualize one base tree from the ensemble
# from sklearn.tree import export_graphviz
# export_graphviz(dtc.estimators_[0],filled=True)
# Load the data
from sklearn.datasets import load_boston
boston=load_boston()
print(boston.keys())#dict_keys(['data', 'target', 'feature_names', 'DESCR'])
print(boston.DESCR) # Number of Instances: 506, Number of Attributes: 13 numeric/categorical predictive
print(boston.data)
print(boston.target)
print(boston.feature_names)
# ['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
# 'B' 'LSTAT']
# Prepare the data
X=boston.data
y=boston.target
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=22,test_size=0.2)
# print(X_train)
# print(y_train)
# Build the model
from sklearn.ensemble import RandomForestRegressor
rfr=RandomForestRegressor()
rfr.fit(X_train,y_train)
# Predict
y_pred=rfr.predict(X_test)
print(y_pred)
# Evaluate
print("model in train set score is:",rfr.score(X_train,y_train))
print("model in test set score is:",rfr.score(X_test,y_test))
# model in train set score is: 0.977707567959114
# model in test set score is: 0.8214221038623839
# MAE / MSE / R2
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
print("mae:",mean_absolute_error(y_test,y_pred))
print("mse",mean_squared_error(y_test,y_pred))
print("r2",r2_score(y_test,y_pred))
# 1. Load the data
from sklearn.datasets import load_iris
iris=load_iris()
# 2. Simple statistics and visualization of the data
print(iris.keys())#['data', 'target', 'target_names', 'DESCR', 'feature_names']
print("iris data:",iris.data)
print("iris data:",type(iris.data))# <class 'numpy.ndarray'>
print("iris target:",iris.target)
print("iris targetname:",iris.target_names)
print("iris DESCR:",iris.DESCR)
print("iris features_names:",iris.feature_names)
#['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
# import pandas as pd
# df=pd.DataFrame(iris.data, columns=iris.feature_names)
# print(df)
# 2.1 Plotting
import seaborn as sns
import matplotlib.pyplot as plt
# iris_df=pd.DataFrame(iris.data, columns=iris.feature_names)
# iris_df["target"]=iris.target
# sns.pairplot(iris_df, hue="target")  # pairplot needs a DataFrame, not a Bunch
# plt.show()
# 3. Define features and labels (X and y)
X=iris.data
y=iris.target
# 4.1 Dimensionality reduction with PCA
# (note: fitting PCA before the train/test split leaks test data into the fit)
from sklearn.decomposition import PCA
pca=PCA(n_components=2)
X=pca.fit_transform(X) # fit the PCA model, then project X onto 2 components
print(":"*50) # separator
print(X)
print(":"*50) # separator
# print(pca.explained_variance_)#[0.24301994 0.03386828 0.01034326 0.00170887]
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=22)
# 4. Feature engineering: scaling
from sklearn.preprocessing import StandardScaler,MinMaxScaler
sc=MinMaxScaler() # scales each feature to feature_range, default (0, 1)
X_train_std=sc.fit_transform(X_train) # fit on the training set, then transform it
X_test_std=sc.transform(X_test) # reuse the training-set min/max on the test set
# 5. Build the model
from sklearn.ensemble import BaggingClassifier
dtc=BaggingClassifier()
dtc.fit(X_train_std,y_train)
# 6. Predict with the model
y_pred=dtc.predict(X_test_std)
print(y_pred)
# 7. Evaluate the model
print("model in trainset score is:",dtc.score(X_train_std,y_train))
print("model in testset score is:",dtc.score(X_test_std,y_test))
# model in trainset score is: 1.0
# model in testset score is: 1.0
# 7b. Save the model
# import joblib  # sklearn.externals.joblib is deprecated; use the joblib package
# joblib.dump(dtc,"dtctree.pkl")
# 8. Visualize one base tree from the ensemble
# from sklearn.tree import export_graphviz
# export_graphviz(dtc.estimators_[0],filled=True)
# Load the data
from sklearn.datasets import load_boston
boston = load_boston()
print(boston.keys()) # dict_keys(['data', 'target', 'feature_names', 'DESCR'])
print(boston.DESCR) # Number of Instances: 506, Number of Attributes: 13 numeric/categorical predictive
print(boston.data)
print(boston.target)
print(boston.feature_names)
# ['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
# 'B' 'LSTAT']
# Prepare the data
X = boston.data
y = boston.target
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=22, test_size=0.2)
# print(X_train)
# print(y_train)
# Build the model
from sklearn.ensemble import BaggingRegressor
br = BaggingRegressor()
br.fit(X_train, y_train)
# Predict
y_pred = br.predict(X_test)
print(y_pred)
# Evaluate
print("model in train set score is:", br.score(X_train, y_train))
print("model in test set score is:", br.score(X_test, y_test))
# model in train set score is: 0.9732165103639384
# model in test set score is: 0.7859351887744748
# MAE / MSE / R2
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
print("mae:", mean_absolute_error(y_test, y_pred))
print("mse", mean_squared_error(y_test, y_pred))
print("r2", r2_score(y_test, y_pred))
**Summary:** a single decision tree overfits easily, and that is exactly where bagging earns its keep. Bagging is an effective way to reduce a model's variance, but it does little to reduce its bias, which is why low-bias learners such as unpruned decision trees are the usual choice of base estimator for the ensemble.
Q: How does bagging reduce model variance?
Overfitting shows up as high variance, so bagging amounts to a remedy for overfitting.
Underfitting shows up as high bias.
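A minimal numeric sketch of that intuition (illustrative NumPy only): averaging B independent noisy predictors divides the variance of the prediction by roughly B while leaving the mean untouched. Real bagged trees are trained on overlapping bootstrap samples and are therefore correlated, so the actual gain is smaller than this idealization.
import numpy as np

rng = np.random.default_rng(0)
true_value = 10.0
# Each "predictor" is the truth plus independent noise with variance 4.0
single = rng.normal(true_value, 2.0, size=10000)
print("single predictor variance:", single.var())  # about 4.0
# A bagged "ensemble" averages B = 25 such predictors per prediction
bagged = rng.normal(true_value, 2.0, size=(10000, 25)).mean(axis=1)
print("bagged average variance:", bagged.var())    # about 4.0 / 25 = 0.16
# Both stay centered on the truth: averaging cuts variance, not bias
print("means:", single.mean(), bagged.mean())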
Boosting: a sequential (serial) learning method that can promote weak learners into a strong learner.
The boosting framework:
How the boosting mechanism works:
A boosting algorithm must answer two questions: how to adjust the training data (the sample weights) in each round, and how to combine the resulting weak learners into one model.
If each weak model is grown along the gradient direction of the loss function, the method is called gradient boosting (GradientBoosting).
A side note:
The optimal constant prediction under a squared-error objective is the mean.
The optimal constant prediction under an absolute-error objective is the median.
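A quick numeric check of this side note (plain NumPy, illustrative values):
import numpy as np

y = np.array([1.0, 2.0, 3.0, 10.0, 14.0])
c = np.linspace(0, 15, 3001)  # candidate constant predictions
mse = ((y[None, :] - c[:, None]) ** 2).mean(axis=1)
mae = np.abs(y[None, :] - c[:, None]).mean(axis=1)
print("MSE minimizer:", c[mse.argmin()], "vs mean:", y.mean())       # both 6.0
print("MAE minimizer:", c[mae.argmin()], "vs median:", np.median(y)) # both 3.0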
AdaBoost: the adaptive boosting algorithm.
AdaBoost **"pays attention to"** the misclassified samples and **"values"** the well-performing weak classifiers:
(1) A different training distribution each round -> adjust the sample weights
(2) "Pays attention to" -> increase the weights of misclassified samples
(3) "Values" -> better classifiers receive larger weights
(4) The sample weights indirectly shape the classifier weights
AdaBoost algorithm steps:
The two core steps of AdaBoost: re-weighting the training samples between rounds, and combining the weak classifiers by
weighted majority voting.
AdaBoost trains on the whole (re-weighted) data set each round; bagging trains on subsets of the data.
AdaBoost's characteristic: it combines several different weak classifiers in a non-random way to improve performance.
AdaBoost math in practice:
First, AdaBoost solves binary classification problems
with the two labels {-1, +1}.
Initialize the sample weights to 1/N.
Choose the base model hm(x) itself, e.g. KNN or a decision tree.
Classification error rate of hm(x) on the weighted training set: em = sum_i( wi * I(hm(xi) != yi) ), where the indicator I is 1 on a mistake and 0 otherwise.
Weight of learner m: am = 1/2 * log( (1 - em) / em ).
Update the sample weight distribution: w(m+1)_i = w(m)_i * exp( -am * yi * hm(xi) ) / Zm, where Zm normalizes the weights.
Linear combination of the base classifiers: f(x) = sum_m( am * hm(x) ).
Final classifier: F(x) = sign( f(x) ).
Where the error-rate condition comes from: 0 < em < 1/2 must hold, so am > 0; em >= 1/2 does not occur, because the weight increases and decreases are redistributed over the sample as a whole, and em cannot fall below 0 either.
AdaBoost's loss function is the exponential loss, chosen because it makes the computations convenient.
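To tie the formulas together, here is a compact from-scratch sketch (illustrative only; it uses single-feature threshold stumps as the weak learner hm rather than the KNN/decision trees mentioned above):
import numpy as np

def adaboost_fit(X, y, n_rounds=10):
    # y must take values in {-1, +1}; start from uniform weights w_i = 1/N
    n = len(y)
    w = np.full(n, 1.0 / n)
    ensemble = []
    for _ in range(n_rounds):
        best = None
        for j in range(X.shape[1]):          # weak learner h_m: best threshold stump
            for t in np.unique(X[:, j]):
                for s in (1, -1):
                    pred = s * np.where(X[:, j] <= t, 1, -1)
                    e = np.sum(w * (pred != y))  # e_m = sum_i w_i I(h_m(x_i) != y_i)
                    if best is None or e < best[0]:
                        best = (e, j, t, s, pred)
        e, j, t, s, pred = best
        a = 0.5 * np.log((1 - e) / max(e, 1e-10))  # a_m = 1/2 log((1-e_m)/e_m)
        w = w * np.exp(-a * y * pred)              # raise the weights of mistakes
        w /= w.sum()                               # the division by Z_m
        ensemble.append((a, j, t, s))
    return ensemble

def adaboost_predict(ensemble, X):
    # F(x) = sign( sum_m a_m h_m(x) )
    f = sum(a * s * np.where(X[:, j] <= t, 1, -1) for a, j, t, s in ensemble)
    return np.sign(f)

# Tiny sanity check on separable toy data
X = np.array([[1.0], [2.0], [3.0], [4.0]])
y = np.array([1, 1, -1, -1])
print(adaboost_predict(adaboost_fit(X, y, n_rounds=5), X))  # [ 1.  1. -1. -1.]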
# 1. Load the data
from sklearn.datasets import load_iris
iris=load_iris()
# 2. Simple statistics and visualization of the data
print(iris.keys())#['data', 'target', 'target_names', 'DESCR', 'feature_names']
print("iris data:",iris.data)
print("iris data:",type(iris.data))# <class 'numpy.ndarray'>
print("iris target:",iris.target)
print("iris targetname:",iris.target_names)
print("iris DESCR:",iris.DESCR)
print("iris features_names:",iris.feature_names)
#['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
# import pandas as pd
# df=pd.DataFrame(iris.data, columns=iris.feature_names)
# print(df)
# 2.1 Plotting
import seaborn as sns
import matplotlib.pyplot as plt
# iris_df=pd.DataFrame(iris.data, columns=iris.feature_names)
# iris_df["target"]=iris.target
# sns.pairplot(iris_df, hue="target")  # pairplot needs a DataFrame, not a Bunch
# plt.show()
# 3. Define features and labels (X and y)
X=iris.data
y=iris.target
# 4.1 Dimensionality reduction with PCA
# (note: fitting PCA before the train/test split leaks test data into the fit)
from sklearn.decomposition import PCA
pca=PCA(n_components=2)
X=pca.fit_transform(X) # fit the PCA model, then project X onto 2 components
print(":"*50) # separator
print(X)
print(":"*50) # separator
# print(pca.explained_variance_)#[0.24301994 0.03386828 0.01034326 0.00170887]
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=22)
# 4. Feature engineering: scaling
from sklearn.preprocessing import StandardScaler,MinMaxScaler
sc=MinMaxScaler() # scales each feature to feature_range, default (0, 1)
X_train_std=sc.fit_transform(X_train) # fit on the training set, then transform it
X_test_std=sc.transform(X_test) # reuse the training-set min/max on the test set
# 5. Build the model
from sklearn.ensemble import AdaBoostClassifier
dtc=AdaBoostClassifier()
dtc.fit(X_train_std,y_train)
# 6. Predict with the model
y_pred=dtc.predict(X_test_std)
print(y_pred)
# 7. Evaluate the model
print("model in trainset score is:",dtc.score(X_train_std,y_train))
print("model in testset score is:",dtc.score(X_test_std,y_test))
# model in trainset score is: 0.95
# model in testset score is: 0.9333333333333333
# 7b. Save the model
# import joblib  # sklearn.externals.joblib is deprecated; use the joblib package
# joblib.dump(dtc,"dtctree.pkl")
# 8. Visualize one base tree from the ensemble
# from sklearn.tree import export_graphviz
# export_graphviz(dtc.estimators_[0],filled=True)
# Load the data
from sklearn.datasets import load_boston
boston = load_boston()
print(boston.keys()) # dict_keys(['data', 'target', 'feature_names', 'DESCR'])
print(boston.DESCR) # Number of Instances: 506, Number of Attributes: 13 numeric/categorical predictive
print(boston.data)
print(boston.target)
print(boston.feature_names)
# ['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
# 'B' 'LSTAT']
# Prepare the data
X = boston.data
y = boston.target
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=22, test_size=0.2)
# print(X_train)
# print(y_train)
# Build the model
from sklearn.ensemble import AdaBoostRegressor
abr = AdaBoostRegressor()
abr.fit(X_train, y_train)
# Predict
y_pred = abr.predict(X_test)
print(y_pred)
# Evaluate
print("model in train set score is:", abr.score(X_train, y_train))
print("model in test set score is:", abr.score(X_test, y_test))
# model in train set score is: 0.9024524541824029
# model in test set score is: 0.8109792865749363
# MAE / MSE / R2
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
print("mae:", mean_absolute_error(y_test, y_pred))
print("mse", mean_squared_error(y_test, y_pred))
print("r2", r2_score(y_test, y_pred))
AdaBoost in the official docs: classification and regression.
import pandas as pd
# wine.data ships without a header row; header=None keeps the first sample from being consumed as column names
df_wine=pd.read_csv("wine.data",sep=",",header=None)
df_wine.columns=["Class_label","Alcohol","Malic acid","Ash",
                 "Alcalinity of ash","Magnesium","Total phenols",
                 "Flavanoids","Nonflavanoid phenols","Proanthocyanins",
                 "Color intensity","Hue","OD280/OD315 of diluted wines","Proline"]
print(df_wine)
print(df_wine.info())
print(df_wine.shape)
print(df_wine.columns)
# (178, 14) with header=None; without it, pandas eats the first row and reports (177, 14)
# Index(['Class_label', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash',
#        'Magnesium', 'Total phenols', 'Flavanoids', 'Nonflavanoid phenols',
#        'Proanthocyanins', 'Color intensity', 'Hue',
#        'OD280/OD315 of diluted wines', 'Proline'],
#       dtype='object')
# Drop class 1 to get a binary problem (classes 2 and 3 remain)
df_wine=df_wine[df_wine["Class_label"]!=1]
X=df_wine[["Alcohol","Hue"]].values
print(type(df_wine[["Alcohol","Hue"]])) # <class 'pandas.core.frame.DataFrame'>
print(type(X)) # <class 'numpy.ndarray'>
y=df_wine["Class_label"].values
print(type(y)) # <class 'numpy.ndarray'>
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.4,random_state=22)
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
tree=DecisionTreeClassifier()
ada=AdaBoostClassifier()
# tree.fit(X_train,y_train)
# Baseline single-tree predictions, kept for the accuracy comparison
# y_train_pred=tree.predict(X_train)
# y_test_pred=tree.predict(X_test)
ada.fit(X_train,y_train)
y_train_pred=ada.predict(X_train)
y_test_pred=ada.predict(X_test)
from sklearn.metrics import accuracy_score
train_acc=accuracy_score(y_train,y_train_pred)
test_acc=accuracy_score(y_test,y_test_pred)
print(train_acc,test_acc)
**Summary:** AdaBoost predicted every training-set label correctly and performs slightly better on the test set than the single decision tree, which overfits the training data more severely. Overall, AdaBoost gives the classifier a modest performance lift, with accuracy close to that of the bagging classifier.
GBDT
XGBoost
LightGBM
XGBoost's model class: an ensemble (a sum) of decision trees.
In XGBoost's regularizer, the L1-style term controls the number of leaf nodes, while the L2 term controls the leaf weights w.
The more complex the model, the more it tends toward high variance; the simpler the model, the more it tends toward high bias.
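A minimal sketch of these knobs, assuming the separate xgboost package is installed (parameter values here are illustrative, not tuned):
from sklearn.datasets import load_boston  # matches the examples above; removed in recent sklearn versions
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

boston = load_boston()
X_train, X_test, y_train, y_test = train_test_split(
    boston.data, boston.target, random_state=22, test_size=0.2)
xgb = XGBRegressor(
    n_estimators=200,  # number of trees in the ensemble
    max_depth=4,       # caps tree complexity directly
    gamma=1.0,         # minimum loss reduction per split: discourages extra leaves
    reg_lambda=1.0,    # L2 penalty on the leaf weights w
    reg_alpha=0.0)     # L1 penalty on the leaf weights
xgb.fit(X_train, y_train)
print("train R2:", xgb.score(X_train, y_train))
print("test R2:", xgb.score(X_test, y_test))
Larger gamma and reg_lambda push toward simpler trees (more bias, less variance); setting them to zero lets the trees grow more complex (less bias, more variance).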
# Load the data
from sklearn.datasets import load_boston
boston=load_boston()
def data_info():
    print(boston.keys()) # dict_keys(['data', 'target', 'feature_names', 'DESCR'])
    print(boston.DESCR) # Number of Instances: 506, Number of Attributes: 13 numeric/categorical predictive
    print(boston.data)
    print(boston.target)
    print(boston.feature_names)
data_info()
# ['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
#  'B' 'LSTAT']
# Prepare the data
X=boston.data
y=boston.target
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=22,test_size=0.2)
# print(X_train)
# print(y_train)
# Build the models and a shared evaluation helper
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
def method_reg(method):
    # Fit whichever estimator was passed in
    method.fit(X_train, y_train)
    # Predict
    y_pred = method.predict(X_test)
    # print(y_pred)
    # Evaluate
    print("model in train set score is:", method.score(X_train, y_train))
    print("model in test set score is:", method.score(X_test, y_test))
    # MAE / MSE / R2
    print("mae:", mean_absolute_error(y_test, y_pred))
    print("mse", mean_squared_error(y_test, y_pred))
    print("r2", r2_score(y_test, y_pred))
print("*"*100)
print("This is GBDT : ")
gbdt = GradientBoostingRegressor()
method_reg(gbdt)
print("*"*100)
print("This is DTR : ")
dtr = DecisionTreeRegressor()
method_reg(dtr)
print("*"*100)
print("This is ADBR : ")
adbr = AdaBoostRegressor()
method_reg(adbr)
print("*"*100)
print("This is RFR : ")
rfr = RandomForestRegressor()
method_reg(rfr)
print("*"*100)
print("This is BR : ")
br = BaggingRegressor()
method_reg(br)