from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
wine = load_wine()
feature = wine.data
target = wine.target
x_train, x_test, y_train, y_test = train_test_split(feature, target, test_size=0.2, random_state=2021)
clf = DecisionTreeClassifier(criterion='entropy')
clf.fit(x_train, y_train)
print(clf.score(x_test, y_test))
clf = DecisionTreeClassifier(criterion='entropy', random_state=2021)
clf.fit(x_train, y_train)
print(clf.score(x_test, y_test))
from sklearn import tree
import graphviz
feature_name = ['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids',
                'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue',
                'od280/od315_of_diluted_wines', 'proline']
dot_data = tree.export_graphviz(clf,  # the trained decision tree model
                                out_file=None,  # return the DOT source as a string instead of writing a file
                                feature_names=feature_name,  # feature names to display
                                class_names=["Gin", "Sherry", "Vermouth"],  # class labels
                                filled=True  # color the nodes by predicted class
                                )
graph = graphviz.Source(dot_data)
print(graph)  # prints the DOT source; graph.render() or graph.view() would produce the image
print(clf.feature_importances_)  # feature importances
print([*zip(feature_name, clf.feature_importances_)])
cll = DecisionTreeClassifier(criterion='entropy', random_state=2021, splitter='random')
cll.fit(x_train, y_train)
print(cll.score(x_test, y_test))
import graphviz
from sklearn.tree import DecisionTreeClassifier
clf1 = DecisionTreeClassifier(criterion='entropy', random_state=2021,
                              max_depth=4,  # the tree may grow at most 4 levels (not counting the root); deeper branches are pruned
                              min_samples_leaf=10,  # every leaf created by a split must contain at least 10 samples, otherwise the split is not made
                              min_samples_split=20  # a node must contain at least 20 samples before it is allowed to split
                              )
clf1.fit(x_train, y_train)
dot_datat = tree.export_graphviz(clf1, feature_names=feature_name, class_names=["Gin", "Sherry", "Vermouth"],
                                 filled=True, out_file=None)
grapht = graphviz.Source(dot_datat)
print(grapht)  # again, this prints the DOT source
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
data = pd.read_csv("titanic.csv", index_col='PassengerId')
print(data.head())
print(data.info())
# Drop features that have many missing values or are irrelevant
data.drop(labels=['Cabin', 'Name', 'Ticket'], inplace=True, axis=1)
# Fill the missing values in the Age column
data["Age"] = data["Age"].fillna(data["Age"].mean())
# Drop the remaining rows that contain null values
data.dropna(inplace=True)
print(data.head())
# Convert Sex into a numeric feature
data['Sex'] = (data['Sex'] == 'male').astype("int")
print(data.head())
# Convert the three-category Embarked variable into numeric codes
labels = data['Embarked'].unique().tolist()
data['Embarked'] = data['Embarked'].map(lambda x: labels.index(x))
x = data.iloc[:, data.columns != 'Survived']
y = data.iloc[:, data.columns == 'Survived']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=2021)
clf = DecisionTreeClassifier(criterion='gini', max_depth=7, min_samples_leaf=11, splitter='best', random_state=25)
clf.fit(x_train, y_train)
print(clf.score(x_test, y_test))
from sklearn.model_selection import GridSearchCV
parameters = {'splitter': ('best', 'random'), 'criterion': ('gini', 'entropy'), 'max_depth': [*range(1, 10)],
'min_samples_leaf': [*range(1, 50, 5)]
}
GS = GridSearchCV(clf, parameters, cv=10)  # cv is the number of cross-validation folds
GS.fit(x_train, y_train)
print(GS.best_params_)
# print(GS.best_estimator_)
print(GS.best_score_)
A random forest is a supervised learning algorithm: an ensemble method that uses decision trees as its base learners. Random forests are simple, easy to implement and cheap to train, yet they deliver surprisingly strong performance on both classification and regression, which is why they are often described as a method that represents the state of the art of ensemble learning.
Where does the randomness in a random forest come from? From two places: each tree is trained on a random bootstrap sample of the rows, and each split only considers a random subset of the features.
Key roles of a random forest: besides prediction, it provides feature-importance estimates and an out-of-bag estimate of generalization error (both appear later in this section).
How a random forest is built: draw a bootstrap sample of the training data for each tree, grow each tree while choosing splits from random feature subsets, and aggregate the trees by majority vote (classification) or averaging (regression); a minimal sketch follows after the list of pros and cons below.
Advantages:
1. As an ensemble, its accuracy is usually better than most single models.
2. The two sources of randomness (random samples, random features) make it hard for a random forest to overfit.
3. The same two sources of randomness give it a degree of robustness to noise, which is an advantage over many other algorithms in practice.
4. It can handle very high-dimensional data (many features) without explicit feature selection, and adapts well to different datasets: it works with both discrete and continuous features.
5. During training it can pick up interactions between features and it yields feature-importance scores, which are a useful reference.
Disadvantages:
1. When the forest contains many trees, training takes considerable time and memory.
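To make the construction process above concrete, here is a minimal bagging-style sketch built only from sklearn's DecisionTreeClassifier. The names n_trees and simple_forest_predict are invented for this illustration; the real RandomForestClassifier adds more machinery (out-of-bag scoring, parallel training, etc.), but the bootstrap rows plus random feature subsets plus majority vote shown here are the core idea.
import numpy as np
from sklearn.datasets import load_wine
from sklearn.tree import DecisionTreeClassifier
X, y = load_wine(return_X_y=True)
rng = np.random.RandomState(0)
n_trees = 25
trees = []
for _ in range(n_trees):
    # each tree is fit on a bootstrap sample of the rows (sampling with replacement)
    idx = rng.randint(0, X.shape[0], X.shape[0])
    # max_features='sqrt' supplies the per-split feature randomness
    t = DecisionTreeClassifier(max_features='sqrt', random_state=rng.randint(10 ** 6))
    trees.append(t.fit(X[idx], y[idx]))
def simple_forest_predict(X_new):
    # aggregate the trees by majority vote
    votes = np.stack([t.predict(X_new) for t in trees])
    return np.apply_along_axis(lambda col: np.bincount(col).argmax(), 0, votes)
print((simple_forest_predict(X) == y).mean())  # training accuracy of the toy forest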
The Random Forest classifier and regressor are both available in sklearn.ensemble:
API: from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
RandomForestClassifier  # classification
RandomForestRegressor  # regression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
x = load_wine().data
y = load_wine().target
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2021)
# Logistic regression
l = LogisticRegression(solver='liblinear').fit(x_train, y_train)  # solver selects the optimization algorithm
print(f1_score(y_test, l.predict(x_test), average='micro'))
# Decision tree
d = DecisionTreeClassifier().fit(x_train, y_train)
print(f1_score(y_test, d.predict(x_test), average='micro'))
# Random forest
r = RandomForestClassifier().fit(x_train, y_train)
print(f1_score(y_test, r.predict(x_test), average='micro'))
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
rfc = RandomForestClassifier(n_estimators=25)
rfc_s = cross_val_score(rfc, wine.data, wine.target, cv=10)
clf = DecisionTreeClassifier()
clf_s = cross_val_score(clf, wine.data, wine.target, cv=10)
plt.plot(range(1, 11), rfc_s, label="RandomForest")
plt.plot(range(1, 11), clf_s, label="DecisionTree")
plt.legend()
plt.show()
rfc_l = []
clf_l = []
for i in range(10):
    rfc = RandomForestClassifier(n_estimators=25)
    rfc_s = cross_val_score(rfc, x, y, cv=10).mean()
    rfc_l.append(rfc_s)
    clf = DecisionTreeClassifier()
    clf_s = cross_val_score(clf, x, y, cv=10).mean()
    clf_l.append(clf_s)
plt.plot(range(1, 11), rfc_l, label="RandomForest")
plt.plot(range(1, 11), clf_l, label="DecisionTree")
plt.legend()
plt.show()
# With oob_score=True the out-of-bag samples act as a validation set, so there is no need for a separate train/test split
rfc = RandomForestClassifier(n_estimators=25, oob_score=True)
rfc = rfc.fit(x, y)
print(rfc.oob_score_)
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston  # note: load_boston was removed in scikit-learn 1.2, so this requires an older version
x = load_boston().data
y = load_boston().target
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2021)
d = DecisionTreeRegressor(criterion='friedman_mse').fit(x_train, y_train)
print(d.score(x_test, y_test))
r = RandomForestRegressor(criterion='friedman_mse').fit(x_train, y_train)
print(r.score(x_test, y_test))
from xgboost import XGBRegressor as xgbr
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import KFold, cross_val_score, train_test_split
data = load_boston()
x = data.data
y = data.target
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=420)
# xgb
reg = xgbr(n_estimators=100).fit(x_train, y_train)
print(reg.score(x_test, y_test))
print(MSE(y_test, reg.predict(x_test)))
print(cross_val_score(reg, x_train, y_train, cv=5).mean())
# Random forest
rfr = RFR(n_estimators=100).fit(x_train, y_train)
print(rfr.score(x_test, y_test))
print(MSE(y_test, rfr.predict((x_test))))
print(cross_val_score(rfr, x_train, y_train, cv=5).mean())
model_count = []
scores = []
for i in range(50, 170):
    xgb = xgbr(n_estimators=i).fit(x_train, y_train)
    score = xgb.score(x_test, y_test)
    scores.append(score)
    model_count.append(i)
import matplotlib.pyplot as plt
plt.plot(model_count, scores)
plt.show()
import numpy as np
subs = []
scores = []
for i in np.linspace(0.05, 1, 20):
    xgb = xgbr(n_estimators=182, subsample=i).fit(x_train, y_train)
    score = xgb.score(x_test, y_test)
    subs.append(i)
    scores.append(score)
plt.plot(subs, scores)
plt.show()
rates = []
scoresr = []
for i in np.linspace(0.05, 1, 20):
    xgb = xgbr(n_estimators=182, subsample=0.9, learning_rate=i).fit(x_train, y_train)
    scorer = xgb.score(x_test, y_test)
    rates.append(i)
    scoresr.append(scorer)
plt.plot(rates, scoresr)
plt.show()
for booster in ["gbtree", "gblinear", "dart"]:
    reg = xgbr(n_estimators=180, learning_rate=0.1, random_state=420, booster=booster).fit(x_train, y_train)
    print(booster)
    print(reg.score(x_test, y_test))
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
# Create synthetic data
# n_samples=500: the data has 500 rows
# n_features=2: the data has 2 feature dimensions
# centers=4: the data has 4 true classes
x, y = make_blobs(n_samples=500, n_features=2, centers=4, random_state=10)
print(x.shape)
print(y.shape)
# Plot the original labeled samples in a scatter plot, one color per class
color = ['red', 'pink', 'orange', 'gray']
fig, ax1 = plt.subplots(1)  # create a single axes
for i in range(4):
    # take columns 0 and 1 of the rows in x whose label y == i; s is the marker size
    ax1.scatter(x[y == i, 0], x[y == i, 1], c=color[i], s=8)
plt.show()
# Cluster the data and compare against the original classes
# Fit a model with 4 clusters
cluster = KMeans(n_clusters=4)
cluster.fit(x)
# Important attribute labels_: the cluster assigned to each sample after clustering
y_pred = cluster.labels_
# Important attribute cluster_centers_: the centroids
cluster.cluster_centers_
# Important attribute inertia_: the total within-cluster sum of squared distances
inertia = cluster.inertia_
color = ['red', 'pink', 'orange', 'gray']
fig, ax1 = plt.subplots(1)  # create a single axes
for i in range(4):
    # take columns 0 and 1 of the rows in x whose predicted cluster y_pred == i; s is the marker size
    ax1.scatter(x[y_pred == i, 0], x[y_pred == i, 1], c=color[i], s=8)
plt.show()
# Use only the first 200 samples to find the centroids
c = KMeans(n_clusters=4, random_state=10)
c.fit(x[0:200])
print(c.predict(x[200:]))  # like labels_, this gives the cluster each sample is assigned to
# What if we guess 5 clusters instead?
clustera = KMeans(n_clusters=5)
clustera.fit(x)
print(clustera.labels_)
print(clustera.inertia_)
# What if we guess 6 clusters instead?
clusterb = KMeans(n_clusters=6)
clusterb.fit(x)
print(clusterb.inertia_)
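Because inertia_ keeps shrinking as n_clusters grows, it cannot be compared directly across different cluster counts. The following elbow-plot loop is an added illustration (not part of the original notes) that makes this visible before we turn to the silhouette coefficient:
inertias = []
for k in range(1, 11):
    inertias.append(KMeans(n_clusters=k, random_state=10).fit(x).inertia_)
plt.plot(range(1, 11), inertias, marker='o')  # look for the "elbow" where the curve flattens
plt.xlabel('n_clusters')
plt.ylabel('inertia_')
plt.show()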
from sklearn.metrics import silhouette_score, silhouette_samples
# silhouette_score returns the mean silhouette coefficient over all samples in the dataset
print(silhouette_score(x, labels=cluster.labels_))  # cluster.labels_ is the cluster assigned to each sample
# silhouette_samples returns the silhouette coefficient of each individual sample
print(x.shape[0])
print(silhouette_samples(x, cluster.labels_))
print(silhouette_samples(x, cluster.labels_).sum() / x.shape[0])
Similarly, for any red point x_r we can write w·x_r + b = r for some r < 0, just as a purple point x_p on the other side satisfies w·x_p + b = p with p > 0.
As we said before, on either side of the decision boundary there are two hyperplanes, which in two-dimensional space are simply two parallel lines (our dashed hyperplanes), and the distance between them is the margin d. The decision boundary lies exactly halfway between them, so the two parallel lines must be symmetric about it. After rescaling w and b we can write these two parallel lines as w·x + b = 1 and w·x + b = -1.
Support vectors: the samples that lie exactly on these two hyperplanes are the support vectors.
As the figure shows, (x_p - x_r) is the vector joining the two points, and the margin d is parallel to w, so we effectively have the hypotenuse of a triangle and know the direction of one of its legs. From linear algebra we have the property that the length of the projection of a vector v onto the direction of w is v·w / ||w||.
Subtracting the two hyperplane equations gives w·(x_p - x_r) = 2, and dividing both sides by ||w|| gives d = w·(x_p - x_r) / ||w|| = 2 / ||w|| (the extra division by ||w|| just turns w into a unit vector and simplifies the formula; it changes the scale but nothing essential).
Remember what we want: the decision boundary with the largest margin. The problem is now simple: to maximize d = 2/||w|| we minimize ||w||, and minimizing ||w|| can be converted into minimizing the function f(w) = ||w||² / 2.
If w·x_i + b >= 1 and y_i = 1, then y_i(w·x_i + b) >= 1; and if w·x_i + b <= -1 and y_i = -1, then y_i(w·x_i + b) >= 1 as well, so both cases collapse into the single constraint y_i(w·x_i + b) >= 1.
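Putting these pieces together, the hard-margin SVM is the constrained optimization problem below (the standard formulation, written in LaTeX; N is the number of training samples):
\min_{w,\,b} \; \frac{\lVert w \rVert^{2}}{2}
\quad \text{s.t.} \quad y_i \,(w \cdot x_i + b) \ge 1, \qquad i = 1, \dots, N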
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn.svm import SVC, SVR  # SVC for classification, SVR for regression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
x = load_breast_cancer().data
y = load_breast_cancer().target
print(x.shape)
print(y.shape)
# Use PCA to reduce the features to 2 dimensions and check whether the data looks linearly separable
pca = PCA(n_components=2)  # n_components=2 reduces the data to 2 dimensions
pca_x = pca.fit_transform(x)
print(pca_x.shape)
# Scatter plot to check whether the data is linearly separable
plt.scatter(pca_x[:, 0], pca_x[:, 1], c=y)
plt.show()
# The plot suggests the data is roughly, though not perfectly, linearly separable
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2021)
# C is the regularization strength (to control overfitting), kernel selects the kernel function, class_weight handles class imbalance
s = SVC(C=1, kernel='linear').fit(x_train, y_train)
print(s.score(x_test, y_test))
s1 = SVC(C=1, kernel='poly').fit(x_train, y_train)
print(s1.score(x_test, y_test))
s2 = SVC(C=1, kernel='rbf').fit(x_train, y_train)
print(s2.score(x_test, y_test))
1. Load the data and take a first look
2. Get an overview of the dataset
3. Use describe and matplotlib to inspect the summary statistics (bar charts)
4. Handle missing values
5. Look at the distribution of the target
6. Look at the feature distributions
7. Examine the feature-to-feature correlations (heat map)
8. Examine the feature-to-target correlations: the stronger the positive or negative correlation, the higher the feature's weight on the result and the more important the feature
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
data = pd.read_csv('Energy.csv')
print(data.head(1))
# Check the shape of the data
print(data.shape)
# Check the column dtypes and whether there are missing values
print(data.info())
# First replace "Not Available" with np.nan
data = data.replace({'Not Available': np.nan})
print(data.info())
# Some columns hold numeric values but are stored as str, so convert those columns to float
for col in list(data.columns):
    if ('ft²' in col or 'kBtu' in col or 'Metric Tons CO2e' in col or 'kWh' in
            col or 'therms' in col or 'gal' in col or 'Score' in col):
        data[col] = data[col].astype(float)
print(data.describe())
# Use describe and matplotlib to visualize the summary statistics (bar charts)
data_desc = data.describe()  # summary statistics
cols = data_desc.columns  # column index
index = data_desc.index[1:]  # drop the count row
plt.figure(figsize=(30, 30))  # control the figure size
for i in range(len(cols)):
    ax = plt.subplot(10, 6, i + 1)  # a 10x6 grid of subplots; the data currently has about 60 numeric features
    ax.set_title(cols[i])  # set the subplot title
    for j in range(len(index)):
        plt.bar(index[j], data_desc.loc[index[j], cols[i]])  # bar chart of the describe statistics for each feature
plt.show()
# The plot for Order looks normal: min, median and max are well spread, and mean and std look reasonable
# The plot for DOF Gross Floor Area may be problematic: the max is much larger than everything else (far from the mean, an outlier); if only a few rows take such values, consider dropping them
# Observation: the latitude and longitude features have an extremely low std and very uniform values, so they contribute almost nothing to the target; consider filtering these features out
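A quick way to act on that last observation (this snippet is an added illustration, not part of the original analysis) is to rank the numeric columns by standard deviation and inspect the near-constant ones:
num_std = data.select_dtypes('number').std().sort_values()
print(num_std.head(10))  # columns with near-zero std are candidates for removal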
# Look at the missing values
def missing_values_table(df):
    # Count the missing values in each column
    mis_val = df.isnull().sum(axis=0)
    # Percentage of missing values in each column
    mis_val_percent = 100 * mis_val / df.shape[0]
    # Concatenate the counts and the percentages into a new table
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
    # Rename the columns of that table
    mis_val_table_ren_columns = mis_val_table.rename(columns={0: 'Missing Values', 1: '% of Total Values'})
    # Keep only the rows with a non-zero percentage and sort them in descending order
    mis_val_table_ren_columns = mis_val_table_ren_columns[mis_val_table_ren_columns.iloc[:, 1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
    # Print a summary
    print('Your selected dataframe has ' + str(df.shape[1]) + ' columns.\n'
          'There are ' + str(mis_val_table_ren_columns.shape[0]) + ' columns that have missing values.')
    # Return the dataframe with missing information
    return mis_val_table_ren_columns
missing_df = missing_values_table(data)
print(missing_df.head(3))
# Set a threshold: drop the columns whose missing-value ratio exceeds 50%
# Find the columns above the threshold
missing_df = missing_values_table(data)
missing_columns = list(missing_df.loc[missing_df['% of Total Values'] > 50].index)
print('We will remove %d columns.' % len(missing_columns))
data = data.drop(columns=list(missing_columns))
# print(data)
# Fill the remaining missing values with the column median; np.median returns nan if the input contains nan, so compute it over the non-null entries only
for x in data.columns:
    # Skip object-dtype columns (no median is defined for them)
    if str(data[x].dtypes) == 'object':
        continue
    if data[x].isnull().sum() > 0:
        # Take the non-null entries of the column, compute their median and use it to fill the nulls
        data[x] = data[x].fillna(value=np.median(data.loc[~data[x].isnull(), x]))
# Look at the distribution of the target
data['ENERGY STAR Score'].hist(bins=20)
plt.figure(figsize=(40, 20))
plt.scatter(data['ENERGY STAR Score'].index, data['ENERGY STAR Score'].values)
# The scatter plot shows a dense horizontal band around the low 60s, i.e. the scores are concentrated there; no obvious outliers stand out
plt.show()
print(data['ENERGY STAR Score'].value_counts().sort_values().tail(1))
# data['Site EUI (kBtu/ft²)'].hist(bins=20)
# plt.figure(figsize=(15, 8))
# plt.scatter(data['Site EUI (kBtu/ft²)'].index, data['Site EUI (kBtu/ft²)'].values)
# plt.show()
# Filter out extreme outliers using the interquartile range
q1 = data['Site EUI (kBtu/ft²)'].describe()['25%']
q3 = data['Site EUI (kBtu/ft²)'].describe()['75%']
iq = q3 - q1
# data_copy keeps the rows inside the fences, i.e. the data with the extreme outliers removed
data_copy = data[(data['Site EUI (kBtu/ft²)'] > (q1 - 3 * iq)) & (data['Site EUI (kBtu/ft²)'] < (q3 + 3 * iq))]
# From here we could handle the outliers however we like, e.g. replace them or drop them
data_copy['Site EUI (kBtu/ft²)'].hist(bins=30)
plt.scatter(data_copy['Site EUI (kBtu/ft²)'].index, data_copy['Site EUI (kBtu/ft²)'].values)
plt.show()
# Look at how many distinct values each feature takes
for x in data.columns:
    print('*' * 50)
    print(x, data[x].nunique())
# Features with few distinct values are likely categorical; features with many are likely continuous
# Plot the distribution of every numeric feature
import seaborn as sns
for col in data.columns:
    if 'int' in str(data[col].dtypes) or 'float' in str(data[col].dtypes):
        plt.hist(data[col], bins=50)
        sns.distplot(data.loc[~data[col].isnull(), col])
        plt.title(col)
        plt.show()
# Many features turn out to be long-tailed and should be transformed towards a (near-)normal distribution; a long tail means a small number of values are outliers
# A log transform turns a long-tailed distribution into a near-normal one
# data['DOF Gross Floor Area']
sns.distplot(np.log(data.loc[~data['DOF Gross Floor Area'].isnull(), 'DOF Gross Floor Area']))
# plt.show()
# Histograms
for col in data.columns:
    if 'int' in str(data[col].dtypes) or 'float' in str(data[col].dtypes):
        plt.hist(data[col], bins=50)
        plt.title(col)
        plt.show()
feature = data.loc[:, data.columns != 'ENERGY STAR Score']  # extract the feature data
fea_name = feature.select_dtypes('number').columns  # names of the numeric features
feature = feature[fea_name]  # keep only the numeric features
# Add two selected categorical features
categorical_subset = data[['Borough', 'Largest Property Use Type']]
categorical_subset = pd.get_dummies(categorical_subset)
feature = pd.concat([feature, categorical_subset], axis=1)
print(feature.corr())  # corr gives the feature-to-feature correlations
plt.subplots(figsize=(30, 15))  # figure size (in inches)
feature_corr = feature.corr().abs()  # correlation coefficients between columns; abs because the strength of a correlation does not depend on its sign
# Heat map of the correlation matrix with the values annotated (uses the seaborn module imported above)
sns.heatmap(feature_corr, annot=True)
plt.show()
colsa = feature.columns  # column names
corr_list = []
size = feature.shape[1]
# print(size)
high_corr_fea = []  # names of features whose correlation with another feature is at least 0.5
for i in range(0, size):
    for j in range(i + 1, size):
        if abs(feature_corr.iloc[i, j]) >= 0.5:
            corr_list.append([feature_corr.iloc[i, j], i, j])  # feature_corr.iloc[i, j]: select by position
sorted_corr_list = sorted(corr_list, key=lambda xx: -abs(xx[0]))
# print(sorted_corr_list)
for v, i, j in sorted_corr_list:
    high_corr_fea.append(colsa[i])
    # print("%s and %s = %.2f" % (colsa[i], colsa[j], v))
# Drop one feature out of each highly correlated pair
feature.drop(labels=high_corr_fea, axis=1, inplace=True)
print(feature.shape)
target = data['ENERGY STAR Score']
target = pd.DataFrame(data=target, columns=['ENERGY STAR Score'])
# Concatenate feature & target
new_data = pd.concat((feature, target), axis=1)
# Compute correlations with the target; we can then select the features that correlate most strongly with the target
fea_target_corr = abs(new_data.corr()['ENERGY STAR Score'][:-1])
print(fea_target_corr)
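As a hypothetical follow-up (not carried out in the original), one could keep only the features whose absolute correlation with the target exceeds a threshold, for example 0.1:
selected_fea = fea_target_corr[fea_target_corr > 0.1].index.tolist()  # the 0.1 cutoff is an arbitrary illustration
print(selected_fea)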
# Rename the target column
new_data = new_data.rename(columns={'ENERGY STAR Score': 'score'})
# print(new_data)
new_data.to_csv('eda_data.csv')
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LinearRegression  # needed for the LinearRegression model used below
data = pd.read_csv('eda_data.csv').drop(labels='Unnamed: 0', axis=1)  # drop the index column written by to_csv
fea_name = [x for x in data.columns if x not in ['score']]
feature = data[fea_name]
target = data['score']
x_train, x_test, y_train, y_test = train_test_split(feature, target, test_size=0.2, random_state=2021)
# Function to calculate mean absolute error
def mae(y_true, y_pred):
    return np.mean(abs(y_true - y_pred))
# Takes in a model, trains the model, and evaluates the model on the test set
def fit_and_evaluate(model):
    # Train the model
    model.fit(x_train, y_train)
    # Make predictions and evaluate
    model_pred = model.predict(x_test)
    model_mae = mae(y_test, model_pred)
    # Return the performance metric
    return model_mae
# Linear regression
lr = LinearRegression()
lr_mae = fit_and_evaluate(lr)
print('Linear Regression Performance on the test set: MAE = %0.4f' % lr_mae)
# Support vector machine
svm = SVR(C=1000, gamma=0.1)
svm_mae = fit_and_evaluate(svm)
print('Support Vector Machine Regression Performance on the test set: MAE = %0.4f' % svm_mae)
# Random forest
random_forest = RandomForestRegressor(random_state=60)
random_forest_mae = fit_and_evaluate(random_forest)
print('Random Forest Regression Performance on the test set: MAE = %0.4f' % random_forest_mae)
plt.style.use('fivethirtyeight')
model_comparison = pd.DataFrame({'model': ['Linear Regression', 'Support Vector Machine',
'Random Forest'
],
'mae': [lr_mae, svm_mae, random_forest_mae]})
model_comparison.sort_values('mae', ascending=False).plot(x='model', y='mae', kind='barh',
color='red', edgecolor='black')
plt.ylabel('')
plt.yticks(size=14)
plt.xlabel('Mean Absolute Error')
plt.xticks(size=14)
plt.title('Model Comparison on Test MAE', size=20)
plt.show()
from sklearn.model_selection import GridSearchCV
parameters = {'n_estimators': [150, 200, 250],
              'max_depth': [2, 3, 5, 10, 15],
              'min_samples_leaf': [1, 2, 4, 6, 8],
              'min_samples_split': [2, 4, 6, 10]
              }
model = RandomForestRegressor()
GS = GridSearchCV(estimator=model, param_grid=parameters, cv=5, scoring='neg_mean_absolute_error')
GS.fit(x_train, y_train)
print(GS.best_params_)