Energy Efficiency数据集( ENB2012_data.xlsx,ENB2012.names)记录不同房屋的制热能源消耗和制冷能源消耗。包括768条记录,8个特征属性,两个预测值。具体说明见ENB2012.names。
1)在全数据集上训练线性回归模型预测制热能耗,计算模型性能:RMSE以及R2;
2)将数据集划分训练集和测试集,在训练集上训练线性回归模型,分析模型在训练集
和测试集上的性能。
import pandas as pd
from sklearn import model_selection
from sklearn import metrics
from sklearn.linear_model import LinearRegression

# 1) Train a linear regression model on the FULL data set to predict the
#    heating load, then report RMSE and R^2 as the exercise requires.
filename = 'data/ENB2012_data.xlsx'  # forward slash: portable, no escape issues
data = pd.read_excel(filename)

X = data.iloc[:, 0:8]  # the 8 feature columns
y = data.iloc[:, 8]    # heating load
linreg = LinearRegression()
linreg.fit(X, y)
print("intercept: ", linreg.intercept_, "\ncoefficient: ", linreg.coef_)

y_pred = linreg.predict(X)
# mean_squared_error returns MSE; take the square root to obtain the RMSE
# the assignment asks for (the original printed plain MSE while the comment
# claimed RMSE).
err = metrics.mean_squared_error(y, y_pred) ** 0.5
print('The root mean square error is: {:.2f}'.format(err))
predict_score1 = linreg.score(X, y)  # R^2 (coefficient of determination)
print('The decision coefficient is: {:.2f}'.format(predict_score1), "\n")

# 2) Split into train/test sets, fit on the training set only, and compare
#    performance on both sets.
y = data.iloc[:, 8:10]  # heating + cooling loads
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.35, random_state=1)
linregTr = LinearRegression()
linregTr.fit(X_train, y_train)
print("intercept: ", linregTr.intercept_, "\ncoefficient: ", linregTr.coef_)

y_train_pred = linregTr.predict(X_train)
y_test_pred = linregTr.predict(X_test)
train_err = metrics.mean_squared_error(y_train, y_train_pred) ** 0.5  # RMSE
test_err = metrics.mean_squared_error(y_test, y_test_pred) ** 0.5     # RMSE
print('The root mean square error of train and test are: {:.2f},{:.2f}'.format(train_err, test_err))
predict_score2 = linregTr.score(X_test, y_test)  # R^2 on the held-out test set
print('The decision coefficient is: {:.2f}'.format(predict_score2))
基于bankpep数据集,划分训练集与测试集,建立分类模型。
1)使用决策树在训练集上建立分类模型,记录模型在测试集上的性能;
2)自学朴素贝叶斯、支持向量机训练分类模型的方法,分别在训练集上建立分类模型,记录模型在测试集上的性能;
3) 分析比较三种方法的性能差异。
1)决策树
# Build classification models on the bankpep data set (train/test split).
import pandas as pd
from sklearn import model_selection, tree

# BUG FIX: the original 'data\bankpep.csv' literal contains "\b", which
# Python parses as a BACKSPACE character, producing a broken path.
filename = 'data/bankpep.csv'
# Skip the header row and address columns by position (index_col=0 consumes
# the id column, so features start at column 1).
data1 = pd.read_csv(filename, index_col=0, header=None, skiprows=1)

# Encode the categorical columns as integers.
data1.loc[data1[2] == 'FEMALE', 2] = 1  # sex
data1.loc[data1[2] == 'MALE', 2] = 0
# region: same code assignment as the original hand-written lines.
for code, region in enumerate(['INNER_CITY', 'TOWN', 'RURAL', 'SUBURBAN']):
    data1.loc[data1[3] == region, 3] = code
# YES/NO flag columns (married ... pep) -> 1/0.
for col in (5, 6, 7, 8, 9, 10, 11):
    data1.loc[data1[col] == 'YES', col] = 1
    data1.loc[data1[col] == 'NO', col] = 0

# 1) Decision tree: fit on the training split, score on the test split.
X1 = data1.iloc[:, 0:10]
y1 = data1.iloc[:, 10]
X_train1, X_test1, y_train1, y_test1 = model_selection.train_test_split(
    X1, y1, test_size=0.35, random_state=1)
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train1, y_train1.astype('float64'))
score1 = clf.score(X_test1, y_test1.astype('float64'))
print('1) 决策树\nscore =', score1)
2) 朴素贝叶斯
资料来源:朴素贝叶斯算法(python 实现)
在这里需要注意:上方资料中的贝叶斯算法使用数据集类型数据进行计算,而先前题目中data数据是dataframe类型。薯条在这里尝试了一个下午,最后选择了强行转换为数组形式,等效于数据集代入公式计算。
# 2) Self-study: naive Bayes and SVM classifiers on the same data.
# Naive Bayes
import pandas as pd
from sklearn.datasets import load_iris  # only for the commented-out iris check below
from sklearn.model_selection import train_test_split
from collections import Counter
import time

# Convert the DataFrame into plain NumPy arrays: the hand-written Bayes
# classifier below expects array input, not a DataFrame.
data2 = data1
features2 = data2.drop([11], axis=1).astype('float')  # columns 0..10 = features
labels2 = data2[11].astype('float')                   # column 11 = class label (pep)
train_array = features2.values  # 2-D feature matrix
test_array = labels2.values     # 1-D label vector
# NOTE: the original called .reshape(600, 10)/.reshape(600, 1) here and
# discarded the results — reshape is not in-place, so they were no-ops
# and have been removed.
class Beyes(object):
    """Hand-rolled categorical naive Bayes classifier.

    Estimates class priors and per-feature conditional probabilities by
    counting discrete feature values, then predicts the class with the
    largest (unnormalised) posterior.  Works on array-like input whose
    features are category-coded values.
    """
    def __init__(self):
        self.length = -1              # number of training samples; -1 means "not fitted yet"
        self.train_target_list = []   # distinct target classes seen in training
        self.p_train_target = {}      # prior probability P(Y=k) for each class k
        self.split_data_lis = []      # per-class training subsets (for conditional probabilities)
        self.feature_p_lis = []       # feature probabilities (unused in this version)
        self.predict = []             # predictions from the last classifier() call
    def fit(self, train_data, train_target):
        """Estimate class priors and split the training data by class.

        P(X) = sum_k P(X|Y=k) P(Y=k): this step computes P(Y=k)
        (self.p_train_target) and prepares the per-class subsets
        (self.split_data_lis) used later for the conditional probabilities.
        :param train_data: 2-D array of discrete feature values
        :param train_target: 1-D array of class labels
        :return: None (prints a confirmation message)
        """
        train_length = train_data.shape[0]
        self.length = train_length
        target_list = list(set(train_target))  # de-duplicate the target values
        self.train_target_list = target_list   # record the class list on the instance
        target_classifier = dict(Counter(train_target))  # per-class sample counts (dict)
        train_data = pd.DataFrame(train_data)
        train_data['target'] = train_target  # DataFrame form makes the per-class split below easy
        for target in self.train_target_list:
            # Prior probability of this class = class count / total samples.
            self.p_train_target[target] = target_classifier[target] / self.length
            split_data = train_data[train_data['target'] == target]
            self.split_data_lis.append(split_data)
        print('model had trained please use classifier() to get result')
    def p_test_data(self, sample):
        """Classify one test sample.

        Mapping function used by classifier(): for each class, multiply the
        per-feature conditional probabilities with the class prior and pick
        the class with the largest product.
        :param sample: pandas Series holding one test row (integer-indexed features)
        :return self.train_target_list[position]: the most probable class
        """
        result_p = []
        for j in range(len(self.train_target_list)):
            p_label = 1
            this_target = self.train_target_list[j]
            this_data = self.split_data_lis[j]
            for i in range(0, sample.shape[0]):
                feature_num_dict = dict(Counter(this_data[i]))  # value counts of feature column i within this class
                if sample[i] in feature_num_dict:
                    label_num = feature_num_dict.get(sample[i])
                    p_label = p_label * (label_num / this_data.shape[0])  # conditional probability of one feature value
                else:
                    # Laplace-style smoothing so an unseen feature value does not
                    # zero out the whole product.  NOTE(review): smoothing is only
                    # applied to unseen values here, not to the seen-value
                    # estimates — not full Laplace smoothing; confirm if intended.
                    p_label = p_label * (1 / (this_data.shape[0] + len(feature_num_dict)))
            this_target_p = p_label * self.p_train_target.get(this_target)  # joint with the class prior
            result_p.append(this_target_p)
        position = result_p.index(max(result_p))  # index of the most probable class
        return self.train_target_list[position]
    def classifier(self, test_data):
        """Predict labels for every row of test_data.

        Results are stored in self.predict and printed; fit() must have been
        called first.
        :param test_data: 2-D array of test samples
        :raises ValueError: if the model has not been fitted yet
        """
        if self.length == -1:
            raise ValueError('please use fit() to train the train data set ')
        else:
            test_data = pd.DataFrame(test_data)
            test_data['target'] = test_data.apply(self.p_test_data, axis=1)  # row-wise classification
            self.predict = list(test_data['target'])
            print('classfier result:', self.predict)
    def score(self, test_target):
        """Print the classification accuracy against the true labels.

        :param test_target: 1-D array of true labels, aligned with the rows
            previously passed to classifier()
        :raises ValueError: if classifier() has not been called yet
        """
        if len(self.predict) == 0:
            raise ValueError('please use classifier() to get classifier target')
        else:
            count = 0
            for i in range(0, test_target.shape[0]):
                if test_target[i] == self.predict[i]:
                    count += 1
            score = count / (test_target.shape[0])
            print('the Classification accuracy is:', score)
if __name__ == '__main__':
    # Run the hand-written Bayes classifier on the bankpep arrays.
    # (Commented-out alternative: sanity-check against sklearn's iris data.)
    # iris = load_iris()
    features = train_array  # iris.data
    labels = test_array     # iris.target
    # features is a 2-D matrix, labels a 1-D vector (np.array)
    print('\n2) 朴素贝叶斯')
    x_train, x_test, y_train, y_test = train_test_split(features, labels)
    print('训练集数据量', x_train.shape)
    print('测试集数据量', x_test.shape)
    t0 = time.time()
    model = Beyes()
    model.fit(x_train, y_train)
    model.classifier(x_test)
    model.score(y_test)
    print("spend time:", time.time() - t0)
3)支持向量机
# 3) Support-vector machine on the same bankpep data.
# Re-read the csv with named headers and the 'id' column as index.
# (Path given literally: the shared `filename` defined earlier contained a
# "\b" backspace escape and was broken.)
data3 = pd.read_csv('data/bankpep.csv', index_col='id')
print('\n3) 支持向量机\n')
# Binary YES/NO columns -> 1/0.
seq = ['married', 'car', 'save_act', 'current_act', 'mortgage', 'pep']
for feature in seq:
    data3.loc[data3[feature] == 'YES', feature] = 1
    data3.loc[data3[feature] == 'NO', feature] = 0
data3.loc[data3['sex'] == 'FEMALE', 'sex'] = 1
data3.loc[data3['sex'] == 'MALE', 'sex'] = 0
# One-hot encode the multi-valued columns (region, children).
dumm_reg = pd.get_dummies(data3['region'], prefix='region')
dumm_child = pd.get_dummies(data3['children'], prefix='children')
df1 = data3.drop(['region', 'children'], axis=1)
df2 = df1.join([dumm_reg, dumm_child], how='outer')

X3 = df2.drop(['pep'], axis=1).astype('int')
y3 = df2['pep'].astype('int')

from sklearn import svm
# Fit on the full data first: a large C with an rbf kernel memorises the
# training set, so this accuracy is optimistic (training, not generalisation).
clf = svm.SVC(kernel='rbf', gamma=0.6, C=100)
clf.fit(X3, y3)
print("Accuracy:", clf.score(X3, y3))
from sklearn import metrics
y3_predicted = clf.predict(X3)
print(metrics.classification_report(y3, y3_predicted))

# Proper train/test evaluation.
X_train3, X_test3, y_train3, y_test3 = model_selection.train_test_split(
    X3, y3, test_size=0.3, random_state=1)
clf = svm.SVC(kernel='rbf', gamma=0.6, C=0.001)
clf.fit(X_train3, y_train3)
print("Performance on training set:", clf.score(X_train3, y_train3))
print("Performance on test set:", clf.score(X_test3, y_test3))

# Standardise the features before the final model.  The original split here
# omitted random_state, making the reported score non-reproducible and
# inconsistent with every other split in this file — fixed with random_state=1.
from sklearn import preprocessing
X_scale3 = preprocessing.scale(X3)
X_train3, X_test3, y_train3, y_test3 = model_selection.train_test_split(
    X_scale3, y3, test_size=0.3, random_state=1)
clf = svm.SVC(kernel='poly', gamma=0.6, C=0.001)
clf.fit(X_train3, y_train3)
score3 = clf.score(X_test3, y_test3)
print('\nscore =', score3)
第二题三个代码为同一文件。
hxd们发现错误一定要戳我!
补充:仅供参考,注意查重,禁止照搬