# 学习来自:《Python机器学习经典实例》【美】Prateek Joshi 著,陶俊杰、陈小莉 译
# 1. 数据预处理相关操作
# (1) Prepare the sample data: a 3x4 array (rows are samples, columns are features).
import numpy as np
from sklearn import preprocessing

data = np.array([
    [3, -1.5, 2, -5.4],
    [0, 4, -0.3, 2.1],
    [1, 3.3, -1.9, -4.3],
])

# (2) Mean removal: centre every feature on 0 (and scale to unit variance)
# so that no single feature biases the model.
data_standardized = preprocessing.scale(data)
print('\nMean=', data_standardized.mean(axis=0))
# The means come out (almost) 0 and the standard deviations (almost) 1.
print('Std deviation=', data_standardized.std(axis=0))

# (3) Range scaling: squeeze every feature into the interval [0, 1].
data_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
data_scaled = data_scaler.fit_transform(data)
print('\nMin max scaler data=', data_scaled)

# (4) Normalization: with the L1 norm, the absolute values of each row sum to 1.
data_normalized = preprocessing.normalize(data, norm='l1')
print('\nL1 normalized data=', data_normalized)

# (5) Binarization: values above the threshold become 1, all others 0.
binarizer = preprocessing.Binarizer(threshold=1.4)
data_binarized = binarizer.transform(data)
print('\nBinarized data=', data_binarized)

# (6) One-hot encoding: learn the distinct values of each column from the
# training rows, then encode a new row as a dense 0/1 vector.
one_hot_training_rows = [
    [0, 2, 1, 12],
    [1, 3, 5, 3],
    [2, 3, 2, 12],
    [1, 2, 4, 3],
]
encoder = preprocessing.OneHotEncoder()
encoder.fit(one_hot_training_rows)
encoded_sparse = encoder.transform([[2, 3, 5, 3]])
encoder_vector = encoded_sparse.toarray()
print('\nEncoded vector=', encoder_vector)
#from sklearn import preprocessing
label_encoder=preprocessing.LabelEncoder() #定义编译器
input_classes=['audi','ford','audi','toyota','ford','bmw']#创建一些标记
#为这些标记做编码
label_encoder.fit(input_classes)
print('\nClass mapping:')
for i,item in enumerate(label_encoder.classes_):
print(item,'-->',i)
#将单词标签转换成数字标签形式
labels=['toyota','ford','audi']
encoded_labels=label_encoder.transform(labels)
print('\nLabels=',labels)
print('Encoded labels=',list(encoded_labels))
#将数字标签反转成英文标签
encoded_labels=[2,1,1,3]
decoded_labels=label_encoder.inverse_transform(encoded_labels)
print('\nEncoded labels=',encoded_labels)
print('Decoded labels',list(decoded_labels))
# 3. Build a linear regressor
# (1) Read the data: one "x,y" pair of numbers per line, from the file
# given as the first command-line argument.
import sys
import numpy as np

if len(sys.argv) < 2:
    # Fail with a readable message instead of a bare IndexError.
    sys.exit('Usage: python <script> <data_file>')
filename = sys.argv[1]

x = []
y = []
with open(filename, 'r') as f:
    # Iterate the file lazily rather than loading every line with readlines().
    for line in f:
        if not line.strip():
            continue  # Skip blank lines (e.g. a trailing newline at end of file).
        xt, yt = [float(i) for i in line.split(',')]
        x.append(xt)
        y.append(yt)

# (2) Split the data into a training set (to build the model) and a test
# set (to validate it): the first 80% of the rows train, the rest test.
num_training = int(0.8 * len(x))
num_test = len(x) - num_training

# Training data; x is reshaped into a single-feature column.
x_train = np.array(x[:num_training]).reshape((num_training, 1))
y_train = np.array(y[:num_training])

# Test data, shaped the same way.
x_test = np.array(x[num_training:]).reshape((num_test, 1))
y_test = np.array(y[num_training:])
# (3) The training data is ready; create the regressor object.
from sklearn import linear_model

# Create the linear regression object.
linear_regressor = linear_model.LinearRegression()
# Train the model on the training set.
linear_regressor.fit(x_train, y_train)

# (4) Visualize the fit on the training set.
# The plotting module is matplotlib.pyplot ("matplot" does not exist).
import matplotlib.pyplot as plt

y_train_pred = linear_regressor.predict(x_train)
plt.figure()
plt.scatter(x_train, y_train, color='green')
plt.plot(x_train, y_train_pred, color='black', linewidth=4)
plt.title('Train data')
plt.show()

# Predict on the test data and plot the result in its own figure, with the
# actual test points drawn so the fitted line can be compared against them.
y_test_pred = linear_regressor.predict(x_test)
plt.figure()
plt.scatter(x_test, y_test, color='green')
plt.plot(x_test, y_test_pred, color='black', linewidth=4)
plt.title('Test data')
plt.show()
# (5) Persist the trained model.
# cPickle exists only on Python 2; the standard pickle module works everywhere.
import pickle
# Imported here because the metrics module is used below, before the
# import that appears later in the file.
import sklearn.metrics as sm

output_model_file = 'save_model.pkl'  # The model is saved to save_model.pkl.
# Pickle data is binary, so the file must be opened in binary mode.
with open(output_model_file, 'wb') as f:
    pickle.dump(linear_regressor, f)

# Load the saved model back and use it.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open(output_model_file, 'rb') as f:
    model_linregr = pickle.load(f)
y_test_pred_new = model_linregr.predict(x_test)

# Evaluate the reloaded model's regression accuracy; the metrics are
# described in detail in section 4 below.
print("\nNew mean absolute error=",round(sm.mean_absolute_error(y_test,y_test_pred_new),2))
# 4. (6) 计算回归准确性 —— 评价回归器的拟合效果
# 对上面第3点的(6)的最后一步进行详解
# 评价回归器的拟合效果有许多指标,可以通过加载sklearn模块的以下函数来评价相关的指标,
# 一般采用一两个指标来评估模型即可,通常保证均方误差最低,解释方差分最高
import sklearn.metrics as sm

# Each entry pairs the label to print with the sklearn.metrics function that
# scores the test-set predictions; every score is rounded to 2 decimals.
_regression_metrics = (
    ("Mean absolute error(平均绝对误差):", sm.mean_absolute_error),
    ("Mean squared error(均方误差):", sm.mean_squared_error),
    ("Median absolute error(中位数绝对误差):", sm.median_absolute_error),
    ("Explained variance score(解释方差分):", sm.explained_variance_score),
    ("R2 score(R2方得分)", sm.r2_score),
)
for _label, _scorer in _regression_metrics:
    print(_label, round(_scorer(y_test, y_test_pred), 2))