import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.preprocessing import scale
import matplotlib.pyplot as plt
%matplotlib inline
# Read the data
df = pd.read_csv('F:/学习资源/tensorflow/data/boston.csv',header=0)
df.describe()
| | CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | LSTAT | MEDV |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 |
| mean | 3.613524 | 11.363636 | 11.136779 | 0.069170 | 0.554695 | 6.284634 | 68.574901 | 3.795043 | 9.549407 | 408.237154 | 18.455534 | 12.653063 | 22.532806 |
| std | 8.601545 | 23.322453 | 6.860353 | 0.253994 | 0.115878 | 0.702617 | 28.148861 | 2.105710 | 8.707259 | 168.537116 | 2.164946 | 7.141062 | 9.197104 |
| min | 0.006320 | 0.000000 | 0.460000 | 0.000000 | 0.385000 | 3.561000 | 2.900000 | 1.129600 | 1.000000 | 187.000000 | 12.600000 | 1.730000 | 5.000000 |
| 25% | 0.082045 | 0.000000 | 5.190000 | 0.000000 | 0.449000 | 5.885500 | 45.025000 | 2.100175 | 4.000000 | 279.000000 | 17.400000 | 6.950000 | 17.025000 |
| 50% | 0.256510 | 0.000000 | 9.690000 | 0.000000 | 0.538000 | 6.208500 | 77.500000 | 3.207450 | 5.000000 | 330.000000 | 19.050000 | 11.360000 | 21.200000 |
| 75% | 3.677082 | 12.500000 | 18.100000 | 0.000000 | 0.624000 | 6.623500 | 94.075000 | 5.188425 | 24.000000 | 666.000000 | 20.200000 | 16.955000 | 25.000000 |
| max | 88.976200 | 100.000000 | 27.740000 | 1.000000 | 0.871000 | 8.780000 | 100.000000 | 12.126500 | 24.000000 | 711.000000 | 22.000000 | 37.970000 | 50.000000 |
- CRIM: per-capita crime rate by town
- ZN: proportion of residential land zoned for lots over 25,000 sq. ft.
- INDUS: proportion of non-retail business land per town
- CHAS: 1 if the tract bounds the river, 0 otherwise
- NOX: nitric oxide concentration
- RM: average number of rooms per dwelling
- AGE: proportion of owner-occupied units built before 1940
- DIS: weighted distances to five Boston employment centers
- RAD: index of accessibility to radial highways
- TAX: full-value property tax rate per $10,000
- PTRATIO: pupil-teacher ratio by town
- LSTAT: percentage of lower-status population
- MEDV: median value of owner-occupied homes, in thousands of dollars
In the output of df.describe(), the value ranges of the 12 features differ widely, some by several orders of magnitude.
Think of the 12 features as the ingredients of Hangzhou old-duck soup with dried bamboo shoots (duck, ham, dried bamboo shoots, greens, water, salt, soy sauce, scallion, ginger, garlic, pepper...): if the proportions are wrong, the dish will not taste good.
In the same way, if these differently scaled features are fed into the optimization below without preprocessing, training can break down (train_loss=nan, valid_loss=nan).
We therefore normalize the data first. Min-max normalization maps every feature into the range [0, 1]:

$x_i' = \frac{x_i - \min(x)}{\max(x) - \min(x)}$
Once the 12 feature columns have been extracted, they can be normalized column by column:

for i in range(12):
    x_data[:, i] = (x_data[:, i] - x_data[:, i].min()) / (x_data[:, i].max() - x_data[:, i].min())
Alternatively, sklearn.preprocessing provides the scale() function, which can be applied directly. The transformation scale() performs is:

$x_i' = \frac{x_i - \mathrm{mean}(x)}{\mathrm{std}(x)}$

With it, the type-conversion code below becomes:
x_train = tf.cast(scale(x_train), dtype = tf.float32)
x_valid = tf.cast(scale(x_valid), dtype = tf.float32)
x_test = tf.cast(scale(x_test), dtype = tf.float32)
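As a quick sanity check (a minimal sketch; the array `a` is a made-up example), scale() matches the column-wise (x − mean) / std computation up to floating-point error:

import numpy as np
from sklearn.preprocessing import scale

a = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])  # hypothetical 3x2 feature matrix
manual = (a - a.mean(axis=0)) / a.std(axis=0)          # column-wise (x - mean) / std by hand
print(np.allclose(scale(a), manual))                   # True: scale() applies the same transform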
ds = df.values   # get the dataset as a NumPy array
ds
Output:
array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 4.9800e+00,
2.4000e+01],
[2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 9.1400e+00,
2.1600e+01],
[2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 4.0300e+00,
3.4700e+01],
...,
[6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 5.6400e+00,
2.3900e+01],
[1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 6.4800e+00,
2.2000e+01],
[4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 7.8800e+00,
1.1900e+01]])
df.shape
Output:
(506, 13)
# df.shape shows that the data has 12 feature columns and 1 label column
## the first 12 columns: feature data (before normalization)
x_data = ds[:,:12]
## the last column: label data
y_data = ds[:,12]
print('x_data shape=',x_data.shape)
print('y_data shape=',y_data.shape)
Output:
x_data shape= (506, 12)
y_data shape= (506,)
# Min-max normalization: (value - min) / (max - min); superseded by scale() below
'''
for i in range(12):
    x_data[:, i] = (x_data[:, i] - x_data[:, i].min()) / (x_data[:, i].max() - x_data[:, i].min())
'''
1) We build and train machine-learning models in the hope of making good predictions on new data. How can we check that a trained model will hold up on data it has never seen? One approach is to split the dataset into two subsets:
- training set: the subset used to train the model
- test set: the subset used to test the model
Good performance on the test set is usually a useful indicator of good performance on new data, provided that:
- the test set is large enough to yield statistically significant results
- the same test set is not reused over and over to game the evaluation (it should be representative of the whole dataset, with the same feature characteristics as the training set)
2) This two-way split has a weakness: repeating the train-and-evaluate cycle many times can make the model quietly fit the peculiarities of that particular test set. The remedy is a three-way split:
- training set: the subset used to train the model
- validation set: the subset used to validate the model during training
- test set: the subset used for the final evaluation
Splitting the dataset into three subsets greatly reduces the chance of this kind of overfitting.
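The split below is purely sequential (the first 300 rows for training, the next 100 for validation, the rest for testing). Since the rows of boston.csv are not guaranteed to be in random order, it is safer to randomize them first; a minimal sketch using the shuffle already imported from sklearn.utils (the random_state value is an arbitrary choice):

# Shuffle features and labels together so that rows stay aligned
x_data, y_data = shuffle(x_data, y_data, random_state=612)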
# Sizes of the three subsets (training / validation / test)
train_num = 300   # number of training samples
valid_num = 100   # number of validation samples
test_num = len(x_data) - train_num - valid_num   # number of test samples
# Training set
x_train = x_data[:train_num]
y_train = y_data[:train_num]
# Validation set
x_valid = x_data[train_num:train_num + valid_num]
y_valid = y_data[train_num:train_num + valid_num]
# Test set
x_test = x_data[train_num + valid_num:train_num + valid_num + test_num]
y_test = y_data[train_num + valid_num:train_num + valid_num + test_num]
# Standardize with scale() and cast to float32
#x_train = tf.cast(x_train, dtype = tf.float32)
#x_valid = tf.cast(x_valid, dtype = tf.float32)
#x_test = tf.cast(x_test, dtype = tf.float32)
x_train = tf.cast(scale(x_train), dtype = tf.float32)
x_valid = tf.cast(scale(x_valid), dtype = tf.float32)
x_test = tf.cast(scale(x_test), dtype = tf.float32)
def model(x, w, b):   # linear model: predictions are x @ w + b
    return tf.matmul(x, w) + b
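In matrix form, with the feature matrix $x \in \mathbb{R}^{n \times 12}$, weights $w \in \mathbb{R}^{12 \times 1}$, and a scalar bias $b$ broadcast over all rows, the model computes

$\hat{y} = xw + b$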
# Create the variables to be optimized
## w, the weight vector of the linear model (one weight per feature)
w = tf.Variable(tf.random.normal([12,1], mean=0.0, stddev=1.0, dtype=tf.float32))
## b, the intercept of the linear model
b = tf.Variable(tf.zeros(1), dtype=tf.float32)
print(w)
print(b)
# Define the loss function: mean squared error (MSE)
def loss(x, y, w, b):
    pred = tf.squeeze(model(x, w, b), axis=1)   # flatten the (n,1) predictions to (n,) to match y; without this, (n,1)-(n,) would broadcast to (n,n)
    err = pred - y                              # element-wise error between predictions and labels
    squared_err = tf.square(err)                # squared errors
    return tf.reduce_mean(squared_err)          # mean of the squared errors
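Written out, the quantity being minimized over the $n$ samples is

$\mathrm{MSE} = \frac{1}{n}\sum_{i=1}^{n}\left(\hat{y}_i - y_i\right)^2$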
# Train the model with mini-batch gradient descent (MBGD)
## Training hyperparameters
training_epochs = 50    # number of epochs
learning_rate = 0.001   # learning rate
batch_size = 10         # number of samples per mini-batch
## Gradient function
def grad(x, y, w, b):   # gradient of the loss at [w, b] for the batch (x, y)
    with tf.GradientTape() as tape:   # the tape records the operations needed for automatic differentiation
        loss_ = loss(x, y, w, b)
    return tape.gradient(loss_, [w, b])   # gradients of the loss with respect to w and b
- `tf.keras.optimizers.SGD()` declares a gradient-descent optimizer whose learning rate is passed in as an argument.
- The optimizer updates the model parameters from the computed gradients so as to minimize the loss function; this is done by calling its `apply_gradients()` method.
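For plain SGD, `apply_gradients()` is equivalent to subtracting the scaled gradient from each variable. A minimal sketch of the manual update (assuming a mini-batch `xs`, `ys` as in the loop below):

grads = grad(xs, ys, w, b)                 # [dLoss/dw, dLoss/db]
w.assign_sub(learning_rate * grads[0])     # w <- w - lr * dLoss/dw
b.assign_sub(learning_rate * grads[1])     # b <- b - lr * dLoss/db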
optimizer = tf.keras.optimizers.SGD(learning_rate)   # create the optimizer with the chosen learning rate
loss_list_train = []   # training-set loss recorded after each epoch
loss_list_valid = []   # validation-set loss recorded after each epoch
total_step = int(train_num/batch_size)   # number of mini-batches per epoch
for epoch in range(training_epochs):
    for step in range(total_step):
        xs = x_train[step*batch_size:(step+1)*batch_size,:]   # mini-batch of features
        ys = y_train[step*batch_size:(step+1)*batch_size]     # mini-batch of labels
        grads = grad(xs,ys,w,b)
        optimizer.apply_gradients(zip(grads,[w,b]))   # the optimizer adjusts w and b from the gradients
    loss_train = loss(x_train, y_train, w, b).numpy()   # training loss after this epoch
    loss_valid = loss(x_valid, y_valid, w, b).numpy()   # validation loss after this epoch
    loss_list_train.append(loss_train)
    loss_list_valid.append(loss_valid)
    print("epoch={:3d},train_loss={:.4f},valid_loss={:.4f}".format(epoch+1,loss_train,loss_valid))
Output:
epoch= 1,train_loss=660.1863,valid_loss=464.3738
epoch= 2,train_loss=595.7922,valid_loss=411.7638
epoch= 3,train_loss=539.5142,valid_loss=367.2811
epoch= 4,train_loss=489.9636,valid_loss=329.2233
epoch= 5,train_loss=446.1474,valid_loss=296.4343
epoch= 6,train_loss=407.3090,valid_loss=268.0826
epoch= 7,train_loss=372.8414,valid_loss=243.5360
epoch= 8,train_loss=342.2372,valid_loss=222.2891
epoch= 9,train_loss=315.0607,valid_loss=203.9230
epoch= 10,train_loss=290.9305,valid_loss=188.0809
epoch= 11,train_loss=269.5101,valid_loss=174.4536
epoch= 12,train_loss=250.5001,valid_loss=162.7705
epoch= 13,train_loss=233.6337,valid_loss=152.7927
epoch= 14,train_loss=218.6730,valid_loss=144.3094
epoch= 15,train_loss=205.4058,valid_loss=137.1337
epoch= 16,train_loss=193.6427,valid_loss=131.1004
epoch= 17,train_loss=183.2151,valid_loss=126.0633
epoch= 18,train_loss=173.9728,valid_loss=121.8930
epoch= 19,train_loss=165.7821,valid_loss=118.4753
epoch= 20,train_loss=158.5241,valid_loss=115.7098
epoch= 21,train_loss=152.0932,valid_loss=113.5078
epoch= 22,train_loss=146.3957,valid_loss=111.7913
epoch= 23,train_loss=141.3482,valid_loss=110.4920
epoch= 24,train_loss=136.8768,valid_loss=109.5499
epoch= 25,train_loss=132.9160,valid_loss=108.9124
epoch= 26,train_loss=129.4076,valid_loss=108.5337
epoch= 27,train_loss=126.3000,valid_loss=108.3735
epoch= 28,train_loss=123.5476,valid_loss=108.3971
epoch= 29,train_loss=121.1100,valid_loss=108.5739
epoch= 30,train_loss=118.9510,valid_loss=108.8776
epoch= 31,train_loss=117.0389,valid_loss=109.2853
epoch= 32,train_loss=115.3456,valid_loss=109.7773
epoch= 33,train_loss=113.8461,valid_loss=110.3364
epoch= 34,train_loss=112.5182,valid_loss=110.9479
epoch= 35,train_loss=111.3425,valid_loss=111.5992
epoch= 36,train_loss=110.3015,valid_loss=112.2796
epoch= 37,train_loss=109.3798,valid_loss=112.9799
epoch= 38,train_loss=108.5638,valid_loss=113.6922
epoch= 39,train_loss=107.8415,valid_loss=114.4100
epoch= 40,train_loss=107.2022,valid_loss=115.1278
epoch= 41,train_loss=106.6364,valid_loss=115.8409
epoch= 42,train_loss=106.1358,valid_loss=116.5455
epoch= 43,train_loss=105.6928,valid_loss=117.2385
epoch= 44,train_loss=105.3010,valid_loss=117.9175
epoch= 45,train_loss=104.9545,valid_loss=118.5803
epoch= 46,train_loss=104.6482,valid_loss=119.2254
epoch= 47,train_loss=104.3774,valid_loss=119.8517
epoch= 48,train_loss=104.1381,valid_loss=120.4582
epoch= 49,train_loss=103.9268,valid_loss=121.0444
epoch= 50,train_loss=103.7402,valid_loss=121.6099
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.plot(loss_list_train,'b',label='Train Loss')
plt.plot(loss_list_valid,'r',label='Valid Loss')
plt.legend(loc=1)
print("Test_loss:{:.4f}".format(loss(x_test,y_test,w,b).numpy()))
Output:
Test_loss:113.9060
test_house_id = np.random.randint(0, test_num)   # pick a random house from the test set
y = y_test[test_house_id]                        # actual price
y_pred = model(x_test, w, b)[test_house_id]      # model prediction for that house
y_predict = tf.reshape(y_pred, ()).numpy()       # convert the 1-element tensor to a scalar
print("House id", test_house_id, "Actual value", y, "Predicted value", y_predict)
Output:
House id 26 Actual value 10.2 Predicted value 24.10615
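To look beyond a single sample, the predictions for a handful of test houses can be compared side by side (a sketch reusing the tensors defined above):

preds = tf.squeeze(model(x_test, w, b), axis=1).numpy()   # predictions for every test sample
for i in range(5):   # compare a few houses against their actual prices
    print("House id", i, "Actual value", y_test[i], "Predicted value", preds[i])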