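由于更新参数需要用到各参数的梯度信息,因此前向传播必须放在 with tf.GradientTape() as tape: 代码块中,这样 TensorFlow 才会记录其中的运算并据此求导;如果不熟悉 with ... as 的语法,可以参考 https://www.cnblogs.com/DswCnblog/p/6126588.html。此外,还需要计算代价函数(即 Loss)。一般可以采用差的平方的均值(均方误差)来计算:平方使用 tf.math.square(x),均值使用 tf.math.reduce_mean(input_tensor, axis=None);如果不指定 axis,就对所有元素求均值,返回值是标量;如果指定了 axis,就只沿该 axis 求均值,结果的 shape 中该 axis 消失。需要注意的是,下面的分类示例实际使用的是交叉熵 tf.keras.losses.categorical_crossentropy,再用 reduce_mean 对整个 batch 求平均。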
# Gradient-descent step: subtract alpha * gradient from every parameter.
# The grads indices follow the variable list passed to tape.gradient:
# [theta_1, bias_1, theta_2, bias_2, theta_3, bias_3].
# assign_sub updates in place, so each parameter keeps its type.
theta_1.assign_sub(alpha * grads[0])
theta_2.assign_sub(alpha * grads[2])
theta_3.assign_sub(alpha * grads[4])
bias_1.assign_sub(alpha * grads[1])
bias_2.assign_sub(alpha * grads[3])
bias_3.assign_sub(alpha * grads[5])
# int64 so the labels can be compared directly with the result of argmax.
val_y = tf.convert_to_tensor(val_y, dtype=tf.int64)
# Divide by 255 (the largest uint8 pixel value) and shift by 0.5 so that the
# inputs are centred around zero; this matches the full listing below.
x = tf.convert_to_tensor(x, dtype=tf.float32) / 255.0 - 0.5
val_x = tf.convert_to_tensor(val_x, dtype=tf.float32) / 255.0 - 0.5
# The weights must be tf.Variable objects because their gradients are
# recorded later on.
# Layer sizes: 784 -> 256 -> 128 -> 10.
theta_1 = tf.Variable(tf.random.truncated_normal([784, 256], stddev=0.1))
theta_2 = tf.Variable(tf.random.truncated_normal([256, 128], stddev=0.1))
theta_3 = tf.Variable(tf.random.truncated_normal([128, 10], stddev=0.1))
# Cross-entropy computed from the raw network outputs (from_logits=True),
# then averaged into a single scalar loss.
per_sample_loss = tf.keras.losses.categorical_crossentropy(
    y, out, from_logits=True
)
loss = tf.math.reduce_mean(per_sample_loss)
import time

import tensorflow as tf
# Dataset helpers
from tensorflow.keras import datasets

# Load the dataset
(x, y), (val_x, val_y) = datasets.mnist.load_data()
# Dataset information
print('type_x:', type(x), 'dtype_x:', x.dtype)
print('type_y:', type(y), 'dtype_y:', y.dtype)
print('shape_x:', x.shape, 'shape_y:', y.shape)
print('shape_val:', val_x.shape, 'shape_val:', val_y.shape)
print('max_x:', x.max(), 'min_x:', x.min())
print('max_y:', y.max(), 'min_y:', y.min())

# The data has to be converted to tensors
x = tf.convert_to_tensor(x, dtype=tf.float32) / 255 - 0.5
y = tf.convert_to_tensor(y, dtype=tf.int32)
val_x = tf.convert_to_tensor(val_x, dtype=tf.float32) / 255 - 0.5
# int64 so the labels can be compared directly with the result of argmax
val_y = tf.convert_to_tensor(val_y, dtype=tf.int64)
# One-hot encode the training labels
y = tf.one_hot(y, depth=10)
print('one_hot_y:', y.shape)

# Build the batched datasets
test_db = tf.data.Dataset.from_tensor_slices((val_x, val_y)).shuffle(10000).batch(256)
train_db = tf.data.Dataset.from_tensor_slices((x, y)).shuffle(10000).batch(256)
# Information about one batch
train_iter = iter(train_db)
sample = next(train_iter)
print('sample_x_shape:', sample[0].shape, 'sample_y_shape:', sample[1].shape)

# Parameter initialisation
# input(layer0) -> layer1: nodes 784 -> 256
# tf.Variable is required because gradients are recorded for these later on
theta_1 = tf.Variable(tf.random.truncated_normal([784, 256], stddev=0.1))
bias_1 = tf.Variable(tf.zeros([256]))

# layer1 -> layer2: nodes 256 -> 128
theta_2 = tf.Variable(tf.random.truncated_normal([256, 128], stddev=0.1))
bias_2 = tf.Variable(tf.zeros([128]))

# layer2 -> out(layer3): nodes 128 -> 10
theta_3 = tf.Variable(tf.random.truncated_normal([128, 10], stddev=0.1))
bias_3 = tf.Variable(tf.zeros([10]))

# Learning rate
alpha = tf.constant(1e-3)
# Total number of test samples
total_val = val_y.shape[0]
# Total number of training samples
total_y = y.shape[0]
# Start time
start_time = time.time()

for epoch in range(500):
    correct_cnt = 0  # number of correct predictions on the training set

    # The batch variables get their own names so they do not overwrite the
    # module-level x / y tensors.
    for batch, (x_batch, y_batch) in enumerate(train_db):
        # x_batch: [256, 28, 28]
        # The last batch holds fewer than 256 samples; -1 lets reshape work
        # out the batch dimension automatically.
        x_batch = tf.reshape(x_batch, [-1, 28 * 28])
        with tf.GradientTape() as tape:
            # Forward pass
            # x_batch:[256,784] theta_1:[784,256] bias_1:[256,] h_1:[256,256]
            h_1 = x_batch @ theta_1 + bias_1
            h_1 = tf.nn.relu(h_1)
            # h_1:[256,256] theta_2:[256,128] bias_2:[128,] h_2:[256,128]
            h_2 = h_1 @ theta_2 + bias_2
            h_2 = tf.nn.relu(h_2)
            # h_2:[256,128] theta_3:[128,10] bias_3:[10,] out:[256,10]
            out = h_2 @ theta_3 + bias_3

            # Cost function
            # out:[256,10] y_batch:[256,10] -> per-sample loss:[256] -> scalar
            loss = tf.losses.categorical_crossentropy(y_batch, out, from_logits=True)
            loss = tf.math.reduce_mean(loss)

        # grads is a list ordered like the variable list given here
        grads = tape.gradient(loss, [theta_1, bias_1, theta_2, bias_2, theta_3, bias_3])
        # Gradient-descent step; assign_sub updates in place, type unchanged
        theta_1.assign_sub(alpha * grads[0])
        theta_2.assign_sub(alpha * grads[2])
        theta_3.assign_sub(alpha * grads[4])
        bias_1.assign_sub(alpha * grads[1])
        bias_2.assign_sub(alpha * grads[3])
        bias_3.assign_sub(alpha * grads[5])

        pred = tf.math.argmax(out, axis=-1)
        y_label = tf.math.argmax(y_batch, axis=-1)
        acc = tf.math.equal(pred, y_label)
        acc = tf.cast(acc, dtype=tf.int32)
        correct_cnt += tf.math.reduce_sum(acc)

        # Print the loss every 100 batches
        if batch % 100 == 0:
            print(batch, 'loss:', float(loss))

    # Training accuracy
    percent = float(correct_cnt / total_y)
    print('train_acc:', percent)

    correct_cnt = 0  # number of correct predictions on the test set

    # Predict on the test data
    for val_x_batch, val_y_batch in test_db:
        val_x_batch = tf.reshape(val_x_batch, [-1, 28 * 28])
        val_h_1 = val_x_batch @ theta_1 + bias_1
        val_h_1 = tf.nn.relu(val_h_1)
        val_h_2 = val_h_1 @ theta_2 + bias_2
        val_h_2 = tf.nn.relu(val_h_2)
        val_out = val_h_2 @ theta_3 + bias_3

        # val_out:(256,10) pred:(256,)
        pred = tf.math.argmax(val_out, axis=-1)
        # acc: bool (256,)
        acc = tf.math.equal(pred, val_y_batch)
        acc = tf.cast(acc, dtype=tf.int32)
        correct_cnt += tf.math.reduce_sum(acc)

    # Test accuracy
    percent = float(correct_cnt / total_val)
    print('val_acc:', percent)
    # NOTE(review): the original listing lost its indentation, so it is not
    # certain whether the elapsed time was printed per epoch or once at the
    # end; it is reported per epoch here -- confirm.
    # Read the clock once so minutes and seconds come from the same instant.
    elapsed = int(time.time() - start_time)
    print('time:', elapsed // 60, ':', elapsed % 60)