PyTorch Basics

Table of Contents

  • Tensor Initialization
    • From a Python List
    • From a NumPy Array
    • From a Tensor
    • By Specifying a Shape
    • With torch.arange
  • Tensor Properties
    • Data type
    • Shape
    • Contiguous
    • Unsqueeze and squeeze
    • Device
  • Tensor Indexing
  • Operations
    • Broadcast
    • Statistics
    • cat
  • Autograd
    • An Example
  • Neural Network Module
    • Module Layers
    • Activation Function Layer
    • Putting the Layers Together
    • Custom Modules
  • Optimization
  • Demo: Word Window Classification
    • Data
    • Preprocessing
    • Converting Words to Embeddings
    • Batching Sentences
    • Model
    • Training
    • Prediction


Tensor Initialization

From a Python List

import torch

data = [
        [0,1],
        [2,3],
        [4,5]
        ]
data_tensor = torch.tensor(data)
data_tensor
# tensor([[0, 1],
#         [2, 3],
#         [4, 5]])

data_tensor_float = torch.tensor(data, dtype = torch.float)
data_tensor_float
# tensor([[0., 1.],
#         [2., 3.],
#         [4., 5.]])

data_tensor_bool = torch.tensor(data, dtype = torch.bool)
data_tensor_bool
# tensor([[False,  True],
#         [ True,  True],
#         [ True,  True]])
data_tensor.float()
# tensor([[0., 1.],
#         [2., 3.],
#         [4., 5.]])
# torch.float32

torch.tensor is a factory function; you can also initialize with the tensor class constructors directly:
torch.FloatTensor()
torch.Tensor()  (defaults to float type)
torch.LongTensor()

data_tensor_float = torch.Tensor(data)
data_tensor_float.dtype
# torch.float32
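
The other class constructors fix the dtype the same way; a quick check (a minimal sketch, reusing the data list from above):

data_tensor_long = torch.LongTensor(data)
data_tensor_long.dtype
# torch.int64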

From a NumPy Array

import numpy as np
ndarray = np.array(data)
x_numpy = torch.from_numpy(ndarray)
x_numpy
# tensor([[0, 1],
#         [2, 3],
#         [4, 5]], dtype=torch.int32)

From a Tensor

x = torch.tensor([[1.,2.],[3.,4.]])
x
# tensor([[1., 2.],
#         [3., 4.]])
x_zeros = torch.zeros_like(x)
x_zeros
# tensor([[0., 0.],
#         [0., 0.]])
x_ones = torch.ones_like(x)
x_ones
# tensor([[1., 1.],
#         [1., 1.]])
x_rand = torch.rand_like(x)
x_rand
# tensor([[0.6859, 0.5000],
#         [0.1916, 0.6818]])
x_randn = torch.randn_like(x)
x_randn
# tensor([[ 0.1215,  1.3117],
#         [-1.5105,  0.3146]])

By Specifying a Shape

shape = (3,2,2)
x_zeros = torch.zeros(shape)
x_zeros
# tensor([[[0., 0.],
#          [0., 0.]],

#         [[0., 0.],
#          [0., 0.]],

#         [[0., 0.],
#          [0., 0.]]])

With torch.arange

x = torch.arange(10)
x
# tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
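
torch.arange also accepts start, end, and step arguments, in the spirit of Python's built-in range:

torch.arange(2, 10, 2)
# tensor([2, 4, 6, 8])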

Tensor Properties

Data type

x = torch.ones(2,3)
x.dtype
# torch.float32

Shape

x.shape
# torch.Size([2, 3])
x.size(0)
# 2
x.shape[0]
# 2

Contiguous

Using view:

x = torch.arange(6).reshape(2,3)
# tensor([[0, 1, 2],
#         [3, 4, 5]])
x_view1 = x.view(3,2)
# tensor([[0, 1],
#         [2, 3],
#         [4, 5]])
x_view2 = x.view(-1,2)
# tensor([[0, 1],
#         [2, 3],
#         [4, 5]])

A precondition for view is that the tensor must be contiguous.

In PyTorch, to save memory, operations such as transpose and permute do not allocate new memory: they neither modify nor copy the underlying array. Instead, they create a new set of tensor metadata whose strides are recomputed. The torch.view method promises not to modify the array itself, only to view the data with a new shape. So if we call view after a transpose or permute, PyTorch raises an error:

t = torch.arange(12).reshape(3,4)
t
# tensor([[ 0,  1,  2,  3],
#         [ 4,  5,  6,  7],
#         [ 8,  9, 10, 11]])
t.stride() 
#(4, 1)
# 4 is the distance to jump to the next element along dim 0; 1 is the distance along dim 1
t_T = t.transpose(0,1)
t_T
# tensor([[ 0,  4,  8],
#         [ 1,  5,  9],
#         [ 2,  6, 10],
#         [ 3,  7, 11]])
t_T.stride()
# (1, 4)
# after the transpose, the stride along dim 0 is 1, while the stride along dim 1 is 4
t_T.data_ptr() == t.data_ptr()
# True
# the data is stored at the same location
t_T.is_contiguous(), t.is_contiguous()
# (False, True)
t_T.view(-1)
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
C:\Users\ADMINI~1\AppData\Local\Temp/ipykernel_7968/2726062531.py in <module>
----> 1 t_T.view(-1)

RuntimeError: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.

At this point, calling contiguous allocates a new block of memory and copies the data into it in contiguous order:

t_T_contiguous = t_T.contiguous()
t_T_contiguous
# tensor([[ 0,  4,  8],
#         [ 1,  5,  9],
#         [ 2,  6, 10],
#         [ 3,  7, 11]])
t_T_contiguous.data_ptr() == t.data_ptr()
# False
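
Alternatively, as the error message suggests, reshape handles this case directly: it returns a view when the tensor is contiguous and silently makes a contiguous copy when it is not. A quick check on the same tensor:

t_T.reshape(-1)
# tensor([ 0,  4,  8,  1,  5,  9,  2,  6, 10,  3,  7, 11])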

Unsqueeze and squeeze

x = torch.arange(10).reshape(5,2)
x
# tensor([[0, 1],
#         [2, 3],
#         [4, 5],
#         [6, 7],
#         [8, 9]])
x = x.unsqueeze(1)
x.shape
# torch.Size([5, 1, 2])
x = x.squeeze(1)
x.shape
# torch.Size([5, 2])
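
Calling squeeze with no arguments removes every dimension of size 1 (a quick check):

x.unsqueeze(0).unsqueeze(2).shape
# torch.Size([1, 5, 1, 2])
x.unsqueeze(0).unsqueeze(2).squeeze().shape
# torch.Size([5, 2])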

Device

Device tells where the tensor is stored, which determines whether the GPU or the CPU handles the computations involving it.

x = torch.Tensor([[1,2],[3,4]])
x
# tensor([[1., 2.],
#         [3., 4.]])
x.device
# device(type='cpu')
x = x.to('cuda')
x.device
# device(type='cuda', index=0)
torch.cuda.is_available()
# True
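
A common device-agnostic pattern picks the device once and passes it around (a minimal sketch):

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
x = torch.ones(2, 2, device = device)
x.device
# device(type='cuda', index=0) if a GPU is available, otherwise device(type='cpu')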

Tensor Indexing

x = torch.arange(12).reshape(3,2,2)
x
# tensor([[[ 0,  1],
#          [ 2,  3]],

#         [[ 4,  5],
#          [ 6,  7]],

#         [[ 8,  9],
#          [10, 11]]])
x.shape
# torch.Size([3, 2, 2])
x[0], x[0,:], x[0,:,:], x[0].shape
# (tensor([[0, 1],
#          [2, 3]]),
#  tensor([[0, 1],
#          [2, 3]]),
#  tensor([[0, 1],
#          [2, 3]]),
#  torch.Size([2, 2]))

You can select data along a dimension with an index list:

i = torch.tensor([0,1,0,1])
x[i], x[i].shape
# (tensor([[[0, 1],
#           [2, 3]],
 
#          [[4, 5],
#           [6, 7]],
 
#          [[4, 5],
#           [6, 7]]]),
#  torch.Size([3, 2, 2]))

You can also select with several index lists, one per dimension:

i = torch.tensor([1,2, 1, 2])
j = torch.tensor([1])
k = torch.tensor([1])
x[i,j,k], x[i,j,k].shape
# (tensor([ 7, 11,  7, 11]), torch.Size([4]))

# or
x[[0,0,2],[1],:]
# tensor([[ 2,  3],
#         [ 2,  3],
#         [10, 11]])

x[0:2,[1],:]
# tensor([[[2, 3]],

#         [[6, 7]]])

Operations

Broadcast

Tensors broadcast like NumPy arrays:

a = torch.ones((4,3)) * 6
b = torch.ones(3) * 2
a, b, a/b
# (tensor([[6., 6., 6.],
#          [6., 6., 6.],
#          [6., 6., 6.],
#          [6., 6., 6.]]),
#  tensor([2., 2., 2.]),
#  tensor([[3., 3., 3.],
#          [3., 3., 3.],
#          [3., 3., 3.],
#          [3., 3., 3.]]))

Statistics

You can also compute statistics.
Note that mean and std require a floating-point tensor:

import pprint as pp

m = torch.tensor([
    [1., 1.],
    [2., 2.],
    [3., 3.],
    [4., 4.]
])
print(m.shape)
pp.pprint('Mean: {}'.format(m.mean()))
pp.pprint('Mean in the 0th dimension: {}'.format(m.mean(0)))
pp.pprint('Standard deviation in the 0th dimension: {}'.format(m.std(0)))
# torch.Size([4, 2])
# 'Mean: 2.5'
# 'Mean in the 0th dimension: tensor([2.5000, 2.5000])'
# the dimension you average over disappears
# 'Standard deviation in the 0th dimension: tensor([1.2910, 1.2910])'

cat

The cat operation is also very useful:

a = torch.arange(24).reshape(3,4,2)
a_cat0 = torch.cat([a,a,a], dim = 0)
a_cat1 = torch.cat([a,a,a], dim = 1)

print('initial shape: {}'.format(a.shape))
print('Shape after concatenation in dimension 0 is: {}'.format(a_cat0.shape))
print('Shape after concatenation in dimension 1 is: {}'.format(a_cat1.shape))
# initial shape: torch.Size([3, 4, 2])
# Shape after concatenation in dimension 0 is: torch.Size([9, 4, 2])
# Shape after concatenation in dimension 1 is: torch.Size([3, 12, 2])
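
Concatenating along the last dimension works the same way (a quick check):

a_cat2 = torch.cat([a,a,a], dim = 2)
a_cat2.shape
# torch.Size([3, 4, 6])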

Autograd

Autograd is the essence of PyTorch.

x = torch.tensor([2. ])
print(x.requires_grad)
# False
# requires_grad defaults to False
x = torch.tensor([2. ], requires_grad = True)
print(x.grad)
# None
# before any backward pass, grad is None

An Example

x = torch.tensor([2. ], requires_grad = True)
y = x*x *3
y.backward()
pp.pprint(x.grad)
# tensor([12.])

Now run another computation on x:

z = x*x*3
z.backward()
pp.pprint(x.grad)
# tensor([24.])

Note: always remember to zero the gradients. x.grad is updated to be the sum of all gradients computed so far.

Thus we need to call zero_grad() in every training iteration; otherwise the gradients will keep building up.
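
Continuing the example, zeroing the gradient in place restores the single-pass value (a minimal sketch; x.grad.zero_() plays the role that optimizer.zero_grad() plays in a training loop):

x.grad.zero_()
z = x*x*3
z.backward()
pp.pprint(x.grad)
# tensor([12.])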

BTW, you will find many tutorials about Variable online, but it is no longer needed.

In earlier versions, a tensor had to be wrapped in a Variable before gradients could be computed in the backward pass. Since PyTorch 0.4, Variable and Tensor have been merged: a tensor no longer needs to be wrapped in a Variable to compute gradients, because tensors now carry all the properties Variable used to provide.
As the flag that enables autograd, requires_grad is now an attribute of Tensor, so as soon as any input tensor of an operation has requires_grad = True, autograd automatically tracks history and back-propagates.

Source: https://blog.csdn.net/weixin_44054487/article/details/92844571
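
A small sketch of that propagation rule:

a = torch.ones(2, requires_grad = True)
b = torch.ones(2)   # requires_grad defaults to False
c = (a + b).sum()
c.requires_grad
# True: one input of the addition requires grad, so the result does too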

Neural Network Module

The torch.nn module provides many predefined layers:

import torch.nn as nn
# there are many predefined blocks in torch.nn

Module Layers

An example:

input = torch.ones(2,3,4)
# create a linear layer transforming inputs of shape (N, *, H_in) to (N, *, H_out)
linear = nn.Linear(4, 2)
linear_output = linear(input)
linear_output, linear_output.shape
# (tensor([[[ 0.3793, -0.3961],
#           [ 0.3793, -0.3961],
#           [ 0.3793, -0.3961]],
 
#          [[ 0.3793, -0.3961],
#           [ 0.3793, -0.3961],
#           [ 0.3793, -0.3961]]], grad_fn=<AddBackward0>),
#  torch.Size([2, 3, 2]))

Besides Linear, there are many other layers, for example:
nn.Conv2d, nn.ConvTranspose2d, nn.BatchNorm1d, nn.BatchNorm2d, nn.Upsample, nn.MaxPool2d
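
For example, a convolution layer maps (N, C_in, H, W) inputs to (N, C_out, H_out, W_out); a minimal sketch with made-up sizes:

conv = nn.Conv2d(in_channels = 3, out_channels = 8, kernel_size = 3, padding = 1)
conv(torch.randn(1, 3, 32, 32)).shape
# torch.Size([1, 8, 32, 32])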

Activation Function Layer

nn.ReLU(), nn.Sigmoid(), nn.LeakyReLU()

linear_output
# tensor([[[ 0.3793, -0.3961],
#          [ 0.3793, -0.3961],
#          [ 0.3793, -0.3961]],

#         [[ 0.3793, -0.3961],
#          [ 0.3793, -0.3961],
#          [ 0.3793, -0.3961]]], grad_fn=<AddBackward0>)
sigmoid = nn.Sigmoid()
output = sigmoid(linear_output)
output
# tensor([[[0.5937, 0.4022],
#          [0.5937, 0.4022],
#          [0.5937, 0.4022]],

#         [[0.5937, 0.4022],
#          [0.5937, 0.4022],
#          [0.5937, 0.4022]]], grad_fn=<SigmoidBackward0>)

Note how grad_fn changes here.

Putting the Layers Together

block = nn.Sequential(nn.Linear(4,2), nn.Sigmoid())

input = torch.ones(2,3,4)
output = block(input)
output
# tensor([[[0.5822, 0.5445],
#          [0.5822, 0.5445],
#          [0.5822, 0.5445]],

#         [[0.5822, 0.5445],
#          [0.5822, 0.5445],
#          [0.5822, 0.5445]]], grad_fn=<SigmoidBackward0>)

Custom Modules

class MultilayerPerceptron(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(MultilayerPerceptron, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.model = nn.Sequential(
            nn.Linear(self.input_size, self.hidden_size),
            nn.ReLU(),
            nn.Linear(self.hidden_size, self.input_size),
            nn.Sigmoid()
        )
        
    def forward(self, x):
        output = self.model(x)
        return output

input = torch.randn(2, 5)
model = MultilayerPerceptron(5,3)
output = model(input)
output, output.shape
# (tensor([[0.6317, 0.5629, 0.4682, 0.6893, 0.4653],
#          [0.6339, 0.5788, 0.4785, 0.6718, 0.4759]], grad_fn=<SigmoidBackward0>),
#  torch.Size([2, 5]))

list(model.named_parameters())
# [('model.0.weight',
#   Parameter containing:
#   tensor([[ 0.3093,  0.4423,  0.2119, -0.2477,  0.1644],
#           [-0.0874,  0.2330, -0.2347, -0.4302, -0.2038],
#           [-0.0728,  0.3057, -0.3874, -0.0650, -0.3754]], requires_grad=True)),
#  ('model.0.bias',
#   Parameter containing:
#   tensor([-0.1116, -0.2941,  0.1612], requires_grad=True)),
#  ('model.2.weight',
#   Parameter containing:
#   tensor([[-0.0755, -0.1328,  0.0387],
#           [ 0.0355, -0.4333, -0.4412],
#           [ 0.1081, -0.3106, -0.2242],
#           [-0.4018,  0.5232,  0.5647],
#           [-0.1715, -0.3474, -0.1894]], requires_grad=True)),
#  ('model.2.bias',
#   Parameter containing:
#   tensor([0.5289, 0.5478, 0.0306, 0.4223, 0.0022], requires_grad=True))]

Optimization

First import the optimizer module:

import torch.optim as optim

Define a dummy input and target:

y = torch.ones(10, 5)
x = y + torch.randn_like(y)

Train using the custom nn.Module defined above:

model = MultilayerPerceptron(5,3)

adam = optim.Adam(model.parameters(), lr = 1e-1)

loss_function = nn.BCELoss()

y_predict = model(x)

loss_function(y_predict, y).item()
# 0.7197282910346985

n_epoch = 100
for epoch in range(n_epoch):
    adam.zero_grad()
    y_pred = model(x)
    loss = loss_function(y_pred, y)
    print(f"Epoch {epoch}: training loss: {loss}")
    loss.backward()
    adam.step()
# Epoch 0: training loss: 0.7025878429412842
# Epoch 1: training loss: 0.6235182285308838
# Epoch 2: training loss: 0.5331209897994995
# Epoch 3: training loss: 0.42057228088378906
# Epoch 4: training loss: 0.30540353059768677
# Epoch 5: training loss: 0.20316310226917267
# ...
# Epoch 95: training loss: 3.5762795391747204e-08
# Epoch 96: training loss: 3.5762795391747204e-08
# Epoch 97: training loss: 3.5762795391747204e-08
# Epoch 98: training loss: 3.5762795391747204e-08
# Epoch 99: training loss: 3.5762795391747204e-08

y_pred = model(x)
y_pred, loss_function(y_pred, y).item()
# (tensor([[1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
#          [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
#          [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
#          [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
#          [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
#          [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
#          [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
#          [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
#          [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
#          [1.0000, 1.0000, 1.0000, 1.0000, 1.0000]], grad_fn=<SigmoidBackward0>),
#  3.5762795391747204e-08)

x2 = y + torch.randn_like(y)
y_pred = model(x2)
y_pred, loss_function(y_pred, y).item()
# (tensor([[1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
#          [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
#          [0.9998, 0.9999, 1.0000, 0.9997, 0.9999],
#          [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
#          [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
#          [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
#          [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
#          [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
#          [1.0000, 1.0000, 1.0000, 0.9999, 1.0000],
#          [1.0000, 1.0000, 1.0000, 1.0000, 1.0000]], grad_fn=<SigmoidBackward0>),
#  1.5589259419357404e-05)

Demo: Word Window Classification

Data

corpus = [
    'We always come to Paris',
    'The professor, is from Australia',
    'I live in Stanford',
    'He comes from Taiwan',
    'The capital of Turkey is Ankara'
]

Preprocessing

Clean the data and add the labels:

def preprocess_sentence(sentence):
    return sentence.lower().split()

train_sentences = [preprocess_sentence(sent) for sent in corpus]
train_sentences
# [['we', 'always', 'come', 'to', 'paris'],
#  ['the', 'professor,', 'is', 'from', 'australia'],
#  ['i', 'live', 'in', 'stanford'],
#  ['he', 'comes', 'from', 'taiwan'],
#  ['the', 'capital', 'of', 'turkey', 'is', 'ankara']]
location = set(['australia', 'ankara', 'paris', 'stanford', 'taiwan', 'turkey'])
train_labels = [[1 if word in location else 0 for word in sent] for sent in train_sentences]
train_labels
# [[0, 0, 0, 0, 1],
#  [0, 0, 0, 0, 1],
#  [0, 0, 0, 1],
#  [0, 0, 0, 1],
#  [0, 0, 0, 1, 0, 1]]

Converting Words to Embeddings

vocabulary = set(w for s in train_sentences for w in s)
vocabulary.add('<pad>')
vocabulary.add('<unk>')

def pad_window(sentence, window_size, pad_token = '<pad>'):
    window = [pad_token] * window_size
    return window + sentence + window

window_size = 2
pad_window(train_sentences[0], window_size)
# ['<pad>', '<pad>', 'we', 'always', 'come', 'to', 'paris', '<pad>', '<pad>']

ix2word = sorted(list(vocabulary))
word2ix = {word: ix for ix, word in enumerate(ix2word)}
word2ix
# {'<pad>': 0,
#  '<unk>': 1,
#  'always': 2,
#  'ankara': 3,
#  'australia': 4,
#  'capital': 5,
#  'come': 6,
#  'comes': 7,
#  'from': 8,
#  'he': 9,
#  'i': 10,
#  'in': 11,
#  'is': 12,
#  'live': 13,
#  'of': 14,
#  'paris': 15,
#  'professor,': 16,
#  'stanford': 17,
#  'taiwan': 18,
#  'the': 19,
#  'to': 20,
#  'turkey': 21,
#  'we': 22}

def convert_token_to_indices(sentence, word2ix):
    return [word2ix.get(token, word2ix['<unk>']) for token in sentence]

example_sentence = ['we', 'always', 'come', 'to', 'kuwait']
example_indices = convert_token_to_indices(example_sentence, word2ix)
restored_example = [ix2word[ind] for ind in example_indices]

print(example_sentence, example_indices, restored_example)
# ['we', 'always', 'come', 'to', 'kuwait'] 
# [22, 2, 6, 20, 1] 
# ['we', 'always', 'come', 'to', '<unk>']

example_padded_indices = [convert_token_to_indices(s, word2ix) for s in train_sentences]
example_padded_indices
# [[22, 2, 6, 20, 15],
#  [19, 16, 12, 8, 4],
#  [10, 13, 11, 17],
#  [9, 7, 8, 18],
#  [19, 5, 14, 21, 12, 3]]

embedding_dim = 5
embeds = nn.Embedding(len(vocabulary), embedding_dim)

# an example
index_paris = word2ix['paris']
index_ankara = word2ix['ankara']
indices = [index_paris, index_ankara]
indices_tensor = torch.tensor(indices, dtype=torch.long)
embeddings = embeds(indices_tensor)
embeddings
# tensor([[ 0.0622,  0.5321,  0.3486, -0.8931, -1.4741],
#         [ 0.3636, -0.7360,  1.4412,  0.0530, -0.8008]],
#        grad_fn=<EmbeddingBackward0>)

Batching Sentences

First, write a collate_fn function:

def _custom_collate_fn(batch, window_size, word2ix):
    # separate the sentences from the labels
    x, y = zip(*batch)
    x = [pad_window(s, window_size = window_size) for s in x]
    x = [convert_token_to_indices(s, word2ix) for s in x]
    
    # pad every sentence in the batch to the length of the longest one
    pad_token = word2ix['<pad>']
    x = [torch.LongTensor(x_i) for x_i in x]
    x_padded = nn.utils.rnn.pad_sequence(x, batch_first=True, padding_value = pad_token)
    
    # keep the original (unpadded) lengths, then pad the labels too
    lengths = [len(label) for label in y]
    lengths = torch.LongTensor(lengths)
    y = [torch.LongTensor(y_i) for y_i in y]
    y_padded = nn.utils.rnn.pad_sequence(y, batch_first=True, padding_value = 0)
    
    return x_padded, y_padded, lengths

For the differences among pad_packed_sequence, pack_padded_sequence, pack_sequence, and pad_sequence, see the torch.nn.utils.rnn documentation; a small sketch follows below.
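
A minimal sketch of the pad-then-pack round trip (the toy sequences here are made up):

from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

seqs = [torch.tensor([1, 2, 3]), torch.tensor([4, 5])]
padded = pad_sequence(seqs, batch_first = True, padding_value = 0)
# tensor([[1, 2, 3],
#         [4, 5, 0]])
packed = pack_padded_sequence(padded, lengths = [3, 2], batch_first = True, enforce_sorted = False)
unpacked, lengths = pad_packed_sequence(packed, batch_first = True)
# unpacked equals padded; lengths is tensor([3, 2])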

What does the loader actually load? Note the use of partial below: it simply returns the function with some of its arguments already fixed.

from torch.utils.data import DataLoader
from functools import partial

data = list(zip(train_sentences, train_labels))
batch_size = 2
shuffle = True
window_size = 2
collate_fn = partial(_custom_collate_fn, window_size = window_size, word2ix = word2ix)
loader = DataLoader(data, batch_size = batch_size, shuffle= shuffle, collate_fn = collate_fn)
counter = 0
for batched_x, batched_y, batched_lengths in loader:
    
    print(f'Iteration {counter}')
    print('Batched Input: ')
    print(batched_x)
    print('Batched Labels: ')
    print(batched_y)
    print('Batched Lengths')
    print(batched_lengths)
    counter+=1
# Iteration 0
# Batched Input: 
# tensor([[ 0,  0, 19, 16, 12,  8,  4,  0,  0,  0],
#         [ 0,  0, 19,  5, 14, 21, 12,  3,  0,  0]])
# Batched Labels: 
# tensor([[0, 0, 0, 0, 1, 0],
#         [0, 0, 0, 1, 0, 1]])
# Batched Lengths
# tensor([5, 6])
# Iteration 1
# Batched Input: 
# tensor([[ 0,  0, 22,  2,  6, 20, 15,  0,  0],
#         [ 0,  0,  9,  7,  8, 18,  0,  0,  0]])
# Batched Labels: 
# tensor([[0, 0, 0, 0, 1],
#         [0, 0, 0, 1, 0]])
# Batched Lengths
# tensor([5, 4])
# Iteration 2
# Batched Input: 
# tensor([[ 0,  0, 10, 13, 11, 17,  0,  0]])
# Batched Labels: 
# tensor([[0, 0, 0, 1]])
# Batched Lengths
# tensor([4])

Next, use the unfold function to obtain the windows. From the PyTorch docs:
Tensor.unfold(dimension, size, step) → Tensor
Returns a view of the original tensor which contains all slices of size size from self tensor in the dimension dimension.

Parameters
dimension (int) – dimension in which unfolding happens
size (int) – the size of each slice that is unfolded
step (int) – the step between each slice

print(f'Original Tensor: ')
print(batched_x)
print("")
chunk = batched_x.unfold(1, window_size*2 + 1, 1)
print(f"Windows: ")
print(chunk)
# Original Tensor: 
# tensor([[ 0,  0, 10, 13, 11, 17,  0,  0]])

# Windows: 
# tensor([[[ 0,  0, 10, 13, 11],
#          [ 0, 10, 13, 11, 17],
#          [10, 13, 11, 17,  0],
#          [13, 11, 17,  0,  0]]])

Model

class WordWindowClassifier(nn.Module):
    def __init__(self, hyperparameters, vocab_size, pad_ix = 0):
        super(WordWindowClassifier, self).__init__()
        
        self.window_size = hyperparameters['window_size']
        self.embed_dim = hyperparameters['embed_dim']
        self.hidden_dim = hyperparameters['hidden_dim']
        self.freeze_embeddings = hyperparameters['freeze_embeddings']
        
        self.embeds = nn.Embedding(vocab_size, self.embed_dim, padding_idx = pad_ix)
        if self.freeze_embeddings:
            self.embeds.weight.requires_grad = False
            
        full_window_size = 2 * self.window_size + 1
        self.hidden_layer = nn.Sequential(
            nn.Linear(full_window_size * self.embed_dim, self.hidden_dim),
            nn.Tanh()
        )
        
        self.output_layer = nn.Linear(self.hidden_dim, 1)
        
        self.probabilities = nn.Sigmoid()
        
    def forward(self, inputs):
        '''
        B: batch_size
        L: window_padded sentence length
        D: self.embed_dim
        S: self.window_size
        H: self.hidden_dim
        '''
        
        B,L = inputs.size()
        token_windows = inputs.unfold(1, 2*self.window_size + 1, 1)
        _, adjusted_length, _ = token_windows.size()
        
        assert token_windows.size() == (B, adjusted_length, 2*self.window_size + 1)
        #(B, L~, S)
        
        embedded_windows = self.embeds(token_windows)
        #(B,L~,S,D)
        
        embedded_windows = embedded_windows.view(B, adjusted_length, -1)
        #(B,L~,S*D)
        
        layer_1 = self.hidden_layer(embedded_windows)
        #(B,L~, H)
        
        output = self.output_layer(layer_1)
        
        output = self.probabilities(output)
        #(B, L~, 1)
        
        output = output.view(B, -1)
        #(B, L~)
        return output

Training

data = list(zip(train_sentences, train_labels))
batch_size = 2
shuffle = True
window_size = 2
collate_fn = partial(_custom_collate_fn, window_size= window_size, word2ix = word2ix)
loader = DataLoader(data, batch_size = batch_size, shuffle = shuffle, collate_fn= collate_fn)

model_hyperparameters = {
    'batch_size': 4,
    'window_size': 2,
    'embed_dim': 25,
    'hidden_dim': 25,
    'freeze_embeddings': False
}

vocab_size = len(word2ix)
model = WordWindowClassifier(model_hyperparameters, vocab_size)

learning_rate = 0.01
optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate)

def loss_function(batch_outputs, batch_labels, batch_lengths):
    bceloss = nn.BCELoss()  # don't forget the parentheses: instantiate the loss module first
    loss = bceloss(batch_outputs, batch_labels.float())
    loss = loss/batch_lengths.sum().float()
    return loss

def train_epoch(loss_function, optimizer, model, loader):
    total_loss = 0
    for batch_inputs, batch_labels, batch_lengths in loader:
        optimizer.zero_grad()
        outputs = model(batch_inputs)
        loss =  loss_function(outputs, batch_labels, batch_lengths)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss
    
def train(loss_function, optimizer, model, loader, num_epochs = 10000):
    for epoch in range(num_epochs):
        epoch_loss = train_epoch(loss_function, optimizer, model, loader)
        if epoch % 100 == 0: print(epoch_loss)
       
num_epochs = 1000
train(loss_function, optimizer, model, loader, num_epochs = num_epochs)
# 0.2615063562989235
# 0.2215040773153305
# 0.17036793380975723
# 0.15041063725948334
# 0.12963269650936127
# 0.08397780545055866
# 0.07003378123044968
# 0.05545256659388542
# 0.04804721102118492
# 0.03628341108560562

Prediction

Generate the test loader in the same way:

test_corpus = ['She comes from Paris']
test_sentences = [s.lower().split() for s in test_corpus]
test_labels = [(0,0,0,1)]

test_data = list(zip(test_sentences, test_labels))
batch_size = 1
shuffle = False
window_size = 2
collate_fn = partial(_custom_collate_fn, window_size = window_size, word2ix = word2ix)
test_loader = torch.utils.data.DataLoader(test_data, batch_size = batch_size, shuffle = shuffle, collate_fn  = collate_fn)

Check the predictions:

for test_instances, labels, _ in test_loader:
    outputs = model(test_instances)
    print(labels)
    print(outputs)
# tensor([[0, 0, 0, 1]])
# tensor([[0.0859, 0.1915, 0.0471, 0.9349]], grad_fn=<SigmoidBackward0>)
