The blog has been sitting idle for a long time, and I'm planning to start writing again.
Lately I've been busy with my graduation project, which uses deep learning for Weibo sentiment analysis; in our study, posts are classified into 5 sentiment classes. I spent a long time agonizing over the choice of deep learning toolkit, going back and forth between Theano, DeepLearnToolbox, Torch and Deeplearning4j, and in the end settled on Torch7. The reasons will have to wait for another post; suffice it to say the decision was not an easy one.
Besides the standard nn package, Torch7 also provides the dp package for deep learning, and this post uses dp throughout. It handles classification of fixed-length (12-word) text; the code for variable-length text is still being written.
For data preparation, I first used the word2vec tool to learn an embedding vector of length 100 for every word in the segmented text, then picked out the sentences that are exactly 12 words long and sampled them into training, validation and test sets, so that each sentence becomes a 1200-dimensional vector. Each set consists of five files, one per class.
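The preprocessing that produces those text files is not shown here. As a rough sketch only (assuming a hypothetical vectors table that maps each word to its 100-dimensional word2vec embedding), a 12-word sentence is flattened into a single 1200-number line roughly like this:
-- sketch only: `vectors` is a hypothetical word -> 100-dim embedding table built from the word2vec output
local function sentenceToLine(words, vectors)
   assert(#words == 12, "only 12-word sentences are kept")
   local fields = {}
   for _, w in ipairs(words) do
      for i = 1, 100 do
         table.insert(fields, vectors[w][i])
      end
   end
   return table.concat(fields, " ") -- one 1200-number line per sentence
end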
When reading the files, I reshape each 1×1200 vector into a 12×100 matrix. prepareData.lua:
-- prepareData.lua: pack the word2vec text files into train/valid/test tensors
for _, dataset_name in ipairs({"train", "valid", "test"}) do
   local datas = nil
   local classes = nil
   local path_prefix = os.getenv('HOME').."/data/weibo/"
   local th_output_prefix = os.getenv('HOME').."/workspace/torch7/"
   local path_surfix = ".txt"
   for _, index in ipairs({0, 1, 2, 3, 4}) do
      local data_n = {}
      local file = io.open(path_prefix..dataset_name..index..path_surfix, 'r')
      for line in file:lines() do
         local line_vector = {}
         for element in string.gmatch(line, "%S+") do
            table.insert(line_vector, tonumber(element)) -- parse each field as a number
         end
         table.insert(data_n, line_vector)
      end
      file:close()
      local data_tensor_n = torch.Tensor(data_n)
      -- reshape each 1x1200 row into 12 words x 100-dim embedding
      data_tensor_n = data_tensor_n:resize(data_tensor_n:size(1), data_tensor_n:size(2)/100, 100)
      local classes_tensor_n = torch.Tensor(data_tensor_n:size(1)):fill(index)
      print(data_tensor_n:size())
      print(classes_tensor_n:size())
      datas = datas and torch.cat(datas, data_tensor_n, 1) or data_tensor_n
      classes = classes and torch.cat(classes, classes_tensor_n, 1) or classes_tensor_n
   end
   classes = classes:int()
   print(datas:size())
   print(classes:size())
   local data_object = {datas, classes}
   torch.save(th_output_prefix..dataset_name..'.th7', data_object)
end
This produces three data files, named train.th7, valid.th7 and test.th7.
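As a quick sanity check (not part of the original scripts), each saved file can be reloaded in the th interpreter to confirm the shapes:
-- run from the directory containing the .th7 files
local data = torch.load('train.th7')
print(data[1]:size()) -- expected: nSamples x 12 x 100
print(data[2]:size()) -- expected: nSamples class labels in 0-4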
Writing the datasource
This was adapted from the MNIST datasource. Note that the input we need is a SequenceView, that is, a view that can be fed into a 1D convolution. In a SequenceView the axes 'bwc' stand for batch size, sentence length (number of words), and embedding vector size, respectively.
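For reference, wrapping a batch in a SequenceView is a one-liner; this minimal sketch (with random data standing in for real sentences) mirrors the calls made in createDataSet below:
-- 32 sentences, 12 words each, 100-dim embeddings -> axes 'bwc'
local view = dp.SequenceView()
view:forward('bwc', torch.randn(32, 12, 100))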
local Weibo, DataSource = torch.class("dp.Weibo", "dp.DataSource")
Weibo.isWeibo = true
Weibo._name = 'weibo'
Weibo._text_axes = 'bwc'
Weibo._classes = {0, 1, 2, 3, 4, }
function Weibo:__init(config)
config = config or {}
assert(torch.type(config) == 'table' and not config[1],
"Constructor requires key-value arguments")
local args, load_all, input_preprocess, target_preprocess
-- note: this assignment order must match the order of the argument descriptors below
args, self._valid_ratio, self._train_file, self._valid_file, self._test_file,
   self._data_path, self._scale, self._binarize, self._shuffle, load_all,
   input_preprocess, target_preprocess
   = xlua.unpack(
   {config},
   'Weibo',
   'Five-class Weibo sentiment classification problem.',
{arg='valid_ratio', type='number', default=1/6,
help='proportion of training set to use for cross-validation.'},
{arg='train_file', type='string', default='train.th7',
help='name of training file'},
{arg='valid_file', type='string', default='valid.th7',
help='name of valid file'},
{arg='test_file', type='string', default='test.th7',
help='name of test file'},
{arg='data_path', type='string', default=dp.DATA_DIR,
help='path to data repository'},
{arg='scale', type='table',
help='bounds to scale the values between. [Default={0,1}]'},
{arg='binarize', type='boolean',
help='binarize the inputs (0s and 1s)', default=false},
{arg='shuffle', type='boolean',
help='shuffle different sets', default=false},
{arg='load_all', type='boolean',
help='Load all datasets : train, valid, test.', default=true},
{arg='input_preprocess', type='table | dp.Preprocess',
help='to be performed on set inputs, measuring statistics ' ..
'(fitting) on the train_set only, and reusing these to ' ..
'preprocess the valid_set and test_set.'},
{arg='target_preprocess', type='table | dp.Preprocess',
help='to be performed on set targets, measuring statistics ' ..
'(fitting) on the train_set only, and reusing these to ' ..
'preprocess the valid_set and test_set.'}
)
self:loadTrain()
self:loadValid()
self:loadTest()
DataSource.__init(self, {
train_set=self:trainSet(), valid_set=self:validSet(),
test_set=self:testSet(), input_preprocess=input_preprocess,
target_preprocess=target_preprocess
})
end
function Weibo:loadTrain()
local train_data = self:loadData(self._train_file)
self:setTrainSet(
self:createDataSet(train_data[1], train_data[2], 'train')
)
return self:trainSet()
end
function Weibo:loadValid()
local valid_data = self:loadData(self._valid_file)
self:setValidSet(
self:createDataSet(valid_data[1], valid_data[2], 'valid')
)
return self:validSet()
end
function Weibo:loadTest()
local test_data = self:loadData(self._test_file)
self:setTestSet(
self:createDataSet(test_data[1], test_data[2], 'test')
)
return self:testSet()
end
function Weibo:createDataSet(inputs, targets, which_set)
if self._shuffle then
local indices = torch.randperm(inputs:size(1)):long()
inputs = inputs:index(1, indices)
targets = targets:index(1, indices)
end
if self._binarize then
DataSource.binarize(inputs, 128)
end
-- class 0 will have index 1, class 1 index 2, and so on.
targets:add(1)
-- construct inputs and targets dp.Views
local input_v, target_v = dp.SequenceView(), dp.ClassView()
input_v:forward(self._text_axes, inputs)
target_v:forward('b', targets)
target_v:setClasses(self._classes)
-- construct dataset
local dataset = dp.DataSet{inputs=input_v, targets=target_v, which_set=which_set}
--print(dataset)
return dataset
end
function Weibo:loadData(file_name)
   -- the .th7 files written by prepareData.lua are expected one directory up from where the experiment is run
   local path = "../"..file_name
print(file_name)
-- backwards compatible with old binary format
local status, data = pcall(function() return torch.load(path) end)
if not status then
return torch.load(path, "binary")
end
return data
end
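Assuming the class above is saved as weiboSource.lua (the file required by the experiment script below), the datasource can be tried on its own, e.g.:
require 'dp'
require 'weiboSource'
local ds = dp.Weibo()
print(ds:trainSet():nSample()) -- number of training sentences
print(ds:classes())            -- {0, 1, 2, 3, 4}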
Writing the experiment code
The model is a CNN with three layers: the first is a 1D convolution, and the second and third are ordinary fully-connected (Neural) layers.
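Before wiring things into dp, it is worth checking the shape produced by the convolution stage. The following standalone sketch with plain nn modules (not the dp model itself) shows that a 12×100 sentence comes out as 5 frames of 50 features, i.e. 250 inputs for the next fully-connected layer, which is what the inputSize computation below relies on:
require 'nn'
-- kernel 2 / stride 1: 12 -> 11 frames; max pooling 2 / stride 2: 11 -> 5 frames
local m = nn.Sequential()
m:add(nn.TemporalConvolution(100, 50, 2, 1))
m:add(nn.Tanh())
m:add(nn.TemporalMaxPooling(2, 2))
print(m:forward(torch.randn(12, 100)):size()) -- 5 x 50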
require 'dp'
require 'weiboSource'
--[[hyperparameters]]--
opt = {
nHidden = 100, --number of hidden units
learningRate = 0.1, --training learning rate
momentum = 0.9, --momentum factor to use for training
maxOutNorm = 1, --maximum norm allowed for output neuron weights
batchSize = 128, --number of examples per mini-batch
maxTries = 100, --maximum number of epochs without reduction in validation error.
maxEpoch = 1000, --maximum number of epochs of training
   cuda = false,
   useDevice = 1,
   inputEmbeddingSize = 100, --dimension of each word embedding
   outputEmbeddingSize = 100,
   convOutputSize = 50,
   convKernelSize = 2,
   convKernelStride = 1,
   convPoolSize = 2,
   convPoolStride = 2,
   sentenceLength = 12, --number of words per sentence (fixed-length input)
   decayPoint = 100, --epoch at which learning rate is decayed
   decayFactor = 0.1, --factor by which learning rate is decayed at decay point
}
local datasource=dp.Weibo()
inputModel = dp.Convolution1D{
input_size = opt.inputEmbeddingSize,
output_size = opt.convOutputSize,
kernel_size = opt.convKernelSize,
kernel_stride = opt.convKernelStride,
pool_size = opt.convPoolSize,
pool_stride = opt.convPoolStride,
transfer = nn.Tanh(),
dropout = opt.dropout and nn.Dropout() or nil,
acc_update = opt.accUpdate
}
local nOutputFrame = inputModel:outputSize(opt.sentenceLength, 'bwc') -- frames remaining after convolution + pooling over a 12-word sentence
dp.vprint(not opt.silent, "Convolution has "..nOutputFrame.." output Frames")
inputSize = nOutputFrame*opt.convOutputSize
softmax = dp.Neural{
input_size = opt.outputEmbeddingSize,
output_size = table.length(datasource:classes()),
transfer = nn.LogSoftMax(),
dropout = opt.dropout and nn.Dropout() or nil,
acc_update = opt.accUpdate
}
mlp = dp.Sequential{
   models = {
      inputModel,
      dp.Neural{
         input_size = inputSize,
         output_size = opt.outputEmbeddingSize,
         transfer = nn.Tanh(),
         dropout = opt.dropout and nn.Dropout() or nil,
         acc_update = opt.accUpdate
      },
      softmax -- output layer defined above (LogSoftMax over the 5 classes)
   }
}
--[[Propagators]]--
train = dp.Optimizer{
loss = opt.softmaxtree and dp.TreeNLL() or dp.NLL(),
visitor = {
dp.Learn{
learning_rate = opt.learningRate,
observer = dp.LearningRateSchedule{
schedule = {[opt.decayPoint]=opt.learningRate*opt.decayFactor}
}
},
dp.MaxNorm{max_out_norm=opt.maxOutNorm, period=opt.maxNormPeriod}
},
feedback = dp.Perplexity(),
sampler = dp.Sampler{ --shuffle sample takes too much mem
epoch_size = opt.trainEpochSize, batch_size = opt.batchSize
},
progress = opt.progress
}
valid = dp.Evaluator{
loss = opt.softmaxtree and dp.TreeNLL() or dp.NLL(),
feedback = dp.Perplexity(),
sampler = dp.Sampler{
epoch_size = opt.validEpochSize,
batch_size = opt.softmaxtree and 1024 or opt.batchSize
},
progress = opt.progress
}
tester = dp.Evaluator{
loss = opt.softmaxtree and dp.TreeNLL() or dp.NLL(),
feedback = dp.Perplexity(),
sampler = dp.Sampler{batch_size = opt.softmaxtree and 1024 or opt.batchSize}
}
--[[Experiment]]--
xp = dp.Experiment{
model = mlp,
optimizer = train,
validator = valid,
tester = tester,
observer = (not opt.trainOnly) and {
dp.FileLogger(),
dp.EarlyStopper{max_epochs = opt.maxTries}
} or nil,
random_seed = os.time(),
max_epoch = opt.maxEpoch
}
--[[GPU or CPU]]--
if opt.cuda then
require 'cutorch'
require 'cunn'
if opt.softmaxtree or opt.softmaxforest then
require 'cunnx'
end
cutorch.setDevice(opt.useDevice)
xp:cuda()
end
print"dp.Models :"
print(mlp)
print"nn.Modules :"
-- convert the dp model into an nn module (a small batch is needed to infer sizes) and print it
print(mlp:toModule(datasource:trainSet():sub(1,32)))
xp:run(datasource)
Results
On the test set the 5-class accuracy reached over 70%, which came as a very pleasant surprise.
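The script above only reports perplexity, so for completeness here is one way the 5-class test accuracy could be computed after training. This is a rough post-hoc sketch, not part of the original experiment: it assumes the nn module exported by mlp:toModule accepts batches of shape batchSize × 12 × 100 and reuses the test.th7 file written by prepareData.lua.
require 'optim'
local module = mlp:toModule(datasource:trainSet():sub(1, 32)) -- export the trained model as an nn module
local test = torch.load(os.getenv('HOME').."/workspace/torch7/test.th7")
local inputs, targets = test[1], test[2]
local confusion = optim.ConfusionMatrix(5)
for i = 1, inputs:size(1), 128 do
   local j = math.min(i + 127, inputs:size(1))
   local output = module:forward(inputs:sub(i, j))
   confusion:batchAdd(output, targets:sub(i, j):double():add(1)) -- stored labels are 0-4, ConfusionMatrix wants 1-5
end
confusion:updateValids()
print(confusion.totalValid) -- fraction of correct predictions on the test set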