使用torch7进行文本分类(一)

博客荒废了很久,准备重新开始写起来。

最近一直在忙毕设的事情,准备用深度学习的方法进行微博情感分析。在我们的研究中,将使用5分类的方法对微博进行分类。之前纠结深度学习工具的选择,先后在theano、deeplearningtoolbox、torch和deeplearning4j之间纠结了很久,选来选去最终还是选了torch7。具体原因先按下不表,过程很纠结就是了。

torch7除了标准的nn包之外,还提供了dp包来进行深度学习。本文就利用dp包进行编程,处理固定长度(12词)的文本分类问题。不同长度的代码还在编写中。

  • 数据准备

数据准备方面,首先用word2vec工具把分词后的文本学习成embedded vector,每个词向量长度为100;然后把词数为12的句子挑选出来进行采样,制作成训练集、验证集和测试集,相当于每个句子是一个1200维的向量。每个数据集各有五个文件,分别对应5个分类。

在读取文件的时候,我把1×1200维的向量reshape成了12×100的矩阵。代码如下(prepareData.lua):

-- prepareData.lua
-- Converts the per-class text files into one Torch file per split.
--
-- Input : $HOME/data/weibo/<split><class>.txt for <split> in
--         train/valid/test and <class> in 0..4. Every non-blank line holds
--         the whitespace-separated numbers of one sentence.
-- Output: $HOME/workspace/torch7/<split>.th7 holding {inputs, targets}.
--         inputs is resized to (nSentence x lineWidth/100 x 100); targets is
--         an IntTensor with the class index (0..4) of each sentence.
local EMBEDDING_SIZE = 100
local path_prefix = os.getenv('HOME').."/data/weibo/"
local th_output_prefix = os.getenv('HOME').."/workspace/torch7/"
local path_surfix = ".txt"

for _, dataset_name in ipairs({"train","valid","test"}) do
  local datas, classes = nil, nil
  for _, index in ipairs({0,1,2,3,4}) do
    local path = path_prefix..dataset_name..index..path_surfix
    -- Fail with the offending path rather than "attempt to index a nil value".
    local file = assert(io.open(path, 'r'))
    local data_n = {}
    for line in file:lines() do
      local line_vector = {}
      for element in string.gmatch(line, "%S+") do
        -- Convert explicitly so a malformed token is reported here, with
        -- its file, instead of surfacing later inside torch.Tensor().
        local value = tonumber(element)
        assert(value, "non-numeric token '"..element.."' in "..path)
        line_vector[#line_vector + 1] = value
      end
      -- A blank line would make the table ragged, so skip it.
      if #line_vector > 0 then
        data_n[#data_n + 1] = line_vector
      end
    end
    -- The original never closed the handle; 15 files were left open.
    file:close()
    assert(#data_n > 0, "no samples found in "..path)

    local data_tensor_n = torch.Tensor(data_n)
    local n_sample, width = data_tensor_n:size(1), data_tensor_n:size(2)
    assert(width % EMBEDDING_SIZE == 0,
      "line width "..width.." is not a multiple of "..EMBEDDING_SIZE.." in "..path)
    -- One row of width values becomes (width/100) word vectors of length 100.
    data_tensor_n = data_tensor_n:resize(n_sample, width / EMBEDDING_SIZE, EMBEDDING_SIZE)
    local classes_tensor_n = torch.Tensor(n_sample):fill(index)
    print(data_tensor_n:size())
    print(classes_tensor_n:size())
    -- Append this class to what has been collected for the split so far.
    datas = datas and torch.cat(datas, data_tensor_n, 1) or data_tensor_n
    classes = classes and torch.cat(classes, classes_tensor_n, 1) or classes_tensor_n
  end
  classes = classes:int()
  print(datas:size())
  print(classes:size())
  torch.save(th_output_prefix..dataset_name..'.th7', {datas, classes})
end

制作3个数据文件,分别取名为train.th7,valid.th7和test.th7。

  • datasource编写

    利用mnist的datasource改的。注意,我们需要的输入是一个SequenceView,也就是可以用来做1维卷积的View。SequenceView中的bwc分别代表“batch大小”,“句子长度”和“embedVector的大小”

-- dp.Weibo: a dp.DataSource subclass serving the weibo text data.
-- `DataSource` is the parent class table returned by torch.class.
local Weibo, DataSource = torch.class("dp.Weibo", "dp.DataSource")
-- NOTE(review): this flag name looks carried over from the MNIST datasource
-- this class was adapted from — confirm whether anything reads it.
Weibo.isMnist = true
Weibo._name = 'weibo'
-- Axes string handed to SequenceView:forward() in createDataSet.
Weibo._text_axes = 'bwc'
-- Class labels as stored in the data files (createDataSet adds 1 to the
-- targets before wrapping them in a ClassView).
Weibo._classes = {0, 1, 2, 3, 4, }
--- Constructs the Weibo datasource.
-- @param config table of key-value arguments (all optional):
--   valid_ratio, train_file, valid_file, test_file, data_path, scale,
--   binarize, shuffle, load_all, input_preprocess, target_preprocess.
--   See the argument specs below for types, defaults and help texts.
-- Raises an error when config is not a key-value table.
function Weibo:__init(config)
	config = config or {}
	assert(torch.type(config) == 'table' and not config[1],
		"Constructor requires key-value arguments")
	local args, load_all, input_preprocess, target_preprocess
	-- xlua.unpack returns the parsed values in the order the arguments are
	-- declared below, so this list must follow that order exactly. The
	-- original had _test_file before _valid_file, which made the validation
	-- set load test.th7 and the test set load valid.th7.
	args, self._valid_ratio, self._train_file, self._valid_file, self._test_file,
			self._data_path, self._scale, self._binarize, self._shuffle, load_all, input_preprocess,
			target_preprocess
		= xlua.unpack(
		{config},
		'Weibo',
		'Weibo sentiment classification problem (5 classes).',
		{arg='valid_ratio', type='number', default=1/6,
		 help='proportion of training set to use for cross-validation.'},
		{arg='train_file', type='string', default='train.th7',
		 help='name of training file'},
		{arg='valid_file', type='string', default='valid.th7',
		 help='name of valid file'},
		{arg='test_file', type='string', default='test.th7',
		 help='name of test file'},
		{arg='data_path', type='string', default=dp.DATA_DIR,
		 help='path to data repository'},
		{arg='scale', type='table',
		 help='bounds to scale the values between. [Default={0,1}]'},
		{arg='binarize', type='boolean',
		 help='binarize the inputs (0s and 1s)', default=false},
		{arg='shuffle', type='boolean',
		 help='shuffle different sets', default=false},
		{arg='load_all', type='boolean',
		 help='Load all datasets : train, valid, test.', default=true},
		{arg='input_preprocess', type='table | dp.Preprocess',
		 help='to be performed on set inputs, measuring statistics ' ..
		 '(fitting) on the train_set only, and reusing these to ' ..
		 'preprocess the valid_set and test_set.'},
		{arg='target_preprocess', type='table | dp.Preprocess',
		 help='to be performed on set targets, measuring statistics ' ..
		 '(fitting) on the train_set only, and reusing these to ' ..
		 'preprocess the valid_set and test_set.'}
	)
	-- Honour load_all (default true, so existing callers are unaffected);
	-- the original parsed the flag but always loaded every split.
	if load_all then
		self:loadTrain()
		self:loadValid()
		self:loadTest()
	end
	DataSource.__init(self, {
		train_set=self:trainSet(), valid_set=self:validSet(),
		test_set=self:testSet(), input_preprocess=input_preprocess,
		target_preprocess=target_preprocess
	})
end
--- Loads the training split and installs it as this datasource's train set.
-- The file named by self._train_file is read through loadData; the loaded
-- table is indexed at [1] for the inputs and [2] for the targets.
-- @return the value of self:trainSet() after the set has been installed
function Weibo:loadTrain()
	local loaded = self:loadData(self._train_file)
	local inputs, targets = loaded[1], loaded[2]
	local train_set = self:createDataSet(inputs, targets, 'train')
	self:setTrainSet(train_set)
	return self:trainSet()
end
--- Loads the validation split and installs it as this datasource's valid set.
-- The file named by self._valid_file is read through loadData; the loaded
-- table is indexed at [1] for the inputs and [2] for the targets.
-- @return the value of self:validSet() after the set has been installed
function Weibo:loadValid()
	local loaded = self:loadData(self._valid_file)
	local inputs, targets = loaded[1], loaded[2]
	local valid_set = self:createDataSet(inputs, targets, 'valid')
	self:setValidSet(valid_set)
	return self:validSet()
end
--- Loads the test split and installs it as this datasource's test set.
-- The file named by self._test_file is read through loadData; the loaded
-- table is indexed at [1] for the inputs and [2] for the targets.
-- @return the value of self:testSet() after the set has been installed
function Weibo:loadTest()
	local loaded = self:loadData(self._test_file)
	local inputs, targets = loaded[1], loaded[2]
	local test_set = self:createDataSet(inputs, targets, 'test')
	self:setTestSet(test_set)
	return self:testSet()
end
--- Wraps raw input/target tensors into a dp.DataSet.
-- @param inputs tensor forwarded to a dp.SequenceView using self._text_axes
-- @param targets tensor of 0-based class labels; 1 is added to it in place
--   (when self._shuffle is set, the addition happens on the shuffled copy)
-- @param which_set name of the split, passed through to dp.DataSet
-- @return the constructed dp.DataSet
function Weibo:createDataSet(inputs, targets, which_set)
	if self._shuffle then
		-- Apply one shared permutation so inputs and targets stay aligned.
		local indices = torch.randperm(inputs:size(1)):long()
		inputs = inputs:index(1, indices)
		targets = targets:index(1, indices)
	end
	if self._binarize then
		-- NOTE(review): the 128 threshold looks inherited from 8-bit image
		-- data — confirm it is meaningful for these inputs.
		DataSource.binarize(inputs, 128)
	end
	-- class 0 will have index 1, class 1 index 2, and so on.
	targets:add(1)
	-- construct inputs and targets dp.Views
	local input_v, target_v = dp.SequenceView(), dp.ClassView()
	input_v:forward(self._text_axes, inputs)
	target_v:forward('b', targets)
	target_v:setClasses(self._classes)
	-- construct dataset; kept local because the original leaked it as the
	-- global `dataset`
	local dataset = dp.DataSet{inputs=input_v,targets=target_v,which_set=which_set}
	return dataset
end
--- Reads one serialized data file from the parent directory.
-- Prints the file name, then tries torch.load with its default format; if
-- that call raises, the load is retried with the "binary" format and any
-- error from the retry propagates to the caller.
-- NOTE(review): the path is fixed to "../" and self._data_path is not
-- consulted — confirm this is intended.
-- @param file_name name of the file, relative to "../"
-- @return whatever torch.load returns for that file
function Weibo:loadData(file_name)
	local path = "../" .. file_name
	print(file_name)
	-- backwards compatible with old binary format
	local ok, loaded = pcall(torch.load, path)
	if ok then
		return loaded
	end
	return torch.load(path, "binary")
end
  • 实验代码编写

    使用cnn的方式处理,分为三层,第一层是一个一维卷积,第二层和第三层都是传统的神经网络写法。

require 'dp'
require 'weiboSource'
--[[hyperparameters]]--
-- Options the script reads but that are absent here (dropout, accUpdate,
-- silent, softmaxtree, progress, trainOnly, ...) evaluate to nil.
opt = {
	nHidden = 100, --number of hidden units
	learningRate = 0.1, --training learning rate
	momentum = 0.9, --momentum factor to use for training
	maxOutNorm = 1, --maximum norm allowed for output neuron weights
	batchSize = 128, --number of examples per mini-batch
	maxTries = 100, --maximum number of epochs without reduction in validation error.
	maxEpoch = 1000, --maximum number of epochs of training
	cuda =false,
	useDevice =1,
	inputEmbeddingSize =100, --length of one word vector
	outputEmbeddingSize=100, --size of the hidden layer after the convolution
	convOutputSize=50,
	convKernelSize=2,
	convKernelStride=1,
	convPoolSize=2,
	convPoolStride=2,
	-- Number of words per sentence. It is passed to Convolution1D:outputSize
	-- to size the layer that follows the convolution, so it has to equal
	-- the sentence length of the data (12 words); the previous value 4
	-- sized that layer for 4-word inputs.
	contextSize=12,
	decayPoint=100 ,--epoch at which learning rate is decayed
	decayFactor=0.1, --factor by which learning rate is decayed at decay point
}
local datasource=dp.Weibo()

-- Layer 1: 1D convolution (with pooling) over the word axis of the
-- 'bwc' sequence input.
inputModel = dp.Convolution1D{
		input_size = opt.inputEmbeddingSize,
		output_size = opt.convOutputSize,
		kernel_size = opt.convKernelSize,
		kernel_stride = opt.convKernelStride,
		pool_size = opt.convPoolSize,
		pool_stride = opt.convPoolStride,
		transfer = nn.Tanh(),
		dropout = opt.dropout and nn.Dropout() or nil,
		acc_update = opt.accUpdate
}
-- Number of frames the convolution emits for a sequence of
-- opt.contextSize words; it determines the flattened size fed to layer 2.
local nOutputFrame = inputModel:outputSize(opt.contextSize, 'bwc')
dp.vprint(not opt.silent, "Convolution has "..nOutputFrame.." output Frames")
inputSize = nOutputFrame*opt.convOutputSize

-- Layer 2: fully connected hidden layer on the flattened convolution output.
local hiddenModel = dp.Neural{
		input_size = inputSize,
		output_size = opt.outputEmbeddingSize,
		transfer = nn.Tanh(),
		dropout = opt.dropout and nn.Dropout() or nil,
		acc_update = opt.accUpdate
}
-- Layer 3: output layer producing one log-probability per class.
softmax = dp.Neural{
		input_size = opt.outputEmbeddingSize,
		output_size = table.length(datasource:classes()),
		transfer = nn.LogSoftMax(),
		dropout = opt.dropout and nn.Dropout() or nil,
		acc_update = opt.accUpdate
}
-- The original built `softmax` but left it out of the Sequential, so the
-- network ended at the Tanh hidden layer and the NLL loss was applied to
-- Tanh activations instead of the class log-probabilities.
mlp = dp.Sequential{
	models = {
		inputModel,
		hiddenModel,
		softmax
	}
}
--[[Propagators]]--
-- NOTE(review): dp.Perplexity is used as feedback for all three propagators;
-- for a 5-class classifier a confusion/accuracy feedback may have been
-- intended — confirm.

-- Builds a fresh loss object: TreeNLL when opt.softmaxtree is set, else NLL.
local function newLoss()
	if opt.softmaxtree then
		return dp.TreeNLL()
	end
	return dp.NLL()
end

-- Batch size shared by the two evaluators.
local function evalBatchSize()
	if opt.softmaxtree then
		return 1024
	end
	return opt.batchSize
end

-- The learning rate is replaced by learningRate*decayFactor at epoch decayPoint.
local lrSchedule = {}
lrSchedule[opt.decayPoint] = opt.learningRate*opt.decayFactor

train = dp.Optimizer{
	loss = newLoss(),
	visitor = {
		dp.Learn{
			learning_rate = opt.learningRate,
			observer = dp.LearningRateSchedule{schedule = lrSchedule}
		},
		dp.MaxNorm{max_out_norm=opt.maxOutNorm, period=opt.maxNormPeriod}
	},
	feedback = dp.Perplexity(),
	sampler = dp.Sampler{ --shuffle sample takes too much mem
		epoch_size = opt.trainEpochSize, batch_size = opt.batchSize
	},
	progress = opt.progress
}

valid = dp.Evaluator{
	loss = newLoss(),
	feedback = dp.Perplexity(),
	sampler = dp.Sampler{
		epoch_size = opt.validEpochSize,
		batch_size = evalBatchSize()
	},
	progress = opt.progress
}

tester = dp.Evaluator{
	loss = newLoss(),
	feedback = dp.Perplexity(),
	sampler = dp.Sampler{batch_size = evalBatchSize()}
}
 --[[Experiment]]--
-- Observers are attached unless opt.trainOnly is set: a file logger plus
-- early stopping after opt.maxTries epochs.
local xpObservers = nil
if not opt.trainOnly then
	xpObservers = {
		dp.FileLogger(),
		dp.EarlyStopper{max_epochs = opt.maxTries}
	}
end

-- Ties the model and the three propagators together; the random seed is
-- taken from the current time.
xp = dp.Experiment{
	model = mlp,
	optimizer = train,
	validator = valid,
	tester = tester,
	observer = xpObservers,
	random_seed = os.time(),
	max_epoch = opt.maxEpoch
}
--[[GPU or CPU]]--
-- When opt.cuda is set, load the CUDA packages, select the device and move
-- the experiment to the GPU.
if opt.cuda then
	require 'cutorch'
	require 'cunn'
	if opt.softmaxtree or opt.softmaxforest then
		require 'cunnx'
	end
	cutorch.setDevice(opt.useDevice)
	xp:cuda()
end

print"dp.Models :"
print(mlp)
print"nn.Modules :"
-- The first 32 training samples are handed to toModule. The original stored
-- this batch in an unused global `trainset` and then repeated the same
-- sub() call; one local is enough.
local sampleBatch = datasource:trainSet():sub(1,32)
print(mlp:toModule(sampleBatch))

-- Run the experiment on the datasource.
xp:run(datasource)
  • 实验结果

    实验结果在测试集上5分类达到了70%+,令我感到十分意外,真是意外之喜

你可能感兴趣的:(torch)