基于paddle的自定义数据集文本分类

文本分类可以用在NLP的很多领域,比如情感分析,意图识别,领域识别等等,先总结一波使用paddle进行文本分类的整体流程
NLP任务的整体流程一般如下:
1.数据处理(将数据变成可以放入到模型的格式)
2.模型构建(构建你想使用的模型)
3.训练和评估模型

本文使用paddle这个框架来实现一般模型的文本分类和后续使用ernie这种预训练模型来进行文本分类

1.数据处理部分
这个部分包括从文件中读取数据(训练集)
然后处理数据一般包括padding(类似长度不够补0或者过长文本截断)和映射(将tokens、label映射到词表字典的index和label字典的index)。这部分其实和框架没太大的关系,但是按框架的流程走会清晰很多。
在paddle中的reader就是完成数据生成的功能,生成出模型对应的数据格式,本文使用情感分析为例。
数据格式如下:
基于paddle的自定义数据集文本分类_第1张图片
上面可以看出context部分是中文,我们需要将中文转成index(数字),而label部分已经是[1,0]这种格式就没必要再转了,如果label是[正向,负向]这种,需要将其转成[1,0]。

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import six
import io
import sys
import random
import numpy as np
import os
from paddle import fluid


#读取字典文件生成字典
def load_vocab(file_path):
    """
    Load a vocabulary file into a token -> id dictionary.

    Every line of the file holds one token (surrounding whitespace is
    stripped).  Ids are handed out in order of first appearance starting
    at 0; a repeated token keeps the id of its first occurrence.

    The empty string is the key of the unknown-token entry.  It receives
    the next free id, unless the file itself already produced an entry
    for it (for example through a blank line).  In that case the existing
    id is kept so that the ids always form the dense range
    ``0 .. len(vocab) - 1``.

    Args:
        file_path: path of a UTF-8 encoded vocabulary file.

    Returns:
        dict mapping each token to its integer id.
    """
    vocab = {}
    with io.open(file_path, 'r', encoding='utf8') as f:
        for line in f:
            token = line.strip()
            if token not in vocab:
                # len(vocab) is always the next unused id.
                vocab[token] = len(vocab)
    # Do not overwrite an existing "" entry: reassigning it would leave a
    # hole in the id range and create an id equal to len(vocab), which is
    # out of bounds for an embedding table sized by len(vocab).
    vocab.setdefault("", len(vocab))
    return vocab

#定义生成器reader
def data_reader(file_path, word_dict, num_examples, phrase, epoch, max_seq_len):
    """
    Build a sample generator from a tab-separated ``label<TAB>text`` file.

    The whole file is read eagerly.  Lines starting with ``label`` (the
    header) are skipped; lines that do not split into exactly two
    tab-separated fields are reported on stderr and skipped.  The text
    field is split on single spaces and every token is mapped to its id
    in ``word_dict``; unknown tokens get the id stored under the ``''``
    key.  Id lists are right-padded with 0 or truncated so that they all
    have ``max_seq_len`` entries.

    Args:
        file_path: path of the UTF-8 encoded tsv file.
        word_dict: token -> id dictionary.
        num_examples: dict updated in place; ``num_examples[phrase]`` is
            set to the number of samples loaded.
        phrase: name of the data split; the samples are shuffled once,
            at load time, when it equals ``"train"``.
        epoch: how many times the returned reader walks over the samples.
        max_seq_len: length every id list is padded / truncated to.

    Returns:
        A zero-argument function that, when called, yields
        ``(ids, label, seq_len)`` tuples, where ``seq_len`` is the number
        of ids kept before padding.
    """
    oov_id = word_dict.get('')
    padding_id = 0
    samples = []
    with io.open(file_path, "r", encoding='utf8') as fin:
        for raw_line in fin:
            # Header row of the tsv file.
            if raw_line.startswith('label'):
                continue
            fields = raw_line.strip().split("\t")
            if len(fields) != 2:
                sys.stderr.write("[NOTICE] Error Format Line!")
                continue
            label = int(fields[0])
            ids = [word_dict.get(token, oov_id)
                   for token in fields[1].split(" ")]
            # Number of real (non-padding) ids that survive truncation.
            kept = min(len(ids), max_seq_len)
            ids = ids[:max_seq_len] + [padding_id] * (max_seq_len - kept)
            samples.append((ids, label, kept))

    num_examples[phrase] = len(samples)

    if phrase == "train":
        random.shuffle(samples)

    def reader():
        """
        Yield every loaded sample, repeated ``epoch`` times.
        """
        for _ in range(epoch):
            for ids, label, kept in samples:
                yield ids, label, kept
    return reader

#定义自己的数据生成器
class SentaProcessor(object):
    """
    Data processor that turns the train / dev / test tsv files of a data
    directory into sample readers and batched readers.

    The files are expected at ``<data_dir>/train.tsv``,
    ``<data_dir>/dev.tsv`` and ``<data_dir>/test.tsv``.
    """

    def __init__(self,
                 data_dir,
                 vocab_path,
                 random_seed,
                 max_seq_len):
        """
        Args:
            data_dir: directory holding the tsv files.
            vocab_path: vocabulary file, loaded with ``load_vocab``.
            random_seed: seed applied to both NumPy's and the standard
                library's global random generators.
            max_seq_len: length every sample is padded / truncated to.
        """
        self.data_dir = data_dir
        self.vocab = load_vocab(vocab_path)
        # Filled in by data_reader once the corresponding split is loaded.
        self.num_examples = {"train": -1, "dev": -1, "infer": -1}
        np.random.seed(random_seed)
        # The training set is shuffled with the stdlib `random` module, so
        # that generator has to be seeded too for the seed to take effect.
        random.seed(random_seed)
        self.max_seq_len = max_seq_len
        # Progress counters returned by get_train_progress(); -1 means
        # "not tracked yet", matching the num_examples placeholder.
        self.current_train_example = -1
        self.current_train_epoch = -1

    def _load_examples(self, file_name, phase, epoch, max_seq_len):
        """
        Create the sample reader for ``<self.data_dir>/<file_name>``.
        """
        return data_reader((self.data_dir + "/" + file_name), self.vocab,
                           self.num_examples, phase, epoch, max_seq_len)

    def get_train_examples(self, data_dir, epoch, max_seq_len):
        """
        Load training examples.

        ``data_dir`` is accepted for interface compatibility but ignored;
        the directory given to the constructor is used.
        """
        return self._load_examples("train.tsv", "train", epoch, max_seq_len)

    def get_dev_examples(self, data_dir, epoch, max_seq_len):
        """
        Load dev examples.

        ``data_dir`` is accepted for interface compatibility but ignored;
        the directory given to the constructor is used.
        """
        return self._load_examples("dev.tsv", "dev", epoch, max_seq_len)

    def get_test_examples(self, data_dir, epoch, max_seq_len):
        """
        Load test examples (counted under the ``"infer"`` phase).

        ``data_dir`` is accepted for interface compatibility but ignored;
        the directory given to the constructor is used.
        """
        return self._load_examples("test.tsv", "infer", epoch, max_seq_len)

    def get_labels(self):
        """
        Return the list of label names.
        """
        return ["0", "1", "2"]

    def get_num_examples(self, phase):
        """
        Return the number of examples loaded for ``phase``.

        The value is -1 until the corresponding split has been loaded.

        Raises:
            ValueError: if ``phase`` is not 'train', 'dev' or 'infer'.
        """
        if phase not in ['train', 'dev', 'infer']:
            raise ValueError(
                "Unknown phase, which should be in ['train', 'dev', 'infer'].")
        return self.num_examples[phase]

    def get_train_progress(self):
        """
        Return ``(current_train_example, current_train_epoch)``.
        """
        return self.current_train_example, self.current_train_epoch

    def data_generator(self, batch_size, phase='train', epoch=1, shuffle=True):
        """
        Return a batched reader for the train, dev or infer split.

        Args:
            batch_size: number of samples per batch.
            phase: 'train', 'dev' or 'infer'.
            epoch: number of passes over the split.
            shuffle: accepted for interface compatibility; this method
                does not use it.

        Raises:
            ValueError: if ``phase`` is not 'train', 'dev' or 'infer'.
        """
        if phase == "train":
            examples = self.get_train_examples(
                self.data_dir, epoch, self.max_seq_len)
        elif phase == "dev":
            examples = self.get_dev_examples(
                self.data_dir, epoch, self.max_seq_len)
        elif phase == "infer":
            examples = self.get_test_examples(
                self.data_dir, epoch, self.max_seq_len)
        else:
            raise ValueError(
                "Unknown phase, which should be in ['train', 'dev', 'infer'].")
        # Group the individual samples into lists of batch_size samples.
        return fluid.io.batch(examples, batch_size)


if __name__ == '__main__':
    # Smoke test: load the dev split and print its first sample.
    dev_path = 'data/dev.tsv'
    word_dict = load_vocab('data/vocab.txt')
    num_examples = {'train': -1, 'dev': -1, 'test': -1}
    reader = data_reader(dev_path, word_dict, num_examples, 'dev', 1, 24)
    # data_reader returns a generator *function*; it has to be called to
    # obtain the iterator that next() can advance.
    print(next(reader()))

2.构建模型
定义模型结构,用最简单的bow_net举例

import paddle.fluid as fluid
def bow_net(data,
            seq_len,
            label,
            dict_dim,
            emb_dim=128,
            hid_dim=128,
            hid_dim2=96,
            class_dim=2,
            is_prediction=False):
    """
    Bag-of-words classifier.

    Token ids are embedded, the embeddings of each sequence are summed,
    and the sum goes through a tanh, two tanh fully-connected layers and
    a softmax fully-connected layer.

    Args:
        data: input token ids.
        seq_len: per-sequence lengths handed to ``sequence_unpad``.
        label: gold labels; only used when ``is_prediction`` is False.
        dict_dim: number of rows of the embedding table.
        emb_dim: embedding width.
        hid_dim: width of the first fully-connected layer.
        hid_dim2: width of the second fully-connected layer.
        class_dim: width of the softmax output layer.
        is_prediction: when True, only the softmax output is built.

    Returns:
        The softmax output when ``is_prediction`` is True, otherwise the
        tuple ``(mean cross-entropy loss, softmax output)``.
    """
    # Embedding lookup, then strip the padding using the given lengths.
    embedded = fluid.embedding(input=data, size=[dict_dim, emb_dim])
    unpadded = fluid.layers.sequence_unpad(embedded, length=seq_len)
    # Bag of words: sum the embeddings of every sequence.
    pooled = fluid.layers.sequence_pool(input=unpadded, pool_type='sum')
    hidden = fluid.layers.tanh(pooled)
    # Two fully-connected layers.
    hidden = fluid.layers.fc(input=hidden, size=hid_dim, act="tanh")
    hidden = fluid.layers.fc(input=hidden, size=hid_dim2, act="tanh")
    # Softmax output layer.
    prediction = fluid.layers.fc(
        input=[hidden], size=class_dim, act="softmax")

    if not is_prediction:
        per_sample_cost = fluid.layers.cross_entropy(
            input=prediction, label=label)
        avg_cost = fluid.layers.mean(x=per_sample_cost)
        # The accuracy result is not returned; the call is kept so that
        # the set of ops added to the program stays the same.
        fluid.layers.accuracy(input=prediction, label=label)
        return avg_cost, prediction

    return prediction

这个模型就是将一个句子中的词向量进行相加,再经过全连接层和softmax进行分类。
3.训练和评估模型
先把模型的输入和输出,以及使用哪个模型进行创建(都是走流程,这里只有一个bow_net模型)

from bow_net_blog import bow_net
import reader_blog as reader
from paddle import fluid
import argparse
import os
import time
import numpy as np

def create_model(args, pyreader_name, num_labels, is_prediction=False):
    """
    Declare the model inputs, attach a data loader to them and build the
    bow_net network on top.

    Args:
        args: parsed arguments; ``max_seq_len`` and ``vocab_size`` are read.
        pyreader_name: accepted for interface compatibility; not used.
        num_labels: number of classes of the softmax output layer.
        is_prediction: build the inference graph (no loss / accuracy)
            instead of the training / evaluation graph.

    Returns:
        When ``is_prediction`` is True:
            ``(data_loader, probs, [name of the ids input, name of the
            length input])``.
        Otherwise:
            ``(data_loader, loss, accuracy, num_seqs)``, where
            ``num_seqs`` is the tensor passed as ``total`` to the
            accuracy layer.
    """
    data = fluid.data(
        name="src_ids", shape=[None, args.max_seq_len], dtype='int64')
    label = fluid.data(name="label", shape=[None, 1], dtype="int64")
    seq_len = fluid.data(name="seq_len", shape=[None], dtype="int64")

    # The feed order matches the (ids, label, seq_len) tuples produced by
    # the sample reader.
    data_reader = fluid.io.DataLoader.from_generator(
        feed_list=[data, label, seq_len], capacity=4, iterable=False)

    network = bow_net

    if is_prediction:
        # class_dim follows num_labels; otherwise the network silently
        # falls back to its own default number of classes.
        probs = network(
            data, seq_len, None, args.vocab_size,
            class_dim=num_labels, is_prediction=is_prediction)
        print("create inference model...")
        return data_reader, probs, [data.name, seq_len.name]

    ce_loss, probs = network(
        data, seq_len, label, args.vocab_size,
        class_dim=num_labels, is_prediction=is_prediction)
    num_seqs = fluid.layers.create_tensor(dtype='int64')
    accuracy = fluid.layers.accuracy(input=probs, label=label, total=num_seqs)
    return data_reader, ce_loss, accuracy, num_seqs

然后进行数据读取和训练

def main():
    """
    Main Function

    Parse the command-line arguments, build the train / dev / infer
    programs that were requested, then run training, validation and
    inference in that order.

    ``get_args``, ``init_checkpoint``, ``evaluate`` and ``inference`` are
    not defined in this snippet; they are expected to be available in the
    enclosing module.

    Raises:
        ValueError: if none of ``do_train`` / ``do_val`` / ``do_infer`` is
            set, or if only validation / inference is requested without
            ``init_checkpoint``.
    """
    args = get_args()

    if args.use_cuda:
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = 1
    exe = fluid.Executor(place)

    processor = reader.SentaProcessor(
        data_dir=args.data_dir,
        vocab_path=args.vocab_path,
        random_seed=args.random_seed,
        max_seq_len=args.max_seq_len)
    num_labels = len(processor.get_labels())

    if not (args.do_train or args.do_val or args.do_infer):
        raise ValueError("For args `do_train`, `do_val` and `do_infer`, at "
                         "least one of them must be True.")

    # Integer division keeps the per-device batch size an int regardless
    # of whether true division is in effect.
    batch_size_per_device = args.batch_size // dev_count

    # All programs share one startup program.
    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = processor.data_generator(
            batch_size=batch_size_per_device,
            phase='train',
            epoch=args.epoch,
            shuffle=True)

        # Only valid after data_generator() has loaded the training set.
        num_train_examples = processor.get_num_examples(phase="train")

        max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        print("Device count: %d" % dev_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)

        train_program = fluid.Program()

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_reader, loss, accuracy, num_seqs = create_model(
                    args,
                    pyreader_name='train_reader',
                    num_labels=num_labels,
                    is_prediction=False)

                sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=args.lr)
                sgd_optimizer.minimize(loss)

        # Captured here so that the training loop does not depend on the
        # loss / accuracy / num_seqs names being rebound further down.
        train_fetch_names = [loss.name, accuracy.name, num_seqs.name]

        if args.verbose:
            lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                program=train_program, batch_size=args.batch_size)
            print("Theoretical memory usage in training: %.3f - %.3f %s" %
                  (lower_mem, upper_mem, unit))

    if args.do_val:
        test_data_generator = processor.data_generator(
            batch_size=batch_size_per_device,
            phase='dev',
            epoch=1,
            shuffle=False)
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_reader, loss, accuracy, num_seqs = create_model(
                    args,
                    pyreader_name='test_reader',
                    num_labels=num_labels,
                    is_prediction=False)

        eval_fetch_names = [loss.name, accuracy.name, num_seqs.name]
        test_prog = test_prog.clone(for_test=True)

    if args.do_infer:
        infer_data_generator = processor.data_generator(
            batch_size=batch_size_per_device,
            phase='infer',
            epoch=1,
            shuffle=False)
        infer_prog = fluid.Program()
        with fluid.program_guard(infer_prog, startup_prog):
            with fluid.unique_name.guard():
                infer_reader, prop, _ = create_model(
                    args,
                    pyreader_name='infer_reader',
                    num_labels=num_labels,
                    is_prediction=True)
        infer_prog = infer_prog.clone(for_test=True)

    exe.run(startup_prog)

    if args.do_train:
        # A checkpoint is optional when training.
        if args.init_checkpoint:
            init_checkpoint(
                exe, args.init_checkpoint, main_program=startup_prog)

    elif args.do_val or args.do_infer:
        # Without training there is nothing to evaluate unless a
        # checkpoint is given.
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if "
                             "only doing validation or testing!")
        init_checkpoint(exe, args.init_checkpoint, main_program=startup_prog)

    if args.do_train:
        train_reader.set_sample_list_generator(train_data_generator)
    if args.do_val:
        test_reader.set_sample_list_generator(test_data_generator)
    if args.do_infer:
        infer_reader.set_sample_list_generator(infer_data_generator)

    if args.do_train:
        train_reader.start()
        steps = 0
        total_cost, total_acc, total_num_seqs = [], [], []
        time_begin = time.time()
        while True:
            try:
                steps += 1
                # Metrics are only fetched every skip_steps steps.
                log_this_step = steps % args.skip_steps == 0
                if log_this_step:
                    fetch_list = train_fetch_names
                else:
                    fetch_list = []

                outputs = exe.run(program=train_program,
                                  fetch_list=fetch_list,
                                  return_numpy=False)

                if log_this_step:
                    np_loss, np_acc, np_num_seqs = outputs
                    np_loss = np.array(np_loss)
                    np_acc = np.array(np_acc)
                    np_num_seqs = np.array(np_num_seqs)
                    # Weight the batch means by the batch size.
                    total_cost.extend(np_loss * np_num_seqs)
                    total_acc.extend(np_acc * np_num_seqs)
                    total_num_seqs.extend(np_num_seqs)

                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_reader.queue.size(
                        )
                        print(verbose)

                    time_end = time.time()
                    used_time = time_end - time_begin
                    print("step: %d, ave loss: %f, "
                          "ave acc: %f, speed: %f steps/s" %
                          (steps, np.sum(total_cost) / np.sum(total_num_seqs),
                           np.sum(total_acc) / np.sum(total_num_seqs),
                           args.skip_steps / used_time))
                    total_cost, total_acc, total_num_seqs = [], [], []
                    time_begin = time.time()

                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps), "checkpoint")
                    fluid.save(train_program, save_path)

                if steps % args.validation_steps == 0:
                    # evaluate dev set
                    if args.do_val:
                        print("do evalatation")
                        evaluate(exe, test_prog, test_reader,
                                 eval_fetch_names,
                                 "dev")

            except fluid.core.EOFException:
                # NOTE: steps was already incremented for the run that hit
                # the end of the data, so this directory is numbered one
                # past the last completed step.
                save_path = os.path.join(args.checkpoints, "step_" + str(steps),
                                         "checkpoint")
                fluid.save(train_program, save_path)
                train_reader.reset()
                break

    # final eval on dev set
    if args.do_val:
        print("Final validation result:")
        evaluate(exe, test_prog, test_reader, eval_fetch_names, "dev")

    # final eval on test set
    if args.do_infer:
        print("Final test result:")
        inference(exe, infer_prog, infer_reader, [prop.name], "infer")

以上就是在paddle中训练自己模型的具体流程了

代码和数据集可以进入:
https://github.com/lcyuanjiang/paddle_nlp

你可能感兴趣的:(paddle)