News First- and Second-Level Label Classification with a Keras Deep Learning Model

A deep learning model for first- and second-level news label classification.
It models a news article's title, body text, and source; the first-level labels reach roughly 90% accuracy and the second-level labels roughly 72%, and the model also resolves cases where the predicted first- and second-level labels are inconsistent.

Import the required packages

import tensorflow as tf
import pandas as pd
import numpy as np
from keras_bert import Tokenizer
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras import backend as K
from scipy import sparse
from tensorflow.keras.optimizers import Adam
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import json
import argparse

"""
Model purpose: first- and second-level news label classification
Date: 2020/12
Description: built on a TextCNN deep learning architecture
"""

def parse_args():
    """Parse command-line arguments."""
    parser = argparse.ArgumentParser(usage="it's usage tip.",
                                     description="predict news type")
    parser.add_argument("--label-sample-num", default=10000, type=int,
                        help="number of samples per label")
    parser.add_argument("--embedding-size", default=128, type=int,
                        help="character embedding dimension")
    parser.add_argument("--max-title-size", default=32, type=int,
                        help="maximum number of title characters")
    parser.add_argument("--max-content-size", default=512, type=int,
                        help="maximum number of content characters")
    parser.add_argument("--max-source-size", default=32, type=int,
                        help="maximum number of source characters")
    parser.add_argument("--batch-size", default=16, type=int,
                        help="training batch size")
    parser.add_argument("--epochs", default=5, type=int,
                        help="number of training epochs")
    parser.add_argument("--model-save-path", type=str, help="model save path")
    parser.add_argument("--model-desc-save-path", type=str,
                        help="model description file save path")
    parser.add_argument("--con1-size", default=128, type=int,
                        help="number of filters in the first convolution layer")
    parser.add_argument("--con2-size", default=64, type=int,
                        help="number of filters in the second convolution layer")
    parser.add_argument("--dense-size", default=128, type=int,
                        help="dense (fully connected) layer size")
    parser.add_argument("--learning-rate", default=0.001, type=float,
                        help="learning rate")
    return parser.parse_args()
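
Since the script is normally launched on a cluster with these flags, a quick way to sanity-check the argument wiring is to patch sys.argv before calling parse_args(). A minimal sketch; the paths below are hypothetical placeholders, not real cluster locations:

import sys

# Hypothetical invocation for a local smoke test of the argument parser.
sys.argv = [
    "news_classify.py",
    "--label-sample-num", "5000",
    "--epochs", "3",
    "--model-save-path", "hdfs:///tmp/news_cnn_model",            # hypothetical path
    "--model-desc-save-path", "hdfs:///tmp/news_cnn_desc.json",   # hypothetical path
]
args = parse_args()
print(args.label_sample_num, args.epochs, args.learning_rate)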

Define the model class

class TextCNN(object):
    """TextCNN label classification model.

    Takes a news article's title, source, and body text as inputs, encodes them,
    and outputs first- and second-level label predictions.

    Args:
        vocab: path of the vocabulary file used to build the character index
        embedding_size: dimension of the character embeddings
        max_title: maximum title length
        max_content: maximum content length
        max_source: maximum source length
        first_class_num: number of first-level labels
        second_class_num: number of second-level labels

    Returns:
        model1, model2: the training model and the serving model
    """

    FIRST_CLASS = "first_class"
    SECOND_CLASS = "second_class"

    def __init__(self, vocab, embedding_size, max_title, max_content, max_source, first_class_num, second_class_num):
        self.vocab = vocab
        self.max_title = max_title
        self.max_content = max_content
        self.max_source = max_source
        self.tokenizer, self.vocab_size = self.__get_tokenizer()
        self.embedding_size = embedding_size
        self.first_class_num = first_class_num
        self.second_class_num = second_class_num

    def __get_tokenizer(self):
        """
        Build the character-to-index dictionary and return a tokenizer plus the vocabulary size.
        """
        token_dict = {}
        with open(self.vocab, 'r', encoding='utf-8') as reader:
            for line in reader:
                token = line.strip()
                token_dict[token] = len(token_dict)
        return Tokenizer(token_dict), len(token_dict)

    def get_tokenizer(self):
        return self.tokenizer

    def encode(self, text, max_len):
        """
        Encode a text into character indices, truncated to max_len and zero-padded when shorter.
        """
        return self.tokenizer.encode(first=text, max_len=max_len)[0]

    def get_model(self, class_map, con1_size, con2_size, dense_size, learning_rate):
        """
        Build the network that learns from the encoded title/source/content and
        predicts the first- and second-level labels of an article.
        """
        title = Input(shape=(self.max_title,), name='title_ids', dtype=tf.float32)
        content = Input(shape=(self.max_content,), name='content_ids', dtype=tf.float32)
        source = Input(shape=(self.max_source,), name='source_ids', dtype=tf.float32)
        embedding_layer = Embedding(self.vocab_size + 1, self.embedding_size)
        # Non-trainable lookup table: row i holds the 0/1 mask of second-level labels
        # that belong to first-level label i, which keeps the two levels consistent.
        mask_layer = Embedding(self.first_class_num, self.second_class_num, weights=[class_map],
                               trainable=False)
        embedding_title = embedding_layer(title)
        embedding_content = embedding_layer(content)
        embedding_source = embedding_layer(source)
        flat_layers = []
        for embedded in [embedding_title, embedding_content, embedding_source]:
            layers = []
            for i in [3, 5, 7]:
                conv = Conv1D(con1_size, i, padding='same', strides=1, activation='relu')(embedded)
                pool = MaxPooling1D(pool_size=3, padding='same')(conv)
                conv = Conv1D(con2_size, i, padding='same', strides=1, activation='relu')(pool)
                pool = MaxPooling1D(pool_size=3, padding='same')(conv)
                layers += [pool]
            flat = Flatten()(concatenate(layers, axis=-1))
            flat = Activation("relu")(BatchNormalization()(Dense(dense_size, activation=None)(flat)))
            flat_layers += [flat]
        flat_concat = concatenate(flat_layers, axis=-1)
        dense = Activation("relu")(BatchNormalization()(Dense(dense_size, activation=None)(flat_concat)))
        output_first = Dense(self.first_class_num, activation='softmax')(dense)
        first_class_value = Lambda(lambda x: K.argmax(x), name=self.FIRST_CLASS)(output_first)
        mask = mask_layer(first_class_value)
        second = Dense(self.second_class_num, activation=None)(dense)
        second = Multiply()([second, mask])
        output_second = Activation("softmax")(second)
        second_class_value = Lambda(lambda x: K.argmax(x), name=self.SECOND_CLASS)(output_second)
        model1 = Model(inputs=[title, content, source], outputs=[output_first, output_second])
        model1.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(learning_rate=learning_rate),
                       metrics=['accuracy'])
        model2 = Model(inputs=[title, content, source], outputs=[first_class_value, second_class_value])
        model2.summary()
        return model1, model2
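
How the consistency trick works is easiest to see on toy data: get_class_matrix (defined below inside news_classify_algo) turns the second-to-first-level dictionary into a 0/1 matrix, and the frozen mask_layer looks up the row of the predicted first-level class to zero out second-level logits that do not belong to it. A minimal sketch with made-up indices, not the production label set:

import numpy as np
from scipy import sparse

# Hypothetical mapping: second-level index -> first-level index
# (second-level classes 0 and 1 belong to first-level class 0, class 2 to class 1).
sub_to_first = {0: 0, 1: 0, 2: 1}

data = np.ones(len(sub_to_first))
rows = list(sub_to_first.values())   # first-level indices
cols = list(sub_to_first.keys())     # second-level indices
class_map = sparse.csr_matrix((data, (rows, cols))).todense()
print(class_map)
# [[1. 1. 0.]
#  [0. 0. 1.]]

# If the first-level head predicts class 0, only the first two second-level
# logits survive the Multiply() inside get_model().
second_logits = np.array([2.3, 0.7, 1.9])
mask = np.asarray(class_map)[0]
print(second_logits * mask)          # [2.3 0.7 0. ]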

Define the model description class

class ModelDescription(object):
    """Classification model description file.

    Describes the input and output fields of the classification model.

    Args:
        dim: field dimension
        map_key: name of the raw input field mapped onto this tensor
        tensor_name: name of the model input tensor
        data_type: data type of the field
        handler: preprocessing handler
        fill_value: padding value used when the field is shorter than dim

    Returns:
        model: the model description dictionary
    """

    def __init__(self):
        self.model = {}
        self.model['model_desc'] = {}
        self.model['model_desc']['signature_name'] = ""
        self.model['model_desc']['inputs'] = {}
        self.model['model_desc']['outputs'] = []

    def build_context_field(self, dim, map_key, tensor_name, data_type="int", handler="tokenizer", fill_value=0):
        field = {'dim': dim, 'map_key': map_key, 'tensor_name': tensor_name, 'data_type': data_type,
                 'handler': handler, 'fill_value': fill_value}
        return field

    def build_source(self, length, tensor_name):
        return self.build_context_field(length, "source", tensor_name)

    def build_title(self, length, tensor_name):
        return self.build_context_field(length, "title", tensor_name)

    def build_content(self, length, tensor_name):
        return self.build_context_field(length, "content", tensor_name)

    def set_context(self, source_len, source_tensor_name, title_len, title_tensor_name, content_len,
                    content_tensor_name):
        source = self.build_source(source_len, source_tensor_name)
        title = self.build_title(title_len, title_tensor_name)
        content = self.build_content(content_len, content_tensor_name)
        self.model['model_desc']['inputs']['context'] = [source, title, content]

    def add_out_put(self, map_key, tensor_name, tag_name):
        output = {"map_key": map_key, "tensor_name": tensor_name, "data_type": "int", "handler": "tags",
                  "tag_name": tag_name, "fill_value": "0", "dim": -1}
        self.model['model_desc']['outputs'] = self.model['model_desc']['outputs'] + [output]

    def to_json(self):
        return json.dumps(self.model, ensure_ascii=False)
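
To make the shape of the description file concrete, here is a minimal sketch of how the class is used; the tag lists are hypothetical examples, the real ones come from category_map / sub_category_map in news_classify_algo():

# Hypothetical illustration of the JSON produced by ModelDescription.
desc = ModelDescription()
desc.set_context(32, 'source_ids', 32, 'title_ids', 512, 'content_ids')
desc.add_out_put('一级标签', 'first_class', ['体育', '财经'])   # hypothetical first-level tags
desc.add_out_put('二级标签', 'second_class', ['NBA', 'A股'])    # hypothetical second-level tags
print(desc.to_json())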

def news_classify_algo():
    # Create the Spark session used to fetch the sample data
    spark = SparkSession \
        .builder \
        .config("spark.sql.broadcastTimeout", "3000") \
        .master("yarn") \
        .enableHiveSupport() \
        .getOrCreate()

    args = parse_args()

    # Read samples from the Hive table
    sql = '''select news_id,title,content,type,source,content_type,first_label,second_label
    from dp_dm.content_center_news_classify_sample_data
    where rank <={}'''.format(args.label_sample_num)
    news_sample = spark.sql(sql).toPandas()

    # Drop rows with missing values and duplicates
    news = news_sample[
        ['news_id', 'title', 'content', 'first_label', 'source', 'second_label']].dropna().drop_duplicates()

    # Build label-to-index mappings
    category = sorted(np.unique(news['first_label'].dropna().values))
    sub_category = sorted(np.unique(news['second_label'].dropna().values))
    category_map = dict(zip(category, np.arange(len(category))))
    sub_category_map = dict(zip(sub_category, np.arange(len(sub_category))))

    # Initialize the model
    text_cnn = TextCNN(vocab="vocab.txt", embedding_size=args.embedding_size, max_title=args.max_title_size,
                       max_content=args.max_content_size, max_source=args.max_source_size,
                       first_class_num=len(category), second_class_num=len(sub_category))

    # Encode labels and text features
    news['category'] = news['first_label'].map(category_map)
    news['sub_category'] = news['second_label'].map(sub_category_map)
    news['title_ids'] = news['title'].apply(lambda x: text_cnn.encode(x, text_cnn.max_title))
    news['content_ids'] = news['content'].apply(lambda x: text_cnn.encode(x, text_cnn.max_content))
    news['source_ids'] = news['source'].apply(lambda x: text_cnn.encode(x, text_cnn.max_source))

    # Map each second-level label index to its first-level label index
    category_level_reverse_map = dict(zip(news['sub_category'], news['category']))

    # Split into training and validation sets
    train_x, test_x, train_y, test_y = train_test_split(news[['title_ids', 'content_ids', 'source_ids']],
                                                        news[['category', 'sub_category']])

    # Build the first/second-level consistency matrix: entry (i, j) is 1 when
    # second-level label j belongs to first-level label i, otherwise 0.
    def get_class_matrix(class_dict):
        data = np.ones(len(class_dict))
        rows = list(class_dict.values())  # first-level label indices
        cols = list(class_dict.keys())    # second-level label indices
        map_mat = sparse.csr_matrix((data, (rows, cols))).todense()
        return map_mat

    # Assemble the training / validation x and y arrays
    tx_title = np.array(train_x['title_ids'].values.tolist()).astype(np.float32)
    tx_content = np.array(train_x['content_ids'].values.tolist()).astype(np.float32)
    tx_source = np.array(train_x['source_ids'].values.tolist()).astype(np.float32)
    tx = [tx_title, tx_content, tx_source]
    ty_cate = np.array(train_y['category'].values.tolist()).astype(np.float32)
    ty_subcate = np.array(train_y['sub_category'].values.tolist()).astype(np.float32)
    ty = [ty_cate, ty_subcate]
    ex_title = np.array(test_x['title_ids'].values.tolist()).astype(np.float32)
    ex_content = np.array(test_x['content_ids'].values.tolist()).astype(np.float32)
    ex_source = np.array(test_x['source_ids'].values.tolist()).astype(np.float32)
    ex = [ex_title, ex_content, ex_source]
    ey_cate = np.array(test_y['category'].values.tolist()).astype(np.float32)
    ey_subcate = np.array(test_y['sub_category'].values.tolist()).astype(np.float32)
    ey = [ey_cate, ey_subcate]

    model1, model2 = text_cnn.get_model(get_class_matrix(category_level_reverse_map),
                                        args.con1_size,
                                        args.con2_size,
                                        args.dense_size,
                                        args.learning_rate)

    # Train the model
    model1.fit(x=tx, y=ty, batch_size=args.batch_size, validation_data=(ex, ey), epochs=args.epochs)

    # Save the serving model (e.g. to HDFS)
    model2.save(args.model_save_path)

    # Build the model description file
    news_model = ModelDescription()
    news_model.set_context(args.max_source_size, 'source_ids', args.max_title_size, 'title_ids',
                           args.max_content_size, 'content_ids')
    news_model.add_out_put('一级标签', text_cnn.FIRST_CLASS, list(category_map.keys()))
    news_model.add_out_put('二级标签', text_cnn.SECOND_CLASS, list(sub_category_map.keys()))
    sc = spark.sparkContext
    fs_class = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem
    conf_class = sc._gateway.jvm.org.apache.hadoop.conf.Configuration
    fs = fs_class.get(conf_class())
    path_class = sc._gateway.jvm.org.apache.hadoop.fs.Path

    def save_file(path: str, data: str):
        """Save a file to HDFS.
        Args:
            path(str): path on HDFS
            data(str): data to write
        """
        output = fs.create(path_class(path))
        output.write(data.encode())
        output.flush()
        output.close()

    # Save the description file to HDFS
    data = news_model.to_json()
    save_file(args.model_desc_save_path, data)

Main entry point

if __name__ == '__main__':
    news_classify_algo()
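
After training, the exported serving model (model2) returns the argmax indices of its two heads, which map back to label names through the sorted category and sub_category lists. A minimal sketch of offline inference; the model path is hypothetical and the zero-filled inputs merely stand in for the ids produced by TextCNN.encode():

import numpy as np
import tensorflow as tf

# Hypothetical local copy of the SavedModel written by model2.save().
loaded = tf.keras.models.load_model("news_cnn_model", compile=False)

# Placeholder inputs; in practice these come from TextCNN.encode() with the
# same vocab.txt and length settings used during training.
title_ids = np.zeros((1, 32), dtype=np.float32)
content_ids = np.zeros((1, 512), dtype=np.float32)
source_ids = np.zeros((1, 32), dtype=np.float32)

first_idx, second_idx = loaded.predict([title_ids, content_ids, source_ids])
# first_idx / second_idx index into the sorted `category` and `sub_category`
# lists built in news_classify_algo().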
