A Deep Learning Model for First- and Second-Level News Label Classification
Builds a classifier over a news article's title, body, and source text, reaching roughly 90% accuracy on first-level labels and 72% on second-level labels, and resolves cases where the predicted first- and second-level labels are inconsistent.
import tensorflow as tf
import pandas as pd
import numpy as np
from keras_bert import Tokenizer
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras import backend as K
from scipy import sparse
from tensorflow.keras.optimizers import Adam
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import json
import argparse
"""
Model requirement: classify news into first- and second-level labels.
Date: 2020/12
Model description: built with a TextCNN deep learning architecture.
"""
def parse_args():
    """Parse command-line arguments."""
    parser = argparse.ArgumentParser(description="Train the first/second-level news label classification model.")
    parser.add_argument("--label-sample-num", default=10000, type=int, help="number of samples per label")
    parser.add_argument("--embedding-size", default=128, type=int, help="character embedding dimension")
    parser.add_argument("--max-title-size", default=32, type=int, help="maximum number of title characters")
    parser.add_argument("--max-content-size", default=512, type=int, help="maximum number of article body characters")
    parser.add_argument("--max-source-size", default=32, type=int, help="maximum number of article source characters")
    parser.add_argument("--batch-size", default=16, type=int, help="number of samples per training batch")
    parser.add_argument("--epochs", default=5, type=int, help="number of training epochs")
    parser.add_argument("--model-save-path", type=str, help="where to save the trained model")
    parser.add_argument("--model-desc-save-path", type=str, help="where to save the model description file")
    parser.add_argument("--con1-size", default=128, type=int, help="number of filters in the first convolution layer")
    parser.add_argument("--con2-size", default=64, type=int, help="number of filters in the second convolution layer")
    parser.add_argument("--dense-size", default=128, type=int, help="size of the fully connected layer")
    parser.add_argument("--learning-rate", default=0.001, type=float, help="learning rate")
    return parser.parse_args()
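
# Example invocation (illustrative; "news_classify.py" and the HDFS paths are
# placeholders, and in practice the script would typically be launched through
# spark-submit on the cluster):
#
#   python news_classify.py \
#       --label-sample-num 10000 \
#       --batch-size 16 \
#       --epochs 5 \
#       --model-save-path hdfs:///tmp/news_model \
#       --model-desc-save-path hdfs:///tmp/news_model_desc.json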
class TextCNN(object):
    """TextCNN-based label classification model for news text.

    Takes a news article's title, source, and body as inputs, encodes the text,
    and outputs the article's first- and second-level label classifications.

    Args:
        vocab: path to the character vocabulary file
        embedding_size: dimension of the character embeddings
        max_title: maximum title length (in characters)
        max_content: maximum content length (in characters)
        max_source: maximum source length (in characters)
        first_class_num: number of first-level labels
        second_class_num: number of second-level labels

    Returns:
        model1, model2: the trained classification models (built by get_model)
    """
FIRST_CLASS = "first_class"
SECOND_CLASS = "second_class"
def __init__(self, vocab, embedding_size, max_title, max_content, max_source, first_class_num, second_class_num):
self.vocab = vocab
self.max_title = max_title
self.max_content = max_content
self.max_source = max_source
self.tokenizer, self.vocab_size = self.__get_tokenizer()
self.embedding_size = embedding_size
self.first_class_num = first_class_num
self.second_class_num = second_class_num
def __get_tokenizer(self):
"""
        Build the character-to-index vocabulary from the vocab file and return a Tokenizer along with the vocabulary size.
"""
token_dict = {}
with open(self.vocab, 'r', encoding='utf-8') as reader:
for line in reader:
token = line.strip()
token_dict[token] = len(token_dict)
return Tokenizer(token_dict), len(token_dict)
def get_tokenizer(self):
return self.tokenizer
def encode(self, text, max_len):
"""
        Encode text into character indices, truncating to max_len and zero-padding when shorter.
"""
return self.tokenizer.encode(first=text, max_len=max_len)[0]
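
    # Illustrative example of encode(): assuming the characters of "新闻" map to
    # ids 100 and 101 in vocab.txt (hypothetical ids), then
    #   encode("新闻", max_len=5)  ->  [cls_id, 100, 101, sep_id, 0]
    # keras_bert's Tokenizer wraps the text in [CLS]/[SEP] markers and pads the
    # index sequence with 0 up to max_len.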
def get_model(self, class_map, con1_size, con2_size, dense_size, learning_rate):
"""
        Build the TextCNN network over the title/source/content inputs and return the first- and second-level classification models.
"""
title = Input(shape=(self.max_title,), name='title_ids', dtype=tf.float32)
content = Input(shape=(self.max_content,), name='content_ids', dtype=tf.float32)
source = Input(shape=(self.max_source,), name='source_ids', dtype=tf.float32)
embedding_layer = Embedding(self.vocab_size + 1, self.embedding_size)
        mask_layer = Embedding(self.first_class_num, self.second_class_num, weights=[class_map],
                               trainable=False)  # first-level -> second-level label compatibility mask
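        # How the consistency mask works: class_map is a 0/1 matrix of shape
        # (first_class_num, second_class_num) whose row i flags the second-level
        # labels that belong to first-level label i. The frozen embedding lookup
        # below turns the predicted first-level index into that row, and the row
        # is multiplied into the second-level logits, suppressing second-level
        # labels that are inconsistent with the predicted first-level label.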
embedding_title = embedding_layer(title)
embedding_content = embedding_layer(content)
embedding_source = embedding_layer(source)
flat_layers = []
        for embedded in [embedding_title, embedding_content, embedding_source]:
            layers = []
            for i in [3, 5, 7]:
                conv = Conv1D(con1_size, i, padding='same', strides=1, activation='relu')(embedded)
pool = MaxPooling1D(pool_size=3, padding='same')(conv)
conv = Conv1D(con2_size, i, padding='same', strides=1, activation='relu')(pool)
pool = MaxPooling1D(pool_size=3, padding='same')(conv)
layers += [pool]
flat = Flatten()(concatenate(layers, axis=-1))
flat = Activation("relu")(BatchNormalization()(Dense(dense_size, activation=None)(flat)))
flat_layers += [flat]
flat_concat = concatenate(flat_layers, axis=-1)
dense = Activation("relu")(BatchNormalization()(Dense(dense_size, activation=None)(flat_concat)))
output_first = Dense(self.first_class_num, activation='softmax')(dense)
first_class_value = Lambda(lambda x: K.argmax(x), name=self.FIRST_CLASS)(output_first)
mask = mask_layer(first_class_value)
second = Dense(self.second_class_num, activation=None)(dense)
second = Multiply()([second, mask])
output_second = Activation("softmax")(second)
second_class_value = Lambda(lambda x: K.argmax(x), name=self.SECOND_CLASS)(output_second)
model1 = Model(inputs=[title, content, source], outputs=[output_first, output_second])
model1.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(lr=learning_rate), metrics=['accuracy'])
model2 = Model(inputs=[title, content, source], outputs=[first_class_value, second_class_value])
model2.summary()
return model1, model2
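
# Note on the two returned models: model1 (softmax probability outputs) is the
# one that gets compiled and trained, while model2 (argmax label indices) is the
# graph that is exported for serving.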
class ModelDescription(object):
    """Description file for the classification model.

    Describes the model's input and output fields.

    Field attributes:
        dim: dimension of the field
        map_key: input-data field that maps to this model input
        tensor_name: name of the corresponding model tensor
        data_type: data type of the field
        handler: preprocessing handler
        fill_value: padding value used when the field is shorter than dim

    Returns:
        model: the model description dictionary
    """
    def __init__(self):
        self.model = {}
        self.model['model_desc'] = {}
        self.model['model_desc']['signature_name'] = ""
        self.model['model_desc']['inputs'] = {}
        self.model['model_desc']['outputs'] = []
def build_context_field(self, dim, map_key, tensor_name, data_type="int", handler="tokenizer", fill_value=0):
field = {'dim': dim, 'map_key': map_key, 'tensor_name': tensor_name, 'data_type': data_type, 'handler': handler,
'fill_value': fill_value}
return field
    def build_source(self, length, tensor_name):
        return self.build_context_field(length, "source", tensor_name)
    def build_title(self, length, tensor_name):
        return self.build_context_field(length, "title", tensor_name)
    def build_content(self, length, tensor_name):
        return self.build_context_field(length, "content", tensor_name)
def set_context(self, source_len, source_tensor_name, title_len, title_tensor_name, content_len,
content_tensor_name):
source = self.build_source(source_len, source_tensor_name)
title = self.build_title(title_len, title_tensor_name)
content = self.build_content(content_len, content_tensor_name)
self.model['model_desc']['inputs']['context'] = [source, title, content]
def add_out_put(self, map_key, tensor_name, tag_name):
output = {"map_key": map_key, "tensor_name": tensor_name, "data_type": "int", "handler": "tags",
"tag_name": tag_name, "fill_value": "0", "dim": -1}
self.model['model_desc']['outputs'] = self.model['model_desc']['outputs'] + [output]
def to_json(self):
return json.dumps(self.model, ensure_ascii=False)
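
# For illustration, ModelDescription.to_json() produces a JSON document roughly
# shaped as follows (values correspond to the default argument sizes used further
# below; this is an example of the structure, not captured output):
#
#   {"model_desc": {
#       "signature_name": "",
#       "inputs": {"context": [
#           {"dim": 32,  "map_key": "source",  "tensor_name": "source_ids",  ...},
#           {"dim": 32,  "map_key": "title",   "tensor_name": "title_ids",   ...},
#           {"dim": 512, "map_key": "content", "tensor_name": "content_ids", ...}]},
#       "outputs": [
#           {"map_key": "一级标签", "tensor_name": "first_class",  "tag_name": [...], ...},
#           {"map_key": "二级标签", "tensor_name": "second_class", "tag_name": [...], ...}]}}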
def news_classify_algo():
    # Create the Spark session used to fetch the sample data
    spark = SparkSession \
        .builder \
        .config("spark.sql.broadcastTimeout", "3000") \
        .master("yarn") \
        .enableHiveSupport() \
        .getOrCreate()
args = parse_args()
    # Read the sample data from the Hive table
sql = '''select news_id,title,content,type,source,content_type,first_label,second_label
from dp_dm.content_center_news_classify_sample_data
where rank <={}'''.format(args.label_sample_num)
news_sample = spark.sql(sql).toPandas()
    # Drop rows with missing values and duplicates
news = news_sample[
['news_id', 'title', 'content', 'first_label', 'source', 'second_label']].dropna().drop_duplicates()
    # Build label-to-index mappings
category = sorted(np.unique(news['first_label'].dropna().values))
sub_category = sorted(np.unique(news['second_label'].dropna().values))
category_map = dict(zip(category, np.arange(len(category))))
sub_category_map = dict(zip(sub_category, np.arange(len(sub_category))))
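    # category_map / sub_category_map assign each label a contiguous integer id,
    # e.g. something like {'财经': 0, '科技': 1, ...} (label names are illustrative).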
    # Initialize the model
text_cnn = TextCNN(vocab="vocab.txt", embedding_size=args.embedding_size, max_title=args.max_title_size,
max_content=args.max_content_size, max_source=args.max_source_size
, first_class_num=len(category), second_class_num=len(sub_category))
    # Encode the labels and the text features
news['category'] = news['first_label'].map(category_map)
news['sub_category'] = news['second_label'].map(sub_category_map)
news['title_ids'] = news['title'].apply(lambda x: text_cnn.encode(x, text_cnn.max_title))
news['content_ids'] = news['content'].apply(lambda x: text_cnn.encode(x, text_cnn.max_content))
news['source_ids'] = news['source'].apply(lambda x: text_cnn.encode(x, text_cnn.max_source))
    # Build the second-level -> first-level label mapping
category_level_reverse_map = dict(zip(news['sub_category'], news['category']))
    # Split into training and validation sets
train_x, test_x, train_y, test_y = train_test_split(news[['title_ids', 'content_ids', 'source_ids']]
, news[['category', 'sub_category']])
    # Build the first-/second-level class compatibility matrix
    def get_class_matrix(class_dict):
        data = np.ones(len(class_dict))
        rows = list(class_dict.values())  # first-level label indices
        cols = list(class_dict.keys())    # second-level label indices
        map_mat = sparse.csr_matrix((data, (rows, cols))).todense()
        return map_mat
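    # Illustrative example: for a second-level -> first-level mapping {0: 0, 1: 0, 2: 1},
    # get_class_matrix returns the dense 2 x 3 matrix
    #   [[1., 1., 0.],
    #    [0., 0., 1.]]
    # i.e. row i flags the second-level labels allowed under first-level label i,
    # which is exactly the weight matrix handed to TextCNN's frozen mask embedding.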
    # Assemble the training and validation feature/label arrays
tx_title = np.array(train_x['title_ids'].values.tolist()).astype(np.float32)
tx_content = np.array(train_x['content_ids'].values.tolist()).astype(np.float32)
tx_source = np.array(train_x['source_ids'].values.tolist()).astype(np.float32)
tx = [tx_title, tx_content, tx_source]
ty_cate = np.array(train_y['category'].values.tolist()).astype(np.float32)
ty_subcate = np.array(train_y['sub_category'].values.tolist()).astype(np.float32)
ty = [ty_cate, ty_subcate]
ex_title = np.array(test_x['title_ids'].values.tolist()).astype(np.float32)
ex_content = np.array(test_x['content_ids'].values.tolist()).astype(np.float32)
ex_source = np.array(test_x['source_ids'].values.tolist()).astype(np.float32)
ex = [ex_title, ex_content, ex_source]
ey_cate = np.array(test_y['category'].values.tolist()).astype(np.float32)
ey_subcate = np.array(test_y['sub_category'].values.tolist()).astype(np.float32)
ey = [ey_cate, ey_subcate]
model1, model2 = text_cnn.get_model(get_class_matrix(category_level_reverse_map),
args.con1_size,
args.con2_size,
args.dense_size,
args.learning_rate)
    # Train the model
model1.fit(x=tx, y=ty, batch_size=args.batch_size, validation_data=(ex, ey), epochs=args.epochs)
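    # Sketch (not part of the original pipeline): the imported classification_report
    # could be used to inspect per-label precision/recall on the validation set, e.g.
    #   pred_first, pred_second = model2.predict(ex)
    #   print(classification_report(ey_cate, pred_first))
    #   print(classification_report(ey_subcate, pred_second))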
    # Save the serving model to HDFS
model2.save(args.model_save_path)
    # Build and save the model description file
news_model = ModelDescription()
news_model.set_context(args.max_source_size, 'source_ids', args.max_title_size, 'title_ids', args.max_content_size, 'content_ids')
news_model.add_out_put('一级标签', text_cnn.FIRST_CLASS, list(category_map.keys()))
news_model.add_out_put('二级标签', text_cnn.SECOND_CLASS, list(sub_category_map.keys()))
sc = spark.sparkContext
fs_class = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem
conf_class = sc._gateway.jvm.org.apache.hadoop.conf.Configuration
fs = fs_class.get(conf_class())
path_class = sc._gateway.jvm.org.apache.hadoop.fs.Path
    def save_file(path: str, data: str):
        """Save a file to HDFS.

        Args:
            path (str): destination path on HDFS
            data (str): file contents
        """
output = fs.create(path_class(path))
output.write(data.encode())
output.flush()
output.close()
    # Save the description file to HDFS
data = news_model.to_json()
save_file(args.model_desc_save_path, data)
if __name__ == '__main__':
news_classify_algo()