下文实现仅仅是比较粗糙的一种方式,可以改进的点还有很多,是真的很多!重点是,不讲解原理,就是这么没道理…
#一些经验
#架构(sg):skip-gram(慢、对罕见字有利)vs CBOW(快)
#训练算法(hs):分层softmax(对罕见字有利)vs 负采样(对常见词和低维向量有利)
#欠采样频繁词(sample):可以提高结果的准确性和速度(适用范围1e-3到1e-5)
#窗口大小(window):skip-gram通常在10附近,CBOW通常在5附近
#大语料下,建议提高min_count,减少iter
#内存占用大约公式:词汇数*8*size/1000/1000/1000(GB)
#硬盘占用大约公式:词汇数*8/1000/1000/1000(GB)(实际上考虑到其模型的其他文件,最好再*10的大小)
# 模型架构(sg),0为CBOW算法,1为skip-gram算法,默认为0
sg=1
# 特征向量的维度
size=300
# 词窗大小
window=5
# 最小词频
min_count=5
# 初始学习速率
alpha=0.025
# 训练算法(hs),0为负采样,1为分层softmax,默认为0
hs=1
#迭代次数
iter=10
# -*- coding:utf-8 -*-
"""
Description: 基于百度百科大语料的word2vec模型
@author: WangLeAi
@date: 2018/9/18
"""
import os
from util.DBUtil import DbPoolUtil
from util.JiebaUtil import jieba_util
from util.PropertiesUtil import prop
from gensim.models import word2vec
class OriginModel(object):
    """Train the base word2vec model on the large general-purpose corpus.

    Hyper-parameters come from ``config/w2v.properties``. The segmented
    corpus is cached as a text file (one space-separated sentence per line)
    at ``train_data_path`` and the trained model is saved to ``model_path``.

    NOTE: ``train_model`` uses the gensim 3.x keyword names ``size`` and
    ``iter``; gensim 4 renamed them to ``vector_size`` and ``epochs``.
    """

    def __init__(self):
        # Training hyper-parameters as a dict of strings (converted with
        # int()/float() where they are used).
        self.params = prop.get_config_dict("config/w2v.properties")
        self.db_pool_util = DbPoolUtil(db_type="mysql")
        self.train_data_path = "gen/ori_train_data.txt"
        self.model_path = "model/oriw2v.model"

    @staticmethod
    def _ensure_parent_dir(path):
        """Create the directory that will contain *path* if it is missing."""
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    @staticmethod
    def text_process(sentence):
        """Pre-process one raw text by segmenting it with jieba.

        :param sentence: raw text to segment
        :return: the result of ``jieba_util.jieba_cut`` (presumably a
            sequence of word strings; ``get_train_data`` joins each result
            with spaces - TODO confirm against JiebaUtil)
        """
        # Optional filter that drops everything except Chinese characters,
        # ASCII letters, digits, '-' and '·' (left disabled):
        # regex = re.compile(u'[^\u4e00-\u9fa50-9a-zA-Z\-·]+')
        # sentence = regex.sub('', sentence)
        return jieba_util.jieba_cut(sentence)

    def get_train_data(self):
        """Build the training corpus file from the database.

        The SQL below is a blank placeholder that must be filled in by the
        user. Results are written to disk rather than kept in memory.

        :return: None
        """
        print("创建初始语料训练数据")
        sql = """ """
        # Pass this instance (not a module-level global) so the method works
        # for any OriginModel object. loop_row is a project helper that
        # presumably applies the named method to every fetched row - TODO
        # confirm against DBUtil.
        sentences = self.db_pool_util.loop_row(self, "text_process", sql)
        # open(..., "w") does not create missing directories.
        self._ensure_parent_dir(self.train_data_path)
        with open(self.train_data_path, "w", encoding="utf-8") as f:
            for sentence in sentences:
                f.write(" ".join(sentence) + "\n")

    def train_model(self):
        """Train the base model and save it to ``model_path``.

        The corpus file is generated first when it does not exist yet.

        :return: None
        """
        if not os.path.exists(self.train_data_path):
            self.get_train_data()
        print("训练初始模型")
        sentences = word2vec.LineSentence(self.train_data_path)
        params = self.params
        model = word2vec.Word2Vec(
            sentences=sentences,
            sg=int(params["sg"]),
            size=int(params["size"]),
            window=int(params["window"]),
            min_count=int(params["min_count"]),
            alpha=float(params["alpha"]),
            hs=int(params["hs"]),
            workers=6,
            iter=int(params["iter"]),
        )
        self._ensure_parent_dir(self.model_path)
        model.save(self.model_path)
        print("训练初始模型完毕,保存模型")
# Module-level shared instance, created when this module is imported (which
# runs OriginModel.__init__: config lookup and DB pool util construction).
# Other modules import it via `from algorithms.OriginModel import origin_model`.
origin_model = OriginModel()
# -*- coding:utf-8 -*-
"""
Description:word2vec fine tuning
基于对应类型的额外语料进行微调
@author: WangLeAi
@date: 2018/9/11
"""
import os
from util.DBUtil import DbPoolUtil
from util.JiebaUtil import jieba_util
from util.PropertiesUtil import prop
from gensim.models import word2vec
from algorithms.OriginModel import origin_model
class Word2VecModel(object):
    """Fine-tune the base word2vec model on an extra, domain-specific corpus.

    The base model is loaded from ``origin_model_path`` (and trained first if
    missing), its vocabulary is extended with the extra corpus, and the
    result is saved to ``model_path``. The class also exposes word and
    short-text vector lookups on the fine-tuned model.

    NOTE: ``model.iter`` is the gensim 3.x attribute name (``epochs`` in
    gensim 4). Vector lookups go through ``model.wv`` because indexing the
    model object directly is deprecated.
    """

    def __init__(self):
        self.db_pool_util = DbPoolUtil(db_type="mysql")
        self.train_data_path = "gen/train_data.txt"
        self.origin_model_path = "model/oriw2v.model"
        self.model_path = "model/w2v.model"
        # Populated by load_model().
        self.model = None
        # An out-of-vocabulary word must reach this frequency to be admitted
        # into the vocabulary when it is learned on the fly.
        self.min_count = int(prop.get_config_value("config/w2v.properties", "min_count"))

    @staticmethod
    def _ensure_parent_dir(path):
        """Create the directory that will contain *path* if it is missing."""
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    @staticmethod
    def text_process(sentence):
        """Pre-process one raw text by segmenting it with jieba.

        :param sentence: raw text to segment
        :return: the result of ``jieba_util.jieba_cut`` (presumably a
            sequence of word strings - TODO confirm against JiebaUtil)
        """
        # Optional filter that drops everything except Chinese characters,
        # ASCII letters, digits, '-' and '·' (left disabled):
        # regex = re.compile(u'[^\u4e00-\u9fa50-9a-zA-Z\-·]+')
        # sentence = regex.sub('', sentence)
        return jieba_util.jieba_cut(sentence)

    def get_train_data(self):
        """Build the extra-corpus training file from the database.

        The SQL below is a blank placeholder that must be filled in by the
        user. Results are appended to the file rather than kept in memory.

        :return: None
        """
        print("创建额外语料训练数据")
        sql = """ """
        # Pass this instance (not a module-level global) so the method works
        # for any Word2VecModel object. loop_row is a project helper that
        # presumably applies the named method to every fetched row - TODO
        # confirm against DBUtil.
        sentences = self.db_pool_util.loop_row(self, "text_process", sql)
        # open(..., "a") does not create missing directories.
        self._ensure_parent_dir(self.train_data_path)
        with open(self.train_data_path, "a", encoding="utf-8") as f:
            for sentence in sentences:
                f.write(" ".join(sentence) + "\n")

    def train_model(self):
        """Fine-tune the base model on the extra corpus and save the result.

        Trains the base model first when its file is missing, and generates
        the extra-corpus file first when that is missing.

        :return: None
        """
        if not os.path.exists(self.origin_model_path):
            print("无初始模型,进行初始模型训练")
            origin_model.train_model()
        model = word2vec.Word2Vec.load(self.origin_model_path)
        print("初始模型加载完毕")
        if not os.path.exists(self.train_data_path):
            self.get_train_data()
        print("额外语料训练")
        extra_sentences = word2vec.LineSentence(self.train_data_path)
        # update=True extends the existing vocabulary instead of rebuilding it.
        model.build_vocab(extra_sentences, update=True)
        model.train(extra_sentences, total_examples=model.corpus_count, epochs=model.iter)
        self._ensure_parent_dir(self.model_path)
        model.save(self.model_path)
        print("额外语料训练完毕")

    def load_model(self):
        """Load the fine-tuned model into ``self.model``, training it if absent.

        :return: None
        """
        print("载入词嵌入模型")
        if not os.path.exists(self.model_path):
            print("无词嵌入模型,进行训练")
            self.train_model()
        self.model = word2vec.Word2Vec.load(self.model_path)
        print("词嵌入模型加载完毕")

    def _learn_sentences(self, sentences):
        """Extend the loaded model with *sentences*, train on them and save."""
        self.model.build_vocab(sentences, update=True)
        self.model.train(sentences, total_examples=self.model.corpus_count, epochs=self.model.iter)
        self._ensure_parent_dir(self.model_path)
        self.model.save(self.model_path)

    def _repeat_count(self):
        """Number of copies needed for a new word to reach ``min_count``."""
        # At least one copy, otherwise min_count == 0 would yield an empty
        # incremental corpus.
        return max(self.min_count, 1)

    def get_word_vector(self, words, extra=0):
        """Return the vector of a single word; the model must be loaded first.

        :param words: the word to look up
        :param extra: whether to learn the word when it is out of vocabulary
            (0 = no, 1 = yes); learning re-trains and re-saves the model
        :return: the word vector, or None when the word is not in the model
        """
        if extra and words not in self.model.wv:
            # One single-word sentence per required occurrence.
            self._learn_sentences([[words] for _ in range(self._repeat_count())])
        word_vectors = self.model.wv
        if words in word_vectors:
            return word_vectors[words]
        return None

    def get_sentence_vector(self, sentence, extra=0):
        """Return the vector of a text; the model must be loaded first.

        :param sentence: raw text
        :param extra: whether to learn out-of-vocabulary words first
            (0 = no, 1 = yes); learning re-trains and re-saves the model
        :return: the averaged word vector, or None when no word of the text
            is in the model
        """
        words = jieba_util.jieba_cut_flag(sentence)
        if not words:
            # Fall back to plain segmentation when the first cut yields nothing.
            words = jieba_util.jieba_cut(sentence)
        if not words:
            print("存在无法切出有效词的句子:" + sentence)
            # raise Exception("存在无法切出有效词的句子:" + sentence)
        if extra and any(item not in self.model.wv for item in words):
            # Repeat the whole sentence so its unseen words reach min_count.
            self._learn_sentences([words for _ in range(self._repeat_count())])
        return self.get_sentence_embedding(words)

    def get_sentence_embedding(self, words):
        """Average the vectors of the in-vocabulary words of a short text.

        Recommended for short texts only: on long texts the mean is dominated
        by frequent words, and gensim's doc2vec is the better choice.

        :param words: segmented words of the text
        :return: the mean vector, or None when no word is in the model
        """
        word_vectors = self.model.wv
        found = [word_vectors[item] for item in words if item in word_vectors]
        if not found:
            return None
        total = found[0]
        for vec in found[1:]:
            # Non in-place addition: never mutate a vector owned by the model.
            total = total + vec
        return total / len(found)
# Module-level shared instance, created when this module is imported (which
# runs Word2VecModel.__init__: DB pool util construction and config lookup).
# The entry script imports it via `from algorithms.Word2VecModel import w2v_model`.
w2v_model = Word2VecModel()
# -*- coding:utf-8 -*-
"""
Description:
@author: WangLeAi
@date: 2018/9/18
"""
import os
from algorithms.Word2VecModel import w2v_model
def main():
    """Load (training if necessary) the word2vec model and print a demo vector.

    Ensures the ``model`` and ``gen`` output directories exist next to this
    script before the model is loaded.
    """
    root_path = os.path.dirname(os.path.realpath(__file__))
    # makedirs(exist_ok=True) is idempotent; "gen" holds the generated
    # corpus files and "model" the saved models.
    for sub_dir in ("model", "gen"):
        os.makedirs(os.path.join(root_path, sub_dir), exist_ok=True)
    w2v_model.load_model()
    print(w2v_model.get_sentence_vector("不知不觉间我已经忘记了爱"))


if __name__ == "__main__":
    main()
下载地址