pip install word2vec
import word2vec
word2vec.word2vec('corpusSegDone.txt', 'corpusWord2Vec.bin', size=300, verbose=True)
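The trained .bin file can be loaded back with the same package. A minimal sketch; the query word '中国' is only an illustrative placeholder:

import word2vec

model = word2vec.load('corpusWord2Vec.bin')  # load the binary vectors trained above
print(model.vectors.shape)                   # (vocab_size, 300)
indexes, metrics = model.cosine('中国')      # nearest neighbours by cosine similarity
print(model.generate_response(indexes, metrics).tolist())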
pip install gensim
import logging
import multiprocessing
import os.path
import sys
from gensim.models import Word2Vec
from gensim.models.word2vec import PathLineSentences

if __name__ == '__main__':
    # logging setup
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    # check and process input arguments
    # if len(sys.argv) < 4:
    #     print(globals()['__doc__'] % locals())
    #     sys.exit(1)
    # input_dir, outp1, outp2 = sys.argv[1:4]
    input_dir = 'segment'
    outp1 = 'baike.model'
    outp2 = 'word2vec_format'

    # Train the model.
    # Corpus directory: PathLineSentences(input_dir) streams every file in it.
    # Embedding size 256, co-occurrence window 10, drop words seen fewer than
    # 5 times, one worker per CPU core, 10 training iterations.
    # (gensim >= 4.0 renames size -> vector_size and iter -> epochs.)
    model = Word2Vec(PathLineSentences(input_dir),
                     size=256, window=10, min_count=5,
                     workers=multiprocessing.cpu_count(), iter=10)
    model.save(outp1)
    model.wv.save_word2vec_format(outp2, binary=False)

# Run with the training-file directory as input: python word2vec_model.py data baike.model baike.vector
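Once training finishes, both artifacts can be loaded back for queries. A minimal sketch; the query word '中国' is only an illustrative placeholder:

from gensim.models import Word2Vec, KeyedVectors

model = Word2Vec.load('baike.model')  # full model, training can be resumed
wv = KeyedVectors.load_word2vec_format('word2vec_format', binary=False)  # vectors only
print(model.wv.most_similar('中国', topn=5))  # nearest words by cosine similarity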
extract_features.py: the script that generates sentence vectors
It needs three kinds of files from the pretrained BERT release:
1. TensorFlow model files (bert_model.ckpt): the pretrained weights; the checkpoint consists of three files
2. Vocabulary file (vocab.txt): the mapping between tokens and ids
3. Config file (bert_config.json): the model's hyperparameters
python extract_features.py \
  --input_file="./data/input.txt" \
  --output_file="./data/output.jsonl" \
  --vocab_file="./chinese_L-12_H-768_A-12/vocab.txt" \
  --bert_config_file="./chinese_L-12_H-768_A-12/bert_config.json" \
  --init_checkpoint="./chinese_L-12_H-768_A-12/bert_model.ckpt" \
  --layers=-2 \
  --max_seq_length=128 \
  --batch_size=8
layers: which layers to export; -1 is the last layer, -2 the second-to-last, and so on
max_seq_length: the maximum sentence length; set it according to your task. If GPU memory is tight, reduce this value to save memory
Each input line produces one JSON object in the output file (values truncated here):
{
  "linex_index": 1,
  "features": [{
    "token": "[CLS]",
    "layers": [{
      "index": -1,
      "values": [-0.2844, 0.450896, 0.285645, 0.421341, 0.411053, ...]
    }]
  }]
}
pip install bert-serving-server # server
pip install bert-serving-client # client, independent of `bert-serving-server`
cd D:\Anaconda3\Scripts
bert-serving-start -model_dir E:/chinese_L-12_H-768_A-12 -num_worker=1
from bert_serving.client import BertClient

bc = BertClient()
# with the server's default REDUCE_MEAN pooling, each sentence becomes one 768-dim vector
vec = bc.encode(["今天天气真好", "我感冒了"])
print(vec)  # shape (2, 768)
from bert_serving.client import BertClient
import numpy as np

bc = BertClient()
result = []
value = 0.90  # similarity threshold, presumably for a later comparison step
with open('1.txt', 'r', encoding='utf-8-sig') as f:
    for line in f:
        result.append(line.strip('\n'))
Input = bc.encode(result)  # one 768-dim vector per line of 1.txt
print(Input)
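The unused value = 0.90 above suggests a follow-up similarity check; a minimal sketch of pairwise cosine similarity over the encoded lines, assuming that intent (it reuses Input, result and value from the snippet above):

def cosine(a, b):
    # cosine similarity between two 1-D vectors
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

for i in range(len(Input)):
    for j in range(i + 1, len(Input)):
        sim = cosine(Input[i], Input[j])
        if sim >= value:  # report pairs above the 0.90 threshold
            print(result[i], '<->', result[j], sim)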
bert-serving-start -pooling_strategy NONE -model_dir E:/chinese_L-12_H-768_A-12/
With pooling disabled, the server returns token-level embeddings, padded to max_seq_len (default 25):
from bert_serving.client import BertClient

bc = BertClient()
vec = bc.encode(['hey you', 'whats up?'])
print('vec.shape:', vec.shape)  # (2, 25, 768)
vec[0]      # (25, 768), token-level embeddings for 'hey you'
vec[0][0]   # (768,), embedding of [CLS]
vec[0][1]   # (768,), embedding of 'hey'
vec[0][2]   # (768,), embedding of 'you'
vec[0][3]   # (768,), embedding of [SEP]
vec[0][4]   # (768,), embedding of a padding symbol
vec[0][25]  # IndexError, out of bounds!
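To collapse these token-level outputs into one fixed-length sentence vector, a common option is to average the non-padding rows yourself. A minimal numpy sketch; treating all-zero rows as padding is an assumption about how the server pads, so verify it against your output:

import numpy as np

def mean_pool(token_vecs):
    # keep rows that are not all-zero padding (assumption), then average them
    mask = np.abs(token_vecs).sum(axis=1) > 0
    return token_vecs[mask].mean(axis=0)

sent_vec = mean_pool(vec[0])
print(sent_vec.shape)  # (768,)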
from bert.extrac_feature import BertVector

bv = BertVector()
bv.encode(['今天天气不错'])
Sample output (truncated):
[[ 1.21984698e-01  7.84057677e-02 -1.06496774e-01 -3.25891018e-01
   4.94978607e-01 -4.69692767e-01  2.54333645e-01 -8.82656407e-03 ...