分词:
import jieba
import jieba.analyse
import jieba.posseg as pseg
import codecs, sys
def cut_words(sentence):
    """Segment *sentence* with jieba and return the result as UTF-8 bytes.

    The tokens produced by ``jieba.cut`` are joined with single spaces and
    the joined text is then encoded with UTF-8 before being returned.
    """
    tokens = jieba.cut(sentence)
    joined = " ".join(tokens)
    return joined.encode('utf-8')
# Segment the corpus in 'wiki.zh.jian.text' one line at a time with jieba and
# write the space-separated tokens to 'zh.jian.wiki.seg-1.3g.txt'.
#
# Both files are handled by context managers so they are closed even if
# segmentation raises part-way through the corpus.
with codecs.open('wiki.zh.jian.text', 'r', encoding="utf8") as f:
    with codecs.open("zh.jian.wiki.seg-1.3g.txt", 'w', encoding="utf8") as target:
        print('open files')
        # Iterating the reader yields one line per step and stops at EOF,
        # so no explicit readline()/counter bookkeeping is needed.
        for line_num, line in enumerate(f, 1):
            print('---- processing ', line_num, ' article----------------')
            # No newline is appended here; presumably the line terminator
            # survives segmentation as its own token -- TODO confirm.
            target.write(" ".join(jieba.cut(line)))

# Terminate explicitly once segmentation is finished.
sys.exit()
模型训练:
import logging
import os.path
import sys
import multiprocessing
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
if __name__ == '__main__':
    # Train a Word2Vec model from a pre-segmented corpus.
    #
    # Command-line arguments (all required):
    #   1. inp   -- corpus path, handed to LineSentence
    #   2. outp1 -- where the full model is saved via model.save()
    #   3. outp2 -- where the vectors are saved in word2vec text format
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s", ' '.join(sys.argv))

    # check and process input arguments
    if len(sys.argv) < 4:
        # This script has no module docstring, so formatting __doc__ would
        # raise TypeError; print an explicit usage line instead.
        print("Usage: python %s <segmented_corpus> <model_out> <vectors_out>" % program)
        sys.exit(1)
    inp, outp1, outp2 = sys.argv[1:4]

    model = Word2Vec(
        LineSentence(inp),
        size=400,
        window=5,
        min_count=5,
        workers=multiprocessing.cpu_count(),
    )
    model.save(outp1)
    # The vectors live on model.wv; Word2Vec has no `.model` attribute.
    model.wv.save_word2vec_format(outp2, binary=False)
模型测试:
from gensim.models import Word2Vec
# Load the trained model and print the nearest neighbours of a few probe words.
en_wiki_word2vec_model = Word2Vec.load('wiki.zh.text.model')
testwords = ['股票', '有色金属', '南京大学', '白痴', '篮球']
# Iterate the words directly so the loop follows the list's length.
for word in testwords:
    # Query through the model's KeyedVectors (`.wv`); calling most_similar
    # on the Word2Vec object itself is deprecated in gensim.
    res = en_wiki_word2vec_model.wv.most_similar(word)
    print(word)
    print(res)
股票
[('普通股', 0.7026801109313965), ('期货', 0.694530725479126), ('股价', 0.6763377785682678), ('公司股票', 0.6681246757507324), ('投资人', 0.6606836318969727), ('新股', 0.6576610803604126), ('优先股', 0.6528714895248413), ('万股', 0.6510456204414368), ('债券', 0.6507710218429565), ('权证', 0.647611141204834)]
有色金属
[('石油化工', 0.8271920680999756), ('深加工', 0.7927677631378174), ('精细化工', 0.7625068426132202), ('稀土', 0.7617228031158447), ('耐火材料', 0.7608179450035095), ('黑色金属', 0.7601447105407715), ('冶金工业', 0.7555592060089111), ('焦煤', 0.7522554397583008), ('冶金', 0.751286506652832), ('铝矾土', 0.7485761046409607)]
南京大学
[('东南大学', 0.7753866910934448), ('武汉大学', 0.7542777061462402), ('北京师范大学', 0.7464444041252136), ('四川大学', 0.7437261343002319), ('浙江大学', 0.743655800819397), ('华东师范大学', 0.7429168820381165), ('华中科技大学', 0.7348958253860474), ('复旦大学', 0.7330772280693054), ('杭州大学', 0.7318748235702515), ('湖南大学', 0.7235416173934937)]
白痴
[('书呆子', 0.6184146404266357), ('疯子', 0.6043859720230103), ('笨蛋', 0.5833420753479004), ('小聪明', 0.5805025696754456), ('爱哭鬼', 0.5668667554855347), ('骗子', 0.5608910918235779), ('傻子', 0.5529210567474365), ('天才', 0.5403788089752197), ('傻瓜', 0.5321439504623413), ('变态', 0.5319798588752747)]
篮球
[('美式足球', 0.6267459392547607), ('男子篮球', 0.5999912619590759), ('冰球', 0.5870978832244873), ('棒球', 0.5742350816726685), ('橄榄球', 0.5719608068466187), ('篮球队', 0.5555766820907593), ('排球', 0.5517430305480957), ('篮球运动', 0.5373520851135254), ('足球', 0.5356222987174988), ('曲棍球', 0.5111766457557678)]