First, download the Chinese Wikipedia corpus (about 1.7 GB):
https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2
The downloaded file is named "zhwiki-latest-pages-articles.xml.bz2".
Everyone knows this link, but in practice it is hard to use: behind the firewall the download keeps failing, so another route is needed.
After some searching, two open-source corpus collections turned up:
https://github.com/brightmart/nlp_chinese_corpus
https://github.com/crownpku/Awesome-Chinese-NLP  (this one lists a rich set of corpus resources)
Chinese Wikipedia dump addresses:
https://dumps.wikimedia.org/zhwiki/latest/
https://dumps.wikimedia.org/zhwiki/
These contain the Wikipedia corpus; hopefully the projects keep growing and the content keeps getting richer.
As research deepens, Chinese-language corpus resources are still rather scarce.
None of this worked at first: after half a day of failed downloads, switching to the Thunder (迅雷) download manager finally got the file through.
http://academictorrents.com/
(a directory of corpus download addresses)
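If the browser download keeps stalling, a scripted download is easier to retry. A minimal sketch using the requests package (the URL is the official dump above; the local path simply matches the preprocessing script below, both are assumptions about your layout):

# stream the dump to disk in 1 MB chunks (requires: pip install requests)
import requests

url = 'https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2'
with requests.get(url, stream=True, timeout=60) as r:
    r.raise_for_status()
    with open('./data/zhwiki-latest-pages-articles.xml.bz2', 'wb') as f:
        for chunk in r.iter_content(chunk_size=1 << 20):
            f.write(chunk)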
Text preprocessing: data_pre_process.py
# -*- coding: utf-8 -*-
from gensim.corpora import WikiCorpus
import jieba
from langconv import *


def my_function():
    space = ' '
    i = 0
    l = []
    zhwiki_name = './data/zhwiki-latest-pages-articles.xml.bz2'
    f = open('./data/reduce_zhiwiki.txt', 'w', encoding='utf-8')
    wiki = WikiCorpus(zhwiki_name, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        for temp_sentence in text:
            # convert traditional characters to simplified before segmentation
            temp_sentence = Converter('zh-hans').convert(temp_sentence)
            seg_list = list(jieba.cut(temp_sentence))
            for temp_term in seg_list:
                l.append(temp_term)
        # write one segmented article per line
        f.write(space.join(l) + '\n')
        l = []
        i = i + 1
        if i % 200 == 0:
            print('Saved ' + str(i) + ' articles')
    f.close()


if __name__ == '__main__':
    my_function()
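Each line of reduce_zhiwiki.txt is one article, already converted to simplified characters and space-segmented. A quick way to eyeball the output (a hypothetical check, not part of the original script):

# print the token count and the first 80 characters of the first two articles
with open('./data/reduce_zhiwiki.txt', encoding='utf-8') as f:
    for _ in range(2):
        line = f.readline()
        print(len(line.split()), 'tokens:', line[:80])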
word2vec training code
# -*- coding: utf-8 -*-
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


def my_function():
    wiki_news = open('D:\\yangyang\\wiki\\wikipre\\reduce_zhiwiki.txt', 'r', encoding='utf8')
    # sg=0 trains a CBOW model, sg=1 trains a skip-gram model
    # size is the dimensionality of the word vectors
    # window is the maximum distance between the current and the predicted word
    # min_count is the minimum number of occurrences for a word to be kept
    # workers is the number of threads used for training
    model = Word2Vec(LineSentence(wiki_news), sg=0, size=192, window=5, min_count=5, workers=9)
    model.save('zhiwiki_news.word2vec')


if __name__ == '__main__':
    my_function()
# 2019/6/17 19:51
# 2019-06-18 11:15:10,509 : INFO : EPOCH 1 - PROGRESS: at 0.70% examples, 65 words/s, in_qsize 18, out_qsize 0
# Training is far too slow on this machine: less than 1% of an epoch in 24 hours, so a full run would take roughly 100 days. A server is really needed.
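Before committing to a 100-day run, it is worth timing the pipeline on a small slice of the corpus. Word2Vec iterates over the corpus several times (once to build the vocabulary, then once per epoch), so the slice has to be restartable; the HeadCorpus wrapper and the 5000-line limit below are illustrative assumptions, not part of the original code:

# train on only the first `limit` articles to estimate speed before a full run
from itertools import islice
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence


class HeadCorpus(object):
    """Yields the first `limit` lines of the segmented corpus on every pass."""
    def __init__(self, path, limit):
        self.path = path
        self.limit = limit

    def __iter__(self):
        return islice(LineSentence(self.path), self.limit)


small_model = Word2Vec(HeadCorpus('./data/reduce_zhiwiki.txt', 5000),
                       sg=0, size=192, window=5, min_count=5, workers=9)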
Test the trained word vectors
# coding=utf-8
import gensim


def my_function():
    model = gensim.models.Word2Vec.load('./data/zhiwiki_news.word2vec')
    print(model.similarity('西红柿', '番茄'))  # similarity is about 0.63
    print(model.similarity('西红柿', '香蕉'))  # similarity is about 0.44
    word = '中国'
    if word in model.wv.index2word:
        print(model.most_similar(word))


if __name__ == '__main__':
    my_function()
Traditional-to-simplified Chinese conversion: langconv.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# langconv.py: Traditional/Simplified Chinese converter; the mapping tables come from zh_wiki.py (zhtools project)

from copy import deepcopy

try:
    import psyco
    psyco.full()
except:
    pass

try:
    from zh_wiki import zh2Hant, zh2Hans
except ImportError:
    from zhtools.zh_wiki import zh2Hant, zh2Hans

import sys
py3k = sys.version_info >= (3, 0, 0)

if py3k:
    UEMPTY = ''
else:
    _zh2Hant, _zh2Hans = {}, {}
    for old, new in ((zh2Hant, _zh2Hant), (zh2Hans, _zh2Hans)):
        for k, v in old.items():
            new[k.decode('utf8')] = v.decode('utf8')
    zh2Hant = _zh2Hant
    zh2Hans = _zh2Hans
    UEMPTY = ''.decode('utf8')

# states
(START, END, FAIL, WAIT_TAIL) = list(range(4))
# conditions
(TAIL, ERROR, MATCHED_SWITCH, UNMATCHED_SWITCH, CONNECTOR) = list(range(5))

MAPS = {}


class Node(object):
    def __init__(self, from_word, to_word=None, is_tail=True,
            have_child=False):
        self.from_word = from_word
        if to_word is None:
            self.to_word = from_word
            self.data = (is_tail, have_child, from_word)
            self.is_original = True
        else:
            self.to_word = to_word or from_word
            self.data = (is_tail, have_child, to_word)
            self.is_original = False
        self.is_tail = is_tail
        self.have_child = have_child

    def is_original_long_word(self):
        return self.is_original and len(self.from_word) > 1

    def is_follow(self, chars):
        return chars != self.from_word[:-1]

    def __str__(self):
        return '<Node, %s, %s, %s, %s>' % (repr(self.from_word),
                repr(self.to_word), self.is_tail, self.have_child)

    __repr__ = __str__


class ConvertMap(object):
    def __init__(self, name, mapping=None):
        self.name = name
        self._map = {}
        if mapping:
            self.set_convert_map(mapping)

    def set_convert_map(self, mapping):
        convert_map = {}
        have_child = {}
        max_key_length = 0
        for key in sorted(mapping.keys()):
            if len(key) > 1:
                for i in range(1, len(key)):
                    parent_key = key[:i]
                    have_child[parent_key] = True
            have_child[key] = False
            max_key_length = max(max_key_length, len(key))
        for key in sorted(have_child.keys()):
            convert_map[key] = (key in mapping, have_child[key],
                    mapping.get(key, UEMPTY))
        self._map = convert_map
        self.max_key_length = max_key_length

    def __getitem__(self, k):
        try:
            is_tail, have_child, to_word = self._map[k]
            return Node(k, to_word, is_tail, have_child)
        except:
            return Node(k)

    def __contains__(self, k):
        return k in self._map

    def __len__(self):
        return len(self._map)


class StatesMachineException(Exception): pass


class StatesMachine(object):
    def __init__(self):
        self.state = START
        self.final = UEMPTY
        self.len = 0
        self.pool = UEMPTY

    def clone(self, pool):
        new = deepcopy(self)
        new.state = WAIT_TAIL
        new.pool = pool
        return new

    def feed(self, char, map):
        node = map[self.pool + char]

        if node.have_child:
            if node.is_tail:
                if node.is_original:
                    cond = UNMATCHED_SWITCH
                else:
                    cond = MATCHED_SWITCH
            else:
                cond = CONNECTOR
        else:
            if node.is_tail:
                cond = TAIL
            else:
                cond = ERROR

        new = None
        if cond == ERROR:
            self.state = FAIL
        elif cond == TAIL:
            if self.state == WAIT_TAIL and node.is_original_long_word():
                self.state = FAIL
            else:
                self.final += node.to_word
                self.len += 1
                self.pool = UEMPTY
                self.state = END
        elif self.state == START or self.state == WAIT_TAIL:
            if cond == MATCHED_SWITCH:
                new = self.clone(node.from_word)
                self.final += node.to_word
                self.len += 1
                self.state = END
                self.pool = UEMPTY
            elif cond == UNMATCHED_SWITCH or cond == CONNECTOR:
                if self.state == START:
                    new = self.clone(node.from_word)
                    self.final += node.to_word
                    self.len += 1
                    self.state = END
                else:
                    if node.is_follow(self.pool):
                        self.state = FAIL
                    else:
                        self.pool = node.from_word
        elif self.state == END:
            # END is a new START
            self.state = START
            new = self.feed(char, map)
        elif self.state == FAIL:
            raise StatesMachineException('Translate States Machine '
                    'have error with input data %s' % node)
        return new

    def __len__(self):
        return self.len + 1

    def __str__(self):
        return '<StatesMachine %s, pool: "%s", state: %s, final: %s>' % (
                id(self), self.pool, self.state, self.final)

    __repr__ = __str__


class Converter(object):
    def __init__(self, to_encoding):
        self.to_encoding = to_encoding
        self.map = MAPS[to_encoding]
        self.start()

    def feed(self, char):
        branches = []
        for fsm in self.machines:
            new = fsm.feed(char, self.map)
            if new:
                branches.append(new)
        if branches:
            self.machines.extend(branches)
        self.machines = [fsm for fsm in self.machines if fsm.state != FAIL]
        all_ok = True
        for fsm in self.machines:
            if fsm.state != END:
                all_ok = False
        if all_ok:
            self._clean()
        return self.get_result()

    def _clean(self):
        if len(self.machines):
            self.machines.sort(key=lambda x: len(x))
            # self.machines.sort(cmp=lambda x,y: cmp(len(x), len(y)))
            self.final += self.machines[0].final
        self.machines = [StatesMachine()]

    def start(self):
        self.machines = [StatesMachine()]
        self.final = UEMPTY

    def end(self):
        self.machines = [fsm for fsm in self.machines
                if fsm.state == FAIL or fsm.state == END]
        self._clean()

    def convert(self, string):
        self.start()
        for char in string:
            self.feed(char)
        self.end()
        return self.get_result()

    def get_result(self):
        return self.final


def registery(name, mapping):
    global MAPS
    MAPS[name] = ConvertMap(name, mapping)

registery('zh-hant', zh2Hant)
registery('zh-hans', zh2Hans)
del zh2Hant, zh2Hans


def run():
    import sys
    from optparse import OptionParser
    parser = OptionParser()
    parser.add_option('-e', type='string', dest='encoding',
            help='encoding')
    parser.add_option('-f', type='string', dest='file_in',
            help='input file (- for stdin)')
    parser.add_option('-t', type='string', dest='file_out',
            help='output file')
    (options, args) = parser.parse_args()
    if not options.encoding:
        parser.error('encoding must be set')
    if options.file_in:
        if options.file_in == '-':
            file_in = sys.stdin
        else:
            file_in = open(options.file_in)
    else:
        file_in = sys.stdin
    if options.file_out:
        if options.file_out == '-':
            file_out = sys.stdout
        else:
            file_out = open(options.file_out, 'wb')
    else:
        file_out = sys.stdout

    c = Converter(options.encoding)
    for line in file_in:
        file_out.write(c.convert(line.rstrip('\n')))


if __name__ == '__main__':
    run()
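In this project only the 'zh-hans' (simplified) direction is used, via Converter('zh-hans').convert(...) as in the preprocessing script. A minimal usage sketch (the sample string is arbitrary):

from langconv import Converter

print(Converter('zh-hans').convert('繁體字轉簡體字'))  # -> 繁体字转简体字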
Computing document similarity
# -*- coding: utf-8 -*-
import codecs
import gensim
import numpy as np

from keyword_extract import *

wordvec_size = 192


def get_char_pos(string, char):
    chPos = []
    try:
        chPos = [pos for pos, val in enumerate(string) if val == char]
    except:
        pass
    return chPos


def word2vec(file_name, model):
    # sum the word vectors of all keywords in the file
    with codecs.open(file_name, 'r', 'utf-8') as f:
        word_vec_all = np.zeros(wordvec_size)
        for data in f:
            space_pos = get_char_pos(data, ' ')
            first_word = data[0:space_pos[0]]
            if model.__contains__(first_word):
                word_vec_all = word_vec_all + model[first_word]
            for i in range(len(space_pos) - 1):
                # skip the leading space, otherwise the word never matches the vocabulary
                word = data[space_pos[i] + 1:space_pos[i + 1]]
                if model.__contains__(word):
                    word_vec_all = word_vec_all + model[word]
        return word_vec_all


# compute the cosine similarity of two vectors
def simlarityCalu(vector1, vector2):
    vector1Mod = np.sqrt(vector1.dot(vector1))
    vector2Mod = np.sqrt(vector2.dot(vector2))
    if vector2Mod != 0 and vector1Mod != 0:
        simlarity = (vector1.dot(vector2)) / (vector1Mod * vector2Mod)
    else:
        simlarity = 0
    return simlarity


if __name__ == '__main__':
    model = gensim.models.Word2Vec.load('data/zhiwiki_news.word2vec')
    p1 = './data/P1.txt'
    p2 = './data/P2.txt'
    p1_keywords = './data/P1_keywords.txt'
    p2_keywords = './data/P2_keywords.txt'
    getKeywords(p1, p1_keywords)
    getKeywords(p2, p2_keywords)
    p1_vec = word2vec(p1_keywords, model)
    p2_vec = word2vec(p2_keywords, model)
    print(simlarityCalu(p1_vec, p2_vec))
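This script imports getKeywords from keyword_extract.py, which is not listed here; all it needs to do is write one space-separated line of keywords for each input document. A hypothetical stand-in built on jieba.analyse (the topK value and the file handling are assumptions, not the original module):

# hypothetical keyword_extract.py replacement: TF-IDF keywords via jieba
import codecs
import jieba.analyse


def getKeywords(input_path, output_path, topK=18):
    with codecs.open(input_path, 'r', 'utf-8') as f:
        text = f.read()
    keywords = jieba.analyse.extract_tags(text, topK=topK)
    with codecs.open(output_path, 'w', 'utf-8') as f:
        f.write(' '.join(keywords) + '\n')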
doc2vec
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import gensim.models as g
from gensim.corpora import WikiCorpus
import logging
from langconv import *

# enable logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

docvec_size = 192


class TaggedWikiDocument(object):
    def __init__(self, wiki):
        self.wiki = wiki
        self.wiki.metadata = True

    def __iter__(self):
        import jieba
        for content, (page_id, title) in self.wiki.get_texts():
            yield g.doc2vec.LabeledSentence(
                words=[w for c in content
                       for w in jieba.cut(Converter('zh-hans').convert(c))],
                tags=[title])


def my_function():
    zhwiki_name = './data/zhwiki-latest-pages-articles.xml.bz2'
    wiki = WikiCorpus(zhwiki_name, lemmatize=False, dictionary={})
    documents = TaggedWikiDocument(wiki)
    model = g.Doc2Vec(documents, dm=0, dbow_words=1, size=docvec_size,
                      window=8, min_count=19, iter=5, workers=8)
    model.save('data/zhiwiki_news.doc2vec')


if __name__ == '__main__':
    my_function()
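Once trained, document vectors can be queried by their tags (article titles) or inferred for new, segmented text. A minimal sketch, assuming the same older gensim API used above (docvecs attribute, size/iter keywords); the example title and sentence are arbitrary:

import jieba
import gensim.models as g
from langconv import *

model = g.Doc2Vec.load('data/zhiwiki_news.doc2vec')
# nearest articles to a trained tag (the title must exist in the corpus)
print(model.docvecs.most_similar('数学', topn=5))
# infer a vector for unseen text after the same simplify-and-segment preprocessing
words = list(jieba.cut(Converter('zh-hans').convert('機器學習是人工智能的一個分支')))
print(model.infer_vector(words)[:5])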