Downloading the Chinese Wikipedia text corpus and training Word2vec and doc2vec on it: code examples for text vectorization

First, download the Chinese Wikipedia dump (about 1.7 GB):
https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2
The downloaded file is named "zhwiki-latest-pages-articles.xml.bz2".
Everyone knows this link, but it was of little use to me: the download kept failing because of the Great Firewall, so I had to find another route.

After searching around I found two open-source corpus collections:

https://github.com/brightmart/nlp_chinese_corpus

https://github.com/crownpku/Awesome-Chinese-NLP  (this one lists a rich set of Chinese NLP corpus resources)

Address of the Chinese Wikipedia dumps:

https://dumps.wikimedia.org/zhwiki/latest/


https://dumps.wikimedia.org/zhwiki/

This directory contains the Wikipedia dump files; hopefully the project keeps growing and its content keeps getting richer.

As research goes deeper, it becomes clear that Chinese-language corpus resources are still rather scarce.

In practice none of the direct links worked for me: after half a day of attempts not a single download succeeded, and only after switching to the Xunlei (Thunder) download manager did the file finally come down. A torrent mirror of the dumps is another option:


http://academictorrents.com/


The directory listing on the dump site shows the individual corpus files.
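
If the browser download keeps stalling, streaming the dump from a script is another workaround. A rough sketch using the requests library (requests is my own assumption here, not something the original workflow uses; the URL is the official dump linked above, and the chunk size is arbitrary):

# -*- coding: utf-8 -*-
import requests

url = 'https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2'
out_path = 'zhwiki-latest-pages-articles.xml.bz2'

# Stream the ~1.7 GB dump to disk in chunks instead of loading it into memory.
# Note: this sketch is not resumable; a dropped connection means starting over.
with requests.get(url, stream=True, timeout=60) as r:
    r.raise_for_status()
    with open(out_path, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1 << 20):  # 1 MB chunks
            if chunk:
                f.write(chunk)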

 

Text preprocessing: data_pre_process.py

# -*- coding: utf-8 -*-
from gensim.corpora import WikiCorpus
import jieba
from langconv import *


def my_function():
    space = ' '
    i = 0
    l = []
    zhwiki_name = './data/zhwiki-latest-pages-articles.xml.bz2'
    f = open('./data/reduce_zhiwiki.txt', 'w', encoding='utf-8')
    # WikiCorpus strips the wiki markup and yields one plain-text article at a time
    wiki = WikiCorpus(zhwiki_name, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        for temp_sentence in text:
            # convert Traditional Chinese to Simplified before segmenting with jieba
            temp_sentence = Converter('zh-hans').convert(temp_sentence)
            seg_list = list(jieba.cut(temp_sentence))
            for temp_term in seg_list:
                l.append(temp_term)
        f.write(space.join(l) + '\n')
        l = []
        i = i + 1
        if i % 200 == 0:
            print('Saved ' + str(i) + ' articles')
    f.close()


if __name__ == '__main__':
    my_function()
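
Once the script finishes, every line of reduce_zhiwiki.txt should hold one article as space-separated, simplified-Chinese tokens. A quick sanity check (just a sketch, reusing the output path from above):

# -*- coding: utf-8 -*-
# Peek at the first preprocessed article to confirm the segmentation worked
with open('./data/reduce_zhiwiki.txt', 'r', encoding='utf-8') as f:
    first_line = f.readline()
print(first_line[:200])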

Word2vec training code

# -*- coding: utf-8 -*-
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


def my_function():
    wiki_news = open('D:\\yangyang\\wiki\\wikipre\\reduce_zhiwiki.txt', 'r', encoding='utf8')
    # sg=0 trains a CBOW model; sg=1 would train a skip-gram model
    # size is the dimensionality of the word vectors
    # window is the maximum distance between the current word and the predicted word
    # min_count drops words that occur fewer than this many times
    # workers is the number of threads used for training
    model = Word2Vec(LineSentence(wiki_news), sg=0, size=192, window=5, min_count=5, workers=9)
    model.save('zhiwiki_news.word2vec')


if __name__ == '__main__':
    my_function()

# 2019/6/17 19:51
# 2019-06-18 11:15:10,509 : INFO : EPOCH 1 - PROGRESS: at 0.70% examples, 65 words/s, in_qsize 18, out_qsize 0
# Training is far too slow: after 24 hours it is not even at 1%, so a full run on this single machine would take roughly 100 days; it really needs a server.
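
For what it is worth, ~65 words/s is far below gensim's usual throughput; a rate that low usually means the compiled (Cython) training routines are missing and gensim has fallen back to pure Python. A quick diagnostic sketch for the pre-4.0 gensim used here:

from gensim.models import word2vec

# -1 means the slow pure-Python fallback is active; 0/1/2 mean the compiled routines are in use
print(word2vec.FAST_VERSION)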

Testing the trained model

# coding=utf-8
import gensim


def my_function():

    model = gensim.models.Word2Vec.load('./data/zhiwiki_news.word2vec')
    print(model.similarity('西红柿', '番茄'))  # similarity ≈ 0.63
    print(model.similarity('西红柿', '香蕉'))  # similarity ≈ 0.44

    word = '中国'
    if word in model.wv.index2word:
        print(model.most_similar(word))


if __name__ == '__main__':
    my_function()

Traditional-to-Simplified Chinese conversion (langconv.py)

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from copy import deepcopy

try:
    import psyco
    psyco.full()
except:
    pass

try:
    from zh_wiki import zh2Hant, zh2Hans
except ImportError:
    from zhtools.zh_wiki import zh2Hant, zh2Hans

import sys
py3k = sys.version_info >= (3, 0, 0)

if py3k:
    UEMPTY = ''
else:
    _zh2Hant, _zh2Hans = {}, {}
    for old, new in ((zh2Hant, _zh2Hant), (zh2Hans, _zh2Hans)):
        for k, v in old.items():
            new[k.decode('utf8')] = v.decode('utf8')
    zh2Hant = _zh2Hant
    zh2Hans = _zh2Hans
    UEMPTY = ''.decode('utf8')

# states
(START, END, FAIL, WAIT_TAIL) = list(range(4))
# conditions
(TAIL, ERROR, MATCHED_SWITCH, UNMATCHED_SWITCH, CONNECTOR) = list(range(5))

MAPS = {}

class Node(object):
    def __init__(self, from_word, to_word=None, is_tail=True,
            have_child=False):
        self.from_word = from_word
        if to_word is None:
            self.to_word = from_word
            self.data = (is_tail, have_child, from_word)
            self.is_original = True
        else:
            self.to_word = to_word or from_word
            self.data = (is_tail, have_child, to_word)
            self.is_original = False
        self.is_tail = is_tail
        self.have_child = have_child

    def is_original_long_word(self):
        return self.is_original and len(self.from_word)>1

    def is_follow(self, chars):
        return chars != self.from_word[:-1]

    def __str__(self):
        return '<Node, %s, %s, %s, %s>' % (repr(self.from_word),
                repr(self.to_word), self.is_tail, self.have_child)

    __repr__ = __str__

class ConvertMap(object):
    def __init__(self, name, mapping=None):
        self.name = name
        self._map = {}
        if mapping:
            self.set_convert_map(mapping)

    def set_convert_map(self, mapping):
        convert_map = {}
        have_child = {}
        max_key_length = 0
        for key in sorted(mapping.keys()):
            if len(key)>1:
                for i in range(1, len(key)):
                    parent_key = key[:i]
                    have_child[parent_key] = True
            have_child[key] = False
            max_key_length = max(max_key_length, len(key))
        for key in sorted(have_child.keys()):
            convert_map[key] = (key in mapping, have_child[key],
                    mapping.get(key, UEMPTY))
        self._map = convert_map
        self.max_key_length = max_key_length

    def __getitem__(self, k):
        try:
            is_tail, have_child, to_word  = self._map[k]
            return Node(k, to_word, is_tail, have_child)
        except:
            return Node(k)

    def __contains__(self, k):
        return k in self._map

    def __len__(self):
        return len(self._map)

class StatesMachineException(Exception): pass

class StatesMachine(object):
    def __init__(self):
        self.state = START
        self.final = UEMPTY
        self.len = 0
        self.pool = UEMPTY

    def clone(self, pool):
        new = deepcopy(self)
        new.state = WAIT_TAIL
        new.pool = pool
        return new

    def feed(self, char, map):
        node = map[self.pool+char]

        if node.have_child:
            if node.is_tail:
                if node.is_original:
                    cond = UNMATCHED_SWITCH
                else:
                    cond = MATCHED_SWITCH
            else:
                cond = CONNECTOR
        else:
            if node.is_tail:
                cond = TAIL
            else:
                cond = ERROR

        new = None
        if cond == ERROR:
            self.state = FAIL
        elif cond == TAIL:
            if self.state == WAIT_TAIL and node.is_original_long_word():
                self.state = FAIL
            else:
                self.final += node.to_word
                self.len += 1
                self.pool = UEMPTY
                self.state = END
        elif self.state == START or self.state == WAIT_TAIL:
            if cond == MATCHED_SWITCH:
                new = self.clone(node.from_word)
                self.final += node.to_word
                self.len += 1
                self.state = END
                self.pool = UEMPTY
            elif cond == UNMATCHED_SWITCH or cond == CONNECTOR:
                if self.state == START:
                    new = self.clone(node.from_word)
                    self.final += node.to_word
                    self.len += 1
                    self.state = END
                else:
                    if node.is_follow(self.pool):
                        self.state = FAIL
                    else:
                        self.pool = node.from_word
        elif self.state == END:
            # END is a new START
            self.state = START
            new = self.feed(char, map)
        elif self.state == FAIL:
            raise StatesMachineException('Translate States Machine '
                    'have error with input data %s' % node)
        return new

    def __len__(self):
        return self.len + 1

    def __str__(self):
        return '<StatesMachine %s, pool: "%s", state: %s, final: %s>' % (
                id(self), self.pool, self.state, self.final)
    __repr__ = __str__

class Converter(object):
    def __init__(self, to_encoding):
        self.to_encoding = to_encoding
        self.map = MAPS[to_encoding]
        self.start()

    def feed(self, char):
        branches = []
        for fsm in self.machines:
            new = fsm.feed(char, self.map)
            if new:
                branches.append(new)
        if branches:
            self.machines.extend(branches)
        self.machines = [fsm for fsm in self.machines if fsm.state != FAIL]
        all_ok = True
        for fsm in self.machines:
            if fsm.state != END:
                all_ok = False
        if all_ok:
            self._clean()
        return self.get_result()

    def _clean(self):
        if len(self.machines):
            self.machines.sort(key=lambda x: len(x))
            # self.machines.sort(cmp=lambda x,y: cmp(len(x), len(y)))
            self.final += self.machines[0].final
        self.machines = [StatesMachine()]

    def start(self):
        self.machines = [StatesMachine()]
        self.final = UEMPTY

    def end(self):
        self.machines = [fsm for fsm in self.machines
                if fsm.state == FAIL or fsm.state == END]
        self._clean()

    def convert(self, string):
        self.start()
        for char in string:
            self.feed(char)
        self.end()
        return self.get_result()

    def get_result(self):
        return self.final


def registery(name, mapping):
    global MAPS
    MAPS[name] = ConvertMap(name, mapping)

registery('zh-hant', zh2Hant)
registery('zh-hans', zh2Hans)
del zh2Hant, zh2Hans


def run():
    import sys
    from optparse import OptionParser
    parser = OptionParser()
    parser.add_option('-e', type='string', dest='encoding',
            help='encoding')
    parser.add_option('-f', type='string', dest='file_in',
            help='input file (- for stdin)')
    parser.add_option('-t', type='string', dest='file_out',
            help='output file')
    (options, args) = parser.parse_args()
    if not options.encoding:
        parser.error('encoding must be set')
    if options.file_in:
        if options.file_in == '-':
            file_in = sys.stdin
        else:
            file_in = open(options.file_in)
    else:
        file_in = sys.stdin
    if options.file_out:
        if options.file_out == '-':
            file_out = sys.stdout
        else:
            file_out = open(options.file_out, 'wb')
    else:
        file_out = sys.stdout

    c = Converter(options.encoding)
    for line in file_in:
        file_out.write(c.convert(line.rstrip('\n')))


if __name__ == '__main__':
    run()
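
The preprocessing and doc2vec scripts above only use the Converter class from this module. A minimal standalone usage sketch (assuming zh_wiki.py with the zh2Hant/zh2Hans tables sits next to langconv.py, as the module expects):

# -*- coding: utf-8 -*-
from langconv import Converter

# Traditional -> Simplified
print(Converter('zh-hans').convert('繁體中文轉換為簡體中文'))
# Simplified -> Traditional
print(Converter('zh-hant').convert('简体中文转换为繁体中文'))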

Computing document similarity (getKeywords comes from a separate keyword_extract module, not shown in this post)

# -*- coding: utf-8 -*-
import codecs
import gensim
import numpy as np
from keyword_extract import *

wordvec_size = 192


def get_char_pos(string, char):
    # return the positions of every occurrence of char in string
    chPos = [pos for pos, val in enumerate(string) if val == char]
    return chPos


def word2vec(file_name, model):
    with codecs.open(file_name, 'r') as f:
        word_vec_all = np.zeros(wordvec_size)
        for data in f:
            space_pos = get_char_pos(data, ' ')
            if not space_pos:
                continue
            first_word = data[0:space_pos[0]]
            if model.__contains__(first_word):
                word_vec_all = word_vec_all + model[first_word]

            for i in range(len(space_pos) - 1):
                # start one past the space so the lookup key has no leading blank
                word = data[space_pos[i] + 1:space_pos[i + 1]]
                if model.__contains__(word):
                    word_vec_all = word_vec_all + model[word]
        return word_vec_all


# cosine similarity of two vectors
def simlarityCalu(vector1, vector2):
    vector1Mod = np.sqrt(vector1.dot(vector1))
    vector2Mod = np.sqrt(vector2.dot(vector2))
    if vector2Mod != 0 and vector1Mod != 0:
        simlarity = (vector1.dot(vector2))/(vector1Mod*vector2Mod)
    else:
        simlarity = 0
    return simlarity


if __name__ == '__main__':
    model = gensim.models.Word2Vec.load('data/zhiwiki_news.word2vec')
    p1 = './data/P1.txt'
    p2 = './data/P2.txt'
    p1_keywords = './data/P1_keywords.txt'
    p2_keywords = './data/P2_keywords.txt'
    getKeywords(p1, p1_keywords)
    getKeywords(p2, p2_keywords)
    p1_vec = word2vec(p1_keywords, model)
    p2_vec = word2vec(p2_keywords, model)

    print(simlarityCalu(p1_vec, p2_vec))

Doc2vec training code

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import gensim.models as g
from gensim.corpora import WikiCorpus
import logging
from langconv import *


# enable logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

docvec_size = 192


class TaggedWikiDocument(object):
    def __init__(self, wiki):
        self.wiki = wiki
        self.wiki.metadata = True

    def __iter__(self):
        import jieba
        for content, (page_id, title) in self.wiki.get_texts():
            yield g.doc2vec.LabeledSentence(words=[w for c in content
                                                   for w in jieba.cut(Converter('zh-hans').convert(c))], tags=[title])


def my_function():
    zhwiki_name = './data/zhwiki-latest-pages-articles.xml.bz2'
    wiki = WikiCorpus(zhwiki_name, lemmatize=False, dictionary={})
    documents = TaggedWikiDocument(wiki)

    # dm=0 selects the PV-DBOW variant; dbow_words=1 also trains word vectors alongside the document vectors
    model = g.Doc2Vec(documents, dm=0, dbow_words=1, size=docvec_size, window=8, min_count=19, iter=5, workers=8)
    model.save('data/zhiwiki_news.doc2vec')


if __name__ == '__main__':
    my_function()
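
A minimal sketch of loading the saved model and querying it afterwards. infer_vector and docvecs.most_similar are standard calls in the pre-4.0 gensim used above; the query sentence is only an illustration:

# -*- coding: utf-8 -*-
import gensim.models as g
import jieba

model = g.Doc2Vec.load('data/zhiwiki_news.doc2vec')

# Infer a vector for an unseen, jieba-segmented piece of text
tokens = list(jieba.cut('自然语言处理是人工智能的一个重要方向'))
vec = model.infer_vector(tokens)

# Titles of the most similar Wikipedia articles (the tags were article titles)
print(model.docvecs.most_similar([vec], topn=5))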

