import logging
import random
import numpy as np
import torch
# Emit INFO-and-above records, each prefixed with a timestamp and the level name.
logging.basicConfig(format='%(asctime)-15s %(levelname)s: %(message)s', level=logging.INFO)

# set seed: feed one shared value to every random number generator this script imports
seed = 666
for _seed_rng in (random.seed, np.random.seed, torch.cuda.manual_seed, torch.manual_seed):
    _seed_rng(seed)

# split data to 10 fold: source file and fold count used by the splitting code below
data_file = '../data/train_set.csv'
fold_num = 10
import pandas as pd
def all_data2fold(fold_num, num=10000):
    """Split the first ``num`` rows of ``data_file`` into ``fold_num`` equal folds.

    The module-level ``data_file`` is read as a tab-separated file with
    ``text`` and ``label`` columns. Rows are shuffled, grouped by label, and
    each label's rows are dealt out to the folds in consecutive,
    non-overlapping chunks so every fold receives a near-equal share of every
    label. Folds are then trimmed or topped up to exactly
    ``total // fold_num`` rows and shuffled internally. Any rows left over
    after that (at most ``fold_num - 1``) are dropped.

    Args:
        fold_num: Number of folds to produce.
        num: Maximum number of leading rows of the file to use.

    Returns:
        A list of ``fold_num`` dicts, each with a ``'label'`` list and a
        ``'text'`` list of equal length.
    """
    frame = pd.read_csv(data_file, sep='\t', encoding='UTF-8')
    texts = frame['text'].tolist()[:num]
    labels = frame['label'].tolist()[:num]
    total = len(labels)

    # Shuffle once up front so the per-label chunks below are random samples.
    order = list(range(total))
    np.random.shuffle(order)
    all_texts = [texts[i] for i in order]
    all_labels = [labels[i] for i in order]

    # Map each label (as a string) to the shuffled positions carrying it.
    label2id = {}
    for position, label in enumerate(all_labels):
        label2id.setdefault(str(label), []).append(position)

    # Deal every label's positions out to the folds. The first `remainder`
    # folds take one extra item; a running offset keeps consecutive chunks
    # disjoint so no sample is assigned twice and none is skipped.
    all_index = [[] for _ in range(fold_num)]
    for positions in label2id.values():
        base, remainder = divmod(len(positions), fold_num)
        offset = 0
        for fold in range(fold_num):
            size = base + 1 if fold < remainder else base
            all_index[fold].extend(positions[offset:offset + size])
            offset += size

    # Equalise fold sizes: oversized folds donate their tail to a spare pool
    # and undersized folds draw from it. Earlier folds are never smaller than
    # later ones, so the pool is filled before anything is drawn from it.
    batch_size = total // fold_num
    spare_texts = []
    spare_labels = []
    start = 0
    fold_data = []
    for fold in range(fold_num):
        picked = all_index[fold]
        cur_texts = [all_texts[i] for i in picked]
        cur_labels = [all_labels[i] for i in picked]
        shortfall = batch_size - len(picked)

        if shortfall < 0:
            spare_texts.extend(cur_texts[batch_size:])
            spare_labels.extend(cur_labels[batch_size:])
            fold_texts = cur_texts[:batch_size]
            fold_labels = cur_labels[:batch_size]
        elif shortfall > 0:
            end = start + shortfall
            fold_texts = cur_texts + spare_texts[start:end]
            fold_labels = cur_labels + spare_labels[start:end]
            start = end
        else:
            fold_texts = cur_texts
            fold_labels = cur_labels

        assert batch_size == len(fold_labels)

        # shuffle: break up the label-by-label ordering inside the fold
        order = list(range(batch_size))
        np.random.shuffle(order)
        fold_data.append({
            'label': [fold_labels[i] for i in order],
            'text': [fold_texts[i] for i in order],
        })

    logging.info("Fold lens %s", str([len(data['label']) for data in fold_data]))

    return fold_data
# Build the folds once, using the module-level fold count instead of a repeated literal.
fold_data = all_data2fold(fold_num)
2020-07-18 23:30:04,912 INFO: Fold lens [1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000]
# build train data for word2vec
# Folds 0 .. fold_id - 1 make up the training corpus; the fold at index fold_id is left out.
fold_id = 9
train_texts = []
for i in range(fold_id):
    train_texts.extend(fold_data[i]['text'])
# Hand the count to logging as an argument so formatting is deferred to the logging module.
logging.info('Total %d docs.', len(train_texts))
2020-07-18 23:30:04,929 INFO: Total 9000 docs.
# Log the start of training before gensim is imported and begins logging itself.
logging.info('Start training...')
from gensim.models.word2vec import Word2Vec

num_workers = 8  # Number of threads to run in parallel
num_features = 100  # Word vector dimensionality

# Split every document on whitespace so Word2Vec receives one token list per document.
train_texts = [doc.split() for doc in train_texts]
model = Word2Vec(train_texts, size=num_features, workers=num_workers)
# NOTE(review): init_sims(replace=True) presumably replaces the raw vectors with
# their L2-normalised form - confirm against the installed gensim version.
model.init_sims(replace=True)

# save model
model.save("./word2vec.bin")
结果得:
2020-07-18 23:30:04,938 INFO: Start training...
2020-07-18 23:30:05,545 INFO: collecting all words and their counts
2020-07-18 23:30:05,546 INFO: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-07-18 23:30:06,231 INFO: collected 5295 word types from a corpus of 8191447 raw words and 9000 sentences
2020-07-18 23:30:06,231 INFO: Loading a fresh vocabulary
2020-07-18 23:30:06,305 INFO: effective_min_count=5 retains 4335 unique words (81% of original 5295, drops 960)
2020-07-18 23:30:06,305 INFO: effective_min_count=5 leaves 8189498 word corpus (99% of original 8191447, drops 1949)
2020-07-18 23:30:06,314 INFO: deleting the raw counts dictionary of 5295 items
2020-07-18 23:30:06,316 INFO: sample=0.001 downsamples 61 most-common words
2020-07-18 23:30:06,316 INFO: downsampling leaves estimated 7070438 word corpus (86.3% of prior 8189498)
2020-07-18 23:30:06,324 INFO: estimated required memory for 4335 words and 100 dimensions: 5635500 bytes
2020-07-18 23:30:06,325 INFO: resetting layer weights
2020-07-18 23:30:06,356 INFO: training model with 8 workers on 4335 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
2020-07-18 23:30:07,360 INFO: EPOCH 1 - PROGRESS: at 63.20% examples, 4434412 words/s, in_qsize 15, out_qsize 0
2020-07-18 23:30:07,959 INFO: worker thread finished; awaiting finish of 7 more threads
2020-07-18 23:30:07,960 INFO: worker thread finished; awaiting finish of 6 more threads
2020-07-18 23:30:07,963 INFO: worker thread finished; awaiting finish of 5 more threads
2020-07-18 23:30:07,963 INFO: worker thread finished; awaiting finish of 4 more threads
2020-07-18 23:30:07,964 INFO: worker thread finished; awaiting finish of 3 more threads
2020-07-18 23:30:07,965 INFO: worker thread finished; awaiting finish of 2 more threads
2020-07-18 23:30:07,968 INFO: worker thread finished; awaiting finish of 1 more threads
2020-07-18 23:30:07,969 INFO: worker thread finished; awaiting finish of 0 more threads
2020-07-18 23:30:07,970 INFO: EPOCH - 1 : training on 8191447 raw words (7021120 effective words) took 1.6s, 4357567 effective words/s
2020-07-18 23:30:08,979 INFO: EPOCH 2 - PROGRESS: at 59.39% examples, 4143643 words/s, in_qsize 15, out_qsize 0
2020-07-18 23:30:09,661 INFO: worker thread finished; awaiting finish of 7 more threads
2020-07-18 23:30:09,663 INFO: worker thread finished; awaiting finish of 6 more threads
2020-07-18 23:30:09,663 INFO: worker thread finished; awaiting finish of 5 more threads
2020-07-18 23:30:09,664 INFO: worker thread finished; awaiting finish of 4 more threads
2020-07-18 23:30:09,667 INFO: worker thread finished; awaiting finish of 3 more threads
2020-07-18 23:30:09,667 INFO: worker thread finished; awaiting finish of 2 more threads
2020-07-18 23:30:09,670 INFO: worker thread finished; awaiting finish of 1 more threads
2020-07-18 23:30:09,672 INFO: worker thread finished; awaiting finish of 0 more threads
2020-07-18 23:30:09,672 INFO: EPOCH - 2 : training on 8191447 raw words (7021506 effective words) took 1.7s, 4144060 effective words/s
2020-07-18 23:30:10,681 INFO: EPOCH 3 - PROGRESS: at 59.52% examples, 4154672 words/s, in_qsize 15, out_qsize 0
2020-07-18 23:30:11,356 INFO: worker thread finished; awaiting finish of 7 more threads
2020-07-18 23:30:11,356 INFO: worker thread finished; awaiting finish of 6 more threads
2020-07-18 23:30:11,358 INFO: worker thread finished; awaiting finish of 5 more threads
2020-07-18 23:30:11,359 INFO: worker thread finished; awaiting finish of 4 more threads
2020-07-18 23:30:11,362 INFO: worker thread finished; awaiting finish of 3 more threads
2020-07-18 23:30:11,362 INFO: worker thread finished; awaiting finish of 2 more threads
2020-07-18 23:30:11,365 INFO: worker thread finished; awaiting finish of 1 more threads
2020-07-18 23:30:11,366 INFO: worker thread finished; awaiting finish of 0 more threads
2020-07-18 23:30:11,367 INFO: EPOCH - 3 : training on 8191447 raw words (7020706 effective words) took 1.7s, 4163417 effective words/s
2020-07-18 23:30:12,378 INFO: EPOCH 4 - PROGRESS: at 58.80% examples, 4102329 words/s, in_qsize 15, out_qsize 0
2020-07-18 23:30:13,072 INFO: worker thread finished; awaiting finish of 7 more threads
2020-07-18 23:30:13,078 INFO: worker thread finished; awaiting finish of 6 more threads
2020-07-18 23:30:13,079 INFO: worker thread finished; awaiting finish of 5 more threads
2020-07-18 23:30:13,079 INFO: worker thread finished; awaiting finish of 4 more threads
2020-07-18 23:30:13,080 INFO: worker thread finished; awaiting finish of 3 more threads
2020-07-18 23:30:13,080 INFO: worker thread finished; awaiting finish of 2 more threads
2020-07-18 23:30:13,081 INFO: worker thread finished; awaiting finish of 1 more threads
2020-07-18 23:30:13,082 INFO: worker thread finished; awaiting finish of 0 more threads
2020-07-18 23:30:13,083 INFO: EPOCH - 4 : training on 8191447 raw words (7021984 effective words) took 1.7s, 4117851 effective words/s
2020-07-18 23:30:14,091 INFO: EPOCH 5 - PROGRESS: at 58.99% examples, 4115963 words/s, in_qsize 16, out_qsize 0
2020-07-18 23:30:14,769 INFO: worker thread finished; awaiting finish of 7 more threads
2020-07-18 23:30:14,770 INFO: worker thread finished; awaiting finish of 6 more threads
2020-07-18 23:30:14,770 INFO: worker thread finished; awaiting finish of 5 more threads
2020-07-18 23:30:14,771 INFO: worker thread finished; awaiting finish of 4 more threads
2020-07-18 23:30:14,773 INFO: worker thread finished; awaiting finish of 3 more threads
2020-07-18 23:30:14,776 INFO: worker thread finished; awaiting finish of 2 more threads
2020-07-18 23:30:14,777 INFO: worker thread finished; awaiting finish of 1 more threads
2020-07-18 23:30:14,779 INFO: worker thread finished; awaiting finish of 0 more threads
2020-07-18 23:30:14,779 INFO: EPOCH - 5 : training on 8191447 raw words (7021532 effective words) took 1.7s, 4156171 effective words/s
2020-07-18 23:30:14,780 INFO: training on a 40957235 raw words (35106848 effective words) took 8.4s, 4167675 effective words/s
2020-07-18 23:30:14,780 INFO: precomputing L2-norms of word weight vectors
2020-07-18 23:30:14,782 INFO: saving Word2Vec object under ./word2vec.bin, separately None
2020-07-18 23:30:14,783 INFO: not storing attribute vectors_norm
2020-07-18 23:30:14,783 INFO: not storing attribute cum_table
2020-07-18 23:30:14,820 INFO: saved ./word2vec.bin
# load model: read the saved Word2Vec object back from disk
model = Word2Vec.load("./word2vec.bin")

# convert format: write the word vectors in the non-binary word2vec format
model.wv.save_word2vec_format(fname='./word2vec.txt', binary=False)
结果得:
2020-07-18 23:30:14,825 INFO: loading Word2Vec object from ./word2vec.bin
2020-07-18 23:30:14,958 INFO: loading wv recursively from ./word2vec.bin.wv.* with mmap=None
2020-07-18 23:30:14,958 INFO: setting ignored attribute vectors_norm to None
2020-07-18 23:30:14,959 INFO: loading vocabulary recursively from ./word2vec.bin.vocabulary.* with mmap=None
2020-07-18 23:30:14,959 INFO: loading trainables recursively from ./word2vec.bin.trainables.* with mmap=None
2020-07-18 23:30:14,959 INFO: setting ignored attribute cum_table to None
2020-07-18 23:30:14,959 INFO: loaded ./word2vec.bin
2020-07-18 23:30:14,965 INFO: storing 4335x100 projection weights into ./word2vec.txt