# Dataset: https://github.com/alivelxj/m5c-DFRESG/tree/main/m5c/data
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
def load_data(file):
    """Parse a two-line-per-record FASTA file into sequences and labels.

    Records alternate header/sequence lines. A header of ``>+sample`` maps
    to label 1 (positive) and ``>-sample`` to label 0 (negative).

    Args:
        file: path to the FASTA file.

    Returns:
        tuple[list[str], list[int]]: sequences and their aligned labels.
    """
    seq_sum = []
    label_sum = []
    # Recognized headers and their class labels.
    header_to_label = {'>+sample': 1, '>-sample': 0}
    # Context manager guarantees the file is closed (the original leaked it).
    with open(file) as fh:
        lines = fh.readlines()
    # Walk (header, sequence) pairs.
    for i in range(0, len(lines) - 1, 2):
        header = lines[i].strip('\n')
        if header in header_to_label:
            # Append sequence and label together so the two lists can never
            # desynchronize (the original appended sequences unconditionally
            # but labels only for recognized headers).
            seq_sum.append(lines[i + 1].strip('\n'))
            label_sum.append(header_to_label[header])
    return seq_sum, label_sum
# Load the Arabidopsis thaliana training split and the independent test split.
X_A_train,y_A_train = load_data(r'm5c-DFRESG-main\m5c\data\Athaliana\Arabidopsis_train.fasta')
X_A_ind, y_A_ind =load_data(r'm5c-DFRESG-main\m5c\data\Athaliana\Arabidopsis_indep.fasta')
# Reshape the independent-set labels into a column vector (n_samples, 1) so
# they can later be column-concatenated with the feature matrix.
y_A_ind=np.array(y_A_ind)
y_A_ind=y_A_ind.reshape(-1,1)
class word2vec(object):
    """Extract fixed-length sequence features with gensim Word2Vec.

    Each sequence is tokenized into overlapping k-mers; a Word2Vec model is
    trained on the tokenized corpus, and every test sequence is represented
    by the mean of its k-mer vectors.

    Args:
        seq: sequences used to build the training corpus.
        test_seq: sequences to embed with the trained model.
        k: k-mer length (1, 2, 3, ...).
        num_feature: dimensionality of the learned word vectors.
        min_count: words occurring fewer than this many times are dropped
            (gensim's default is 5).
        num_workers: number of worker threads used for training.
        context: maximum distance between the current and the predicted word
            within a sentence (the Word2Vec ``window``).
        seed: seed for the random number generator (word-vector init).
        epoch: number of training iterations over the corpus.
        model_name: file name of the trained model on disk.
    """

    def __init__(self, seq, test_seq, k, num_feature, min_count, num_workers,
                 context, seed, epoch, model_name):
        self.seq = seq
        self.test_seq = test_seq
        self.k = k
        self.num_feature = num_feature
        self.min_count = min_count
        self.num_workers = num_workers  # fixed 'num_works' typo (internal)
        self.context = context
        self.seed = seed
        self.epoch = epoch
        self.model_name = model_name

    def seperate_word(self, single_seq):
        """Tokenize one sequence into a space-separated string of k-mers."""
        kmers = [single_seq[i:i + self.k]
                 for i in range(len(single_seq) - self.k + 1)]
        return " ".join(kmers)

    def process_all_seq(self):
        """Tokenize every corpus sequence; returns a list of k-mer sentences."""
        return [self.seperate_word(s.strip('\n')) for s in self.seq]

    def word_model(self):
        """Train a Word2Vec model on the in-memory corpus and save it.

        BUG FIX: the original read a corpus file named 'train' that was never
        written (the writer was commented out), so training always raised
        FileNotFoundError. We now train directly on the tokenized in-memory
        corpus. The deprecated ``init_sims()`` call (a no-op slated for
        removal in gensim 4) is also dropped.
        """
        sentences = [line.split(" ") for line in self.process_all_seq()]
        model = Word2Vec(sentences, workers=self.num_workers,
                         vector_size=self.num_feature,
                         min_count=self.min_count, window=self.context,
                         seed=self.seed, epochs=self.epoch)
        model.save("train_model")
        return model

    def getseq_split(self):
        """Tokenize the test sequences; returns a list of k-mer token lists."""
        return [self.seperate_word(s).split(" ") for s in self.test_seq]

    def get_feature_matrix(self):
        """Return one mean k-mer vector per test sequence.

        BUG FIX: the saved model is loaded from disk once, instead of being
        re-loaded inside the loop for every single sequence.
        """
        model = Word2Vec.load(self.model_name)
        return [np.mean(model.wv[tokens], axis=0)
                for tokens in self.getseq_split()]

    def concat_feature_label(self, label_file):
        """Column-concatenate the feature matrix with its labels.

        Args:
            label_file: label array of shape (n_samples, 1).

        Returns:
            np.ndarray of shape (n_samples, num_feature + 1).
        """
        feature = self.get_feature_matrix()
        return np.concatenate([feature, label_file], axis=1)
if __name__=='__main__':
    # Train/embed with: k=2 mers, 100-dim vectors, min_count=1, 10 workers,
    # window=20, seed=42, 10 epochs; model persisted as 'train_model'.
    test =word2vec(X_A_train,X_A_ind, 2, 100, 1, 10, 20, 42, 10, 'train_model')
    # Embed the independent set, append its labels as the last column,
    # and dump the result to CSV.
    a=test.concat_feature_label(y_A_ind)
    pd.DataFrame(a).to_csv("feature.csv")
# Summary: this ultimately produces an array of shape (seq_length, num_feature).
# Limitation: the corpus is fairly small, so the embeddings may perform poorly.