Using Pre-trained Word Vectors in a Keras Model

0. Data Preprocessing

0.1 Adding New Data

import pandas as pd

# Keep only the new sequences whose APIs all appear in the known API list.
api_list = pd.read_pickle(r'api_list.pkl')
api_list = [i.lower() for i in api_list]
old_data_set = set(api_list)

new_data = []
with open('new_data.txt', 'r') as file_handle:
    for line in file_handle:
        new_line = line.strip().split(' ')
        new_data_set = set(new_line)
        if len(new_data_set - old_data_set) == 0:  # no unknown APIs in this line
            new_data.append(new_line)

0.2 Converting the Original Data

import zipfile
import pandas as pd
import numpy as np
from functools import partial

# Extract the contents of the csv inside the zip archive
def get_data(path):
    with zipfile.ZipFile(path, 'r') as z:  # path to the training or test archive
        if len(z.filelist) == 1:
            filename = z.filelist[0].filename
            if filename.endswith('.csv'):
                f = z.open(filename)
                data = pd.read_csv(f)
                return data

# Lowercase a sequence; return None if it contains any API outside api_list,
# so such sequences can be filtered out later.
def keep_elements(api_list, seq):
    api_set = set(api_list)
    seq_set = set(seq)

    if len(seq_set - api_set) != 0:
        return None
    return [element.lower() for element in seq]
    
# A single file contains multiple API calls, so merge all APIs of a file into one sequence
def get_sequence(df, period_idx):
    seq_list = []  # list of lists
    # APIs of the first through the second-to-last file
    for _id, begin in enumerate(period_idx[:-1]):
        seq_list.append(df.iloc[begin: period_idx[_id+1]]['api'].values)

    # APIs of the last file
    seq_list.append(df.iloc[period_idx[-1]:]['api'].values)
    return seq_list
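
For reference, the same grouping can be written more compactly with pandas groupby; this is a minimal sketch, assuming df is the DataFrame returned by get_data and rows within a file are already in call order:

# Equivalent sketch using groupby: collect each file's APIs into one list.
# sort=False keeps the files in their original order.
def get_sequence_groupby(df):
    return df.groupby('file_id', sort=False)['api'].apply(list).tolist()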

unique_api = pd.read_pickle('api_list.pkl')
train = get_data('security_train.zip')
test = get_data('security_test.zip')
keep_all_elements = partial(keep_elements, unique_api)

#api2index = {item:(i+1) for i,item in enumerate(unique_api)}  # reserve 0 for padding
#index2api = {(i+1):item for i,item in enumerate(unique_api)}

#train['api_idx'] = train['api'].map(api2index)
train_period_idx = train.file_id.drop_duplicates(keep='first').index.values  # train_period_idx holds the starting index of each file
train_df = train[['file_id','label']].drop_duplicates(keep='first')
train_df['seq'] = get_sequence(train, train_period_idx)
train_df['seq'] = train_df['seq'].apply(keep_all_elements)
train_seq = train_df[train_df['seq'].notnull()]['seq'].values
train_seq = train_seq.tolist()

#test['api_idx'] = test['api'].map(api2index)
test_period_idx = test.file_id.drop_duplicates(keep='first').index.values  # test_period_idx holds the starting index of each file
test_df = test[['file_id']].drop_duplicates(keep='first')
test_df['seq'] = get_sequence(test, test_period_idx)
test_df['seq'] = test_df['seq'].apply(keep_all_elements)
test_seq = test_df[test_df['seq'].notnull()]['seq'].values
test_seq = test_seq.tolist()

1. Training Word Vectors

1.1 Word2Vec Vectors

import gensim
import pickle
import numpy as np

vector_size = 100  # dimensionality of the word vectors
sentences = train_seq + test_seq + new_data
# gensim < 4.0 API; for gensim >= 4.0 use vector_size=, epochs=, and wv.index_to_key
model = gensim.models.Word2Vec(sentences=sentences, size=vector_size, window=5,
                               min_count=1, workers=8, sg=0, iter=5)  # window=5 worked better
wv = model.wv
vocab_list = wv.index2word

word_idx_dict = {}
for idx, word in enumerate(vocab_list):
    word_idx_dict[word] = idx + 1  # shift by 1: index 0 is reserved for padding

vectors_arr = wv.vectors
vectors_arr = np.concatenate((np.zeros(vector_size)[np.newaxis, :], vectors_arr), axis=0)  # row 0 is the padding vector

with open('./word_seg_vectors_arr.pkl', 'wb') as f_vectors:
    pickle.dump(vectors_arr, f_vectors)


import json
with open(r'word2idx_vec.json', 'w') as f:
    json.dump(word_idx_dict, f)
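
As a quick sanity check, the saved index dict and matrix should stay aligned with the gensim vocabulary; a minimal sketch using the variables defined above:

# Row word_idx_dict[w] of vectors_arr must equal wv[w] for every word,
# because of the +1 shift introduced for the padding row.
for w in vocab_list[:5]:
    assert (vectors_arr[word_idx_dict[w]] == wv[w]).all()
print('index dict and embedding matrix are aligned')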

1.2 Training GloVe Vectors

  1. git clone http://github.com/stanfordnlp/glove
  2. Generate the corpus text files:
# Identity pass kept from the original post; guard against the None
# entries produced by keep_elements in step 0.2.
def keep_all_elements(seq):
    if seq is None:
        return None
    return list(seq)

train_df['seq'] = train_df['seq'].apply(keep_all_elements)
train_df = train_df[train_df['seq'].notnull()]
train_df['seq'] = train_df['seq'].apply(lambda x: ' '.join(x))
train_array = train_df['seq'].tolist()
with open('train_seq.txt', 'w') as f:
    for item in train_array:
        f.write("%s\n" % item)

test_df['seq'] = test_df['seq'].apply(keep_all_elements)
test_df = test_df[test_df['seq'].notnull()]
test_df['seq'] = test_df['seq'].apply(lambda x: ' '.join(x))
test_array = test_df['seq'].tolist()
with open('test_seq.txt', 'w') as f:
    for item in test_array:
        f.write("%s\n" % item)
  3. Adjust the parameters in demo.sh (corpus path, vector size, window size, etc.), then run sh demo.sh to produce the GloVe vectors.

1.3 Merging Word2Vec and GloVe Vectors

  This step is optional; perform it only if you want to concatenate the two sets of vectors.

import pandas as pd
import numpy as np

# error_bad_lines is the pandas < 1.3 spelling; newer pandas uses on_bad_lines='skip'.
# skiprows=[0] skips the header line of the .vec file (vocab size and dimensionality).
w2v = pd.read_csv(r'E:\work\competition\security\word_embedding\word2vec.vec', sep=' ',
                  header=None, index_col=0, error_bad_lines=False, skiprows=[0])
w2v.reset_index(inplace=True)
w2v.columns = w2v.columns - 1           # shift so the vector columns start at 0
w2v.rename(columns={-1: 'API'}, inplace=True)


glove = pd.read_csv(r'E:\work\competition\security\word_embedding\glove.txt', sep=' ',
                    header=None, index_col=0, error_bad_lines=False)
glove.reset_index(inplace=True)
glove = glove.iloc[:-1, :]              # drop the last row (e.g. the <unk> entry GloVe may append)
glove.columns = glove.columns + 100     # shift so these columns don't collide with w2v's
glove.rename(columns={100: 'API'}, inplace=True)

total_embeddings = pd.merge(w2v, glove, on='API')
total_embeddings_array = total_embeddings.iloc[:, 1:].values  # drop the first column (the API name)
total_embeddings_array = np.concatenate((np.zeros(150)[np.newaxis, :], total_embeddings_array), axis=0)  # prepend the all-zero padding row; 150 must equal the combined w2v + GloVe dimensionality

pd.to_pickle(total_embeddings_array, r'E:\work\competition\security\word_embedding\word2vec\windows5\word_seg_vectors_arr_add_glove.pkl')
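
A quick shape check helps catch dimension mismatches before training; a minimal sketch (the 150 here is an assumption matching the zero row above):

# The merged matrix should have one row per API plus the padding row,
# and its width must match the zero padding row (150 in this example).
print(total_embeddings_array.shape)
assert total_embeddings_array.shape[1] == 150
assert total_embeddings_array.shape[0] == len(total_embeddings) + 1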

1.4 Converting the API Sequences to Integer Indices with Keras

from keras.preprocessing.sequence import pad_sequences
import json

def get_dict_data(file_path):
    with open(file_path, 'r') as f:
        dict_data = json.load(f)
        return dict_data

# Load the word-to-index dict once instead of once per row.
dict_data = get_dict_data(r'./new_data/word2idx_vec.json')  # adjust this path as needed

# This assumes train_df['seq'] still holds lists of API names (the state after step 0.2).
def keep_all_elements_word2vec(seq):
    return [dict_data[i] for i in seq]

train_df['seq'] = train_df['seq'].apply(keep_all_elements_word2vec)
train_seq = pad_sequences(train_df['seq'], maxlen=50000, padding='post', truncating='post')

test_df['seq'] = test_df['seq'].apply(keep_all_elements_word2vec)
test_seq = pad_sequences(test_df['seq'], maxlen=50000, padding='post', truncating='post')

pd.to_pickle(train_seq, "train_word2vec_w10_seq.pkl")
pd.to_pickle(test_seq, "test_word2vec_w10_seq.pkl")
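
The padded arrays are large (one row per file, 50000 columns each), so it is worth confirming their shape and memory footprint before saving; a minimal sketch:

# Each row is one file's API sequence, padded/truncated to 50000 indices.
print(train_seq.shape, train_seq.dtype)
print('approx. memory: %.1f MB' % (train_seq.nbytes / 1024**2))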

2. Using the Pre-trained Vectors

import os
import pickle
from keras.layers import Embedding

embedding_matrix_path = 'word_seg_vectors_arr.pkl'
embedding_matrix = pickle.load(open(os.path.join(data_folder_path, embedding_matrix_path), 'rb'))

# max_cnt = vocab size + 1 (padding row), embed_size = vector dimensionality
_embed = Embedding(max_cnt, embed_size, input_length=max_len, mask_zero=mask_zero,
                   weights=[embedding_matrix], trainable=False)(_input)
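
For context, here is a minimal end-to-end sketch of how the frozen embedding layer might sit inside a simple classifier; the pooling layer, layer sizes, and the 8-class output are illustrative assumptions, not the original model:

# Minimal sketch, assuming embedding_matrix was built as in section 1.1.
from keras.models import Model
from keras.layers import Input, Embedding, GlobalMaxPooling1D, Dense

max_len = 50000                                # must match the pad_sequences maxlen
max_cnt, embed_size = embedding_matrix.shape   # vocab size (incl. padding row) and dimensionality

_input = Input(shape=(max_len,))
_embed = Embedding(max_cnt, embed_size, input_length=max_len,
                   weights=[embedding_matrix], trainable=False)(_input)
_pool = GlobalMaxPooling1D()(_embed)
_output = Dense(8, activation='softmax')(_pool)  # hypothetical 8 malware classes

model = Model(inputs=_input, outputs=_output)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()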
