import pandas as pd

# Read extra API sequences (one space-separated, lower-cased sequence per line) and
# keep only those whose APIs all appear in the known API list.
api_list = pd.read_pickle(r'api_list.pkl')
api_list = [i.lower() for i in api_list]
old_data_set = set(api_list)
new_data = []
with open('new_data.txt', 'r') as file_handle:
    for line in file_handle:
        new_line = line.strip().split(' ')
        new_data_set = set(new_line)
        # Skip lines that contain any API not present in the known API list.
        if len(new_data_set - old_data_set) == 0:
            new_data.append(new_line)
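For clarity, a minimal, self-contained illustration of the filtering rule above (toy API names, not taken from api_list.pkl): a line from new_data.txt is kept only if every space-separated token is a known API.

known = {'ntopenfile', 'ntclose', 'ldrloaddll'}   # toy stand-in for old_data_set
lines = ['ntopenfile ntclose ntopenfile', 'ntopenfile unknownapi']
kept = [l.split(' ') for l in lines if len(set(l.split(' ')) - known) == 0]
print(kept)  # [['ntopenfile', 'ntclose', 'ntopenfile']] -- the second line holds an unknown API and is dropped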
import zipfile
import pandas as pd
import numpy as np
from functools import partial

# Extract the contents of the csv file inside the zip archive.
def get_data(path):
    with zipfile.ZipFile(path, 'r') as z:  # path of the training or test archive
        if len(z.filelist) == 1:
            filename = z.filelist[0].filename
            if filename.endswith('.csv'):
                f = z.open(filename)
                data = pd.read_csv(f)
                return data
# Return the lower-cased sequence if every API in it is known; otherwise return None.
def keep_elements(api_list, seq):
    api_set = set(api_list)
    seq_set = set(seq)
    if len(seq_set - api_set) != 0:
        return None
    return [element.lower() for element in seq]
# The same file contains multiple API calls; merge them into one sequence per file.
def get_sequence(df, period_idx):
    seq_list = []  # list of arrays, one per file
    # APIs of the first through the second-to-last file
    for _id, begin in enumerate(period_idx[:-1]):
        seq_list.append(df.iloc[begin: period_idx[_id + 1]]['api'].values)
    # APIs of the last file
    seq_list.append(df.iloc[period_idx[-1]:]['api'].values)
    return seq_list
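A minimal sketch of how period_idx and get_sequence cooperate, on toy data (the API names below are illustrative, not taken from the competition files); it assumes each file's rows are contiguous, so period_idx marks the first row of every file.

import pandas as pd
toy = pd.DataFrame({
    'file_id': [1, 1, 1, 2, 2, 3],
    'api': ['LdrLoadDll', 'NtOpenFile', 'NtClose', 'LdrLoadDll', 'NtClose', 'NtOpenFile'],
})
toy_period_idx = toy.file_id.drop_duplicates(keep='first').index.values  # array([0, 3, 5])
print(get_sequence(toy, toy_period_idx))
# [array(['LdrLoadDll', 'NtOpenFile', 'NtClose'], dtype=object),
#  array(['LdrLoadDll', 'NtClose'], dtype=object),
#  array(['NtOpenFile'], dtype=object)]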
unique_api = pd.read_pickle('api_list.pkl')
train = get_data('security_train.zip')
test = get_data('security_test.zip')
keep_all_elements = partial(keep_elements, unique_api)
#api2index = {item: (i + 1) for i, item in enumerate(unique_api)}  # reserve 0 for padding
#index2api = {(i + 1): item for i, item in enumerate(unique_api)}
#train['api_idx'] = train['api'].map(api2index)
train_period_idx = train.file_id.drop_duplicates(keep='first').index.values  # train_period_idx is the index of the first row of each file
train_df = train[['file_id', 'label']].drop_duplicates(keep='first')
train_df['seq'] = get_sequence(train, train_period_idx)
train_df['seq'] = train_df['seq'].apply(keep_all_elements)
train_seq = train_df[train_df['seq'].notnull()]['seq'].values
train_seq = train_seq.tolist()
#test['api_idx'] = test['api'].map(api2index)
test_period_idx = test.file_id.drop_duplicates(keep='first').index.values  # test_period_idx is the index of the first row of each file
test_df = test[['file_id']].drop_duplicates(keep='first')
test_df['seq'] = get_sequence(test, test_period_idx)
test_df['seq'] = test_df['seq'].apply(keep_all_elements)
test_seq = test_df[test_df['seq'].notnull()]['seq'].values
test_seq = test_seq.tolist()
import gensim
import pickle

vector_size = 100  # dimension of the word vectors
sentences = train_seq + test_seq + new_data
# gensim 3.x API: in gensim >= 4.0 use vector_size=/epochs= and wv.index_to_key instead.
model = gensim.models.Word2Vec(sentences=sentences, size=vector_size, window=5,
                               min_count=1, workers=8, sg=0, iter=5)  # window=5 works better
wv = model.wv
vocab_list = wv.index2word
word_idx_dict = {}
for idx, word in enumerate(vocab_list):
    word_idx_dict[word] = idx + 1
vectors_arr = wv.vectors
vectors_arr = np.concatenate((np.zeros(vector_size)[np.newaxis, :], vectors_arr), axis=0)  # row 0 is the all-zero padding vector
with open('./word_seg_vectors_arr.pkl', 'wb') as f_vectors:
    pickle.dump(vectors_arr, f_vectors)
import json
with open(r'word2idx_vec.json', 'w') as f:
    json.dump(word_idx_dict, f)
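For reference, a minimal sketch of reloading the two artifacts saved above in a later session (it assumes both files sit in the current directory):

import json
import pickle

with open('./word_seg_vectors_arr.pkl', 'rb') as f:
    vectors_arr = pickle.load(f)   # shape: (vocab_size + 1, 100); row 0 is the padding vector
with open('word2idx_vec.json', 'r') as f:
    word_idx_dict = json.load(f)   # API name -> row index in vectors_arr
assert vectors_arr.shape[0] == len(word_idx_dict) + 1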
# Replaces the earlier partial: simply copies the sequence into a plain list
# (lower-casing and filtering were already done above).
def keep_all_elements(seq):
    return list(seq)

# Drop files whose sequence contained an unknown API (keep_elements returned None).
train_df = train_df[train_df['seq'].notnull()].copy()
train_df['seq'] = train_df['seq'].apply(keep_all_elements)
train_df['seq'] = train_df['seq'].apply(lambda x: ' '.join(x))
train_array = train_df['seq'].tolist()
with open('train_seq.txt', 'w') as f:
    for item in train_array:
        f.write("%s\n" % item)

test_df = test_df[test_df['seq'].notnull()].copy()
test_df['seq'] = test_df['seq'].apply(keep_all_elements)
test_df['seq'] = test_df['seq'].apply(lambda x: ' '.join(x))
test_array = test_df['seq'].tolist()
with open('test_seq.txt', 'w') as f:
    for item in test_array:
        f.write("%s\n" % item)
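train_seq.txt and test_seq.txt hold one space-separated API sequence per line and can be fed to external embedding tools (the word2vec.vec and glove.txt vectors loaded in the next step are assumed to be trained on them). As an illustration, a sketch of streaming the files back with gensim's LineSentence instead of keeping the corpus in memory:

from gensim.models.word2vec import Word2Vec, LineSentence

corpus = list(LineSentence('train_seq.txt')) + list(LineSentence('test_seq.txt'))
# Same gensim 3.x hyperparameters as the in-memory run above.
disk_model = Word2Vec(sentences=corpus, size=100, window=5, min_count=1, workers=8, sg=0, iter=5)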
The following step is optional; perform it only as needed.
import pandas as pd
import numpy as np

# Load the externally trained word2vec vectors (the first column is the API name).
w2v = pd.read_csv(r'E:\work\competition\security\word_embedding\word2vec.vec', sep=' ', header=None,
                  index_col=0, error_bad_lines=False, skiprows=[0])
w2v.reset_index(inplace=True)
w2v.columns = w2v.columns - 1          # shift so the vector columns are numbered 0..99
w2v.rename(columns={-1: 'API'}, inplace=True)
# Load the GloVe vectors and shift their column labels so they do not collide with the word2vec columns.
glove = pd.read_csv(r'E:\work\competition\security\word_embedding\glove.txt', sep=' ', header=None,
                    index_col=0, error_bad_lines=False)
glove.reset_index(inplace=True)
glove = glove.iloc[:-1, :]             # drop the trailing row
glove.columns = glove.columns + 100
glove.rename(columns={100: 'API'}, inplace=True)
total_embeddings = pd.merge(w2v, glove, on='API')
total_embeddings_array = total_embeddings.iloc[:, 1:].values  # drop the first column (the API names)
total_embeddings_array = np.concatenate((np.zeros(150)[np.newaxis, :], total_embeddings_array), axis=0)  # prepend the all-zero padding row
pd.to_pickle(total_embeddings_array, r'E:\work\competition\security\word_embedding\word2vec\windows5\word_seg_vectors_arr_add_glove.pkl')
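A quick sanity check on the merged matrix (a sketch; it assumes the word2vec vectors are 100-dimensional and the GloVe vectors 50-dimensional, as implied by np.zeros(150) above):

assert total_embeddings_array.shape == (len(total_embeddings) + 1, 150)  # one row per merged API plus the padding row
assert not total_embeddings_array[0].any()  # row 0 is the all-zero padding vector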
from keras.preprocessing.sequence import pad_sequences
import json

def get_dict_data(file_path):
    with open(file_path, 'r') as f:
        dict_data = json.load(f)
    return dict_data

# Load the word-to-index dictionary once (adjust this path as needed).
word2idx = get_dict_data(r'./new_data/word2idx_vec.json')

def keep_all_elements_word2vec(seq):
    # Map each API name to its index in the word2vec vocabulary (0 is reserved for padding).
    return [word2idx[i] for i in seq]

train_df['seq'] = train_df['seq'].apply(keep_all_elements_word2vec)
train_seq = pad_sequences(train_df['seq'], maxlen=50000, padding='post', truncating='post')
test_df['seq'] = test_df['seq'].apply(keep_all_elements_word2vec)
test_seq = pad_sequences(test_df['seq'], maxlen=50000, padding='post', truncating='post')
pd.to_pickle(train_seq, "train_word2vec_w10_seq.pkl")
pd.to_pickle(test_seq, "test_word2vec_w10_seq.pkl")
import os
import pickle
from keras.layers import Embedding

embedding_matrix_path = 'word_seg_vectors_arr.pkl'
embedding_matrix = pickle.load(open(os.path.join(data_folder_path, embedding_matrix_path), 'rb'))  # data_folder_path: folder holding the pickled matrix
# max_cnt, embed_size, max_len, mask_zero and _input are defined elsewhere in the model code.
_embed = Embedding(max_cnt, embed_size, input_length=max_len, mask_zero=mask_zero, weights=[embedding_matrix], trainable=False)(_input)
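For context, a self-contained sketch of how this fragment could sit inside a model. It is not the author's full architecture: data_folder_path, max_len, mask_zero and the classifier head are assumptions here, while max_cnt and embed_size are read off the saved matrix itself.

import os
import pickle
from keras.layers import Input, Embedding, GlobalAveragePooling1D, Dense
from keras.models import Model

data_folder_path = '.'                          # assumed: folder holding word_seg_vectors_arr.pkl
embedding_matrix = pickle.load(open(os.path.join(data_folder_path, 'word_seg_vectors_arr.pkl'), 'rb'))
max_cnt, embed_size = embedding_matrix.shape    # vocab size + 1 (padding row), vector dimension
max_len = 50000                                 # same maxlen used by pad_sequences above
mask_zero = False                               # assumption; index 0 is the padding row

_input = Input(shape=(max_len,))
_embed = Embedding(max_cnt, embed_size, input_length=max_len, mask_zero=mask_zero,
                   weights=[embedding_matrix], trainable=False)(_input)
x = GlobalAveragePooling1D()(_embed)
output = Dense(8, activation='softmax')(x)      # 8 malware classes assumed from the label column
model = Model(inputs=_input, outputs=output)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')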