第20列:is_guest_login:离散类型:如果是guest登录则为1,否则为0
代码如下:
#coding:utf-8
import os
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
DATAPATH = os.path.normpath(os.path.dirname(os.path.abspath(__file__)) + "/data")
def parse_data(path=None):
    """Parse the KDD Cup 99 data file into feature vectors and labels.

    Each record is one comma-separated line; columns 9-20 (the twelve
    content-based features, num_failed_logins .. is_guest_login) form the
    feature vector.  A record is labeled 1 when its attack type is
    "rootkit", 0 otherwise.

    Args:
        path: Optional path to the data file.  Defaults to
            ``<DATAPATH>/kddcup.data.corrected``.

    Returns:
        ``[test_data, label_data]`` -- the list of float feature vectors
        and the parallel list of 0/1 labels.
    """
    if path is None:
        path = DATAPATH + "/kddcup.data.corrected"
    test_data = []
    label_data = []
    with open(path, "r") as f:
        for line in f:
            fields = line.strip().split(",")
            # Columns 9..20 are numeric; convert so sklearn receives
            # floats rather than raw strings.
            test_data.append([float(v) for v in fields[9:21]])
            # KDD labels carry a trailing "." (e.g. "rootkit.") and the
            # unstripped line also carried "\n" -- the original equality
            # test against 'rootkit' could never match.
            label_data.append(1 if fields[-1].rstrip(".") == "rootkit" else 0)
    return [test_data, label_data]
if __name__ == '__main__':
    # Load features/labels, then evaluate a 3-NN classifier with
    # 10-fold cross validation.  The original used Python 2 print
    # statements, which are a SyntaxError on Python 3.
    test_data, label_data = parse_data()
    neigh = KNeighborsClassifier(n_neighbors=3)
    scores = cross_val_score(neigh, test_data, label_data, cv=10)
    print(scores)  # per-fold accuracy
    print("precision:", np.mean(scores) * 100)
算法里的特征,是先把所有文件里的系统调用组成一个词集,然后特征就是用的每一个文件里的系统调用在词集里的分布情况组成的向量空间。
代码如下:
#coding:utf-8
import os
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
DATAPATH = os.path.normpath(os.path.dirname(os.path.abspath(__file__)) + "/data")
def get_words(root=None):
    """Collect the vocabulary of system-call tokens in the ADFA-LD attack set.

    Visits every ``*.txt`` trace under each attack-type subdirectory and
    gathers the distinct whitespace-separated tokens.

    Args:
        root: Optional directory to scan.  Defaults to
            ``<DATAPATH>/ADFA-LD/Attack_Data_Master``.

    Returns:
        Sorted list of distinct tokens.  Sorting makes the token-to-column
        mapping reproducible across runs -- the original ``list(set(...))``
        ordering varies with hash randomization, so feature columns were
        not stable.
    """
    if root is None:
        root = DATAPATH + "/ADFA-LD/Attack_Data_Master"
    words = set()
    for item in os.listdir(root):
        subdir = os.path.join(root, item)
        # Skip stray non-directory entries (os.listdir never returns
        # '.' or '..', so the original membership check was dead code).
        if not os.path.isdir(subdir):
            continue
        for name in os.listdir(subdir):
            if not name.endswith(".txt"):
                continue
            with open(os.path.join(subdir, name), "r") as f:
                for line in f:
                    words.update(line.split())
    return sorted(words)
def parse_data(words, root=None):
    """Build bag-of-words count vectors for ADFA-LD attack traces.

    Each ``*.txt`` trace file becomes a vector of token counts aligned
    with ``words``; tokens absent from ``words`` are ignored (matching
    the original behavior).  Traces under a directory whose name starts
    with ``Web_Shell`` are labeled 1, all others 0.

    Args:
        words: Vocabulary list; position i of each vector counts
            occurrences of ``words[i]``.
        root: Optional directory to scan.  Defaults to
            ``<DATAPATH>/ADFA-LD/Attack_Data_Master``.

    Returns:
        ``[test_data, label_data]`` -- count vectors and parallel labels.
    """
    if root is None:
        root = DATAPATH + "/ADFA-LD/Attack_Data_Master"
    # O(1) token -> column lookup; the original scanned the whole word
    # list for every distinct token (O(len(words)) per token).
    index = {w: i for i, w in enumerate(words)}
    test_data = []
    label_data = []
    # Sorted listings make sample order -- and therefore the CV folds
    # downstream -- reproducible across platforms/filesystems.
    for item in sorted(os.listdir(root)):
        subdir = os.path.join(root, item)
        if not os.path.isdir(subdir):
            continue
        label = 1 if item.startswith("Web_Shell") else 0
        for name in sorted(os.listdir(subdir)):
            if not name.endswith(".txt"):
                continue
            counts = [0] * len(words)
            with open(os.path.join(subdir, name), "r") as f:
                for line in f:
                    for token in line.split():
                        col = index.get(token)
                        if col is not None:
                            counts[col] += 1
            test_data.append(counts)
            label_data.append(label)
    return [test_data, label_data]
if __name__ == '__main__':
    # Build the vocabulary and count vectors, then score a 3-NN
    # classifier with 10-fold cross validation.  The original used
    # Python 2 print statements, a SyntaxError on Python 3.
    words = get_words()
    test_data, label_data = parse_data(words)
    neigh = KNeighborsClassifier(n_neighbors=3)
    score = cross_val_score(neigh, test_data, label_data, cv=10)
    print(score)  # per-fold accuracy
    print("precision:", np.mean(score) * 100)
10轮交叉验证效果如下: