在网上找了一些 KDD CUP99 数据预处理的代码,并结合自己写的部分做了整理,主要包括数值化和归一化两个步骤。
参考原博客:https://blog.csdn.net/qq_35733521/article/details/87889480?utm_medium=distribute.pc_relevant.none-task-blog-BlogCommendFromMachineLearnPai2-11.nonecase&depth_1-utm_source=distribute.pc_relevant.none-task-blog-BlogCommendFromMachineLearnPai2-11.nonecase
import csv
import pandas as pd
import numpy as np
def get_col_types():
    """Return the vocabularies used to turn symbolic columns into integers.

    Returns:
        A 4-tuple ``(protocol_type, service_type, flag_type, label_type)``.
        The first three are flat lists of strings; a value is encoded as its
        position in the corresponding list.  ``label_type`` is a list of
        label groups (each a list of label strings ending in ``'.'``); a
        label is encoded as the index of the group that contains it.
        Group 0 holds only ``'normal.'``.
        NOTE(review): groups 1-4 look like the usual Probe / DoS / U2R / R2L
        attack categories -- confirm before relying on that naming.
    """
    protocol_type = ['icmp', 'tcp', 'udp']
    service_type = [
        'IRC', 'X11', 'Z39_50', 'auth', 'bgp', 'courier', 'csnet_ns', 'ctf',
        'daytime', 'discard', 'domain', 'domain_u', 'echo', 'eco_i', 'ecr_i',
        'efs', 'exec', 'finger', 'ftp', 'ftp_data', 'gopher', 'hostnames',
        'http', 'http_443', 'icmp', 'imap4', 'iso_tsap', 'klogin', 'kshell',
        'ldap', 'link', 'login', 'mtp', 'name', 'netbios_dgm', 'netbios_ns',
        'netbios_ssn', 'netstat', 'nnsp', 'nntp', 'ntp_u', 'other', 'pm_dump',
        'pop_2', 'pop_3', 'printer', 'private', 'red_i', 'remote_job', 'rje',
        'shell', 'smtp', 'sql_net', 'ssh', 'sunrpc', 'supdup', 'systat',
        'telnet', 'tftp_u', 'tim_i', 'time', 'urh_i', 'urp_i', 'uucp',
        'uucp_path', 'vmnet', 'whois', 'http_8001', 'aol', 'http_2784',
        'harvest',
    ]
    flag_type = ['OTH', 'REJ', 'RSTO', 'RSTOS0', 'RSTR', 'S0', 'S1', 'S2',
                 'S3', 'SF', 'SH']
    # Order matters: the position of each group is the numeric class id.
    label_type = [
        ['normal.'],
        ['ipsweep.', 'mscan.', 'nmap.', 'portsweep.', 'saint.', 'satan.'],
        ['apache2.', 'back.', 'land.', 'mailbomb.', 'neptune.', 'pod.',
         'processtable.', 'smurf.', 'teardrop.', 'udpstorm.'],
        ['buffer_overflow.', 'httptunnel.', 'loadmodule.', 'perl.', 'ps.',
         'rootkit.', 'sqlattack.', 'xterm.'],
        ['ftp_write.', 'guess_passwd.', 'imap.', 'multihop.', 'named.',
         'phf.', 'sendmail.', 'snmpgetattack.', 'snmpguess.', 'spy.',
         'warezclient.', 'warezmaster.', 'worm.', 'xlock.', 'xsnoop.'],
    ]
    return protocol_type, service_type, flag_type, label_type
# Module-level list containing only the 'normal.' label.
# NOTE(review): nothing in the code visible here reads this name -- confirm whether it is still needed.
label_type_1=['normal.']
def _encode_file(source_file, handled_file, protocol_type, service_type,
                 flag_type, label_index):
    """Copy one CSV file, replacing its symbolic columns with integers.

    Columns 1, 2 and 3 are replaced by their position in ``protocol_type``,
    ``service_type`` and ``flag_type``.  The last column is replaced by
    ``label_index[label]`` when the label is known and left untouched
    otherwise.

    Raises:
        ValueError: if a protocol / service / flag value is not present in
            the corresponding list (raised by ``list.index``).
    """
    # Both handles live in one ``with`` so they are closed even when a row
    # fails to encode part-way through the file.
    with open(source_file, 'r') as data_source, \
            open(handled_file, 'w', newline='') as data_file:
        csv_writer = csv.writer(data_file)
        for row in csv.reader(data_source):
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing one).
                continue
            row[1] = protocol_type.index(row[1])
            row[2] = service_type.index(row[2])
            row[3] = flag_type.index(row[3])
            # Unknown labels are passed through unchanged.
            row[-1] = label_index.get(row[-1], row[-1])
            csv_writer.writerow(row)


def handle_data():
    """Numerically encode the train and test CSV files.

    Reads ``KDDTrain+.csv`` and ``KDDTest+.csv`` from the current directory
    and writes ``KDDTrain+number.csv`` and ``KDDTest+number.csv`` next to
    them, using the vocabularies returned by ``get_col_types()``.  Prints a
    completion message when both files have been written.

    Raises:
        ValueError: if a protocol / service / flag value in either file is
            missing from the vocabularies.
        OSError: if a source file cannot be opened or an output file cannot
            be created.
    """
    protocol_type, service_type, flag_type, label_type = get_col_types()

    # Build label -> group-id once instead of rescanning every group per row.
    # setdefault keeps the first group should a label ever appear in two.
    label_index = {}
    for group_id, labels in enumerate(label_type):
        for label in labels:
            label_index.setdefault(label, group_id)

    file_pairs = (
        ('KDDTrain+.csv', 'KDDTrain+number.csv'),
        ('KDDTest+.csv', 'KDDTest+number.csv'),
    )
    for source_file, handled_file in file_pairs:
        _encode_file(source_file, handled_file, protocol_type, service_type,
                     flag_type, label_index)
    print('pre process completed!')
接下来是数据归一化。有的博客提到不做归一化也不影响分类效果,大家可以自行尝试;这里给出在数值化之后继续进行归一化(min-max)的代码。
import csv
import numpy as np
# Min-max normalisation of the numerically encoded training set.
# Passing the file name lets numpy open and close the file itself;
# ndmin=2 keeps the array 2-D even when the file has a single row, so
# writerows() below always receives a sequence of rows.
my_matrix = np.loadtxt('KDDTrain+number.csv', delimiter=",", ndmin=2)

# Per-column extrema.
mmax = np.max(my_matrix, axis=0)
mmin = np.min(my_matrix, axis=0)

# A constant column has zero range; divide by 1 there so it maps to 0
# instead of producing 0/0 = NaN.  (Nudging the max by a tiny epsilon is
# absorbed by floating-point rounding when the column value is large.)
col_range = mmax - mmin
col_range[col_range == 0] = 1.0

# NOTE: every column is scaled, including the last (label) column.
res = (my_matrix - mmin) / col_range

with open("KDDTrain-last.csv", "w", newline='') as csvfile:
    writer = csv.writer(csvfile)
    # writerows writes all rows in one call.
    writer.writerows(res)
标签列不需要归一化:上面的代码会把最后一列(标签)一并归一化,可以在归一化之后手动去除或还原该列,也可以在代码中只对特征列做归一化。
至此,数据预处理完成,得到数值化和归一化两个 csv 文件。