KDD CUP99数据预处理

本文整理了网上找到的一些 KDD CUP99 数据预处理代码,并结合了自己编写的部分,主要包括数值化和归一化两个步骤。
参考原博客:https://blog.csdn.net/qq_35733521/article/details/87889480?utm_medium=distribute.pc_relevant.none-task-blog-BlogCommendFromMachineLearnPai2-11.nonecase&depth_1-utm_source=distribute.pc_relevant.none-task-blog-BlogCommendFromMachineLearnPai2-11.nonecase

import csv
import pandas as pd
import numpy as np
def get_col_types():
    """Return the vocabularies used to turn the symbolic columns into integers.

    Returns:
        A 4-tuple ``(protocol_type, service_type, flag_type, label_type)``.
        The first three are flat lists of strings; a value is encoded as its
        position in the list.  ``label_type`` is a list of five lists of
        label strings; a label is encoded as the index of the group that
        contains it.  Fresh lists are built on every call, so callers may
        mutate them freely.
    """
    protocol_type = ['icmp', 'tcp', 'udp']
    service_type = [
        'IRC', 'X11', 'Z39_50', 'auth', 'bgp', 'courier', 'csnet_ns', 'ctf',
        'daytime', 'discard', 'domain', 'domain_u', 'echo', 'eco_i', 'ecr_i',
        'efs', 'exec', 'finger', 'ftp', 'ftp_data', 'gopher', 'hostnames',
        'http', 'http_443', 'icmp', 'imap4', 'iso_tsap', 'klogin', 'kshell',
        'ldap', 'link', 'login', 'mtp', 'name', 'netbios_dgm', 'netbios_ns',
        'netbios_ssn', 'netstat', 'nnsp', 'nntp', 'ntp_u', 'other', 'pm_dump',
        'pop_2', 'pop_3', 'printer', 'private', 'red_i', 'remote_job', 'rje',
        'shell', 'smtp', 'sql_net', 'ssh', 'sunrpc', 'supdup', 'systat',
        'telnet', 'tftp_u', 'tim_i', 'time', 'urh_i', 'urp_i', 'uucp',
        'uucp_path', 'vmnet', 'whois', 'http_8001', 'aol', 'http_2784',
        'harvest',
    ]
    flag_type = ['OTH', 'REJ', 'RSTO', 'RSTOS0', 'RSTR', 'S0', 'S1', 'S2',
                 'S3', 'SF', 'SH']
    # Every label carries a trailing '.'; the group index becomes the class id.
    # NOTE(review): the groups look like the usual KDD'99 categories
    # (normal / probe / DoS / U2R / R2L) -- confirm before relying on that.
    label_type = [
        ['normal.'],
        ['ipsweep.', 'mscan.', 'nmap.', 'portsweep.', 'saint.', 'satan.'],
        ['apache2.', 'back.', 'land.', 'mailbomb.', 'neptune.', 'pod.',
         'processtable.', 'smurf.', 'teardrop.', 'udpstorm.'],
        ['buffer_overflow.', 'httptunnel.', 'loadmodule.', 'perl.', 'ps.',
         'rootkit.', 'sqlattack.', 'xterm.'],
        ['ftp_write.', 'guess_passwd.', 'imap.', 'multihop.', 'named.',
         'phf.', 'sendmail.', 'snmpgetattack.', 'snmpguess.', 'spy.',
         'warezclient.', 'warezmaster.', 'worm.', 'xlock.', 'xsnoop.'],
    ]
    return protocol_type, service_type, flag_type, label_type
# Module-level list holding only the 'normal.' label; nothing in the visible
# code reads it.
label_type_1 = ["normal."]
def _encode_file(source_file, handled_file, protocol_type, service_type,
                 flag_type, label_to_group):
    """Copy one CSV file, replacing its symbolic columns with integer codes.

    Columns 1, 2 and 3 are replaced by their position in ``protocol_type``,
    ``service_type`` and ``flag_type``.  The last column is replaced by
    ``label_to_group[label]`` when the label is known and left untouched
    otherwise.

    Raises:
        ValueError: if column 1, 2 or 3 holds a value missing from its list.
    """
    # Both files are managed by the same ``with`` so they are closed even
    # when a row fails to encode.
    with open(source_file, 'r', newline='') as data_source, \
            open(handled_file, 'w', newline='') as data_file:
        csv_writer = csv.writer(data_file)
        for row in csv.reader(data_source):
            if not row:
                # csv.reader yields [] for a blank line (e.g. a trailing
                # newline); there is nothing to encode, so skip it.
                continue
            row[1] = protocol_type.index(row[1])
            row[2] = service_type.index(row[2])
            row[3] = flag_type.index(row[3])
            row[-1] = label_to_group.get(row[-1], row[-1])
            csv_writer.writerow(row)


def handle_data():
    """Write integer-encoded copies of the train and test CSV files.

    Reads ``KDDTrain+.csv`` and ``KDDTest+.csv`` from the current directory
    and writes ``KDDTrain+number.csv`` and ``KDDTest+number.csv`` next to
    them, using the vocabularies returned by ``get_col_types()``.  Prints a
    completion message when both files have been written.

    Raises:
        ValueError: if a protocol, service or flag value is not in its list.
        OSError: if a source file cannot be read or a target cannot be written.
    """
    protocol_type, service_type, flag_type, label_type = get_col_types()

    # Flatten the label groups into one lookup: label -> index of its group.
    # setdefault keeps the first group should a label ever be listed twice.
    label_to_group = {}
    for group_index, labels in enumerate(label_type):
        for label in labels:
            label_to_group.setdefault(label, group_index)

    file_pairs = (
        ('KDDTrain+.csv', 'KDDTrain+number.csv'),
        ('KDDTest+.csv', 'KDDTest+number.csv'),
    )
    for source_file, handled_file in file_pairs:
        _encode_file(source_file, handled_file, protocol_type, service_type,
                     flag_type, label_to_group)
    print('pre process completed!')

接下来是数据归一化。有的博客提到不归一化也不影响数据分类,大家可以自行尝试对比。这里给出在数值化之后接着进行归一化的代码。

import csv
import numpy as np
# Min-max scale every column of the integer-encoded training file.
# NOTE: all columns are scaled, including the last one, so a label stored
# there is rescaled together with the features.
# ndmin=2 keeps a one-row file two-dimensional so the per-column maths works.
with open('KDDTrain+number.csv', 'r') as numeric_file:
    my_matrix = np.loadtxt(numeric_file, delimiter=",", skiprows=0, ndmin=2)

mmax = np.max(my_matrix, axis=0)
mmin = np.min(my_matrix, axis=0)
# A constant column has max == min; nudge its max so the division below is
# defined (such a column comes out as all zeros).
mmax[mmax == mmin] += 0.000001
# Computed once, after every constant column has been adjusted.
res = (my_matrix - mmin) / (mmax - mmin)

with open("KDDTrain-last.csv", "w+", newline='') as csvfile:
    writer = csv.writer(csvfile)
    # writerows writes all rows of the matrix in one call.
    writer.writerows(res)

归一化不需要处理标签列。注意上面的归一化代码会把最后一列(标签)也一并缩放,可以在归一化之前手动去除标签列,也可以在代码中跳过该列。
至此,数据预处理得到两个 csv 文件:数值化后的文件和归一化后的文件。

你可能感兴趣的:(python编程)