Using Association Rules to Find Relationships Between IPv6 Address Segments

The dataset is first cleaned, the FP-Growth algorithm is then applied to find frequent itemsets, and finally association rules are extracted. The code below is a class wrapper around what was originally run interactively in Jupyter.
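
For readers unfamiliar with the mlxtend workflow, here is a minimal, self-contained sketch of the same three steps on made-up transaction data (not the IPv6 dataset): one-hot encode the records with TransactionEncoder, mine frequent itemsets with fpgrowth, then derive rules with association_rules.

import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth, association_rules

# toy transactions: each row pairs a value of "slice A" with a value of "slice B"
transactions = [['a1', 'b1'], ['a1', 'b1'], ['a1', 'b2'], ['a2', 'b2'], ['a2', 'b2']]

te = TransactionEncoder()
onehot = te.fit_transform(transactions)  # boolean matrix, one column per distinct item
df = pd.DataFrame(onehot, columns=te.columns_)

itemsets = fpgrowth(df, min_support=0.2, use_colnames=True)
rules = association_rules(itemsets, metric='confidence', min_threshold=0.6)
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])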

Results
[Figure 1] The associations between the two input address slices can be identified.

Code:

import pandas as pd
import numpy as np
from pandas import DataFrame, Series
from io import BytesIO
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, fpgrowth
from mlxtend.frequent_patterns import association_rules
from time import time


class FrequentPatternsIPv6(object):
    '''
    Given two IPv6 bit slices (bit ranges within the 128-bit address),
    find the association rules between the values of the two slices.
    '''

    def __init__(
            self, data_path, slice_ip_1_start, slice_ip_1_end,
            slice_ip_2_start, slice_ip_2_end
    ):
        assert (0 <= slice_ip_1_start <= 128 and 0 <= slice_ip_1_end <= 128 and
                0 <= slice_ip_2_start <= 128 and 0 <= slice_ip_2_end <= 128), \
            'please input the correct slice'
        # bit positions in an IP address are usually counted from 1,
        # while string indices start at 0, so shift everything by one
        slice_ip_1_start -= 1
        slice_ip_1_end -= 1
        slice_ip_2_start -= 1
        slice_ip_2_end -= 1
        self.slice_ip_1 = str(slice_ip_1_start) + ':' + str(slice_ip_1_end)
        self.slice_ip_2 = str(slice_ip_2_start) + ':' + str(slice_ip_2_end)
        self.slice_ip_1_start = slice_ip_1_start
        self.slice_ip_1_end = slice_ip_1_end
        self.slice_ip_2_start = slice_ip_2_start
        self.slice_ip_2_end = slice_ip_2_end
        self.path = data_path

    def load_data(self):
        '''
        Expected data format (CSV):
        ip      label
        ...     ...
        ...     ...

        DHCP addresses are assumed to carry label 0.
        '''
        return pd.read_csv(self.path)

    def replenish_ip(self, ip):
        '''
        Expand '::' to the missing zero groups and restore the
        leading zeros that were omitted from each group.
        '''
        if '::' in ip:
            head, _, tail = ip.partition('::')
            head_groups = head.split(':') if head else []
            tail_groups = tail.split(':') if tail else []
            # a full IPv6 address has 8 groups of 16 bits each
            missing = 8 - len(head_groups) - len(tail_groups)
            groups = head_groups + ['0000'] * missing + tail_groups
        else:
            groups = ip.split(':')
        # pad every group to 4 hex digits
        return ':'.join(group.zfill(4) for group in groups)
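
    # Hypothetical examples of the expansion above:
    #   replenish_ip('2a00:1450::200e') -> '2a00:1450:0000:0000:0000:0000:0000:200e'
    #   replenish_ip('fe80::1')         -> 'fe80:0000:0000:0000:0000:0000:0000:0001'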

    def hex_to_bin(self, ip, flag, slice_here):
        '''
        Convert the hexadecimal address to a 128-bit binary string and
        return the slice given by slice_here ('start:stop').
        flag: 1 means the first input slice is being processed (informational only)
        '''
        # filter out malformed entries such as "2.at.pool.ntp.org" or '[2a0a'
        if ':' not in ip or '[' in ip:
            return 'error_ip'  # remember to check for 'error_ip' after the apply()
        ip = self.replenish_ip(ip)
        ip = ip.split(':')
        memory_list = []
        for i in ip:
            temp__ = int(i, 16)
            b = bin(temp__)[2:]
            x = len(b)
            if x < 16:
                d = 16 - x
                b = '0' * d + b
            memory_list.append(b)
        final = ''.join(memory_list)

        start, stop = slice_here.split(':')
        return final[int(start): int(stop)]
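
    # A hypothetical call, assuming the 104-120 slice from the example in __main__
    # (stored as '103:119' after the 1-based -> 0-based shift in __init__):
    #   hex_to_bin('2001:db8::', 1, '103:119') -> '0000000000000000'
    #   hex_to_bin('2.at.pool.ntp.org', 1, '103:119') -> 'error_ip'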

    def bin_to_hex(self, string):
        '''
        Convert a binary string to hexadecimal (no '0x' prefix, leading zeros dropped).
        '''
        decimal = int(string, 2)
        return hex(decimal)[2:]
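
    # e.g. bin_to_hex('0000001010011011') -> '29b'; the leading zeros are lost here,
    # which is why the padding in slice_ipv6_describe is applied afterwards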

    def clean_data(self):
        '''
        Data cleaning.
        :return: two Series holding the cleaned values of the two IPv6 slices
        '''
        df = self.load_data()
        df = df[df['label'] == 0]  # keep only DHCP addresses
        temp = df['ip']
        temp_1 = temp.apply(self.hex_to_bin, args=(1, self.slice_ip_1))
        error_ip_num = temp_1[temp_1 == 'error_ip'].count()
        print(f'Number of malformed IPs in the dataset: {error_ip_num}')
        print('\n\n', '**'*30)

        err_index = temp_1[temp_1 == 'error_ip'].index
        temp_1.drop(index=err_index, inplace=True)
        pickle_name_1 = f'./from_{self.slice_ip_1_start}_to_{self.slice_ip_1_end}_first.pk'
        temp_1.to_pickle(pickle_name_1)

        temp_2 = temp.apply(self.hex_to_bin, args=(2, self.slice_ip_2))
        err_index = temp_2[temp_2 == 'error_ip'].index
        temp_2.drop(index=err_index, inplace=True)
        pickle_name_2 = f'./from_{self.slice_ip_2_start}_to_{self.slice_ip_2_end}_second.pk'
        temp_2.to_pickle(pickle_name_2)

        return temp_1, temp_2
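
    # Note: with the 104-120 / 120-128 example slices, the pickle files above are
    # named with the shifted 0-based indices, i.e. ./from_103_to_119_first.pk and
    # ./from_119_to_127_second.pk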

    def slice_ipv6_describe(self):
        '''
        Summary statistics of the two IP slices:
        the share of each distinct slice value.
        :return: the two slices as Series of zero-padded hexadecimal strings
        '''
        series_1, series_2 = self.clean_data()

        # bit widths of the two slices (16 and 8 for the 104-120 / 120-128 example)
        width_1 = self.slice_ip_1_end - self.slice_ip_1_start
        width_2 = self.slice_ip_2_end - self.slice_ip_2_start

        # pad the binary slice strings to their full width with leading zeros
        series_1 = series_1.apply(lambda s: s.zfill(width_1))
        series_2 = series_2.apply(lambda s: s.zfill(width_2))

        info_1 = series_1.value_counts() / series_1.count()
        info_2 = series_2.value_counts() / series_2.count()

        print(info_1)
        print('\n\n', '**'*30)
        print(info_2)

        series_1 = series_1.apply(self.bin_to_hex)
        series_2 = series_2.apply(self.bin_to_hex)

        # restore the leading zeros stripped by hex()
        # (assumes the slice widths are multiples of 4 bits)
        series_1 = series_1.apply(lambda s: s.zfill(width_1 // 4))
        series_2 = series_2.apply(lambda s: s.zfill(width_2 // 4))

        return series_1, series_2

    def frequent_patterns_prepare(self, min_threshold=1000):
        '''
        Arg:
            min_threshold: minimum count a slice value must reach to be kept

        Attention:
            When TransactionEncoder is used directly, np.zeros() cannot allocate
            an array of this size and raises:
            "Unable to allocate array with shape (1994891, 44205) and data type bool"
            44205 = unique values of bits 104-120 + unique values of bits 120-128,
            so values with very small support are removed before TransactionEncoder is used.

            This NumPy memory limit was not hit when running on Linux; on Windows
            the largest m*n that worked was around one billion.
        '''
        columns = [self.slice_ip_1, self.slice_ip_2]
        series_1, series_2 = self.slice_ipv6_describe()
        df = pd.DataFrame(list(zip(series_1, series_2)), columns=columns)

        # values of the first slice whose count exceeds the threshold
        counts = series_1.value_counts()
        frequent_values = set(counts[counts > min_threshold].index)

        def filter_value(value):
            '''
            Keep slice values occurring more than min_threshold times;
            mark all others as 'verbose' (redundant).
            '''
            return value if value in frequent_values else 'verbose'

        slice_df = df[self.slice_ip_1].apply(filter_value)

        # replace the original column with the filtered one; rows marked
        # 'verbose' become NaN and are dropped
        df[self.slice_ip_1] = slice_df[slice_df != 'verbose']
        df.dropna(inplace=True)

        return df
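
    # A possible alternative to the min_threshold filtering above (a sketch, not
    # tested on this dataset): TransactionEncoder can also emit a scipy sparse
    # matrix, which avoids allocating the huge dense boolean array, e.g.
    #   te = TransactionEncoder()
    #   onehot = te.fit(records).transform(records, sparse=True)  # records = list of transactions
    #   sparse_df = pd.DataFrame.sparse.from_spmatrix(onehot, columns=te.columns_)
    # mlxtend's fpgrowth also accepts such sparse DataFrames.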

    def apply_(self):
        df_ = self.frequent_patterns_prepare(min_threshold=1000)
        te = TransactionEncoder()  # one-hot encode the transactions
        df_tf = te.fit_transform(df_.values)

        df = pd.DataFrame(df_tf, columns=te.columns_)

        start = time()
        # mine the frequent itemsets
        frequent_itemsets = fpgrowth(df, min_support=0.05, use_colnames=True)
        print('Time to mine frequent itemsets:', time() - start)
        print()

        frequent_itemsets.sort_values(by='support', ascending=False, inplace=True)
        print(f'freqSet:\n{frequent_itemsets}')
        print('\n\n', '**'*30)

        # generate association rules, using confidence as the metric
        association_rule = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.7)
        association_rule.sort_values(by='leverage', ascending=False, inplace=True)  # sort rules by leverage

        print('Association rules:\n{}'.format(association_rule))


if __name__ == "__main__":
    start = time()
    fq_ = FrequentPatternsIPv6('D:/ipv6_label_lzx20190904.csv', 104, 120, 120, 128)
    fq_.apply_()
    end = time()
    d = end - start
    min_ = int(d // 60)
    s = d % 60
    print(f'{min_}min {s:.1f}s')
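
In the output, association_rules returns a DataFrame whose antecedents and consequents columns are frozensets of items (here, the hex-encoded slice values), together with support, confidence, lift and leverage. A minimal sketch of post-processing that result, assuming apply_() were changed to end with return association_rule (it currently only prints it):

rules = fq_.apply_()  # hypothetical: requires apply_ to return the rules DataFrame
# keep only the strong rules
strong = rules[(rules['confidence'] > 0.9) & (rules['lift'] > 1)]
for _, row in strong.iterrows():
    print(set(row['antecedents']), '=>', set(row['consequents']),
          f"confidence={row['confidence']:.2f}")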

Mid-Autumn Festival ~
Spending it alone, haha
