需要知道三个基本概念:支持度、置信度、频繁k项集。
Apriori算法就是挖掘同时满足最小支持度阈值和最小置信度阈值的关联规则。在安全领域,这个算法应用十分广泛,凡是需要挖掘潜在关联关系的场景都可以尝试使用。在挖掘XSS相关参数时,从xssed网站的样例以及WAF的拦截日志中提取XSS攻击日志作为样本,目的是分析出潜在的关联关系,然后作为SVM、KNN等分类算法的特征提取依据之一。
from apriori import apriori
from apriori import generateRules
import re
# Literal delimiters used to cut a URL query string into tokens: query
# syntax characters, quotes/brackets, and a few URL-encoded characters
# (%3c/%3e = < >, %20 = space, %22 = ", %28/%29 = ( )).
QUERY_DELIMITERS = (
    "=", "&", "?", "%3e", "%3c", "%3E", "%3C", "%20", "%22",
    "<", ">", "\n", "(", ")", "'", '"', ";", ":", ",", "%28", "%29",
)

# Compiled once instead of re-parsing the pattern for every log line.
# re.escape() makes each delimiter match literally, which avoids the
# invalid string escapes (e.g. "\=" or "\%") of a hand-written pattern.
QUERY_SPLITTER = re.compile("|".join(re.escape(d) for d in QUERY_DELIMITERS))


def tokenize_query(line):
    """Split the query-string part of one log line into a token list.

    Everything after the first "?" is split on QUERY_DELIMITERS.
    Adjacent delimiters and the trailing newline produce empty-string
    tokens; they are kept on purpose so the mined rules stay the same.

    Returns None when the line has no "?" after its first character
    (the original ``index > 0`` condition is preserved).
    """
    index = line.find("?")
    if index <= 0:
        return None
    return QUERY_SPLITTER.split(line[index + 1:])


if __name__ == '__main__':
    # Vectorise the log line by line: the simplest approach is to cut
    # each line into a word vector on a fixed set of delimiters.
    myDat = []
    # The with-statement closes the file; no explicit close() is needed.
    with open("/Users/zhanglipeng/Data/xss-2000.txt") as f:
        for line in f:
            tokens = tokenize_query(line)
            if tokens is not None:
                myDat.append(tokens)
    # Minimum support 0.15, minimum confidence 0.6.
    L, suppData = apriori(myDat, 0.15)
    rules = generateRules(L, suppData, minConf=0.6)
frozenset(['//']) --> frozenset(['']) conf: 1.0
frozenset(['/']) --> frozenset(['']) conf: 1.0
frozenset(['a']) --> frozenset(['']) conf: 1.0
frozenset(['1']) --> frozenset(['alert']) conf: 0.810010214505
frozenset(['/script']) --> frozenset(['']) conf: 1.0
frozenset(['1']) --> frozenset(['']) conf: 1.0
frozenset(['script']) --> frozenset(['alert']) conf: 0.767519466073
frozenset(['page']) --> frozenset(['42873']) conf: 0.925531914894
frozenset(['//']) --> frozenset(['alert']) conf: 0.994134897361
frozenset(['/script']) --> frozenset(['1']) conf: 0.637254901961
frozenset(['42873']) --> frozenset(['']) conf: 1.0
frozenset(['c']) --> frozenset(['']) conf: 1.0
frozenset(['page']) --> frozenset(['']) conf: 1.0
frozenset(['script']) --> frozenset(['']) conf: 1.0
frozenset(['script']) --> frozenset(['1']) conf: 0.638487208009
frozenset(['42873']) --> frozenset(['alert']) conf: 0.622485207101
frozenset(['c']) --> frozenset(['42873']) conf: 1.0
frozenset(['/script']) --> frozenset(['alert']) conf: 0.763616557734
frozenset(['/']) --> frozenset(['1']) conf: 0.993650793651
frozenset(['script']) --> frozenset(['/script']) conf: 1.0
frozenset(['/script']) --> frozenset(['script']) conf: 0.979302832244
frozenset(['alert']) --> frozenset(['']) conf: 1.0
frozenset(['']) --> frozenset(['alert']) conf: 0.713
frozenset(['c']) --> frozenset(['', '42873']) conf: 1.0
frozenset(['script']) --> frozenset(['1', '/script']) conf: 0.638487208009
frozenset(['/script']) --> frozenset(['1', 'script']) conf: 0.625272331155
frozenset(['/script']) --> frozenset(['', 'alert']) conf: 0.763616557734
frozenset(['script']) --> frozenset(['1', '']) conf: 0.638487208009
frozenset(['script']) --> frozenset(['', '/script']) conf: 1.0
frozenset(['/script']) --> frozenset(['', 'script']) conf: 0.979302832244
frozenset(['/']) --> frozenset(['1', '']) conf: 0.993650793651
frozenset(['/script']) --> frozenset(['1', '']) conf: 0.637254901961
frozenset(['1']) --> frozenset(['', 'alert']) conf: 0.810010214505
frozenset(['script']) --> frozenset(['', 'alert']) conf: 0.767519466073
frozenset(['//']) --> frozenset(['', 'alert']) conf: 0.994134897361
frozenset(['page']) --> frozenset(['', '42873']) conf: 0.925531914894
frozenset(['script']) --> frozenset(['/script', 'alert']) conf: 0.767519466073
frozenset(['/script']) --> frozenset(['alert', 'script']) conf: 0.751633986928
frozenset(['42873']) --> frozenset(['', 'alert']) conf: 0.622485207101
frozenset(['alert', 'script']) --> frozenset(['1', '/script']) conf: 0.744927536232
frozenset(['/script', 'alert']) --> frozenset(['1', 'script']) conf: 0.733238231098
frozenset(['1', 'alert']) --> frozenset(['/script', 'script']) conf: 0.648171500631
frozenset(['1', 'script']) --> frozenset(['/script', 'alert']) conf: 0.895470383275
frozenset(['1', '/script']) --> frozenset(['alert', 'script']) conf: 0.878632478632
frozenset(['/script', 'script']) --> frozenset(['1', '']) conf: 0.638487208009
frozenset(['', 'script']) --> frozenset(['1', '/script']) conf: 0.638487208009
frozenset(['', '/script']) --> frozenset(['1', 'script']) conf: 0.625272331155
frozenset(['1', 'script']) --> frozenset(['', '/script']) conf: 1.0
frozenset(['1', '/script']) --> frozenset(['', 'script']) conf: 0.981196581197
frozenset(['script']) --> frozenset(['1', '', '/script']) conf: 0.638487208009
frozenset(['/script']) --> frozenset(['1', '', 'script']) conf: 0.625272331155
frozenset(['alert', 'script']) --> frozenset(['', '/script']) conf: 1.0
frozenset(['/script', 'alert']) --> frozenset(['', 'script']) conf: 0.984308131241
frozenset(['/script', 'script']) --> frozenset(['', 'alert']) conf: 0.767519466073
frozenset(['', 'script']) --> frozenset(['/script', 'alert']) conf: 0.767519466073
frozenset(['', '/script']) --> frozenset(['alert', 'script']) conf: 0.751633986928
frozenset(['script']) --> frozenset(['', '/script', 'alert']) conf: 0.767519466073
frozenset(['/script']) --> frozenset(['', 'alert', 'script']) conf: 0.751633986928
frozenset(['alert', 'script']) --> frozenset(['1', '']) conf: 0.744927536232
frozenset(['1', 'alert']) --> frozenset(['', 'script']) conf: 0.648171500631
frozenset(['1', 'script']) --> frozenset(['', 'alert']) conf: 0.895470383275
frozenset(['/script', 'alert']) --> frozenset(['1', '']) conf: 0.748930099857
frozenset(['1', 'alert']) --> frozenset(['', '/script']) conf: 0.662042875158
frozenset(['1', '/script']) --> frozenset(['', 'alert']) conf: 0.897435897436
frozenset(['1', '/script', 'alert']) --> frozenset(['', 'script']) conf: 0.979047619048
frozenset(['1', '/script', 'script']) --> frozenset(['', 'alert']) conf: 0.895470383275
frozenset(['/script', 'alert', 'script']) --> frozenset(['', '1']) conf: 0.744927536232
frozenset(['1', 'alert', 'script']) --> frozenset(['', '/script']) conf: 1.0
frozenset(['', '1', '/script']) --> frozenset(['alert', 'script']) conf: 0.878632478632
frozenset(['', '/script', 'alert']) --> frozenset(['1', 'script']) conf: 0.733238231098
frozenset(['', '1', 'alert']) --> frozenset(['/script', 'script']) conf: 0.648171500631
frozenset(['', '1', 'script']) --> frozenset(['/script', 'alert']) conf: 0.895470383275
frozenset(['', 'alert', 'script']) --> frozenset(['1', '/script']) conf: 0.744927536232
frozenset(['1', '/script']) --> frozenset(['', 'alert', 'script']) conf: 0.878632478632
frozenset(['/script', 'alert']) --> frozenset(['', '1', 'script']) conf: 0.733238231098
frozenset(['1', 'alert']) --> frozenset(['', '/script', 'script']) conf: 0.648171500631
frozenset(['1', 'script']) --> frozenset(['', '/script', 'alert']) conf: 0.895470383275
frozenset(['alert', 'script']) --> frozenset(['', '1', '/script']) conf: 0.744927536232
将支持度修改为0.01:即使某种关联关系出现的概率只有百分之一,只要对应的是强关联(置信度超过0.99),也是一种有价值的关联。当然,这样做的代价之一就是运行时间会大幅增加。
FP-growth算法是基于Apriori构建的。上面已经体会到Apriori算法耗时很长,而FP-growth算法减少了对数据集的扫描次数,大大加快了算法速度,这是大家都喜闻乐见的事情。FP树是一种高级的数据结构,用来存储项集的出现频率,每个项集会以路径的方式存储在树中。
这里宝书作者使用FP-growth算法挖掘疑似僵尸主机。在互联网环境下存在大量的僵尸主机的扫描行为,僵尸主机频繁更换IP,很难通过IP确定僵尸主机。通过使用FP-growth算法,分析防火墙的拦截日志,挖掘出浏览器的user-agent字段和目标URL之间的关联关系,初步确定潜在的僵尸主机。
import pyfpgrowth
# Build one transaction [ip, ua, target] per firewall-log line, then mine
# association rules between the fields with FP-growth.
transactions = []
with open("/Users/zhanglipeng/Data/KnowledgeGraph/sample7.txt") as f:
    for line in f:
        # Drop the line terminator (also "\r" from CRLF files) so it does
        # not end up inside the last field.
        line = line.rstrip('\r\n')
        if not line:
            # A blank line (e.g. at end of file) would otherwise make the
            # three-way unpacking below raise ValueError.
            continue
        # Each line is expected to be exactly "ip,ua,target".
        # NOTE(review): a real user-agent string may itself contain
        # commas, which would break this split -- confirm the log format.
        ip, ua, target = line.split(',')
        # Parenthesised single-argument print works on Python 2 and 3.
        print("Add (%s %s %s)" % (ip, ua, target))
        transactions.append([ip, ua, target])
# Second argument 3 is passed as the support threshold.
patterns = pyfpgrowth.find_frequent_patterns(transactions, 3)
# Second argument 0.9 is passed as the confidence threshold.
rules = pyfpgrowth.generate_association_rules(patterns, 0.9)
print(rules)
Add (ip=ip1 ua=ua1 target=url1)
Add (ip=ip2 ua=ua1 target=url1)
Add (ip=ip3 ua=ua1 target=url1)
Add (ip=ip1 ua=ua1 target=url2)
Add (ip=ip2 ua=ua1 target=url2)
Add (ip=ip3 ua=ua1 target=url2)
Add (ip=ip4 ua=ua2 target=url2)
Add (ip=ip5 ua=ua3 target=url2)
Add (ip=ip5 ua=ua6 target=url6)
Add (ip=ip6 ua=ua3 target=url4)
Add (ip=ip7 ua=ua4 target=url4)
Add (ip=ip8 ua=ua5 target=url5)
{('target=url1',): (('ua=ua1',), 1.0)}
user-agent字段包含的信息很多,详情可见:https://blog.csdn.net/quintind/article/details/53006544