A Lightweight URL-Based Malicious Page Detection Classifier

This post collects code from my earlier research work, kept here for later study. Malicious page detection is already a well-studied problem, and the features below follow ideas from several published papers, so I won't go into further detail here and will simply post the code.

#!/usr/bin/python
#-*-coding:utf-8-*-

'''
__Author__: 沂水寒城
Purpose: extract features from URLs to train a machine learning model
'''
import os
import re
from urlparse import urlparse
from urllib import splitport
from publicsuffix import PublicSuffixList

RED_KEYWORDS=["account", "admin", "administrator","auth", "bank", "client", "confirm", "email", 
              "host","password", "pay", "private", "safe", "secure", "security", "sign", "user", 
              "validation", "verification", "icbc"]
PATH_KEYWORDS = ["www", "net", "com", "cn"]
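
# NOTE: domain_split() is used by url_split()/url_split_new() below but was
# missing from the original listing. The following is a guessed reconstruction
# (an assumption, not the author's original code), inferred from how its return
# values are consumed:
#   "www.example.com:8080" -> server='www', host='www.example:8080',
#                             top='.com', hostname='www.example.com'
psl = PublicSuffixList()

def domain_split(netloc):
    bare, _, port = netloc.partition(':')
    registered = psl.get_public_suffix(bare)        # e.g. 'example.com'
    suffix = registered.split('.', 1)[1] if '.' in registered else registered
    top = '.' + suffix                              # e.g. '.com'
    # hostname minus the public suffix; keep the port so splitport() can strip it
    host = bare[:-len(top)] if bare.endswith(suffix) else bare
    if port:
        host += ':' + port
    server = bare.split('.')[0]                     # left-most label
    return server, host, top, bare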


def url_split(url):
    '''
    Split a URL into its components
    '''
    if not url.startswith(('http', 'ftp')):
        url = 'http://' + url
    parts = urlparse(url)
    server, host, top, hostname = domain_split(parts.netloc)
    host, port = splitport(host)
    if port is None:
        port = ''
    return {'scheme': parts.scheme, 'server': server, 'host': host,
            'port': port, 'top': top, 'path': parts.path, 'domain': host + top,
            'params': parts.params, 'query': parts.query, 'fragment': parts.fragment}
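
# e.g., under the domain_split() sketch above (an assumption):
# url_split('http://www.baidu.com/index.html') ->
#   {'scheme': 'http', 'server': 'www', 'host': 'www.baidu', 'port': '',
#    'top': '.com', 'path': '/index.html', 'domain': 'www.baidu.com',
#    'params': '', 'query': '', 'fragment': ''}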


def url_split_new(url):
    '''
    Split a URL into its components (reduced version)
    '''
    if not url.startswith('http'):
        url = 'http://' + url
    parts = urlparse(url)
    server, host, top, hostname = domain_split(parts.netloc)
    host, port = splitport(host)
    if port is None:
        port = ''
    return {'protocol': parts.scheme, 'hostname': hostname, 'path': parts.path}


#URL to Path Features
def len_procotol2path_os(url):
    # length of everything before the last '/' in the URL
    return len(os.path.split(url)[0])


def len_procotol2path(new_result_dict):
    # length of "protocol://hostname"; takes the dict returned by url_split_new()
    string = new_result_dict['protocol'] + '://' + new_result_dict['hostname']
    return len(string)


def ip_exist(url):
    # 1 if the URL contains an IPv4-style dotted quad, else 0
    # (dots escaped; the original pattern '\d+.\d+.\d+.\d+' matched any character)
    compile_rule = re.compile(r'\d+\.\d+\.\d+\.\d+')
    result_list = re.findall(compile_rule, url)
    return 1 if result_list else 0

#Hostname Features
def len_hostname(new_result_dict):
    return len(new_result_dict['hostname'])


def www_exist(url):
    # note: returns 0 when 'www' IS present -- its absence is the suspicious signal
    if 'www' in url:
        return 0
    else:
        return 1


def TLD_exist(old_result_dict):
    # 1 if the hostname has a third-level (or deeper) domain label
    Hostname = old_result_dict['domain']
    Hostname_list = Hostname.split('.')
    if len(Hostname_list) >= 3:
        return 1
    else:
        return 0


def SLD_decimal(old_result_dict):
    # 1 if the second-level domain label contains decimal digits
    Hostname = old_result_dict['domain']
    Hostname_list = Hostname.split('.')
    if len(Hostname_list) < 2:
        return 0
    SLD = Hostname_list[-2]
    compile_rule = re.compile(r'\d+')
    sld_list = re.findall(compile_rule, SLD)
    if len(sld_list) != 0:
        return 1
    else:
        return 0


def TLD_decimal(old_result_dict):
    # 1 if the third-level domain label contains decimal digits
    Hostname = old_result_dict['domain']
    Hostname_list = Hostname.split('.')
    if len(Hostname_list) >= 3:
        TLD = Hostname_list[-3]
        compile_rule = re.compile(r'\d+')
        # was re.findall(compile_rule, SLD), a NameError in the original
        tld_list = re.findall(compile_rule, TLD)
        if len(tld_list) != 0:
            return 1
    return 0


def TLD_hex(url):
    # 1 if the third-level domain label contains a hex-looking token;
    # the original class [0x0000-0xFFFF] matched any single digit/A-F/x character,
    # presumably not what was intended
    result = url_split(url.strip())
    Hostname = result['domain']
    Hostname_list = Hostname.split('.')
    if len(Hostname_list) >= 3:
        TLD = Hostname_list[-3]
        compile_rule = re.compile(r'0x[0-9a-fA-F]+')
        tld_list = re.findall(compile_rule, TLD)
        if len(tld_list) != 0:
            return 1
    return 0


def len_path(new_result_dict):
    return len(new_result_dict['path'])


def num_directory_max(new_result_dict):
    # [number of path segments, length of the longest segment]
    if new_result_dict['path'] in ('', '/'):
        return [0, 0]
    else:
        num_directory_list = new_result_dict['path'][1:].split('/')
        return [len(num_directory_list), max([len(d) for d in num_directory_list])]

def path_exist_date(new_result_dict):
    # 1 if the path contains a date-like pattern such as 2020-01-01
    path = new_result_dict['path']
    compile_rule = re.compile(r'\d+-\d+-\d+')
    date_list = re.findall(compile_rule, path)
    if len(date_list) != 0:
        return 1
    else:
        return 0


def path_exist_hex(old_result_dict):
    # 1 if the path contains a hex-looking token (same regex fix as in TLD_hex)
    path = old_result_dict['path']
    compile_rule = re.compile(r'0x[0-9a-fA-F]+')
    path_list = re.findall(compile_rule, path)
    if len(path_list) != 0:
        return 1
    else:
        return 0


def geturlat(url):
    '''
    Check whether the URL contains symbols such as @, ?, -, _ or ~
    '''
    re_script = re.compile(r'@|-|_|\?|~')
    return 1 if re_script.search(url) else 0


def geturldot(url):
    '''
    Count the number of dots in the URL
    '''
    return url.count('.')


def get_url_length(url):
    '''
    Get the total length of the URL
    '''
    return len(url)


def get_url_number_length(url):
    '''
    Get the length of the longest digit string in the URL
    '''
    result = 0
    match = re.findall(r"\d+", url)
    if match:
        result = max(len(m) for m in match)
    return result
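# e.g. get_url_number_length('http://a2019.example.com/123456') -> 6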


def get_red_keyword(url):
    '''
    Check whether the URL contains any sensitive keyword
    '''
    url = url.lower()
    for key in RED_KEYWORDS:
        if url.find(key) != -1:
            return 1
    return 0


def get_path_key(url):
    '''
    Check whether the URL path contains any sensitive keyword
    '''
    url_parse = urlparse(url)
    path = url_parse.path
    if path:
        for key in PATH_KEYWORDS:
            if path.lower().find(key) != -1:
                return 1
    return 0


def get_url_vector(url):
    '''
    Assemble the full heuristic feature vector for a URL
    '''
    heuristic_vector = []
    heuristic_vector.append(geturlat(url))
    heuristic_vector.append(geturldot(url))
    heuristic_vector.append(get_url_length(url))
    heuristic_vector.append(get_url_number_length(url))
    heuristic_vector.append(get_red_keyword(url))
    heuristic_vector.append(get_path_key(url))
    heuristic_vector.append(ip_exist(url))
    heuristic_vector.append(www_exist(url))
    return heuristic_vector


if __name__ == '__main__':
    test_url = 'http://www.baidu.com'
    print(get_url_vector(test_url))
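
The listing above stops at feature extraction; the "classifier" in the title would be trained on these vectors. A minimal sketch of that step, assuming scikit-learn is available and two hypothetical files white_urls.txt (benign) and black_urls.txt (malicious) with one URL per line (load_dataset is also a hypothetical helper, not from the original code):

from sklearn.linear_model import LogisticRegression

def load_dataset(white_file, black_file):
    # benign URLs are labeled 0, malicious URLs 1
    X, y = [], []
    for path, label in [(white_file, 0), (black_file, 1)]:
        with open(path) as f:
            for line in f:
                url = line.strip()
                if url:
                    X.append(get_url_vector(url))
                    y.append(label)
    return X, y

X, y = load_dataset('white_urls.txt', 'black_urls.txt')
clf = LogisticRegression()
clf.fit(X, y)
print(clf.predict([get_url_vector('http://10.0.0.1/icbc/sign-in')]))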
