
        水一篇爬取阿里漏洞库信息的文章。有好几周没写爬虫了,爬取时感觉还是 XPath 更高效一些,但这次用的是正则匹配(Python 内置的 re 库):正则在编写匹配模式时虽然比较繁琐,但要精确匹配还是得用它。本文只爬取第 1 页的信息,没有使用多线程或异步方式,也没有将数据持久化到数据库。还是懒,后面会把爬取全部页面的源码以及多线程和持久化补上,先这样了!

# @Time : 2021/8/27 21:56
# @Author : huhu-Z
# @File :
# -*- coding: utf-8 -*-
import re

import requests
from random import randint

def get_onepage_content(url):
    """Download a single page and return its body as text.

    A User-Agent header is picked at random from a small built-in pool on
    every call.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response text when the server answers with HTTP 200, otherwise
        ``None`` (non-200 status, or the request itself failed).
    """
    user_agent = ['Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.7113.93 Safari/537.36',
                  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0',
                  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4482.0 Safari/537.36 Edg/92.0.874.0']
    # Derive the upper bound from the pool so adding/removing agents stays safe.
    headers = {'User-Agent': user_agent[randint(0, len(user_agent) - 1)]}
    try:
        # Without a timeout a stalled server would block the script forever.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        # Best-effort fetch: network/URL problems are reported as "no page".
        return None
    if response.status_code == 200:
        return response.text
    return None

def show_cve_content(res):
    """Yield one record per vulnerability row found in the page HTML.

    Args:
        res: HTML text of a listing page. ``None`` or an empty string yields
            nothing.

    Yields:
        dict with the keys ``cve_id``, ``vul_name``, ``cul_type``,
        ``cve_date`` and ``cvs_level``. All values except ``vul_name`` are
        stripped of surrounding whitespace.
    """
    if not res:
        # The fetch step reports failure as None; there is nothing to parse.
        return
    # NOTE(review): the tag literals of the original pattern were missing, which
    # left every lazy group free to match the empty string. This reconstruction
    # assumes each row is laid out as: <a target="_blank">id</a>, <td>name</td>,
    # <button>type</button>, <td nowrap="nowrap">date</td>, <button>level</button>
    # -- confirm against the live page markup.
    match = re.compile(
        r'target="_blank">(.*?)</a>'
        r'.*?<td[^>]*>(.*?)</td>'
        r'.*?<button[^>]*>(.*?)</button>'
        r'.*?nowrap="nowrap">(.*?)</td>'
        r'.*?<button[^>]*>(.*?)</button>',
        re.S)
    for content in match.findall(res):
        yield {
            'cve_id'   : content[0].strip(),
            'vul_name' : content[1],
            'cul_type' : content[2].strip(),
            'cve_date' : content[-2].strip(),
            'cvs_level': content[-1].strip(),
        }

def save_content_to_text(content):
    """Append one line of text to ``ali_cve.txt`` in the working directory.

    Args:
        content: The string to write; a newline is appended after it.
    """
    # Records contain Chinese text, so pin UTF-8 instead of relying on the
    # platform's locale encoding. Plain append mode is enough: nothing is read.
    with open('ali_cve.txt', 'a', encoding='utf-8') as f:
        f.write(content + '\n')

def main():
    """Scrape the first listing page, print each record and append it to the output file."""
    # NOTE(review): the URL literal was empty in the source; this is presumably
    # the first page of the Alibaba Cloud vulnerability database NVD listing --
    # confirm before relying on it.
    url = 'https://avd.aliyun.com/nvd/list?page=1'
    html = get_onepage_content(url)
    if not html:
        # The fetch helper returns None on failure; nothing to parse or save.
        print('failed to fetch ' + url)
        return
    for content in show_cve_content(html):
        print(content)
        # The writer expects a string, so store the dict's text form.
        save_content_to_text(str(content))

# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

{'cve_id': 'CVE-2021-26084', 'vul_name': 'Confluence远程代码执行漏洞', 'cul_type': '未定义', 'cve_date': '2021-08-26', 'cvs_level': '未知'}
{'cve_id': 'CVE-2021-39136', 'vul_name': 'baserCMS Management System cross site scripting', 'cul_type': 'CWE-79', 'cve_date': '2021-08-26', 'cvs_level': '高危'}
{'cve_id': 'CVE-2021-39160', 'vul_name': 'nbgitpuller up to 0.10.1 code injection', 'cul_type': 'CWE-94', 'cve_date': '2021-08-26', 'cvs_level': '严重'}
{'cve_id': 'CVE-2021-28612', 'vul_name': 'Adobe After Effects up to 18.2 out-of-bounds read', 'cul_type': 'CWE-125', 'cve_date': '2021-08-25', 'cvs_level': '中危'}
{'cve_id': 'CVE-2021-28614', 'vul_name': 'Adobe After Effects up to 18.2 out-of-bounds read', 'cul_type': 'CWE-125', 'cve_date': '2021-08-25', 'cvs_level': '中危'}
{'cve_id': 'CVE-2021-28617', 'vul_name': 'Adobe Animate up to 21.0.6 out-of-bounds read', 'cul_type': 'CWE-125', 'cve_date': '2021-08-25', 'cvs_level': '中危'}
{'cve_id': 'CVE-2021-28620', 'vul_name': 'Adobe Animate up to 21.0.6 heap-based overflow', 'cul_type': 'CWE-122', 'cve_date': '2021-08-25', 'cvs_level': '高危'}
{'cve_id': 'CVE-2021-28621', 'vul_name': 'Adobe Animate up to 21.0.6 out-of-bounds read', 'cul_type': '未定义', 'cve_date': '2021-08-25', 'cvs_level': '高危'}
{'cve_id': 'CVE-2021-28622', 'vul_name': 'Adobe Animate up to 21.0.6 out-of-bounds write', 'cul_type': 'CWE-787', 'cve_date': '2021-08-25', 'cvs_level': '高危'}
{'cve_id': 'CVE-2021-28629', 'vul_name': 'Adobe Animate up to 21.0.6 heap-based overflow', 'cul_type': 'CWE-122', 'cve_date': '2021-08-25', 'cvs_level': '高危'}
{'cve_id': 'CVE-2021-28630', 'vul_name': 'Adobe Animate up to 21.0.6 out-of-bounds read', 'cul_type': 'CWE-125', 'cve_date': '2021-08-25', 'cvs_level': '低危'}
{'cve_id': 'CVE-2021-28600', 'vul_name': 'Adobe After Effects up to 18.2 out-of-bounds read', 'cul_type': 'CWE-125', 'cve_date': '2021-08-25', 'cvs_level': '中危'}
{'cve_id': 'CVE-2021-28602', 'vul_name': 'Adobe After Effects up to 18.2 memory corruption', 'cul_type': 'CWE-788', 'cve_date': '2021-08-25', 'cvs_level': '高危'}
{'cve_id': 'CVE-2021-28603', 'vul_name': 'Adobe After Effects up to 18.2 heap-based overflow', 'cul_type': 'CWE-122', 'cve_date': '2021-08-25', 'cvs_level': '高危'}
{'cve_id': 'CVE-2021-28605', 'vul_name': 'Adobe After Effects up to 18.2 buffer overflow', 'cul_type': 'CWE-788', 'cve_date': '2021-08-25', 'cvs_level': '高危'}
{'cve_id': 'CVE-2021-28607', 'vul_name': 'Adobe After Effects up to 18.2 heap-based overflow', 'cul_type': '未定义', 'cve_date': '2021-08-25', 'cvs_level': '高危'}
{'cve_id': 'CVE-2021-28610', 'vul_name': 'Adobe After Effects up to 18.2 heap-based overflow', 'cul_type': '未定义', 'cve_date': '2021-08-25', 'cvs_level': '高危'}
{'cve_id': 'CVE-2021-28611', 'vul_name': 'Adobe After Effects up to 18.2 out-of-bounds read', 'cul_type': 'CWE-125', 'cve_date': '2021-08-25', 'cvs_level': '中危'}
{'cve_id': 'CVE-2021-28615', 'vul_name': 'Adobe After Effects up to 18.2 out-of-bounds read', 'cul_type': 'CWE-125', 'cve_date': '2021-08-25', 'cvs_level': '中危'}
{'cve_id': 'CVE-2021-28616', 'vul_name': 'Adobe After Effects up to 18.2 out-of-bounds read', 'cul_type': 'CWE-125', 'cve_date': '2021-08-25', 'cvs_level': '中危'}
{'cve_id': 'CVE-2021-28632', 'vul_name': 'Adobe Acrobat Reader use after free', 'cul_type': 'CWE-416', 'cve_date': '2021-08-25', 'cvs_level': '高危'}
{'cve_id': 'CVE-2021-3486', 'vul_name': 'Corel Parallels Desktop 安全漏洞', 'cul_type': 'CWE-79', 'cve_date': '2021-08-25', 'cvs_level': '未知'}
{'cve_id': 'CVE-2021-35987', 'vul_name': 'Adobe Acrobat 缓冲区错误漏洞', 'cul_type': 'CWE-125', 'cve_date': '2021-08-25', 'cvs_level': '未知'}
{'cve_id': 'CVE-2021-28697', 'vul_name': 'Xen安全漏洞', 'cul_type': '未定义', 'cve_date': '2021-08-25', 'cvs_level': '未知'}
{'cve_id': 'CVE-2021-28699', 'vul_name': 'Xen安全漏洞', 'cul_type': '未定义', 'cve_date': '2021-08-25', 'cvs_level': '未知'}
{'cve_id': 'CVE-2021-28700', 'vul_name': 'Xen安全漏洞', 'cul_type': '未定义', 'cve_date': '2021-08-25', 'cvs_level': '未知'}
{'cve_id': 'CVE-2021-28698', 'vul_name': 'Xen安全漏洞', 'cul_type': '未定义', 'cve_date': '2021-08-25', 'cvs_level': '未知'}
{'cve_id': 'CVE-2021-28696', 'vul_name': 'Xen安全漏洞', 'cul_type': '未定义', 'cve_date': '2021-08-25', 'cvs_level': '未知'}
{'cve_id': 'CVE-2021-28694', 'vul_name': 'Xen安全漏洞', 'cul_type': '未定义', 'cve_date': '2021-08-25', 'cvs_level': '未知'}
{'cve_id': 'CVE-2021-28695', 'vul_name': 'Xen安全漏洞', 'cul_type': '未定义', 'cve_date': '2021-08-25', 'cvs_level': '未知'}
