水一篇爬取阿里云漏洞库信息的文章。有好几周没写爬虫了,爬取时感觉还是 XPath 更高效一些,不过这次用的是正则匹配(re 库,Python 内置)。正则在编写匹配模式时虽然比较繁琐,但要精确匹配还是得用它。本文只爬取第 1 页的信息,没有使用多线程或者异步方式,也没有将数据持久化到数据库。还是懒,后面会把爬取全部页面的源码以及多线程和持久化加进去,先这样了!
# @Time : 2021/8/27 21:56
# @Author : huhu-Z
# @File : ali_cvd_detail.py
# -*- coding: utf-8 -*-
import re
import requests
from random import randint
def get_onepage_content(url, timeout=10):
    """Download a single page and return its HTML text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        The response body as text when the server answers with HTTP 200;
        None for any other status code or when the request itself fails.
    """
    user_agent = [
        'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.7113.93 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4482.0 Safari/537.36 Edg/92.0.874.0',
    ]
    # Derive the upper bound from the list so adding or removing a
    # User-Agent string can never produce an out-of-range index.
    headers = {'User-Agent': user_agent[randint(0, len(user_agent) - 1)]}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Network-level failure (DNS, connection, timeout, ...): report
        # "no page" to the caller instead of crashing.
        return None
    if response.status_code == 200:
        return response.text
    return None
def show_cve_content(res):
    """Yield one record dict for every row the pattern finds in the page.

    Args:
        res: HTML text of a listing page. May be None or empty (for
            example when the download failed), in which case nothing is
            yielded.

    Yields:
        dict with the keys 'cve_id', 'vul_name', 'cul_type', 'cve_date'
        and 'cvs_level', filled from the five capture groups in order.
        Every value except 'vul_name' has surrounding whitespace stripped.
    """
    if not res:
        # The fetch step returns None on failure; re would raise TypeError
        # on a non-string, so treat "no page" as "no records".
        return
    # NOTE(review): every capture group here is a lazy `(.*?)` with no
    # closing delimiter after it, so the HTML tag literals that presumably
    # anchored each field seem to be missing from this pattern. Confirm
    # against the live page markup before relying on the extracted values.
    match = re.compile('.*?target="_blank">(.*?).*?(.*?) .*?(.*?).*?nowrap="nowrap">(.*?)' +
                       '.*?(.*?).*? ', re.S)
    for content in match.findall(res):
        yield {
            'cve_id': content[0].strip(),
            'vul_name': content[1],
            'cul_type': content[2].strip(),
            'cve_date': content[-2].strip(),
            'cvs_level': content[-1].strip(),
        }
def save_content_to_text(content):
    """Append one line of text to 'ali_cve.txt' in the working directory.

    Args:
        content: String to write; a trailing newline is added.

    The file is opened as UTF-8 explicitly because the records contain
    Chinese text and the platform default encoding varies between systems.
    """
    # Append-only: nothing is read back, so plain 'a' is sufficient.
    with open('ali_cve.txt', 'a', encoding='utf-8') as f:
        f.write(content + '\n')
def main():
    """Fetch page 1 of the listing and append each parsed record to disk.

    Each record dict is written as its str() representation, one per line.
    If the page cannot be downloaded, a message is printed and nothing is
    written.
    """
    url = 'https://avd.aliyun.com/nvd/list?page=1'
    html = get_onepage_content(url)
    if html is None:
        # The fetch helper signals failure (non-200 or request error)
        # with None; stop here rather than handing None to the parser.
        print('Failed to fetch ' + url)
        return
    for content in show_cve_content(html):
        save_content_to_text(str(content))


if __name__ == "__main__":
    main()
爬取的结果如下:
{'cve_id': 'CVE-2021-26084', 'vul_name': 'Confluence远程代码执行漏洞', 'cul_type': '未定义', 'cve_date': '2021-08-26', 'cvs_level': '未知'}
{'cve_id': 'CVE-2021-39136', 'vul_name': 'baserCMS Management System cross site scripting', 'cul_type': 'CWE-79', 'cve_date': '2021-08-26', 'cvs_level': '高危'}
{'cve_id': 'CVE-2021-39160', 'vul_name': 'nbgitpuller up to 0.10.1 code injection', 'cul_type': 'CWE-94', 'cve_date': '2021-08-26', 'cvs_level': '严重'}
{'cve_id': 'CVE-2021-28612', 'vul_name': 'Adobe After Effects up to 18.2 out-of-bounds read', 'cul_type': 'CWE-125', 'cve_date': '2021-08-25', 'cvs_level': '中危'}
{'cve_id': 'CVE-2021-28614', 'vul_name': 'Adobe After Effects up to 18.2 out-of-bounds read', 'cul_type': 'CWE-125', 'cve_date': '2021-08-25', 'cvs_level': '中危'}
{'cve_id': 'CVE-2021-28617', 'vul_name': 'Adobe Animate up to 21.0.6 out-of-bounds read', 'cul_type': 'CWE-125', 'cve_date': '2021-08-25', 'cvs_level': '中危'}
{'cve_id': 'CVE-2021-28620', 'vul_name': 'Adobe Animate up to 21.0.6 heap-based overflow', 'cul_type': 'CWE-122', 'cve_date': '2021-08-25', 'cvs_level': '高危'}
{'cve_id': 'CVE-2021-28621', 'vul_name': 'Adobe Animate up to 21.0.6 out-of-bounds read', 'cul_type': '未定义', 'cve_date': '2021-08-25', 'cvs_level': '高危'}
{'cve_id': 'CVE-2021-28622', 'vul_name': 'Adobe Animate up to 21.0.6 out-of-bounds write', 'cul_type': 'CWE-787', 'cve_date': '2021-08-25', 'cvs_level': '高危'}
{'cve_id': 'CVE-2021-28629', 'vul_name': 'Adobe Animate up to 21.0.6 heap-based overflow', 'cul_type': 'CWE-122', 'cve_date': '2021-08-25', 'cvs_level': '高危'}
{'cve_id': 'CVE-2021-28630', 'vul_name': 'Adobe Animate up to 21.0.6 out-of-bounds read', 'cul_type': 'CWE-125', 'cve_date': '2021-08-25', 'cvs_level': '低危'}
{'cve_id': 'CVE-2021-28600', 'vul_name': 'Adobe After Effects up to 18.2 out-of-bounds read', 'cul_type': 'CWE-125', 'cve_date': '2021-08-25', 'cvs_level': '中危'}
{'cve_id': 'CVE-2021-28602', 'vul_name': 'Adobe After Effects up to 18.2 memory corruption', 'cul_type': 'CWE-788', 'cve_date': '2021-08-25', 'cvs_level': '高危'}
{'cve_id': 'CVE-2021-28603', 'vul_name': 'Adobe After Effects up to 18.2 heap-based overflow', 'cul_type': 'CWE-122', 'cve_date': '2021-08-25', 'cvs_level': '高危'}
{'cve_id': 'CVE-2021-28605', 'vul_name': 'Adobe After Effects up to 18.2 buffer overflow', 'cul_type': 'CWE-788', 'cve_date': '2021-08-25', 'cvs_level': '高危'}
{'cve_id': 'CVE-2021-28607', 'vul_name': 'Adobe After Effects up to 18.2 heap-based overflow', 'cul_type': '未定义', 'cve_date': '2021-08-25', 'cvs_level': '高危'}
{'cve_id': 'CVE-2021-28610', 'vul_name': 'Adobe After Effects up to 18.2 heap-based overflow', 'cul_type': '未定义', 'cve_date': '2021-08-25', 'cvs_level': '高危'}
{'cve_id': 'CVE-2021-28611', 'vul_name': 'Adobe After Effects up to 18.2 out-of-bounds read', 'cul_type': 'CWE-125', 'cve_date': '2021-08-25', 'cvs_level': '中危'}
{'cve_id': 'CVE-2021-28615', 'vul_name': 'Adobe After Effects up to 18.2 out-of-bounds read', 'cul_type': 'CWE-125', 'cve_date': '2021-08-25', 'cvs_level': '中危'}
{'cve_id': 'CVE-2021-28616', 'vul_name': 'Adobe After Effects up to 18.2 out-of-bounds read', 'cul_type': 'CWE-125', 'cve_date': '2021-08-25', 'cvs_level': '中危'}
{'cve_id': 'CVE-2021-28632', 'vul_name': 'Adobe Acrobat Reader use after free', 'cul_type': 'CWE-416', 'cve_date': '2021-08-25', 'cvs_level': '高危'}
{'cve_id': 'CVE-2021-3486', 'vul_name': 'Corel Parallels Desktop 安全漏洞', 'cul_type': 'CWE-79', 'cve_date': '2021-08-25', 'cvs_level': '未知'}
{'cve_id': 'CVE-2021-35987', 'vul_name': 'Adobe Acrobat 缓冲区错误漏洞', 'cul_type': 'CWE-125', 'cve_date': '2021-08-25', 'cvs_level': '未知'}
{'cve_id': 'CVE-2021-28697', 'vul_name': 'Xen安全漏洞', 'cul_type': '未定义', 'cve_date': '2021-08-25', 'cvs_level': '未知'}
{'cve_id': 'CVE-2021-28699', 'vul_name': 'Xen安全漏洞', 'cul_type': '未定义', 'cve_date': '2021-08-25', 'cvs_level': '未知'}
{'cve_id': 'CVE-2021-28700', 'vul_name': 'Xen安全漏洞', 'cul_type': '未定义', 'cve_date': '2021-08-25', 'cvs_level': '未知'}
{'cve_id': 'CVE-2021-28698', 'vul_name': 'Xen安全漏洞', 'cul_type': '未定义', 'cve_date': '2021-08-25', 'cvs_level': '未知'}
{'cve_id': 'CVE-2021-28696', 'vul_name': 'Xen安全漏洞', 'cul_type': '未定义', 'cve_date': '2021-08-25', 'cvs_level': '未知'}
{'cve_id': 'CVE-2021-28694', 'vul_name': 'Xen安全漏洞', 'cul_type': '未定义', 'cve_date': '2021-08-25', 'cvs_level': '未知'}
{'cve_id': 'CVE-2021-28695', 'vul_name': 'Xen安全漏洞', 'cul_type': '未定义', 'cve_date': '2021-08-25', 'cvs_level': '未知'}