# -*- coding: utf-8 -*-
import requests
from lxml import etree
import base64
import time
from urllib.parse import quote
import re
# --- Crawl configuration ---
TimeSleep = 5 #seconds to sleep between page requests, to avoid getting the IP banned
SearchKEY = '' #FOFA search query string (will be base64-encoded below)
StartPage=1#first result page to crawl (1-based)
StopPage=5#last result page to crawl (inclusive)
cookie="输入cookie值" #NOTE: replace this placeholder (lit. "enter cookie value") with your fofa.info session cookie
headers = {
"Connection": "keep-alive",
# utf-8 -> latin-1 round-trip lets non-ASCII cookie bytes survive HTTP header encoding
"cookie": cookie.encode("utf-8").decode("latin1")
}
# Build the base64-encoded query FOFA expects in its qbase64 URL parameter.
searchbs64 = quote(str(base64.b64encode(SearchKEY.encode()), encoding='utf-8'))
print("爬取页面为:https://fofa.info/result?qbase64=" + searchbs64)
# Fetch the first result page to discover the total page count.
# timeout added so a hung connection cannot stall the script forever.
html = requests.get(url="https://fofa.info/result?qbase64=" + searchbs64,
                    headers=headers, timeout=30).text
tree = etree.HTML(html)  # parse the HTML string into an lxml _Element tree
try:
    # The last <li class="number"> in the pagination bar holds the page count.
    pagenum = tree.xpath('//li[@class="number"]/text()')[-1]
except (IndexError, TypeError) as e:
    # No pagination element found (empty result set, expired cookie, or a
    # site layout change) — treat as zero pages instead of crashing.
    print(e)
    pagenum = '0'
    print("fofa页码数为0")
print("该关键字存在页码: " + pagenum)  # report the discovered page count
# Crawl from StartPage up to the smaller of StopPage and the real page count.
# FIX: the original used range(StartPage, pagenum), whose exclusive upper
# bound silently skipped the last available page.
last_page = min(int(pagenum), int(StopPage))
# FIX: 'with' guarantees the output file is closed even if a request raises
# mid-loop (the original leaked the handle on any exception).
with open("hello_world.txt", "w", encoding='UTF-8') as doc:
    for i in range(int(StartPage), last_page + 1):
        print("Now write " + str(i) + " page")
        rep = requests.get('https://fofa.info/result?qbase64=' + searchbs64
                           + "&page=" + str(i) + "&page_size=10",
                           headers=headers, timeout=30)
        tree = etree.HTML(rep.text)
        # Each result's host link lives in <span class="hsxa-host"><a href=...>.
        urllist = tree.xpath('//span[@class="hsxa-host"]/a/@href')
        for item in urllist:
            doc.write(item + "\n")
            print(item)
        if i == last_page:
            break  # done — skip the final rate-limit sleep
        time.sleep(TimeSleep)  # throttle between pages to avoid an IP ban
# How to obtain the cookie value (Google Chrome as an example):
# 1. Press F12 to open DevTools
# 2. Open the Network tab
# 3. Refresh the site and copy the cookie from a request's headers