fofa爬虫

代码

# -*- coding: utf-8 -*-
import requests
from lxml import etree
import base64
import time
from urllib.parse import quote
import re

# --- crawler configuration --------------------------------------------------

# Seconds to pause between page fetches so the IP does not get banned.
TimeSleep = 5

# The FOFA search query to submit.
SearchKEY = ''

# Inclusive page range to crawl.
StartPage = 1
StopPage = 5

# Paste your FOFA session cookie here (placeholder text: "enter cookie value").
cookie = "输入cookie值"

# HTTP header values must be latin-1 encodable, so the cookie is
# round-tripped through UTF-8 bytes before being placed in the header.
headers = {
    "Connection": "keep-alive",
    "cookie": cookie.encode("utf-8").decode("latin1"),
}


# Build the base64-encoded query parameter FOFA expects, URL-quoted so it is
# safe to embed directly in the URL.
searchbs64 = quote(str(base64.b64encode(SearchKEY.encode()), encoding='utf-8'))
print("爬取页面为:https://fofa.info/result?qbase64=" + searchbs64)

# Fetch the first result page; the timeout keeps a dead connection from
# hanging the script forever.
html = requests.get(url="https://fofa.info/result?qbase64=" + searchbs64,
                    headers=headers, timeout=30).text
# Parse the HTML string into an lxml element tree so xpath() can be used.
tree = etree.HTML(html)

try:
    # The pagination widget lists page numbers; the last entry is the
    # total page count.
    pagenum = tree.xpath('//li[@class="number"]/text()')[-1]
except (IndexError, AttributeError) as e:
    # IndexError: no pagination elements found (empty results, expired
    # cookie, or a site layout change).  AttributeError: etree.HTML()
    # returned None for an empty/unparseable response.
    print(e)
    pagenum = '0'
    print("fofa页码数为0")
print("该关键字存在页码: " + pagenum)  # report the detected page count



# Crawl pages StartPage..min(StopPage, pagenum) inclusive and write each
# result host URL to hello_world.txt, one per line.  `with` guarantees the
# file is closed even if a request raises.  (Bug fix: the original
# range(StartPage, pagenum) excluded the final page; the StopPage bound is
# now folded into the range instead of a mid-loop break.)
lastpage = min(int(pagenum), int(StopPage))
with open("hello_world.txt", "w", encoding='UTF-8') as doc:
    for i in range(int(StartPage), lastpage + 1):
        print("Now write " + str(i) + " page")
        rep = requests.get('https://fofa.info/result?qbase64=' + searchbs64
                           + "&page=" + str(i) + "&page_size=10",
                           headers=headers, timeout=30)
        tree = etree.HTML(rep.text)
        # Each result row exposes the target URL in its hsxa-host span.
        urllist = tree.xpath('//span[@class="hsxa-host"]/a/@href')
        for item in urllist:
            doc.write(item + "\n")
            print(item)
        # Rate-limit between pages; no need to sleep after the last one.
        if i < lastpage:
            time.sleep(TimeSleep)

获取cookie的办法

1、F12

2、点网络(Google Chrome为例)

fofa爬虫_第1张图片

3、刷新网站

fofa爬虫_第2张图片

你可能感兴趣的:(python,爬虫)