Heibanke Crawler Challenge - Level 4

  1. Level 4 adds login verification and password retrieval. The main page:
(screenshot: level 4 main page)
  2. But reaching the main page requires logging in first:
(screenshot: login page)
  3. Once in, you can try an arbitrary password - it fails, of course, and the system then points you to a password table:
(screenshot: wrong-password prompt)
  4. The password table:
(screenshot: password table)
  5. Then comes the tedious decryption. The overall flow is much like the earlier levels - the same simulated login - except this level adds the password-retrieval step. I was lost at first; a hint from someone online revealed that the password is 100 characters long, and since the pages load extremely slowly, the whole process is a grind.
    The level is mainly meant to exercise simulated login and multi-threaded crawling (threads make it faster), though plain brute force also works. Three versions follow for reference; they all share the same login skeleton, sketched right below.
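    All three versions start the same way: fetch the login page to pick up Django's csrftoken cookie, then post it back as csrfmiddlewaretoken together with the credentials. A minimal sketch of that shared skeleton (credentials are placeholders):

import requests

LOGIN_URL = 'http://www.heibanke.com/accounts/login'

def login(username, password):
    s = requests.session()
    # the login page sets a csrftoken cookie; Django expects it echoed
    # back in the form as csrfmiddlewaretoken
    token = s.get(LOGIN_URL, timeout=30).cookies['csrftoken']
    s.post(LOGIN_URL, timeout=30, data={
        'csrfmiddlewaretoken': token,
        'username': username,
        'password': password,
    })
    return s  # the session now carries the authenticated cookies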
    one.py (single-threaded: keep fetching password fragments until all 100 positions are filled, then test)
import requests
from lxml import etree
import re


se = requests.session()


headers = {
    "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"
}

class HBK():
    def __init__(self):
        self.login_url = "http://www.heibanke.com/accounts/login"
        self.username = "whaike"
        self.password = "12345654321"
        # indexes 1..100 hold the password character at each position
        self.passwords = ['' for i in range(101)]
        self.pwd = ''

    # Fetch the csrf token from the login page (before logging in)
    def getCsrf(self):
        res = se.get(url=self.login_url,headers=headers,timeout=30).text
        tree = etree.HTML(res)
        self.csrf = tree.xpath('/html/body/div/div/div[2]/form/input[@name="csrfmiddlewaretoken"]/@value')[0]

    # Log in
    def login(self):
        self.getCsrf()
        data = {
            "csrfmiddlewaretoken": self.csrf,
            "username": self.username,
            "password": self.password
        }
        se.post(url=self.login_url, headers=headers, data=data, timeout=30)
        print('Logged in')

    # Fetch the csrf token issued after login, i.e. the one for the level-4 form
    def getNCsrf(self):
        url = 'http://www.heibanke.com/lesson/crawler_ex03/'
        res = se.get(url,headers=headers,timeout=30).text
        tree = etree.HTML(res)
        csrf = tree.xpath('//input[1]/@value')[0]
        return csrf

    # Test whether the assembled password is correct
    def guesspwd(self):
        url = 'http://www.heibanke.com/lesson/crawler_ex03/'
        csrf = self.getNCsrf()
        data = {
            "csrfmiddlewaretoken": csrf,
            "username": "whaike",
            "password": self.pwd
        }
        res = se.post(url, headers=headers, data=data, timeout=30)
        if int(res.status_code) == 200:
            self.h3 = re.findall('<h3>(.*?)</h3>', res.text)
            return True
        else:
            return False

    # Keep re-fetching the first page of random values until the password is
    # 100 characters long, then guess; on failure keep collecting, on success stop
    def getPasswords(self):
        print('Fetching page 1')
        url = 'http://www.heibanke.com/lesson/crawler_ex03/pw_list/?page=1'
        res = se.get(url, headers=headers, timeout=30).text
        tree = etree.HTML(res)
        trs = tree.xpath('/html/body/div/div/div[2]/table/tr')[1:]
        for tr in trs:
            p1 = tr.xpath('td[1]/text()')[0]   # position
            p = int(re.findall(r'\d+', p1)[0])  # stray characters sometimes precede the digits, so extract and convert
            w = tr.xpath('td[2]/text()')[0]    # value
            self.passwords[p] = w
        self.pwd = ''.join(self.passwords)
        length = len(self.pwd)  # current password length
        print('Current password: %s, length %d' % (self.pwd, length))
        if length == 100:
            print('All 100 positions filled, guessing...')
            if self.guesspwd():
                print('Guess succeeded, the password is: %s' % self.pwd)
            else:
                print('Guess failed, continuing')
                self.getPasswords()
        else:
            # password not yet 100 characters: fetch page 1 again and merge in the new values
            self.getPasswords()  # recursion


if __name__ == '__main__':
    print('Starting level 4')
    spider = HBK()
    spider.login()
    spider.getPasswords()
    print(spider.h3)
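A caveat on one.py: getPasswords calls itself after every page fetch, and since each pass fills only a handful of the 100 positions, a slow run stacks frames up against CPython's default recursion limit (about 1000). A loop avoids that; a minimal sketch, where fetch_positions is a hypothetical helper wrapping the table-scraping code above:

def crack(spider):
    # iterative variant: loop instead of recursing on every fetch
    passwords = ['' for _ in range(101)]
    while True:
        # hypothetical helper: yields (position, value) pairs from pw_list page 1
        for pos, val in spider.fetch_positions():
            passwords[pos] = val
        spider.pwd = ''.join(passwords)
        if len(spider.pwd) == 100 and spider.guesspwd():
            return spider.pwd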

two.py (multi-threaded version, adapted from someone online)

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date    : 2017-09-02 22:25:21
# @Author  : bb ([email protected])
# @Word    : python can change world!
# @Version : python3.6
import re
import requests
from bs4 import BeautifulSoup
import threading
from queue import Queue


dict1 = {}    # position -> character
values = []   # characters ordered by position
web1 = "http://www.heibanke.com/accounts/login"
web2 = "http://www.heibanke.com/lesson/crawler_ex03/pw_list/"
web3 = "http://www.heibanke.com/lesson/crawler_ex03/"
queuewz = Queue()  # positions
queuemm = Queue()  # values


class mythreads(threading.Thread):

    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        work()
        while not queuemm.empty():
            try:
                dict1[str(queuewz.get())] = queuemm.get()
                print(dict1)
                print("Dict size: %s" % len(dict1))
                if len(dict1) == 100:
                    print("Got all 100!")
                    for i in range(1, 101):
                        values.append(dict1[str(i)])
                    zzmm = ''.join(values)
                    print("The password is %s" % zzmm)
                    print("Logging in.......")
                    s = login_get()
                    # include the csrf token from the session cookie, as Django expects
                    dataWebsite1 = {'username': 'user',
                                    'password': zzmm,
                                    'csrfmiddlewaretoken': s.cookies['csrftoken']}
                    res = s.post(web3, data=dataWebsite1).text
                    if u'恭喜' in res:  # "congratulations" on the success page
                        title = re.findall("<title>(.*?)</title>", res)
                        word = re.findall("<h1>(.*?)</h1>", res)
                        word2 = re.findall("<h3>(.*?)</h3>", res)
                        html = re.findall('<a href="(.*?)">下一关</a>', res)  # link labelled "next level"
                        print('\n'.join([title[0], word[0], word2[0],
                                         'The next level is at',
                                         'http://www.heibanke.com' + html[0]]))
                        break
                    else:
                        print("Something is off with the page! Try logging in manually with the recovered password!")
                        break
                else:
                    main()
            except IndexError:
                print("List exhausted, next page!")


def login_get():
    try:
        s = requests.Session()
        r = s.get(web1)  # visit the login page to obtain the csrftoken needed for login
        token1 = r.cookies['csrftoken']  # save the csrftoken
        # submit the csrftoken in the csrfmiddlewaretoken field
        dataWebsite1 = {'username': 'user',
                        'password': 'password',
                        'csrfmiddlewaretoken': token1}
        res = s.post(web1, data=dataWebsite1)
    except KeyError as e:
        pass
    return s


def get_html(s):
    r = s.get(web2)
    res = r.text
    return res


def get_dict(res):
    soup = BeautifulSoup(res, "html.parser")
    for a in soup.find_all('td', attrs={'title': 'password_pos'}):
        wz = a.string  # position
        queuewz.put(wz)
    for b in soup.find_all('td', attrs={'title': 'password_val'}):
        mm = b.string  # value
        queuemm.put(mm)


def work():
    res = get_html(s)
    get_dict(res)


def main():
    global s
    s = login_get()
    threads = []
    threads_count = 10
    for i in range(threads_count):
        threads.append(mythreads())
    for t in threads:
        t.start()
    for t in threads:
        t.join()


if __name__ == '__main__':
    main()

three.py (multi-threaded, with the thread count capped to respect the server's rate limit)

import re
import requests
from threading import Thread
import time

def print_run_time(func):
    """Decorator: print the wrapped function's running time."""
    def wrapper(self, *args, **kw):
        local_time = time.time()
        func(self, *args, **kw)
        print('run time is {:.2f}:'.format(time.time() - local_time))
    return wrapper

class hbk_crawler(object):
    """黑板客爬虫闯关"""
    def __init__(self): pass

    def login(self):
        """Log in (level 4)"""
        self.url = 'http://www.heibanke.com/lesson/crawler_ex03'
        self.login_url = 'http://www.heibanke.com/accounts/login/?next=/lesson/crawler_ex03'
        self.s = requests.session()
        print("Logging in to level 4....")
        try:
            self.csrftoken = self.s.get(self.login_url).cookies['csrftoken']
        except:
            print("Network error, please retry...")
            exit()
        self.payload = {'username': 'test', 'password': 'test123',
                        'csrfmiddlewaretoken': self.csrftoken}
        # the token rotates after login, so keep the fresh one for later posts
        self.payload['csrfmiddlewaretoken'] = self.s.post(
            self.login_url, self.payload).cookies['csrftoken']
        print("Logged in....")
        return None

    def parseurl(self, url):
        """Parse the page, extracting password positions and values"""
        while self.count < 100:
            response = self.s.get(url)
            if response.ok:
                content = response.text
                pos_pattern = r'_pos.>(.*)</td>'
                val_pattern = r'_val.>(.*)</td>'
                pos_list = re.findall(pos_pattern, content)
                val_list = re.findall(val_pattern, content)
                for pos, val in zip(pos_list, val_list):
                    if pos not in self.pw_dict:
                        self.pw_dict[pos] = val
                        self.count = self.count + 1
                print(str(self.count) + '%' + self.count // 2 * '*')  # crude progress bar

    @print_run_time
    def ex04(self, *args, **kw):
        """Level 4: find the password. Adds login verification, CSRF
        protection, a 100-character password and slower responses."""
        self.count = 0
        self.login()
        self.pw_dict = {}
        pw_url = 'http://www.heibanke.com/lesson/crawler_ex03/pw_list'
        # thread count: the server answers at most 2 requests per 15 seconds,
        # anything more gets a 404
        n = 2
        threads = [Thread(target=self.parseurl, args=(pw_url,))
                   for i in range(n)]
        for t in threads:
            print(t.name, 'start...')
            t.start()
        for t in threads:
            t.join()
        self.pw_list = ['' for _ in range(101)]
        for pos in self.pw_dict.keys():
            self.pw_list[int(pos)] = self.pw_dict[pos]
        # keep the password as a string: int() would drop any leading zeros
        password = ''.join(self.pw_list)
        self.payload['password'] = password
        response = self.s.post(self.url, self.payload)
        pattern = r'<h3>(.*)</h3>'
        result = re.findall(pattern, response.text)
        result2 = re.findall('<a href="(.*?)">下一关</a>', response.text)  # link labelled "next level"
        print(result[0])
        print(result2)


if __name__ == '__main__':
    Hbk_crawler = hbk_crawler()
    Hbk_crawler.ex04()

To sum up: the three approaches share the same underlying idea; multi-threading mainly speeds up the data collection. I'll cover multi-threading in a later post - in the meantime, the official docs or Liao Xuefeng's tutorial are worth a read. Having to simulate the login every time is tedious, so keep the code free of redundancy. For extracting the result message, all versions use re. If a simple `"text" in XXX` check doesn't fit, you can instead capture the error message into a result variable: an empty result means the guess was right, a non-empty one means there was an error (a sketch follows below). And since the pages load slowly, print some progress output so you don't assume the code has hung.
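For that error-message variant, a minimal sketch (the h3 selector matches what the versions above scrape; the failure keyword is an assumption - adjust it to the page's actual wording):

import re

def guess_ok(html):
    msgs = re.findall(r'<h3>(.*?)</h3>', html)
    # assumed failure keyword '错误' ("wrong"); an empty error list means success
    errors = [m for m in msgs if '错误' in m]
    return not errors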

  • For more code, see my GitHub
