爬虫闯关:GlidedSky基础题

网站:http://glidedsky.com

第一题:计算网页上所有数字的和

注册进去就看到了

点进去待爬取的网站,发现全是数字。这个第一题确实简单,没什么好讲的。

爬虫闯关:GlidedSky基础题_第1张图片

第二题:同样题,请求1000次

爬虫闯关:GlidedSky基础题_第2张图片

这个题也是一样的,最简单的做法就是把上面写的代码改一改就可以了,不过这样的话速度太慢,可以试着自己优化一下:加线程或者直接使用协程,都是很不错的选择。我觉得协程应该能更快一点,不过没做具体测试。

运行的结果,这样是直接改的,没有加任何线程或协程,时间有点长。很基础,但是也要看你怎么优化了。

应该还可以进一步优化;后面我偷懒,直接用协程重写了一版。

#!/usr/bin/env python 
# -*- coding: utf-8 -*- 
# @Time : 2019/8/18 0:33 
# @Author : zhao.jia
# @Site :  
# @File : glide_test.py 
# @Software: PyCharm

import requests
import tools
from lxml import etree
import aiohttp
import asyncio
import datetime
import time
from requests.adapters import HTTPAdapter


class TestGlidedsky:
    """Solutions for GlidedSky crawler challenges 1 and 2.

    Both challenges ask for the sum of all numbers rendered on one or more
    pages.  Challenge 2 repeats challenge 1 across 1000 pages; it is solved
    here both sequentially (``basic_two``) and with asyncio (``sum_async_count``).
    """

    def __init__(self):
        # Raw request headers copied from the browser's dev tools; converted
        # to a dict via tools.headers_to_dict() below.
        # NOTE(review): the Cookie / session tokens are account-specific and
        # will expire — replace them with your own before running.
        self.headers = """
            Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
            Accept-Encoding: gzip, deflate
            Accept-Language: zh-CN,zh;q=0.9
            Cache-Control: max-age=0
            Connection: keep-alive
            Cookie: _ga=GA1.2.1425271689.1566058842; _gid=GA1.2.586445152.1566058842; Hm_lvt_020fbaad6104bcddd1db12d6b78812f6=1566058842,1566106841; Hm_lpvt_020fbaad6104bcddd1db12d6b78812f6=1566129989; _gat_gtag_UA_75859356_3=1; XSRF-TOKEN=eyJpdiI6IjM4SmpWMlwvaWxPQklreFVaMDFXVFhRPT0iLCJ2YWx1ZSI6IjdoMUFJaVF6YUVvUUNDZU1TaERsN0FVK0dRdTdORW9QUlwvNDlMXC9uXC9IdjdCZ2JCQVhiMXNEV2JKQnI5UXVIMHAiLCJtYWMiOiIyMWMyYzc1MzM3MWQyZTMxNDQwZjA5ZTUxNDZkOThmNTAyOWQwYTQzZDQyZTc4M2Q4YjNlZTI3YjYzZjgwNzA1In0%3D; glidedsky_session=eyJpdiI6Ik1rRUMrXC8yMlVkOEZlSEZja24zdmJRPT0iLCJ2YWx1ZSI6IjRoWG84K1MrM3NLbnlRVytrUVRHd1ZqWWtkdkdyeUtwOTBKdDFWTnl4THdkS1hcL2dmRzA1c1JJRDZSaHk2NlhKIiwibWFjIjoiNmQ2MmJhNWFlNzZiOWEwY2NiMDM1ZTBkZGE2MmNiNGQwNWU4OGJmOTU2OWQxNmU2NmM1MjE1ZmI0NGQ3MjllNyJ9
            Host: glidedsky.com
            Referer: http://glidedsky.com/login
            Upgrade-Insecure-Requests: 1
            User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36
        """
        self.sess = requests.session()
        self.sess.headers = tools.headers_to_dict(self.headers)
        # Running total shared by the coroutines started in sum_async_count().
        self.sum_count_2 = 0
        # Retry transient connection failures up to 3 times per scheme.
        self.sess.mount('http://', HTTPAdapter(max_retries=3))
        self.sess.mount('https://', HTTPAdapter(max_retries=3))
        self.sess.verify = False

    def basic_one(self):
        """Challenge 1: sum every number on the single challenge page."""
        res = self.sess.get(url="http://glidedsky.com/level/web/crawler-basic-1")
        res_html = etree.HTML(res.text)
        nums = res_html.xpath('//div[@class="col-md-1"]/text()')
        sum_count = sum(int(num.strip()) for num in nums)
        # BUG FIX: the original printed `"sum=" + sum_count`, which raises
        # TypeError (str + int). Use an f-string instead.
        print(f"sum={sum_count}")

    def basic_two(self):
        """Challenge 2: sum the numbers across pages 1-1000, sequentially."""
        sum_count = 0
        for page in range(1, 1001):
            res = self.sess.get(f"http://glidedsky.com/level/web/crawler-basic-2?page={page}")
            res_html = etree.HTML(res.text)
            nums = res_html.xpath('//div[@class="col-md-1"]/text()')
            sum_count += sum(int(num.strip()) for num in nums)
        print(sum_count)

    async def basic_two_2(self, url):
        """Fetch one page asynchronously and add its numbers to self.sum_count_2."""
        async with aiohttp.ClientSession() as session:
            async with session.get(url, headers=tools.headers_to_dict(self.headers)) as resp:
                res = await resp.text()
                res_html = etree.HTML(res)
                nums = res_html.xpath('//div[@class="col-md-1"]/text()')
                for num in nums:
                    self.sum_count_2 += int(num.strip())

    def sum_async_count(self):
        """Challenge 2 via asyncio: fetch pages 1-1000 concurrently.

        The pages are fetched in two batches (1-499, then 500-1000) so that
        we do not open ~1000 connections to the site at once.
        """
        loop = asyncio.get_event_loop()
        for start, stop in ((1, 500), (500, 1001)):
            tasks = [
                asyncio.ensure_future(
                    self.basic_two_2(f"http://glidedsky.com/level/web/crawler-basic-2?page={i}"))
                for i in range(start, stop)
            ]
            loop.run_until_complete(asyncio.gather(*tasks))
        print(self.sum_count_2)


if __name__ == '__main__':
    # BUG FIX: the original `if` suite contained only comments, which is a
    # SyntaxError in Python (a suite needs at least one statement).
    # The commented lines below are kept as usage examples — uncomment one
    # of the two timed variants to run it.
    pass
    # Challenge 2, sequential version:
    # starttime = datetime.datetime.now()
    # TestGlidedsky().basic_two()
    # endtime = datetime.datetime.now()
    # count_time_1 = (endtime - starttime).seconds
    # print(count_time_1)
    # Challenge 2, coroutine version:
    # starttime_2 = datetime.datetime.now()
    # TestGlidedsky().sum_async_count()
    # endtime_2 = datetime.datetime.now()
    # count_time_2 = (endtime_2 - starttime_2).seconds
    # print(count_time_2)

第三题:还是求和。

不过这次封禁ip,每个ip只能访问一次,这个题就有点恶心了,只能去找代理ip了,找免费的就行,想办法多重试。

爬虫闯关:GlidedSky基础题_第3张图片

#!/usr/bin/env python 
# -*- coding: utf-8 -*- 
# @Time : 2019/8/27 11:00 
# @Author : Andrew
# @Site :  
# @File : python-abu.py 
# @Software: PyCharm

#! -*- encoding:utf-8 -*-

from urllib import request
import base64
from lxml import etree
import time
import requests
from requests.adapters import HTTPAdapter


class test:
    """Solution for GlidedSky challenge 3 (IP-blocked pages).

    Each IP may only fetch a page once, so ``abu_test``/``get_html`` fetch
    all 1000 pages through a rotating proxy and cache them to disk, and
    ``parse_html`` re-reads the cached files to compute the final sum.
    """

    def __init__(self):
        self.sess = requests.session()
        # Retry transient connection failures up to 3 times per scheme.
        self.sess.mount('http://', HTTPAdapter(max_retries=3))
        self.sess.mount('https://', HTTPAdapter(max_retries=3))
        self.sess.verify = False

    def abu_test(self):
        """Build the Abuyun dynamic-proxy auth header and start fetching."""
        # Proxy tunnel credentials (redacted) — Basic auth over the
        # Proxy-Authorization header, as required by the provider.
        proxyUser = "H2T*****22WD"
        proxyPass = "7****10526D3F"
        proxy_dict = {'http': "http-dyn.abuyun.com:9020"}
        auth = f"{proxyUser}:{proxyPass}"
        auth = base64.b64encode(auth.encode('utf8'))
        proxy_header = {"Proxy-Authorization": 'Basic ' + auth.decode()}
        self.get_html(proxy_dict, proxy_header)

    def get_html(self, proxy_dict, proxy_header):
        """Fetch pages 1-1000 through the proxy, caching each to disk.

        A failed or non-200 response leaves ``count`` unchanged so the same
        page is retried (each proxy exit IP only gets one shot).
        Returns the running sum once page 1000 has been processed.
        """
        count = 1
        sum_count = 0

        # Browser headers with the account session cookie.
        # NOTE(review): cookie values are account-specific and will expire.
        headers = """
                Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
                Accept-Encoding: gzip, deflate
                Accept-Language: zh-CN,zh;q=0.9
                Cache-Control: max-age=0
                Cookie: _ga=GA1.2.1251062763.1566609395; Hm_lvt_020fbaad6104bcddd1db12d6b78812f6=1566609396,1566627265; _gid=GA1.2.1809641921.1566875827; _gat_gtag_UA_75859356_3=1; XSRF-TOKEN=eyJpdiI6IkNpMHk0SHlDSXIrWHU4MTBIaW96blE9PSIsInZhbHVlIjoiMXpzXC9GRmZGekxQYW5wcUt0ZU0xQ0l0MWVnNHdKWHo5XC9JNTRnZ0c0UWJlYjZlaDVhU1BNRGxENGNoWjBpdkE0IiwibWFjIjoiYTVjYmJjMzY3OTNiNTJjMDE5MjZhNmEzNDIwNGFmZDYwYzk5Yjg5ZjViYmExMzQwMjVkMTkzNDcyMmJjZmYxMyJ9; glidedsky_session=eyJpdiI6ImJ4aHA3QllGZE9PTlRnbTByZnNNOFE9PSIsInZhbHVlIjoiMGt6bUdqbDBcL2JSRERXbVFyMEdHNDArZmtOTHdQOFRidVlRUTFvMXRWajAzNUlja3gyN3JmV1U1QkVHUHBVU3UiLCJtYWMiOiI0OTY1ZGZmZDgwMTU4YTliNjM0NWVhZTU5MzRhNGQwYmMwM2YzNDc2ZGRkZjVmZDg0ZjQwMGUwODkyNjUwMmY3In0%3D; Hm_lpvt_020fbaad6104bcddd1db12d6b78812f6=1566875832
                Host: glidedsky.com
                Proxy-Connection: keep-alive
                Referer: http://glidedsky.com/login
                Upgrade-Insecure-Requests: 1
                User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36
                """
        import tools  # project-local helper; kept as a local import as in the original
        headers = tools.headers_to_dict(headers)
        headers.update(proxy_header)
        while True:
            try:
                res = self.sess.get(f"http://glidedsky.com/level/web/crawler-ip-block-1?page={count}", headers=headers,
                                   proxies=proxy_dict, timeout=10)
            except Exception as e:
                # Proxy exits are flaky: log and retry the same page.
                print("异常")
                print(e)
                continue
            file_name = f'glidedsky_{count}.html'
            if res.status_code == 200:
                # Cache the raw page so parse_html() can re-run offline.
                with open(file_name, 'w', encoding='utf8') as f:
                    f.write(res.text)
                res_html = etree.HTML(res.text)
                nums = res_html.xpath('//div[@class="col-md-1"]/text()')
                if nums:
                    print("zhaodao")
                    for num in nums:
                        sum_count += int(num.strip())
                    count += 1
                    print(sum_count)
                    if count == 1001:
                        return sum_count

    def parse_html(self):
        """Re-parse the locally cached pages 1-1000 and print the total sum.

        BUG FIX: the original looped to page 1001 (one past the last file
        get_html() writes) and, on a file with no numbers, `continue`d
        without advancing the counter — an infinite loop. Iterating the
        exact saved range fixes both.
        """
        sum_count = 0
        for count in range(1, 1001):
            file_name = f'glidedsky_{count}.html'
            with open(file_name, 'r', encoding='utf8') as f:
                content = f.read()
            res_html = etree.HTML(content)
            nums = res_html.xpath('//div[@class="col-md-1"]/text()')
            if nums:
                for num in nums:
                    sum_count += int(num.strip())
                print("次数综合", count, sum_count)
            else:
                # Empty cache file: the proxy returned a page without data.
                print("没有内容", file_name)
        print("总和", sum_count)


if __name__ == '__main__':
    # Step 1 (once): fetch and cache all pages through the proxy.
    # test().abu_test()
    # Step 2: sum the numbers from the cached files.
    test().parse_html()

爬虫闯关:GlidedSky基础题_第4张图片

结果:
file

本篇文章由一文多发平台 ArtiPub自动发布

你可能感兴趣的:(python)