Site: http://glidedsky.com
Challenge 1: compute the sum of all the numbers on the page
You see it as soon as you register and log in.
Open the target page and it is nothing but numbers. This first challenge really is simple; there isn't much to say about it.
Challenge 2: the same task, repeated across 1000 requests
This one works the same way: the quickest route is to lightly modify the code above. That is slow, though, so it is worth optimizing yourself. Adding threads or going straight to coroutines both work well (a threaded sketch follows the code below). I would expect coroutines to be a bit faster, but I did not benchmark it.
The timing shown came from the direct modification, with no threads or coroutines, so it takes a while. Very basic, but it comes down to how you optimize it.
It can surely be optimized further; I took the lazy route and wrote my version with coroutines.
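The code below imports a small local tools module whose headers_to_dict turns a header block copied straight out of the browser into a dict. That helper is not shown in the original post; here is a minimal sketch of what it might look like (my reconstruction from how it is called, not the author's actual code):

# tools.py -- hypothetical helper, reconstructed from its call sites below
def headers_to_dict(raw: str) -> dict:
    """Turn a copied "Name: value" header block into a dict."""
    headers = {}
    for line in raw.splitlines():
        line = line.strip()
        if not line or ':' not in line:
            continue
        name, _, value = line.partition(':')
        headers[name.strip()] = value.strip()
    return headers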
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2019/8/18 0:33
# @Author : zhao.jia
# @Site :
# @File : glide_test.py
# @Software: PyCharm
import requests
import tools  # local helper module (see the headers_to_dict sketch above)
from lxml import etree
import aiohttp
import asyncio
import datetime
import time
from requests.adapters import HTTPAdapter


class TestGlidedsky:
    def __init__(self):
        # Raw request headers copied from the browser's dev tools
        self.headers = """
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
Accept-Encoding: gzip, deflate
Accept-Language: zh-CN,zh;q=0.9
Cache-Control: max-age=0
Connection: keep-alive
Cookie: _ga=GA1.2.1425271689.1566058842; _gid=GA1.2.586445152.1566058842; Hm_lvt_020fbaad6104bcddd1db12d6b78812f6=1566058842,1566106841; Hm_lpvt_020fbaad6104bcddd1db12d6b78812f6=1566129989; _gat_gtag_UA_75859356_3=1; XSRF-TOKEN=eyJpdiI6IjM4SmpWMlwvaWxPQklreFVaMDFXVFhRPT0iLCJ2YWx1ZSI6IjdoMUFJaVF6YUVvUUNDZU1TaERsN0FVK0dRdTdORW9QUlwvNDlMXC9uXC9IdjdCZ2JCQVhiMXNEV2JKQnI5UXVIMHAiLCJtYWMiOiIyMWMyYzc1MzM3MWQyZTMxNDQwZjA5ZTUxNDZkOThmNTAyOWQwYTQzZDQyZTc4M2Q4YjNlZTI3YjYzZjgwNzA1In0%3D; glidedsky_session=eyJpdiI6Ik1rRUMrXC8yMlVkOEZlSEZja24zdmJRPT0iLCJ2YWx1ZSI6IjRoWG84K1MrM3NLbnlRVytrUVRHd1ZqWWtkdkdyeUtwOTBKdDFWTnl4THdkS1hcL2dmRzA1c1JJRDZSaHk2NlhKIiwibWFjIjoiNmQ2MmJhNWFlNzZiOWEwY2NiMDM1ZTBkZGE2MmNiNGQwNWU4OGJmOTU2OWQxNmU2NmM1MjE1ZmI0NGQ3MjllNyJ9
Host: glidedsky.com
Referer: http://glidedsky.com/login
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36
"""
self.sess = requests.session()
self.sess.headers = tools.headers_to_dict(self.headers)
self.sum_count_2 = 0
self.sess.mount('http://', HTTPAdapter(max_retries=3))
self.sess.mount('https://', HTTPAdapter(max_retries=3))
self.sess.verify = False
def basic_one(self):
sum_count = 0
res = self.sess.get(url="http://glidedsky.com/level/web/crawler-basic-1")
res_html = etree.HTML(res.text)
nums = res_html.xpath('//div[@class="col-md-1"]/text()')
for num in nums:
sum_count += int(num.strip())
print("sum=" + sum_count)
    # Challenge 2: naive sequential version, one request per page
    def basic_two(self):
        count = 1
        sum_count = 0
        while True:
            res = self.sess.get(f"http://glidedsky.com/level/web/crawler-basic-2?page={count}")
            res_html = etree.HTML(res.text)
            nums = res_html.xpath('//div[@class="col-md-1"]/text()')
            for num in nums:
                sum_count += int(num.strip())
            count += 1
            if count == 1001:
                break
        print(sum_count)
    # Challenge 2: coroutine version, one aiohttp request per page
    async def basic_two_2(self, url):
        async with aiohttp.ClientSession() as session:
            async with session.get(url, headers=tools.headers_to_dict(self.headers)) as resp:
                res = await resp.text()
        res_html = etree.HTML(res)
        nums = res_html.xpath('//div[@class="col-md-1"]/text()')
        for num in nums:
            self.sum_count_2 += int(num.strip())

    def sum_async_count(self):
        # Fire the 1000 requests in two batches so the number of
        # simultaneous connections stays manageable
        loop = asyncio.get_event_loop()
        tasks = [asyncio.ensure_future(
            self.basic_two_2(f"http://glidedsky.com/level/web/crawler-basic-2?page={i}"))
            for i in range(1, 500)]
        loop.run_until_complete(asyncio.gather(*tasks))
        tasks = [asyncio.ensure_future(
            self.basic_two_2(f"http://glidedsky.com/level/web/crawler-basic-2?page={i}"))
            for i in range(500, 1001)]
        loop.run_until_complete(asyncio.gather(*tasks))
        print(self.sum_count_2)
if __name__ == '__main__':
    # Challenge 2, sequential version with timing:
    # starttime = datetime.datetime.now()
    # TestGlidedsky().basic_two()
    # endtime = datetime.datetime.now()
    # count_time_1 = (endtime - starttime).seconds
    # print(count_time_1)
    # Challenge 2, coroutine version with timing:
    starttime_2 = datetime.datetime.now()
    TestGlidedsky().sum_async_count()
    endtime_2 = datetime.datetime.now()
    count_time_2 = (endtime_2 - starttime_2).seconds
    print(count_time_2)
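For comparison, the threaded option mentioned above could look roughly like this minimal sketch using concurrent.futures (my own illustration, not code from the original; a real run would also need the logged-in headers shown earlier, and the page structure is assumed to be the same):

from concurrent.futures import ThreadPoolExecutor

import requests
from lxml import etree

def page_sum(page: int) -> int:
    # NOTE: in a real run, pass the logged-in header dict built above
    res = requests.get(f"http://glidedsky.com/level/web/crawler-basic-2?page={page}")
    nums = etree.HTML(res.text).xpath('//div[@class="col-md-1"]/text()')
    return sum(int(n.strip()) for n in nums)

with ThreadPoolExecutor(max_workers=20) as pool:
    print(sum(pool.map(page_sum, range(1, 1001))))

On the coroutine side, an asyncio.Semaphore would be a tidier way to cap concurrency than the two fixed batches used above.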
Challenge 3: still summing numbers, but this time IPs get banned and each IP may only make one request. This one is a bit nasty: you have no choice but to go through proxy IPs. Free ones are fine; just be prepared to retry a lot. The version below uses Abuyun's paid dynamic tunnel, authenticating to the proxy with a base64-encoded Basic credential in the Proxy-Authorization header, and saves every successful page to disk so the sum can be recomputed offline.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2019/8/27 11:00
# @Author : Andrew
# @Site :
# @File : python-abu.py
# @Software: PyCharm
import base64
import time

import requests
import tools
from lxml import etree
from requests.adapters import HTTPAdapter


class test:
    def __init__(self):
        self.sess = requests.session()
        # Retry each request up to 3 times on connection errors
        self.sess.mount('http://', HTTPAdapter(max_retries=3))
        self.sess.mount('https://', HTTPAdapter(max_retries=3))
        self.sess.verify = False
    def abu_test(self):
        # Abuyun dynamic proxy tunnel (credentials masked)
        proxyHost = "http-dyn.abuyun.com"
        proxyPort = "9020"
        proxyUser = "H2T*****22WD"
        proxyPass = "7****10526D3F"
        proxy_dict = {'http': f"{proxyHost}:{proxyPort}"}
        # The tunnel authenticates via HTTP Basic auth sent to the proxy
        auth = base64.b64encode(f"{proxyUser}:{proxyPass}".encode('utf8'))
        proxy_header = {"Proxy-Authorization": 'Basic ' + auth.decode()}
        self.get_html(proxy_dict, proxy_header)
    def get_html(self, proxy_dict, proxy_header):
        count = 1
        sum_count = 0
        # Raw request headers copied from the browser's dev tools
        headers = """
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
Accept-Encoding: gzip, deflate
Accept-Language: zh-CN,zh;q=0.9
Cache-Control: max-age=0
Cookie: _ga=GA1.2.1251062763.1566609395; Hm_lvt_020fbaad6104bcddd1db12d6b78812f6=1566609396,1566627265; _gid=GA1.2.1809641921.1566875827; _gat_gtag_UA_75859356_3=1; XSRF-TOKEN=eyJpdiI6IkNpMHk0SHlDSXIrWHU4MTBIaW96blE9PSIsInZhbHVlIjoiMXpzXC9GRmZGekxQYW5wcUt0ZU0xQ0l0MWVnNHdKWHo5XC9JNTRnZ0c0UWJlYjZlaDVhU1BNRGxENGNoWjBpdkE0IiwibWFjIjoiYTVjYmJjMzY3OTNiNTJjMDE5MjZhNmEzNDIwNGFmZDYwYzk5Yjg5ZjViYmExMzQwMjVkMTkzNDcyMmJjZmYxMyJ9; glidedsky_session=eyJpdiI6ImJ4aHA3QllGZE9PTlRnbTByZnNNOFE9PSIsInZhbHVlIjoiMGt6bUdqbDBcL2JSRERXbVFyMEdHNDArZmtOTHdQOFRidVlRUTFvMXRWajAzNUlja3gyN3JmV1U1QkVHUHBVU3UiLCJtYWMiOiI0OTY1ZGZmZDgwMTU4YTliNjM0NWVhZTU5MzRhNGQwYmMwM2YzNDc2ZGRkZjVmZDg0ZjQwMGUwODkyNjUwMmY3In0%3D; Hm_lpvt_020fbaad6104bcddd1db12d6b78812f6=1566875832
Host: glidedsky.com
Proxy-Connection: keep-alive
Referer: http://glidedsky.com/login
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36
"""
import tools
headers = tools.headers_to_dict(headers)
headers.update(proxy_header)
# print(headers)
        while True:
            # if count == 37 or count == 38:
            #     continue
            try:
                res = self.sess.get(
                    f"http://glidedsky.com/level/web/crawler-ip-block-1?page={count}",
                    headers=headers, proxies=proxy_dict, timeout=10)
            except Exception as e:
                # Proxy failed; retry the same page with a fresh tunnel IP
                print("request failed:", e)
                continue
            file_name = f'glidedsky_{count}.html'
            if res.status_code == 200:
                # Save every successful response so it can be re-parsed offline
                with open(file_name, 'w', encoding='utf8') as f:
                    f.write(res.text)
                res_html = etree.HTML(res.text)
                nums = res_html.xpath('//div[@class="col-md-1"]/text()')
                if nums:
                    print("found numbers on page", count)
                    for num in nums:
                        sum_count += int(num.strip())
                count += 1
                print(sum_count)
                if count == 1001:
                    return sum_count
            # time.sleep(3)
    # Re-parse the saved HTML files and recompute the sum offline
    def parse_html(self):
        sum_count = 0
        for count in range(1, 1001):
            file_name = f'glidedsky_{count}.html'
            with open(file_name, 'r', encoding='utf8') as f:
                content = f.read()
            res_html = etree.HTML(content)
            nums = res_html.xpath('//div[@class="col-md-1"]/text()')
            if nums:
                for num in nums:
                    sum_count += int(num.strip())
                print("page / running total:", count, sum_count)
            else:
                # This saved page had no numbers (a banned or empty response)
                print("no numbers in", file_name)
        print("total:", sum_count)
if __name__ == '__main__':
    # test().abu_test()
    test().parse_html()
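The code above goes through Abuyun's paid dynamic tunnel. If you take the free-proxy route suggested earlier, the core is the same retry loop, roughly like this sketch (the proxy list is a hypothetical placeholder, not anything from the original; free proxies die constantly, which is exactly why generous retries matter):

import requests

# Hypothetical placeholder list; in practice you would scrape and
# refresh these from a free proxy site, and most of them will be dead
FREE_PROXIES = ["1.2.3.4:8080", "5.6.7.8:3128"]

def fetch_with_free_proxies(url, headers, max_tries=50):
    for i in range(max_tries):
        proxy = FREE_PROXIES[i % len(FREE_PROXIES)]
        try:
            res = requests.get(url, headers=headers,
                               proxies={'http': proxy}, timeout=5)
            if res.status_code == 200:
                return res.text
        except requests.RequestException:
            continue  # dead proxy, move on to the next one
    return None

Dropping this into get_html in place of the tunnel call would mean fetching each page with fetch_with_free_proxies and only advancing count once it returns HTML that actually contains numbers.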