import time
from multiprocessing.pool import ThreadPool
import threading
import json
import re
import requests
from lxml import etree


def get_introduction(name):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
        "Host": "baike.baidu.com",
        "Connection": "keep-alive",
    }
    url = 'https://baike.baidu.com/item/' + name
    response = requests.get(url=url, headers=headers)
    html = etree.HTML(response.text)
    # Summary paragraph
    introduction = html.xpath('.//div[@class="lemma-summary"]//text()')
    introduction = ''.join(introduction).replace('\n', '').replace(' ', '')
    # Strip citation markers such as [1] or [12]
    for _int in re.compile(r"\[\d{1,2}\]").findall(introduction):
        introduction = introduction.replace(_int, '')
    # All field values
    info_all_data = []
    dd_list = html.xpath('.//dd[@class="basicInfo-item value"]')
    for dd in dd_list:
        info = ''.join(dd.xpath('.//text()'))
        info = info.replace('\n', '、')
        info = "".join(info.split())
        for _int in re.compile(r"\[\d{1,2}\]").findall(info):
            info = info.replace(_int, '')
        info = info.strip('、')
        info_all_data.append(info)
    # All field names
    title_all_data = []
    dt_list = html.xpath('.//dt[@class="basicInfo-item name"]')
    for dt in dt_list:
        title = ''.join(dt.xpath('.//text()'))
        title = "".join(title.split())
        title_all_data.append(title)
    # Pair every field name with its value
    dic = {}
    for i, tit in enumerate(title_all_data):
        dic[tit] = info_all_data[i]
    dic['简介'] = introduction  # '简介' = "summary"
    print(json.dumps(dic, indent=4, ensure_ascii=False))
    # return dic
if __name__ == '__main__':
    name_list = ['李白', '杜甫', '曹操', '杜牧', '刘备', '苏轼', '王安石', '李商隐', '李清照', '岳飞', '晏殊', '欧阳修',
                 '白居易', '孟浩然', '辛弃疾', '杨万里', '王维', '王勃', '范仲淹', '古称', '鲁迅',
                 '韩愈', '司马迁', '刘禹锡', '陶渊明', '屈原', '刘长卿', '文天祥', '柳宗元']
    thread_list = []
    start_time = time.time()
    # one thread per name
    for name in name_list:
        thread = threading.Thread(target=get_introduction, args=(name,))
        thread_list.append(thread)
    for thread in thread_list:
        thread.start()
    for thread in thread_list:
        thread.join()
    print("elapsed time: {} s".format(time.time() - start_time))
elapsed time: 1.8476762771606445 s
Replace the main block above with the following:
if __name__ == '__main__':
    pool = ThreadPool(8)  # a pool of eight worker threads
    name_list = ['李白', '杜甫', '曹操', '杜牧', '刘备', '苏轼', '王安石', '李商隐', '李清照', '岳飞', '晏殊', '欧阳修',
                 '白居易', '孟浩然', '辛弃疾', '杨万里', '王维', '王勃', '范仲淹', '古称', '鲁迅',
                 '韩愈', '司马迁', '刘禹锡', '陶渊明', '屈原', '刘长卿', '文天祥', '柳宗元']
    start_time = time.time()
    pool.map(get_introduction, name_list)  # distribute the jobs across the pool
    pool.close()
    pool.join()
    print("elapsed time: {} s".format(time.time() - start_time))
elapsed time: 1.915708303451538 s
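For comparison, the standard library's concurrent.futures module offers an equivalent pool interface. The following is a minimal sketch, assuming the same get_introduction and name_list as above; note that if get_introduction returned dic instead of printing it, executor.map would also hand the results back in input order:

from concurrent.futures import ThreadPoolExecutor

if __name__ == '__main__':
    start_time = time.time()
    # the with-block closes and joins the pool automatically on exit
    with ThreadPoolExecutor(max_workers=8) as executor:
        # map() yields return values in input order; list() drains it
        results = list(executor.map(get_introduction, name_list))
    print("elapsed time: {} s".format(time.time() - start_time))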
event_loop: the event loop, essentially an infinite loop. Functions can be registered on it, and when their trigger conditions are met the loop executes them.
coroutine: a coroutine object, which can be registered on the event loop to be scheduled by it. A method defined with the async keyword is not executed when called; instead, the call returns a coroutine object.
task: a task, a further wrapper around a coroutine object that also tracks the task's state.
future: represents a task that will run or has not yet run; in practice it is not essentially different from a task.
async: defines a coroutine.
await: suspends execution at a blocking call.
Running multiple coroutines: put the tasks into a list and hand the whole list to the event loop at once, as in the example below.
import time
import asyncio


async def request(url):
    print('downloading', url)
    # time.sleep(2) would block the whole event loop; use the awaitable version
    await asyncio.sleep(2)
    print('finished downloading', url)


if __name__ == '__main__':
    start_time = time.time()
    urls = [
        'www.baidu.com',
        'www.sogou.com',
        'www.douban.com'
    ]
    stasks = []
    for url in urls:
        c = request(url)  # calling the async function returns a coroutine object
        task = asyncio.ensure_future(c)
        stasks.append(task)
    loop = asyncio.get_event_loop()
    # the task list must be wrapped in asyncio.wait()
    loop.run_until_complete(asyncio.wait(stasks))
    print(time.time() - start_time)
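Note that get_event_loop()/ensure_future() is the older, pre-Python-3.7 style; asyncio.run() together with asyncio.gather() is the modern equivalent. A minimal sketch of the same example in that style:

import time
import asyncio


async def request(url):
    print('downloading', url)
    await asyncio.sleep(2)
    print('finished downloading', url)


async def main():
    urls = ['www.baidu.com', 'www.sogou.com', 'www.douban.com']
    # gather() schedules all the coroutines concurrently and waits for them
    await asyncio.gather(*(request(url) for url in urls))


if __name__ == '__main__':
    start_time = time.time()
    asyncio.run(main())  # creates and closes the event loop for us
    print(time.time() - start_time)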
Crawling the 4K landscape wallpapers page by page:
import requests
from lxml import etree
import time
import os
import aiohttp
import asyncio
from random import choice

# Spoof the User-Agent
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
}
# Proxy pool
ip_list = [
    'http://196.52.58.239:80',
    'http://165.225.112.77:10605',
    'http://165.225.76.70:10605',
    'http://122.226.57.70:8888',
    'http://123.57.84.116:8118',
    'http://202.109.157.64:9000',
    'http://139.155.41.15:8118',
    'http://39.106.223.134:80',
    'http://47.115.63.52:8888'
]
# Create the picture/fengjing folder (makedirs also creates missing parents)
if not os.path.exists('./picture/fengjing'):
    os.makedirs('./picture/fengjing')
base_pic_path = './picture/fengjing/'


# Fetch one picture and write it to disk
async def get_picture(dic):
    url = dic['url']
    pic_path = base_pic_path + dic['name']
    async with aiohttp.ClientSession() as session:
        # get()/post() accept headers, params/data, and proxy='http://ip:port'
        proxy = choice(ip_list)  # pick a proxy at random
        async with session.get(url=url, proxy=proxy, headers=headers) as response:
            # text() returns the response body as a string,
            # read() returns it as bytes,
            # json() returns it as a JSON object;
            # each of them must be awaited before the data can be used
            pic_data = await response.read()
            await asyncio.sleep(1)
            with open(pic_path, 'wb') as fp:
                fp.write(pic_data)
            print(dic['name'], 'downloaded')
if __name__ == '__main__':
    start = time.time()  # record the start time
    url = 'http://pic.netbian.com/4kfengjing/index_%d.html'  # page URL template
    tasks = []  # one coroutine task per picture
    for page in range(2, 50):  # pages 2 through 49
        new_url = url % page  # URL of the current page
        page_text = requests.get(url=new_url, headers=headers).text  # fetch the listing page
        tree = etree.HTML(page_text)  # parse it so XPath can be applied
        li_list = tree.xpath('//div[@class="slist"]/ul/li')
        for li in li_list:
            # extract each picture's src and name
            img_src = 'http://pic.netbian.com' + li.xpath('./a/img/@src')[0]
            name = img_src.split('/')[-1]
            # synchronous version, kept for reference:
            # data = requests.get(url=img_src).content
            # path = './libs/' + name
            # with open(path, 'wb') as fp:
            #     fp.write(data)
            # print(name, 'downloaded')
            dic = {
                'name': name,
                'url': img_src
            }
            c = get_picture(dic)  # returns a coroutine object
            task = asyncio.ensure_future(c)
            tasks.append(task)
    loop = asyncio.get_event_loop()
    loop.run_until_complete(asyncio.wait(tasks))
    print('total time:', time.time() - start)
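One caveat: the code above creates one task per picture, so hundreds of downloads start at once, which the target site and free proxies rarely tolerate. A minimal sketch, assuming the get_picture coroutine above, of capping concurrency with asyncio.Semaphore (the limit of 10 is an arbitrary illustration):

semaphore = asyncio.Semaphore(10)  # illustrative cap on simultaneous downloads


async def get_picture_limited(dic):
    # at most 10 get_picture() calls run concurrently; the rest wait here
    async with semaphore:
        await get_picture(dic)

In the main block, c = get_picture_limited(dic) would then replace c = get_picture(dic).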
Full source for this project is on GitHub.