Scraping with requests: asynchronous fetching, coroutines, and multithreading (Part 6)

  • 1. Multithreading (threading)
  • 2. Thread pools (multiprocessing)
  • 3. Coroutines
      • 3.1 Proxy pool and multi-coroutine paginated image scraping

1. Multithreading (threading)


import time
from multiprocessing.pool import ThreadPool
import threading
import json
import re
import requests
from lxml import etree


def get_introduction(name):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
        "Host": "baike.baidu.com",
        "Connection": "keep-alive",
    }
    url = r'https://baike.baidu.com/item/' + name
    response = requests.get(url=url, headers=headers)

    # Summary paragraph
    introduction = etree.HTML(response.text).xpath('.//div[@class="lemma-summary"]//text()')
    introduction = ''.join(introduction).replace('\n', '').replace(' ', '')
    # Strip citation markers such as [1] or [12]. (The original pattern
    # r"\[\d{1}\]|\[\d{1, 2}\]" never matched two-digit markers, because
    # "{1, 2}" with a space is not a valid regex quantifier.)
    introduction_int = re.compile(r"\[\d{1,2}\]").findall(introduction)
    for _int in introduction_int:
        introduction = introduction.replace(_int, '')
    # print(introduction)

    # All field values
    info_all_data = []
    dd_list = etree.HTML(response.text).xpath('.//dd[@class="basicInfo-item value"]')
    for dd in dd_list:
        info = ''.join(dd.xpath('.//text()'))
        info = info.replace('\n', '、')
        info = "".join(info.split())
        info_int = re.compile(r"\[\d{1,2}\]").findall(info)
        for _int in info_int:
            info = info.replace(_int, '')
        info = info.strip('、')
        info_all_data.append(info)

    # All field names
    title_all_data = []
    dt_list = etree.HTML(response.text).xpath('.//dt[@class="basicInfo-item name"]')
    for dt in dt_list:
        title = ''.join(dt.xpath('.//text()'))
        title = "".join(title.split())
        title_all_data.append(title)

    # Pair each field name with its value (zip avoids an IndexError
    # if the two lists ever differ in length)
    dic = {}
    for tit, info in zip(title_all_data, info_all_data):
        dic[tit] = info
    dic['简介'] = introduction
    print(json.dumps(dic, indent=4, ensure_ascii=False))
    # return dic

if __name__ == '__main__':
    name_list = ['李白', '杜甫', '曹操', '杜牧', '刘备', '苏轼', '王安石', '李商隐', '李清照', '岳飞', '晏殊', '欧阳修',
                 '白居易', '孟浩然', '辛弃疾', '杨万里', '王维', '王勃', '范仲淹', '古称', '鲁迅',
                 '韩愈', '司马迁', '刘禹锡', '陶渊明', '屈原', '刘长卿', '文天祥', '柳宗元']
    thread_list = []
    start_time = time.time()
    for name in name_list:
        thread = threading.Thread(target=get_introduction, args=(name, ))
        thread_list.append(thread)
    for thread in thread_list:
        thread.start()
    for thread in thread_list:
        thread.join()
    print("last time: {} s".format(time.time() - start_time))

last time: 1.8476762771606445 s
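
Because every thread calls print concurrently, lines from different pages can interleave in the console. One way to serialize the output is to guard the print with a threading.Lock; the sketch below is illustrative, and the helper name safe_print is not part of the original script:

import threading

print_lock = threading.Lock()  # shared by all worker threads

def safe_print(*args, **kwargs):
    # Only one thread at a time may enter this block,
    # so each JSON dump is printed as an uninterrupted unit.
    with print_lock:
        print(*args, **kwargs)

Replacing the print(json.dumps(...)) call inside get_introduction with safe_print keeps each biography's JSON block intact.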

2. Thread pools (multiprocessing)

Change the main block above to the following:

if __name__ == '__main__':
    pool = ThreadPool(8)        # pool of 8 worker threads
    name_list = ['李白', '杜甫', '曹操', '杜牧', '刘备', '苏轼', '王安石', '李商隐', '李清照', '岳飞', '晏殊', '欧阳修',
                 '白居易', '孟浩然', '辛弃疾', '杨万里', '王维', '王勃', '范仲淹', '古称', '鲁迅',
                 '韩愈', '司马迁', '刘禹锡', '陶渊明', '屈原', '刘长卿', '文天祥', '柳宗元']
    start_time = time.time()
    pool.map(get_introduction, name_list)       # distribute the names across the pool
    pool.close()
    pool.join()
    print("last time: {} s".format(time.time() - start_time))

last time: 1.915708303451538 s

  • Thread pools and process pools
    • Benefit: they reduce how often the system creates and destroys threads or processes, which noticeably lowers overhead.
    • Drawback: the number of threads or processes in the pool is capped, so work beyond that limit has to queue (a modern standard-library alternative is sketched below).
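
For comparison, the same pooled run can be written with concurrent.futures.ThreadPoolExecutor, the interface the standard library now recommends. A minimal sketch, reusing get_introduction and name_list from above:

from concurrent.futures import ThreadPoolExecutor
import time

if __name__ == '__main__':
    start_time = time.time()
    # max_workers plays the same role as ThreadPool(8)
    with ThreadPoolExecutor(max_workers=8) as executor:
        executor.map(get_introduction, name_list)
    # leaving the with-block waits for all workers, like close() + join()
    print("last time: {} s".format(time.time() - start_time))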

3. Coroutines

  • event_loop: the event loop, essentially an infinite loop. We register functions on it, and when their conditions are met the loop runs them.
  • coroutine: a coroutine object, which we register on the event loop so the loop can schedule it. A method defined with the async keyword is not executed when called; calling it returns a coroutine object instead.
  • task: a further wrapper around a coroutine object that also tracks the task's state.
  • future: represents a task that will run or has not yet run; in practice there is no essential difference from a task.
  • async: defines a coroutine.
  • await: suspends execution at a blocking call.
  • Multi-coroutine pattern: collect the tasks in a list and hand the whole list to the loop for a single event-loop run, as the example below does.

import time
import asyncio

async def request(url):
    print('downloading', url)
    # time.sleep(2) would block the whole event loop;
    # asyncio.sleep(2) yields control while "waiting"
    await asyncio.sleep(2)
    print('finished', url)

if __name__ == '__main__':
    start_time = time.time()
    urls = [
        'www.baidu.com',
        'www.sogou.com',
        'www.douban.com'
    ]
    stasks = []
    for url in urls:
        c = request(url)                  # calling an async def returns a coroutine object
        task = asyncio.ensure_future(c)   # wrap it in a task
        stasks.append(task)
    loop = asyncio.get_event_loop()
    # the task list must be wrapped in asyncio.wait()
    loop.run_until_complete(asyncio.wait(stasks))
    print(time.time() - start_time)
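
On Python 3.7 and later the same program is usually written with asyncio.run and asyncio.gather, which create and close the event loop for you. A minimal equivalent sketch:

import time
import asyncio

async def request(url):
    print('downloading', url)
    await asyncio.sleep(2)
    print('finished', url)

async def main():
    urls = ['www.baidu.com', 'www.sogou.com', 'www.douban.com']
    # gather schedules all three coroutines concurrently
    await asyncio.gather(*(request(url) for url in urls))

if __name__ == '__main__':
    start_time = time.time()
    asyncio.run(main())   # builds, runs, and closes the loop
    print(time.time() - start_time)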

3.1 Proxy pool and multi-coroutine paginated image scraping

Paginate through the 4K landscape image listings and download every image.

  • Reading the response body inside a coroutine (aiohttp) — a minimal demonstration follows this list:
    • text() returns the response body as a string
    • read() returns the response body as bytes
    • json() returns the body parsed as a JSON object
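A small sketch of the three accessors; httpbin.org is used only as a stand-in endpoint. Note that each accessor is a coroutine and must be awaited (aiohttp caches the body, so all three can be called on the same response):

import asyncio
import aiohttp

async def demo():
    async with aiohttp.ClientSession() as session:
        async with session.get('https://httpbin.org/get') as response:
            body_text = await response.text()    # str
            body_bytes = await response.read()   # bytes
            body_json = await response.json()    # parsed dict
            print(type(body_text), type(body_bytes), type(body_json))

asyncio.run(demo())

The full paginated downloader below uses read() to save the raw image bytes:
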
import requests
from lxml import etree
import time
import os
import aiohttp
import asyncio
from random import choice

# Spoof a browser User-Agent
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
}
# Proxy pool
ip_list = [
    'http://196.52.58.239:80',
    'http://165.225.112.77:10605',
    'http://165.225.76.70:10605',
    'http://122.226.57.70:8888',
    'http://123.57.84.116:8118',
    'http://202.109.157.64:9000',
    'http://139.155.41.15:8118',
    'http://39.106.223.134:80',
    'http://47.115.63.52:8888'
]
# Create the picture/fengjing directory (makedirs also creates the
# parent ./picture if it is missing, which os.mkdir would not)
if not os.path.exists('./picture/fengjing'):
    os.makedirs('./picture/fengjing')
base_pic_path = './picture/fengjing/'


# Download and save one image
async def get_picture(dic):
    url = dic['url']
    pic_path = base_pic_path + dic['name']
    # one session per coroutine
    async with aiohttp.ClientSession() as session:
        # get()/post() accept headers, params/data, proxy='http://ip:port'
        proxy = choice(ip_list)  # pick a proxy at random
        async with session.get(url=url, proxy=proxy, headers=headers) as response:
            # text() returns the body as a string
            # read() returns the body as bytes
            # json() returns the body parsed as JSON
            # note: the body accessors are coroutines and must be awaited

            # fetch the image bytes and write them to disk
            pic_data = await response.read()
            await asyncio.sleep(1)
            with open(pic_path, 'wb') as fp:
                fp.write(pic_data)
                print(dic['name'], 'downloaded')


if __name__ == '__main__':
    start = time.time()  # record the start time
    url = 'http://pic.netbian.com/4kfengjing/index_%d.html'  # listing-page URL template
    tasks = []  # holds the coroutine tasks
    for page in range(2, 50):  # pages 2 through 49
        new_url = url % page  # URL for this page
        page_text = requests.get(url=new_url, headers=headers).text  # fetch the listing page synchronously
        tree = etree.HTML(page_text)  # parse with lxml so we can use XPath
        li_list = tree.xpath('//div[@class="slist"]/ul/li')
        for li in li_list:
            # extract each image's src and name
            img_src = 'http://pic.netbian.com' + li.xpath('./a/img/@src')[0]
            name = img_src.split('/')[-1]
            # data = requests.get(url=img_src).content
            # path = './libs/'+name
            # with open(path,'wb') as fp:
            #     fp.write(data)
            #     print(name, 'downloaded')
            dic = {
                'name': name,
                'url': img_src
            }
            c = get_picture(dic)  # returns a coroutine object
            task = asyncio.ensure_future(c)
            tasks.append(task)  # collect the tasks
    loop = asyncio.get_event_loop()
    loop.run_until_complete(asyncio.wait(tasks))
    print('total time:', time.time() - start)
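
Note that the listing pages are still fetched one at a time with requests; only the image downloads run concurrently. If the page fetches become the bottleneck, they can be moved into coroutines too. A minimal sketch under stated assumptions: it reuses the headers dict from the script above, the concurrency cap of 10 is arbitrary, and the gbk decoding hint reflects how this site has historically served its pages:

import asyncio
import aiohttp
from lxml import etree

sem = asyncio.Semaphore(10)  # at most 10 page fetches in flight

async def get_page_links(session, page):
    page_url = 'http://pic.netbian.com/4kfengjing/index_%d.html' % page
    async with sem:
        async with session.get(page_url, headers=headers) as response:
            # the site has served gbk-encoded pages; ignore stray bytes
            page_text = await response.text(encoding='gbk', errors='ignore')
    tree = etree.HTML(page_text)
    return ['http://pic.netbian.com' + li.xpath('./a/img/@src')[0]
            for li in tree.xpath('//div[@class="slist"]/ul/li')]

async def main():
    async with aiohttp.ClientSession() as session:
        results = await asyncio.gather(*(get_page_links(session, p) for p in range(2, 50)))
    all_links = [src for page_links in results for src in page_links]
    print(len(all_links), 'image links collected')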

The full code for this project is on GitHub.
