Python crawler: using multithreading and coroutines

Table of Contents

  • Preface
  • 1. The baseline: a plain request-and-parse flow
  • 2. Multithreading
  • 3. Coroutines
  • 4. Combining multithreading and coroutines
  • Summary


Preface

When a site's anti-crawling measures are not too aggressive, you can raise crawling throughput with multithreading, coroutines, and similar techniques. Plenty of articles cover the theory better than I could, so what follows is simply how I use these tools in practice; think of it as the set of templates I reach for when writing this kind of code.


1. The baseline: a plain request-and-parse flow

import requests
from bs4 import BeautifulSoup
import time
from loguru import logger

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
}

def f1(url):
    res_list = []
    logger.debug('Requesting %s' % url)
    response = requests.get(url=url, headers=headers)
    response.encoding = 'gbk'
    page_text = response.text
    soup = BeautifulSoup(page_text, 'lxml')
    dls = soup.find_all("dl")
    for dl in dls:
        brandId = dl.get("id")
        brandName = dl.find("dt").text
        cxzs = dl.find_all(class_="rank-list-ul")
        for cxz in cxzs:
            zm = cxz.find_previous().text  # group heading, kept for reference
            cxs = cxz.find_all("li")
            for cx in cxs:
                try:
                    cxId = cx.get("id").replace("s", "")
                    cxName = cx.find("a").text
                    res_list.append(dict(brandId=brandId, brandName=brandName, cxId=cxId, cxName=cxName))
                except AttributeError:
                    # skip <li> elements without an id or an <a> tag
                    pass
    logger.info('Finished %s' % url)
    return dict(url=url, res_list=res_list, code='success')

if __name__ == '__main__':
    a = time.time()
    urls = []
    for i in range(ord("A"), ord("Z") + 1):
        U = chr(i)
        urls.append("https://www.autohome.com.cn/grade/carhtml/%s.html" % U)
    for url in urls:
        f1(url)
    logger.error(time.time() - a)  # total elapsed time

Feel free to run it: the example requests Autohome's brand pages for A through Z (the URL template is in the code above). The multithreaded and coroutine versions below all reuse this same example.

2. Multithreading

This is the template I usually reach for:

import threading
from loguru import logger        # for the logging you add in the blanks
import pandas as pd              # handy for the storage step
from sqlalchemy import create_engine


class ApiDataThread(object):

    def __init__(self, shop_info_list):
        self.shop_info_list = shop_info_list
        self.thread_lock = threading.Lock()

    def get_info(self):
        task = None
        self.thread_lock.acquire()
        if self.shop_info_list:
            task = self.shop_info_list.pop()
            """
            pop a task off the shared list (under the lock)
            """
        self.thread_lock.release()
        if task is not None:
            """
            business logic: request, parse, ...
            """

            self.thread_lock.acquire()
            """
            store the result (under the lock)
            """
            self.thread_lock.release()

    def main(self, n):
        pool = []
        for i in range(n):
            t = threading.Thread(target=self.get_info)
            t.start()
            pool.append(t)
        for j in pool:
            j.join()

if __name__ == '__main__':
    task_list = [...]  # fill in your own task list
    obj = ApiDataThread(task_list)
    obj.main(len(task_list))

Below is the template applied to the earlier example, with one thread per URL:

import threading
from loguru import logger
import time
import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
}

class ApiDataThread(object):

    def __init__(self, shop_info_list):
        self.shop_info_list = shop_info_list
        self.thread_lock = threading.Lock()
        self.res_list = []

    def get_info(self):
        task = None
        self.thread_lock.acquire()
        if self.shop_info_list:
            task = self.shop_info_list.pop()
        self.thread_lock.release()
        if task is not None:
            logger.debug('Requesting %s' % task)
            response = requests.get(url=task, headers=headers)
            response.encoding = 'gbk'
            page_text = response.text
            soup = BeautifulSoup(page_text, 'lxml')
            dls = soup.find_all("dl")
            for dl in dls:
                brandId = dl.get("id")
                brandName = dl.find("dt").text
                cxzs = dl.find_all(class_="rank-list-ul")
                for cxz in cxzs:
                    zm = cxz.find_previous().text  # group heading, kept for reference
                    cxs = cxz.find_all("li")
                    for cx in cxs:
                        try:
                            cxId = cx.get("id").replace("s", "")
                            cxName = cx.find("a").text
                            self.thread_lock.acquire()
                            self.res_list.append(dict(brandId=brandId, brandName=brandName, cxId=cxId, cxName=cxName))
                            self.thread_lock.release()
                        except AttributeError:
                            # skip <li> elements without an id or an <a> tag
                            pass

    def main(self, n):
        pool = []
        for i in range(n):
            t = threading.Thread(target=self.get_info)
            t.start()
            pool.append(t)
        for j in pool:
            j.join()
        return self.res_list

if __name__ == '__main__':
    a = time.time()
    urls = []
    for i in range(ord("A"), ord("Z") + 1):
        U = chr(i)
        urls.append("https://www.autohome.com.cn/grade/carhtml/%s.html" % U)
    obj = ApiDataThread(urls)
    print(obj.main(len(urls)))
    logger.error(time.time() - a)  # total elapsed time

Run it; a fair bit faster, isn't it?
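
By the way, the standard library's concurrent.futures gives a much shorter equivalent of this pattern. Here is a minimal sketch of my own (not part of the original template) that maps the f1 function from section 1 over the same urls with a thread pool:

from concurrent.futures import ThreadPoolExecutor

# Sketch: thread-pool version of the A-Z crawl, assuming f1 and urls
# are defined as in section 1. max_workers caps concurrent requests.
with ThreadPoolExecutor(max_workers=10) as executor:
    results = list(executor.map(f1, urls))  # one f1(url) call per URL
print(len(results))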

3. Coroutines

This version needs aiohttp and asyncio. asyncio ships with the standard library; aiohttp is third-party, installable with pip install aiohttp.

import aiohttp
from loguru import logger
import asyncio
from bs4 import BeautifulSoup
import time

class AioHttps(object):

    def __init__(self, urls):
        self.urls = urls
        self.result = []

    async def fetch(self, session, url, headers=None, timeout=9):
        _headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
        }
        if headers:
            _headers = headers
        logger.debug('Requesting %s' % url)
        try:
            async with session.get(url, headers=_headers, timeout=timeout) as response:
                self.bs4_html(await response.read(), url)
        except Exception as e:
            self.result.append(dict(url=url, res_list='', exceptions=e))
        logger.info('Finished %s' % url)

    async def main(self):
        async with aiohttp.ClientSession() as client:
            tasks = []
            for url in self.urls:
                tasks.append(asyncio.create_task(self.fetch(client, url)))
            await asyncio.wait(tasks)

    def bs4_html(self, response, url):
        """
        Parse the HTML.
        :param response: raw page bytes
        :param url: the page URL, echoed into the result
        :return:
        """
        res_list = []
        soup = BeautifulSoup(response, 'lxml')
        dls = soup.find_all("dl")
        for dl in dls:
            brandId = dl.get("id")
            brandName = dl.find("dt").text
            cxzs = dl.find_all(class_="rank-list-ul")
            for cxz in cxzs:
                zm = cxz.find_previous().text  # group heading, kept for reference
                cxs = cxz.find_all("li")
                for cx in cxs:
                    try:
                        cxId = cx.get("id").replace("s", "")
                        cxName = cx.find("a").text
                        res_list.append(dict(brandId=brandId, brandName=brandName, cxId=cxId, cxName=cxName))
                    except AttributeError:
                        # skip <li> elements without an id or an <a> tag
                        pass
        self.result.append(dict(url=url, res_list=res_list, code='success'))

    def run(self):
        # on Python 3.7+, asyncio.run(self.main()) also works here
        asyncio.get_event_loop().run_until_complete(self.main())
        return self.result

if __name__ == '__main__':
    a = time.time()
    urls = []
    for i in range(ord("A"), ord("Z") + 1):
        U = chr(i)
        urls.append("https://www.autohome.com.cn/grade/carhtml/%s.html" % U)
    obj = AioHttps(urls)
    obj.run()
    logger.error(time.time() - a)  # total elapsed time

Running it, this feels even a bit faster than the multithreaded version; it works very well.
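
One caveat: the class above fires all 26 requests at once. Against a stricter site you may want to cap concurrency; a common pattern, shown here as a self-contained sketch of my own (not part of the original code), is an asyncio.Semaphore around each request:

import asyncio
import aiohttp

# Sketch: cap in-flight requests with an asyncio.Semaphore.
async def crawl_all(urls, limit=5):
    sem = asyncio.Semaphore(limit)  # created inside the running loop
    async with aiohttp.ClientSession() as session:

        async def fetch_limited(url):
            async with sem:  # at most `limit` requests in flight
                async with session.get(url, timeout=aiohttp.ClientTimeout(total=9)) as resp:
                    return await resp.read()

        return await asyncio.gather(*(fetch_limited(u) for u in urls))

Run it with results = asyncio.run(crawl_all(urls)) and feed each page body to a parser like bs4_html above.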

4. Combining multithreading and coroutines

import aiohttp
from loguru import logger
import asyncio
from bs4 import BeautifulSoup
import time
import threading

class AioHttps(object):
    """
    Coroutine worker: the same class as in section 3, except for run().
    """

    def __init__(self, urls):
        self.urls = urls
        self.result = []

    async def fetch(self, session, url, headers=None, timeout=9):
        _headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
        }
        if headers:
            _headers = headers
        logger.debug('Requesting %s' % url)
        try:
            async with session.get(url, headers=_headers, timeout=timeout) as response:
                self.bs4_html(await response.read(), url)
        except Exception as e:
            self.result.append(dict(url=url, res_list='', exceptions=e))
        logger.info('Finished %s' % url)

    async def main(self):
        async with aiohttp.ClientSession() as client:
            tasks = []
            for url in self.urls:
                tasks.append(asyncio.create_task(self.fetch(client, url)))
            await asyncio.wait(tasks)

    def bs4_html(self, response, url):
        """
        Parse the HTML.
        :param response: raw page bytes
        :param url: the page URL, echoed into the result
        :return:
        """
        res_list = []
        soup = BeautifulSoup(response, 'lxml')
        dls = soup.find_all("dl")
        for dl in dls:
            brandId = dl.get("id")
            brandName = dl.find("dt").text
            cxzs = dl.find_all(class_="rank-list-ul")
            for cxz in cxzs:
                zm = cxz.find_previous().text  # group heading, kept for reference
                cxs = cxz.find_all("li")
                for cx in cxs:
                    try:
                        cxId = cx.get("id").replace("s", "")
                        cxName = cx.find("a").text
                        res_list.append(dict(brandId=brandId, brandName=brandName, cxId=cxId, cxName=cxName))
                    except AttributeError:
                        # skip <li> elements without an id or an <a> tag
                        pass
        self.result.append(dict(url=url, res_list=res_list, code='success'))

    def run(self):
        # each worker thread needs its own event loop: get_event_loop()
        # raises in a non-main thread, hence new_event_loop()
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        loop.run_until_complete(self.main())
        return self.result

class ApiDataThread(object):
    """
    Thread driver: each thread pops a chunk of urls and crawls it
    with its own AioHttps event loop.
    """

    def __init__(self, shop_info_list):
        self.shop_info_list = shop_info_list
        self.thread_lock = threading.Lock()
        self.res_list = []

    def get_info(self):
        task = None
        self.thread_lock.acquire()
        if self.shop_info_list:
            task = self.shop_info_list.pop()
        self.thread_lock.release()
        if task is not None:
            obj = AioHttps(task)
            res = obj.run()        # crawl the chunk outside the lock
            self.thread_lock.acquire()
            self.res_list += res   # only the shared merge needs the lock
            self.thread_lock.release()

    def main(self, n):
        pool = []
        for i in range(n):
            t = threading.Thread(target=self.get_info)
            t.start()
            pool.append(t)
        for j in pool:
            j.join()
        return self.res_list

if __name__ == '__main__':
    a = time.time()
    urls = []
    for i in range(ord("A"), ord("Z") + 1):
        U = chr(i)
        urls.append("https://www.autohome.com.cn/grade/carhtml/%s.html" % U)
    lists = [urls[i:i + 10] for i in range(0, len(urls), 10)]  # chunks of 10 urls, one thread each
    obj = ApiDataThread(lists)
    print(obj.main(len(lists)))
    logger.error(time.time() - a)  # total elapsed time

Running it, this is also fast, but the gap from the previous versions is small; with so little data, the difference probably just doesn't show.
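
If the bottleneck ever shifts from I/O to parsing, another way to mix threads with coroutines is to keep a single event loop and push the CPU-bound BeautifulSoup work onto a thread pool via run_in_executor. A minimal sketch of my own (not the article's code; parse_html is assumed to be a plain blocking function like bs4_html above):

import asyncio

# Sketch: offload blocking parse work to a thread pool from a coroutine.
async def fetch_and_parse(session, url, parse_html):
    async with session.get(url) as response:
        body = await response.read()
    loop = asyncio.get_running_loop()
    # None means "use the loop's default ThreadPoolExecutor"
    return await loop.run_in_executor(None, parse_html, body)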


Summary

The above are the boilerplate templates I use all the time. If you have other approaches or suggestions, I'm always happy to discuss!
