# Python 爬虫:多进程 + 异步 示例代码

from multiprocessing import Pool
import requests, asyncio, aiohttp
import os
import urllib.request
from time import sleep

class Cartoon():
    """Crawler that downloads every comic listed in the site's home feed.

    The crawler works relative to the process's current directory: the
    constructor enters a ``漫画`` folder, ``crawl_catalog`` creates one
    sub-folder per comic, and the images of a comic are written into the
    folder that is current while that comic is being processed.
    """

    # Class-level defaults are kept for backward compatibility; __init__
    # gives every instance its own copies so state is not shared.
    path = ''
    image_list = []   # image URLs collected for the comic being crawled
    key = 0           # number of image URLs collected for the current comic
    curtitle = 0      # title of the comic being crawled (used in log lines)

    # How many times a failed network operation is retried before giving up.
    _MAX_RETRIES = 10
    # Upper bound on concurrently running image downloads.
    _MAX_CONCURRENT_DOWNLOADS = 150

    def str_dict(self):
        """Parse the raw header text below into a ``{name: value}`` dict.

        Writing request headers as one pasted text block is less tedious
        than typing them out as a dict literal.

        Returns:
            dict: header names mapped to their whitespace-stripped values.
        """
        # NOTE(review): these headers (Host/Origin/Referer point at
        # www.canva.com and a session Cookie is hard-coded) do not match the
        # host that fetch_url is called with in this class -- confirm they
        # are intended, and avoid committing real session cookies.
        heads = '''
        Host: www.canva.com
        User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0
        Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8
        Referer: https://www.canva.com/_ajax/fonts/scripts/recommendations?find&locale=en&limit=57&includeHiddenFonts=false&includeNonLegacyFonts&includePremiumFonts&includePaidFonts=false&includeHanyiFonts&useExtendedStylesFontFamilies=false
        Content-Type: application/x-www-form-urlencoded
        Origin: https://www.canva.com
        Connection: keep-alive
        Cookie: __cfduid=dab37c60e818ac043e4327c7e7ac3fa8d1590496209; CDI=537a9753-9237-452a-b9d7-80f91a302029; CL=zh-CN; ajs_user_id=null; ajs_group_id=null; ajs_anonymous_id=%22f0d86891-67fa-4f1b-b6b2-62dce9ce18bc%22; _gcl_au=1.1.1437216432.1590496303; _uetsid=4da25ae7-2449-5476-8e72-c4160ccd9aef; ab.storage.sessionId.320f7332-8571-45d7-b342-c54192dae547=%7B%22g%22%3A%22dd4a173d-62b5-4c16-a568-d1ae75d8a63c%22%2C%22e%22%3A1590498103213%2C%22c%22%3A1590496303214%2C%22l%22%3A1590496303214%7D; ab.storage.deviceId.320f7332-8571-45d7-b342-c54192dae547=%7B%22g%22%3A%22a76bae95-4f5e-301d-7af6-07ca1b599682%22%2C%22c%22%3A1590496303216%2C%22l%22%3A1590496303216%7D; cf_clearance=7a24b07ec0521a856146297a742690efbe495789-1590543393-0-250; CPA=-mZkhSgMVbsfE7_i2n2WGqvJsOinHvOZyTkXW0i_ZNd3xjgMj5mqpRZiKpeGqhbClLbYnsh2pJMSW4MbCvnas6a-25GUgyzX0lnluPGM5S9szocQdDMnkAj4Uu3lx9RLjJ7gYw; CCK=MA5NCilRPuKd2cxieUZN1w
        Upgrade-Insecure-Requests: 1
        '''
        headers = {}
        for line in heads.split('\n'):
            line = line.strip()
            if line:
                # Split on the first colon only: values such as URLs contain
                # further colons.
                head_key, head_value = line.split(':', 1)
                headers[head_key] = head_value.strip()
        return headers

    def __init__(self):
        """Create per-instance crawl state and enter the ``漫画`` folder."""
        self.image_list = []
        self.key = 0
        self.curtitle = 0
        root = '漫画'
        # mkdir() only changes directory when it created the folder; on a
        # re-run the folder already exists and must still be entered,
        # otherwise comics would be written into the launch directory.
        if not self.mkdir(root) and os.path.isdir(root):
            os.chdir(root)

    def mkdir(self, name):
        """Create directory ``name`` in the current directory and enter it.

        Args:
            name: directory name, relative to the current directory.

        Returns:
            bool: True if the directory was created (the current directory
            is then changed to it); False if it already existed (the current
            directory is left unchanged).
        """
        try:
            # Creating first and handling the failure is atomic, unlike a
            # listdir() check followed by mkdir(): when several processes
            # race for the same name exactly one of them gets True.
            os.mkdir(name)
        except FileExistsError:
            return False
        os.chdir(os.path.join(os.path.abspath('.'), name))
        return True

    def fetch_url(self, url, times = 0):
        """GET ``url`` with the headers from ``str_dict``, retrying failures.

        Both raised exceptions and non-200 responses count as failures.

        Args:
            url: address to request.
            times: number of retries already used.

        Returns:
            The ``requests`` response with status 200, or None once the
            retry budget is exhausted.
        """
        while True:
            try:
                response = requests.get(url, headers=self.str_dict())
                if response.status_code == 200:
                    return response
                print('fetch ' + url + ' not success')
            except Exception as ex:
                print('fetch ' + url + ' error: ', ex)
            if times >= self._MAX_RETRIES:
                return None
            times += 1

    def auto_down(self, url, filename, times = 0):
        """Download ``url`` to ``filename`` with urllib, retrying on errors.

        Args:
            url: address of the file to download.
            filename: destination path.
            times: number of retries already used.

        Returns:
            None. Gives up silently (after logging) once the retry budget is
            exhausted.
        """
        while True:
            print(filename)
            try:
                urllib.request.urlretrieve(url, filename)
                return
            except Exception as ex:
                print('download img error: ', url, '     ', ex)
            if times >= self._MAX_RETRIES:
                return
            times += 1

    def left_pad_zero(self, ori, target_len):
        """Left-pad the string ``ori`` with ``'0'`` up to ``target_len``.

        Strings that are already at least ``target_len`` long are returned
        unchanged.
        """
        return ori.rjust(target_len, '0')

    async def get_image(self, url, semaphore, times = 0):
        """Fetch the body of ``url`` with aiohttp, retrying on errors.

        Args:
            url: address of the image.
            semaphore: ``asyncio.Semaphore`` limiting concurrent requests; it
                is held only while a request is in flight.
            times: number of retries already used.

        Returns:
            bytes: the response body, or None once the retry budget is
            exhausted.
        """
        while True:
            try:
                async with semaphore:
                    async with aiohttp.ClientSession() as session:
                        # The context manager releases the response even if
                        # reading the body fails.
                        async with session.get(url) as response:
                            # Treat HTTP error statuses as failures so an
                            # error page is retried instead of being saved
                            # as an image.
                            response.raise_for_status()
                            return await response.read()
            except Exception as ex:
                print(self.curtitle, '    ', times, '    ', url, '  download ERROR: ', ex)
            if times >= self._MAX_RETRIES:
                return None
            times += 1

    async def download_image(self, image, semaphore):
        """Download one image and save it in the current directory.

        Args:
            image: ``(url, index)`` pair; ``index`` is a string that becomes
                the zero-padded (5 digits) file name with a ``.jpg`` suffix.
            semaphore: concurrency limiter passed on to ``get_image``.
        """
        html = await self.get_image(image[0], semaphore)
        if html is not None:
            with open(self.left_pad_zero(image[1], 5) + '.jpg','wb') as f:
                f.write(html)
            print(self.curtitle, '    download: ', image[1] + '    ' + image[0])

    async def _download_all(self, image_urls):
        """Download ``image_urls`` concurrently, numbering files from 1."""
        # Created inside the coroutine so it belongs to the running loop.
        semaphore = asyncio.Semaphore(self._MAX_CONCURRENT_DOWNLOADS)
        jobs = [
            self.download_image((img_url, str(index)), semaphore)
            for index, img_url in enumerate(image_urls, 1)
        ]
        # return_exceptions keeps one failing image (e.g. a write error)
        # from aborting the remaining downloads.
        outcomes = await asyncio.gather(*jobs, return_exceptions=True)
        for outcome in outcomes:
            if isinstance(outcome, Exception):
                print(self.curtitle, '  download ERROR: ', outcome)

    def _crawl_comic(self, comic_id):
        """Collect every chapter's image URLs for one comic and download them."""
        response = self.fetch_url(
            "https://comiccdnhw.jsmlny.top/hcomic/qryComicInfoByComicId?channelNo=H5_MH_0000&comicId=" + str(comic_id))
        if response is None:
            print(comic_id, ' fetch comic info fail')
            return
        chapter_info = response.json()['data']['comicBaseInfo']
        self.curtitle = chapter_info['title']
        # Start from a clean slate so images of previously crawled comics
        # are not downloaded again into this comic's folder.
        self.image_list = []
        self.key = 0
        for chapter in chapter_info['comicChapterList']:
            self.crawl_chapter(str(chapter['chapterId']))
        if not self.image_list:
            return
        asyncio.run(self._download_all(list(self.image_list)))

    def crawl_catalog(self):
        """Crawl the home feed and download every comic not yet on disk.

        A comic is skipped when a folder with its title already exists in
        the directory that is current when this method is called; otherwise
        the folder is created, entered, and filled with the comic's images.
        """
        rootpath = os.path.abspath('.')

        response = self.fetch_url(
            "https://comiccdnhw.jsmlny.top/hcomic/home?channelNo=H5_MH_0000")
        if response is None:
            print('fetch catalog fail')
            return
        comic_list = response.json()['data']['columnList']
        for column in comic_list:
            for comic in column['comicList']:
                # Each comic folder is created relative to the root, so go
                # back there after the previous comic changed directory.
                os.chdir(rootpath)
                title = comic['title']
                if ':' in title:
                    # Keep only the part after the last colon.
                    title = title.split(':')[-1]
                if title in os.listdir('.'):
                    print(title, ' is exists')
                    continue
                if not self.mkdir(title):
                    print(title, ' mkdir fail')
                    continue
                self._crawl_comic(comic['comicId'])

    def crawl_chapter(self, chapter_id):
        """Append the image URLs of one chapter to ``self.image_list``.

        Failures are logged and the chapter is skipped, so one bad chapter
        does not abort the whole comic.

        Args:
            chapter_id: chapter identifier as a string.
        """
        try:
            response = self.fetch_url("https://comiccdnhw.jsmlny.top/hcomic/chaptercontent?chapterId=" + chapter_id)
            if response is None:
                print(self.curtitle, '  ERROR: ', 'fetch chapter ' + chapter_id + ' fail')
                return
            data = response.json()
            for img in data['data']['chapterContentList']:
                print(self.curtitle, '     fetchurl: ', str(self.key + 1) + '    ' + img['content'])
                self.image_list.append(img['content'])
                self.key += 1
        except Exception as ex:
            # Deliberately broad: malformed JSON or an unexpected payload
            # shape should only cost this chapter.
            print(self.curtitle, '  ERROR: ', ex)

if __name__ == "__main__":
    # Run crawl_catalog in several worker processes at once. Every worker
    # walks the same catalog; a comic whose folder already exists is skipped
    # by crawl_catalog, so the workers end up handling different comics.
    print('start')
    crawl = Cartoon()
    worker_count = 4
    p = Pool(worker_count)
    pending = []
    for i in range(1, worker_count + 1):
        print(i)
        pending.append(p.apply_async(crawl.crawl_catalog))
    p.close()
    p.join()
    # apply_async stores a worker's exception in its result object; without
    # get() a crashed worker would go completely unnoticed.
    for i, result in enumerate(pending, 1):
        try:
            result.get()
        except Exception as ex:
            print('worker', i, 'error:', ex)
    print('end')

 

你可能感兴趣的:(#,爬虫)