Python 多线程爬取美图录网站图集,并按模特姓名存储到本地(二)

1. 开启多进程解析页面,配合多线程下载图片,爬取速度更快。代码如下:

import requests,os,re,time
from bs4 import BeautifulSoup
from multiprocessing.pool import ThreadPool
from functools import partial
from multiprocessing import Pool
def save_picture(img_url, referer, path):
    """Download one image and store it inside ``path``.

    The file name is the last ``/``-separated segment of ``img_url``.
    Nothing is written unless the server answers with HTTP 200, so a failed
    download no longer leaves an empty or bogus file behind.

    Args:
        img_url: URL of the image to download.
        referer: Value sent as the ``Referer`` request header.
        path: Directory the image file is written into; it must already exist.
    """
    headers = {
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        'Accept-Encoding': "gzip, deflate, br",
        'Accept-Language': "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
        'Connection': "keep-alive",
        'Cookie': 'UM_distinctid=169241b82e3d7-055c03980912f78-4c312f7f-e1000-169241b82e4370',
        'Referer': referer,
    }
    file_name = os.path.join(path, img_url.split('/')[-1])

    # Fetch first and open the file afterwards: opening with 'wb' truncates
    # the target, so doing it before the request would leave an empty file
    # whenever the request raises.
    response = requests.get(img_url, headers=headers)
    if response.status_code == 200:
        with open(file_name, 'wb') as f:
            f.write(response.content)

    # Pause after every request (presumably to throttle the crawl); the file
    # handle is already closed at this point.
    time.sleep(1)


def parse_current_page(page):
    r"""Download every image of a gallery, following its pagination links.

    Each page is fetched, the ``<img>`` tags inside ``<div class="content">``
    are downloaded with a 4-thread pool into ``E:\picture\<name>``, and the
    last link inside ``<div id="pages">`` is followed until it points at the
    page currently being shown.

    Args:
        page: ``(url, name)`` pair: URL of the first gallery page and the
            name of the sub-directory the images are stored in.

    Returns:
        None. Processing stops silently at the first page that does not
        answer with HTTP 200.
    """
    url = page[0]
    name = page[1]
    dir_path = 'E:\\picture\\' + name

    # Iterate instead of recursing so long galleries do not grow the stack.
    while True:
        response = requests.get(url)
        # NOTE(review): this only influences ``response.text``; the parser
        # below is fed the raw bytes of ``response.content``.
        response.encoding = 'utf-8'
        if response.status_code != 200:
            return

        soup = BeautifulSoup(response.content, 'lxml')
        img_items = soup.find('div', class_='content').find_all('img')
        img_srcs = [item['src'] for item in img_items]

        # makedirs(exist_ok=True) also creates a missing parent directory and
        # cannot fail when another worker creates the same folder between an
        # existence check and the mkdir call.
        os.makedirs(dir_path, exist_ok=True)

        pool = ThreadPool(4)
        try:
            pool.map(partial(save_picture, referer=url, path=dir_path), img_srcs)
        finally:
            # Release the worker threads even when a download raised.
            pool.close()
            pool.join()

        pager = soup.find('div', id="pages")
        next_href = pager.find_all('a')[-1]['href']
        current_page_num = pager.find('span').text
        numbers = re.findall(r'\d+', next_href)

        # The second number in the href is compared with the current page
        # number. Stop when they match, or when the href has no second number
        # to compare (previously an IndexError).
        if len(numbers) < 2 or numbers[1] == current_page_num:
            return
        url = "https://www.meitulu.com" + next_href


def get_all_page(index_url):
    """Collect the gallery entries listed on the index page.

    Args:
        index_url: URL of the page whose ``<ul class="img">`` list is parsed.

    Returns:
        A list of ``(href, text)`` tuples, one per ``<li>``: the ``href`` of
        the item's first link and the text of its second ``<p>`` (presumably
        the model name, since callers use it as the folder name). An empty
        list is returned when the page does not answer with HTTP 200, so the
        result can always be iterated.
    """
    response = requests.get(index_url)
    if response.status_code != 200:
        # Previously fell through and returned None, which made the caller's
        # pool.map() raise TypeError.
        return []

    soup = BeautifulSoup(response.content, 'lxml')
    items = soup.find('ul', class_='img').find_all('li')
    return [(item.find('a')['href'], item.find_all('p')[1].text) for item in items]


def main():
    """Crawl the galleries listed on the site index with 5 worker processes.

    Each entry returned by ``get_all_page`` is handed to
    ``parse_current_page`` in a separate process.
    """
    index_url = 'https://www.meitulu.com/'

    # Fetch the index before spawning workers so a failed or empty index
    # does not start (and then leak) a process pool.
    galleries = get_all_page(index_url)
    if not galleries:
        return

    pool = Pool(5)
    try:
        pool.map(parse_current_page, galleries)
    finally:
        # Always release the worker processes, even when a task raised.
        pool.close()
        pool.join()


if __name__ == '__main__':
    main()

你可能感兴趣的:(爬虫)