python多线程爬取美图录网站图集按模特姓名存储到本地(一)

1.在workspace文件夹下新建spider.py文件

2.导入需要的库包,代码如下:

import requests,os,re
from bs4 import BeautifulSoup
from multiprocessing.pool import ThreadPool
from functools import partial

3.编写主函数,代码如下:

def main():
    """Entry-point placeholder; the crawl logic is wired up in later steps."""
    # NOTE: the original mixed a tab (line 2) with spaces+tab (line 3),
    # which raises TabError in Python 3 -- normalized to 4-space indents.
    index_url = 'https://www.meitulu.com/'
    pass


if __name__ == '__main__':
    main()

4.编写获取首页图集链接函数,代码如下:

def get_all_page(index_url):
    """Fetch the site index page and return the album links found on it.

    index_url -- URL of the site's front page.
    Returns a list of album href strings; returns [] when the request
    does not come back with HTTP 200, so callers can always iterate
    the result (the original returned None on failure).
    """
    response = requests.get(index_url)
    if response.status_code != 200:
        return []
    soup = BeautifulSoup(response.content, 'lxml')
    # Each <li> under <ul class="img"> is one album entry; its first
    # <a> tag carries the album URL.
    items = soup.find('ul', class_='img').find_all('li')
    return [item.find('a')['href'] for item in items]

5.编写解析图集链接的函数,代码如下:

def parse_current_page(url, name):
    """Download all images on one album page, then recurse to the next page.

    url  -- album page URL.
    name -- model name; used as the sub-directory under E:\\pictures.
    """
    response = requests.get(url)
    response.encoding = 'utf-8'
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'lxml')  # HTML parser
        img_items = soup.find('div', class_='content').find_all('img')
        img_srcs = [item['src'] for item in img_items]  # all image URLs on this page
        dir_path = 'E:\\pictures\\' + name  # per-model album directory
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)  # makedirs also creates missing parent dirs
        pool = ThreadPool(4)  # 4 download threads
        # partial() binds the extra keyword arguments so pool.map can hand
        # each image URL to save_picture as its single positional argument.
        pool.map(partial(save_picture, referer=url, path=dir_path), img_srcs)
        pool.close()
        pool.join()
        pages = soup.find('div', id="pages")
        next_page = pages.find_all('a')[-1]['href']       # last <a> is the "next page" link
        current_page_num = pages.find('span').text        # highlighted page number at the bottom
        next_page_num = re.findall(r'\d+', next_page)[1]  # page number embedded in the link
        if next_page_num != current_page_num:             # not yet on the last page
            next_page = "https://www.meitulu.com" + next_page  # make the URL absolute
            # BUG FIX: the original recursed with parse_current_page(next_page)
            # only, raising TypeError on every multi-page album.
            return parse_current_page(next_page, name)
        return  # last page reached -- stop

关于 pool.map(partial(save_picture,referer=url,path=dir_path),img_srcs)中的partial函数解释如下(网上百度的):
map(参数1, 参数2) 这个函数接收两个参数:第一个参数是函数名(特别注意不带函数后面的括号),第二个参数是一个可迭代对象,如以下代码:

In[18]: def fun(t):
    		return t
		[i for i in map(fun,['a','b','c'])]
Out[18]: ['a', 'b', 'c']

map函数在python2返回的是列表,python3返回的是迭代器,节约内存。
一般情况下,第二个参数传入一个可迭代对象就够用了。由于这里还需要传递一个存储路径,所以需要 partial 来向函数传递多个参数。

6.编写保存图片函数,代码如下:

def save_picture(img_url, referer, path):
    """Download a single image and save it into directory `path`.

    img_url -- direct URL of the image file.
    referer -- album page URL; the server rejects hot-linked requests
               without a matching Referer header (anti-scraping check).
    path    -- existing target directory for the file.
    """
    # FIX: original had the `headers=headers = {...}` double-assignment typo.
    headers = {
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        'Accept-Encoding': "gzip, deflate, br",
        'Accept-Language': "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
        'Connection': "keep-alive",
        'Cookie': 'UM_distinctid=169241b82e3d7-055c03980912f78-4c312f7f-e1000-169241b82e4370',
        'Referer': referer,
    }
    # Fetch first, open the file only on success: the original opened the
    # file before the request, leaving empty files behind on failure.
    response = requests.get(img_url, headers=headers)
    file_name = path + os.sep + img_url.split('/')[-1]
    # `with` closes the file automatically; the explicit f.close() was redundant.
    with open(file_name, 'wb') as f:
        f.write(response.content)

这里涉及到基本的反爬虫知识:如果在访问图片时不加请求头,下载到的图片就只有 1KB 左右,原因是服务器那边设置了反爬虫。请求头里面有个重要的参数是 referer,表示该请求来源于哪个页面,这里我传入的是图集的链接。

7.完整代码如下:

import requests,os,re,time
from bs4 import BeautifulSoup
from multiprocessing.pool import ThreadPool
from functools import partial


def save_picture(img_url, referer, path):
    """Download a single image and save it into directory `path`.

    img_url -- direct URL of the image file.
    referer -- album page URL; the server rejects hot-linked requests
               without a matching Referer header (anti-scraping check).
    path    -- existing target directory for the file.
    """
    # FIX: original had the `headers=headers = {...}` double-assignment typo.
    headers = {
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        'Accept-Encoding': "gzip, deflate, br",
        'Accept-Language': "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
        'Connection': "keep-alive",
        'Cookie': 'UM_distinctid=169241b82e3d7-055c03980912f78-4c312f7f-e1000-169241b82e4370',
        'Referer': referer,
    }
    # Fetch first, open the file only on success: the original opened the
    # file before the request, leaving empty files behind on failure.
    response = requests.get(img_url, headers=headers)
    file_name = path + os.sep + img_url.split('/')[-1]
    # `with` closes the file automatically; the explicit f.close() was redundant.
    with open(file_name, 'wb') as f:
        f.write(response.content)
    time.sleep(1)  # throttle requests to be polite to the server
        
        
def parse_current_page(url, name):
    """Download all images on one album page, then recurse to the next page.

    url  -- album page URL.
    name -- model name; used as the sub-directory under E:\\pictures.
    """
    response = requests.get(url)
    response.encoding = 'utf-8'
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'lxml')
        img_items = soup.find('div', class_='content').find_all('img')
        img_srcs = [item['src'] for item in img_items]  # all image URLs on this page
        dir_path = 'E:\\pictures\\' + name  # per-model album directory
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)  # makedirs also creates missing parent dirs
        pool = ThreadPool(4)  # 4 download threads
        # partial() binds the extra keyword arguments so pool.map can hand
        # each image URL to save_picture as its single positional argument.
        pool.map(partial(save_picture, referer=url, path=dir_path), img_srcs)
        pool.close()
        pool.join()
        pages = soup.find('div', id="pages")
        next_page = pages.find_all('a')[-1]['href']       # last <a> is the "next page" link
        current_page_num = pages.find('span').text        # highlighted page number at the bottom
        next_page_num = re.findall(r'\d+', next_page)[1]  # page number embedded in the link
        if next_page_num != current_page_num:             # not yet on the last page
            next_page = "https://www.meitulu.com" + next_page  # make the URL absolute
            # BUG FIX: the original recursed with parse_current_page(next_page)
            # only, raising TypeError on every multi-page album.
            return parse_current_page(next_page, name)
        return  # last page reached -- stop


def get_all_page(index_url):
    """Fetch the site index page and return (album_url, model_name) pairs.

    index_url -- URL of the site's front page.
    Returns a list of (href, name) tuples; returns [] when the request
    does not come back with HTTP 200, so main()'s for-loop never
    receives None (the original returned None on failure, which would
    crash the caller with 'NoneType is not iterable').
    """
    response = requests.get(index_url)
    if response.status_code != 200:
        return []
    soup = BeautifulSoup(response.content, 'lxml')
    items = soup.find('ul', class_='img').find_all('li')
    # The second <p> in each <li> holds the model name; pair it with the
    # album link so the caller can create a per-model directory.
    return [(item.find('a')['href'], item.find_all('p')[1].text) for item in items]
    
    
def main():
    """Crawl every album linked from the front page, one model at a time."""
    index_url = 'https://www.meitulu.com/'
    # Each entry is an (album_url, model_name) pair from get_all_page.
    for album_url, model_name in get_all_page(index_url):
        parse_current_page(album_url, model_name)


if __name__ == '__main__':
    main()

8.下面开始运行,结果如下图:

python多线程爬取美图录网站图集按模特姓名存储到本地(一)_第1张图片

你可能感兴趣的:(爬虫)