# 使用多线程爬取数据可以显著提高效率
#coding:utf8
import os
import threading
import requests
import urllib
from bs4 import BeautifulSoup
# Listing-page URL prefix; the page number is appended to it.
base_page_url = 'https://www.doutula.com/photo/list/?page='

# Listing pages are numbered FIRST_PAGE..LAST_PAGE, both inclusive.
FIRST_PAGE = 1
LAST_PAGE = 869

# Listing-page URLs still waiting to be scraped (workers take them with pop()).
page_url_list = [
    base_page_url + str(page_no)
    for page_no in range(FIRST_PAGE, LAST_PAGE + 1)
]

# Image URLs discovered so far that have not been downloaded yet.
face_url_list = []

# Global lock guarding access to the two lists above.
glock = threading.Lock()
def procuder():
    """Producer worker: scrape listing pages and queue the image URLs found.

    Repeatedly takes one URL from ``page_url_list``, downloads and parses the
    page, and appends the ``data-original`` URL of every matching ``<img>``
    tag to ``face_url_list``.  Returns once ``page_url_list`` is empty.

    A page that cannot be fetched is reported on stdout and skipped, so a
    single network error does not take the whole worker down.
    """
    # Seconds to wait for the server; without a timeout a stalled connection
    # would block this thread forever.
    request_timeout = 30

    while True:
        # Hold the lock only while touching the shared list.  ``with``
        # guarantees the release even if something inside raises.
        with glock:
            if not page_url_list:
                break
            page_url = page_url_list.pop()

        try:
            response = requests.get(page_url, timeout=request_timeout)
        except requests.RequestException as exc:
            print('failed to fetch %s: %s' % (page_url, exc))
            continue

        soup = BeautifulSoup(response.content, 'lxml')
        img_list = soup.find_all(
            'img', attrs={'class': 'img-responsive lazy image_dta'})

        # Build the batch *outside* the lock: an unexpected tag must never be
        # able to raise while the lock is held, which would block every other
        # worker permanently.
        found_urls = []
        for img in img_list:
            url = img.get('data-original')
            if not url:
                # Tag carries no lazy-load source; nothing to download.
                continue
            if not url.startswith('http'):
                url = 'http:' + url
            found_urls.append(url)

        with glock:
            face_url_list.extend(found_urls)
# Set by main() once every producer thread has finished, i.e. once nothing can
# be appended to face_url_list any more.  Consumers use it to tell "queue is
# empty for now" apart from "queue is empty for good".
_producers_done = threading.Event()


def customer():
    """Consumer worker: download every image URL queued in ``face_url_list``.

    Each URL is saved under ``images/`` using the last path component of the
    URL as the file name; the directory is created if it does not exist.
    A failed download is reported on stdout and skipped.

    The worker returns when the queue is empty and ``_producers_done`` is set.
    If nothing ever sets that event (for example when this function is used
    without ``main``), it keeps polling, pausing briefly between checks
    instead of spinning on the lock.
    """
    # urlretrieve lives in urllib.request on Python 3 and in urllib on Python 2.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        urlretrieve = urllib.urlretrieve

    image_dir = 'images'
    if not os.path.isdir(image_dir):
        try:
            os.makedirs(image_dir)
        except OSError:
            # Another consumer may have created it between the check and the
            # call; only propagate when the directory is really missing.
            if not os.path.isdir(image_dir):
                raise

    while True:
        # Sample the flag *before* looking at the queue: if it was already
        # set, no producer can append afterwards, so an empty queue then
        # really means there is no work left.
        finished = _producers_done.is_set()

        with glock:
            face_url = face_url_list.pop() if face_url_list else None

        if face_url is None:
            if finished:
                break
            # Nothing to do yet: wait a moment (or until the producers are
            # done) rather than re-taking the lock in a tight loop.
            _producers_done.wait(0.1)
            continue

        filename = face_url.split('/').pop()
        path = os.path.join(image_dir, filename)
        try:
            urlretrieve(face_url, filename=path)
        except (IOError, OSError) as exc:
            # One bad image must not kill the worker.
            print('failed to download %s: %s' % (face_url, exc))


def main():
    """Run the crawl with 4 producer and 5 consumer threads and wait for it.

    Returns after every listing page has been processed and every queued
    image has been attempted.
    """
    _producers_done.clear()

    # 4 producer threads scrape the listing pages for image URLs.
    producers = [threading.Thread(target=procuder) for _ in range(4)]
    # 5 consumer threads download the images.
    consumers = [threading.Thread(target=customer) for _ in range(5)]

    for th in producers + consumers:
        th.start()

    # Once all producers are done the queue can only shrink; tell the
    # consumers so they can stop when it runs dry.
    for th in producers:
        th.join()
    _producers_done.set()

    for th in consumers:
        th.join()
# Start the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()