Python--网络爬虫单线程与多线程的比较

一直在尝试提高下载数据的速度,于是用上了Python里的多线程技术(当然,只是 multiprocessing.dummy 线程池的 map 方法这种简单用法),想不到速度快了一倍!

普通单线程网络爬虫图片下载(耗时:19.525s)

# -*- coding: utf-8 -*-
import requests
import re
import time
import sys
reload(sys)
sys.setdefaultencoding("utf-8")


# Browser-like User-Agent header sent with every listing-page request.
hea = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0'}

# NOTE(review): the original pattern literal was stripped when this post was
# extracted from HTML (only an empty, line-broken string remained). It was
# presumably an <img> pattern of this shape -- confirm against the live page
# markup before relying on it.
IMG_PATTERN = '<p><img src="(.*?)" /></p>'

si = 0  # running index used to name the saved files across all pages
time1 = time.time()
for i in range(2190, 2206):
    # Build the listing-page URL directly; the page number is the only
    # part that varies between iterations.
    url = 'http://jandan.net/ooxx/page-%d#comments' % i
    html = requests.get(url, headers=hea)
    html.encoding = 'utf-8'
    target = re.findall(IMG_PATTERN, html.text)
    for each in target:
        pic = requests.get(each)
        # 'wb' because the image payload is binary. The path is Windows-style
        # and open() does not create the 'test' directory, so it must exist.
        # The with-block closes the file even if the write fails.
        with open('test\\' + str(si) + '.jpg', 'wb') as fp:
            fp.write(pic.content)
        si += 1
time2 = time.time()
# Parenthesised single-argument print parses on both Python 2 and Python 3.
print(u'单线程耗时:' + str(time2 - time1))

多线程网络爬虫图片下载(耗时:9.612s)

多线程比单线程下载同样的图片快了一倍!

# -*- coding: utf-8 -*-
from multiprocessing.dummy import Pool as ThreadPool
import requests
import re
import time
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
hea = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0'}

# NOTE(review): the original pattern literal was stripped when this post was
# extracted from HTML (only an empty, line-broken string remained). It was
# presumably an <img> pattern of this shape -- confirm against the live page
# markup before relying on it.
IMG_PATTERN = '<p><img src="(.*?)" /></p>'


def download(url):
    """Download every image matched on one listing page into ``test\\``.

    Files are named ``<page>_<index>.jpg``, where ``<page>`` is the number
    taken from the ``page-N`` part of ``url``. The original restarted a
    plain counter at 0 on every call, so each page (and each worker thread)
    wrote ``0.jpg``, ``1.jpg``, ... and overwrote the files of the others.

    Args:
        url: Address of the listing page to scan for images.
    """
    page = re.search(r'page-(\d+)', url)
    # Fallback label for URLs without a page number; such URLs would still
    # share one prefix, so callers should pass page-N style addresses.
    prefix = page.group(1) if page else 'page'

    html = requests.get(url, headers=hea)
    html.encoding = 'utf-8'
    target = re.findall(IMG_PATTERN, html.text)
    for si, each in enumerate(target):
        pic = requests.get(each)
        # 'wb' because the image payload is binary. The path is Windows-style
        # and open() does not create the 'test' directory, so it must exist.
        # The with-block closes the file even if the write fails.
        with open('test\\' + prefix + '_' + str(si) + '.jpg', 'wb') as fp:
            fp.write(pic.content)


def addr_list(first=2190, last=2206):
    """Return the listing-page URLs for pages ``first`` .. ``last - 1``.

    The defaults reproduce the original hard-coded range (pages 2190-2205).

    Args:
        first: First page number to include.
        last: Page number at which to stop (exclusive, as with ``range``).

    Returns:
        A list of URL strings, one per page, in ascending page order.
    """
    return ['http://jandan.net/ooxx/page-%d#comments' % i
            for i in range(first, last)]


def multiDownload():
    """Download all pages through a 4-thread pool and print the elapsed time."""
    time1 = time.time()
    urls = addr_list()
    pool = ThreadPool(4)
    # map() blocks until every page has been processed; download() returns
    # nothing useful, so the result list is not kept.
    pool.map(download, urls)
    pool.close()
    pool.join()
    time2 = time.time()
    # Parenthesised single-argument print parses on both Python 2 and Python 3.
    print(u'多线程耗时:' + str(time2 - time1))


# Run the benchmark only when executed as a script, not on import.
if __name__ == '__main__':
    multiDownload()

你可能感兴趣的:(Python)