爬虫性能

Catalog

  • 异步非阻塞
  • 多线程+异步非阻塞
  • 多进程
  • 测试结果

异步非阻塞

from gevent import monkey
monkey.patch_all()
import requests, gevent
# URLs to visit
def get_urls():
    """Return the search.jd.com result-page URLs to crawl.

    The ``page={}`` placeholder is filled with every odd number from 1
    through 199, so the list holds 100 URLs. The ``keyword`` query value is
    the percent-encoded search term.
    """
    template = 'https://search.jd.com/Search?keyword=%E7%88%AC%E8%99%AB&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E7%88%AC%E8%99%AB&page={}&click=0'
    pages = range(1, 200, 2)
    return list(map(template.format, pages))
# Fetch one page
def request(url, timeout=10):
    """GET ``url`` with a browser User-Agent and print its status code and final URL.

    Args:
        url: Address to fetch.
        timeout: Seconds forwarded to ``requests.get`` as its ``timeout``
            argument. Defaults to 10.

    Raises:
        requests.RequestException: Propagated unchanged from ``requests.get``
            (for example when the timeout expires).
    """
    ua = 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'
    header = {'User-Agent': ua}
    # requests applies no timeout by default; a stalled server would
    # otherwise block this call, and the whole timing run, indefinitely.
    response = requests.get(url, headers=header, timeout=timeout)
    print(response.status_code, response.url)
# Asynchronous, non-blocking crawl
def grequest():
    """Fetch every URL from ``get_urls`` through a gevent ``Pool(16)``.

    One greenlet running ``request`` is spawned per URL, and the function
    returns once ``gevent.joinall`` has waited on all of them.
    """
    from gevent.pool import Pool
    workers = Pool(16)
    targets = get_urls()
    greenlets = []
    for target in targets:
        greenlets.append(workers.spawn(request, target))
    gevent.joinall(greenlets)
# Timing test
if __name__ == '__main__':
    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring elapsed time; time() can jump if the system clock is adjusted.
    from time import perf_counter
    start = perf_counter()
    grequest()
    # Elapsed seconds for the whole crawl.
    print(perf_counter() - start)

多线程+异步非阻塞

from gevent import monkey
monkey.patch_all()
import requests, gevent
# URLs to visit
def get_urls():
    """Build the list of search.jd.com URLs, one per odd page number 1..199."""
    template = 'https://search.jd.com/Search?keyword=%E7%88%AC%E8%99%AB&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E7%88%AC%E8%99%AB&page={}&click=0'
    collected = []
    page = 1
    # Step by two so only odd page numbers are requested.
    while page < 200:
        collected.append(template.format(page))
        page += 2
    return collected
# Fetch one page
def request(url, timeout=10):
    """GET ``url`` with a browser User-Agent and print its status code and final URL.

    Args:
        url: Address to fetch.
        timeout: Seconds forwarded to ``requests.get`` as its ``timeout``
            argument. Defaults to 10.

    Raises:
        requests.RequestException: Propagated unchanged from ``requests.get``
            (for example when the timeout expires).
    """
    ua = 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'
    header = {'User-Agent': ua}
    # requests applies no timeout by default; a stalled server would
    # otherwise block this call, and the whole timing run, indefinitely.
    response = requests.get(url, headers=header, timeout=timeout)
    print(response.status_code, response.url)
# Asynchronous, non-blocking crawl of one chunk
def grequest(urls):
    """Fetch each URL in ``urls`` through a gevent ``Pool(4)`` and wait for all of them.

    Args:
        urls: Iterable of addresses; each one is handed to ``request`` in
            its own greenlet.
    """
    from gevent.pool import Pool
    workers = Pool(4)
    greenlets = []
    for target in urls:
        greenlets.append(workers.spawn(request, target))
    gevent.joinall(greenlets)
# Multi-threaded driver
def concurrent(n=4):
    """Split the URL list into at most ``n`` chunks and crawl each via ``grequest`` on a thread pool.

    Args:
        n: Number of worker threads, and the target number of chunks.

    Raises:
        ValueError: Raised by ``ThreadPoolExecutor`` when ``n`` is not positive.
    """
    # NOTE(review): this script calls gevent's monkey.patch_all() at import
    # time, which presumably patches threading as well - confirm whether these
    # workers are real OS threads.
    from concurrent.futures import ThreadPoolExecutor
    # The with-block shuts the pool down and waits for submitted tasks even
    # if building or submitting the chunks raises.
    with ThreadPoolExecutor(n) as pool:
        url_ls = get_urls()
        length = len(url_ls)
        # Ceiling division gives evenly sized chunks, so every worker gets
        # one when n divides the length; max() keeps the step valid for an
        # empty list.
        step = max(-(-length // n), 1)
        futures = [
            pool.submit(grequest, url_ls[i: i + step])
            for i in range(0, length, step)
        ]
    # All futures are finished here; report failures instead of letting the
    # discarded futures hide them.
    for future in futures:
        error = future.exception()
        if error is not None:
            print('task failed:', repr(error))
# Timing test
if __name__ == '__main__':
    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring elapsed time; time() can jump if the system clock is adjusted.
    from time import perf_counter
    start = perf_counter()
    concurrent()
    # Elapsed seconds for the whole crawl.
    print(perf_counter() - start)

多进程

import requests
# URLs to visit
def get_urls():
    """List the search.jd.com URLs to crawl: 100 entries, pages 1, 3, ..., 199."""
    # The URL is split around the page number; joining the two parts around
    # a number yields exactly the same text as filling a single placeholder.
    prefix = 'https://search.jd.com/Search?keyword=%E7%88%AC%E8%99%AB&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E7%88%AC%E8%99%AB&page='
    suffix = '&click=0'
    return [prefix + str(page) + suffix for page in range(1, 200, 2)]
# Fetch a chunk of pages
def request(urls, timeout=10):
    """GET each URL in ``urls`` in order, printing its status code and final URL.

    A URL whose request fails is reported and skipped, so the remaining URLs
    in the chunk are still fetched.

    Args:
        urls: Iterable of addresses to fetch one after another.
        timeout: Seconds forwarded to ``requests.get`` as its ``timeout``
            argument. Defaults to 10.
    """
    ua = 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'
    header = {'User-Agent': ua}
    for url in urls:
        try:
            # requests applies no timeout by default; a stalled server would
            # otherwise block this worker indefinitely.
            response = requests.get(url, headers=header, timeout=timeout)
        except requests.RequestException as error:
            # One bad URL must not abandon the rest of the chunk.
            print('request failed:', url, repr(error))
            continue
        print(response.status_code, response.url)
# Multi-process driver
def concurrent(n=16):
    """Split the URL list into at most ``n`` chunks and crawl each via ``request`` on a process pool.

    Args:
        n: Number of worker processes, and the target number of chunks.

    Raises:
        ValueError: Raised by ``ProcessPoolExecutor`` when ``n`` is not positive.
    """
    from concurrent.futures import ProcessPoolExecutor
    # The with-block shuts the pool down and waits for submitted tasks even
    # if building or submitting the chunks raises.
    with ProcessPoolExecutor(n) as pool:
        url_ls = get_urls()
        length = len(url_ls)
        # Ceiling division gives evenly sized chunks, so every worker gets
        # one when n divides the length; max() keeps the step valid for an
        # empty list.
        step = max(-(-length // n), 1)
        futures = [
            pool.submit(request, url_ls[i: i + step])
            for i in range(0, length, step)
        ]
    # All futures are finished here; report failures instead of letting the
    # discarded futures hide them.
    for future in futures:
        error = future.exception()
        if error is not None:
            print('task failed:', repr(error))
# Timing test
if __name__ == '__main__':
    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring elapsed time; time() can jump if the system clock is adjusted.
    from time import perf_counter
    start = perf_counter()
    concurrent()
    # Elapsed seconds for the whole crawl.
    print(perf_counter() - start)

测试结果

| 方法 | 时间(秒) |
| --- | --- |
| 异步 | 3.4 |
| 多线程+异步 | 3.5 |
| 多进程 | 4.8 |

你可能感兴趣的:(爬虫)