Python Crawler Learning Diary 4: Parallel Downloading

                                                         冰冠 2018-06-19 10:38:56
Target site: https://www.alexa.com/topsites
1. Parsing the Alexa list
    The Alexa top sites list is provided as a spreadsheet with two columns: rank and domain.
    Extraction steps:
        (1) Download the .zip file
        (2) Extract the CSV file from the .zip archive
        (3) Parse the CSV file
        (4) Iterate over each row of the CSV file and extract the domain data
    The code is as follows:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" @author [email protected]
    @function:
    @create 18-6-19 10:51 AM"""

import os
import zipfile
from io import BytesIO

# The downloader module is a Python 3 port of the Python 2 downloader; it has been
# uploaded to PyPI and can be installed with: pip install downloader-py3
import downloader

url = 'http://s3.amazonaws.com/alexa-static/top-1m.csv.zip'

# remove any previous sqlite database used by the downloader
db_path = os.path.expanduser('~/.tmp_downloader.sqlite')
if os.path.exists(db_path):
    os.remove(db_path)
D = downloader.Downloader(db_path, [0.1, 0.2])
zipped_data = D.open_url(url, 10, parse_as_html=False)

urls = []
with zipfile.ZipFile(BytesIO(zipped_data)) as zf:
    # the archive contains a single CSV file with one "rank,domain" pair per line
    csv_filename = zf.namelist()[0]
    with zf.open(csv_filename, 'r') as cf:
        for row in cf:
            website = row.decode('utf-8').split(',')[1].strip()
            urls.append('https://' + website)
print(urls)


2. Modify the original crawler by adding an AlexaCallback class
import zipfile
from io import BytesIO


class AlexaCallback:
    def __init__(self, url, max_urls=1000):
        self.seed_url = url
        self.max_urls = max_urls

    def __call__(self, url, html):
        # only the seed URL returns the zipped CSV; other URLs yield no new links
        if url == self.seed_url:
            urls = []
            with zipfile.ZipFile(BytesIO(html)) as zf:
                csv_filename = zf.namelist()[0]
                with zf.open(csv_filename, 'r') as cf:
                    for row in cf:
                        website = row.decode('utf-8').split(',')[1].strip()
                        urls.append('https://' + website)
                        # stop once max_urls domains have been collected
                        if len(urls) == self.max_urls:
                            break
            return urls


3. Testing the sequential crawler
from day04_concurrent.alexa_callback import AlexaCallback
from day04_concurrent.link_crawler import link_crawler
from day04_concurrent.mongodb_cache_example import MongoCache

url = 'http://s3.amazonaws.com/alexa-static/top-1m.csv.zip'
if __name__ == '__main__':
    '''sequential crawler'''
    link_crawler(seed_url=url, cache=MongoCache(), scrape_callback=AlexaCallback(url=url))



4. Multi-threaded crawler
    4.1 Extend the crawler to download in parallel. To keep requests from arriving too fast and overloading the server or getting the IP address banned, add a delay to the crawler (a sketch of the throttle behind this delay follows the code below).
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" @author [email protected]
    @function:
    @create 18-6-23 10:25 AM"""
import time
import threading
import urllib.parse
from day04_concurrent.downloader import Downloader

SLEEP_TIME = 1



def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None, user_agent='wswp', proxies=None, num_retries=1, max_threads=10, timeout=60):
    """Crawl this website in multiple threads
    """
    # the queue of URLs that still need to be crawled
    crawl_queue = [seed_url]
    # the URLs that have been seen
    seen = {seed_url}
    D = Downloader(cache=cache, delay=delay, user_agent=user_agent, proxies=proxies, num_retries=num_retries, timeout=timeout)

    def process_queue():
        while True:
            try:
                url = crawl_queue.pop()
            except IndexError:
                # crawl queue is empty
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print('Error in callback for: {}: {}'.format(url, e))
                    else:
                        for link in links:
                            link = normalize(seed_url, link)
                            # check whether already crawled this link
                            if link not in seen:
                                seen.add(link)
                                # add this new link to queue
                                crawl_queue.append(link)


    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:
        # the crawl is still active
        for thread in threads:
            if not thread.is_alive():
                # remove the stopped threads
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue:
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            thread.daemon = True  # daemon thread so the main thread can exit when it receives Ctrl-C
            thread.start()
            threads.append(thread)
        # all threads have been processed
        # sleep temporarily so CPU can focus execution on other threads
        time.sleep(SLEEP_TIME)


def normalize(seed_url, link):
    """Normalize this URL by removing hash and adding domain
    """
    link, _ = urllib.parse.urldefrag(link) # remove hash to avoid duplicates
    return urllib.parse.urljoin(seed_url, link)
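
The delay itself is enforced inside Downloader (imported from day04_concurrent.downloader), whose source is not reproduced here. Below is a minimal sketch of how such a per-domain throttle can work, assuming the Downloader calls something like throttle.wait(url) before every request; the class and method names are illustrative, not necessarily the module's actual API.

import time
import urllib.parse
from datetime import datetime


class Throttle:
    """Pause between consecutive downloads to the same domain (illustrative sketch)."""

    def __init__(self, delay):
        self.delay = delay    # minimum number of seconds between requests to one domain
        self.domains = {}     # maps domain -> timestamp of the last request

    def wait(self, url):
        domain = urllib.parse.urlsplit(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.now() - last_accessed).total_seconds()
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.now()

Tracking the last-access time per domain lets the crawler download from different domains at full speed while still respecting the delay for repeated requests to the same host.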



    4.2 Multi-process crawler
        Move the crawl queue into MongoDB. Storing the queue separately means that crawlers running on different servers can cooperate on the same crawl task. For an even more robust queue, consider a dedicated message-passing tool such as Celery.
Add a custom queue class, MongoQueue:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" @author [email protected]
    @function:
    @create 18-6-23 10:41 AM"""
from datetime import datetime, timedelta

from pymongo import MongoClient, errors


class MongoQueue:
    """
    A crawl queue backed by MongoDB with three states: OUTSTANDING, PROCESSING, COMPLETE
      OUTSTANDING   a newly added URL
      PROCESSING    the URL has been popped from the queue and is being downloaded
      COMPLETE      the download has finished

    To avoid losing the result of a URL, a timeout parameter is used:
    :param timeout: if processing a URL takes longer than this value, processing is
                    assumed to have failed and the status is reset to OUTSTANDING

    Note: insert/update/find_and_modify are the pymongo 2.x/3.x API used here; newer
    pymongo versions replace them with insert_one/update_one/find_one_and_update.
    """
    OUTSTANDING, PROCESSING, COMPLETE = range(3)

    def __init__(self, client=None, timeout=300):
        self.client = MongoClient() if client is None else client
        self.db = self.client.cache
        self.timeout = timeout

    def __bool__(self):
        """True while any URL is still outstanding or being processed"""
        record = self.db.crawl_queue.find_one(
            {
                'status': {'$ne': self.COMPLETE}
            })
        return True if record else False

    def push(self, url):
        """Add a new URL to the queue if it is not already present"""
        try:
            self.db.crawl_queue.insert(
                {
                    '_id': url,
                    'status': self.OUTSTANDING
                })
        except errors.DuplicateKeyError:
            # this means the URL is already in the queue
            pass

    def pop(self):
        """Atomically claim an outstanding URL and mark it as PROCESSING"""
        record = self.db.crawl_queue.find_and_modify(
            query={'status': self.OUTSTANDING},
            update={
                '$set': {'status': self.PROCESSING,
                         'timestamp': datetime.now()}}
        )
        if record:
            return record['_id']
        else:
            self.repair()
            raise KeyError()

    def peek(self):
        record = self.db.crawl_queue.find_one({'status': self.OUTSTANDING})
        if record:
            return record['_id']

    def complete(self, url):
        self.db.crawl_queue.update(
            {'_id': url}, {'$set': {'status': self.COMPLETE}}
        )

    def repair(self):
        """Release URLs whose processing has timed out back to OUTSTANDING"""
        record = self.db.crawl_queue.find_and_modify(
            query={
                'timestamp': {'$lt': datetime.now() - timedelta(seconds=self.timeout)},
                'status': {'$ne': self.COMPLETE}
            },
            update={'$set': {'status': self.OUTSTANDING}}
        )
        if record:
            print('Released:', record['_id'])

    def clear(self):
        self.db.crawl_queue.drop()
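
Before wiring the queue into the crawler, a quick round trip can confirm the state transitions against a local MongoDB instance. This is only a usage sketch and assumes a MongoDB server is reachable on localhost at the default port:

from day04_concurrent.mongo_queue import MongoQueue

queue = MongoQueue(timeout=300)
queue.clear()                       # start from an empty crawl_queue collection
queue.push('http://s3.amazonaws.com/alexa-static/top-1m.csv.zip')
print(bool(queue))                  # True: one URL is OUTSTANDING
url = queue.pop()                   # claims the URL and marks it PROCESSING
queue.complete(url)                 # marks the URL COMPLETE
print(bool(queue))                  # False: nothing left to process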



Modify the crawler:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" @author [email protected]
    @function:
    @create 18-6-23 11:06 AM"""

import time
import urllib.parse
import threading
import multiprocessing
from day04_concurrent.mongodb_cache_example import MongoCache
from day04_concurrent.mongo_queue import MongoQueue
from day04_concurrent.downloader import Downloader

SLEEP_TIME = 1


def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None, user_agent='wswp',
                     proxies=None, num_retries=1, max_threads=10, timeout=60):
    """Crawl using multiple threads
    """
    # the queue of URLs that still need to be crawled
    crawl_queue = MongoQueue()
    crawl_queue.clear()
    crawl_queue.push(seed_url)
    D = Downloader(cache=cache, delay=delay, user_agent=user_agent, proxies=proxies,
                   num_retries=num_retries, timeout=timeout)

    def process_queue():
        while True:
            # keep track of the URL currently being processed
            try:
                url = crawl_queue.pop()
            except KeyError:
                # currently no urls to process
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print('Error in callback for: {}: {}'.format(url, e))
                    else:
                        for link in links:
                            # add this new link to queue
                            crawl_queue.push(normalize(seed_url, link))
                crawl_queue.complete(url)

    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            thread.daemon = True  # daemon thread so the main thread can exit when it receives Ctrl-C
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)


def process_crawler(args, **kwargs):
    num_cpus = multiprocessing.cpu_count()
    # pool = multiprocessing.Pool(processes=num_cpus)
    print('Starting {} processes'.format(num_cpus))
    processes = []
    for i in range(num_cpus):
        p = multiprocessing.Process(target=threaded_crawler, args=[args], kwargs=kwargs)
        # parsed = pool.apply_async(threaded_link_crawler, args, kwargs)
        p.start()
        processes.append(p)
    # wait for processes to complete
    for p in processes:
        p.join()


def normalize(seed_url, link):
    """Normalize this URL by removing hash and adding domain
    """
    link, _ = urllib.parse.urldefrag(link)  # remove hash to avoid duplicates
    return urllib.parse.urljoin(seed_url, link)
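
Analogous to the sequential test in section 3, the multi-process crawler can be launched with the same AlexaCallback and MongoCache. The sketch below assumes the code above is saved as a module named day04_concurrent.process_crawler; the actual file name in the repository may differ.

from day04_concurrent.alexa_callback import AlexaCallback
from day04_concurrent.mongodb_cache_example import MongoCache
# module name assumed for the process crawler defined above; adjust it to the real file name
from day04_concurrent.process_crawler import process_crawler

url = 'http://s3.amazonaws.com/alexa-static/top-1m.csv.zip'

if __name__ == '__main__':
    '''multi-process crawler: one threaded crawler per CPU core, all sharing the MongoDB queue'''
    process_crawler(url, cache=MongoCache(), scrape_callback=AlexaCallback(url=url),
                    max_threads=5, timeout=10)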



Source code: https://github.com/ice1995/python_web_crawler-/tree/master/day04_concurrent
