python解析csv文件 提取数据

alexa 网站列表是以电子表格的形式提供的,表格中有两项内容,分别是排名和域名

- A B
1 1 google.com
2 2 facebook.com
3 3 youtube.com

抽取数据包含如下四个步骤:
1.下载zip文件
2.从zip文件中提取csv文件
3.解析csv文件
4.遍历csv文件中的每一行,从中提取数据

下面是实现上面功能的代码:
其中Downloader的内容:

#-*- coding=utf-8 -*-
import re
import urlparse
import urllib2
import time
from datetime import datetime
import robotparser
import Queue
from bs4 import BeautifulSoup
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import  csv
import lxml.html
import  random
import  cssselect
import socket

# Module-level defaults. Nothing in the visible code references these names;
# Downloader.__init__ repeats the same values as literal parameter defaults.
DEFAULT_AGENT='wswp'  # user-agent string
DEFAULT_DELAY=5  # presumably seconds between requests to one domain
DEFAULT_RETRIES=1  # presumably the number of retries after a 5xx response
DEFAULT_TIMEOUT=60  # presumably the socket timeout in seconds

class Throttle:
    """Throttle downloading by sleeping between requests to the same domain.

    The time of the last access is remembered per domain (the URL's netloc).
    ``wait`` sleeps just long enough that at least ``delay`` seconds separate
    two consecutive accesses to one domain.
    """

    def __init__(self, delay):
        # minimum number of seconds between two downloads from one domain;
        # a value <= 0 disables throttling
        self.delay = delay
        # maps domain (netloc) -> datetime of the most recent access
        self.domains = {}

    def wait(self, url):
        """Sleep if the domain of ``url`` was accessed too recently.

        The access time recorded for the domain is refreshed on every call,
        whether or not a sleep was needed.
        """
        domain = urlparse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)

        if self.delay > 0 and last_accessed is not None:
            # total_seconds() measures the whole elapsed interval. The
            # ``.seconds`` attribute is only the sub-day component, so it
            # wraps around once more than a day has passed and would cause
            # a needless sleep.
            elapsed = (datetime.now() - last_accessed).total_seconds()
            sleep_secs = self.delay - elapsed
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.now()

class Downloader:
    """Callable page downloader with throttling, proxies, retries and a cache.

    Calling an instance with a URL returns the page body. The result is
    taken from ``cache`` when present there; otherwise it is downloaded
    (after waiting on the per-domain throttle) and stored in the cache.

    Parameters:
        delay: value handed to ``Throttle`` (seconds between requests to
            one domain).
        user_agent: value sent in the ``User-agent`` request header.
        proxies: optional sequence of proxies; one is picked at random for
            each download.
        num_retries: how many times a download is retried after a 5xx
            response.
        timeout: installed as the process-wide default socket timeout.
        opener: optional opener object with an ``open(request)`` method;
            a fresh ``urllib2.build_opener()`` is used when omitted.
        cache: optional mapping-like object supporting ``cache[url]``
            lookup (raising ``KeyError`` on a miss) and item assignment.
    """

    def __init__(self, delay=5, user_agent='wswp', proxies=None, num_retries=1, timeout=60, opener=None, cache=None):
        # NOTE: this changes the default timeout for every socket created
        # afterwards in the process, not only this downloader's sockets.
        socket.setdefaulttimeout(timeout)
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.proxies = proxies
        self.num_retries = num_retries
        self.opener = opener
        self.cache = cache

    def __call__(self, url):
        """Return the body for ``url``, using the cache when possible.

        A cached entry is returned as stored, whatever its status code.
        """
        result = None
        # Compare against None rather than testing truthiness: an empty
        # cache (for example a plain ``{}``) is falsy but must still be
        # consulted and filled.
        if self.cache is not None:
            try:
                result = self.cache[url]
            except KeyError:
                # the url is not in the cache yet
                pass
        if result is None:
            # nothing cached for this url, so it has to be downloaded
            self.throttle.wait(url)
            proxy = random.choice(self.proxies) if self.proxies else None
            # The header has to be named "User-agent"; a key such as
            # "user_agent" is sent as an unrelated header.
            headers = {'User-agent': self.user_agent}
            result = self.download(url, headers, proxy, self.num_retries)
            if self.cache is not None:
                # store the fresh result in the cache
                self.cache[url] = result
        return result['html']

    def download(self, url, headers, proxy, num_retries, data=None):
        """Download ``url`` and return ``{'html': body, 'code': status}``.

        On failure ``html`` is ``''``; ``code`` is the error's ``code``
        attribute when it has one, otherwise ``None``. A 5xx response is
        retried while ``num_retries`` is greater than zero.
        """
        # Single-argument print(...) produces the same output under the
        # Python 2 print statement and the print function.
        print('Downlaoding: %s' % url)
        request = urllib2.Request(url, data, headers or {})
        opener = self.opener or urllib2.build_opener()
        if proxy:
            # NOTE: when an opener was supplied to the constructor, the
            # proxy handler is added to that shared opener on each call.
            proxy_params = {urlparse.urlparse(url).scheme: proxy}
            opener.add_handler(urllib2.ProxyHandler(proxy_params))
        try:
            response = opener.open(request)
            html = response.read()
            code = response.code
        except Exception as e:
            print('Download error: %s' % str(e))
            html = ''
            if hasattr(e, 'code'):
                code = e.code
                if num_retries > 0 and 500 <= code < 600:
                    # server-side error: try again with one retry fewer
                    return self.download(url, headers, proxy, num_retries - 1, data)
            else:
                code = None
        return {'html': html, 'code': code}





#-*- coding=utf-8 -*-
import csv
from zipfile import ZipFile
from StringIO import StringIO
from downloader import Downloader

# Fetch the zipped Alexa ranking through the downloader.
D = Downloader()
zipped_data = D('http://s3.amazonaws.com/alexa-static/top-1m.csv.zip')

urls = []
# ZipFile needs a file-like object, so the downloaded string is wrapped
# in StringIO before being handed over.
with ZipFile(StringIO(zipped_data)) as zf:
    # The first entry of the archive listing is taken as the CSV member.
    csv_filename = zf.namelist()[0]
    rows = csv.reader(zf.open(csv_filename))
    # Each row unpacks into two fields; the second one is turned into a URL.
    urls.extend(['http://' + website for _, website in rows])

你可能已经注意到,下载得到的压缩数据是在使用StringIO封装之后才传给ZipFile的。这是因为ZipFile需要一个类似文件的接口,而不是字符串。接下来我们从压缩文件中提取文件的列表。由于这个.zip文件只包含一个文件,所以我们直接选择第一个文件即可。然后遍历该csv文件,将第二列中的域名数据添加到URL列表中。

你可能感兴趣的:(python)