alexa 网站列表是以电子表格的形式提供的,表格中有两项内容,分别是排名和域名
- | A | B |
---|---|---|
1 | 1 | google.com |
2 | 2 | facebook.com |
3 | 3 | youtube.com |
… | … | … |
抽取数据包含如下四个步骤:
1.下载zip文件
2.从zip文件中提取csv文件
3.解析csv文件
4.遍历csv文件中的每一行,从中提取数据
下面是实现上面功能的代码:
其中Downloader的内容:
#-*- coding=utf-8 -*-
import re
import urlparse
import urllib2
import time
from datetime import datetime
import robotparser
import Queue
from bs4 import BeautifulSoup
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import csv
import lxml.html
import random
import cssselect
import socket
# Module-level default values.
# NOTE(review): none of these names is referenced in the code visible here;
# Downloader.__init__ repeats the same values as literal keyword defaults.
# Confirm whether other modules import them before removing or changing them.
DEFAULT_AGENT='wswp'  # same value as the user_agent default of Downloader.__init__
DEFAULT_DELAY=5  # same value as the delay default; presumably seconds -- confirm
DEFAULT_RETRIES=1  # same value as the num_retries default
DEFAULT_TIMEOUT=60  # same value as the timeout default passed to socket.setdefaulttimeout
class Throttle:
    """Throttle downloading by sleeping between requests to the same domain.

    ``delay`` is the minimum number of seconds that must pass between two
    requests to one domain; a value of 0 or less disables throttling.
    """

    def __init__(self, delay):
        # minimum number of seconds between downloads for each domain
        self.delay = delay
        # maps a domain (the URL's netloc) to the datetime of its last access
        self.domains = {}

    def wait(self, url):
        """Sleep if the domain of ``url`` was accessed less than ``delay`` seconds ago.

        The current time is recorded as the domain's last access before
        returning, whether or not a sleep was needed.
        """
        domain = urlparse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            # total_seconds() measures the whole interval. The ``.seconds``
            # attribute is only the whole-seconds component (0..86399): it
            # ignores elapsed days and drops the fractional part, which made
            # the throttle sleep when no sleep was due.
            elapsed = (datetime.now() - last_accessed).total_seconds()
            sleep_secs = self.delay - elapsed
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.now()
class Downloader:
    """Callable downloader with throttling, optional proxies, retries and a cache.

    Calling an instance with a URL returns the downloaded body (the ``'html'``
    entry of the result dict). When a cache is configured, a cached result is
    returned without touching the network and fresh results are stored in it.

    Parameters:
        delay: minimum number of seconds between requests to one domain.
        user_agent: value sent as the ``User-agent`` request header.
        proxies: optional sequence of proxies; one is picked at random per
            download.
        num_retries: how many times a download is retried after a 5xx error.
        timeout: passed to ``socket.setdefaulttimeout``.
        opener: optional opener object used instead of
            ``urllib2.build_opener()``.
        cache: optional mapping-like object indexed by URL; a missing URL
            must raise ``KeyError``.
    """

    def __init__(self, delay=5, user_agent='wswp', proxies=None, num_retries=1, timeout=60, opener=None, cache=None):
        # NOTE: this changes the default timeout for every socket created
        # afterwards in this process, not only the ones used by this object.
        socket.setdefaulttimeout(timeout)
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.proxies = proxies
        self.num_retries = num_retries
        self.opener = opener
        self.cache = cache

    def __call__(self, url):
        """Return the body for ``url``, from the cache if possible."""
        result = None
        # Compare against None rather than testing truthiness: an empty cache
        # (for example a fresh dict) is falsy but must still be consulted and
        # filled.
        if self.cache is not None:
            try:
                result = self.cache[url]
            except KeyError:
                # the URL is not available in the cache
                pass
            # NOTE: a cached result is returned as-is, even when its stored
            # code is a 5xx server error; no re-download is attempted.
        if result is None:
            # the result was not in the cache, so it still has to be downloaded
            self.throttle.wait(url)
            proxy = random.choice(self.proxies) if self.proxies else None
            # The HTTP header is named "User-agent"; a 'user_agent' key would
            # be sent as a different, meaningless header.
            headers = {'User-agent': self.user_agent}
            result = self.download(url, headers, proxy, self.num_retries)
            if self.cache is not None:
                # save the result in the cache
                self.cache[url] = result
        return result['html']

    def download(self, url, headers, proxy, num_retries, data=None):
        """Download ``url`` and return ``{'html': body, 'code': status}``.

        On any exception the body is ``''``; the code is the exception's
        ``code`` attribute when it has one, otherwise ``None``. A 5xx code
        triggers a recursive retry while ``num_retries`` is greater than 0.
        """
        # Single-argument print(...) emits the same text as the Python 2
        # print statement would.
        print('Downlaoding: %s' % url)
        request = urllib2.Request(url, data, headers or {})
        opener = self.opener or urllib2.build_opener()
        if proxy:
            # NOTE: when self.opener is set, the handler is added to that
            # shared opener on every proxied download.
            proxy_params = {urlparse.urlparse(url).scheme: proxy}
            opener.add_handler(urllib2.ProxyHandler(proxy_params))
        try:
            response = opener.open(request)
            html = response.read()
            code = response.code
        except Exception as e:
            print('Download error: %s' % str(e))
            html = ''
            if hasattr(e, 'code'):
                code = e.code
                if num_retries > 0 and 500 <= code < 600:
                    # server-side error: try again with one retry fewer
                    return self.download(url, headers, proxy, num_retries - 1, data)
            else:
                code = None
        return {'html': html, 'code': code}
#-*- coding=utf-8 -*-
import csv
from zipfile import ZipFile
from StringIO import StringIO
from downloader import Downloader
# Download the zipped Alexa list; D(...) returns the raw archive contents.
D = Downloader()
zipped_data = D('http://s3.amazonaws.com/alexa-static/top-1m.csv.zip')

urls = []
# ZipFile needs a file-like object rather than a string, hence the StringIO wrapper.
with ZipFile(StringIO(zipped_data)) as zf:
    # the first archive member is taken as the CSV file
    csv_filename = zf.namelist()[0]
    # each row is (rank, domain); only the domain is kept, as an http:// URL
    urls.extend(['http://' + website
                 for _, website in csv.reader(zf.open(csv_filename))])
你可能已经注意到,下载的压缩数据是在使用StringIO封装之后才传给ZipFile的。这是因为ZipFile需要一个类似文件的接口,而不是字符串。接下来我们从压缩文件中提取文件的列表。由于这个.zip 文件只包含一个文件,所以我们直接选择第一个文件即可。然后遍历该csv文件,将第二列中的域名数据添加到URL列表中。