import re
import time
import urllib.request as ur
from urllib import robotparser
from urllib.error import URLError, ContentTooShortError, HTTPError
from urllib.parse import urljoin, urlparse
class Throttle(object):
    """Rate-limit downloads by pausing between requests to the same domain."""
    def __init__(self, delay):
        self.delay = delay
        self.domains = {}  # maps domain -> timestamp of the last request

    def wait(self, url):
        domain = urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (time.time() - last_accessed)
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = time.time()
def download(url, num_retries=2, user_agent='wswp', charset='utf-8', proxy=None):
    """Download a URL, retrying up to num_retries times on 5xx server errors."""
    print('Downloading:', url)
    request = ur.Request(url)
    request.add_header('User-Agent', user_agent)
    try:
        if proxy:
            # Route the request through the proxy opener instead of the default opener.
            resp = getProxy().open(request)
        else:
            resp = ur.urlopen(request)
        cs = resp.headers.get_content_charset()
        if not cs:
            cs = charset
        html = resp.read().decode(cs)
    except (URLError, ContentTooShortError, HTTPError) as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # Retry server errors with the same settings.
                return download(url, num_retries - 1, user_agent, charset, proxy)
    return html
def getProxy():
    """Fetch a proxy address from a proxy-listing service and build an opener that uses it."""
    # Placeholder URL: replace with a real proxy IP service endpoint.
    proxy_address = ur.urlopen("{proxy IP service URL}").read().decode('utf-8')
    proxy_handler = ur.ProxyHandler({'http': proxy_address})
    proxy_opener = ur.build_opener(proxy_handler)
    return proxy_opener
def get_robots_parser(robot_url):
    rp = robotparser.RobotFileParser()
    rp.set_url(robot_url)
    rp.read()
    return rp
def link_crawlinks(start_url, link_regex, robots_url=None, user_agent='wswp'):
    """Crawl from start_url, following links that match link_regex while obeying robots.txt."""
    crawl_queue = [start_url]
    if not robots_url:
        robots_url = urljoin(start_url, '/robots.txt')
    rp = get_robots_parser(robots_url)
    seen = set(crawl_queue)
    # Create the throttle once, outside the loop, so per-domain timestamps persist between requests.
    throttle = Throttle(delay=0.5)
    while crawl_queue:
        url = crawl_queue.pop()
        if rp.can_fetch(user_agent, url):
            throttle.wait(url)
            html = download(url, user_agent=user_agent)
            if not html:
                continue
            for link in get_links(html):
                if re.match(link_regex, link):
                    abs_link = urljoin(start_url, link)
                    if abs_link not in seen:
                        seen.add(abs_link)
                        crawl_queue.append(abs_link)
        else:
            print('Blocked by robots.txt:', url)
def get_links(html):
    """Return a list of href attribute values from anchor tags in the page."""
    webpage_regex = re.compile("""<a[^>]+href=["'](.*?)["']""", re.IGNORECASE)
    return webpage_regex.findall(html)
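
# A minimal usage sketch: 'http://example.com' and the '/index' link pattern are
# hypothetical placeholders, not values from the original script; substitute a
# site you are permitted to crawl and a regex matching the links you want to follow.
if __name__ == '__main__':
    link_crawlinks('http://example.com', '/index', user_agent='wswp')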