#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" @author [email protected]
@function:
@create 18-6-15 下午3:31"""
import re
import urllib.parse
import urllib.request
import urllib.robotparser
from day03_cache.downloader import Downloader
def link_crawler(seed_url, link_regex=None, delay=5, max_depth=-1, max_urls=-1, user_agent='wswp',
proxies=None, num_retries=1, scrape_callback=None, cache=None):
"""Crawl from the given seed URL following links matched by link_regex
"""
    # the queue of URLs that still need to be crawled
    crawl_queue = [seed_url]
    # the URLs that have been seen and at what depth
    seen = {seed_url: 0}
    # track how many URLs have been downloaded
    num_urls = 0
rp = get_robots(seed_url)
D = Downloader(delay=delay, user_agent=user_agent, proxies=proxies, num_retries=num_retries,
cache=cache)
while crawl_queue:
url = crawl_queue.pop()
depth = seen[url]
# check url passes robots.txt restrictions
if rp.can_fetch(user_agent, url):
html = D(url)
links = []
if scrape_callback:
links.extend(scrape_callback(url, html) or [])
if depth != max_depth:
# can still crawl further
if link_regex:
# filter for links matching our regular expression
links.extend(link for link in get_links(html) if re.match(link_regex, link))
for link in links:
link = normalize(seed_url, link)
# check whether already crawled this link
if link not in seen:
seen[link] = depth + 1
# check link is within same domain
if same_domain(seed_url, link):
# success! add this new link to queue
crawl_queue.append(link)
# check whether have reached downloaded maximum
num_urls += 1
if num_urls == max_urls:
break
else:
print('Blocked by robots.txt:', url)
def normalize(seed_url, link):
"""Normalize this URL by removing hash and adding domain
"""
link, _ = urllib.parse.urldefrag(link) # remove hash to avoid duplicates
return urllib.parse.urljoin(seed_url, link)
def same_domain(url1, url2):
"""Return True if both URL's belong to same domain
"""
return urllib.parse.urlparse(url1).netloc == urllib.parse.urlparse(url2).netloc
def get_robots(url):
"""Initialize robots parser for this domain
"""
rp = urllib.robotparser.RobotFileParser()
rp.set_url(urllib.parse.urljoin(url, '/robots.txt'))
rp.read()
return rp
def get_links(html):
"""Return a list of links from html
"""
# a regular expression to extract all links from the webpage
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
# list of all links from the webpage
return webpage_regex.findall(html.decode('utf-8'))
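
# Note on the cache contract (illustration only): the Downloader imported from
# day03_cache.downloader is called as D(url) above, and the optional cache it
# receives is assumed to behave like a dict keyed by URL -- the same
# __getitem__/__setitem__ protocol the DiskCache and MongoCache classes later
# in this repo implement. The helper below is a minimal sketch of that
# lookup-then-store pattern under those assumptions; `fetch` is a hypothetical
# stand-in for the real download/retry logic and is not part of this repo.
def _cached_fetch_sketch(url, cache, fetch):
    """Return cache[url] if present, otherwise fetch the page and store it."""
    try:
        return cache[url]
    except KeyError:
        result = fetch(url)
        cache[url] = result
        return result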
if __name__ == '__main__':
link_crawler('http://example.webscraping.com', '(.*?)/(index|view)', delay=0, num_retries=1,
user_agent='BadCrawler')
link_crawler('http://example.webscraping.com', '(.*?)/(index|view)', delay=0, num_retries=1,
max_depth=1, user_agent='GoodCrawler')
import re
url = 'http://example.webscraping.com/default/view/1'
    filename = re.sub(r'[^/0-9a-zA-Z\-.,;_]', '_', url)
filename = '/'.join(segment[:255] for segment in filename.split('/'))
new_url = 'http://example.webscraping.com/default/view/'
components = urllib.parse.urlsplit(new_url)
print(components)
# SplitResult(scheme='http', netloc='example.webscraping.com', path='/default/view/', query='', fragment='')
print(components.path)
# /default/view/
path = components.path
if not path:
path = '/index.html'
elif path.endswith('/'):
        path += 'index.html'
    new_filename = components.netloc + path + components.query
print(new_filename)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" @author [email protected]
@function:
@create 18-6-18 上午11:07"""
import os
import re
import urllib.parse
import pickle
class DiskCache:
    def __init__(self, cache_dir='cache', max_length=255):
self.cache_dir = cache_dir
self.max_length = max_length
def url_2_path(self, url):
'''
Create file system path for this url
:param url:
:return:
'''
        components = urllib.parse.urlsplit(url)
        # append index.html to empty paths
        path = components.path
        if not path:
            path = '/index.html'
        elif path.endswith('/'):
            path += 'index.html'
        filename = components.netloc + path + components.query
        # replace invalid characters
        filename = re.sub(r'[^/0-9a-zA-Z\-.,;_]', '_', filename)
        # restrict the maximum number of characters per path segment
        filename = '/'.join(segment[:self.max_length] for segment in filename.split('/'))
return os.path.join(self.cache_dir, filename)
def __getitem__(self, url):
'''
Load data from disk for this url
:param url:
:return:
'''
path = self.url_2_path(url)
if os.path.exists(path):
with open(path, 'rb') as fp:
return pickle.load(fp)
else:
            # url has not been cached
            raise KeyError(url + ' does not exist')
def __setitem__(self, url, result):
'''
Save data to disk for this url
:param url:
:param result:
:return:
'''
path = self.url_2_path(url)
folder = os.path.dirname(path)
if not os.path.exists(folder):
os.makedirs(folder)
with open(path, 'wb') as fp:
fp.write(pickle.dumps(result))
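
# Minimal usage sketch for DiskCache (assumptions: the process may create the
# cache directory under the current working directory; the stored record is a
# placeholder dict standing in for whatever Downloader returns):
if __name__ == '__main__':
    cache = DiskCache()
    url = 'http://example.webscraping.com/default/view/1'
    cache[url] = {'html': '<html>placeholder</html>', 'code': 200}  # pickled to disk
    print(cache.url_2_path(url))  # file system path the record was written to
    print(cache[url])             # loaded back from disk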
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" @author [email protected]
@function:
@create 18-6-18 上午11:07"""
import os
import re
import urllib.parse
import pickle
from datetime import datetime, timedelta
import zlib
class DiskCache:
    def __init__(self, cache_dir='cache', max_length=255, expires=timedelta(days=30)):
self.cache_dir = cache_dir
self.max_length = max_length
self.expires = expires
def url_2_path(self, url):
'''
Create file system path for this url
:param url:
:return:
'''
        components = urllib.parse.urlsplit(url)
        # append index.html to empty paths
        path = components.path
        if not path:
            path = '/index.html'
        elif path.endswith('/'):
            path += 'index.html'
        filename = components.netloc + path + components.query
        # replace invalid characters
        filename = re.sub(r'[^/0-9a-zA-Z\-.,;_]', '_', filename)
        # restrict the maximum number of characters per path segment
        filename = '/'.join(segment[:self.max_length] for segment in filename.split('/'))
return os.path.join(self.cache_dir, filename)
def __getitem__(self, url):
'''
Load data from disk for this url
:param url:
:return:
'''
path = self.url_2_path(url)
if os.path.exists(path):
with open(path, 'rb') as fp:
                result, timestamp = pickle.loads(zlib.decompress(fp.read()))
            if self.has_expired(timestamp):
                raise KeyError(url + ' has expired')
            return result
else:
            # url has not been cached
            raise KeyError(url + ' does not exist')
def __setitem__(self, url, result):
'''
Save data to disk for this url
:param url:
:param result:
:return:
'''
path = self.url_2_path(url)
folder = os.path.dirname(path)
if not os.path.exists(folder):
os.makedirs(folder)
timestamp = datetime.utcnow()
data = pickle.dumps((result, timestamp))
with open(path, 'wb') as fp:
            fp.write(zlib.compress(data))
def has_expired(self, timestamp):
'''
:param timestamp:
:return: boolean whether this timestamp has expired
'''
return datetime.utcnow() > timestamp + self.expires
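
# Minimal usage sketch for the compressing, expiring DiskCache (assumptions:
# a 1-second expiry window is used purely to demonstrate the KeyError raised
# for stale entries; the stored record is a placeholder dict):
if __name__ == '__main__':
    import time
    cache = DiskCache(expires=timedelta(seconds=1))
    url = 'http://example.webscraping.com/default/view/1'
    cache[url] = {'html': '<html>placeholder</html>', 'code': 200}  # compressed + timestamped
    print(cache[url])  # fresh, served from disk
    time.sleep(2)      # wait past the 1-second expiry window
    try:
        cache[url]
    except KeyError as e:
        print(e)       # reports that the URL has expired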
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" @author [email protected]
@function:
@create 18-6-19 上午9:07"""
from datetime import datetime, timedelta
from pymongo import MongoClient
class MongoCache:
def __init__(self, client=None, expires=timedelta(days=30)):
        # connect to a local MongoDB instance if no client was supplied
        self.client = MongoClient('localhost', 27017) if client is None else client
        self.db = self.client.cache
        # let MongoDB expire stale records via a TTL index on the timestamp field
        self.db.webpage.create_index('timestamp', expireAfterSeconds=expires.total_seconds())
def __getitem__(self, url):
record = self.db.webpage.find_one({'_id': url})
if record:
return record['result']
else:
            raise KeyError(url + ' does not exist')
def __setitem__(self, url, result):
record = {'result': result, 'timestamp': datetime.utcnow()}
        self.db.webpage.update_one({'_id': url}, {'$set': record}, upsert=True)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" @author [email protected]
@function:
@create 18-6-19 上午9:07"""
import pickle
import zlib
from datetime import datetime, timedelta
from pymongo import MongoClient
from bson.binary import Binary
class MongoCache:
def __init__(self, client=None, expires=timedelta(days=30)):
        # connect to a local MongoDB instance if no client was supplied
        self.client = MongoClient('localhost', 27017) if client is None else client
        self.db = self.client.cache
        # let MongoDB expire stale records via a TTL index on the timestamp field
        self.db.webpage.create_index('timestamp', expireAfterSeconds=expires.total_seconds())
def __getitem__(self, url):
record = self.db.webpage.find_one({'_id': url})
if record:
return pickle.loads(zlib.decompress(record['result']))
else:
            raise KeyError(url + ' does not exist')
def __setitem__(self, url, result):
record = {'result': Binary(zlib.compress(pickle.dumps(result))),
'timestamp': datetime.utcnow()}
        self.db.webpage.update_one({'_id': url}, {'$set': record}, upsert=True)
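
# Minimal usage sketch for MongoCache (assumptions: a MongoDB server is
# reachable on localhost:27017; the stored record is a placeholder dict; in
# the crawler this instance would be passed to link_crawler via its cache
# argument and forwarded to Downloader):
if __name__ == '__main__':
    cache = MongoCache(expires=timedelta(days=30))
    url = 'http://example.webscraping.com/default/view/1'
    cache[url] = {'html': '<html>placeholder</html>', 'code': 200}  # compressed + pickled + timestamped
    print(cache[url])  # round-trips through zlib and pickle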
# source: https://github.com/ice1995/python_web_crawler-/tree/master/day03_cache