#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" @author [email protected]
@function:
@create 18-6-19 上午10:51""
import csv
import sys
import zipfile
from io import StringIO,BytesIO
import os
# The downloader module is a Python 3 version converted from the Python 2 downloader; it has been uploaded to PyPI and can be installed with: pip install downloader-py3
import downloader
url = 'http://s3.amazonaws.com/alexa-static/top-1m.csv.zip'
db_path = os.path.expanduser('~/.tmp_downloader.sqlite')
if os.path.exists(db_path):
    os.remove(db_path)
D = downloader.Downloader(db_path, [0.1, 0.2])
zipped_data = D.open_url(url, 10, parse_as_html=False)
urls = []
with zipfile.ZipFile(BytesIO(zipped_data)) as zf:
    csv_filename = zf.namelist()[0]
    with zf.open(csv_filename, 'r') as cf:
        for row in cf:
            website = row.decode('utf-8').split(',')[1].replace("\n", "")
            urls.append('https://' + website)
print(urls)
import zipfile
from io import BytesIO
class AlexaCallback:
    def __init__(self, url, max_urls=1000):
        self.seed_url = url
        self.max_urls = max_urls

    def __call__(self, url, html):
        if url == self.seed_url:
            urls = []
            with zipfile.ZipFile(BytesIO(html)) as zf:
                csv_filename = zf.namelist()[0]
                with zf.open(csv_filename, 'r') as cf:
                    for row in cf:
                        website = row.decode('utf-8').split(',')[1].replace("\n", "")
                        urls.append('https://' + website)
                        if len(urls) == self.max_urls:
                            break
            return urls
from day04_concurrent.alexa_callback import AlexaCallback
from day04_concurrent.link_crawler import link_crawler
from day04_concurrent.mongodb_cache_example import MongoCache
url = 'http://s3.amazonaws.com/alexa-static/top-1m.csv.zip'
if __name__ == '__main__':
    '''Serial crawler'''
    link_crawler(seed_url=url, cache=MongoCache(), scrape_callback=AlexaCallback(url=url))
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" @author [email protected]
@function:
@create 18-6-23 10:25 AM"""
import time
import threading
import urllib.parse
from day04_concurrent.downloader import Downloader
SLEEP_TIME = 1
def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None, user_agent='wswp',
                     proxies=None, num_retries=1, max_threads=10, timeout=60):
    """Crawl this website in multiple threads
    """
    # the queue of URLs that still need to be crawled
    # crawl_queue = queue.deque([seed_url])
    crawl_queue = [seed_url]
    # the URLs that have been seen
    seen = {seed_url}
    D = Downloader(cache=cache, delay=delay, user_agent=user_agent, proxies=proxies,
                   num_retries=num_retries, timeout=timeout)

    def process_queue():
        while True:
            try:
                url = crawl_queue.pop()
            except IndexError:
                # crawl queue is empty
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print('Error in callback for: {}: {}'.format(url, e))
                    else:
                        for link in links:
                            link = normalize(seed_url, link)
                            # check whether we already crawled this link
                            if link not in seen:
                                seen.add(link)
                                # add this new link to the queue
                                crawl_queue.append(link)

    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:
        # the crawl is still active
        for thread in threads:
            if not thread.is_alive():
                # remove the stopped threads
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue:
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            thread.daemon = True  # set daemon so the main thread can exit when it receives ctrl-c
            thread.start()
            threads.append(thread)
        # all threads have been processed
        # sleep temporarily so the CPU can focus execution on the other threads
        time.sleep(SLEEP_TIME)
def normalize(seed_url, link):
    """Normalize this URL by removing hash and adding domain
    """
    link, _ = urllib.parse.urldefrag(link)  # remove hash to avoid duplicates
    return urllib.parse.urljoin(seed_url, link)
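
For completeness, here is a minimal driver sketch for this threaded crawler, analogous to the serial example above. It assumes the file above is saved as threaded_crawler.py inside the day04_concurrent package (that module name is an assumption); MongoCache and AlexaCallback come from the modules shown earlier.

# Hypothetical driver script (the threaded_crawler module name is assumed, not confirmed by the repo listing above)
from day04_concurrent.alexa_callback import AlexaCallback
from day04_concurrent.mongodb_cache_example import MongoCache
from day04_concurrent.threaded_crawler import threaded_crawler

url = 'http://s3.amazonaws.com/alexa-static/top-1m.csv.zip'

if __name__ == '__main__':
    '''Threaded crawler: up to 10 worker threads share one in-memory queue and one MongoDB cache'''
    threaded_crawler(seed_url=url, cache=MongoCache(), scrape_callback=AlexaCallback(url=url), max_threads=10)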
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" @author [email protected]
@function:
@create 18-6-23 10:41 AM"""
from datetime import datetime, timedelta
from pymongo import MongoClient, errors
class MongoQueue:
    """
    Defines three states: OUTSTANDING, PROCESSING, COMPLETE
    OUTSTANDING: a new URL has just been added
    PROCESSING: the URL has been taken from the queue and is being downloaded
    COMPLETE: the download has finished
    To avoid losing a URL's result, a timeout parameter is used:
    :param timeout: if processing a URL takes longer than this value, processing is assumed
        to have failed and the status is reset to OUTSTANDING
    (A short usage sketch follows this class.)
    """
    OUTSTANDING, PROCESSING, COMPLETE = range(3)

    def __init__(self, client=None, timeout=300):
        self.client = MongoClient() if client is None else client
        self.db = self.client.cache
        self.timeout = timeout

    def __bool__(self):
        record = self.db.crawl_queue.find_one(
            {
                'status': {'$ne': self.COMPLETE}
            })
        return True if record else False

    def push(self, url):
        try:
            self.db.crawl_queue.insert(
                {
                    '_id': url,
                    'status': self.OUTSTANDING
                })
        except errors.DuplicateKeyError:
            # this means the URL is already in the queue
            pass

    def pop(self):
        record = self.db.crawl_queue.find_and_modify(
            query={'status': self.OUTSTANDING},
            update={
                '$set': {'status': self.PROCESSING,
                         'timestamp': datetime.now()}}
        )
        if record:
            return record['_id']
        else:
            self.repair()
            raise KeyError()

    def peek(self):
        record = self.db.crawl_queue.find_one({'status': self.OUTSTANDING})
        if record:
            return record['_id']

    def complete(self, url):
        self.db.crawl_queue.update(
            {'_id': url}, {'$set': {'status': self.COMPLETE}}
        )

    def repair(self):
        record = self.db.crawl_queue.find_and_modify(
            query={
                'timestamp': {'$lt': datetime.now() - timedelta(seconds=self.timeout)},
                'status': {'$ne': self.COMPLETE}
            },
            update={'$set': {'status': self.OUTSTANDING}}
        )
        if record:
            print('Released:', record['_id'])

    def clear(self):
        self.db.crawl_queue.drop()
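
A minimal sketch of the queue lifecycle, assuming a MongoDB instance is reachable on the default localhost port:

# Illustration only: walks one URL through the three states (assumes a local MongoDB is running)
from day04_concurrent.mongo_queue import MongoQueue

queue = MongoQueue(timeout=300)
queue.clear()                        # drop any leftover crawl_queue collection
queue.push('http://example.com')     # inserted with status OUTSTANDING
url = queue.pop()                    # status -> PROCESSING, timestamp recorded
queue.complete(url)                  # status -> COMPLETE
print(bool(queue))                   # False: nothing left outstanding or processing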
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" @author [email protected]
@function:
@create 18-6-23 11:06 AM"""
import time
import urllib.parse
import threading
import multiprocessing
from day04_concurrent.mongodb_cache_example import MongoCache
from day04_concurrent.mongo_queue import MongoQueue
from day04_concurrent.downloader import Downloader
SLEEP_TIME = 1
def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None, user_agent='wswp',
                     proxies=None, num_retries=1, max_threads=10, timeout=60):
    """Crawl using multiple threads
    """
    # the queue of URLs that still need to be crawled
    crawl_queue = MongoQueue()
    crawl_queue.clear()
    crawl_queue.push(seed_url)
    D = Downloader(cache=cache, delay=delay, user_agent=user_agent, proxies=proxies,
                   num_retries=num_retries, timeout=timeout)

    def process_queue():
        while True:
            # keep track of which URL is currently being processed
            try:
                url = crawl_queue.pop()
            except KeyError:
                # currently no urls to process
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print('Error in callback for: {}: {}'.format(url, e))
                    else:
                        for link in links:
                            # add this new link to the queue
                            crawl_queue.push(normalize(seed_url, link))
                crawl_queue.complete(url)

    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            thread.daemon = True  # set daemon so the main thread can exit when it receives ctrl-c
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
def process_crawler(args, **kwargs):
    num_cpus = multiprocessing.cpu_count()
    # pool = multiprocessing.Pool(processes=num_cpus)
    print('Starting {} processes'.format(num_cpus))
    processes = []
    for i in range(num_cpus):
        p = multiprocessing.Process(target=threaded_crawler, args=[args], kwargs=kwargs)
        # parsed = pool.apply_async(threaded_link_crawler, args, kwargs)
        p.start()
        processes.append(p)
    # wait for processes to complete
    for p in processes:
        p.join()


def normalize(seed_url, link):
    """Normalize this URL by removing hash and adding domain
    """
    link, _ = urllib.parse.urldefrag(link)  # remove hash to avoid duplicates
    return urllib.parse.urljoin(seed_url, link)
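
And a driver sketch for the process-based crawler, again assuming the file above is saved as process_crawler.py in the day04_concurrent package (that module name is an assumption). Each process runs its own threaded_crawler, and the processes coordinate through the shared MongoQueue and MongoDB cache.

# Hypothetical driver script (the process_crawler module name is assumed, not confirmed by the repo listing above)
from day04_concurrent.alexa_callback import AlexaCallback
from day04_concurrent.mongodb_cache_example import MongoCache
from day04_concurrent.process_crawler import process_crawler

url = 'http://s3.amazonaws.com/alexa-static/top-1m.csv.zip'

if __name__ == '__main__':
    '''Multi-process crawler: one threaded_crawler per CPU core, 5 threads each'''
    process_crawler(url, cache=MongoCache(), scrape_callback=AlexaCallback(url=url), max_threads=5)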
Source code: https://github.com/ice1995/python_web_crawler-/tree/master/day04_concurrent