#coding=utf-8
import Queue
import threading
import urllib2
import time
from BeautifulSoup import BeautifulSoup
# Target hosts to download. Note: the original list had a stray leading
# space in every URL (" http://yahoo.com"); urllib2 tolerated it only
# because Request.__init__ strips whitespace, but any other consumer of
# this list would break, so the URLs are now clean.
hosts = ["http://yahoo.com", "http://google.com", "http://amazon.com",
         "http://ibm.com", "http://apple.com"]
# Queue of URLs waiting to be downloaded (stage 1 input).
queue = Queue.Queue()
# Queue of downloaded HTTP bodies waiting to be parsed (stage 2 input).
out_queue = Queue.Queue()
class ThreadUrl(threading.Thread):
    """Fetcher thread: pull a URL from `queue`, download the page, and
    push the raw body onto `out_queue` for the parsing stage."""

    def __init__(self, queue, out_queue):
        threading.Thread.__init__(self)
        # Queue of URLs waiting to be fetched.
        self.queue = queue
        # Queue receiving the downloaded page bodies.
        self.out_queue = out_queue

    def run(self):
        while True:
            # Block until a host URL is available.
            host = self.queue.get()
            try:
                # Open the URL; explicitly close the response so the
                # underlying socket is released (the original leaked it).
                response = urllib2.urlopen(host)
                try:
                    chunk = response.read()
                finally:
                    response.close()
                # Hand the page body to the parsing stage.
                self.out_queue.put(chunk)
            finally:
                # Always mark the task done — even if the download raised —
                # otherwise queue.join() would hang forever on one failure.
                self.queue.task_done()
class DatamineThread(threading.Thread):
    """Parser thread: pull raw HTML from `out_queue`, parse it with
    BeautifulSoup, and print the page's <title> element(s).

    (The original docstring, "Threaded Url Grab", was a copy-paste of
    ThreadUrl's and did not describe this class.)
    """

    def __init__(self, out_queue):
        threading.Thread.__init__(self)
        # Queue of downloaded page bodies awaiting parsing.
        self.out_queue = out_queue

    def run(self):
        while True:
            # Block until a page body is available.
            chunk = self.out_queue.get()
            try:
                # Parse the HTML and show its title tag(s). The
                # parentheses make this a valid single-expression print
                # under both Python 2 and Python 3.
                soup = BeautifulSoup(chunk)
                print(soup.findAll(['title']))
            finally:
                # Always signal completion so out_queue.join() cannot
                # hang if BeautifulSoup raises on malformed input.
                self.out_queue.task_done()
# Record the script start time so total elapsed time can be reported at exit.
start = time.time()
def main(worker_count=5):
    """Run the two-stage pipeline: spawn fetcher and parser thread pools,
    enqueue the target hosts, and block until every URL has been
    downloaded and parsed.

    worker_count: number of threads in EACH pool (default 5, matching the
    original hard-coded pool size, so existing callers are unaffected).
    """
    # Stage 1: fetcher pool moving pages from `queue` into `out_queue`.
    for _ in range(worker_count):
        t = ThreadUrl(queue, out_queue)
        # Daemon threads let the interpreter exit even though run() loops
        # forever. The `daemon` attribute (Python 2.6+) replaces
        # setDaemon(), which was removed in Python 3.13.
        t.daemon = True
        t.start()
    # Feed the URLs to the fetcher pool.
    for host in hosts:
        queue.put(host)
    # Stage 2: parser pool draining `out_queue`.
    for _ in range(worker_count):
        dt = DatamineThread(out_queue)
        dt.daemon = True
        dt.start()
    # Wait until both stages have fully drained.
    queue.join()
    out_queue.join()
if __name__ == "__main__":
    # Guarded so importing this module does not trigger network traffic.
    main()
    # Report total wall-clock processing time.
    print("Elapsed Time: %s" % (time.time() - start))