在 Python 中使用多线程下载网页

import urllib2
import socket
import threading
import fcntl
#import portalocker

# flock() locks belong to the open file description, so threads that share one
# file object never block each other through flock() alone.  This in-process
# lock supplies the thread-level exclusion; flock() is still taken for any
# cooperating process that locks the same files.
_FILE_IO_LOCK = threading.Lock()


def _locked_readline(fileobj):
  """Return the next line of fileobj, read while holding both locks."""
  with _FILE_IO_LOCK:
    fcntl.flock(fileobj, fcntl.LOCK_EX)
    try:
      return fileobj.readline()
    finally:
      fcntl.flock(fileobj, fcntl.LOCK_UN)


def _locked_write(fileobj, text):
  """Write text to fileobj while holding both locks."""
  with _FILE_IO_LOCK:
    fcntl.flock(fileobj, fcntl.LOCK_EX)
    try:
      fileobj.write(text)
    finally:
      fcntl.flock(fileobj, fcntl.LOCK_UN)


def _has_cache_lifetime(headers):
  """Return True if headers hold 'Expires' or a cache-control with 'max-age'."""
  if 'Expires' in headers:
    return True
  cache_control = headers.get('cache-control')
  # str.find() returns the int -1 on a miss; the previous comparison against
  # the string '-1' was always true, so use a plain substring test instead.
  return bool(cache_control) and 'max-age' in cache_control


def test_urls(f1,f2,f3,f4,threadnum):
  """Fetch URLs read from f1 and sort them into f2/f3/f4 by response headers.

  Lines are pulled from f1 one at a time until it is exhausted, so several
  threads can call this function with the same file objects and share the
  work.  Each URL line is written, unchanged, to exactly one output file.

  Args:
    f1: readable file with one URL per line.
    f2: receives URLs whose response has an 'Expires' header or a
      'cache-control' header containing 'max-age'.
    f3: receives URLs whose response has neither.
    f4: receives URLs for which an IOError was raised while processing.
    threadnum: value printed after each processed URL (progress output).
  """
  while True:
    urlstr = _locked_readline(f1)
    if not urlstr:
      # readline() returns an empty string only at end of file.
      break
    if not urlstr.strip():
      # A blank line is not a URL; urllib2.Request would raise ValueError.
      continue
    datastream = None
    try:
      request = urllib2.Request(urlstr)
      opener = urllib2.build_opener()
      datastream = opener.open(request)
      if datastream:
        if _has_cache_lifetime(datastream.headers):
          _locked_write(f2, urlstr)
        else:
          _locked_write(f3, urlstr)
    except IOError:
      _locked_write(f4, urlstr)
    finally:
      # Release the connection instead of waiting for garbage collection.
      if datastream is not None:
        datastream.close()
    print(threadnum)

# Cap every blocking socket operation at `timeout` seconds so that a single
# unresponsive host cannot hang a worker thread forever.
timeout=10
socket.setdefaulttimeout(timeout)

threads = []
num = 10
open_files = []
try:
  # The input list is opened first: if it is missing, the open() fails before
  # any of the output files has been created or truncated.
  for path, mode in (("urls.txt", "r"),
                     ('haveExprires.txt', 'w'),
                     ('noExprires.txt', 'w'),
                     ('cantBeOpen_urls.txt', 'w')):
    open_files.append(open(path, mode))
  f1, f2, f3, f4 = open_files

  # Every worker shares the same four file objects and pulls URLs from f1.
  for x in range(num):
    threads.append(threading.Thread(target=test_urls, args=(f1,f2,f3,f4,x,)))
  for t in threads:
    t.start()
finally:
  # Wait for every worker that is running before closing the files it uses;
  # is_alive() is False for threads that were never started, which must not
  # be joined.
  for t in threads:
    if t.is_alive():
      t.join()
  # Close (and thereby flush) whatever was opened, even after an error.
  for f in open_files:
    f.close()
 

你可能感兴趣的:(多线程,python,cache,socket,import)