Python通过HTTP协议定期抓取网页

 
import urllib2,time;
class ErrorHandler(urllib2.HTTPDefaultErrorHandler):
    """Handler that returns HTTP error responses instead of raising them.

    By default urllib2 raises HTTPError for any non-2xx status.  This
    handler hands the error back as a response-like object instead, so
    the caller can inspect ``.code`` directly (needed here to detect
    304 Not Modified without a try/except around every poll).
    """

    def http_error_default(self, req, fp, code, msg, headers):
        # Build the would-be exception, tag it with .status for
        # convenience, and return it as the "response".
        err = urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
        err.status = code
        return err
 
URL='http://www.fx168.com/fxnews/news_now_html.html'
req=urllib2.Request(URL)
mgr=urllib2.build_opener(ErrorHandler())
 
while True:
    ns=mgr.open(req)
    if(ns.headers.has_key('last-modified')):
        modified=ns.headers.get('last-modified')
    if(ns.code==304):
        print '''
          ==============================
              不需要更新
          ==============================
        '''
    elif(ns.code==200):
        print ns.read()
    else:
        print 'there is an error';
 
    if(not locals().has_key('modified')):
        modified=time.time();
    req.add_header('If-Modified-Since',modified)
    time.sleep(10)

你可能感兴趣的:(Python通过HTTP协议定期抓取网页)