import urllib2
import pycurl
import cStringIO as _StringIO
import sys
import shutil
import lxml.html as H
import threading
import StorageClient as sc
import lame

# http transfer limits
accept_type = "*/*"
connection_timeout = 500
timeout = 800
low_speed = 200          # bytes/sec below which the transfer counts as stalled
low_speed_time = 120     # seconds the transfer may stay below low_speed before aborting
max_size = 10485760      # 10MB
default_user_agent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)"


def get_curl(user_agent=default_user_agent):
    "initialize a curl handle with the transfer limits above"
    dev_null = _StringIO.StringIO()
    curl_handle = pycurl.Curl()
    curl_handle.setopt(pycurl.FOLLOWLOCATION, 1)
    curl_handle.setopt(pycurl.MAXREDIRS, 5)
    curl_handle.setopt(pycurl.CONNECTTIMEOUT, connection_timeout)
    curl_handle.setopt(pycurl.TIMEOUT, timeout)
    curl_handle.setopt(pycurl.NOSIGNAL, 1)
    curl_handle.setopt(pycurl.LOW_SPEED_LIMIT, low_speed)
    curl_handle.setopt(pycurl.LOW_SPEED_TIME, low_speed_time)
    curl_handle.setopt(pycurl.HTTPHEADER,
                       ["User-Agent: %s" % user_agent,
                        "Accept: %s" % accept_type])
    curl_handle.setopt(pycurl.MAXFILESIZE, max_size)
    curl_handle.setopt(pycurl.COOKIEFILE, 'cookies.txt')
    curl_handle.setopt(pycurl.COOKIEJAR, 'cookies.txt')
    # discard output until curl_fetch installs a real write callback
    curl_handle.setopt(pycurl.WRITEFUNCTION, dev_null.write)
    return curl_handle


def curl_fetch(curl_handle, url):
    "retrieve url; return (status, content, content_type), status 0 on success, -1 on error"
    fp = _StringIO.StringIO()
    curl_handle.setopt(pycurl.URL, url)
    curl_handle.setopt(pycurl.WRITEFUNCTION, fp.write)
    # perform the transfer
    try:
        curl_handle.perform()
    except pycurl.error, e:
        print e
        return (-1, 0, 0)
    content_type = curl_handle.getinfo(pycurl.CONTENT_TYPE)
    print content_type
    print curl_handle.getinfo(pycurl.HTTP_CODE)
    return (0, fp.getvalue(), content_type)
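
# A minimal usage sketch: one handle, one fetch, then cleanup. This is not
# part of the module's original flow (which may be driven from threads or a
# StorageClient pipeline elsewhere); the URL below is only a placeholder.
if __name__ == "__main__":
    handle = get_curl()
    status, body, content_type = curl_fetch(handle, "http://example.com/")
    if status == 0:
        print "fetched %d bytes, content-type %s" % (len(body), content_type)
    else:
        print "fetch failed"
    handle.close()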