#coding:utf-8
"""Simple site copier: fetch a page plus the CSS/JS/images it references and
save everything under a local directory named after the host."""
import re,os,shutil,sys
import urllib2,socket,cookielib
from threading import Thread,stack_size,Lock
from Queue import Queue
import time
from gzip import GzipFile
from StringIO import StringIO

class ContentEncodingProcessor(urllib2.BaseHandler):
    """A handler to add gzip capabilities to urllib2 requests"""

    # add headers to requests
    def http_request(self, req):
        req.add_header("Accept-Encoding", "gzip, deflate")
        return req

    # decode
    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get("content-encoding") == "gzip":
            gz = GzipFile(
                fileobj=StringIO(resp.read()),
                mode="r"
            )
            # addinfourl: class to add info() and geturl() methods to an open file
            resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get("content-encoding") == "deflate":
            gz = StringIO( deflate(resp.read()) )
            resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

# deflate support
import zlib
def deflate(data):   # zlib only provides the zlib compress format, not the deflate format;
    try:             # so on top of all there's this workaround:
        return zlib.decompress(data, -zlib.MAX_WBITS)
    except zlib.error:
        return zlib.decompress(data)

class Fetcher:
    '''
    html Fetcher

    basic usage
    -----------
    from fetcher import Fetcher
    f = Fetcher()
    f.get(url)

    post
    ----
    req = urllib2.Request(...)
    f.post(req)

    multi-thread
    ------------
    f = Fetcher(threads=10)
    for url in urls:
        f.push(url)
    while f.taskleft():
        url,html = f.pop()
        deal_with(url,html)
    '''
    def __init__(self,timeout=10,threads=None,stacksize=32768*16,loginfunc=None):
        #proxy_support = urllib2.ProxyHandler({'http':'http://localhost:3128'})
        cookie_support = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
        encoding_support = ContentEncodingProcessor()
        #self.opener = urllib2.build_opener(cookie_support,encoding_support,proxy_support,urllib2.HTTPHandler)
        self.opener = urllib2.build_opener(cookie_support,encoding_support,urllib2.HTTPHandler)
        self.req = urllib2.Request('http://www.hsbc.com')
        socket.setdefaulttimeout(timeout)
        self.q_req = Queue()
        self.q_ans = Queue()
        self.lock = Lock()
        self.running = 0
        self.threads = threads  # remember thread count so push() knows whether a pool exists
        if loginfunc:
            self.opener = loginfunc(self.opener)
        if threads:
            stack_size(stacksize)
            for i in range(threads):
                t = Thread(target=self.threadget)
                t.setDaemon(True)
                t.start()

    def __del__(self):
        time.sleep(0.5)
        self.q_req.join()
        self.q_ans.join()

    def taskleft(self):
        return self.q_req.qsize()+self.q_ans.qsize()+self.running

    def push(self,req,repeat=3):
        if not self.threads:
            print 'no thread pool, falling back to get()'
            return self.get(req,repeat)
        self.q_req.put(req)

    def pop(self):
        try:
            data = self.q_ans.get(block=True,timeout=10)
            self.q_ans.task_done()
        except:
            data = ['','']
        return data

    def threadget(self):
        while True:
            req = self.q_req.get()
            with self.lock:
                self.running += 1
            ans = self.get(req)
            print 'got',req
            self.q_ans.put((req,ans))
            try:
                self.q_req.task_done()
            except:
                pass
            with self.lock:
                self.running -= 1
            time.sleep(0.1) # don't spam
    def proxyisworking(self):
        try:
            self.opener.open('http://www.hsbc.com').read(1024)
            return True
        except Exception, what:
            print what
            return False

    def get(self,req,repeat=3):
        '''
        http GET req and repeat 3 times if failed
        html text is returned when succeeded
        '' is returned when failed
        '''
        try:
            response = self.opener.open(req)
            data = response.read()
        except Exception, what:
            print what,req
            if repeat>0:
                return self.get(req,repeat-1)
            else:
                print 'GET Failed',req
                return ''
        return data

    def post(self,req,repeat=3):
        '''
        http POST req and repeat 3 times if failed
        html text/True is returned when succeeded
        False is returned when failed
        '''
        if not isinstance(req,urllib2.Request):
            print 'post() needs a urllib2.Request as its argument'
            return False
        else:
            r = self.get(req,repeat)
            if r:
                return r
            else:
                return True

class SiteCopyer:
    '''Copy one page and its static resources into a directory named after the host.'''
    def __init__(self,url):
        self.baseurl = url
        self.home = self.baseurl.split('/')[2]  # host part of the url, used as the target directory
        self.f = Fetcher(threads=10)
        self.create_dir()

    def create_dir(self):
        try:
            shutil.rmtree(self.home)
        except Exception, what:
            print what
        try:
            os.mkdir(self.home)
            os.mkdir(self.home+'/media')
            os.mkdir(self.home+'/media/js')
            os.mkdir(self.home+'/media/css')
            os.mkdir(self.home+'/media/image')
        except Exception, what:
            print what

    def full_link(self,link,baseurl=None):
        '''resolve a (possibly relative) link against baseurl'''
        if not baseurl:
            baseurl = self.baseurl
        if '?' in link:
            link = link.rsplit('?',1)[0]
        if not link.startswith('http://'):
            if link.startswith('/'):
                link = '/'.join(baseurl.split('/',3)[:3]) + link
            elif link.startswith('../'):
                while link.startswith('../'):
                    baseurl = baseurl.rsplit('/',2)[0]
                    link = link[3:]
                link = baseurl+'/'+link
            else:
                link = baseurl.rsplit('/',1)[0]+'/'+link
        return link

    def link_alias(self,link):
        '''map a resource url to its local path under /media'''
        link = self.full_link(link)
        name = link.rsplit('/',1)[1]
        if '.css' in name:
            name = name[:name.find('.css')+4]
            alias = '/media/css/'+name
        elif '.js' in name:
            name = name[:name.find('.js')+3]
            alias = '/media/js/'+name
        else:
            alias = '/media/image/'+name
        return alias

    def strip_link(self,link):
        '''strip surrounding quotes and trailing slashes; return '' for links to skip'''
        if link and (link[0] in ['"',"'"]):
            link = link[1:]
        while link and (link[-1] in ['"',"'"]):
            link = link[:-1]
        while link.endswith('/'):
            link = link[:-1]
        if link and (link[0] not in ["<","'",'"']) and ('feed' not in link):
            return link
        else:
            return ''

    def copy(self):
        page = self.f.get(self.baseurl)
        links = re.compile(r'<link[^>]*href=(.*?)[ >]',re.I).findall(page)
        links.extend( re.compile(r'<script[^>]*src=(.*?)[ >]',re.I).findall(page) )
        links.extend( re.compile(r'<img[^>]*src=(.*?)[ >]',re.I).findall(page) )
        templinks = []
        for link in links:
            slink = self.strip_link(link)
            if slink:
                templinks.append(slink)
        links = templinks
        for link in set(links):
            page = page.replace(link,self.link_alias(link)[1:])
            self.f.push( self.full_link(link) )
        open(self.home+'/index.html','w').write(page)
        while self.f.taskleft():
            url,page = self.f.pop()
            if url.endswith('.css'):
                links = re.compile(r'url\([\'"]?(.*?)[\'"]?\)').findall(page)
                templinks = []
                for link in links:
                    slink = self.strip_link(link)
                    if slink:
                        templinks.append(slink)
                links = templinks
                for link in set(links):
                    self.f.push( self.full_link(link,url) )
                    page = page.replace(link,self.link_alias(link)[1:].replace("media",".."))
            print 'write to',self.home+self.link_alias(url)
            try:
                # binary mode so images are written byte-for-byte on every platform
                open(self.home+self.link_alias(url),'wb').write(page)
            except Exception, what:
                print what

if __name__ == "__main__":
    if len(sys.argv) == 2:
        url = sys.argv[1]
        SiteCopyer(url).copy()
    else:
        print "Usage: python "+sys.argv[0]+" url"
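# A quick usage sketch (the host below is only a placeholder, not one taken from
# this script, and the file name is whatever you save this module as):
#
#   python sitecopy.py http://www.example.com/
#
# This should leave a directory named www.example.com containing index.html plus
# media/css, media/js and media/image holding the page's stylesheets, scripts
# and images, with the links in index.html rewritten to those local copies.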