#!/usr/bin/env python

import cStringIO                  # In-memory file object fed to the formatter below
import formatter                  # Provides AbstractFormatter/DumbWriter for the HTML parser
from htmllib import HTMLParser    # We use various classes in these modules for parsing HTML.
import httplib                    # We only need an exception (InvalidURL) from this module
import os                         # This provides various file system functions
import sys                        # We are just using argv for command-line arguments
import urllib                     # We only need the urlretrieve() function for downloading Web pages
import urlparse                   # We use the urlparse() and urljoin() functions for URL manipulation


class Retriever(object):
    __slots__ = ('url', 'file')

    def __init__(self, url):
        self.url, self.file = self.get_file(url)

    def get_file(self, url, default='index.html'):
        'Create usable local filename from URL'
        parsed = urlparse.urlparse(url)  # e.g. ParseResult(scheme='http', netloc='www.baidu.com', path='', params='', query='', fragment='')
        host = parsed.netloc.split('@')[-1].split(':')[0]  # strip any credentials/port -> 'www.baidu.com'
        filepath = '%s%s' % (host, parsed.path)            # 'www.baidu.com'
        if not os.path.splitext(parsed.path)[1]:           # path has no file extension
            filepath = os.path.join(filepath, default)     # e.g. 'www.baidu.com\\index.html'
        linkdir = os.path.dirname(filepath)                # 'www.baidu.com'
        if not os.path.isdir(linkdir):
            if os.path.exists(linkdir):                    # a plain file is in the way; remove it
                os.unlink(linkdir)
            os.makedirs(linkdir)                           # create the local directory for this host
        return url, filepath

    def download(self):
        'Download URL to the local file name chosen by get_file()'
        try:
            retval = urllib.urlretrieve(self.url, self.file)
        except (IOError, httplib.InvalidURL) as e:
            retval = ('*** ERROR: bad URL "%s": %s' % (self.url, e),)
        return retval

    def parse_links(self):
        'Parse out the links found in the downloaded HTML file'
        f = open(self.file, 'r')
        data = f.read()
        f.close()
        parser = HTMLParser(formatter.AbstractFormatter(
            formatter.DumbWriter(cStringIO.StringIO())))
        parser.feed(data)
        parser.close()
        return parser.anchorlist   # the list of anchor (href) targets found in the page


class Crawler(object):
    count = 0                      # the number of pages downloaded from the Internet

    def __init__(self, url):
        self.q = [url]             # a queue of links left to download
        self.seen = set()          # all the links that we have already seen (downloaded)
        parsed = urlparse.urlparse(url)
        host = parsed.netloc.split('@')[-1].split(':')[0]
        self.dom = '.'.join(host.split('.')[-2:])   # e.g. 'baidu.com'; used to stay inside one domain

    def get_page(self, url, media=False):
        'Download a page, parse its links, and add new ones to the queue if necessary'
        r = Retriever(url)
        fname = r.download()[0]     # local file name, e.g. 'www.baidu.com\\index.html'
        if fname[0] == '*':         # the error string from download() starts with '*'
            print fname, '... skipping parse'
            return
        Crawler.count += 1
        print '\n(', Crawler.count, ')'
        print 'URL:', url           # e.g. URL: http://www.baidu.com
        print 'FILE:', fname        # e.g. FILE: www.baidu.com\\index.html
        self.seen.add(url)
        ftype = os.path.splitext(fname)[1]
        if ftype not in ('.htm', '.html'):   # only HTML files are parsed for further links
            return

        for link in r.parse_links():
            if link.startswith('mailto:'):
                print '... discarded, mailto link'
                continue
            if not media:           # media defaults to False, so media files are skipped
                ftype = os.path.splitext(link)[1]
                if ftype in ('.mp3', '.mp4', '.m4v', '.wav'):
                    print '... discarded, media file'
                    continue
            if not link.startswith('http://'):
                link = urlparse.urljoin(url, link)   # resolve a relative link against the current page
            print '*', link,
            if link not in self.seen:
                if self.dom not in link:
                    print '... discarded, not in domain'
                else:
                    if link not in self.q:
                        self.q.append(link)
                        print '... new, added to Q'
                    else:
                        print '... discarded, already in Q'
            else:
                print '... discarded, already processed'

    def go(self, media=False):
        'Process the next page in the queue (if any)'
        while self.q:
            url = self.q.pop()
            self.get_page(url, media)


def main():
    if len(sys.argv) > 1:
        url = sys.argv[1]
    else:
        try:
            url = raw_input('Enter starting URL: ')
        except (KeyboardInterrupt, EOFError):
            url = ''
    if not url:
        return
    if not url.startswith('http://') and not url.startswith('ftp://'):
        url = 'http://%s/' % url    # assume HTTP if no scheme was given
    robot = Crawler(url)
    robot.go()


if __name__ == '__main__':
    main()
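
# Usage sketch: the lines below are only an illustration, not part of the crawler
# itself. It assumes the listing is saved as crawl.py and that the seed URL is a
# reachable site; note the script requires Python 2, since htmllib, formatter,
# cStringIO and urlparse were removed or reorganized in Python 3.
#
#     $ python crawl.py http://www.example.com/
#
# Or drive the classes directly from an interactive session:
#
#     >>> from crawl import Crawler
#     >>> robot = Crawler('http://www.example.com/')
#     >>> robot.go()   # saves pages under ./www.example.com/ and follows in-domain links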