涉及内容包括:
0.POST DATA到校内网
1.正则表达式
2.urllib2的使用
3.Cache机制
[TODO]
1.SMTP的使用:自动发邮件到指定邮箱
2.SQLITE的使用:把图片存储到数据库
有问题或者建议,欢迎联系:[email protected]
注意要更改Login的账号和密码。
只贴代码:
#coding=utf-8 from urllib import urlencode from time import sleep from random import randint import sqlite3,cookielib,time,re,os import urllib2,shutil add_num = 0 class RenrenRobot: def __init__(self): print '*********************************' print '*Hi,This is Hark\'s Robot For RR*' print '*********************************' print 'Contact With me: [email protected]' print 'Version: 0.1' print '' self.allCount = 0 self.newCount = 0 #Create Dir self.strSpiderBasePath = r'c:\leehark_spider' if os.path.exists(self.strSpiderBasePath) == False: os.mkdir(self.strSpiderBasePath) def Download(self,id): # Fetching Your Friend's Recent Updated Pictures' URL whose id is '@param = id' self.strBaseFilePath = (r'c:\leehark_spider\id_%s' % id) if os.path.exists(self.strBaseFilePath) == False: os.mkdir(self.strBaseFilePath) strFriendURL = ('http://photo.renren.com/photo/%s/album/relatives' % id) res = urllib2.urlopen(strFriendURL) print res.geturl() print res.getcode() print res.info() str = res.read() strPattern = r'<li>.<a href="(http://photo.renren.com/photo/[^<>]*?)" class="picture">' r = re.findall(strPattern,str,re.S|re.I) if r == None: return fout = open('c:\\spider\\tmp.txt','wb') for item in r: photoURL = item print 'Picture\'s URL: ' , photoURL ret = urllib2.urlopen(photoURL) strPicURLContent = ret.read() strPicPattern = r'<img id="photo" src="(http://.*?)" title=".*?" style=".*?">' rPic = re.findall(strPicPattern,strPicURLContent,re.S|re.I) for subItem in rPic: self.allCount+=1 # Construct FilePath strItem = subItem.replace('/','_') strItem = strItem.replace(':','_') strItem = strItem.replace('&','_') strItem = strItem.replace('\\','_') strFilePath = self.strBaseFilePath+'\\'+strItem if os.path.exists(strFilePath): # Hit the Old one print 'Fetched Already : ',subItem continue # This is New Image,Fetch it self.newCount+=1 print 'Fetching New Image :' print subItem,' --> ',strFilePath ret = urllib2.urlopen(subItem) # Get Image Size headers = ret.info().headers length = 0 for header in headers: if header.find('Length') != -1: length = header.split(':')[-1].strip() length = int(length) print 'Image length = ',length # Copy Content To Disk fd = open(strFilePath, 'wb') shutil.copyfileobj(ret,fd,0x10000) fd.close() ret.close() # Do not overload download time.sleep(1) def WalkFriends(self): strFriendURL=r"http://friend.renren.com/myfriendlistx.do" try: res = urllib2.urlopen(strFriendURL) except : print '******',"visit MY FRIEND LIST *",'BIG ERROR ******' else: strFriendURLContent = res.read() #print strFriendURLContent strFriendPattern = r'{"id":([0-9]*?),"vip":' rFriend = re.findall(strFriendPattern,strFriendURLContent,re.S|re.I) for subItem in rFriend: print 'Fetching : ',subItem self.Download(subItem) def Login(self,username,pwd): # Set Cookie Jar cookie = cookielib.CookieJar() cookie_file = urllib2.HTTPCookieProcessor(cookie) opener = urllib2.build_opener(cookie_file) urllib2.install_opener(opener) # Construct Post Data data = { 'email':username, 'password':pwd, 'origURL':'', 'domain':'renren.com', 'formName':'', 'method':'', 'isplogin':'true', 'submit':'登陆' } web_data = urlencode(data) print 'web_data : ',web_data header = {'User-Agent':'Mozilla/5.0 (Windows NT 5.1; rv:6.0.2) Gecko/20100101 Firefox/6.0.2'} req=urllib2.Request(url='http://www.renren.com/PLogin.do', data = web_data, headers = header) try: result = urllib2.urlopen(req) except : print '******',username,'BIG ERROR ******' return else: print result.geturl() print result.getcode() print result.info() print 'Go to YOURPAGE' self.WalkFriends() print 'Picture Count : ',self.allCount print 'New Count : ',self.newCount def main(): robot = RenrenRobot() robot.Login('[email protected]','******') if __name__ == '__main__': main()