利用python实现新浪微博爬虫 .

  

新版新浪微博模拟登陆请看:http://blog.csdn.net/monsion/article/details/8656690

本文后面的解决动态加载的程序依然有效

重新编辑了一次,出了点儿问题

第一个模块,模拟登陆sina微博,创建weiboLogin.py文件,输入以下代码:

[python] view plain copy print ?
  1. #! /usr/bin/env python   
  2. # -*- coding: utf-8 -*-   
  3.   
  4. import sys  
  5. import urllib  
  6. import urllib2  
  7. import cookielib  
  8. import base64  
  9. import re  
  10. import json  
  11. import hashlib  
  12.   
  13. class weiboLogin:  
  14.     cj = cookielib.LWPCookieJar()  
  15.     cookie_support = urllib2.HTTPCookieProcessor(cj)  
  16.     opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)  
  17.     urllib2.install_opener(opener)  
  18.     postdata = {  
  19.         'entry': 'weibo',  
  20.         'gateway': '1',  
  21.         'from': '',  
  22.         'savestate': '7',  
  23.         'userticket': '1',  
  24.         'ssosimplelogin': '1',  
  25.         'vsnf': '1',  
  26.         'vsnval': '',  
  27.         'su': '',  
  28.         'service': 'miniblog',  
  29.         'servertime': '',  
  30.         'nonce': '',  
  31.         'pwencode': 'wsse',  
  32.         'sp': '',  
  33.         'encoding': 'UTF-8',  
  34.         'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack',  
  35.         'returntype': 'META'  
  36.     }  
  37.   
  38.     def get_servertime(self):  
  39.         url = 'http://login.sina.com.cn/sso/prelogin.php?entry=weibo&callback=sinaSSOController.preloginCallBack&su=dW5kZWZpbmVk&client=ssologin.js(v1.3.18)&_=1329806375939'  
  40.         data = urllib2.urlopen(url).read()  
  41.         p = re.compile('\((.*)\)')  
  42.         try:  
  43.             json_data = p.search(data).group(1)  
  44.             data = json.loads(json_data)  
  45.             servertime = str(data['servertime'])  
  46.             nonce = data['nonce']  
  47.             return servertime, nonce  
  48.         except:  
  49.             print 'Get severtime error!'  
  50.             return None  
  51.   
  52.     def get_pwd(self, pwd, servertime, nonce):  
  53.         pwd1 = hashlib.sha1(pwd).hexdigest()  
  54.         pwd2 = hashlib.sha1(pwd1).hexdigest()  
  55.         pwd3_ = pwd2 + servertime + nonce  
  56.         pwd3 = hashlib.sha1(pwd3_).hexdigest()  
  57.         return pwd3  
  58.   
  59.     def get_user(self, username):  
  60.         username_ = urllib.quote(username)  
  61.         username = base64.encodestring(username_)[:-1]  
  62.         return username  
  63.   
  64.   
  65.     def login(self,username,pwd):  
  66.         url = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.3.18)'  
  67.         try:  
  68.             servertime, nonce = self.get_servertime()  
  69.         except:  
  70.             print 'get servertime error!'  
  71.             return  
  72.         weiboLogin.postdata['servertime'] = servertime  
  73.         weiboLogin.postdata['nonce'] = nonce  
  74.         weiboLogin.postdata['su'] = self.get_user(username)  
  75.         weiboLogin.postdata['sp'] = self.get_pwd(pwd, servertime, nonce)  
  76.         weiboLogin.postdata = urllib.urlencode(weiboLogin.postdata)  
  77.         headers = {'User-Agent':'Mozilla/5.0 (X11; Linux i686; rv:8.0) Gecko/20100101 Firefox/8.0 Chrome/20.0.1132.57 Safari/536.11'}  
  78.         req  = urllib2.Request(  
  79.             url = url,  
  80.             data = weiboLogin.postdata,  
  81.             headers = headers  
  82.         )  
  83.         result = urllib2.urlopen(req)  
  84.         text = result.read()  
  85.         p = re.compile('location\.replace\(\'(.*?)\'\)')  
  86.         try:  
  87.             login_url = p.search(text).group(1)  
  88.             urllib2.urlopen(login_url)  
  89.             print "Login success!"  
  90.         except:  
  91.             print 'Login error!'  
#! /usr/bin/env python
# -*- coding: utf-8 -*-

import sys
import urllib
import urllib2
import cookielib
import base64
import re
import json
import hashlib

class weiboLogin:
	cj = cookielib.LWPCookieJar()
	cookie_support = urllib2.HTTPCookieProcessor(cj)
	opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
	urllib2.install_opener(opener)
	postdata = {
		'entry': 'weibo',
		'gateway': '1',
		'from': '',
		'savestate': '7',
		'userticket': '1',
		'ssosimplelogin': '1',
		'vsnf': '1',
		'vsnval': '',
		'su': '',
		'service': 'miniblog',
		'servertime': '',
		'nonce': '',
		'pwencode': 'wsse',
		'sp': '',
		'encoding': 'UTF-8',
		'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack',
		'returntype': 'META'
	}

	def get_servertime(self):
		url = 'http://login.sina.com.cn/sso/prelogin.php?entry=weibo&callback=sinaSSOController.preloginCallBack&su=dW5kZWZpbmVk&client=ssologin.js(v1.3.18)&_=1329806375939'
		data = urllib2.urlopen(url).read()
		p = re.compile('\((.*)\)')
		try:
			json_data = p.search(data).group(1)
			data = json.loads(json_data)
			servertime = str(data['servertime'])
			nonce = data['nonce']
			return servertime, nonce
		except:
			print 'Get severtime error!'
			return None

	def get_pwd(self, pwd, servertime, nonce):
		pwd1 = hashlib.sha1(pwd).hexdigest()
		pwd2 = hashlib.sha1(pwd1).hexdigest()
		pwd3_ = pwd2 + servertime + nonce
		pwd3 = hashlib.sha1(pwd3_).hexdigest()
		return pwd3

	def get_user(self, username):
		username_ = urllib.quote(username)
		username = base64.encodestring(username_)[:-1]
		return username


	def login(self,username,pwd):
		url = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.3.18)'
		try:
			servertime, nonce = self.get_servertime()
		except:
			print 'get servertime error!'
			return
		weiboLogin.postdata['servertime'] = servertime
		weiboLogin.postdata['nonce'] = nonce
		weiboLogin.postdata['su'] = self.get_user(username)
		weiboLogin.postdata['sp'] = self.get_pwd(pwd, servertime, nonce)
		weiboLogin.postdata = urllib.urlencode(weiboLogin.postdata)
		headers = {'User-Agent':'Mozilla/5.0 (X11; Linux i686; rv:8.0) Gecko/20100101 Firefox/8.0 Chrome/20.0.1132.57 Safari/536.11'}
		req  = urllib2.Request(
			url = url,
			data = weiboLogin.postdata,
			headers = headers
		)
		result = urllib2.urlopen(req)
		text = result.read()
		p = re.compile('location\.replace\(\'(.*?)\'\)')
		try:
			login_url = p.search(text).group(1)
			urllib2.urlopen(login_url)
			print "Login success!"
		except:
			print 'Login error!'

然后创建main.py文件,输入以下代码:

[python] view plain copy print ?
  1. #!/usr/bin/env python   
  2. # -*- coding: utf-8 -*-   
  3.   
  4. import weiboLogin  
  5. import urllib  
  6. import urllib2  
  7.   
  8. username = '你的微博用户名'  
  9. pwd = '你的微博密码'  
  10.   
  11. WBLogin = weiboLogin.weiboLogin()  
  12. WBLogin.login(username, pwd)  
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Entry-point script: log in to Sina Weibo using the weiboLogin module.

import weiboLogin
import urllib
import urllib2

# Fill in your own Weibo credentials before running
# (the placeholders below mean "your weibo username" / "your weibo password").
username = '你的微博用户名'
pwd = '你的微博密码'

# Instantiating weiboLogin also installs the cookie-aware urllib2 opener.
WBLogin = weiboLogin.weiboLogin()
WBLogin.login(username, pwd)

注意:若登陆失败,可能是你的账号在登陆的时候需要输入验证码!你在网页上登陆你的账号试试看,在账号设置里面可以设置某些地区不输入验证码。

参考:http://www.douban.com/note/201767245/

接下来,考虑实现抓取微博的内容。

此时遇到一个困难,当抓取指定URL的微博时,初始显示只有15条。后面的是延迟显示的(ajax里面叫lazy load?)。也就是说,当滚动条第一次拖到最下面的时候,会显示第二部分,再拖到最下面,会显示第三部分。此时一个页面的微博才是完整的。所以,要获取一个微博页面的全部微博,需要访问这个页面三次。创建getWeiboPage.py文件,相应代码如下:

[python] view plain copy print ?
  1. #!/usr/bin/env python   
  2. # -*- coding: utf-8 -*-   
  3.   
  4. import urllib  
  5. import urllib2  
  6. import sys  
  7. import time  
  8.   
  9. reload(sys)  
  10. sys.setdefaultencoding('utf-8')  
  11.   
  12. class getWeiboPage:  
  13.     body = {  
  14.         '__rnd':'',  
  15.         '_k':'',  
  16.         '_t':'0',  
  17.         'count':'50',  
  18.         'end_id':'',  
  19.         'max_id':'',  
  20.         'page':1,  
  21.         'pagebar':'',  
  22.         'pre_page':'0',  
  23.         'uid':''  
  24.     }  
  25.     uid_list = []  
  26.     charset = 'utf8'  
  27.   
  28.     def get_msg(self,uid):  
  29.         getWeiboPage.body['uid'] = uid  
  30.         url = self.get_url(uid)  
  31.         self.get_firstpage(url)  
  32.         self.get_secondpage(url)  
  33.         self.get_thirdpage(url)  
  34.     def get_firstpage(self,url):  
  35.         getWeiboPage.body['pre_page'] = getWeiboPage.body['page']-1  
  36.         url = url +urllib.urlencode(getWeiboPage.body)  
  37.         req = urllib2.Request(url)  
  38.         result = urllib2.urlopen(req)  
  39.         text = result.read()  
  40.         self.writefile('./output/text1',text)         
  41.         self.writefile('./output/result1',eval("u'''"+text+"'''"))  
  42.           
  43.     def get_secondpage(self,url):  
  44.         getWeiboPage.body['count'] = '15'  
  45.     #   getWeiboPage.body['end_id'] = '3490160379905732'   
  46.     #   getWeiboPage.body['max_id'] = '3487344294660278'   
  47.         getWeiboPage.body['pagebar'] = '0'  
  48.         getWeiboPage.body['pre_page'] = getWeiboPage.body['page']  
  49.   
  50.         url = url +urllib.urlencode(getWeiboPage.body)  
  51.         req = urllib2.Request(url)  
  52.         result = urllib2.urlopen(req)  
  53.         text = result.read()  
  54.         self.writefile('./output/text2',text)         
  55.         self.writefile('./output/result2',eval("u'''"+text+"'''"))  
  56.     def get_thirdpage(self,url):  
  57.         getWeiboPage.body['count'] = '15'  
  58.         getWeiboPage.body['pagebar'] = '1'  
  59.         getWeiboPage.body['pre_page'] = getWeiboPage.body['page']  
  60.   
  61.         url = url +urllib.urlencode(getWeiboPage.body)  
  62.         req = urllib2.Request(url)  
  63.         result = urllib2.urlopen(req)  
  64.         text = result.read()  
  65.         self.writefile('./output/text3',text)         
  66.         self.writefile('./output/result3',eval("u'''"+text+"'''"))  
  67.     def get_url(self,uid):  
  68.         url = 'http://weibo.com/' + uid + '?from=otherprofile&wvr=3.6&loc=tagweibo'  
  69.         return url  
  70.     def get_uid(self,filename):  
  71.         fread = file(filename)  
  72.         for line in fread:  
  73.             getWeiboPage.uid_list.append(line)  
  74.             print line  
  75.             time.sleep(1)  
  76.     def writefile(self,filename,content):  
  77.         fw = file(filename,'w')  
  78.         fw.write(content)  
  79.         fw.close()  
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import urllib
import urllib2
import sys
import time

# Python 2 hack: make utf-8 the process-wide default codec so unicode text
# can be written to plain files without explicit .encode() calls.
# reload(sys) is required because site.py deletes setdefaultencoding at startup.
reload(sys)
sys.setdefaultencoding('utf-8')

class getWeiboPage:
	body = {
		'__rnd':'',
		'_k':'',
		'_t':'0',
		'count':'50',
		'end_id':'',
		'max_id':'',
		'page':1,
		'pagebar':'',
		'pre_page':'0',
		'uid':''
	}
	uid_list = []
	charset = 'utf8'

	def get_msg(self,uid):
		getWeiboPage.body['uid'] = uid
		url = self.get_url(uid)
		self.get_firstpage(url)
		self.get_secondpage(url)
		self.get_thirdpage(url)
	def get_firstpage(self,url):
		getWeiboPage.body['pre_page'] = getWeiboPage.body['page']-1
		url = url +urllib.urlencode(getWeiboPage.body)
		req = urllib2.Request(url)
		result = urllib2.urlopen(req)
		text = result.read()
		self.writefile('./output/text1',text)		
		self.writefile('./output/result1',eval("u'''"+text+"'''"))
		
	def get_secondpage(self,url):
		getWeiboPage.body['count'] = '15'
	#	getWeiboPage.body['end_id'] = '3490160379905732'
	#	getWeiboPage.body['max_id'] = '3487344294660278'
		getWeiboPage.body['pagebar'] = '0'
		getWeiboPage.body['pre_page'] = getWeiboPage.body['page']

		url = url +urllib.urlencode(getWeiboPage.body)
		req = urllib2.Request(url)
		result = urllib2.urlopen(req)
		text = result.read()
		self.writefile('./output/text2',text)		
		self.writefile('./output/result2',eval("u'''"+text+"'''"))
	def get_thirdpage(self,url):
		getWeiboPage.body['count'] = '15'
		getWeiboPage.body['pagebar'] = '1'
		getWeiboPage.body['pre_page'] = getWeiboPage.body['page']

		url = url +urllib.urlencode(getWeiboPage.body)
		req = urllib2.Request(url)
		result = urllib2.urlopen(req)
		text = result.read()
		self.writefile('./output/text3',text)		
		self.writefile('./output/result3',eval("u'''"+text+"'''"))
	def get_url(self,uid):
		url = 'http://weibo.com/' + uid + '?from=otherprofile&wvr=3.6&loc=tagweibo'
		return url
	def get_uid(self,filename):
		fread = file(filename)
		for line in fread:
			getWeiboPage.uid_list.append(line)
			print line
			time.sleep(1)
	def writefile(self,filename,content):
		fw = file(filename,'w')
		fw.write(content)
		fw.close()


在刚刚的main.py中加入相应内容,完整内容为:


[python] view plain copy print ?
  1. #!/usr/bin/env python   
  2. # -*- coding: utf-8 -*-   
  3.   
  4. import weiboLogin  
  5. import getWeiboMsg  
  6. import urllib  
  7. import urllib2  
  8.   
  9. username = '你的微博用户名'  
  10. pwd = '你的微博密码'  
  11.   
  12. WBLogin = weiboLogin.weiboLogin()  
  13. WBLogin.login(username, pwd)  
  14.   
  15. WBmsg = getWeiboMsg.getWeiboMsg()  
  16. url = 'http://weibo.com/1624087025?from=otherprofile&wvr=3.6&loc=tagweibo'  
  17.   
  18. WBmsg.get_firstpage(url)  
  19. WBmsg.get_secondpage(url)  
  20. WBmsg.get_thirdpage(url)  
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Entry-point script: log in, then fetch all three lazy-loaded parts of one
# profile timeline page.  Results are written to ./output/ (create it first).

import weiboLogin
import getWeiboMsg
import urllib
import urllib2

# Fill in your own Weibo credentials before running
# (the placeholders below mean "your weibo username" / "your weibo password").
username = '你的微博用户名'
pwd = '你的微博密码'

WBLogin = weiboLogin.weiboLogin()
WBLogin.login(username, pwd)

# Fetch the three chunks of a fixed sample profile page.
WBmsg = getWeiboMsg.getWeiboMsg()
url = 'http://weibo.com/1624087025?from=otherprofile&wvr=3.6&loc=tagweibo'

WBmsg.get_firstpage(url)
WBmsg.get_secondpage(url)
WBmsg.get_thirdpage(url)



参考:http://www.cnblogs.com/sickboy/archive/2012/01/08/2316248.html
执行python main.py,应该可以运行,结果保存在./output/文件夹中,该文件夹自己提前创建好。
昨天搞了一个下午,很多东西还没弄好,欢迎留言交流。

你可能感兴趣的:(python,网络爬虫,python,html,新浪微博)