The main file is as follows:
#coding=utf-8
import requests
import urllib
import urllib2
import cookielib
import WeiboEncode
import WeiboSearch
import time
import re
import random
import httplib


class WeiboLogin:
    def __init__(self, user, pwd, enableProxy=False):
        "Initialize WeiboLogin; enableProxy controls whether a proxy server is used (off by default)."
        print "Initializing WeiboLogin..."
        self.userName = user
        self.passWord = pwd
        self.enableProxy = enableProxy
        self.serverUrl = "http://login.sina.com.cn/sso/prelogin.php?entry=weibo&callback=sinaSSOController.preloginCallBack&su=&rsakt=mod&client=ssologin.js(v1.4.11)&_=1379834957683"
        self.loginUrl = "http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.11)"
        # The User-Agent header identifies the client: hardware platform, system
        # software, application software and user preferences.
        self.postHeader = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0'}

    def Login(self):
        "Log in to Sina Weibo."
        self.EnableCookie(self.enableProxy)  # configure cookies and, if requested, the proxy
        serverTime, nonce, pubkey, rsakv = self.GetServerTime()  # step 1 of the login
        postData = WeiboEncode.PostEncode(self.userName, self.passWord, serverTime, nonce, pubkey, rsakv)  # encode the user name and encrypt the password
        print "Post data length:\n", len(postData)
        req = urllib2.Request(self.loginUrl, postData, self.postHeader)  # build the request
        print "Posting request..."
        result = urllib2.urlopen(req)  # send the request
        text = result.read()
        try:
            loginUrl = WeiboSearch.sRedirectData(text)  # parse the redirect target (the page Weibo jumps to after login)
            urllib2.urlopen(loginUrl)
        except:
            print 'Login error!'
            return False
        print 'Login success!'
        return True

    def EnableCookie(self, enableProxy):
        "Enable cookie & proxy (if needed)."
        cookiejar = cookielib.LWPCookieJar()  # create the cookie jar
        # HTTPCookieProcessor instances have one attribute:
        # HTTPCookieProcessor.cookiejar (the cookielib.CookieJar in which cookies are stored).
        cookie_support = urllib2.HTTPCookieProcessor(cookiejar)
        if enableProxy:
            proxy_support = urllib2.ProxyHandler({'http': '59.59.100.123:8118'})  # use a proxy
            # build_opener returns an OpenerDirector instance; the OpenerDirector
            # class opens URLs via BaseHandlers chained together.
            opener = urllib2.build_opener(proxy_support, cookie_support, urllib2.HTTPHandler)
            print "Proxy enabled"
        else:
            opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
        urllib2.install_opener(opener)  # install the cookie-aware opener globally

    def GetServerTime(self):
        "Get server time and nonce, which are used to encode the password."
        # In digest authentication the server has the client pick a random number
        # (the "nonce"); the browser then uses a one-way hash to produce a message
        # digest over the user name, password, the given nonce, the HTTP method
        # and the requested URL.
        print "Getting server time and nonce..."
        serverData = urllib2.urlopen(self.serverUrl).read()  # fetch the prelogin response
        print serverData
        try:
            serverTime, nonce, pubkey, rsakv = WeiboSearch.sServerData(serverData)  # parse serverTime, nonce, etc.
            return serverTime, nonce, pubkey, rsakv
        except:
            print 'Get server time & nonce error!'
            return None


def fetch_weibo(id, filename):
    "Fetch a user's Weibo posts without the API (only the first few appear in the page). Arguments: user ID, output file name."
    target = open(filename, 'a')
    myurl = 'http://weibo.com/u/' + id
    line = urllib2.urlopen(myurl).read()
    target.write(line)
    if re.search(r'\"WB_detail', line):
        print "success"
        p = re.compile(r'\"WB_detail\"')
        linelist = p.split(line)
        for fraction in linelist:
            matchObj = re.search(r'nick-name=\".+?\">\\n +(.+?)<', fraction)
            if matchObj:
                target.write(matchObj.group(1))
                target.write("\n")


def fetchqueryresult():
    "Fetch the results of a Weibo 'find people' query."
    myurl = "http://s.weibo.com/user/&auth=ord&age=22y&gender=women&region=custom:33:1&page="  # URL of the 'find people' page
    target = open("filename", 'a')  # output file name
    for i in range(37, 51):  # start and end page numbers
        line = urllib2.urlopen(myurl + str(i)).read()
        while re.search(r'ids\=(\d+?)\\', line):
            matchObj = re.search(r'ids\=(\d+?)\\', line)
            print matchObj.group(1)
            target.write(matchObj.group(1))
            target.write("\n")
            # drop everything up to the last occurrence of the id just found,
            # so the next search continues from the rest of the page
            p = re.compile(r'' + matchObj.group(1))
            linelist = p.split(line)
            line = linelist[len(linelist) - 1]
        print i
        time.sleep(2 + random.random())


def getjson():
    "Call the Weibo API to fetch the logged-in user's own timeline."
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0'}  # request headers
    url = "https://api.weibo.com/2/statuses/user_timeline.json"  # API endpoint
    your_param = {'source': '1675437817'}  # request parameters
    # requests appends the parameters to the URL, so this fetches
    # https://api.weibo.com/2/statuses/user_timeline.json?source=1675437817
    result = requests.get(url, params=your_param, headers=headers)
    result_final = result.text  # the response body for that URL plus parameters
    print result.text


if __name__ == '__main__':
    # If the Python interpreter is running this module (the source file) as the
    # main program, it sets the special __name__ variable to "__main__". If this
    # file is being imported from another module, __name__ is set to the
    # module's name instead.
    weiboLogin = WeiboLogin('[email protected]', 'XXXXXXXX')  # e-mail address (account) and password
    if weiboLogin.Login() == True:
        print "Login successful!"
        myurl = "http://api.weibo.com/2/statuses/timeline_batch.json?source=1675437817&uids=5029941840"
        htmlContent = urllib2.urlopen(myurl).read()
        print htmlContent
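One optional extension, not part of the original code: since EnableCookie builds an LWPCookieJar, the session cookies can be saved to disk and reloaded on later runs, skipping a fresh login while they remain valid. A minimal sketch under that assumption (the file name is invented):

#coding=utf-8
import cookielib
import urllib2

# Sketch only: persist login cookies across runs. 'weibo_cookies.txt' is an
# arbitrary file name, not something the original crawler uses.
cookiejar = cookielib.LWPCookieJar('weibo_cookies.txt')
try:
    cookiejar.load(ignore_discard=True)  # reuse cookies from a previous run, if any
except IOError:
    pass  # no saved cookie file yet
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))
urllib2.install_opener(opener)
# ...after a successful WeiboLogin.Login() with this jar installed:
cookiejar.save(ignore_discard=True)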
The two helper modules it relies on are WeiboSearch.py and WeiboEncode.py.
The code of WeiboEncode.py is as follows:
#coding=utf-8
import urllib
import base64
import rsa
import binascii


def PostEncode(userName, passWord, serverTime, nonce, pubkey, rsakv):
    "Used to generate POST data"
    encodedUserName = GetUserName(userName)  # the user name is base64-encoded
    encodedPassWord = get_pwd(passWord, serverTime, nonce, pubkey)  # the password is currently RSA-encrypted
    postPara = {
        'entry': 'weibo',
        'gateway': '1',
        'from': '',
        'savestate': '7',
        'userticket': '1',
        'ssosimplelogin': '1',
        'vsnf': '1',
        'vsnval': '',
        'su': encodedUserName,
        'service': 'miniblog',
        'servertime': serverTime,
        'nonce': nonce,
        'pwencode': 'rsa2',
        'sp': encodedPassWord,
        'encoding': 'UTF-8',
        'prelt': '115',
        'rsakv': rsakv,
        'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack',
        'returntype': 'META'
    }
    postData = urllib.urlencode(postPara)  # URL-encode the form fields
    return postData


def GetUserName(userName):
    "Used to encode the user name"
    userNameTemp = urllib.quote(userName)
    userNameEncoded = base64.encodestring(userNameTemp)[:-1]
    return userNameEncoded


def get_pwd(password, servertime, nonce, pubkey):
    rsaPublickey = int(pubkey, 16)
    key = rsa.PublicKey(rsaPublickey, 65537)  # create the public key (exponent 65537)
    message = str(servertime) + '\t' + str(nonce) + '\n' + str(password)  # plaintext layout taken from the login JS
    passwd = rsa.encrypt(message, key)  # encrypt
    passwd = binascii.b2a_hex(passwd)  # convert the ciphertext to hexadecimal
    return passwd
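To test the encryption step in isolation, here is a minimal sketch of the same flow as get_pwd, using a throwaway key pair generated by the same rsa library in place of Weibo's real public key (which arrives as a hex modulus via the prelogin call); all input values below are made up:

import rsa
import binascii

# Demo only: a local 512-bit key pair stands in for Weibo's real pubkey.
(pub, priv) = rsa.newkeys(512)
servertime, nonce, password = '1379834957683', 'ABCDEF', 'secret'  # fake values
message = str(servertime) + '\t' + str(nonce) + '\n' + str(password)
ciphertext = binascii.b2a_hex(rsa.encrypt(message, pub))
print ciphertext  # this hex string is what PostEncode sends as the 'sp' field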
The code of WeiboSearch.py is as follows:
#coding=utf-8
import re
import json


def sServerData(serverData):
    "Search the server time & nonce from server data"
    p = re.compile('\((.*)\)')  # re.compile turns the pattern into a regular-expression object
    jsonData = p.search(serverData).group(1)  # pull the JSON payload out of the JSONP callback
    data = json.loads(jsonData)  # decode the JSON text into a dict with json.loads()
    serverTime = str(data['servertime'])
    nonce = data['nonce']
    pubkey = data['pubkey']
    rsakv = data['rsakv']
    print "Server time is:", serverTime
    print "Nonce is:", nonce
    return serverTime, nonce, pubkey, rsakv


def sRedirectData(text):
    p = re.compile('location\.replace\([\'"](.*?)[\'"]\)')
    loginUrl = p.search(text).group(1)
    print 'loginUrl:', loginUrl
    return loginUrl
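As a quick sanity check, both parsers can be exercised with fabricated inputs whose shapes match what the regexes expect; the field values and the URL below are invented, but the JSONP wrapper and the location.replace pattern follow what the Sina endpoints return:

from WeiboSearch import sServerData, sRedirectData

# Fabricated prelogin-style response (all values are fake).
sample_server = 'sinaSSOController.preloginCallBack({"retcode":0,"servertime":1379834957,"nonce":"ABCDEF","pubkey":"10001","rsakv":"1330428213"})'
print sServerData(sample_server)

# Fabricated post-login page containing the redirect that Login() follows.
sample_redirect = 'location.replace("http://weibo.com/ajaxlogin.php?retcode=0")'
print sRedirectData(sample_redirect)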
At this point the crawler can log in automatically and call Sina Weibo's ordinary APIs. Fetching other users' posts in bulk, however, requires advanced authorization, which is still being applied for.