There are 40,000 known Weibo post URLs; the goal is to scrape the number of comments, likes, and reposts from each page.
Part 1: Installing packages
# Only rsa and pandas need to be installed; everything else is standard library.
import re, urllib.parse, urllib.request, http.cookiejar, base64, binascii, rsa
import pandas as pd
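Of these imports, only rsa and pandas are third-party; assuming pip is available, they can be installed with:

pip install rsa pandas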
Part 2: GET and POST helpers
# Install a cookie-aware opener so that the session cookies set at login
# are sent automatically with every subsequent request.
cj = http.cookiejar.LWPCookieJar()
cookie_support = urllib.request.HTTPCookieProcessor(cj)
opener = urllib.request.build_opener(cookie_support, urllib.request.HTTPHandler)
urllib.request.install_opener(opener)
# Some pages are served as GBK and others as UTF-8, so there are two variants
# of each of the GET and POST helpers; call one inside try/except and fall
# back to the other.
def getData(url):
    request = urllib.request.Request(url)
    response = urllib.request.urlopen(request)
    text = response.read().decode('gbk')
    return text

def getData1(url):
    request = urllib.request.Request(url)
    response = urllib.request.urlopen(request)
    text = response.read().decode('utf-8')
    return text
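Since every caller repeats the same try/except fallback, it can also be wrapped once. A small convenience sketch (fetchData is not in the original post):

def fetchData(url):
    # Try UTF-8 first, then fall back to GBK on a decode error,
    # mirroring the getData1/getData pair above.
    try:
        return getData1(url)
    except UnicodeDecodeError:
        return getData(url)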
def postData(url, data):
    # Replace the headers with your own browser's: open the developer tools
    # ("inspect element") and copy the User-Agent from the request headers.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
    data = urllib.parse.urlencode(data).encode('utf-8')
    request = urllib.request.Request(url, data, headers)
    response = urllib.request.urlopen(request)
    text = response.read().decode('gbk')
    return text

def postData1(url, data):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
    data = urllib.parse.urlencode(data).encode('utf-8')
    request = urllib.request.Request(url, data, headers)
    response = urllib.request.urlopen(request)
    text = response.read().decode('utf-8')  # the UTF-8 counterpart of postData
    return text
Part 3: Logging in
# Login routine: Sina SSO prelogin followed by an RSA-encrypted password post.
def login_in(nick, pwd):
    # Prelogin: fetch the servertime, nonce, rsakv, and RSA public key
    # needed to encrypt the password.
    prelogin_url = 'http://login.sina.com.cn/sso/prelogin.php?entry=weibo&callback=sinaSSOController.preloginCallBack&su=%s&rsakt=mod&checkpin=1&client=ssologin.js(v1.4.15)&_=1400822309846' % nick
    preLogin = getData(prelogin_url)
    servertime = re.findall('"servertime":(.*?),', preLogin)[0]
    pubkey = re.findall('"pubkey":"(.*?)",', preLogin)[0]
    rsakv = re.findall('"rsakv":"(.*?)",', preLogin)[0]
    nonce = re.findall('"nonce":"(.*?)",', preLogin)[0]
    # The username is URL-quoted, then base64-encoded.
    su = base64.b64encode(bytes(urllib.parse.quote(nick), encoding='utf-8'))
    # The password is RSA-encrypted together with servertime and nonce.
    rsaPublickey = int(pubkey, 16)
    key = rsa.PublicKey(rsaPublickey, 65537)
    message = bytes(str(servertime) + '\t' + str(nonce) + '\n' + str(pwd), encoding='utf-8')
    sp = binascii.b2a_hex(rsa.encrypt(message, key))
    param = {'entry': 'weibo', 'gateway': 1, 'from': '', 'savestate': 7, 'useticket': 1,
             'pagerefer': 'http://login.sina.com.cn/sso/logout.php?entry=miniblog&r=http%3A%2F%2Fweibo.com%2Flogout.php%3Fbackurl%3D',
             'vsnf': 1, 'su': su, 'service': 'miniblog', 'servertime': servertime, 'nonce': nonce, 'pwencode': 'rsa2',
             'rsakv': rsakv, 'sp': sp, 'sr': '1680*1050',
             'encoding': 'UTF-8', 'prelt': 961,
             'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack'}
    try:
        s = postData1('http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.15)', param)
    except UnicodeDecodeError:
        s = postData('http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.15)', param)
    # The response redirects via location.replace(); follow the redirect to
    # finish the login and pick up the session cookies.
    if len(re.findall(r"location.replace\('(.*?)'\);", s)) == 0:
        urll = re.findall(r'location.replace\("(.*?)"\);', s)[0]
    else:
        urll = re.findall(r"location.replace\('(.*?)'\);", s)[0]
    getData(urll)
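login_in returns nothing; a successful login shows up as session cookies stored in the shared cookie jar from Part 2. A minimal sanity check (logged_in is hypothetical, not part of the original post):

def logged_in():
    # After a successful login_in(), the shared LWPCookieJar should no
    # longer be empty.
    return len(cj) > 0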
Part 4: Running the crawl and parsing
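The loop below calls a write_log helper that the original post never defines. A minimal sketch that appends failed links to a text file (the file name failed.log is an assumption):

def write_log(msg):
    # Hypothetical helper, not defined in the original post: append the
    # failed URL (or error note) to a log file so it can be retried later.
    with open("failed.log", "a", encoding="utf-8") as f:
        f.write(msg + "\n")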
df = pd.read_csv("test.csv")  # one URL per row, in a column named "link"
userid = ''  # fill in your own account
pwd = ''     # fill in your own password
count = 0
for hang in range(len(df["link"].values)):
    print(hang)
    if count % 100 == 0:  # the session tends to drop mid-crawl, so log in again every 100 URLs
        login_in(userid, pwd)
    try:
        link = df["link"].values[hang]
        try:
            text = getData1(link)
        except UnicodeDecodeError:
            text = getData(link)
        # Core of the parsing; print the page source to see why the pattern
        # is written this way.
        info = re.findall(r"\.*?\<", text)
        df1 = pd.DataFrame(columns=('link', 'zhuanfa', 'comment', 'dianzan'))
        if len(info) == 4:
            df1.loc[0] = [link, info[1][4:-1], info[2][4:-1], info[3][4:-1]]
        elif len(info) == 7:
            df1.loc[0] = [link, info[4][4:-1], info[5][4:-1], info[6][4:-1]]
        else:
            write_log(link)
            continue
        df1.to_csv("results.csv", index=False, header=False, mode="a", encoding="gbk")
    except Exception:
        write_log(link + ": error")
        continue
    count = count + 1
It took three different machines to get through all of the URLs; roughly every 10,000 requests the current device would be blocked from further access.
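One mitigation the original run did not try is throttling. A sketch using the fetchData wrapper from Part 2 (fetch_politely is hypothetical, and whether a 1-3 second delay is enough to avoid Weibo's blocking is untested):

import time, random

def fetch_politely(url):
    # Sleep a random 1-3 seconds before each request to reduce the
    # chance of the device being rate-limited.
    time.sleep(1 + 2 * random.random())
    return fetchData(url)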