Scraping Weibo Web Pages

Requirements

There are 40,000 known Weibo post URLs; the goal is to scrape the number of comments, likes, and reposts from each page.

Issues

  • How to log in? Plenty of blog posts already cover how to grab cookies, but Weibo only supports cookie-based access for its mobile version.
  • How to scrape? There are many frameworks such as BeautifulSoup and pyquery, but they are not a good fit here, so the simplest option, regular expressions, was chosen (see the sketch below).
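
A toy sketch of the regex approach (not the exact pattern used in Part 4 below; the sample string is a made-up stand-in for what a mobile page might contain): if the counts appear as plain text like 转发[12], one findall is enough and no HTML parser is needed.

# Toy example of regex extraction; the sample text below is invented for illustration.
import re

sample = '... 转发[12] 评论[3] 赞[45] ...'
counts = re.findall(r'(转发|评论|赞)\[(\d+)\]', sample)
print(counts)  # [('转发', '12'), ('评论', '3'), ('赞', '45')]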

Code

Part 1: Packages

# Only rsa and pandas need to be installed; everything else is in the standard library
import re, urllib.parse, urllib.request, http.cookiejar, base64, binascii, rsa
import pandas as pd

Part 2: GET and POST helpers

cj = http.cookiejar.LWPCookieJar()
cookie_support = urllib.request.HTTPCookieProcessor(cj)
opener = urllib.request.build_opener(cookie_support, urllib.request.HTTPHandler)
urllib.request.install_opener(opener)

# Some pages are served in GBK and others in UTF-8, so there are two variants each of the GET and POST helpers; call them wrapped in try/except.
def getData(url):
    request = urllib.request.Request(url)
    response = urllib.request.urlopen(request)
    text = response.read().decode('gbk')
    return text

def getData1(url):
    request = urllib.request.Request(url)
    response = urllib.request.urlopen(request)
    text = response.read().decode('utf-8')
    return text

def postData(url, data):
    # Replace the headers with your own browser's: open the developer tools (inspect
    # element) and copy the User-Agent from the request headers.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
    data = urllib.parse.urlencode(data).encode('utf-8')
    request = urllib.request.Request(url, data, headers)
    response = urllib.request.urlopen(request)
    text = response.read().decode('gbk')
    return text

def postData1(url, data):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
    data = urllib.parse.urlencode(data).encode('utf-8')
    request = urllib.request.Request(url, data, headers)
    response = urllib.request.urlopen(request)
    text = response.read().decode('gbk')
    return text
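
A usage sketch of the encoding fallback described in the comment above (the wrapper name fetch is just for illustration, not part of the original code):

# Try UTF-8 first and fall back to GBK, the same try/except pattern used in Parts 3 and 4.
def fetch(url):
    try:
        return getData1(url)   # UTF-8 page
    except UnicodeDecodeError:
        return getData(url)    # GBK page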
   

Part 3: Login

# Login routine
def login_in(nick,pwd):
    prelogin_url = 'http://login.sina.com.cn/sso/prelogin.php?entry=weibo&callback=sinaSSOController.preloginCallBack&su=%s&rsakt=mod&checkpin=1&client=ssologin.js(v1.4.15)&_=1400822309846' % nick
    preLogin = getData(prelogin_url)
    servertime = re.findall('"servertime":(.*?),', preLogin)[0]
    pubkey = re.findall('"pubkey":"(.*?)",', preLogin)[0]
    rsakv = re.findall('"rsakv":"(.*?)",', preLogin)[0]
    nonce = re.findall('"nonce":"(.*?)",', preLogin)[0]
    su = base64.b64encode(bytes(urllib.request.quote(nick), encoding='utf-8'))
    rsaPublickey = int(pubkey, 16)
    key = rsa.PublicKey(rsaPublickey, 65537)
    message = bytes(str(servertime) + '\t' + str(nonce) + '\n' + str(pwd), encoding='utf-8')
    sp = binascii.b2a_hex(rsa.encrypt(message, key))
    param = {'entry': 'weibo', 'gateway': 1, 'from': '', 'savestate': 7, 'useticket': 1,
             'pagerefer': 'http://login.sina.com.cn/sso/logout.php?entry=miniblog&r=http%3A%2F%2Fweibo.com%2Flogout.php%3Fbackurl%3D',
             'vsnf': 1, 'su': su, 'service': 'miniblog', 'servertime': servertime, 'nonce': nonce, 'pwencode': 'rsa2',
             'rsakv': rsakv, 'sp': sp, 'sr': '1680*1050',
             'encoding': 'UTF-8', 'prelt': 961,
             'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack'}
    try:
        s = postData1('http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.15)', param)
    except UnicodeDecodeError:
        s = postData('http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.15)', param)


    if len(re.findall(r"location\.replace\('(.*?)'\);", s)) == 0:
        urll = re.findall(r'location\.replace\("(.*?)"\);', s)[0]
    else:
        urll = re.findall(r"location\.replace\('(.*?)'\);", s)[0]

    getData(urll)
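
A quick sanity check (not part of the original post) is to inspect the cookie jar after logging in; which cookie names Sina actually sets is not documented here, so the sketch simply prints whatever landed in cj from Part 2:

def show_cookies():
    # Print whatever cookies the opener from Part 2 collected; a successful login
    # should leave Sina/Weibo session cookies in the jar.
    for c in cj:
        print(c.name, c.domain)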

Part 4: Driver loop and parsing
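
The loop below calls write_log, which is never defined in the post; a minimal placeholder (the log file name here is an assumption) keeps the script runnable:

def write_log(msg):
    # Stand-in for the undefined write_log: append problem links to a plain-text log.
    with open("error_log.txt", "a", encoding="utf-8") as f:
        f.write(str(msg) + "\n")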

df = pd.read_csv("test.csv")  # one URL per row
userid = ''  # fill in your own account
pwd = ''  # fill in your own password

count = 0

for hang in range(len(df["link"].values)):
    print(hang)
    if count % 100 == 0:  # In case the session drops partway through the crawl, log in again every 100 URLs
        login_in(userid, pwd)
    try:
        link = df["link"].values[hang]
        try:
            text = getData1(link)
        except UnicodeDecodeError:
            text = getData(link)
        info = re.findall(r"\.*?\<", text)  # Core of the parsing; print the page content to see why this pattern is used
        df1 = pd.DataFrame(columns=('link', 'zhuanfa', 'comment', 'dianzan'))
        if len(info) == 4:
            df1.loc[0] = [link, info[1][4:-1], info[2][4:-1], info[3][4:-1]]
        elif len(info) == 7:
            df1.loc[0] = [link, info[4][4:-1], info[5][4:-1], info[6][4:-1]]
        else:
            write_log(link)
            continue
        df1.to_csv("results.csv", index=False, header=False, mode="a", encoding="gbk")
    except Exception:
        write_log(link + ": error")
        continue

    count = count + 1

Remaining issues

The script above needed three different devices to get through all of the URLs: roughly every 10,000 requests, the device would get blocked from accessing the site.
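
One common mitigation (a sketch only; the values are guesses and not verified against Weibo's actual limits) is to throttle requests inside the loop:

import time, random

# Hypothetical throttling: pause 1-3 seconds between requests to reduce the chance
# of the device/IP being blocked. The exact thresholds are assumptions.
time.sleep(random.uniform(1, 3))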
