Python + Selenium 爬取B站用户信息(代理IP池 + pymysql 存储)

import bs4
import json
import time
import pymysql
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


def getPage(mid, n, href):
    """Fetch one page of a user's follower list from the Bilibili relation API.

    Every proxy in the hard-coded pool is tried in order until a request
    comes back with HTTP 200.

    Args:
        mid: User id whose followers are requested (sent as ``vmid``).
        n: Page number (sent as ``pn``); 50 entries are requested per page.
        href: The user's space URL; used to build the ``Referer`` header.

    Returns:
        The first ``requests.Response`` with status 200. If no proxy yields
        a 200, the last response that was received is returned.

    Raises:
        requests.RequestException: If every proxy failed at the transport
            level, so there is no response at all to hand back.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0',
        'Cookie': "",
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Connection': 'keep-alive',
        'Referer': href+'/fans/fans',
    }
    params = (
        ('vmid', str(mid)),
        ('pn', str(n)),
        ('ps', '50'),
        ('order', 'desc'),
    )

    # NOTE(review): the addresses carry neither scheme nor port - confirm the
    # port each proxy actually listens on.
    proxy = ["116.117.134.134", "112.80.248.73", "47.99.209.194", "1.181.48.68", "60.255.151.81", "202.108.22.5", "223.104.38.117"]

    response = None
    last_error = None
    for i, address in enumerate(proxy):
        print(i)
        # requests selects a proxy by URL scheme, so the key must be 'https';
        # the previous key 'https://' never matched and the pool was ignored.
        proxies = {'https': address}
        try:
            response = requests.get(
                'https://api.bilibili.com/x/relation/followers',
                proxies=proxies,
                headers=headers,
                params=params,
                timeout=10,  # a dead proxy must not block the crawl forever
            )
        except requests.RequestException as exc:
            # Unreachable proxy: remember why and move on to the next one.
            last_error = exc
            continue
        if response.status_code == 200:
            return response

    print("IP 全部失效")
    if response is None and last_error is not None:
        raise last_error
    return response

def getUserDetails(mid):
    """Fetch a user's profile from the Bilibili ``x/space/acc/info`` API.

    Every proxy in the hard-coded pool is tried in order until a request
    comes back with HTTP 200.

    Args:
        mid: User id to look up (sent as the ``mid`` query parameter).

    Returns:
        The first ``requests.Response`` with status 200. If no proxy yields
        a 200, the last response that was received is returned.

    Raises:
        requests.RequestException: If every proxy failed at the transport
            level, so there is no response at all to hand back.
    """
    # NOTE(review): this dict is handed to requests' ``cookies=`` argument
    # unchanged. It looks like a browser cookie record for 'buvid3' (with the
    # 'path'/'domain' values apparently swapped) rather than a name->value
    # mapping - confirm whether {'buvid3': <value>} was intended.
    cookies = {'domain': '/',
          'expires': 'false',
          'httpOnly': 'false',
          'name': 'buvid3',
          'path': 'Fri, 29 Jan 2021 08:50:10 GMT',
          'value': '7A29BBDE-VA94D-4F66-QC63-D9CB8568D84331045infoc,bilibili.com'}

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0',
        'Accept': 'application/json, text/plain, */*',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Origin': 'https://space.bilibili.com',
        'Connection': 'keep-alive',
        'Referer': 'https://space.bilibili.com/546195/fans/fans',
        'Cache-Control': 'max-age=0',
    }

    params = (
        ('mid', str(mid)),
        ('jsonp', 'jsonp'),
    )

    # NOTE(review): the addresses carry neither scheme nor port - confirm the
    # port each proxy actually listens on.
    proxy = ["112.95.18.193", "112.80.248.73", "47.99.209.194", "1.181.48.68", "60.255.151.81", "202.108.22.5", "223.104.38.117"]

    response = None
    last_error = None
    for i, address in enumerate(proxy):
        print(i)
        # requests selects a proxy by URL scheme, so the key must be 'https';
        # the previous key 'https://' never matched and the pool was ignored.
        proxies = {'https': address}
        try:
            response = requests.get(
                'https://api.bilibili.com/x/space/acc/info',
                proxies=proxies,
                headers=headers,
                cookies=cookies,
                params=params,
                timeout=10,  # a dead proxy must not block the crawl forever
            )
        except requests.RequestException as exc:
            # Unreachable proxy: remember why and move on to the next one.
            last_error = exc
            continue
        if response.status_code == 200:
            return response

    print("IP 全部失效")
    if response is None and last_error is not None:
        raise last_error
    return response

def getUpInfoBySelenium(href, mid):
    """Open a user's space page in Chrome and print the follow/fan counts.

    The counts are read from the rendered page: the ``<p>`` elements with
    class ``n-data-v space-attention`` and ``n-data-v space-fans``.

    Args:
        href: URL of the page to load.
        mid: User id. Not used by this function; kept for caller
            compatibility.

    Returns:
        None. The two counts are printed to stdout.

    Raises:
        AttributeError: If either element is absent from the parsed page
            (``soup.find`` returns ``None``).
    """
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    browser = webdriver.Chrome(
        # Raw string: the backslashes are path separators, not escapes.
        executable_path=r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe",
        # The options were previously built but never handed to the driver,
        # so '--headless' had no effect.
        options=chrome_options,
    )
    try:
        # Navigation happens inside the try so a failed load still shuts
        # the browser down.
        browser.get(href)
        soup = BeautifulSoup(browser.page_source, 'html.parser')
        focus = soup.find('p', 'n-data-v space-attention').text  # number of accounts followed
        fans = soup.find('p', 'n-data-v space-fans').text  # number of fans
        print("关注数" + str(focus), "粉丝数" + str(fans))
    finally:
        # quit() ends the whole driver session; close() only closes the
        # current window and can leave the chromedriver process behind.
        browser.quit()

def viplevel(vip):
    """Translate a numeric VIP type code into its display label.

    Args:
        vip: VIP type code; compared against 0 and 1 with ``==``.

    Returns:
        '非会员' for 0, '会员' for 1, and '大会员' for any other value.
    """
    if vip == 0:
        return '非会员'
    return '会员' if vip == 1 else '大会员'

def createDb():
    """Create the ``bilibili`` database and its ``up`` and ``fans`` tables.

    Connects to MySQL on localhost:3306 as ``root``. Every statement uses
    ``IF NOT EXISTS``, so calling this again on an initialised server does
    not fail.

    Returns:
        None.
    """
    # Step 1: create the database. This connection names no schema because
    # the schema may not exist yet.
    db = pymysql.connect(host='localhost', user='root', password='admin', port=3306)
    try:
        cursor = db.cursor()
        # IF NOT EXISTS keeps a second run from failing on the existing
        # database, matching the table statements below.
        cursor.execute('CREATE DATABASE IF NOT EXISTS bilibili')
        cursor.close()
    finally:
        # The original never closed this first connection.
        db.close()

    # Step 2: create both tables over a single connection to the new schema.
    db = pymysql.connect(host='localhost', user='root', password='admin', port=3306, db='bilibili')
    try:
        cursor = db.cursor()

        sql = 'CREATE TABLE IF NOT EXISTS up (id int(11) NOT NULL AUTO_INCREMENT, ' \
              'up_id VARCHAR(255) NOT NULL,up_name VARCHAR(255) NOT NULL, ' \
              'sex VARCHAR(10) NOT NULL, birthday VARCHAR(255),' \
              'focus VARCHAR(255),fans VARCHAR(255),area VARCHAR(255),' \
              'praise VARCHAR(255),view VARCHAR(255),' \
              'sign VARCHAR(255) NOT NULL,title VARCHAR(255) NOT NULL,' \
              'PRIMARY KEY (id,up_id))'
        cursor.execute(sql)

        sql = 'CREATE TABLE IF NOT EXISTS fans (id int(11) NOT NULL AUTO_INCREMENT,' \
              'up_id VARCHAR(255) NOT NULL,fans_id VARCHAR(255) NOT NULL,' \
              'fans_name VARCHAR(255) NOT NULL, sex VARCHAR(10) NOT NULL,' \
              'fans_level VARCHAR(10) NOT NULL,viplevel VARCHAR(255) NOT NULL,' \
              'time VARCHAR(255) NOT NULL,' \
              'PRIMARY KEY (id))'
        cursor.execute(sql)
    finally:
        db.close()

def insertUp(mid, name, sex, sign, birthday, title):
    """Insert one uploader row into the ``up`` table of the ``bilibili`` database.

    The insert is best-effort: on failure the transaction is rolled back and
    the error is printed instead of being raised.

    Args:
        mid: Stored in column ``up_id``.
        name: Stored in column ``up_name``.
        sex: Stored in column ``sex``.
        sign: Stored in column ``sign``.
        birthday: Stored in column ``birthday``.
        title: Stored in column ``title``.

    Returns:
        None.
    """
    db = pymysql.connect(host='localhost', user='root', password='admin', port=3306, db='bilibili')
    try:
        cursor = db.cursor()
        sql = 'INSERT INTO up(up_id,up_name,sex,sign,birthday,title) values(%s,%s,%s,%s,%s,%s)'
        val = (mid, name, sex, sign, birthday, title)
        cursor.execute(sql, val)
        db.commit()
    except Exception as exc:
        # Still best-effort, but the failure is reported rather than hidden,
        # and KeyboardInterrupt/SystemExit are no longer swallowed.
        db.rollback()
        print("insertUp failed:", exc)
    finally:
        db.close()

def insertFans(up_mid, fans_mid, time, uname, viplevel, sex, level):
    """Insert one follower row into the ``fans`` table of the ``bilibili`` database.

    The insert is best-effort: on failure the transaction is rolled back and
    the error is printed instead of being raised.

    Args:
        up_mid: Stored in column ``up_id``.
        fans_mid: Stored in column ``fans_id``.
        time: Stored in column ``time``. This parameter shadows the ``time``
            module inside the function.
        uname: Stored in column ``fans_name``.
        viplevel: Stored in column ``viplevel``. This parameter shadows the
            module-level ``viplevel`` function inside the function.
        sex: Stored in column ``sex``.
        level: Stored in column ``fans_level``.

    Returns:
        None.
    """
    db = pymysql.connect(host='localhost', user='root', password='admin', port=3306, db='bilibili')
    try:
        cursor = db.cursor()
        sql = 'INSERT INTO fans(up_id,fans_id,fans_name,sex,fans_level,viplevel,time) values(%s,%s,%s,%s,%s,%s,%s)'
        val = (up_mid, fans_mid, uname, sex, level, viplevel, time)
        cursor.execute(sql, val)
        db.commit()
    except Exception as exc:
        # Still best-effort, but the failure is reported rather than hidden,
        # and KeyboardInterrupt/SystemExit are no longer swallowed.
        db.rollback()
        print("insertFans failed:", exc)
    finally:
        db.close()


if __name__ == '__main__':
    # Script entry point: for each hard-coded uploader id, print the profile,
    # print the follow/fan counts via Selenium, then walk follower pages 1-4
    # and print each follower's details.
    # NOTE(review): createDb / insertUp / insertFans are defined in this file
    # but are not called here, so nothing is written to MySQL by this script.

    up_id = ["546195", "9824766", "777536", "321173469", "517327498", "122879", "20165629", "14110780", "62540916", "19577966"]
    for current_id in up_id:
        href = "https://space.bilibili.com/" + str(current_id) + "/video"

        up = getUserDetails(current_id)  # uploader profile (JSON)
        up_info = json.loads(up.text)['data']
        up_mid = up_info['mid']
        name = up_info['name']
        sex = up_info['sex']
        sign = up_info['sign']
        birthday = up_info['birthday']
        title = up_info['official']['title']
        print("up主uid:"+str(up_mid), "用户名:"+name, "性别:"+sex, "留言:"+sign, "生日:"+birthday, "称号:"+title)

        # ------------------------------------------------ #
        print("开始 selenium")
        getUpInfoBySelenium(href, str(up_mid))  # prints the follow/fan counts
        print("结束 selenium")
        # ------------------------------------------------ #

        print("粉丝数据:", end='')
        for j in range(1, 5):
            # j is an int: it must be converted before concatenation
            # ("j:" + j raised TypeError).
            print("j:" + str(j))
            r = getPage(current_id, j, href)
            page = json.loads(r.text)  # follower page as JSON
            for entry in page['data']['list']:
                fans_mid = entry['mid']
                mtime = entry['mtime']
                uname = entry['uname']
                vip = entry['vip']['vipType']
                fansDetails = getUserDetails(fans_mid)
                # Kept in its own name so the page being iterated is not
                # rebound mid-loop.
                fans_info = json.loads(fansDetails.text)['data']
                sex = fans_info['sex']
                level = fans_info['level']
                print("uid:" + str(fans_mid), "关注时间:"+ time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(mtime)), "用户名:" + uname, "vip等级:" + viplevel(vip), "性别:"+sex, "账户等级:"+str(level))
            time.sleep(5)  # pause between pages to avoid an IP ban

本文初始定义了几个UP主的id号,因为这几个UP主的粉丝量较大,故易获取粉丝信息

本文通过构造UP主的空间信息,获取粉丝量,并访问粉丝信息,通过python的pymysql库连接本地mysql进行数据存储,其中的SQL代码已经内嵌进去。对于B站高频访问会有IP限制,所以本文也采用了代理IP池的方法,不过本文并没有进一步通过构建实时IP池进行刷新IP,这一点受限于有效IP过少,所以仅使用了几个IP进行替换。

另,B站的UP主粉丝信息并不能持续翻页,这点受制于B站网站的用户信息限制或本博主技术不到位,无法突破。故每个UP主的粉丝信息只能爬取几十页的粉丝目录。

你可能感兴趣的:(Python爬虫,python,pymysql,selenium,代理IP池,爬虫)