


# -*- coding:UTF-8 -*-
# 作者: 囚生CY
# 最后更新: 20190715
# 转载请注明原作者, 禁止用于商业用途

import csv
import json
import os
import random
import re
import sys
import time

import numpy
import pandas
from bs4 import BeautifulSoup
from matplotlib import pyplot as plt
from PIL import Image
from requests import Session
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait

class BiliBili():
	def __init__(self,
		userAgent="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0",
		username=None,
		password=None,
	):
		"""Configure the crawler and create today's output folders and CSV files.

		Args:
			userAgent: User-Agent header value placed on the requests session.
			username: Account name stored on the instance (optional).
			password: Account password stored on the instance (optional).
		"""
		# --- constructor arguments ---
		# username/password are keyword parameters with defaults so that the
		# existing zero-argument call `BiliBili()` keeps working.
		self.username = username
		self.password = password
		self.userAgent = userAgent

		# --- commonly used settings ---
		self.workspace = os.getcwd()									 # working directory of this instance
		self.date = time.strftime("%Y%m%d")								 # date of construction, used in file names
		self.labelCompiler = re.compile(r"<[^>]+>",re.S)				 # matches any markup tag (used to strip tags)
		self.tempFolder = "Temp"										 # folder for temporary files and logs
		self.videoFolder = "Video"										 # folder for video data
		self.userFolder = "User"										 # folder for user data
		self.commentFolder = "Comment"									 # folder for comment data
		self.log = "{}.log".format(self.date)							 # log file name
		# os.path.join yields the same "\\"-separated paths on Windows as the
		# previous hand-built format strings, and valid paths elsewhere.
		self.videoPath = os.path.join(self.workspace,self.videoFolder,self.date)
		self.userPath = os.path.join(self.workspace,self.userFolder,self.date)
		self.commentPath = os.path.join(self.workspace,self.commentFolder,self.date)
		self.mainURL = "https://www.bilibili.com/"						 # BiliBili home page
		self.loginURL = "https://passport.bilibili.com/login"			 # login page
		self.videoURL = "https://www.bilibili.com/video/av{}/"			 # video page template (av number)
		self.userURL = "https://space.bilibili.com/{}"					 # user space template (user id)
		self.options = webdriver.FirefoxOptions()						 # Firefox driver options
		self.headers = {"User-Agent": userAgent}
		self.session = Session()
		self.videoField = [												 # columns of the video CSV
			"av",														 # video av number
			"title",													 # video title
			"up",														 # uploader nickname
			"follower",													 # uploader follower count
			"playback_volume",											 # play count
			"barrage",													 # bullet-comment (danmaku) count
			"like",														 # like count
			"coin",														 # coin count
			"collect",													 # favourite count
			"comment",													 # comment count
			"comment_page",												 # number of comment pages
			"category",													 # video category
			"tags",														 # video tags (separated by "|")
			"timestamp",												 # time the data was crawled
		]
		self.userField = [												 # columns of the user CSV
			"id",														 # user id
			"name",														 # nickname
			"gender",													 # gender
			"level",													 # user level
			"signature",												 # personal signature
			"is_member",												 # whether the user is a premium member
			"fans_icon",												 # whether a fan badge is enabled
			"follower",													 # people following this user
			"followee",													 # people this user follows
			"playback_volume",											 # total play count
			"reading_volume",											 # total read count
			"contribution",												 # number of uploads
			"timestamp",												 # time the data was crawled
		]
		self.commentField = [											 # columns of the comment CSV
			"av",														 # video av number
			"uid",														 # commenter id; comment rows are written with this column, so the header must carry it
			"name",														 # commenter nickname
			"level",													 # commenter level
			"text",														 # comment text
			"like",														 # like count
			"reply",													 # reply count
			"date",														 # comment date
			"timestamp",												 # time the data was crawled
		]

		# --- initialisation ---
		self.session.headers = self.headers.copy()
		self.options.add_argument("--headless")							 # configuration for the headless browser

		tempPath = os.path.join(self.workspace,self.tempFolder)
		if not os.path.exists(tempPath):
			print("正在新建文件夹以存储临时文件...")
			os.makedirs(tempPath)

		self._init_csv(self.videoPath,"video",self.videoField,"正在新建文件夹以存储视频数据{}...".format(self.date))
		self._init_csv(self.userPath,"user",self.userField,"正在新建文件夹以存储用户数据{}...".format(self.date))
		self._init_csv(self.commentPath,"comment",self.commentField,"正在新建文件夹以存储评论数据{}...".format(self.date))

	def _init_csv(self,folder,prefix,fields,message):
		"""Create `folder` and a header-only `<prefix><date>.csv` inside it when missing.

		Args:
			folder: Dated output directory for one kind of data.
			prefix: File name prefix ("video", "user" or "comment").
			fields: Column names written as the header line.
			message: Status line printed when the directory has to be created.
		"""
		csvPath = os.path.join(folder,"{}{}.csv".format(prefix,self.date))
		if os.path.exists(csvPath): return								 # never truncate data already collected today
		if not os.path.exists(folder):
			print(message)
			os.makedirs(folder)											 # also creates the parent data folder
		with open(csvPath,"w",encoding="UTF-8") as f:
			f.write(",".join(fields)+"\n")								 # trailing newline keeps the first appended row off the header line

	def login_20190408(self,):											 # User login (updated 2019-04-08; found to no longer work on 2019-07-12)
		# Drives a Firefox browser through the login page and its slider captcha,
		# returning the browser object once the page title check at the end passes.
		# NOTE(review): this method is incomplete as it stands. `find_block_space` and
		# `get_track` are called below but are not defined in the visible code, and the
		# end of `recover_picture` runs straight into what looks like the tail of
		# another helper. Part of the original text appears to be missing.

		def download_verifying_picture(divs,name):						 # Download a slider-captcha image
			# Reads the text between "(" and ")" of the first slice's style attribute
			# and saves the fetched bytes as <workspace>\<tempFolder>\<name>.webp.
			style = divs[0].attrs["style"]
			index1 = style.find("(")
			index2 = style.find(")")
			# NOTE(review): eval() is applied to text taken from the page markup; presumably
			# only meant to strip the quotes around the URL - confirm and replace with a safe parse.
			url = eval(style[index1+1:index2])
			html = self.session.get(url).content
			with open("{}\\{}\\{}.webp".format(self.workspace,self.tempFolder,name),"wb") as f: f.write(html)

		def recover_picture(divs,name):									 # Try to restore the downloaded image (assumes the slices form two rows)
			index = []
			for div in divs:											 # Iterate over all slices (52 according to the original note)
				style = div.attrs["style"]
				index1 = style.find("background-position")				 # Locate the slice's background coordinates
				temp = style[index1+21:-1].strip().replace("px","").replace("-","").split()
				temp = [int(i) for i in temp]
			# NOTE(review): `temp` is never appended to `index` in the visible code, so
			# `total` below is 0 and the division computing X fails - a line looks lost here.
			image = Image.open("{}\\{}\\{}.webp".format(self.workspace,self.tempFolder,name))
			image = numpy.asarray(image)								 # Image to array
			imageRe = numpy.zeros(image.shape)							 # Initialise the array for the restored image
			total = len(index)											 # Total number of slices
			Xaxis,Yaxis,Zaxis = image.shape								 # Image dimensions (116x312x3 according to the original note)
			X = int(2*Yaxis/total)										 # Column width of each slice (12px according to the original note)
			Y = int(Xaxis/2)											 # Row height of each slice (58px according to the original note)
			index = [[int((indice[0]-1)/X),int(indice[1]>0)] for indice in index]
			for i in range(total):										 # Restore slice by slice
				x1 = index[i][0]*X										 # Actual left coordinate of the slice
				x2 = x1+X												 # Actual right coordinate of the slice
				y1 = index[i][1]*Y										 # Actual top coordinate of the slice
				y2 = y1+Y												 # Actual bottom coordinate of the slice
				a = int(Y)												 # Original horizontal coordinate of the slice
				b1 = int((i%(total/2))*X)								 # Original top coordinate of the slice
				b2 = int((i%(total/2))*X+X)								 # Original bottom coordinate of the slice
				# The string statement below is the original note: decide which row the current
				# slice belongs to (by default the first 26 slices are row one, the last 26 row two).
				""" 判断当前切片是第几行(目前按照默认是前26个为第一行切片,后26个为第二行切片来做的) """
				# NOTE(review): `if i=10:` is not valid Python, and `xoffset` / `tracks` are not
				# defined anywhere in this helper. From here on the text looks like the tail of a
				# different helper (presumably `get_track`) fused onto this one - restore from the original.
				if i=10:
				x = random.randint(5,9)
				xoffset -= x
			for i in range(int(xoffset)): tracks.append(1)				 # Take the last few steps slowly
			return tracks

		while True:
			browser = webdriver.Firefox()								 # Start Firefox
			browser.get(self.loginURL)									 # Open the login page
			interval = 1.												 # Initial page-load wait. If the page has not loaded, the slider button below cannot be found. The author also notes that the check is sometimes a four-letter image captcha rather than a slider
			while True:													 # Loop to make sure loading succeeded, since it may fail
				xpath = "//div[@class='gt_slider_knob gt_show']"		 # XPath of the left-most slider button
				# NOTE(review): the next three lines are indented one level deeper without an
				# opening statement, and this loop has no exit. A try/except/break appears to be missing.
					time.sleep(interval)								 # Wait for loading
					div = browser.find_element_by_xpath(xpath)
					interval += .5										 # Add 0.5s to interval after each failure

			html = browser.page_source									 # The source now contains both the full captcha image and the one with the gap
			soup = BeautifulSoup(html,"lxml")							 # Parse the page source
			div1s = soup.find_all("div",class_="gt_cut_fullbg_slice")	 # Slices of the image without the gap
			div2s = soup.find_all("div",class_="gt_cut_bg_slice")		 # Slices of the image with the gap
			div3 = soup.find("div",class_="gt_slice gt_show gt_moving")	 # The missing piece itself
			download_verifying_picture(div1s,1)							 # Download the image without the gap
			download_verifying_picture(div2s,2)							 # Download the image with the gap
			recover_picture(div1s,1)									 # Restore the image without the gap
			recover_picture(div2s,2)									 # Restore the image with the gap
			xoffset = find_block_space()								 # Horizontal position of the gap (helper not defined in the visible code)
			tracks = get_track(xoffset)									 # Movement steps for the slider (helper not defined in the visible code)
			total = 0
			for track in tracks:
				total += track
			# NOTE(review): `total` is summed but not used afterwards, and no drag action is
			# performed in the visible code - presumably lost along with the helpers above.
			xpath = "//a[@class='btn btn-login']"						 # XPath of the login button
			browser.find_element_by_xpath(xpath).click()				 # Click the login button
			html = browser.page_source
			soup = BeautifulSoup(html,"lxml")
			title = soup.find("title")
			# Success is detected from the fifth character of the page title; otherwise the
			# outer loop starts again with a new browser (the previous one is not closed).
			if str(title.string[4])=="弹":
				return browser

	def login_20190712(self,):											 # 用户登录(20190712更新)
		 - 次序上是先输入用户名密码, 点击登录后才会出现验证码图片;
		 - 验证码图片的元素结构变化, 没有小切片, 并且无法获取原图链接, 这大大增加了复原的难度(而且我还找不到);
		 - 滑动按钮并未改变, 因此看起来是极验自身升级了, 因为近期无登录需求, 不打算攻破这种验证码, 认为在识别上有一定难度;

	def parse_video(self,av,driver,
		isVedioString="player-wrap",									 # Substring used to decide whether the video is still available
		maxpage=50,														 # Fetch at most maxpage pages of comments (sorted by popularity)
	):																	 # Given an av number and a browser driver, collect the video's data
		# Loads the video page, appends one row to today's video CSV, then walks the
		# comment pages appending rows to today's comment CSV.
		# Returns False when the page does not contain isVedioString, and None otherwise.
		driver.get(self.videoURL.format(av))							 # Open the video page
		html = driver.page_source										 # Grab the source immediately
		if not isVedioString in html: return False						 # The video is treated as available only if the source contains isVedioString
		while True:														 # Loading takes time. The author previously used WebDriverWait, but the page jumped back to the top once loaded and the bottom elements could no longer be found
			driver.execute_script("window.scrollBy(0,500)")				 # Scroll down towards the comments
			try: divs = driver.find_element_by_xpath('//div[@class="baffle"]')
			except: continue											 # Element not found yet: keep scrolling. NOTE(review): the bare except also hides driver failures, and the loop has no upper bound
			break														 # Reaching this line means the element was found
		html = driver.page_source										 # Get the complete source
		timestamp = int(time.time())									 # Record the timestamp right away

		soup = BeautifulSoup(html,"lxml")								 # Parse the page source
		# NOTE(review): the thirteen " - field: ..." lines below are not valid Python. They
		# look like the body of a note block whose string delimiters were lost - re-wrap
		# them in a string or turn them into comments. English rendering of the notes:
		#  - title: "//span[@class='tit']";
		#  - up: somewhat hard-coded;
		#  - follower: also rather hard-coded;
		#  - playback_volume: either from the title attribute, dropping the first 4 characters
		#    (exact total), or from the tag string (rounded to thousands);
		#  - barrage: either from the title attribute, dropping the first 7 characters
		#    (exact historical total), or from the tag string (rounded to thousands);
		#  - like: either from the title attribute, dropping the first 3 characters
		#    (exact), or from the tag string (rounded to thousands);
		#  - coin / collect: "//span[@class='tit']" (as written in the original note);
		#  - comment / comment_page: located via "//div[@class='common']" first;
		#  - category: only some early videos lack one, so the element may not be found;
		#  - tags: a few videos have none; tags are joined with "|";
		#  - timestamp: the crawl timestamp.
		 - title: "//span[@class='tit']";
		 - up: 有点硬写得;
		 - follower: 写得也很硬;
		 - playback_volume: 第一种从title属性中获取,过滤掉的前4个字符是总播放数(精确到个位), 第二种直接拿string(精确到千位);
		 - barrage: 第一种从title属性中获取, 过滤掉前7个字符是历史累计弹幕数(精确到个位), 第二种直接拿string(精确到千位);
		 - like: 第一种从title属性中获取, 过滤掉前3个字符是点赞数(精确到个位), 第二种直接拿string(精确到千位);
		 - coin: "//span[@class='tit']";
		 - collect: "//span[@class='tit']";
		 - comment: 利用先定位到"//div[@class='common']", 降低容错率;
		 - comment_page: 利用先定位到"//div[@class='common']", 降低容错率;
		 - category: 目前来看只有早期部分视频没有分类, 几乎所有视频是有分类的, 因此元素可能定位不到;
		 - tags: 少数视频无tag, 不同的tag用"|"分开;
		 - timestamp: 时间戳;
		title = str(soup.find("span",class_="tit").string)
		up = str(soup.find("div",class_="u-info").find("div",class_="name").find("a").string)
		follower = str(soup.find("i",class_="van-icon-general_addto_s").find_next_sibling().string)

		# For plays, danmaku and likes two variants are read: "...1" from the title
		# attribute and "...2" from the tag text. Only the "...1" values are written out.
		playback_volume = soup.find("span",class_="view")
		playback_volume1 = playback_volume.attrs["title"][4:]
		playback_volume2 = str(playback_volume.string)
		playback_volume2 = playback_volume2[:playback_volume2.find("播放")]

		barrage = soup.find("span",class_="dm")
		barrage1 = barrage.attrs["title"][7:]		
		barrage2 = str(barrage.string)					
		barrage2 = barrage2[:barrage2.find("弹幕")]
		temp = soup.find("div",class_="ops")
		like = temp.find("span",class_="like")
		like1 = like.attrs["title"][3:]
		# Tags are stripped with the precompiled tag regex, then newlines and spaces are removed
		like2 = self.labelCompiler.sub("",str(like)).replace("\n","").replace(" ","")
		coin = temp.find("span",class_="coin")
		coin = self.labelCompiler.sub("",str(coin)).replace("\n","").replace(" ","")
		collect = temp.find("span",class_="collect")
		collect = self.labelCompiler.sub("",str(collect)).replace("\n","").replace(" ","")

		temp = soup.find("div",class_="common")
		comment = temp.find("span",class_="b-head-t results").string
		comment = 0 if comment is None else int(comment)				 # An empty counter is treated as zero comments
		# The page count is read from the "result" span with spaces removed and its first and last characters dropped
		comment_page = 0 if comment==0 else int(str(temp.find("span",class_="result").string).replace(" ","")[1:-1])

		try: category = str(soup.find("span",class_="a-crumbs").find("a").string)
		except: category = str()										 # No category: the field is left empty (original note dated 2019-07-14). NOTE(review): bare except

		temp = soup.find("div",id="v_tag").find_all("li",class_="tag")	 # temp now holds every tag element
		tags = str()
		for tag in temp: tags += "{}|".format(tag.find("a").string)
		tags = tags[:-1]												 # Drop the trailing "|"

		# NOTE(review): values are joined with "," without quoting, so a comma inside
		# the title or tags shifts the columns of the row.
		string = str()
		for item in [av,title,up,follower,playback_volume1,barrage1,like1,coin,collect,comment,comment_page,category,tags,timestamp]: string += "{},".format(item)
		string = "{}\n".format(string[:-1])
		with open("{}\\video{}.csv".format(self.videoPath,self.date),"a",encoding="UTF-8") as f: f.write(string)

		if comment==0: return 											 # No comments: nothing more to do
		# The string statement below is the original note: "comment collection starts here".
		""" 以下开始获取评论信息 """
		driver.find_element_by_xpath("//li[@class='hot-sort  on']").click()
		page = 0														 # Current page number
		# NOTE(review): the next line is a garbled fusion of the paging loop header with a
		# comment; the statements that define `child`, `uid`, `name`, `level` and `text`
		# are missing from the visible code. The surviving comment text says the comment
		# body is awkward to read via .string, so tags are stripped with the regex, and
		# line breaks inside comments are replaced with "|" for now.
		while page标签, 不方便直接获取string, 因此选择去标签正则, 有些评论有换行, 目前先用"|"符号替代\n
				like = child.find("span",class_="like").string
				like = 0 if like is None else int(like)					 # like: the string is empty when the comment has no likes
				reply = child.find("div",class_="reply-box")
				reply = len(list(reply.children))						 # reply: number of children of the reply box
				date = str(child.find("span",class_="time").string)		 # date
				string = str()
				for item in [av,uid,name,level,text,like,reply,date,timestamp]: string += "{},".format(item)
				string = "{}\n".format(string[:-1])			
				with open("{}\\comment{}.csv".format(self.commentPath,self.date),"a",encoding="UTF-8") as f: f.write(string)
			try: driver.find_element_by_xpath("//a[@class='next']").click()
			except: break												 # The "next page" button can no longer be found

	def parse_user(self,uid,driver,
		xpath_flag="//div[@id='app']"									 # 用于判定页面是否加载完成: 未登录状态时或不为TA的粉丝时为
, 登录状态时且为TA的粉丝为
, 总之只看id属性差不多够了 ): # 给定用户ID与浏览器驱动, 获取用户数据 driver.get(self.userURL.format(uid)) WebDriverWait(driver,15).until(lambda driver: driver.find_element_by_xpath(xpath_flag).is_displayed()) html = driver.page_source # 相对来说用户空间的html加载很快 timestamp = int(time.time()) soup = BeautifulSoup(html,"lxml") # 解析起来也较为容易 self.userField = [ # 用户数据库字段 "id", # 用户ID "name", # 用户昵称 "gender", # 性别 "level", # 用户等级 "signature", # 个性签名 "is_member", # 是否为大会员 "fans_icon", # 是否开通粉丝勋章 "follower", # 关注TA的人 "followee", # TA关注的人 "playback_volume", # 总播放量 "reading_volume", # 总阅读数 "contribution", # 投稿数 "timestamp", # 爬取数据的时间戳 ] with open("log_{}.html".format(uid),"w",encoding="UTF-8") as f: f.write(html) temp = soup.find("div",class_="h-basic") # 定位到左上部用户信息区域 name = str(temp.find("span",id="h-name").string) # name: 用户昵称应该不会有什么问题 gender = temp.find("span",id="h-gender").attrs["class"] # 在性别标签的class属性下包含了性别信息 gender = gender[2] if len(gender)==3 else str() # gender: 性别为男女不定, class标签为["icon","gender","male"/"female"], 没有填写性别的用户没有第三个class, 且不展示 level = temp.find("a",class_="h-level m-level").attrs["lvl"][0] # level: level1~level6, 应该也不会有什么问题 signature = str(temp.find("div",class_="h-basic-spacing").find("h4",class_="h-sign").string) signature = str() if signature=="None" else signature.strip() # signature: 个性签名为空处理为空字符串, 不展示 is_member = temp.find("a",class_="h-vipType").string is_member = False if is_member is None else True # is_member: 开通年度大会员的用户string字段是"年度大会员", 未开通的该字段不展示且为空 fans_icon = temp.find("span",class_="h-fans-icon") fans_icon = False if fans_icon is None else True # fans_icon: 与上面的不展示的标签不同, 未开通粉丝勋章的用户是没有该标签的 temp = soup.find("div",class_="n-statistics") # 定位到右上部用户数据统计区域 follower = temp.find("a",class_="n-data n-fs").attrs["title"] # follower: 这个应该没有太多问题, title属性里是精确的个数, string部分里精确到千位 followee = temp.find("a",class_="n-data n-gz").attrs["title"] # followee: 这个应该没有太多问题, title属性里是精确的个数, string部分里精确到千位 volumes = temp.find_all("a",class_="n-data n-bf") # 这部分是流量区域: 目前我只找到播放数与阅读数两种, 没有投稿的人不会有播放数, 
没有动态的人不会有阅读数 if len(volumes)==0: playback_volume = reading_volume = 0 # 无播放数, 无阅读数(大部分边缘用户) elif len(volumes)==1: # 播放数阅读数二选一(代表人物:papi酱, 1532165) string = str(volume[0].find("p",class_="n-data-k").string) if string=="播放数": reading_volume = 0 playback_volume = volumes[0].attrs["title"].replace(",","") elif string=="阅读数": playback_volume = 0 reading_volume = volumes[0].attrs["title"].replace(",","") else: # 异常记录 with open("{}\\{}".format(self.tempFolder,self.log),"a") as f: f.write("Error1: 无法确定流量类别!UID{}\t{}\n".format(uid,time.strftime("%Y-%m-%d %H:%M:%S"))) elif len(volumes)==2: # 有播放数, 有阅读数(代表人物:lexburner, 777536) playback_volume = volumes[0].attrs["title"].replace(",","") # playback_volume: 播放量超过1000则title属性里的精确播放数会有","符号 reading_volume = volumes[1].attrs["title"].replace(",","") # reading_volume: 阅读量超过1000则title属性里的精确阅读数会有","符号 else: # 如果超过2个我决定抛出异常 with open("{}\\{}".format(self.tempFolder,self.log),"a") as f: f.write("Error2: 流量数量超过2!UID{}\t{}\n".format(uid,time.strftime("%Y-%m-%d %H:%M:%S"))) temp = soup.find("a",class_="n-btn n-video n-audio n-article n-album") contribution = int(soup.find("span",class_="n-num").string) string = str() for item in [uid,name,gender,level,signature,is_member,fans_icon,follower,followee,playback_volume,reading_volume,contribution,timestamp]: string += "{},".format(item) string = "{}\n".format(string[:-1]) with open("{}\\user{}.csv".format(self.userPath,self.date),"a",encoding="UTF-8") as f: f.write(string) def parse(self, headless=False, ): av = 0 # 记录当前av号 driver = webdriver.Firefox(options=self.options) if headless else webdriver.Firefox() driver.implicitly_wait(10) # 设置等待超时 while True: av += 1 string = "正在获取av{}的信息...".format(av) print(string) with open("{}\\{}".format(self.tempFolder,self.log),"a") as f: f.write("{}\t{}\n".format(string,time.strftime("%Y-%m-%d %H:%M:%S"))) self.parse_video(av,driver) driver.quit() def test(self,): # 测试代码 driver = webdriver.Firefox() uids = [777536,1532165,281317955] 
driver.implicitly_wait(10) for uid in uids: self.parse_user(uid,driver) if __name__ == "__main__": bilibili = BiliBili() #bilibili.parse() bilibili.test()


