#哔哩哔哩每日热榜 #网页链接:https://www.bilibili.com/ranking? import requests from bs4 import BeautifulSoup #发出request请求,获取html网页 url = 'https://www.bilibili.com/ranking?' kv = {'user-agent': 'Mozilla/5.0'}#伪装爬虫 r = requests.get(url,timeout = 30,headers=kv) r.text#获取源代码 html=r.text #解析网页,提取内容 soup=BeautifulSoup(html,'lxml')#构造Soup的对象 res = soup.find_all('a',class_='title') num = 0 text = '' for i in res: num+=1 text+='{}{}\n'.format(num,i.string)#先把内容保存到变量里去 print(text) #保存 with open('rank.text','w',encoding='utf8')as fout: fout.write(text)
2020.3.26
import requests
from bs4 import BeautifulSoup
import csv
import datetime
url = 'https://www.bilibili.com/ranking?'
#发起网络请求
response = requests.get(url)
html_text = response.text
soup = BeautifulSoup(html_text,'html.parser')
def main():
#用来保存视频信息的对象
class Video:
def __init__(self,rank,title,point,visit,up,url):
self.rank = rank
self.title = title
self.point = point
self.visit = visit
self.up = up
self.url = url
def to_csv(self):
return [self.rank,self.title,self.point,self.visit,self.up,self.url]
@staticmethod
def csv_title():
return ['排名','标题','分数','播放量','UP','URL']
#提取列表
items = soup.find_all('li',{'class':'rank-item'})
videos = [] #保存提取出来的video
for itm in items:
title = itm.find('a',{'class':'title'}).text#标题
point = itm.find('div',{'class':'pts'}).text#综合得分
rank = itm.find('div',{'class':'num'}).text#排名
visit = itm.find('span',{'class':'data-box'}).text#播放量
up = itm.find_all('a',)[2].text#up
url = itm.find('a',{'class':'title'}).get('href')#获取链接
v = Video(rank,title,point,visit,up,url)
videos.append(v)
#保存
now_str = datetime.datetime.now().strftime('%Y%m%d')
file_name = f'top100.csv_{now_str}.csv'
with open(file_name,'w',newline='') as f:
writer = csv.writer(f)
writer.writerow(Video.csv_title())
for v in videos:
writer.writerow(v.to_csv())
1.打开网页
2.获取源代码
这里找到需要提取的标题a标签,分析特点,它的类是title,在代码中可以用find_all函数查找
发现成功将排行榜爬取下来,想到可以用for循环把结果一个个打印出来
因为内容都是按顺序排下来的,所以可以自己弄数字形成排名
然后把内容保存到一个变量里去并检查有没有正常保存
最后直接保存到文件里面去,创建一个rank.txt,以写入的方式打开,把它赋值到fout这个变量里,fout写入获取到的文本内容
获取数据截图