import requests
from bs4 import BeautifulSoup
import csv
嘿嘿,该项目需要引入3个库
1.requests(用于连接URL(网络))
2.bs4(用于解析URL(网络))
3.csv(把文件保存为csv模式(类似excel))
如果没有这些库,可以在命令行(cmd)窗口用 pip install 库名 来安装
1.获取数据所需要的库: requests
2.解析数据所需要的库:beautifulsoup4
3.保存数据到csv的库:csv
1首先第一个库requests用于连接网络
# Target: the bilibili ranking page.
url = 'https://www.bilibili.com/ranking'
# requests has NO default timeout — without one a dead connection
# hangs the script forever. 10s is a generous ceiling for one page.
response = requests.get(url, timeout=10)
requests里面的get和post后面加要爬取的网站就可以打开该网站啦
2其次用bs4里面的BeautifulSoup对返回的HTML页面进行解析
# Keep the raw markup around, then hand it to BeautifulSoup.
# 'html.parser' is the stdlib backend, so nothing extra to install.
html_text = response.text
soup = BeautifulSoup(markup=html_text, features='html.parser')
然后用soup的findAll(即find_all)来寻找我们要爬取的资料,以下爬取了6个属性
# Every ranked video lives in an <li class="rank-item">.
# find_all is the modern spelling of findAll — same method.
items = soup.find_all('li', {'class': 'rank-item'})
print(len(items))
for itm in items:
    # Look each tag up once instead of repeating the query per field.
    title_link = itm.find('a', {'class': 'title'})
    anchors = itm.find_all('a')
    title = title_link.text
    up = anchors[2].text
    score = itm.find('div', {'class': 'pts'}).find('div').text
    rank = itm.find('div', {'class': 'num'}).text
    url = title_link.get('href')
    space = anchors[2].get('href')
    # The uploader link looks like //space.bilibili.com/<uid>; keep the uid.
    up_id = space[len('//space.bilibili.com/'):]
3最后就是把资料保存下来了
1.定义类保存
class top:
    """Plain record for one entry of the bilibili ranking list."""

    def __init__(self, rank, title, score, up, up_id, url):
        # Store every field verbatim; tuple unpacking keeps the
        # six assignments compact and in declaration order.
        (self.rank, self.title, self.score,
         self.up, self.up_id, self.url) = (rank, title, score, up, up_id, url)
2.定义链表保存数据
# Accumulator for the parsed `top` records, one per ranked video.
topcontent = list()
3.完善类,让它可以保存完整数据
class top:
    """Ranking record that knows how to serialise itself for csv.writer."""

    def __init__(self, rank, title, score, up, up_id, url):
        # Six fields, stored verbatim, in declaration order.
        (self.rank, self.title, self.score,
         self.up, self.up_id, self.url) = (rank, title, score, up, up_id, url)

    def tocsv(self):
        # One row: attribute values in exactly the header's column order.
        return [getattr(self, name) for name in self.csvtitle()]

    @staticmethod
    def csvtitle():
        # Header row; tocsv() derives its order from this list.
        return ['rank', 'title', 'score', 'up', 'up_id', 'url']
4.存入类的变量
for itm in items:
    # Query each tag once; reuse the results for text and href.
    title_link = itm.find('a', {'class': 'title'})
    anchors = itm.find_all('a')
    title = title_link.text
    up = anchors[2].text
    score = itm.find('div', {'class': 'pts'}).find('div').text
    rank = itm.find('div', {'class': 'num'}).text
    url = title_link.get('href')
    space = anchors[2].get('href')
    # Uploader profile href is //space.bilibili.com/<uid>; strip the prefix.
    up_id = space[len('//space.bilibili.com/'):]
    v = top(rank, title, score, up, up_id, url)
    topcontent.append(v)
5.最后保存到csv就欧克拉
file_name = 'TOP100.csv'
# newline='' is required by the csv module; utf-8 keeps Chinese titles intact.
with open(file_name, 'w', newline='', encoding='utf-8') as file:
    pen = csv.writer(file)
    # Call csvtitle on the CLASS: the old code used the leftover loop
    # variable `v`, which raises NameError whenever topcontent is empty.
    pen.writerow(top.csvtitle())
    for v in topcontent:
        pen.writerow(v.tocsv())
最后附上全代码哈
import requests
from bs4 import BeautifulSoup
import csv
class top:
    """Ranking record that knows how to serialise itself for csv.writer."""

    def __init__(self, rank, title, score, up, up_id, url):
        # Six fields, stored verbatim, in declaration order.
        (self.rank, self.title, self.score,
         self.up, self.up_id, self.url) = (rank, title, score, up, up_id, url)

    def tocsv(self):
        # One row: attribute values in exactly the header's column order.
        return [getattr(self, name) for name in self.csvtitle()]

    @staticmethod
    def csvtitle():
        # Header row; tocsv() derives its order from this list.
        return ['rank', 'title', 'score', 'up', 'up_id', 'url']
# Fetch the ranking page ONCE. requests has no default timeout, so set
# one; the old `print(requests.get(url))` issued a second, wasted request.
url = 'https://www.bilibili.com/ranking'
response = requests.get(url, timeout=10)
print(response)

html_text = response.text
soup = BeautifulSoup(html_text, 'html.parser')

topcontent = []
# NOTE(review): bilibili has reworked this page over the years; if the
# ranking is now rendered client-side by JavaScript these selectors will
# match nothing — confirm against the live page.
items = soup.findAll('li', {'class': 'rank-item'})
print(len(items))
for itm in items:
    # Query each tag once; reuse the results for text and href.
    title_link = itm.find('a', {'class': 'title'})
    anchors = itm.find_all('a')
    title = title_link.text
    up = anchors[2].text
    score = itm.find('div', {'class': 'pts'}).find('div').text
    rank = itm.find('div', {'class': 'num'}).text
    # Use a distinct name so the loop no longer clobbers the page `url`.
    video_url = title_link.get('href')
    space = anchors[2].get('href')
    # Uploader profile href is //space.bilibili.com/<uid>; strip the prefix.
    up_id = space[len('//space.bilibili.com/'):]
    topcontent.append(top(rank, title, score, up, up_id, video_url))
print(len(topcontent))

file_name = 'TOP100.csv'
# newline='' is required by the csv module; utf-8 keeps Chinese titles intact.
with open(file_name, 'w', newline='', encoding='utf-8') as file:
    pen = csv.writer(file)
    # Header comes from the CLASS, not a leftover loop variable, so it
    # still writes when no rows were scraped.
    pen.writerow(top.csvtitle())
    for v in topcontent:
        pen.writerow(v.tocsv())