关键函数scratchInfo:
def scratchInfo(AlbumNamesList, PerformerList, RatingList, CommentsList, PicUrlList, size, url):
    """Scrape album info from every page of a Douban doulist into the given lists.

    Fetches ``url + "0"``, ``url + "25"``, ... for ``size + 1`` pages (25 items
    per page) and appends one entry per album to each caller-supplied list.
    The hard-coded string slices (``st[9:-7]`` etc.) strip the fixed
    whitespace/markup surrounding each field in Douban's HTML — TODO confirm
    they still match the current page layout.
    """
    for page in range(size + 1):
        newUrl = url + str(page * 25)  # doulist paging: ?start=0, 25, 50, ...
        text = getHTMLText(newUrl)
        soup = BeautifulSoup(text, "html.parser")
        for item in soup.find_all(class_="title"):  # album names
            st = item.a.string
            AlbumNamesList.append(st[9:-7])
        for item in soup.find_all(class_="abstract"):  # performer names
            st = item.contents[0]
            PerformerList.append(st[24:-13])
        for item in soup.find_all(class_="rating_nums"):  # ratings
            st = str(item.string)
            if st == "None":  # unrated albums have no number; substitute 0.0
                st = "0.0"
            RatingList.append(st)
        for item in soup.find_all(class_="ft"):  # editor comments
            div_class = item.div.attrs['class'][0]  # renamed: avoid shadowing builtin ``type``
            if div_class != "comment-item":
                st = " \n"  # placeholder for entries without a comment
            else:
                st = item.div.blockquote.contents[2]
            CommentsList.append(st)
        for item in soup.find_all(class_="post"):  # cover image URLs
            st = item.a.img.attrs["src"]
            PicUrlList.append(st)
完整代码(以爬取【MCB杂志上评出】 百张史上最具革命性的专辑 为例):
import requests
from bs4 import BeautifulSoup
def getHTMLText(url):
    """Fetch *url* and return its decoded HTML text, or "" on any request failure.

    The response encoding is replaced with requests' apparent_encoding guess so
    pages with wrong/missing charset headers (common on Chinese sites) decode
    correctly.
    """
    try:
        r = requests.get(url)
        r.raise_for_status()  # turn 4xx/5xx responses into exceptions
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Narrowed from a bare ``except:`` — only request-related failures are
        # best-effort here; anything else (e.g. KeyboardInterrupt) propagates.
        print("Error")
        return ""
def downloadPicture(PicUrlList, AlbumNamesList, di="D://PythonRequestsDownload//"):
    """Download each album cover to ``<di><album name>.jpg``.

    ``di`` was a hard-coded constant; it is now a defaulted parameter
    (backward-compatible) so callers can choose the target directory.
    ``zip`` stops at the shorter list, so a length mismatch between URLs and
    album names no longer raises IndexError.

    NOTE(review): album names containing characters illegal in Windows file
    names (``/ : ? *`` ...) will still make ``open`` fail — confirm inputs.
    """
    for pic_url, album in zip(PicUrlList, AlbumNamesList):
        r = requests.get(pic_url)
        path = di + album + ".jpg"
        with open(path, "wb") as f:
            f.write(r.content)
def getSize(url):
    """Return the number of extra 25-item pages in the doulist.

    Reads the total item count from the first "doulist-filter" element's
    ``<span>`` (text like "(100)"), strips the parentheses and integer-divides
    by the 25-per-page size. Prints the result as a progress aid.
    """
    text = getHTMLText(url)
    soup = BeautifulSoup(text, "html.parser")
    filters = soup.find_all(class_="doulist-filter")
    count_text = filters[0].span.text  # renamed: the original shadowed builtin ``str``
    num = count_text[1:-1]  # strip the surrounding parentheses
    size = int(num) // 25  # 25 entries per doulist page
    print(size)
    return size
def DeleteElement(numList, AlbumNamesList, PerformerList, CommentsList, PicUrlList):
    """Remove the entries at the indices in *numList* from every parallel list.

    Fixes two defects in the original:
    - Indices are deleted in descending order; the original forward loop
      shifted later indices after each ``del``, deleting the wrong items
      whenever ``numList`` held more than one index.
    - ``PicUrlList`` was accepted but never trimmed, which desynchronized it
      from ``AlbumNamesList`` and made downloadPicture() pair covers with the
      wrong albums. It is now trimmed too (guarded, in case it is shorter).
    """
    for i in sorted(set(numList), reverse=True):
        del AlbumNamesList[i]
        del PerformerList[i]
        del CommentsList[i]
        if i < len(PicUrlList):
            del PicUrlList[i]
def scratchInfo(AlbumNamesList, PerformerList, RatingList, CommentsList, PicUrlList, size, url):
    """Scrape album info from every page of a Douban doulist into the given lists.

    Fetches ``url + "0"``, ``url + "25"``, ... for ``size + 1`` pages (25 items
    per page) and appends one entry per album to each caller-supplied list.
    The hard-coded string slices (``st[9:-7]`` etc.) strip the fixed
    whitespace/markup surrounding each field in Douban's HTML — TODO confirm
    they still match the current page layout.
    """
    for page in range(size + 1):
        newUrl = url + str(page * 25)  # doulist paging: ?start=0, 25, 50, ...
        text = getHTMLText(newUrl)
        soup = BeautifulSoup(text, "html.parser")
        for item in soup.find_all(class_="title"):  # album names
            st = item.a.string
            AlbumNamesList.append(st[9:-7])
        for item in soup.find_all(class_="abstract"):  # performer names
            st = item.contents[0]
            PerformerList.append(st[24:-13])
        for item in soup.find_all(class_="rating_nums"):  # ratings
            st = str(item.string)
            if st == "None":  # unrated albums have no number; substitute 0.0
                st = "0.0"
            RatingList.append(st)
        for item in soup.find_all(class_="ft"):  # editor comments
            div_class = item.div.attrs['class'][0]  # renamed: avoid shadowing builtin ``type``
            if div_class != "comment-item":
                st = " \n"  # placeholder for entries without a comment
            else:
                st = item.div.blockquote.contents[2]
            CommentsList.append(st)
        for item in soup.find_all(class_="post"):  # cover image URLs
            st = item.a.img.attrs["src"]
            PicUrlList.append(st)
def scratchTitle(url):
    """Return the doulist page's <title> text (used to name the output file).

    Also echoes the title to the console.
    """
    html = getHTMLText(url)
    parsed = BeautifulSoup(html, "html.parser")
    page_title = parsed.head.title.string
    print(page_title)
    return page_title
def printList(ls1, ls2, ls3, ls4):
    """Print a numbered record (album, rating, performer, comment) per entry.

    ``zip`` stops at the shortest list, matching the original's
    ``min(len(...))`` bound.
    """
    records = zip(ls1, ls2, ls3, ls4)
    for number, (album, performer, rating, comment) in enumerate(records, start=1):
        print("No.", number)
        print("Album: " + album)
        print("Rate: " + rating)
        print("Performer: " + performer)
        print("Comment: " + comment)
def writeList(ls1, ls2, ls3, ls4, title):
    """Write the scraped info to "<title>.txt", one numbered record per album.

    Only ``min(len(...))`` records are written, so partially scraped lists
    never raise IndexError. The redundant explicit ``f.close()`` was removed —
    the ``with`` block already closes the file.
    """
    length = min(len(ls1), len(ls2), len(ls3), len(ls4))
    name = title + ".txt"
    with open(name, "w", encoding='utf-8') as f:
        for i in range(length):
            f.write("No." + str(i + 1) + "\n")
            f.write("Album: 《" + ls1[i] + "》\n")
            f.write("Rate: " + ls3[i] + "\n")
            f.write("Performer: " + ls2[i] + "\n")
            f.write("Comment: " + ls4[i] + "\n")
def main():
    """Scrape the doulist, drop unwanted entries, then write and print results."""
    AlbumNamesList = []
    PerformerList = []
    CommentsList = []
    RatingList = []
    PicUrlList = []
    # Doulist URL rewritten to end in "start=" so page offsets can be appended.
    url = "https://www.douban.com/doulist/73799/?start="
    title = scratchTitle(url + "0")
    size = getSize(url)
    scratchInfo(AlbumNamesList, PerformerList, RatingList, CommentsList, PicUrlList, size, url)
    deleteList = [0]  # indices of doulist entries to exclude from the output
    DeleteElement(deleteList, AlbumNamesList, PerformerList, CommentsList, PicUrlList)
    writeList(AlbumNamesList, PerformerList, RatingList, CommentsList, title)
    # downloadPicture(PicUrlList, AlbumNamesList)  # optionally download cover art
    printList(AlbumNamesList, PerformerList, RatingList, CommentsList)


if __name__ == "__main__":
    # Guard the entry point so importing this module doesn't trigger a scrape.
    main()