Scraping a Douban music doulist with Python: save the info to a text file and download the album covers

  • Language: Python 3.5.5
  • Libraries used: requests and BeautifulSoup (both can be installed with pip)
  • What it does: scrapes a doulist's entries, saves them to a text file, and downloads the album cover images
  • Limitations: only music doulists are supported; book and movie doulists are not
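
If the two libraries are not installed yet, a typical setup (assuming pip is available on your PATH) is:

pip install requests beautifulsoup4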

The key function, scratchInfo:

def scratchInfo(AlbumNamesList, PerformerList, RatingList, CommentsList, PicUrlList, size, url):
    for i in range(size + 1):
        newUrl = url + str(i * 25)   # doulist pages step by 25 items: start=0, 25, 50, ...
        text = getHTMLText(newUrl)
        soup = BeautifulSoup(text, "html.parser")
        for item in soup.find_all(class_="title"):   # album names
            st = item.a.string
            AlbumNamesList.append(st[9:-7])   # fixed offsets strip the surrounding whitespace
        for item in soup.find_all(class_="abstract"):   # performer names
            st = item.contents[0]
            PerformerList.append(st[24:-13])   # fixed offsets strip the leading label and whitespace
        for item in soup.find_all(class_="rating_nums"):   # ratings
            st = str(item.string)
            if st == "None":   # unrated albums get a placeholder score
                st = "0.0"
            RatingList.append(st)
        for item in soup.find_all(class_="ft"):   # comments
            divClass = item.div.attrs["class"][0]   # renamed to avoid shadowing the built-in type
            if divClass != "comment-item":   # this entry has no comment
                st = " \n"
            else:
                st = item.div.blockquote.contents[2]
            CommentsList.append(st)
        for item in soup.find_all(class_="post"):   # album cover image URLs
            st = item.a.img.attrs["src"]
            PicUrlList.append(st)
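
The fixed slicing offsets above (st[9:-7], st[24:-13]) work because Douban wraps the text nodes in a fixed amount of whitespace and label text; they will break if the page layout changes. A minimal illustration of why the slicing is needed, against hypothetical markup resembling a doulist entry:

from bs4 import BeautifulSoup

sample = """
<div class="title"><a href="#">
        Revolver
    </a></div>
<div class="rating_nums">9.2</div>
"""
soup = BeautifulSoup(sample, "html.parser")
print(repr(soup.find(class_="title").a.string))  # the whitespace survives, hence the fixed-offset slicing
print(soup.find(class_="rating_nums").string)    # 9.2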

Full code (the example doulist is 【MCB杂志上评出】 百张史上最具革命性的专辑, "the 100 most revolutionary albums in history, as picked by MCB magazine"):

import requests
from bs4 import BeautifulSoup

def getHTMLText(url):    # fetch the full HTML text of a page
    try:
        r = requests.get(url, timeout=30)   # time out rather than hang on a dead connection
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        print("Error")
        return ""


def downloadPicture(PicUrlList, AlbumNamesList):
    di = "D://PythonRequestsDownload//"   # directory to save the album covers (must already exist)
    length = len(PicUrlList)
    for i in range(length):
        url = PicUrlList[i]
        r = requests.get(url, timeout=30)
        path = di + AlbumNamesList[i] + ".jpg"
        with open(path, "wb") as f:
            f.write(r.content)
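
One caveat: album titles can contain characters that Windows forbids in file names (\ / : * ? " < > |), which would make the open() call above fail. A small sanitizing helper is one way around this; the function below is my own sketch, not part of the original script:

import re

def safeFileName(name):
    # replace characters Windows forbids in file names with an underscore
    return re.sub(r'[\\/:*?"<>|]', "_", name)

Building the path as di + safeFileName(AlbumNamesList[i]) + ".jpg" would make the download step more robust.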

def getSize(url):
    text = getHTMLText(url)
    soup = BeautifulSoup(text, "html.parser")
    ls = soup.find_all(class_="doulist-filter")
    countText = ls[0].span.text   # renamed to avoid shadowing the built-in str
    num = countText[1:-1]         # strip the surrounding parentheses from the item count
    size = int(num) // 25         # pages hold 25 items each; scratchInfo loops range(size + 1)
    print(size)
    return size

def DeleteElement(numList, AlbumNamesList, PerformerList, RatingList, CommentsList, PicUrlList):
    # remove the doulist entries whose indices are in numList; delete in
    # descending order so earlier deletions don't shift the later indices,
    # and trim all five lists so they stay aligned
    for i in sorted(numList, reverse=True):
        del AlbumNamesList[i]
        del PerformerList[i]
        del RatingList[i]
        del CommentsList[i]
        del PicUrlList[i]
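
Why delete in descending order? Removing by ascending index shifts every later element down by one, so subsequent indices point at the wrong items. A quick illustration:

ls = ["a", "b", "c", "d"]
for i in sorted([0, 2], reverse=True):  # remove "a" and "c"
    del ls[i]
print(ls)  # ['b', 'd']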


def scratchInfo(AlbumNamesList, PerformerList, RatingList, CommentsList, PicUrlList, size, url):
    for i in range(size + 1):
        newUrl = url + str(i * 25)   # doulist pages step by 25 items: start=0, 25, 50, ...
        text = getHTMLText(newUrl)
        soup = BeautifulSoup(text, "html.parser")
        for item in soup.find_all(class_="title"):   # album names
            st = item.a.string
            AlbumNamesList.append(st[9:-7])   # fixed offsets strip the surrounding whitespace
        for item in soup.find_all(class_="abstract"):   # performer names
            st = item.contents[0]
            PerformerList.append(st[24:-13])   # fixed offsets strip the leading label and whitespace
        for item in soup.find_all(class_="rating_nums"):   # ratings
            st = str(item.string)
            if st == "None":   # unrated albums get a placeholder score
                st = "0.0"
            RatingList.append(st)
        for item in soup.find_all(class_="ft"):   # comments
            divClass = item.div.attrs["class"][0]   # renamed to avoid shadowing the built-in type
            if divClass != "comment-item":   # this entry has no comment
                st = " \n"
            else:
                st = item.div.blockquote.contents[2]
            CommentsList.append(st)
        for item in soup.find_all(class_="post"):   # album cover image URLs
            st = item.a.img.attrs["src"]
            PicUrlList.append(st)


def scratchTitle(url):
    text = getHTMLText(url)
    soup = BeautifulSoup(text, "html.parser")
    title = soup.head.title.string  # the doulist name, used to name the output text file
    print(title)
    return title


def printList(albums, performers, ratings, comments):   # print the scraped info to the console
    length = min(len(albums), len(performers), len(ratings), len(comments))
    for i in range(length):
        print("No.", i + 1)
        print("Album: " + albums[i])
        print("Rate: " + ratings[i])
        print("Performer: " + performers[i])
        print("Comment: " + comments[i])


def writeList(albums, performers, ratings, comments, title):  # write the scraped info to a text file
    length = min(len(albums), len(performers), len(ratings), len(comments))
    name = title + ".txt"
    with open(name, "w", encoding="utf-8") as f:   # the with-block closes the file automatically
        for i in range(length):
            f.write("No." + str(i + 1) + "\n")
            f.write("Album: 《" + albums[i] + "》\n")
            f.write("Rate: " + ratings[i] + "\n")
            f.write("Performer: " + performers[i] + "\n")
            f.write("Comment: " + comments[i] + "\n")
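
For reference, each record in the resulting .txt file looks like this (the values here are illustrative, not taken from the actual doulist):

No.1
Album: 《Revolver》
Rate: 9.2
Performer: The Beatles
Comment: ...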


def main():
    AlbumNamesList = list()
    PerformerList = list()
    CommentsList = list()
    RatingList = list()
    PicUrlList = list()
    url = "https://www.douban.com/doulist/73799/?start="  # the doulist to scrape, rewritten to end with "start="
    title = scratchTitle(url + "0")
    size = getSize(url)
    scratchInfo(AlbumNamesList, PerformerList, RatingList, CommentsList, PicUrlList, size, url)
    deleteList = [0]     # indices of doulist entries to exclude
    DeleteElement(deleteList, AlbumNamesList, PerformerList, RatingList, CommentsList, PicUrlList)   # drop them from every list
    writeList(AlbumNamesList, PerformerList, RatingList, CommentsList, title)    # write to the text file
    # downloadPicture(PicUrlList, AlbumNamesList)  # download the album covers
    printList(AlbumNamesList, PerformerList, RatingList, CommentsList)    # print the doulist info to the console


if __name__ == "__main__":
    main()
