python爬取豆瓣读书并进行图形化分析

python爬取豆瓣读书并进行图形化分析

  1. 豆瓣读书网页数据爬取并保存至csv
  2. 对数据进行分析并绘成图形
  3. 绘制散点图
  4. 图形效果展示

以下代码大多是团队小伙伴的杰作,本人只是为了让更多的人学到知识、从而能更好地解决问题,才将整个Python项目完整发出。

目录结构如下:

建立一个豆瓣总文件夹,然后只需在总文件夹下面建立一个Image文件夹(用来存放从豆瓣下载的图片,名称需与代码中的保存路径 Image/ 保持一致),以及douban.py、display.py、Work.py三个文件。其它文件在代码运行之后都会自动生成。

python爬取豆瓣读书并进行图形化分析_第1张图片

 

1、豆瓣读书网页数据爬取并保存至csv

douban.py代码如下:

'''
   项目:豆瓣读书分析
   日期:2020/12/14
'''
import requests
from lxml import etree
import time
import csv

# Request headers: a desktop Chrome User-Agent string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/73.0.3683.103 Safari/537.36'
    ),
}

def douban_booksrank(url):
    """Scrape one Douban doulist page and store every book found on it.

    For each list item the title, score, comment-count text, author,
    publisher, publication year and cover-image URL are extracted. One row
    is appended to ``douban_books.csv`` and the cover image is written to
    ``Image/<first three characters of the title>.png``.

    Args:
        url: Address of the doulist page to download.

    An item that lacks one of the expected fields is reported on stdout and
    skipped. A cover image that cannot be written to disk is reported on
    stdout; the CSV row for that book is still kept.
    """
    res = requests.get(url, headers=headers)
    selector = etree.HTML(res.text)
    # One node per book entry on the page.
    contents = selector.xpath('//div[@class="article"]/div[contains(@class,"doulist-item")]')
    for content in contents:
        try:
            title = content.xpath('div/div[2]/div[3]/a/text()')[0].strip()  # book title

            # Some books have no rating, which leaves this list empty;
            # fall back to a default score of 9.0 in that case.
            scores = content.xpath('div/div[2]/div[4]/span[2]/text()')
            score = scores[0] if scores else '9.0'
            comments = content.xpath('div/div[2]/div[4]/span[3]/text()')[0]  # comment-count text

            author = content.xpath('div/div[2]/div[5]/text()[1]')[0].strip()
            # Drop the leading 3-character "author" label by slicing.
            real_author = author[3:]
            publishment = content.xpath('div/div[2]/div[5]/text()[2]')[0].strip()
            # Presumably drops a 4-character "publisher" label -- confirm against the page.
            real_publishment = publishment[4:]
            pub_year = content.xpath('div/div[2]/div[5]/text()[3]')[0].strip()
            # Presumably drops a 4-character "publication year" label -- confirm against the page.
            real_year = pub_year[4:]
            img_url = content.xpath('div/div[2]/div[2]/a/img/@src')[0].strip()  # cover image URL
        except IndexError:
            # An xpath query returned nothing, so the item does not have the expected layout.
            print('该条数据格式太长,下载失败!')
            continue

        img = requests.get(img_url)  # fetch the cover image bytes
        # Only the first three characters of the title are used as the file
        # name, so books sharing that prefix overwrite each other's image.
        img_name_file = 'Image/{}.png'.format(title[:3])

        # newline='' keeps the csv module from emitting blank lines between rows.
        with open('douban_books.csv', 'a', encoding='utf-8-sig', newline='') as fp:
            writer = csv.writer(fp)
            writer.writerow((title, score, comments, real_author, real_publishment, real_year, img_url))

        # A title can contain characters that are not valid in a file name.
        # OSError is caught because it also covers FileNotFoundError, whereas
        # `except FileNotFoundError or OSError` only ever caught FileNotFoundError.
        try:
            with open(img_name_file, 'wb') as imgf:
                imgf.write(img.content)
        except OSError:
            print('图片文件写入位置错误!')
def main():
    """Create ``douban_books.csv`` with its header row, then scrape every list page.

    The list is requested 25 entries at a time through the ``start`` query
    parameter (offsets 0, 25, ..., 275, i.e. 12 pages), and each page is
    handed to ``douban_booksrank``.
    """
    # The last query parameter is "&sub_type="; the text "⊂_type=" that was
    # here before is what results when "&sub" is decoded as an HTML entity.
    urls = [
        'https://www.douban.com/doulist/1264675/?start={}&sort=time&playable=0&sub_type='.format(start)
        for start in range(0, 300, 25)
    ]
    # Mode 'w' truncates any previous output before the header row is written.
    with open('douban_books.csv', mode='w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(('title', 'score', 'comment', 'author', 'publishment', 'pub_year', 'img_url'))
    # Visit every page and run the scraper on it.
    for url in urls:
        douban_booksrank(url)


if __name__ == '__main__':
    main()

2、display.py代码:绘制饼图、直方图、散点图

'''
展示分析数据,利用pyecharts库——————生成直观,炫丽的图表
'''
from pyecharts.charts import Bar,Pie,Polar
import pyecharts.options as opts
import pandas as pd
from pyecharts.globals import ThemeType
# Charts built from douban_books.csv: publication-year pie chart, two bar
# charts about comment counts, and a polar scatter chart of the scores.
def count_by_age(years_contain, current_year=2020):
    """Count publication years by their distance from ``current_year``.

    Args:
        years_contain: Iterable of publication years as integers.
        current_year: Year the ages are measured from.

    Returns:
        ``[within 5 years, more than 5 and up to 10 years, more than 10 years]``.
    """
    recent_5 = 0
    recent_10 = 0
    late_10 = 0
    for year in years_contain:
        age = current_year - year
        if age <= 5:
            recent_5 += 1
        elif age <= 10:
            recent_10 += 1
        else:
            late_10 += 1
    return [recent_5, recent_10, late_10]


def count_by_comment_range(new_comment):
    """Count comment totals per range: <=2000, 2000-4000, 4000-6000, >6000.

    Args:
        new_comment: Iterable of comment counts as integers.

    Returns:
        A list with the four counts in the order given above.
    """
    less_and2000 = 0
    less_4000_greater_2000 = 0
    less_6000_greater_4000 = 0
    greater_6000 = 0
    for number in new_comment:
        if number <= 2000:
            less_and2000 += 1
        elif number <= 4000:
            less_4000_greater_2000 += 1
        elif number <= 6000:
            less_6000_greater_4000 += 1
        else:
            greater_6000 += 1
    return [less_and2000, less_4000_greater_2000, less_6000_greater_4000, greater_6000]


def parse_pub_year(raw):
    """Return the leading four-digit year of a ``pub_year`` cell, or None.

    The cell is converted to text and stripped; the first four characters
    must all be decimal digits. Anything else (a year spelled out in words,
    an empty cell read as NaN, ...) yields None instead of raising.
    """
    text = str(raw).strip()[:4]
    if len(text) == 4 and text.isdecimal():
        return int(text)
    return None


def parse_comment_count(raw):
    """Return the integer inside a ``comment`` cell, or None.

    The first character and the last four characters are dropped, which
    presumably removes wrapping such as "(" and "人评价)" -- confirm against
    the scraped data. If what remains is not a plain number, None is
    returned instead of raising.
    """
    text = str(raw).strip()[1:-4]
    if text.isdecimal():
        return int(text)
    return None


def draw_pie(years_contain, current_year=2020):
    """Render the publication-year pie chart to ``书籍出版年份占比.html``.

    Args:
        years_contain: Publication years as integers.
        current_year: Year the book ages are measured from.
    """
    y_data = count_by_age(years_contain, current_year)
    x_data = ["出版于近5年", "出版于近5-10年", "出版于10前年"]
    data_pair = [list(z) for z in zip(x_data, y_data)]
    # Slices are ordered from the smallest to the largest count.
    data_pair.sort(key=lambda x: x[1])

    (
        Pie(init_opts=opts.InitOpts(width="1600px", height="800px", bg_color="#2c343c"))
        .add(
            series_name="占百分比",
            data_pair=data_pair,
            rosetype="radius",
            radius="55%",
            center=["50%", "50%"],
            label_opts=opts.LabelOpts(is_show=False, position="center"),
        )
        .set_global_opts(
            title_opts=opts.TitleOpts(
                title="书籍出版年份分布",
                pos_left="center",
                pos_top="20",
                title_textstyle_opts=opts.TextStyleOpts(color="#fff"),
            ),
            legend_opts=opts.LegendOpts(is_show=False),
        )
        .set_series_opts(
            tooltip_opts=opts.TooltipOpts(
                # "<br/>" puts the series name on its own tooltip line.
                trigger="item", formatter="{a} <br/>{b}: {c} ({d}%)"
            ),
            label_opts=opts.LabelOpts(color="rgba(255, 255, 255, 0.3)"),
        )
        .render("书籍出版年份占比.html")
    )


def draw_years_and_comment(years_contain, new_comment):
    """Render the year-vs-comment-count bar chart to ``出版年份与评论数量的关系.html``.

    Args:
        years_contain: Publication years; entry ``k`` belongs to the same
            book as ``new_comment[k]``.
        new_comment: Comment counts.

    The two lists are sorted together by year, so every bar keeps the count
    of its own book, and the caller's lists are left unmodified.
    """
    pairs = sorted(zip(years_contain, new_comment), key=lambda pair: pair[0])
    years = [year for year, _ in pairs]
    counts = [count for _, count in pairs]
    (
        Bar({"theme": ThemeType.MACARONS})
        .add_xaxis(years)
        .add_yaxis("评论数", counts)
        .set_global_opts(
            title_opts={"text": "Bar-出版年份与评论数量的关系", "subtext": "用于评论增长趋势"}
        )
        .render("出版年份与评论数量的关系.html")
    )


def draw_comments_distribution(new_comment):
    """Render the comment-count distribution bar chart to ``书籍评论数分布.html``.

    Args:
        new_comment: Comment counts as integers.
    """
    (
        Bar()
        .add_xaxis(['<=2000', '2000-4000', '4000-6000', '>6000'])
        .add_yaxis("评论数", count_by_comment_range(new_comment))
        .set_global_opts(
            title_opts=opts.TitleOpts(title="书籍评论数分布"),
            brush_opts=opts.BrushOpts(),
        )
        .render("书籍评论数分布.html")
    )


def radar_map(new_score):
    """Render the polar scatter chart of the scores to ``雷达散点图.html``.

    Args:
        new_score: Iterable of numeric scores.

    Each point is ``(position in new_score, score - 9)``, with the offset
    formatted to one decimal place.
    """
    data = [
        (index, '{:.1f}'.format(score - 9))
        for index, score in enumerate(new_score)
    ]
    (
        Polar()
        .add("", data, type_="scatter", label_opts=opts.LabelOpts(is_show=False))
        .set_global_opts(title_opts=opts.TitleOpts(title="图书评分雷达散点图", subtitle='基于9.0评分'))
        .render("雷达散点图.html")
    )


def main():
    """Read ``douban_books.csv`` and render all four charts."""
    all_data = pd.read_csv('douban_books.csv')

    raw_years = all_data['pub_year'].values.tolist()
    raw_comments = all_data['comment'].values.tolist()

    years_contain = []    # every publication year that could be parsed
    new_comment = []      # every comment count that could be parsed
    paired_years = []     # years of the rows where both values parsed
    paired_comments = []  # comment counts of those same rows
    for raw_year, raw_comment in zip(raw_years, raw_comments):
        year = parse_pub_year(raw_year)
        count = parse_comment_count(raw_comment)
        if year is not None:
            years_contain.append(year)
        if count is not None:
            new_comment.append(count)
        if year is not None and count is not None:
            paired_years.append(year)
            paired_comments.append(count)

    # Pie chart of publication years.
    draw_pie(years_contain)
    # Year vs. comment count; only rows with both values, so the lists stay aligned.
    draw_years_and_comment(paired_years, paired_comments)
    # Distribution of comment counts.
    draw_comments_distribution(new_comment)
    # Polar scatter chart of every score in the CSV.
    new_score = all_data.score
    radar_map(new_score)


if __name__ == '__main__':
    main()

3、Work.py代码:pygame 鼠标点击生成扩散圆圈的动画(原标题为“出版年份与评论数量的关系”,与下方代码内容不符)

import pygame, random, sys, time

# Click animation: every mouse click starts a randomly coloured circle outline
# at the cursor that grows step by step; at most 10 circles grow at a time.
pygame.init()
screen = pygame.display.set_mode([600, 400])
screen.fill((255, 255, 255))
radiusr = 0  # not used below; kept so the module-level names stay unchanged
arrradiusr = [0] * 10  # current radius of each circle
arraddradiusr = [0] * 10  # last radius increment of each circle
arrradiusbool = [False] * 10  # whether the circle in this slot is active (True) or free (False)
arrradiusx = [0] * 10  # x coordinate of each circle centre
arrradiusy = [0] * 10  # y coordinate of each circle centre
RGBx = [0] * 10  # red component of each circle colour
RGBy = [0] * 10  # green component of each circle colour
RGBz = [0] * 10  # blue component of each circle colour

while True:
    time.sleep(0.1)  # one animation step every 0.1 s
    for event in pygame.event.get():  # handle pending events
        if event.type == pygame.MOUSEBUTTONDOWN:
            # Start a circle only when a slot is free: list.index(False)
            # raises ValueError once all 10 slots are in use, so the click
            # is ignored in that case.
            if False in arrradiusbool:
                num = arrradiusbool.index(False)  # first free slot
                arrradiusbool[num] = True  # mark the slot as active
                arrradiusr[num] = 0  # the new circle starts with radius 0
                arrradiusx[num], arrradiusy[num] = pygame.mouse.get_pos()  # centre = cursor position
                RGBx[num] = random.randint(0, 255)  # random colour
                RGBy[num] = random.randint(0, 255)
                RGBz[num] = random.randint(0, 255)
                pygame.draw.circle(screen, pygame.Color(RGBx[num], RGBy[num], RGBz[num]),
                                   (arrradiusx[num], arrradiusy[num]), arrradiusr[num], 1)
        if event.type == pygame.QUIT:
            pygame.quit()
            sys.exit()
    for i in range(10):
        if not arrradiusbool[i]:  # free slot: nothing to draw
            continue
        # The size limit is drawn anew on every step, so each circle stops
        # growing at a random radius.
        if arrradiusr[i] < random.randint(10, 50):
            arraddradiusr[i] = random.randint(0, 5)  # random growth for this step
            arrradiusr[i] += arraddradiusr[i]
            pygame.draw.circle(screen, pygame.Color(RGBx[i], RGBy[i], RGBz[i]),
                               (arrradiusx[i], arrradiusy[i]), arrradiusr[i], 1)
        else:
            arrradiusbool[i] = False  # the circle reached its limit: free the slot
    pygame.display.update()

4、效果图展示:

 

python爬取豆瓣读书并进行图形化分析_第2张图片


 

python爬取豆瓣读书并进行图形化分析_第3张图片

 


 


python爬取豆瓣读书并进行图形化分析_第4张图片

python爬取豆瓣读书并进行图形化分析_第5张图片

 

 

 

你可能感兴趣的:(python)