目录结构如下:
建立一个豆瓣项目总文件夹,然后只需在总文件夹下建立一个 Image 文件夹(用来存放从豆瓣下载的图片,文件夹名需与代码中的 Image/ 路径一致),以及 douban.py、display.py、Work.py 三个脚本。其它文件在代码运行之后都会自动生成。
'''
项目:豆瓣读书分析
日期:2020/12/14
'''
import requests
from lxml import etree
import time
import csv
# HTTP request headers: a desktop Chrome User-Agent string, passed to
# requests.get() when douban_booksrank() fetches a list page.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
}
def douban_booksrank(url):
    """Scrape one doulist page, append its books to the CSV and save the covers.

    For every ``doulist-item`` node on the page the title, score, comment
    text, author, publisher, publication year and cover URL are extracted.
    One row per book is appended to ``douban_books.csv`` and the cover image
    is written to ``Image/<first 3 characters of the title>.png``.

    An item that lacks one of the expected nodes raises ``IndexError``
    internally; it is reported on stdout and skipped. A failure to write the
    image file is reported on stdout as well. Network errors raised by
    ``requests`` are not caught here.

    Args:
        url: Address of the doulist page to fetch.
    """
    # Seconds; prevents a stalled connection from hanging the whole crawl.
    timeout = 10
    res = requests.get(url, headers=headers, timeout=timeout)
    selector = etree.HTML(res.text)
    # One node per book entry; this is what the loop iterates over.
    contents = selector.xpath('//div[@class="article"]/div[contains(@class,"doulist-item")]')
    for content in contents:
        try:
            title = content.xpath('div/div[2]/div[3]/a/text()')[0].strip()  # book title
            scores = content.xpath('div/div[2]/div[4]/span[2]/text()')  # rating
            # Some books have no rating, which leaves the list empty;
            # fall back to the default score of 9.0 in that case.
            score = scores[0] if scores else '9.0'
            comments = content.xpath('div/div[2]/div[4]/span[3]/text()')[0]  # comment count text
            author = content.xpath('div/div[2]/div[5]/text()[1]')[0].strip()  # author
            # Slice off the leading "author" label (3 characters).
            real_author = author[3:]
            publishment = content.xpath('div/div[2]/div[5]/text()[2]')[0].strip()  # publisher
            # Presumably strips a 4-character label prefix -- verify against the page.
            real_publishment = publishment[4:]
            pub_year = content.xpath('div/div[2]/div[5]/text()[3]')[0].strip()  # publication date
            real_year = pub_year[4:]
            img_url = content.xpath('div/div[2]/div[2]/a/img/@src')[0].strip()  # cover image URL
            img = requests.get(img_url, timeout=timeout)  # fetch the cover for the download below
            # Image path; only the first 3 characters of the title are used as the name.
            img_name_file = 'Image/{}.png'.format(title[:3])
            # Append the row to the CSV; newline='' avoids blank lines between rows.
            with open('douban_books.csv', 'a', encoding='utf-8-sig', newline='') as fp:
                writer = csv.writer(fp)
                writer.writerow((title, score, comments, real_author, real_publishment, real_year, img_url))
            # Save the cover. The file name comes from the title and may be invalid.
            # OSError already covers FileNotFoundError; the original
            # "FileNotFoundError or OSError" expression only caught the former.
            try:
                with open(img_name_file, 'wb') as imgf:
                    imgf.write(img.content)
            except OSError:
                print('图片文件写入位置错误!')
        except IndexError:
            print('该条数据格式太长,下载失败!')
def main():
    """Crawl every page of the doulist into a fresh ``douban_books.csv``.

    The CSV is truncated and given a header row first; each page is then
    handed to ``douban_booksrank`` which appends its rows.
    """
    # 12 pages in total: the start offset steps by 25 from 0 up to 275.
    # The query string uses "&sub_type="; the "&sub" had been mangled into the
    # single character U+2282, which produced an invalid URL parameter.
    urls = [
        'https://www.douban.com/doulist/1264675/?start={}&sort=time&playable=0&sub_type='.format(i)
        for i in range(0, 300, 25)
    ]
    # Write the CSV header row (mode 'w' discards any previous run's data).
    with open('douban_books.csv', mode='w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(('title', 'score', 'comment', 'author', 'publishment', 'pub_year', 'img_url'))
    # Visit every page and run the scraper on it.
    for url in urls:
        douban_booksrank(url)


if __name__ == '__main__':
    main()
'''
展示分析数据,利用pyecharts库——————生成直观,炫丽的图表
'''
from pyecharts.charts import Bar,Pie,Polar
import pyecharts.options as opts
import pandas as pd
from pyecharts.globals import ThemeType
def draw_pie(years_contain, reference_year=2020):
    """Render a rose-type pie chart of how long ago the books were published.

    Each year is placed in one of three buckets by its distance from
    ``reference_year``: at most 5 years, more than 5 and at most 10 years,
    or more than 10 years. The chart is written to ``书籍出版年份占比.html``.

    Args:
        years_contain: Iterable of publication years as integers.
        reference_year: Year the ages are measured from. Defaults to 2020,
            the value that was previously hard-coded.
    """
    recent_5 = 0
    recent_10 = 0
    late_10 = 0
    for year in years_contain:
        age = reference_year - year
        if age <= 5:
            recent_5 += 1
        elif age <= 10:
            recent_10 += 1
        else:
            late_10 += 1

    # The last label previously read "出版于10前年", a character transposition.
    x_data = ["出版于近5年", "出版于近5-10年", "出版于10年前"]
    y_data = [recent_5, recent_10, late_10]
    data_pair = [list(z) for z in zip(x_data, y_data)]
    # Ascending by count, so slices are added from smallest to largest.
    data_pair.sort(key=lambda x: x[1])
    (
        Pie(init_opts=opts.InitOpts(width="1600px", height="800px", bg_color="#2c343c"))
        .add(
            series_name="占百分比",
            data_pair=data_pair,
            rosetype="radius",
            radius="55%",
            center=["50%", "50%"],
            label_opts=opts.LabelOpts(is_show=False, position="center"),
        )
        .set_global_opts(
            title_opts=opts.TitleOpts(
                title="书籍出版年份分布",
                pos_left="center",
                pos_top="20",
                title_textstyle_opts=opts.TextStyleOpts(color="#fff"),
            ),
            legend_opts=opts.LegendOpts(is_show=False),
        )
        .set_series_opts(
            tooltip_opts=opts.TooltipOpts(
                # The "<br/>" line break had been lost, leaving the string
                # literal split over two source lines (a syntax error).
                trigger="item", formatter="{a} <br/>{b}: {c} ({d}%)"
            ),
            label_opts=opts.LabelOpts(color="rgba(255, 255, 255, 0.3)"),
        )
        .render("书籍出版年份占比.html")
    )
def draw_years_and_comment(years_contain, new_comment):
    """Render a bar chart of comment counts ordered by publication year.

    ``years_contain[i]`` is treated as the year belonging to
    ``new_comment[i]``. The pairs are sorted together by year so every bar
    keeps its own count; sorting the years alone (as was done before) attached
    counts to the wrong years. The caller's lists are left unmodified. If the
    two sequences differ in length, the surplus tail of the longer one is
    ignored. The chart is written to ``出版年份与评论数量的关系.html``.

    Args:
        years_contain: Publication years, one per book.
        new_comment: Comment counts, in the same order as ``years_contain``.
    """
    # sorted() is stable, so books from the same year keep their input order.
    pairs = sorted(zip(years_contain, new_comment), key=lambda pair: pair[0])
    years = [year for year, _ in pairs]
    counts = [count for _, count in pairs]
    (
        Bar({"theme": ThemeType.MACARONS})
        .add_xaxis(years)
        .add_yaxis("评论数", counts)
        .set_global_opts(
            title_opts={"text": "Bar-出版年份与评论数量的关系", "subtext": "用于评论增长趋势"}
        )
        .render("出版年份与评论数量的关系.html")
    )
def draw_comments_distribution(new_comment):
    """Render a bar chart showing how many books fall in each comment-count range.

    The ranges are: up to 2000, above 2000 up to 4000, above 4000 up to 6000,
    and everything else. The chart is written to ``书籍评论数分布.html``.

    Args:
        new_comment: Iterable of per-book comment counts.
    """
    upper_bounds = (2000, 4000, 6000)
    # One counter per range; the extra last slot collects whatever matches
    # none of the upper bounds.
    tally = [0] * (len(upper_bounds) + 1)
    for count in new_comment:
        slot = len(upper_bounds)
        for position, bound in enumerate(upper_bounds):
            if count <= bound:
                slot = position
                break
        tally[slot] += 1

    chart = Bar()
    chart.add_xaxis(['<=2000', '2000-4000', '4000-6000', '>6000'])
    chart.add_yaxis("评论数", tally)
    chart.set_global_opts(
        title_opts=opts.TitleOpts(title="书籍评论数分布"),
        brush_opts=opts.BrushOpts(),
    )
    chart.render("书籍评论数分布.html")
def radar_map(new_score):
    """Render a polar scatter chart of the scores measured relative to 9.0.

    Every score has 9 subtracted from it and is formatted as text with one
    decimal place; the points are ``(index, formatted offset)`` pairs. The
    chart is written to ``雷达散点图.html``.

    Args:
        new_score: Iterable of numeric book scores.
    """
    # Offsets from the 9.0 baseline, kept as one-decimal strings.
    offsets = ['{:.1f}'.format(value - 9) for value in new_score]
    data = list(enumerate(offsets))

    chart = Polar()
    chart.add("", data, type_="scatter", label_opts=opts.LabelOpts(is_show=False))
    chart.set_global_opts(
        title_opts=opts.TitleOpts(title="图书评分雷达散点图", subtitle='基于9.0评分')
    )
    chart.render("雷达散点图.html")
def _leading_year(raw):
    """Return the year at the start of a ``pub_year`` cell, or None if unusable."""
    # Missing cells come back from pandas as float NaN, not str.
    if not isinstance(raw, str) or len(raw[0:5]) != 5:
        return None
    text = raw[0:5].strip()
    # Skips non-numeric years (e.g. the literal '二零零六') instead of crashing.
    return int(text) if text.isdecimal() else None


def main():
    """Read ``douban_books.csv`` and render all four charts.

    Rows whose publication year cannot be parsed are left out of the two
    year-based charts. The comment-distribution and score charts use every row.
    """
    # Load the scraped data with pandas.
    all_data = pd.read_csv('douban_books.csv')

    # Turn every comment cell into an integer by dropping the first character
    # and the last four. Presumably the cell looks like "(<number>人评价)" --
    # verify against the scraped data.
    new_comment = [int(item[1:-4]) for item in all_data['comment'].values.tolist()]

    # Filter years and comment counts together so that index i in both lists
    # still refers to the same book; filtering the years alone shifted every
    # later count onto the wrong year.
    years_contain = []
    comments_for_years = []
    for raw_year, count in zip(all_data['pub_year'].values.tolist(), new_comment):
        year = _leading_year(raw_year)
        if year is None:
            continue
        years_contain.append(year)
        comments_for_years.append(count)

    # Pie chart of publication years.
    draw_pie(years_contain)
    # Bar chart of publication year against comment count.
    draw_years_and_comment(years_contain, comments_for_years)
    # Bar chart of the comment-count distribution.
    draw_comments_distribution(new_comment)
    # All scores from the CSV, drawn as a polar scatter chart.
    new_score = all_data.score
    radar_map(new_score)


if __name__ == '__main__':
    main()
import pygame, random, sys, time
# Click-to-spawn circle animation: every mouse click starts a randomly
# coloured circle outline at the cursor that grows frame by frame until a
# random size limit is reached. At most 10 circles are tracked at once.
pygame.init()
screen = pygame.display.set_mode([600, 400])
screen.fill((255, 255, 255))
radiusr = 0
arrradiusr = [0] * 10  # radius of each circle
arraddradiusr = [0] * 10  # radius increment of each circle
arrradiusbool = [False] * 10  # whether the circle in this slot exists
arrradiusx = [0] * 10  # x coordinate of each circle
arrradiusy = [0] * 10  # y coordinate of each circle
RGBx = [0] * 10  # first RGB component
RGBy = [0] * 10  # second RGB component
RGBz = [0] * 10  # third RGB component
while True:
    time.sleep(0.1)  # 0.1 s between frames
    for event in pygame.event.get():  # event listener
        # Ignore the click when all 10 slots are busy; list.index(False)
        # would otherwise raise ValueError and end the program.
        if event.type == pygame.MOUSEBUTTONDOWN and False in arrradiusbool:
            num = arrradiusbool.index(False)  # first free slot
            arrradiusbool[num] = True  # mark the slot as in use
            arrradiusr[num] = 0  # start from radius 0
            arrradiusx[num], arrradiusy[num] = pygame.mouse.get_pos()  # cursor position
            RGBx[num] = random.randint(0, 255)  # random colour
            RGBy[num] = random.randint(0, 255)
            RGBz[num] = random.randint(0, 255)
            pygame.draw.circle(screen, pygame.Color(RGBx[num], RGBy[num], RGBz[num]),
                               (arrradiusx[num], arrradiusy[num]), arrradiusr[num], 1)
        if event.type == pygame.QUIT:
            pygame.quit()
            sys.exit()
    for i in range(10):
        if not arrradiusbool[i]:  # nothing to draw for an empty slot
            continue
        # The size limit is re-rolled on every frame.
        if arrradiusr[i] < random.randint(10, 50):
            arraddradiusr[i] = random.randint(0, 5)  # random growth step
            arrradiusr[i] += arraddradiusr[i]
            pygame.draw.circle(screen, pygame.Color(RGBx[i], RGBy[i], RGBz[i]),
                               (arrradiusx[i], arrradiusy[i]), arrradiusr[i], 1)
        else:
            # The circle has reached its limit; free the slot.
            arrradiusbool[i] = False
    pygame.display.update()