这学期新增了一门课程《Python数据清洗与可视化》,这门课以大作业的形式作为期末考核。下面先展示运行效果:
1、运行main.py文件的窗体展示效果图
2、点击更新榜单的展示效果
3、点击数据分析总图的展示效果
# Fetch the HTML source of a page.
def htmlContent(url, timeout=10):
    """Download *url* and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block forever on a stalled connection.

    Returns:
        The page source as a ``str``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # A desktop browser user-agent is sent with every request.
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"
    }
    resp = requests.get(url, headers=headers, timeout=timeout)
    # Fail here instead of handing an error page to the XPath parsing code.
    resp.raise_for_status()
    return resp.content.decode("utf-8")
# All comic categories, e.g. read from
# https://www.kuaikanmanhua.com/ranking/6
def get_type(url):
    """Return the ranking categories linked from the page at *url*.

    The first entry matched by the XPath queries is skipped; every remaining
    one becomes ``{"name": <link text>, "link": <absolute URL>}``.
    """
    tree = lxml.html.etree.HTML(htmlContent(url))
    type_names = tree.xpath('/html/body/div/div[2]/div/div/div[2]/div[1]/ul/li/a/text()')  # category names
    type_links = tree.xpath('/html/body/div/div[2]/div/div/div[2]/div[1]/ul/li/a/@href')  # matching relative links
    return [
        {
            "name": type_names[pos],
            "link": "https://www.kuaikanmanhua.com" + type_links[pos],
        }
        for pos in range(1, len(type_links))
    ]
# Given a comic's detail-page link, read its view count and like count.
def detail(url):
    """Return ``{"dianzan": <likes text>, "peopleNum": <views text>}`` for *url*.

    Both values are the stripped text of the first node matched by the
    corresponding XPath query; an ``IndexError`` is raised when a query
    matches nothing.
    """
    page = lxml.html.etree.HTML(htmlContent(url))
    viewer_nodes = page.xpath('/html/body/div/div[2]/div/div/div[2]/div/div[1]/div[1]/div[2]/div[3]/div[2]/span[1]/text()')
    viewers = viewer_nodes[0].strip()  # view count
    like_nodes = page.xpath('/html/body/div/div[2]/div/div/div[2]/div/div[1]/div[1]/div[2]/div[3]/div[2]/span[2]/text()')
    likes = like_nodes[0].strip()  # likes
    return {"dianzan": likes, "peopleNum": viewers}
# Gather every ranking entry and persist the result.
def _people_num_in_yi(text):
    """Convert a view-count label to a float expressed in units of 亿 (1e8).

    ``"1.2亿"`` -> ``1.2``, ``"3456万"`` -> ``0.3456``, ``"5000"`` -> ``0.00005``.
    The text is parsed with ``float`` instead of ``eval`` because it comes
    from a scraped web page.

    Raises:
        ValueError: If the numeric part cannot be parsed.
    """
    text = text.strip()
    if text.endswith("亿"):
        return float(text[:-1])
    if text.endswith("万"):
        return float(text[:-1]) / 10000
    # A bare number is a raw head count; scale it to the same unit as above.
    return float(text) / 100000000


def getList(url):
    """Scrape every ranking category reachable from *url* and save the rows.

    For each category returned by ``get_type`` the ranking page is parsed,
    and for each comic on it the detail page is fetched once to read likes
    and view count. Each row is
    ``[rank, category, title, author, state, likes, views]`` where ``views``
    is a float in units of 亿.

    Side effects: prints every row, then writes ``bangdan.csv`` and
    ``bangdan.json`` in the current directory.

    Raises:
        IndexError: If the title/author/state/link lists scraped from a
            ranking page do not line up.
        ValueError: If a view-count label is not numeric.
    """
    rows = []
    for category in get_type(url):
        type_name = category["name"]  # 1. comic category
        page = lxml.html.etree.HTML(htmlContent(category["link"]))
        titles = page.xpath(
            '/html/body/div/div[2]/div/div/div[2]/div[2]/div/a/div[2]/div[1]/span/span/text()')  # all comic titles
        authors = page.xpath('/html/body/div/div[2]/div/div/div[2]/div[2]/div/a/div[2]/div[2]/text()')  # all authors
        states = page.xpath(
            '/html/body/div/div[2]/div/div/div[2]/div[2]/div/a/div[2]/div[@class="trend"]/text()')  # all trend states
        comic_links = page.xpath('/html/body/div/div[2]/div/div/div[2]/div[2]/div/a/@href')  # all detail links

        for pos, raw_title in enumerate(titles):
            title = raw_title.strip()  # 2. comic title
            author = authors[pos].strip()  # 3. author
            state = states[pos].strip()  # 4. trend state
            if state == '':
                state = '未上榜'
            # Fetch the detail page once and reuse it for both fields.
            stats = detail("https://www.kuaikanmanhua.com" + comic_links[pos])
            dianzan = stats["dianzan"]  # 5. likes
            people_num = _people_num_in_yi(stats["peopleNum"])  # 6. views

            row = [pos + 1, type_name, title, author, state, dianzan, people_num]
            print(row)
            rows.append(row)

    columns = ['序号', '类型', '漫画名称', '作者', '漫画状态', '点赞', '观看人数']
    pd.DataFrame(columns=columns, data=rows).to_csv('bangdan.csv')
    with open("bangdan.json", 'w', encoding='utf-8') as f:
        f.write(json.dumps(rows, ensure_ascii=False))
    print("写入数据成功!")
# Read the JSON file cached on disk so the data can be displayed quickly.
def get_localData():
    """Load ``bangdan.json`` from the current directory.

    Every top-level element of the JSON document is printed and the elements
    are returned as a list.
    """
    with open("bangdan.json", 'r', encoding='utf-8') as cache:
        records = list(json.load(cache))
    for record in records:
        print(record)
    return records
def _build_type_bar(type_counts):
    """Build the bar chart of how many ranked comics fall in each category."""
    return (
        Bar(init_opts=opts.InitOpts(height="450px", width="900px"))
        .add_xaxis(list(type_counts.index))
        .add_yaxis("类型", list(type_counts))
        .set_global_opts(
            title_opts=opts.TitleOpts(title="漫画榜单类型统计图"),
            datazoom_opts=[opts.DataZoomOpts(), opts.DataZoomOpts(type_="inside")],
        )
    )


def _build_state_pie(state_counts):
    """Build the ring chart of how many comics are in each trend state."""
    return (
        Pie(init_opts=opts.InitOpts(height="450px", width="600px"))
        .add(
            "漫画状态",
            [list(z) for z in zip(state_counts.index, list(state_counts))],
            radius=["40%", "75%"],
            label_opts=opts.LabelOpts(is_show=False, position="center"),
        )
        .set_global_opts(
            title_opts=opts.TitleOpts(title="漫画状态"),
            legend_opts=opts.LegendOpts(orient="vertical", pos_top="15%", pos_left="2%", is_show=False),
        )
        .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
    )


def _build_views_scatter(data):
    """Build the scatter chart of view count against rank number."""
    return (
        EffectScatter(init_opts=opts.InitOpts(height="450px", width="600px"))
        .add_xaxis(
            xaxis_data=data['序号'])
        .add_yaxis(
            series_name="观看人数",
            y_axis=data['观看人数'],
            symbol_size=15,
            label_opts=opts.LabelOpts(is_show=False),
        )
        .set_series_opts()
        .set_global_opts(
            title_opts=opts.TitleOpts(title="观看人数(亿)-排名分析"),
            xaxis_opts=opts.AxisOpts(
                type_="value", splitline_opts=opts.SplitLineOpts(is_show=True)
            ),
            yaxis_opts=opts.AxisOpts(
                type_="value",
                axistick_opts=opts.AxisTickOpts(is_show=True),
                splitline_opts=opts.SplitLineOpts(is_show=True),
            ),
            tooltip_opts=opts.TooltipOpts(is_show=True),
        )
    )


def run():
    """Analyse ``bangdan.csv`` and render the pyecharts HTML reports.

    Reads ``./bangdan.csv``, writes the intermediate tables ``leixing.csv``,
    ``zhuangtai.csv`` and ``peoplenum.csv``, and renders four HTML files:
    ``漫画榜单类型统计图.html``, ``漫画状态展示图.html``, ``漫画观看排名图.html`` and the
    combined page ``热门小说分析.html``. Intermediate results are printed.

    Each chart that is rendered on its own and also placed on the combined
    page is built twice from the same builder, so the stand-alone render and
    the page use separate chart objects.
    """
    # Load the scraped ranking table.
    data = pd.read_csv('./bangdan.csv')

    # Comics per category.
    datas = data['类型'].value_counts()
    pd.DataFrame(data=datas).to_csv('leixing.csv')

    # Comics per trend state.
    datazt = data['漫画状态'].value_counts()
    pd.DataFrame(data=datazt).to_csv('zhuangtai.csv')

    # Comic titles feeding the word cloud.
    dataci = data['漫画名称']
    print(dataci)
    # NOTE(review): remove_markers is not defined in this part of the file;
    # presumably it strips punctuation from each title - confirm.
    word_list = remove_markers(list(dataci))
    print(word_list)

    # Total views per author, highest first. The table is written to CSV and
    # read back, which turns the author index into an ordinary column.
    datazzzs = data[['作者', '观看人数']]
    datazzzs = datazzzs.groupby('作者').agg({'观看人数': 'sum'}).sort_values(by='观看人数', ascending=False)
    pd.DataFrame(data=datazzzs).to_csv('peoplenum.csv')
    datazzpm = pd.read_csv('./peoplenum.csv').head(10)
    print(datazzpm)

    # Segment the titles with jieba and keep only words tagged a / vd / n.
    words_list = []
    for line in word_list:
        words_list.extend(word for word, flag in pseg.cut(line, use_paddle=True) if flag in ['a', 'vd', 'n'])
    c1 = Counter(words_list)
    print(c1)

    a = _build_type_bar(datas)
    _build_type_bar(datas).render("漫画榜单类型统计图.html")

    b = (
        WordCloud(init_opts=opts.InitOpts(height="450px", width="900px"))
        .add(series_name="热点分析", data_pair=c1.most_common(), word_size_range=[22, 66])
        .set_global_opts(
            title_opts=opts.TitleOpts(
                title="热点分析", title_textstyle_opts=opts.TextStyleOpts(font_size=23)
            ),
            tooltip_opts=opts.TooltipOpts(is_show=True),
        )
    )

    c = _build_state_pie(datazt)
    _build_state_pie(datazt).render("漫画状态展示图.html")

    d = (
        Funnel(init_opts=opts.InitOpts(height="450px", width="600px"))
        .add(
            "作者",
            # The funnel value is the author's rank (1..10), not the view total.
            [list(z) for z in zip(list(datazzpm['作者']), datazzpm.index + 1)],
            sort_="ascending",
            label_opts=opts.LabelOpts(position="inside"),
        )
        .set_global_opts(
            title_opts=opts.TitleOpts(title="作者-观看人数天梯榜(Top10)"),
            legend_opts=opts.LegendOpts(is_show=False)
        )
    )

    e = _build_views_scatter(data)
    _build_views_scatter(data).render("漫画观看排名图.html")

    # Combined dashboard with all five charts.
    (
        Page(page_title="热门小说分析", layout=Page.SimplePageLayout)
        .add(a)
        .add(b)
        .add(c)
        .add(d)
        .add(e)
        .render("热门小说分析.html")
    )
(三)、GUI窗体类实现代码
class uiob:
    """Tkinter window that shows the cached ranking table and opens the charts."""

    def clear_tree(self, tree):
        """Remove every row currently shown in *tree*."""
        for item in tree.get_children():
            tree.delete(item)

    def add_tree(self, list, tree):
        """Append each row of *list* to *tree*.

        Args:
            list: Iterable of row value sequences (the parameter name is kept
                for backward compatibility although it shadows the builtin).
            tree: The ``ttk.Treeview`` to fill.
        """
        for subList in list:
            tree.insert('', 'end', values=subList)
        # NOTE(review): ui_process lays the tree out with pack(); confirm that
        # re-managing it with grid() here is intended.
        tree.grid()

    def searh(self):
        """Reload the table from the local cache (handler of the refresh button)."""
        self.clear_tree(self.treeview)  # empty the table
        self.B_0['text'] = '正在努力搜索'
        try:
            rows = get_localData()  # read the locally cached data
            self.add_tree(rows, self.treeview)  # show it in the tree
        finally:
            # Restore the button even when loading fails, so it is never left
            # showing the "searching" caption.
            self.B_0['state'] = NORMAL
            self.B_0['text'] = '更新榜单'

    def center_window(self, root, w, h):
        """Size *root* to ``w`` x ``h`` pixels and centre it on the screen."""
        # Screen width and height.
        ws = root.winfo_screenwidth()
        hs = root.winfo_screenheight()
        # Top-left corner that centres the window.
        x = (ws / 2) - (w / 2)
        y = (hs / 2) - (h / 2)
        root.geometry('%dx%d+%d+%d' % (w, h, x, y))

    def click1(self):
        """Open the combined analysis page in the default browser."""
        webbrowser.open("热门小说分析.html")

    def click2(self):
        """Open the category bar chart in the default browser."""
        webbrowser.open("漫画榜单类型统计图.html")

    def click3(self):
        """Open the state pie chart in the default browser."""
        webbrowser.open("漫画状态展示图.html")

    def click4(self):
        """Open the views-vs-rank scatter chart in the default browser."""
        webbrowser.open("漫画观看排名图.html")

    def ui_process(self):
        """Build the main window and run the Tk event loop until it closes."""
        root = Tk()  # create the main window
        self.root = root
        root.title("快看漫画各大榜单数据展示")  # window title
        self.center_window(root, 900, 350)
        root.resizable(0, 0)  # fixed window size
        root['highlightcolor'] = 'yellow'
        labelframe = LabelFrame(root, width=900, height=350, background="white")
        labelframe.place(x=5, y=5)
        self.labelframe = labelframe

        # Logo image.
        photo = tk.PhotoImage(file="kuaikan.png")
        Lab = tk.Label(root, image=photo, )
        Lab.place(x=10, y=10)

        # Chart buttons. Each command hands the bound method itself to
        # thread_it (as the refresh button below does); calling the method
        # inside the lambda would run it on the UI thread and pass None.
        B_1 = Button(labelframe, text="数据分析总图", background="white")
        B_1.place(x=300, y=25, width=150, height=50)
        self.B_1 = B_1
        B_1.configure(command=lambda: thread_it(self.click1))

        B_2 = Button(labelframe, text="榜单类型展示图", background="white")
        B_2.place(x=500, y=5, width=100, height=25)
        self.B_2 = B_2
        B_2.configure(command=lambda: thread_it(self.click2))

        B_3 = Button(labelframe, text="状态展示图", background="white")
        B_3.place(x=500, y=35, width=100, height=25)
        self.B_3 = B_3
        B_3.configure(command=lambda: thread_it(self.click3))

        B_4 = Button(labelframe, text="观看数量分析图", background="white")
        B_4.place(x=500, y=65, width=100, height=25)
        self.B_4 = B_4
        B_4.configure(command=lambda: thread_it(self.click4))

        # Refresh button.
        B_0 = Button(labelframe, text="更新榜单", background="white")
        B_0.place(x=700, y=25, width=150, height=50)
        self.B_0 = B_0
        B_0.configure(command=lambda: thread_it(self.searh))

        # Frames holding the table (left) and its scrollbar (right).
        frame_root = Frame(labelframe)
        frame_l = Frame(frame_root)
        frame_r = Frame(frame_root)
        self.frame_root = frame_root
        self.frame_l = frame_l
        self.frame_r = frame_r

        # Table: one column per scraped field, with its pixel width.
        columns = ("序号", "类型", "漫画名称", "作者", "漫画状态", "点赞", "观看人数")
        treeview = ttk.Treeview(frame_l, height=10, show="headings", columns=columns)
        treeview.column("序号", width=50, anchor='center')
        treeview.column("类型", width=50, anchor='center')
        treeview.column("漫画名称", width=200, anchor='center')
        treeview.column("作者", width=300, anchor='center')
        treeview.column("漫画状态", width=80, anchor='center')
        treeview.column("点赞", width=75, anchor='center')
        treeview.column("观看人数", width=75, anchor='center')
        treeview.heading("序号", text="序号")  # header captions
        treeview.heading("类型", text="类型")
        treeview.heading("漫画名称", text="漫画名称")
        treeview.heading("作者", text="作者")
        treeview.heading("漫画状态", text="漫画状态")
        treeview.heading("点赞", text="点赞")
        treeview.heading("观看人数", text="观看人数")

        # Vertical scrollbar linked to the table.
        vbar = ttk.Scrollbar(frame_r, command=treeview.yview)
        treeview.configure(yscrollcommand=vbar.set)
        treeview.pack()
        self.treeview = treeview
        vbar.pack(side=RIGHT, fill=Y)
        self.vbar = vbar

        # Frame placement.
        frame_l.grid(row=0, column=0, sticky=NSEW)
        frame_r.grid(row=0, column=1, sticky=NS)
        frame_root.place(x=10, y=100)

        root.mainloop()  # show the window and process events
(四)、主函数
# Script entry point: create the GUI object and hand control to its Tk loop.
if __name__ == '__main__':
    ui = uiob()
    ui.ui_process()
这次的课程设计收获颇多,它综合运用了 Python 爬虫、pyecharts 可视化以及 GUI 窗体。在这个小项目中,除了获取数据的爬虫代码无法复用以外,其余代码均可复用。
源码地址:https://download.csdn.net/download/m0_51992766/75546646