So clearly, all we need to do is extract these tags, and we will have the jokes from Qiushibaike.
2 Data Processing
First, we load the content we need into Beautiful Soup.
# Import the Beautiful Soup package
from bs4 import BeautifulSoup
# Feed the page source we just saved in content into Beautiful Soup
soup = BeautifulSoup(content, 'lxml')
# First, pull out every tag whose class is 'article block untagged mb15 typs_hot':
divs = soup.find_all(class_='article block untagged mb15 typs_hot')
Next, we need to pull the span out of each of these divs.
First remove the last line (the debug print) to avoid unnecessary output, then extract the span inside each div.
# Pull the joke text out of each div
for div in divs:
    joke = div.span.get_text()
    print(joke)
    print("------")
3 Multi-Page Data
Here is the complete code. The whole crawler is only a few dozen lines. Have you got the hang of it?
#!/usr/bin/env python
# -*- coding:utf-8 -*-
__author__ = 'JackFeng'
# @Time : 20/12/20 23:18
# @Author : JackFeng
# @FileName: pySprider.py
# @Software: PyCharm
# @Blog :http://www.a2data.cn/
import requests
from bs4 import BeautifulSoup
"""
# Required dependencies
pip install requests
pip install beautifulsoup4
"""
# Request headers
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'}
# Fix the part of the URL that never changes; then we only need to append a page number each time
base_url = 'https://www.qiushibaike.com/text/page/'
# Loop so that num takes the values 2 through 8 (pages 2-8)
for num in range(2, 9):
    print('第{}页'.format(num))
    r1 = requests.get(base_url + str(num), headers=headers)  # build the URL for this page
    # The rest is the same as the earlier code
    content = r1.text
    # print(content)
    # Feed the page source saved in content into Beautiful Soup
    soup = BeautifulSoup(content, 'lxml')
    divs = soup.find_all(class_='article block untagged mb15 typs_hot')
    # We could print divs to see what it looks like
    # print(divs)
    for div in divs:
        joke = div.span.get_text()
        print(joke)
        print("------")
2. Data Mining
1. Data Storage
Store the crawled data so that we can build better-looking, more interesting, and more useful visualizations later.
Improve the crawler output: also capture the author, the comment count, and the "funny" vote count.
# -*- coding: utf-8 -*-
from urllib import parse
import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.66',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Referer': 'https://www.qiushibaike.com/text/',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
}
next_page = "/text/page/1/"
while next_page:
    response = requests.get(url=parse.urljoin('https://www.qiushibaike.com', next_page), headers=headers)
    html = etree.HTML(response.content)
    infos = html.xpath("//div[contains(@class,'article block untagged mb15')]")
    for one in infos:
        content = one.xpath("string(.//div[@class='content'])")
        vote = one.xpath(".//div[@class='stats']/span[@class='stats-vote']//i/text()")
        vote = vote[0] if vote else 0
        comments = one.xpath(".//div[@class='stats']/span[@class='stats-comments']//i/text()")
        comments = comments[0] if comments else 0
        cmt_main = "".join(one.xpath(".//div[@class='cmtMain']/div[@class='main-text']/text()")).strip()
        item = {
            "content": content.strip(),
            "vote": vote,
            "comments": comments,
            "cmt_main": cmt_main,
        }
        print(item)
        print("*" * 100)
        # Append the crawl results to wordCloud.txt for the word cloud later
        with open('wordCloud.txt', 'a', encoding='utf-8') as a:
            a.write(item['content'])
    next_page = html.xpath("//span[@class='next']/../@href")
    next_page = next_page[0] if next_page else None
    print(next_page)
The crawled content is appended to the wordCloud.txt file for the word-cloud step.
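The improvement list above also mentions the author, which this version does not extract yet. A possible addition inside the for loop, after item is built, is sketched below; the XPath is an assumption about the page's author markup, not something verified against the original article.
# Hypothetical author extraction; the XPath is an assumption about the page layout
author = one.xpath(".//div[contains(@class,'author')]//h2/text()")
author = author[0].strip() if author else ""
item["author"] = author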
2. NLP Word Segmentation
stopWords: downloaded from the web, with some extra terms added by us (DsTeam Kath).
import pandas as pd
import numpy
import jieba

# Remove stop words
wordCount = open('wordCloud.txt', encoding='utf-8').read()
stopwords = pd.read_csv('stopWords.txt', index_col=False, quoting=3, sep="\t",
                        names=['stopword'], encoding='utf-8')  # stopWords downloaded from the web, plus our own additions
words = []
wordCount = jieba.lcut(wordCount)
for word in wordCount:
    if len(word) > 1 and word != '\r\n':
        words.append(word)
wordCount = pd.DataFrame({'words': words})
# print(wordCount.count())
wordCount = wordCount[~wordCount.words.isin(stopwords.stopword)]  # keep only the words that are NOT in the stop-word list
print(wordCount.count())
# Count word frequencies
wordStat = wordCount.groupby('words').agg(计数=pd.NamedAgg(column='words', aggfunc=numpy.size)).reset_index().sort_values(by='计数', ascending=False)
print(wordStat.head(20))
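Since the pyecharts snippets below all read from wordStat, it can be handy to persist the frequency table as well. A small sketch; the wordStat.csv file name is our own choice.
# Save the word-frequency table so the visualization scripts can reload it later
wordStat.to_csv('wordStat.csv', index=False, encoding='utf-8-sig')
# later: wordStat = pd.read_csv('wordStat.csv', encoding='utf-8-sig')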
3. Data Visualization
1. Building the Word Cloud
# Install the word-cloud library
pip install wordcloud
# jieba for NLP word segmentation
pip install jieba
# Build the word cloud
from wordcloud import WordCloud, ImageColorGenerator
import jieba
import imageio

f = open('wordCloud.txt', encoding='utf-8').read()
f = ' '.join(jieba.lcut(f))
# WordCloud expects a collection of stop words, so split the file into a set
stopwords = set(open('stopWords.txt', encoding='utf-8').read().split())
# Word-cloud background image
background = imageio.imread("qiubai.jpeg")
image_colors = ImageColorGenerator(background)
w = WordCloud(
    mask=background,
    width=690,
    height=560,
    font_path=r'C:\Windows\Fonts\simhei.ttf',  # swap in any font you like
    scale=5,
    stopwords=stopwords)
w.generate(f)
w.to_file('qiubaiWordCloud.png')
# word_counts = collections.Counter(object_list)
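The commented-out line above hints at counting frequencies with collections.Counter instead of pandas. A small sketch of that alternative, reusing the stopwords set defined above:
import collections
import jieba

# Segment the text again and drop one-character tokens and stop words
object_list = [word for word in jieba.lcut(open('wordCloud.txt', encoding='utf-8').read())
               if len(word) > 1 and word not in stopwords]
word_counts = collections.Counter(object_list)
print(word_counts.most_common(20))  # top 20 words and their counts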
2. Plotting with pyecharts
# Word cloud (pyecharts 0.x API)
from pyecharts import WordCloud

wordcloud = WordCloud(width=1300, height=620)
wordcloud.add("", wordStat['words'], wordStat['计数'], word_size_range=[20, 100])
wordcloud  # in a Jupyter notebook this line renders the chart inline
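The snippets in this section use the pyecharts 0.x API. If you have pyecharts 1.x or later installed, the word cloud looks roughly like the sketch below; note that 1.x expects the data as (word, count) pairs and writes the chart to an HTML file with render().
from pyecharts.charts import WordCloud
from pyecharts import options as opts

# pyecharts 1.x+: pass a list of (word, count) pairs; cast numpy ints to plain ints for JSON serialization
data = [(w, int(c)) for w, c in zip(wordStat['words'], wordStat['计数'])]
wc = (
    WordCloud()
    .add("", data, word_size_range=[20, 100])
    .set_global_opts(title_opts=opts.TitleOpts(title="糗事百科-DataScience"))
)
wc.render("wordcloud.html")  # open the generated HTML file in a browser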
3. Charts
# coding=utf-8
from __future__ import unicode_literals
# Draw the chart (pyecharts 0.x API)
from pyecharts import Bar

bar = Bar("糗事百科", "词频分布(DataScience)")
bar.add("分词", wordStat['words'], wordStat['计数'])
# bar.print_echarts_options()  # only prints the chart options; handy for debugging
bar.render()  # writes a local HTML file (render.html by default)
bar  # in a Jupyter notebook this line renders the chart inline
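For completeness, the equivalent bar chart under the pyecharts 1.x+ API would look roughly like this sketch; the top-20 slice is our own choice to keep the axis readable.
from pyecharts.charts import Bar
from pyecharts import options as opts

top = wordStat.head(20)
bar = (
    Bar()
    .add_xaxis(list(top['words']))
    .add_yaxis("分词", [int(v) for v in top['计数']])  # cast numpy ints for JSON serialization
    .set_global_opts(title_opts=opts.TitleOpts(title="糗事百科", subtitle="词频分布(DataScience)"))
)
bar.render("bar.html")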
4. Bar
# Example
from pyecharts import Bar

attr = ["{}".format(i) for i in wordStat['words']]
v1 = wordStat['计数']
bar = Bar("糗事百科-DataScience")
bar.add("词频分布", attr, v1, mark_line=["average"], mark_point=["max", "min"])
bar
5. Pie Chart
# Pie chart
from pyecharts import Pie

# Too many words to plot them all, so take just the top 20
w = wordStat.head(20)
attr = w['words']
v1 = w['计数']
pie = Pie("糗事百科-玫瑰图示例-DataScience", title_pos='center', width=900)
pie.add("词频分布", attr, v1, center=[25, 50], is_random=True, radius=[30, 75], rosetype='radius')
pie.add("词频分布", attr, v1, center=[75, 50], is_random=True, radius=[30, 75], rosetype='area',
        is_legend_show=False, is_label_show=True)
pie