Static Web Scraping + Word Frequency Statistics + Word Cloud

Table of Contents

  • Requirements
  • Feature 1
  • Feature 2
  • Feature 3
  • Feature 4

Requirements

  1. Save the information for each web page to the file urlList.txt in the given format, one page per line (script name: getYXDTLists.py).
  2. Read the URL entries from urlList.txt, scrape the text of each linked page, and save it paragraph by paragraph to gushi/title.txt (one file per article, named after its title).
  3. Read all the text files in the gushi directory, count word frequencies, and print the 20 most frequent words.
  4. Read all the text files in the gushi directory, tokenize the text, and output a word cloud.

Feature 1

import urllib.request
from bs4 import BeautifulSoup

# Given a list of URLs, return the HTML source of each page
def getURL(urls):
    contents = []
    for url in urls:
        res = urllib.request.urlopen(url)
        contents.append(res.read().decode('utf-8'))
        # print(contents)
    return contents


# Given the page sources, parse each listing and return a set of "href,title" strings
def resolveHtml(contents):
    lists = set()
    for content in contents:
        soup = BeautifulSoup(content, "html.parser")
        divs = soup.find_all('div', {'class': 'list-main-warp'})
        # print(len(divs))
        lis = divs[0].find_all('li')
        # print(len(lis))
        for li in lis:
            one = ""
            # link (href)
            url = li.find_all('a')[0].get("href")
            # title
            title = li.find_all('a')[0].get("title")
            one = url + "," + title
            lists = lists | {one}
    return lists


# saveInfo:
#   input:    lists, filename
#   function: write each entry of lists to the file filename, one per line
def saveInfo(lists, filename):
    with open(filename, "w", encoding="utf-8") as fp:
        for i in lists:
            fp.write(i + "\n")


def main():
    urls = ["https://www.hist.edu.cn/index/sy/kygs/{}.htm".format(str(i)) for i in range(1, 4)]
    urls.append("https://www.hist.edu.cn/index/sy/kygs.htm")
    # print(urls)
    contents = getURL(urls)
    lists = resolveHtml(contents)
    filename = "urlList.txt"
    saveInfo(lists, filename)


if __name__ == "__main__":
    main()
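
Each line that saveInfo writes to urlList.txt is simply the link's href and title joined by a comma. A quick sanity-check sketch, assuming the script above has already produced urlList.txt in the working directory:

# Preview the saved "href,title" entries
with open("urlList.txt", encoding="utf-8") as fp:
    lines = fp.read().splitlines()
print(len(lines), "entries saved")
print(lines[:3])  # first few entries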

Feature 2

import os
import urllib.request
import shutil
from bs4 import BeautifulSoup

# Read urlList.txt, fetch each article page, and save its paragraphs to a <title>.txt file under path
def getText(path):
    lists = set()
    with open('urlList.txt', 'r', encoding='utf-8') as fp:
        lists = fp.readlines()
    os.chdir(path)
    for line in lists:
        one = line.split(',')
        title = one[1].strip(" ").strip("\n") + ".txt"
        with open(title, 'w', encoding='utf-8') as fp:
            # rebuild the absolute URL from the relative href saved in urlList.txt
            url = one[0]
            url_one = url.split("/")
            if len(url_one) == 6:
                url = "https://www.hist.edu.cn/" + url_one[3] + "/" + url_one[4] + "/" + url_one[5]
            else:
                url = "https://www.hist.edu.cn/" + url_one[2] + "/" + url_one[3] + "/" + url_one[4]
            res = urllib.request.urlopen(url)
            content = res.read().decode('utf-8')
            soup = BeautifulSoup(content, "html.parser")
            # the id of the article-body div differs between pages, so try both
            divs = soup.find_all('div', id='vsb_content_501')
            if len(divs) > 0:
                paragraphs = divs[0].find_all('p')
            else:
                divs = soup.find_all('div', id='vsb_content')
                paragraphs = divs[0].find_all('p')
            for i in paragraphs:
                fp.write(i.text)

# Recreate the output directory: remove it if it already exists, then create it fresh
def buildDir(path):
    if os.path.exists(path):
        shutil.rmtree(path)
    os.mkdir(path)


def main():
    path = r'E:\python\project\Task1\gushi'
    buildDir(path)
    getText(path)

if __name__ == "__main__":
    main()
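
The URL reconstruction in getText assumes the relative hrefs saved in urlList.txt always contain a fixed number of path segments. As an alternative sketch (the relative link below is a hypothetical placeholder), urllib.parse.urljoin can resolve ../-style links against the listing page they were found on:

from urllib.parse import urljoin

base = "https://www.hist.edu.cn/index/sy/kygs.htm"  # listing page the href came from
relative = "../../info/1102/12345.htm"  # placeholder relative href
print(urljoin(base, relative))  # -> https://www.hist.edu.cn/info/1102/12345.htm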

Feature 3

import jieba
import re
import os

path = r'E:\python\project\venv\Task1\gushi'
files = os.listdir(path)
all_text = list()
text = list()
for file in files:
    os.chdir(path)
    with open(file, "r", encoding='utf-8') as f:  # read one text file
        text = f.readlines()
    all_text = all_text + text
all_text = re.sub(r"[A-Za-z0-9:·\-,。“”?\n]", "", str(all_text))  # strip letters, digits, and punctuation
ls = jieba.lcut(all_text)  # tokenize with jieba
ls = filter(lambda word: len(word) > 1, ls)  # keep only words longer than one character
count = dict()
# count word frequencies
for word in ls:
    count[word] = count.get(word, 0) + 1
# print(count)
item = list(count.items())
item.sort(key=lambda x: x[1], reverse=True)  # sort the (word, count) pairs by frequency, high to low
item = item[:20]  # keep the top 20
print("Top 20 words by frequency:")
print(item)
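
The manual counting loop above can also be written with collections.Counter from the standard library; a minimal self-contained sketch (the sample string is a placeholder, not text from the gushi files):

import jieba
from collections import Counter

sample = "静态网页爬取、词频统计与词云图的综合练习"  # placeholder text
words = [w for w in jieba.lcut(sample) if len(w) > 1]  # drop single-character tokens
print(Counter(words).most_common(20))  # top 20 (word, count) pairs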

Feature 4

import jieba
import wordcloud
import imageio.v2 as imageio
import re
import os

path = r'E:\python\project\venv\数据211-20211574129-钦佳燕\Task1\gushi'
files = os.listdir(path)
all_text = list()
text = list()
for file in files:
    os.chdir(path)
    with open(file, "r", encoding='utf-8') as f:  # read one text file
        text = f.readlines()
    all_text = all_text + text
all_text = re.sub(r"[A-Za-z0-9:·\-,。“”?\n]", "", str(all_text))  # strip letters, digits, and punctuation
ls = jieba.lcut(all_text)  # tokenize with jieba
ls = filter(lambda word: len(word) > 1, ls)  # keep only words longer than one character
# join the words with spaces, as WordCloud.generate expects space-separated text
txt = " ".join(ls)
# go back to the parent directory
os.chdir("..")
# mask image that controls the shape of the generated word cloud
mask = imageio.imread('pikaqiu.png')
# 2000 px wide, 1400 px high, Microsoft YaHei font, 'cool' colormap, white background, shaped by the mask
w = wordcloud.WordCloud(width=2000, height=1400,
                        font_path="msyh.ttc", colormap='cool',
                        background_color='white', mask=mask)
w.generate(txt)
w.to_file('a.png')  # save the generated image

The shape of the word cloud can be adjusted to your own needs by swapping in a different mask image.
[Figure: the generated word cloud]
Below are some commonly used methods of the libraries involved in the features above.
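
For instance, a WordCloud object can also build an image directly from a frequency dict via generate_from_frequencies, which pairs naturally with the word counts produced in Feature 3. A minimal sketch (the frequency values and the output filename below are placeholders, and msyh.ttc is assumed to be available locally):

import wordcloud

# Placeholder counts; in practice this could be the `count` dict from Feature 3
freqs = {"科研": 12, "学院": 9, "项目": 7}
w = wordcloud.WordCloud(width=2000, height=1400,
                        font_path="msyh.ttc",
                        background_color="white")
w.generate_from_frequencies(freqs)
w.to_file("from_freqs.png")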
