Python——爬虫+词云+数据库

         爬取指定新闻网站,对爬取到的数据做词频统计并生成词云图,再绘制相应的词频分析图(饼状图、柱状图、散点图),最后将词频数据存入 MySQL 数据库。

Python——爬虫+词云+数据库_第1张图片

        编译环境:PyCharm

        数据库下载:AppServ8.5

       需用到的模块有:

import time
from PySide2.QtCore import QFile
from PySide2.QtGui import QPixmap
from PySide2.QtWidgets import QApplication, QGraphicsScene, QGraphicsPixmapItem
from PySide2.QtUiTools import QUiLoader
from urllib import request
from bs4 import BeautifulSoup
import wordcloud  # 词云图
import collections
import jieba
import re
import numpy as np
from PIL import Image
import threading    # 多线程
from string import punctuation as Englishpunctuation    # 获取英文标点集合
from zhon.hanzi import punctuation as zhonpunctuation       # 获取中文标点集合
from selenium import webdriver
import csv
import xlwt
import pymysql
import pyecharts.options as opts
from pyecharts.charts import Pie,Bar,EffectScatter
from pyecharts.globals import ThemeType

        主程序:

import time
from PySide2.QtCore import QFile                    # .ui文件获取
from PySide2.QtGui import QPixmap                   # 加载图片
from PySide2.QtWidgets import QApplication, QGraphicsScene, QGraphicsPixmapItem    # 加载图片
from PySide2.QtUiTools import QUiLoader
from urllib import request      # 爬取URL
from bs4 import BeautifulSoup   # 筛取爬完的html文件
import wordcloud  # 词云图
import collections      # 词频计数
import jieba            # 分词
import re
import numpy as np
from PIL import Image   # 加载图片
import threading    # 多线程
from string import punctuation as Englishpunctuation    # 获取英文标点集合
from zhon.hanzi import punctuation as zhonpunctuation       # 获取中文标点集合
import showpie     # 自己定义
from selenium import webdriver  # 用浏览器打开html
import 数据库      # 自己定义
import CSV  # 自己定义
# Runtime files and settings used by this script:
#   - three Qt Designer files: "UI.ui", "denlu.ui", "zhuce.ui"
#   - the word cloud image is saved as "wc.png"
#   - the word-frequency data is saved as "word_counts_topALL.text"
#   - the xls export is saved as "myexcel.xls"
#   - database URL: "http://localhost:8080/phpMyAdmin/"   login: root   password: 12345678   database name: db.db

# Article URLs to crawl, one list per news site.  Every entry ends with an
# explicit comma: without it Python silently concatenates adjacent string
# literals into one (invalid) URL.
url_sina = [
    'https://news.sina.com.cn/c/xl/2022-01-01/doc-ikyakumx7683060.shtml',
    'https://news.sina.com.cn/o/2022-01-01/doc-ikyamrmz2472300.shtml',
    'https://news.sina.com.cn/o/2022-01-01/doc-ikyamrmz2467548.shtml',
    'https://news.sina.com.cn/c/2022-01-01/doc-ikyamrmz2515302.shtml',
    'https://news.sina.com.cn/c/2021-12-31/doc-ikyamrmz2441156.shtml',
    'https://finance.sina.com.cn/roll/2022-01-01/doc-ikyakumx7644198.shtml',
    'https://news.sina.com.cn/c/2021-12-31/doc-ikyakumx7537811.shtml',
    'https://news.sina.com.cn/w/2021-12-30/doc-ikyamrmz2171966.shtml',
    'https://news.sina.com.cn/c/2022-01-01/doc-ikyakumx7666807.shtml',
    'https://news.sina.com.cn/c/xl/2021-12-30/doc-ikyakumx7357600.shtml',
]
url_163 = [
    'https://www.163.com/dy/article/GSG29CEJ05346RC6.html',
    'https://www.163.com/dy/article/GSIEC0U70514R9OJ.html',
    'https://www.163.com/dy/article/GSGOQOQE05346RC6.html',
    'https://www.163.com/dy/article/GSK18CJD0514R9OJ.html',
    'https://www.163.com/news/article/GSIBLSAH000189FH.html',
    'https://www.163.com/dy/article/GSKR7DJ60514R9M0.html',
    'https://www.163.com/dy/article/GSH9FIT90514R9M0.html?clickfrom=w_yw',
    'https://www.163.com/gov/article/GD3TBM6R002399RB.html',
    'https://www.163.com/dy/article/GA4CC6I20512D3VJ.html',
    'https://www.163.com/dy/article/GSKP64C80514R9OJ.html',
]
url_ifeng = [
    'https://news.ifeng.com/c/8CRYT9RnXii',
    'https://news.ifeng.com/c/8CQQcdns5Jg',
    'https://news.ifeng.com/c/8CRl1IGC2vG',
    'https://news.ifeng.com/c/8CRE4AyY0NX',
    'https://finance.ifeng.com/c/8CRE4AyY0P0',
    'https://news.ifeng.com/c/8CCxj8nMaVl',
    'https://news.ifeng.com/c/8CQKzempy4j',
    'https://news.ifeng.com/c/8CRa0Nir1bc',
    'https://news.ifeng.com/c/8CRYT9RnXkL',
    'https://news.ifeng.com/c/8CRU98XbWBp',
]
# Indexed by site: url[0] is Sina, url[1] is NetEase (163), url[2] is ifeng.
url = [url_sina, url_163, url_ifeng]


class UIPython:
    """Main window: crawl a news site, build a word cloud and a frequency chart.

    The widgets are loaded at runtime from "UI.ui".  Clicking ``ciyunButton``
    runs the whole pipeline through :meth:`ciyun`.
    """

    def __init__(self):
        # Load the UI definition from file.
        # NOTE(review): the file is closed before load() is called, exactly as
        # in the original; this presumably relies on QUiLoader reopening the
        # device itself -- confirm.
        qfile = QFile("UI.ui")
        qfile.open(QFile.ReadOnly)
        qfile.close()
        # Dynamically create the window object from the UI definition.
        self.ui = QUiLoader().load(qfile)
        self.ui.ciyunButton.clicked.connect(self.ciyun)

    def writetext(self, url_name, address):
        """Download every page in *url_name* and collect its text in *address*.

        For each page the text of all ``<title>``, ``<content>`` and ``<p>``
        tags is appended to *address*, one tag per line.  The raw HTML is also
        appended to "try.html".  A page that cannot be downloaded is reported
        on stdout and skipped, so one blocked request does not abort the run.

        Args:
            url_name: iterable of page URLs.
            address: path of the output text file; it is truncated first.
        """
        # Start from an empty output file on every run.
        with open(address, mode='w', encoding='utf-8') as f:
            f.write('')
        for page_url in url_name:
            try:
                req = request.Request(page_url)
                req.add_header('User-agent', 'PyMOTW(https://pymotw.com/)')
                with request.urlopen(req) as response:
                    html = response.read().decode('utf-8', 'ignore')
            except Exception:
                # Deliberately broad: any failure (bad URL, anti-crawler
                # block, network error) must only skip this page.
                print("页面加载失败{0}\n".format(page_url))
                continue
            with open('try.html', mode='a', encoding='utf-8') as f:
                f.write(html)
            soup = BeautifulSoup(html, 'lxml')
            with open(address, mode='a', encoding='utf-8') as f:
                # Same order as before: all titles, then contents, then paragraphs.
                for tag_name in ('title', 'content', 'p'):
                    for node in soup.find_all(tag_name):
                        f.write(str(node.text) + '\n')

    def definewc(self):
        """Build a ``wordcloud.WordCloud`` from the options chosen in the UI.

        Returns:
            The configured (not yet generated) WordCloud object.
        """
        # Read the current selections from the UI.ui widgets.
        max_words = self.ui.max_words.currentText()
        max_font_size = self.ui.max_font_size.currentText()
        colormap = self.ui.colormap.currentText()
        background_color = self.ui.background_color.currentText()
        font_path = self.ui.font_path.currentText()
        mask_name = self.ui.maskname.currentText()
        contour_color = self.ui.contour_color.currentText()
        contour_width = self.ui.contour_width.text()

        # Map the displayed font name to a font file; raw strings keep the
        # backslashes literal without invalid-escape warnings.  An unknown
        # name is passed through unchanged, as before.
        font_files = {
            '中文简体': r'fonts\simfang.ttf',
            '方正舒体': r'fonts\FZSTK.TTF',
            '华文行楷': r'fonts\STXINGKA.TTF',
        }
        font_path = font_files.get(font_path, font_path)

        # Map the displayed mask name to an image.  Any other selection means
        # "no mask": handing the raw combo-box text to WordCloud would fail
        # because it expects an array or None.
        mask_files = {
            '中国地图': 'E://语音包//ChinaMap.png',
            '爱丽丝': 'E://语音包//alice_mask.png',
        }
        mask_file = mask_files.get(mask_name)
        mask = np.array(Image.open(mask_file)) if mask_file else None

        return wordcloud.WordCloud(
            font_path=str(font_path),
            background_color=str(background_color),
            mask=mask,
            colormap=str(colormap),
            max_words=int(max_words),          # maximum number of words drawn
            max_font_size=int(max_font_size),  # largest font size
            contour_color=contour_color,
            contour_width=int(contour_width),
        )

    def showwordcloud(self, address):
        """Count word frequencies in *address* and publish the results.

        Steps: clean and segment the text, count the words, render and show
        the word cloud ("wc.png"), store all counts through ``数据库.insertdb``,
        write "word_counts_topALL.text" plus the CSV/XLS exports, render the
        selected chart for the top-N words and open it in a browser from a
        daemon thread.

        Args:
            address: path of the text file produced by :meth:`writetext`.
        """
        with open(address, mode='r', encoding='utf-8') as f:
            string_data = f.read()

        # Pre-processing: drop English and Chinese punctuation in one pass,
        # then drop ASCII letters and digits.
        string_data = string_data.translate(
            str.maketrans('', '', Englishpunctuation + zhonpunctuation))
        string_data = re.sub(r'[a-zA-Z\d]', '', string_data)

        # Segment in jieba's precise mode and filter the custom stop words.
        remove_words = {
            '的', '和', '是', '随着', '对于', '对', '等', '能', '都', '中', '在', '了',
            '通常', '如果', '我们', '需要', '他', '要', '\u3000', '年', '月', '也', '你',
            '\n', ' ', '▎',
        }
        object_list = [
            word
            for word in jieba.cut(string_data, cut_all=False)
            if word not in remove_words
        ]

        # Word-frequency statistics.
        word_counts = collections.Counter(object_list)
        if not word_counts:
            # Nothing was downloaded or everything was filtered out; the
            # word cloud cannot be generated from an empty frequency table.
            print("未提取到任何词语,无法生成词云\n")
            return
        maxword_number = self.ui.number.currentText()
        # Top-N words, N taken from the UI; used for the chart only.
        word_counts_top = word_counts.most_common(int(maxword_number))
        word_counts_topall = word_counts.most_common()
        print(word_counts_top)  # debug output

        # Render the word cloud from the full frequency table.
        wc = self.definewc()
        wc.generate_from_frequencies(word_counts)
        wc.to_file('wc.png')

        # Show the word cloud image inside the GraphView widget.
        self.ui.GraphView.scene_img = QGraphicsScene()
        self.imgShow = QPixmap()
        self.imgShow.load('wc.png')
        self.imgShowItem = QGraphicsPixmapItem()
        self.imgShowItem.setPixmap(self.imgShow)
        self.ui.GraphView.scene_img.addItem(self.imgShowItem)
        self.ui.GraphView.setScene(self.ui.GraphView.scene_img)
        self.ui.GraphView.fitInView(self.imgShowItem)

        # Store the complete result in the MySQL database.
        数据库.insertdb(word_counts_topall)

        # Write the complete result as "word<TAB>count" lines.
        with open("word_counts_topALL.text", mode='w', encoding='utf8') as f:
            for word, count in word_counts_topall:
                f.write(word + '\t' + str(count) + '\n')

        # Export to .csv and .xls.
        CSV.writecsv(word_counts_topall)
        CSV.openxls()

        # Translate the selected colour name to a hex code; other values are
        # passed through unchanged.
        colour_codes = {
            'blue': '#abddff',
            'yellow': '#ffff7f',
            'green': '#7cff9d',
            'red': '#ff0000',
        }
        colour = self.ui.colour.currentText()
        colour = colour_codes.get(colour, colour)

        # Render the selected chart for the top-N words.
        word = [list(pair) for pair in word_counts_top]
        chart_renderers = {
            '饼状图': showpie.pietu,
            '柱状图': showpie.Bartu,
            '散点图': showpie.Scatter,
        }
        render_chart = chart_renderers.get(self.ui.tuxing.currentText())
        if render_chart is not None:
            render_chart(word, colour)

        # Open the chart from a daemon thread so that the blocking browser
        # loop does not freeze the UI.
        thread1 = threading.Thread(target=self.keepdriver, daemon=True)
        thread1.start()

    def keepdriver(self):
        """Open "customized.html" in Edge and reload it in an endless loop.

        Meant to run in a daemon thread; it never returns.
        """
        from pathlib import Path  # local import: only this method needs it

        # The chart module renders "customized.html" relative to the working
        # directory, so resolve it from there instead of a machine-specific
        # absolute path.
        chart_uri = Path('customized.html').resolve().as_uri()
        driver = webdriver.Edge()
        while True:
            driver.get(chart_uri)
            driver.maximize_window()
            time.sleep(1000)

    def ciyun(self):
        """Button handler: crawl the selected news site and show the results."""
        # Displayed site name -> (index into the module-level ``url`` table,
        # file the crawled text is written to).
        sites = {
            '新浪新闻': (0, 'news_sina.html'),
            '网易新闻': (1, 'news_163.html'),
            '凤凰新闻': (2, 'news_ifeng.html'),
        }
        choice = sites.get(self.ui.news_address.currentText())
        if choice is None:
            # Unknown selection: there is nothing to crawl.
            return
        index, address = choice
        self.writetext(url[index], address)
        self.showwordcloud(address)


# 登陆界面
class Denlu:
    """Login window loaded from "denlu.ui"."""

    def __init__(self):
        # Load the UI definition from file.
        qfile = QFile("denlu.ui")
        qfile.open(QFile.ReadOnly)
        qfile.close()
        # Dynamically create the window object from the UI definition.
        self.ui = QUiLoader().load(qfile)
        self.ui.acknowledge.clicked.connect(self.acknowledge)
        self.ui.delete_2.clicked.connect(self.ui.close)
        self.ui.zhuce.clicked.connect(self.zhuce)

    def acknowledge(self):
        """Confirm-button handler: validate the credentials and open the main window."""
        name = self.ui.name.text()
        password = self.ui.password.text()
        # Reject empty fields before touching the database.
        if name == '' or password == '':
            self.ui.Error.setText('Error! Do not enter a user name or password')
            return
        if 数据库.searchusername_password(name, password):
            # Keep a reference on self so the main window is not garbage
            # collected once this handler returns.
            self.state = UIPython()
            self.state.ui.show()
            self.ui.close()
        else:
            self.ui.Error.setText('Error! Incorrect user name or password')

    def zhuce(self):
        """Register-button handler: open the registration window."""
        # Stored under a separate attribute name; assigning to ``self.zhuce``
        # would replace this very method on the instance.
        self.zhuce_window = Zhuce()
        self.zhuce_window.ui.show()
        self.ui.close()


# 注册界面
class Zhuce:
    """Registration window built from "zhuce.ui"."""

    def __init__(self):
        # Read the UI definition and build the window object from it.
        ui_file = QFile("zhuce.ui")
        ui_file.open(QFile.ReadOnly)
        ui_file.close()
        loader = QUiLoader()
        self.ui = loader.load(ui_file)
        self.ui.submit.clicked.connect(self.submit)
        self.ui.delete_2.clicked.connect(self.ui.close)

    def submit(self):
        """Submit-button handler: validate the form and register the user in MySQL."""
        form = self.ui
        username = form.username.text()
        password = form.password.text()
        password1 = form.password1.text()

        # Guard clauses: each failed check reports its error and stops.
        if '' in (username, password, password1):
            form.Error.setText('Error! Do not enter a user name or password')
            return
        if password != password1:
            form.Error.setText('Error! Two times to enter the password do not match')
            return
        # searchusername() is truthy when the name is still free.
        if not 数据库.searchusername(username):
            form.Error.setText('Error! 用户名重复')
            return

        数据库.insertusername(username, password)
        # Return to the login window after a successful registration.
        self.denlu = Denlu()
        self.denlu.ui.show()
        form.close()


# Create the application and show the login window first.
app = QApplication([])
denlu = Denlu()
denlu.ui.show()
# Run the Qt event loop on the main thread; the call blocks until the
# application quits.  (Wrapping ``app.exec_()`` in ``Thread(target=...)``
# executed it inline anyway and then started a thread whose target was the
# integer exit code, which is not callable.)
app.exec_()

        展示分析图片程序:

import pyecharts.options as opts
from pyecharts.charts import Pie,Bar,EffectScatter
from pyecharts.globals import ThemeType
# 图形保存为“customized.html”


# 饼图
def pietu(data_pair, colour):
    """Render the word frequencies as a rose-type pie chart in "customized.html".

    Args:
        data_pair: list of [word, count] items; it is sorted in place by count.
        colour: colour applied to the series labels.
    """
    data_pair.sort(key=lambda x: x[1])  # ascending by count
    (
        # Chart initialisation.
        Pie(init_opts=opts.InitOpts(
            width="900px",
            height="600px",
            theme=ThemeType.MACARONS))
        .add(
            series_name="访问来源",  # series name
            data_pair=data_pair,  # series items, format [(key1, value1), (key2, value2)]

            # Render as a Nightingale rose chart, using the radius to show
            # the value.  Two modes exist:
            # 'radius': the sector angle shows the percentage, the radius the value
            # 'area': all sectors share the same angle, only the radius shows the value
            rosetype="radius",

            # Radius of the pie.
            radius="55%",

            # Centre of the pie: first item is the x coordinate, second the y.
            # Percentages are relative to the container width and height.
            center=["50%", "50%"],

            # Label options.
            label_opts=opts.LabelOpts(is_show=False, position="center"),
        )
        # Global options.
        .set_global_opts(
            # Title.
            title_opts=opts.TitleOpts(
                title="Customized Pie",
                pos_left="center",
                pos_top="20",
                title_textstyle_opts=opts.TextStyleOpts(color="#fff"),
            ),
            # Legend.
            legend_opts=opts.LegendOpts(is_show=True),
        )
        # Series options.
        .set_series_opts(
            # Tooltip: series name, then "name: value (percent)" on a new line.
            tooltip_opts=opts.TooltipOpts(
                trigger="item", formatter="{a} <br/>{b}: {c} ({d}%)"
            ),
            label_opts=opts.LabelOpts(color=colour),
        )
        .render("customized.html")
    )


def _render_axis_chart(chart_type, data_pair, colour):
    """Render *data_pair* with a rectangular-axis chart class into "customized.html"."""
    x1 = [item[0] for item in data_pair]
    y1 = [item[1] for item in data_pair]
    (
        chart_type(init_opts=opts.InitOpts(
            width="900px",
            height="600px",
            theme=ThemeType.MACARONS))
        .add_xaxis(x1)
        .add_yaxis('频率', y1)
        .set_colors(colour)
        .set_global_opts(
            title_opts=opts.TitleOpts(title="新闻词出现频率"),
            yaxis_opts=opts.AxisOpts(name="频率"),
            xaxis_opts=opts.AxisOpts(name="词"))
        .render("customized.html")
    )


# Bar chart
def Bartu(data_pair, colour):
    """Render the word frequencies as a bar chart in "customized.html".

    Args:
        data_pair: list of [word, count] items; it is sorted in place by count.
        colour: colour passed to the chart's ``set_colors``.
    """
    data_pair.sort(key=lambda x: x[1])  # ascending by count
    _render_axis_chart(Bar, data_pair, colour)


# Scatter chart
def Scatter(data_pair, colour):
    """Render the word frequencies as an effect-scatter chart in "customized.html".

    Args:
        data_pair: list of [word, count] items, plotted in the given order.
        colour: colour passed to the chart's ``set_colors``.
    """
    _render_axis_chart(EffectScatter, data_pair, colour)

数据库登录方法:在浏览器中打开 phpMyAdmin 4.6.4(localhost:8080),地址为 http://localhost:8080/phpMyAdmin/

数据库程序:

import pymysql


# 向MySQL服务器插入词云数据
def insertdb(data_pair):
    """Replace the CIYUN table with the given word-frequency data.

    The table is dropped and recreated on every call, then one row is
    inserted per item.

    Args:
        data_pair: iterable of (word, count) pairs.
    """
    db = pymysql.connect(host='localhost', user='root', password='12345678', db='DB', charset='utf8')
    try:
        cursor = db.cursor()

        cursor.execute("DROP TABLE IF EXISTS CIYUN")
        cursor.execute("""CREATE TABLE CIYUN(
            NAME CHAR(100) NOT NULL,
            NUMBER INT )""")

        sql = "INSERT INTO CIYUN(NAME,NUMBER) VALUES (%s,%s)"
        # Counts are sent as strings, as before.
        for item in data_pair:
            cursor.execute(sql, (item[0], str(item[1])))
        db.commit()
    finally:
        # Release the connection even when a statement fails.
        db.close()


# 在MySQL服务器中搜索用户名和密码
def searchusername_password(usename, usepassword):
    """Check whether a user with this exact name and password exists.

    Args:
        usename: user name to look up.
        usepassword: password to compare.

    Returns:
        1 when a row of USERNAME matches both values exactly, otherwise 0.
    """
    # NOTE(review): passwords are stored and compared in plain text; changing
    # that needs a coordinated change to the registration code.
    db = pymysql.connect(host='localhost', user='root', password='12345678', db='DB', charset='utf8')
    try:
        cursor = db.cursor()
        # Let the server narrow the rows down to this user instead of
        # transferring the whole table.
        cursor.execute("SELECT NAME, PASSWORD FROM USERNAME WHERE NAME = %s", (usename,))
        for row in cursor.fetchall():
            # The final comparison stays in Python so that matching remains
            # exact regardless of the column collation.
            if row[0] == usename and row[1] == usepassword:
                return 1
        return 0
    finally:
        # Close on every path, including "not found".
        db.close()


# 在MySQL服务器中搜索用户名
def searchusername(usename):
    """Check whether a user name is still available.

    Args:
        usename: user name to look up.

    Returns:
        0 when a row of USERNAME already has exactly this name, 1 when the
        name is free.
    """
    db = pymysql.connect(host='localhost', user='root', password='12345678', db='DB', charset='utf8')
    try:
        cursor = db.cursor()
        # Only fetch candidate rows for this name, not the whole table.
        cursor.execute("SELECT NAME FROM USERNAME WHERE NAME = %s", (usename,))
        for row in cursor.fetchall():
            # Exact comparison in Python, independent of the column collation.
            if row[0] == usename:
                return 0
        return 1
    finally:
        # Close on every path, including "name is free".
        db.close()


# 注册用户名和密码
def insertusername(username, usepassword):
    """Insert a new user row into the USERNAME table.

    Args:
        username: user name; converted with ``str()`` before insertion.
        usepassword: password; converted with ``str()`` before insertion.
    """
    # NOTE(review): the password is stored in plain text; hashing it would
    # also require changing the login check.
    db = pymysql.connect(host='localhost', user='root', password='12345678', db='DB', charset='utf8')
    try:
        cursor = db.cursor()
        sql = "INSERT INTO USERNAME(NAME,PASSWORD) VALUES (%s,%s)"
        cursor.execute(sql, (str(username), str(usepassword)))
        db.commit()
    finally:
        # Release the connection even when the insert fails.
        db.close()


# 辅助测试用 主程序并未调用 可删除
def creatusername():
    """Recreate the USERNAME table and seed it with two test accounts.

    Test helper only: any existing USERNAME table is dropped first.
    """
    db = pymysql.connect(host='localhost', user='root', password='12345678', db='DB', charset='utf8')
    try:
        cursor = db.cursor()
        cursor.execute("DROP TABLE IF EXISTS USERNAME")
        cursor.execute("""CREATE TABLE USERNAME(
                NAME CHAR(100) NOT NULL,
                PASSWORD  CHAR(100))""")

        sql = "INSERT INTO USERNAME(NAME,PASSWORD) VALUES (%s,%s)"
        seed_accounts = (('2537148609', '12345678'), ('yangqun', 'kuaile'))
        for account in seed_accounts:
            cursor.execute(sql, account)
        db.commit()
    finally:
        # Release the connection even when a statement fails.
        db.close()

        .csv 文件和 .xls 文件程序:

import csv
import xlwt
# csv文件保存为“ciping.csv”
# xls文件保存为“myexcel.xls”


# 写csv文件
def writecsv(data_pair):
    """Write the word frequencies to "ciping.csv" below a header row.

    Args:
        data_pair: iterable of (word, count) pairs.
    """
    headers = ['词语', '频率']
    rows = [(item[0], str(item[1])) for item in data_pair]
    # newline='' leaves line endings to the csv module; without it Windows
    # turns the writer's "\r\n" into "\r\r\n", which shows up as blank rows.
    with open('ciping.csv', mode='w', encoding='utf8', newline='') as f:
        f_csv = csv.writer(f)
        f_csv.writerow(headers)
        f_csv.writerows(rows)


# 写xls文件
def openxls():
    """Copy "ciping.csv" cell by cell into sheet "testsheet" of "myexcel.xls"."""
    myexcel = xlwt.Workbook()
    # Create the sheet that receives the data.
    mysheet = myexcel.add_sheet("testsheet")
    # newline='' is what the csv module expects for files handed to a reader,
    # so that line endings inside the data are interpreted by csv itself.
    with open("ciping.csv", mode='r', encoding='utf8', newline='') as csvfile:
        for row_index, line in enumerate(csv.reader(csvfile)):
            for col_index, cell in enumerate(line):
                mysheet.write(row_index, col_index, cell)
    # Save the workbook once every row has been copied.
    myexcel.save("myexcel.xls")

你可能感兴趣的:(python,爬虫,数据库,mysql,正则表达式)