python爬虫篇6——抓取大数据资讯

MySQL代码:创建MySQL数据表的建表语句已集成在Python代码中,无需单独执行SQL脚本。

使用PyQt5编写可视化界面,感兴趣的请先了解pyqt5的安装及基本使用。

项目结构:见图1(python爬虫篇6——抓取大数据资讯_第1张图片)。

1. PyQt5界面设计:见图2(python爬虫篇6——抓取大数据资讯_第2张图片)。

2.程序code:

untitled.py主程序入口:
# -*- coding: utf-8 -*-

# Form implementation generated from reading ui file 'untitled.ui'
#
# Created by: PyQt5 UI code generator 5.10.1
#
# WARNING! All changes made in this file will be lost!
import sys

from PyQt5 import QtCore, QtGui, QtWidgets
from PyQt5.QtCore import QStringListModel, Qt, pyqtSignal
from PyQt5.QtWidgets import QMessageBox, QWidget

from 大数据资讯快抓.getdata import *


class Ui_mainWindow(object):
    """Main-window UI of the big-data news grabber.

    The window has three areas: a banner, a database-settings form and a
    result area with two list views (one per crawled site) separated by a
    column of action buttons.  Crawling runs in ``Runthread`` /
    ``Runthread1``; their ``_signal`` delivers the result list back to
    ``callbacklog`` / ``callbacklog1``.

    Each list view keeps its own backing data so that the detail dialog
    always shows the entry of the list that was actually clicked:

    * ``titlelist`` / ``urllist``       -- "big data world" list
    * ``zx_titlelist`` / ``zx_urllist`` -- "big data news" list
    """

    def setupUi(self, mainWindow):
        """Create every widget on ``mainWindow`` and connect the handlers."""
        mainWindow.setObjectName("mainWindow")
        mainWindow.resize(640, 560)
        mainWindow.setMinimumSize(QtCore.QSize(640, 560))
        mainWindow.setMaximumSize(QtCore.QSize(640, 560))
        self.centralwidget = QtWidgets.QWidget(mainWindow)
        self.centralwidget.setMinimumSize(QtCore.QSize(640, 560))
        self.centralwidget.setMaximumSize(QtCore.QSize(640, 560))
        self.centralwidget.setObjectName("centralwidget")
        # Top banner image
        self.top_widget = QtWidgets.QWidget(self.centralwidget)
        self.top_widget.setGeometry(QtCore.QRect(0, 0, 640, 120))
        self.top_widget.setMinimumSize(QtCore.QSize(640, 120))
        self.top_widget.setMaximumSize(QtCore.QSize(640, 120))
        self.top_widget.setStyleSheet("image: url(:/png/bg1.png);\n"
                                      "background-image: url(:/png/bg2.png);")
        self.top_widget.setObjectName("top_widget")
        # Database settings panel
        self.db_widget = QtWidgets.QWidget(self.centralwidget)
        self.db_widget.setGeometry(QtCore.QRect(0, 120, 640, 120))
        self.db_widget.setStyleSheet("background-image: url(:/png/bg2.png);")
        self.db_widget.setObjectName("db_widget")
        # Panel caption
        self.db_set = QtWidgets.QLabel(self.db_widget)
        self.db_set.setGeometry(QtCore.QRect(10, 15, 64, 12))
        self.db_set.setObjectName("db_set")
        self.db_set.setFont(self.getFont())
        # Database host label
        self.db_addr = QtWidgets.QLabel(self.db_widget)
        self.db_addr.setGeometry(QtCore.QRect(10, 40, 64, 12))
        self.db_addr.setObjectName("db_addr")
        self.db_addr.setFont(self.getFont())
        # Database host input
        self.db_addr_edit = QtWidgets.QTextEdit(self.db_widget)
        self.db_addr_edit.setGeometry(QtCore.QRect(80, 35, 150, 24))
        self.db_addr_edit.setStyleSheet("background-image: url(:/png/bg3.png);")
        self.db_addr_edit.setObjectName("db_addr_edit")
        self.db_addr_edit.setFont(self.getFont())
        # Database port label
        self.db_port = QtWidgets.QLabel(self.db_widget)
        self.db_port.setGeometry(QtCore.QRect(10, 65, 64, 12))
        self.db_port.setObjectName("db_port")
        self.db_port.setFont(self.getFont())
        # Database port input
        self.db_port_edit = QtWidgets.QTextEdit(self.db_widget)
        self.db_port_edit.setGeometry(QtCore.QRect(80, 60, 150, 24))
        self.db_port_edit.setStyleSheet("background-image: url(:/png/bg3.png);")
        self.db_port_edit.setObjectName("db_port_edit")
        self.db_port_edit.setFont(self.getFont())
        # Database charset label
        self.db_charset = QtWidgets.QLabel(self.db_widget)
        self.db_charset.setGeometry(QtCore.QRect(10, 90, 64, 12))
        self.db_charset.setObjectName("db_charset")
        self.db_charset.setFont(self.getFont())
        # Database charset input
        self.db_charset_edit = QtWidgets.QTextEdit(self.db_widget)
        self.db_charset_edit.setGeometry(QtCore.QRect(80, 85, 150, 24))
        self.db_charset_edit.setStyleSheet("background-image: url(:/png/bg3.png);")
        self.db_charset_edit.setObjectName("db_charset_edit")
        self.db_charset_edit.setFont(self.getFont())
        # Database name label
        self.db_name = QtWidgets.QLabel(self.db_widget)
        self.db_name.setGeometry(QtCore.QRect(240, 40, 64, 12))
        self.db_name.setObjectName("db_name")
        self.db_name.setFont(self.getFont())
        # Database name input
        self.db_name_edit = QtWidgets.QTextEdit(self.db_widget)
        self.db_name_edit.setGeometry(QtCore.QRect(310, 35, 150, 24))
        self.db_name_edit.setStyleSheet("background-image: url(:/png/bg3.png);")
        self.db_name_edit.setObjectName("db_name_edit")
        self.db_name_edit.setFont(self.getFont())
        # Database user label
        self.db_usename = QtWidgets.QLabel(self.db_widget)
        self.db_usename.setGeometry(QtCore.QRect(240, 65, 64, 12))
        self.db_usename.setObjectName("db_usename")
        self.db_usename.setFont(self.getFont())
        # Database user input
        self.db_usename_edit = QtWidgets.QTextEdit(self.db_widget)
        self.db_usename_edit.setGeometry(QtCore.QRect(310, 60, 150, 24))
        self.db_usename_edit.setStyleSheet("background-image: url(:/png/bg3.png);")
        self.db_usename_edit.setObjectName("db_usename_edit")
        self.db_usename_edit.setFont(self.getFont())
        # Database password label
        self.db_pwd = QtWidgets.QLabel(self.db_widget)
        self.db_pwd.setGeometry(QtCore.QRect(240, 90, 64, 12))
        self.db_pwd.setObjectName("db_pwd")
        self.db_pwd.setFont(self.getFont())
        # Database password input
        self.db_pwd_edit = QtWidgets.QTextEdit(self.db_widget)
        self.db_pwd_edit.setGeometry(QtCore.QRect(310, 85, 150, 24))
        self.db_pwd_edit.setStyleSheet("background-image: url(:/png/bg3.png);")
        self.db_pwd_edit.setObjectName("db_pwd_edit")
        self.db_pwd_edit.setFont(self.getFont())
        # "Save" button for the database settings
        self.db_save_btn = QtWidgets.QPushButton(self.db_widget)
        self.db_save_btn.setGeometry(QtCore.QRect(500, 85, 90, 24))
        self.db_save_btn.setObjectName("db_save_btn")
        self.db_save_btn.setFont(self.getFont())

        self.line_2 = QtWidgets.QFrame(self.db_widget)
        self.line_2.setGeometry(QtCore.QRect(0, 0, 640, 3))
        self.line_2.setFrameShape(QtWidgets.QFrame.HLine)
        self.line_2.setFrameShadow(QtWidgets.QFrame.Sunken)
        self.line_2.setObjectName("line_2")
        # Result area (lists + action buttons)
        self.widget = QtWidgets.QWidget(self.centralwidget)
        self.widget.setGeometry(QtCore.QRect(0, 240, 640, 320))
        self.widget.setObjectName("widget")
        # "Big data world" result list
        self.bigdataworld_listview = QtWidgets.QListView(self.widget)
        self.bigdataworld_listview.setGeometry(QtCore.QRect(0, 30, 250, 290))
        self.bigdataworld_listview.setObjectName("bigdataworld_listview")
        self.bigdataworld_listview.setFont(self.getFont())

        # Caption bar above the "big data world" list
        self.bigdataworld_tip_widget = QtWidgets.QWidget(self.widget)
        self.bigdataworld_tip_widget.setGeometry(QtCore.QRect(0, 0, 250, 30))
        self.bigdataworld_tip_widget.setStyleSheet("background-image: url(:/png/bg2.png);")
        self.bigdataworld_tip_widget.setObjectName("bigdataworld_tip_widget")
        self.showtable = QtWidgets.QLabel(self.bigdataworld_tip_widget)
        self.showtable.setGeometry(QtCore.QRect(60, 10, 120, 12))
        self.showtable.setObjectName("showtable")
        self.showtable.setFont(self.getFont())
        self.line_4 = QtWidgets.QFrame(self.bigdataworld_tip_widget)
        self.line_4.setGeometry(QtCore.QRect(0, 0, 250, 3))
        self.line_4.setFrameShape(QtWidgets.QFrame.HLine)
        self.line_4.setFrameShadow(QtWidgets.QFrame.Sunken)
        self.line_4.setObjectName("line_4")
        # Caption bar above the "big data news" list
        self.bigdatazx_tip_widget = QtWidgets.QWidget(self.widget)
        self.bigdatazx_tip_widget.setGeometry(QtCore.QRect(390, 0, 250, 30))
        self.bigdatazx_tip_widget.setStyleSheet("background-image: url(:/png/bg2.png);")
        self.bigdatazx_tip_widget.setObjectName("bigdatazx_tip_widget")
        self.showtable_2 = QtWidgets.QLabel(self.bigdatazx_tip_widget)
        self.showtable_2.setGeometry(QtCore.QRect(60, 10, 120, 12))
        self.showtable_2.setObjectName("showtable_2")
        self.showtable_2.setFont(self.getFont())
        self.line_5 = QtWidgets.QFrame(self.bigdatazx_tip_widget)
        self.line_5.setGeometry(QtCore.QRect(0, 0, 250, 3))
        self.line_5.setFrameShape(QtWidgets.QFrame.HLine)
        self.line_5.setFrameShadow(QtWidgets.QFrame.Sunken)
        self.line_5.setObjectName("line_5")
        # "Big data news" result list
        self.bigdatazx_listview = QtWidgets.QListView(self.widget)
        self.bigdatazx_listview.setGeometry(QtCore.QRect(390, 30, 250, 290))
        self.bigdatazx_listview.setObjectName("bigdatazx_listview")
        self.bigdatazx_listview.setFont(self.getFont())

        # Middle column holding the action buttons and counters
        self.function_widget = QtWidgets.QWidget(self.widget)
        self.function_widget.setGeometry(QtCore.QRect(250, 0, 140, 320))
        self.function_widget.setStyleSheet("background-image: url(:/png/bg2.png);")
        self.function_widget.setObjectName("function_widget")
        self.function_btn = QtWidgets.QLabel(self.function_widget)
        self.function_btn.setGeometry(QtCore.QRect(45, 10, 54, 12))
        self.function_btn.setObjectName("function_btn")
        self.function_btn.setFont(self.getFont())
        # Button: crawl "big data world"
        self.getbigdataworld_btn = QtWidgets.QPushButton(self.function_widget)
        self.getbigdataworld_btn.setGeometry(QtCore.QRect(25, 90, 90, 23))
        self.getbigdataworld_btn.setObjectName("getbigdataworld_btn")
        self.getbigdataworld_btn.setFont(self.getFont())
        # Button: crawl "big data news"
        self.getbigdatazx_btn = QtWidgets.QPushButton(self.function_widget)
        self.getbigdatazx_btn.setGeometry(QtCore.QRect(25, 180, 90, 23))
        self.getbigdatazx_btn.setObjectName("getbigdatazx_btn")
        self.getbigdatazx_btn.setFont(self.getFont())

        self.line = QtWidgets.QFrame(self.function_widget)
        self.line.setGeometry(QtCore.QRect(0, 30, 140, 3))
        self.line.setFrameShape(QtWidgets.QFrame.HLine)
        self.line.setFrameShadow(QtWidgets.QFrame.Sunken)
        self.line.setObjectName("line")
        self.line_3 = QtWidgets.QFrame(self.function_widget)
        self.line_3.setGeometry(QtCore.QRect(0, 0, 140, 3))
        self.line_3.setFrameShape(QtWidgets.QFrame.HLine)
        self.line_3.setFrameShadow(QtWidgets.QFrame.Sunken)
        self.line_3.setObjectName("line_3")
        self.bigdataworld_sum_tip = QtWidgets.QLabel(self.function_widget)
        self.bigdataworld_sum_tip.setGeometry(QtCore.QRect(20, 130, 64, 12))
        self.bigdataworld_sum_tip.setObjectName("bigdataworld_sum_tip")
        self.bigdataworld_sum_tip.setFont(self.getFont())
        # Counter: number of "big data world" entries fetched
        self.bigdataworld_sum = QtWidgets.QLabel(self.function_widget)
        self.bigdataworld_sum.setGeometry(QtCore.QRect(90, 130, 30, 12))
        self.bigdataworld_sum.setObjectName("bigdataworld_sum")
        self.bigdataworld_sum.setFont(self.getFont())
        self.bigdatazx_sum_tip = QtWidgets.QLabel(self.function_widget)
        self.bigdatazx_sum_tip.setGeometry(QtCore.QRect(20, 220, 64, 12))
        self.bigdatazx_sum_tip.setObjectName("bigdatazx_sum_tip")
        self.bigdatazx_sum_tip.setFont(self.getFont())
        # Counter: number of "big data news" entries fetched
        self.bigdatazx_sum = QtWidgets.QLabel(self.function_widget)
        self.bigdatazx_sum.setGeometry(QtCore.QRect(90, 220, 30, 12))
        self.bigdatazx_sum.setObjectName("bigdatazx_sum")
        self.bigdatazx_sum.setFont(self.getFont())
        # Busy indicator for the "big data world" list.  It is created after
        # the list views so it is stacked above them; minimum == maximum == 0
        # makes Qt draw an indeterminate (busy) bar.
        self.bdw_progressBar = QtWidgets.QProgressBar(self.widget)
        self.bdw_progressBar.setGeometry(QtCore.QRect(40, 155, 180, 20))
        font = QtGui.QFont()
        font.setPointSize(1)
        self.bdw_progressBar.setFont(font)
        self.bdw_progressBar.setMaximum(0)
        self.bdw_progressBar.setProperty("value", -1)
        self.bdw_progressBar.setObjectName("bdw_progressBar")
        self.bdw_progressBar.setVisible(False)
        # Busy indicator for the "big data news" list
        self.bdzx_progressBar = QtWidgets.QProgressBar(self.widget)
        self.bdzx_progressBar.setGeometry(QtCore.QRect(430, 155, 180, 20))
        font = QtGui.QFont()
        font.setPointSize(1)
        self.bdzx_progressBar.setFont(font)
        self.bdzx_progressBar.setMaximum(0)
        self.bdzx_progressBar.setProperty("value", -1)
        self.bdzx_progressBar.setObjectName("bdzx_progressBar")
        self.bdzx_progressBar.setVisible(False)
        mainWindow.setCentralWidget(self.centralwidget)

        self.retranslateUi(mainWindow)
        QtCore.QMetaObject.connectSlotsByName(mainWindow)

        # Backing data of the two lists; kept separate so a click on one list
        # can never be resolved against the other list's entries.
        self.titlelist = []
        self.urllist = []
        self.zx_titlelist = []
        self.zx_urllist = []

        # Pre-fill the form with previously saved database settings
        self.SetDBData()
        self.db_save_btn.clicked.connect(self.SaveDBData)
        self.getbigdataworld_btn.clicked.connect(self.GetBigDataWorld)
        self.getbigdatazx_btn.clicked.connect(self.GetBigDataZX)
        # Connect the detail handlers exactly once.  Connecting them inside
        # the result callbacks would add one more connection per fetch and
        # open one dialog per connection on every click.
        self.bigdataworld_listview.clicked.connect(self.clickBDWList)
        self.bigdatazx_listview.clicked.connect(self.clickBDZXList)

    # ---- fetch workflow shared by both sites ---------------------------------

    def _begin_fetch(self, listview, progress_bar, button):
        """Switch one list into its "loading" state.

        The trigger button is disabled as well so a second worker thread
        cannot be started while the first one is still running.
        """
        progress_bar.setVisible(True)
        listview.setEnabled(False)
        button.setEnabled(False)
        self.addListItem(QStringListModel(), listview, '数据抓取中...')

    def _end_fetch(self, listview, progress_bar, button):
        """Undo ``_begin_fetch`` once a result has been displayed."""
        progress_bar.setVisible(False)
        listview.setEnabled(True)
        button.setEnabled(True)

    def _show_results(self, msg, listview, sum_label):
        """Display ``msg`` in ``listview`` and update ``sum_label``.

        ``msg`` is the list emitted by the worker thread; each element is a
        dict with the keys ``'title'`` and ``'url'``.  An empty list shows a
        single placeholder row instead.

        Returns ``(model, titles, urls)`` so the caller can store them as the
        backing data of that list.
        """
        titles = [item['title'] for item in msg]
        urls = [item['url'] for item in msg]
        model = QStringListModel()
        model.setStringList(titles if titles else ['无更新数据'])
        listview.setModel(model)
        sum_label.setText(str(len(urls)))
        return model, titles, urls

    def _show_detail(self, titles, urls, row):
        """Open the detail dialog for ``row`` if that row has backing data."""
        # Placeholder rows ("loading", "no new data") have no entry in
        # ``titles``/``urls``; ignore clicks on them instead of raising.
        if 0 <= row < len(titles) and row < len(urls):
            self.messageTipDialog('详情', "标题:" + titles[row] + "\n"
                                        "地址:" + urls[row])

    # Crawl "big data world"
    def GetBigDataWorld(self):
        """Start the background crawl of the "big data world" site."""
        self._begin_fetch(self.bigdataworld_listview, self.bdw_progressBar,
                          self.getbigdataworld_btn)
        self.titlelist = []
        self.urllist = []
        # Keep the thread on ``self`` so it is not garbage-collected while running
        self.thread = Runthread()
        self.thread._signal.connect(self.callbacklog)
        self.thread.start()

    def callbacklog(self, msg):
        """Receive the "big data world" result list from the worker thread."""
        self.itemmodel, self.titlelist, self.urllist = self._show_results(
            msg, self.bigdataworld_listview, self.bigdataworld_sum)
        self._end_fetch(self.bigdataworld_listview, self.bdw_progressBar,
                        self.getbigdataworld_btn)

    # Crawl "big data news"
    def GetBigDataZX(self):
        """Start the background crawl of the "big data news" site."""
        self._begin_fetch(self.bigdatazx_listview, self.bdzx_progressBar,
                          self.getbigdatazx_btn)
        self.zx_titlelist = []
        self.zx_urllist = []
        self.thread1 = Runthread1()
        self.thread1._signal.connect(self.callbacklog1)
        self.thread1.start()

    def callbacklog1(self, msg):
        """Receive the "big data news" result list from the worker thread."""
        self.zx_itemmodel, self.zx_titlelist, self.zx_urllist = self._show_results(
            msg, self.bigdatazx_listview, self.bigdatazx_sum)
        self._end_fetch(self.bigdatazx_listview, self.bdzx_progressBar,
                        self.getbigdatazx_btn)

    # Add one row to a list view
    def addListItem(self, itemmodel, m_ListView, content):
        """Insert ``content`` into ``itemmodel`` and show it in ``m_ListView``.

        The row is inserted at the view's currently selected row when that
        row is a valid insert position for ``itemmodel``; otherwise it is
        appended.
        """
        count = itemmodel.rowCount()
        selectindex = m_ListView.currentIndex()
        # The view's current index refers to whatever model the view showed
        # before, so its row may lie beyond the end of ``itemmodel``; in that
        # case ``insertRow`` would fail and nothing would be shown.
        if selectindex.isValid() and selectindex.row() <= count:
            Pos = selectindex.row()
        else:
            Pos = count
        itemmodel.insertRow(Pos)
        index = itemmodel.index(Pos, 0)
        itemmodel.setData(index, content, Qt.DisplayRole)
        m_ListView.setModel(itemmodel)

    # Show details of a clicked entry
    def clickBDWList(self, qModelIndex):
        """Show title and URL of the clicked "big data world" entry."""
        self._show_detail(self.titlelist, self.urllist, qModelIndex.row())

    def clickBDZXList(self, qModelIndex):
        """Show title and URL of the clicked "big data news" entry."""
        self._show_detail(self.zx_titlelist, self.zx_urllist, qModelIndex.row())

    # Save the database settings
    def SaveDBData(self):
        """Validate the settings form and persist it via ``saveDBSetting``.

        Host, port, database name and charset are required.  Surrounding
        whitespace (including stray newlines, which the multi-line text
        widgets allow) is stripped from every field except the password.
        """
        host = self.db_addr_edit.toPlainText().strip()
        port = self.db_port_edit.toPlainText().strip()
        name = self.db_name_edit.toPlainText().strip()
        charset = self.db_charset_edit.toPlainText().strip()
        required = (
            (host, '数据库地址为必填项!'),
            (port, '数据库端口为必填项!'),
            (name, '数据库名称为必填项!'),
            (charset, '数据库编码为必填项!'),
        )
        for value, warning in required:
            if not value:
                self.messageWarnDialog('警告', warning)
                return
        saveDBSetting(host, port,
                      self.db_usename_edit.toPlainText().strip(), self.db_pwd_edit.toPlainText(),
                      name, charset)
        self.messageTipDialog('提示', '数据库参数保存成功')

    # Load previously saved database settings into the form
    def SetDBData(self):
        """Fill the form from the saved settings, if any exist."""
        if isDbs():
            # NOTE(review): eval() executes whatever read() returns.  If that
            # text comes from a user-editable settings file this is arbitrary
            # code execution; ast.literal_eval would be the safe replacement,
            # provided the stored text is a plain dict literal -- confirm
            # against saveDBSetting()/read().
            dic = eval(read())
            self.db_addr_edit.setPlainText(dic['host'])
            self.db_port_edit.setPlainText(dic['port'])
            self.db_usename_edit.setPlainText(dic['user'])
            self.db_pwd_edit.setPlainText(dic['pwd'])
            self.db_name_edit.setPlainText(dic['db'])
            self.db_charset_edit.setPlainText(dic['charset'])

    # Modal warning box; ``title`` is the caption, ``message`` the text
    def messageWarnDialog(self, title, message):
        """Show a modal warning message box."""
        msg_box = QMessageBox(QMessageBox.Warning, title, message)
        msg_box.exec_()

    # Modal information box; ``title`` is the caption, ``message`` the text
    def messageTipDialog(self, title, message):
        """Show a modal information message box."""
        # No parent: avoids creating a throw-away QWidget per dialog.
        QMessageBox.information(None, title, message)

    # Font used by all widgets
    def getFont(self):
        """Return a new 5-point ``QFont``."""
        font = QtGui.QFont()
        font.setPointSize(5)
        return font

    def retranslateUi(self, mainWindow):
        """Set all user-visible texts (window title, labels, buttons)."""
        _translate = QtCore.QCoreApplication.translate
        mainWindow.setWindowTitle(_translate("mainWindow", "大数据资讯快抓"))
        self.db_set.setText(_translate("mainWindow", "数据库参数"))
        self.db_addr.setText(_translate("mainWindow", "数据库地址:"))
        self.db_port.setText(_translate("mainWindow", "  端口号:"))
        self.db_charset.setText(_translate("mainWindow", "   编码:"))
        self.db_name.setText(_translate("mainWindow", "数据库名称:"))
        self.db_usename.setText(_translate("mainWindow", "  用户名:"))
        self.db_pwd.setText(_translate("mainWindow", "   密码:"))
        self.db_save_btn.setText(_translate("mainWindow", "保存"))
        self.showtable.setText(_translate("mainWindow", "大数据世界消息栏"))
        self.showtable_2.setText(_translate("mainWindow", "大数据资讯消息栏"))
        self.function_btn.setText(_translate("mainWindow", "功能按钮"))
        self.getbigdataworld_btn.setText(_translate("mainWindow", "抓取大数据世界"))
        self.getbigdatazx_btn.setText(_translate("mainWindow", "抓取大数据资讯"))
        self.bigdataworld_sum_tip.setText(_translate("mainWindow", "已抓取数量:"))
        self.bigdataworld_sum.setText(_translate("mainWindow", "-"))
        self.bigdatazx_sum_tip.setText(_translate("mainWindow", "已抓取数量:"))
        self.bigdatazx_sum.setText(_translate("mainWindow", "-"))


# Worker thread (QThread subclass) for the "big data world" crawl
class Runthread(QtCore.QThread):
    """Run ``getBigDataWorldData()`` off the GUI thread and emit its result."""

    # PyQt5 signals are declared as class attributes; this one carries the
    # result list produced by run().
    _signal = pyqtSignal(list)

    def __init__(self, parent=None):
        # Forward ``parent`` to QThread; it used to be accepted but dropped,
        # so a caller-supplied parent had no effect.
        super(Runthread, self).__init__(parent)

    def __del__(self):
        # Block until the thread has finished before the wrapper goes away
        self.wait()

    def run(self):
        """Thread body: fetch the data and hand it to ``callback``."""
        newData = getBigDataWorldData()
        self.callback(newData)

    def callback(self, msg):
        """Emit ``msg`` through ``_signal`` to whoever is connected."""
        self._signal.emit(msg)


# Worker thread (QThread subclass) for the "big data news" crawl
class Runthread1(QtCore.QThread):
    """Run ``getBigDataZXData()`` off the GUI thread and emit its result."""

    # PyQt5 signals are declared as class attributes; this one carries the
    # result list produced by run().
    _signal = pyqtSignal(list)

    def __init__(self, parent=None):
        # Forward ``parent`` to QThread; it used to be accepted but dropped,
        # so a caller-supplied parent had no effect.
        super(Runthread1, self).__init__(parent)

    def __del__(self):
        # Block until the thread has finished before the wrapper goes away
        self.wait()

    def run(self):
        """Thread body: fetch the data and hand it to ``callback``."""
        newData = getBigDataZXData()
        self.callback(newData)

    def callback(self, msg):
        """Emit ``msg`` through ``_signal`` to whoever is connected."""
        self._signal.emit(msg)


def show_MainWindow():
    """Build the main window, show it and run the Qt event loop.

    This call does not return: when the event loop ends, the process exits
    with the loop's return code via ``sys.exit``.
    """
    # A QApplication must exist before any widget is created
    qt_app = QtWidgets.QApplication(sys.argv)
    # QMainWindow is the window type that brings its own menu/status areas
    window = QtWidgets.QMainWindow()
    # Keep the UI object in a local so it stays alive while the loop runs
    form = Ui_mainWindow()
    form.setupUi(window)
    window.show()
    # PyQt5 spells the method ``exec_`` because ``exec`` was a reserved word
    # in Python 2.
    exit_code = qt_app.exec_()
    sys.exit(exit_code)


# Start the GUI only when this file is run as a script, not when it is imported.
if __name__ == '__main__':
    show_MainWindow()
getdata.py获取数据:
import re
import winreg

import pymysql
import requests_html
from lxml import etree
from 大数据资讯快抓.MysqlHelper import *
from 大数据资讯快抓.savedbsetting import *

session = requests_html.HTMLSession()


# Store the entries of one "big data world" listing page
def _storeBigDataWorldListing(helper, html_text, Data):
    """Insert every teaser found in ``html_text`` and fetch details of new ones.

    ``helper`` is the database helper used for the inserts.  Entries whose
    insert succeeded are passed to ``getBigDataWorldDataDetails``, which
    appends them to ``Data``.
    """
    for node in etree.HTML(html_text).xpath('//div[@class="summary"]'):
        title = node.xpath('.//div[@class="title"]')[0].xpath('string(.)')
        url = (node.xpath('.//div[@class="title"]/a/@href')[0]).replace('..', 'http://www.thebigdata.cn')
        # NOTE(review): "update a = %s and b = %s" is not a valid two-column
        # upsert; on a duplicate URL MySQL reports error 1292, and the branch
        # below relies on exactly that error to recognise entries that are
        # already stored.  Do not "fix" the SQL without changing that check.
        sql = "insert into bigdataworldinstro(bdw_title,bdw_url) values(%s,%s)on duplicate key update bdw_title = %s and bdw_url = %s;"
        # NOTE(review): escape_string on top of a parameterised query looks
        # like double escaping -- confirm how Helper.execute binds params.
        params = [pymysql.escape_string(title), url, pymysql.escape_string(title), url]
        # Helper.execute is defined elsewhere; this code treats a result
        # whose str() is 'None' as success and anything else as an error.
        result_text = str(helper.execute(sql, params))
        if result_text == 'None':
            print('*' * 100)
            print(title)
            print('大数据世界简介插入成功,开始查询并插入详情和图片')
            getBigDataWorldDataDetails({'title': title, 'url': url}, Data)
        elif '1292' in result_text:
            # "Truncated incorrect DOUBLE value": the entry already exists
            helper.rollback()
        else:
            print('插入失败数据回滚')
            helper.rollback()


# Crawl the "big data world" site
def getBigDataWorldData():
    """Crawl all listing pages of thebigdata.cn and return the new entries.

    Returns a list of ``{'title': ..., 'url': ...}`` dicts.  The list is
    never empty: when the database settings are missing or nothing new was
    found, it contains a single placeholder entry with ``'url': 'None'``.
    """
    Data = []
    # Without saved database settings nothing can be stored
    if not isDbs():
        Data.append({'title': '数据库参数无效!', 'url': 'None'})
        return Data

    helper = Helper()
    # IF NOT EXISTS: this statement runs on every crawl, and a plain
    # CREATE TABLE fails on every run after the first.
    sql = "CREATE TABLE IF NOT EXISTS bigdataworldinstro (id int primary key not null auto_increment,bdw_title varchar(500) DEFAULT NULL  ,bdw_url varchar(500) DEFAULT NULL UNIQUE);"
    helper.execute(sql)

    response = session.get('http://www.thebigdata.cn/YeJieDongTai/default.html')
    response.encoding = 'UTF-8-SIG'
    # The pager text is split on '/' and the second part is the page count.
    # If the pager is missing or malformed, fall back to the first page only
    # instead of raising inside the worker thread.
    total_nodes = etree.HTML(response.text).xpath('//li[@class="p_total"]/text()')
    try:
        page_count = int(re.split(r'/', total_nodes[0])[1])
    except (IndexError, ValueError):
        page_count = 1

    for page in range(1, page_count + 1):
        if page > 1:
            # Page 1 was already fetched above; later pages have their own URL
            response = session.get('http://www.thebigdata.cn/YeJieDongTai/defaultp%s.html' % page)
            response.encoding = 'UTF-8-SIG'
        _storeBigDataWorldListing(helper, response.text, Data)

    if not Data:
        Data.append({'title': '无更新数据!', 'url': 'None'})
    return Data


# Store one "big data world" article (details + images)
def getBigDataWorldDataDetails(item, datalist):
    """Fetch one article page, store its text and download its images.

    ``item`` is a dict with the keys ``'title'`` and ``'url'``.  When the
    detail row was inserted successfully, the images are downloaded into
    ``<desktop>\\imageDir\\`` and ``item`` is appended to ``datalist``;
    otherwise the transaction is rolled back and ``datalist`` is unchanged.
    """
    # Local import: ``os`` is not among the imports visible in this module
    # and would otherwise only be available through a star-import.
    import os

    print(item)
    # Same value as before, written without the invalid ``\i`` escape
    image_path = get_desktop() + '\\imageDir\\'
    # The old check ``image_path not in os.listdir()`` compared a full path
    # with bare file names and was therefore always true.
    os.makedirs(image_path, exist_ok=True)
    title = item['title']
    url = item['url']
    try:
        newResponse = session.get(url)
    except Exception:
        # One blind retry; a second failure propagates to the caller
        newResponse = session.get(url)
    newResponse.encoding = 'UTF-8-SIG'
    page = etree.HTML(newResponse.text)
    content = re.sub(r'([\n\r\t])', '',
                     string=page.xpath('//div[@id="content"]')[0].xpath('string(.)'))
    publish_time = re.search(r'(\d+-?)+', page.xpath('//td[@width="140"]/text()')[0]).group()
    photoList = [src.replace('..', 'http://www.thebigdata.cn')
                 for src in page.xpath('//div[@id="content"]//img/@src')]
    # The last image of the content block is dropped (presumably a trailing
    # non-article image -- confirm).  Guarded so that articles without any
    # image no longer raise IndexError.
    if photoList:
        photoList.pop()
    helper = Helper()
    # IF NOT EXISTS: both statements run once per article
    sql = "CREATE TABLE IF NOT EXISTS bigdataworlddetail (id int primary key not null auto_increment, bdwd_url varchar(500) DEFAULT NULL UNIQUE,bdwd_time datetime DEFAULT NULL,bdwd_content varchar(10000) DEFAULT NULL,foreign key(bdwd_url) references bigdataworldinstro(bdw_url));"
    helper.execute(sql)
    sql1 = "CREATE TABLE IF NOT EXISTS bigdataphoto(id int primary key not null auto_increment, url varchar(500) DEFAULT NULL,image varchar(500) DEFAULT NULL UNIQUE);"
    helper.execute(sql1)
    # Insert the detail row
    sql2 = "insert into bigdataworlddetail(bdwd_url,bdwd_time,bdwd_content) values(%s,%s,%s)on duplicate key update bdwd_url = %s and bdwd_time = %s and bdwd_content = %s;"
    params = [url, publish_time, pymysql.escape_string(content), url, publish_time, pymysql.escape_string(content)]
    result = helper.execute(sql2, params)
    if str(result) != 'None':
        print('插入失败数据回滚')
        helper.rollback()
        return

    print(pymysql.escape_string(content))
    print('大数据世界详情插入成功,开始查询插入图片')
    print('图片list:%s' % photoList)
    # Title with the characters removed that are not allowed in file names
    safe_title = "".join(re.findall(r'[^*"/:?\\|<>]', string=title))
    for number, photo_url in enumerate(photoList, start=1):
        # File name: <title><n>.<last three characters of the image URL>
        imageName = image_path + safe_title + str(number) + '.' + photo_url[-3:]
        if os.path.exists(imageName):
            print('图片已存在!')
            continue
        try:
            newresponse = session.get(photo_url, stream=True)
        except Exception:
            newresponse = session.get(photo_url, stream=True)
        # The ``with`` block flushes and closes the file on exit
        with open(imageName, 'ab') as f:
            for data in newresponse.iter_content(chunk_size=1024 * 1024):
                f.write(data)
        print('imageName---->%s' % imageName)
        sql3 = "insert into bigdataphoto (url,image) values (%s,%s)on duplicate key update url = %s and image = %s;"
        params = [url, pymysql.escape_string(imageName), url, pymysql.escape_string(imageName)]
        helper.execute(sql3, params)
    print('图片插入完成')
    datalist.append(item)


# Crawl the "big data news" site
def getBigDataZXData():
    """Crawl the numbered news pages of bigdata.idcquan.com and return new entries.

    Pages are requested as 1.shtml, 2.shtml, ... until a page contains the
    ``da-error-code`` element or yields no news entries at all.

    Returns a list of ``{'title': ..., 'url': ...}`` dicts.  The list is
    never empty: when the database settings are missing or nothing new was
    found, it contains a single placeholder entry with ``'url': 'None'``.
    """
    Data = []
    # Without saved database settings nothing can be stored
    if not isDbs():
        Data.append({'title': '数据库参数无效!', 'url': 'None'})
        return Data

    helper = Helper()
    # IF NOT EXISTS: this statement runs on every crawl, and a plain
    # CREATE TABLE fails on every run after the first.
    sql = "CREATE TABLE IF NOT EXISTS bigdatazxinstro (id int primary key not null auto_increment,bdz_title varchar(500) DEFAULT NULL  ,bdz_url varchar(500) DEFAULT NULL UNIQUE);"
    helper.execute(sql)

    page = 1
    while True:
        response = session.get('http://bigdata.idcquan.com/news/%s.shtml' % str(page))
        response.encoding = 'UTF-8-SIG'
        document = etree.HTML(response.text)
        # The site's error page marks the end of the listing
        if len(document.xpath('//div[@id="da-error-code"]')):
            break
        nodelist = document.xpath('//div[@class="news_nr"]')
        # Safety stop: a page with neither the error marker nor any entry
        # would otherwise make this loop run forever.
        if not nodelist:
            break
        for node in nodelist:
            title = node.xpath('.//span[@class="title"]/text()')[0]
            url = node.xpath('.//a[@class="d1"]/@href')[0]
            # NOTE(review): on a duplicate URL this statement triggers MySQL
            # error 1292, which the branch below uses to detect entries that
            # are already stored.  Do not "fix" the SQL without changing
            # that check.
            sql = "insert into bigdatazxinstro(bdz_title,bdz_url) values(%s,%s)on duplicate key update bdz_title = %s and bdz_url = %s;"
            params = [pymysql.escape_string(title), url, pymysql.escape_string(title), url]
            # Helper.execute is defined elsewhere; a result whose str() is
            # 'None' is treated as success, anything else as an error.
            result_text = str(helper.execute(sql, params))
            if result_text == 'None':
                print('*' * 100)
                print(title)
                print('大数据资讯简介插入成功,开始查询并插入详情和图片')
                getBigDataZXDataDetails({'title': title, 'url': url}, Data)
            elif '1292' in result_text:
                # "Truncated incorrect DOUBLE value": the entry already exists
                helper.rollback()
            else:
                print('插入失败数据回滚')
                helper.rollback()
        page += 1

    if not Data:
        Data.append({'title': '无更新数据!', 'url': 'None'})
    return Data


# 大数据世界资讯入库
def getBigDataZXDataDetails(item, datalist):
    """Fetch one news article, store its detail row and images, and record it.

    Downloads the page at ``item['url']``, extracts the article text, the
    publish date and the image links, issues ``CREATE TABLE`` for the
    ``bigdatazxdetail`` and ``bigdataphoto`` tables (the error object the
    helper returns when they already exist is ignored), inserts the detail
    row and saves the images into the ``imageDir`` folder on the desktop.

    Args:
        item: dict with the keys ``'title'`` and ``'url'`` of the article.
        datalist: list that ``item`` is appended to once the detail row was
            inserted and the image loop has finished.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    print(item)
    # Backslashes are escaped explicitly: the former '\i' was an invalid
    # escape sequence that only produced the right path by accident.
    image_path = get_desktop() + '\\imageDir\\'
    # exist_ok replaces the old `image_path not in os.listdir()` test, which
    # compared a full path with bare file names and therefore never matched.
    os.makedirs(image_path, exist_ok=True)
    title = item['title']
    url = item['url']
    try:
        newResponse = session.get(url)
    except Exception:
        # One blind retry; a second failure propagates to the caller.
        newResponse = session.get(url)
    newResponse.encoding = 'UTF-8-SIG'
    # Parse the page once and reuse the tree for every XPath query below.
    page = etree.HTML(newResponse.text)
    error = page.xpath('//div[@id="da-error-code"]')
    if error or 'http://bigdata.idcquan.com/news' not in str(url):
        print('404 Error or unKnow error')
        return

    article_xpath = '//div[@class="clear deatil article-content fontSizeSmall BSHARE_POP"]'
    article_nodes = page.xpath(article_xpath)
    date_nodes = page.xpath('//div[@class="date"]/text()')
    if not article_nodes or not date_nodes:
        # A page without the expected layout used to raise IndexError and
        # abort the whole crawl; treat it like the other unusable pages.
        print('404 Error or unKnow error')
        return
    content = re.sub(r'([\n\r\t])', '', string=article_nodes[0].xpath('string(.)'))
    publish_time = date_nodes[0]

    photoList = []
    for src in page.xpath(article_xpath + '//img/@src'):
        # Keep only jpg/jpeg/png/gif links, cut right after the extension.
        # The dots are escaped so that e.g. "xjpg" no longer counts as ".jpg".
        matched = re.search(r'(.*?)(\.jpg|\.jpeg|\.png|\.gif)', string=src)
        if matched is None:
            print('errorCode ---->%s' % src)
        else:
            photoList.append(matched.group())

    helper = Helper()
    # Create the article detail table.
    sql = "CREATE TABLE bigdatazxdetail (id int primary key not null auto_increment, bdzx_url varchar(500) DEFAULT NULL UNIQUE,bdzx_time datetime DEFAULT NULL,bdzx_content varchar(20000) DEFAULT NULL,foreign key(bdzx_url) references bigdatazxinstro(bdz_url));"
    helper.execute(sql)
    # Create the photo table.
    sql1 = "CREATE TABLE bigdataphoto(id int primary key not null auto_increment, url varchar(500) DEFAULT NULL,image varchar(500) DEFAULT NULL UNIQUE);"
    helper.execute(sql1)
    # Insert the detail row.
    # NOTE(review): after "on duplicate key update", `a = %s and b = %s` is a
    # single assignment of a boolean expression rather than one assignment per
    # column; presumably duplicates therefore come back as an error and take
    # the failure branch below - verify before rewriting the statement.
    sql2 = "insert into bigdatazxdetail(bdzx_url,bdzx_time,bdzx_content) values(%s,%s,%s)on duplicate key update bdzx_url = %s and bdzx_time = %s and bdzx_content = %s;"
    # NOTE(review): the values are passed as query parameters, so the extra
    # escape_string() presumably escapes them twice; kept so new rows match
    # what earlier runs stored - confirm before removing.
    escaped_content = pymysql.escape_string(content)
    params = [url, publish_time, escaped_content, url, publish_time, escaped_content]
    result = helper.execute(sql2, params)
    if result is not None:
        print('插入失败数据回滚')
        helper.rollback()
        return

    print(escaped_content)
    print('大数据资讯详情插入成功,开始查询插入图片')
    print('图片list:%s' % photoList)
    # Title with the characters that are illegal in Windows file names removed.
    safe_title = "".join(re.findall(r'[^*"/:?\\|<>]', string=title))
    for index, photo in enumerate(photoList, start=1):
        if 'http://' not in photo:
            # Skip just this image. The former `return` dropped the remaining
            # images and never reported the article in datalist.
            continue
        # Every kept link ends with its extension, so take the text after the
        # last dot ("jpeg" stays "jpeg" instead of being cut to "peg").
        imageName = image_path + safe_title + str(index) + '.' + photo.rsplit('.', 1)[1]
        if os.path.exists(imageName):
            print('图片已存在!')
            continue
        try:
            newresponse = session.get(photo, stream=True)
        except Exception:
            # One blind retry, same policy as for the article page.
            newresponse = session.get(photo, stream=True)
        with open(imageName, 'ab') as f:
            for data in newresponse.iter_content(chunk_size=1024 * 1024):
                f.write(data)
        print('imageName---->%s' % imageName)
        sql3 = "insert into bigdataphoto (url,image) values (%s,%s)on duplicate key update url = %s and image = %s;"
        escaped_name = pymysql.escape_string(imageName)
        params = [url, escaped_name, url, escaped_name]
        helper.execute(sql3, params)
    print('图片插入完成')
    datalist.append(item)


# 获取数据库对象
def Helper():
    """Build a MysqlHelper from the settings string returned by read().

    The string is expected to be the text form of a dict with the keys
    'host', 'port', 'user', 'pwd', 'db' and 'charset'.

    Returns:
        A MysqlHelper configured with those values; 'port' is converted
        to int.
    """
    import ast

    # literal_eval accepts only Python literals, so a tampered settings file
    # cannot run arbitrary code the way the former eval() call could.
    dic = ast.literal_eval(read())
    helper = MysqlHelper(host=dic['host'], port=int(dic['port']), user=dic['user'], passwd=dic['pwd'], db=dic['db'],
                         charset=dic['charset'])
    return helper


# 获取桌面路径
def get_desktop():
    """Return the current user's "Desktop" value from the Windows registry.

    Reads the "Desktop" entry of the Explorer "Shell Folders" key under
    HKEY_CURRENT_USER.

    Returns:
        The value stored under "Desktop" (the first element of the
        QueryValueEx result).
    """
    shell_folders = r'Software\Microsoft\Windows\CurrentVersion\Explorer\Shell Folders'
    # The with-block closes the registry handle, which was previously left open.
    with winreg.OpenKey(winreg.HKEY_CURRENT_USER, shell_folders) as key:
        return winreg.QueryValueEx(key, "Desktop")[0]


# Entry point when this file is run as a script: call getBigDataWorldData().
if __name__ == '__main__':
    getBigDataWorldData()
savedbsetting.py保存数据库:
import os


def saveDBSetting(host, port, user, pwd, db, charset):
    """Persist the database connection settings through write().

    The six arguments are packed into a dict keyed 'host', 'port', 'user',
    'pwd', 'db' and 'charset', and the str() form of that dict is stored.
    """
    settings = dict(host=host, port=port, user=user, pwd=pwd, db=db, charset=charset)
    serialized = str(settings)
    write(serialized)


def write(dict):
    """Overwrite 'db.text' in the working directory with the given text.

    Args:
        dict: the text to store. The parameter name shadows the builtin but
            is kept so existing callers are unaffected.
    """
    # The with-block closes the file even when write() raises.
    with open('db.text', 'w', encoding='utf_8_sig') as file:
        file.write(dict)


def read():
    """Return the first line of 'db.text' from the working directory.

    Returns:
        The first line of the file, including its trailing newline if the
        file has one; an empty string for an empty file.

    Raises:
        OSError: if 'db.text' cannot be opened.
    """
    # The with-block closes the file even when readline() raises.
    with open('db.text', 'r', encoding='utf_8_sig') as file:
        return file.readline()


def isDbs():
    """Return True when 'db.text' exists in the working directory, else False."""
    return os.path.exists('db.text')
MysqlHelper.py数据库连接类:
from click._compat import raw_input
from pymysql import *

"""封装mysql连接类"""


class MysqlHelper:
    """Small MySQL helper that opens a fresh connection for every statement.

    Each public method connects, runs one statement and closes the
    connection again, whether or not the statement succeeded.
    """

    def __init__(self, host, port, user, passwd, db, charset):
        """Store the connection parameters; no connection is opened here."""
        # Database server address
        self.host = host
        # Server port
        self.port = port
        # Database user name
        self.user = user
        # Database password
        self.passwd = passwd
        # Database name
        self.db = db
        # Connection character set
        self.charset = charset
        # Connection and cursor of the statement in progress; None otherwise,
        # so rollback() and close() are safe to call at any time.
        self.conn = None
        self.cursor = None

    def open(self):
        """Connect to the database and create a cursor on the connection."""
        self.conn = connect(host=self.host, port=self.port, user=self.user, passwd=self.passwd, db=self.db,
                            charset=self.charset)
        self.cursor = self.conn.cursor()

    def execute(self, sql, params=()):
        """Run one parameterized statement and commit it.

        Returns:
            None on success, otherwise the exception that was raised.
            Callers inspect this value, so errors are returned, not raised.
        """
        try:
            try:
                self.open()
                self.cursor.execute(sql, params)
                self.conn.commit()
            finally:
                # Close on failure too; the connection used to leak whenever
                # the statement raised.
                self.close()
        except Exception as e:
            return e

    def createDataBase(self, sql, params=()):
        """Run one statement on a connection opened without selecting a database.

        Errors are printed and swallowed; nothing is returned.
        """
        try:
            conn = connect(host=self.host, port=self.port, user=self.user, passwd=self.passwd,
                           charset=self.charset)
            try:
                cursor = conn.cursor()
                try:
                    cursor.execute(sql, params)
                    conn.commit()
                finally:
                    cursor.close()
            finally:
                conn.close()
        except Exception as e:
            print(e)

    def all(self, sql, params=()):
        """Run a query and return every row.

        Returns:
            The fetchall() result on success, otherwise the exception that
            was raised.
        """
        try:
            try:
                self.open()
                self.cursor.execute(sql, params)
                return self.cursor.fetchall()
            finally:
                self.close()
        except Exception as e:
            return e

    def single(self, sql, params=()):
        """Run a query and return its first row.

        Returns:
            The fetchone() result on success; on failure the exception is
            printed and None is returned.
        """
        try:
            try:
                self.open()
                self.cursor.execute(sql, params)
                return self.cursor.fetchone()
            finally:
                self.close()
        except Exception as e:
            print(e)

    def rollback(self):
        """Roll back the open connection, if there is one.

        Every statement method closes its connection before returning, so a
        call made after a failed execute() finds nothing open and does
        nothing instead of raising.
        """
        if self.conn is not None:
            self.conn.rollback()

    def close(self):
        """Close the cursor and the connection, if any, and forget them."""
        cursor, conn = self.cursor, self.conn
        self.cursor = None
        self.conn = None
        try:
            if cursor is not None:
                cursor.close()
        finally:
            if conn is not None:
                conn.close()


# Manual smoke test: insert one student and one subject typed in by the user,
# then print the students whose id is below 5.
if __name__ == '__main__':
    msh = MysqlHelper('localhost', 8080, 'root', '123', 'test', 'utf8')
    # Built-in input() is used so the prompts do not depend on click's
    # private _compat module.
    name = input('请输入学生姓名:')
    sbname = input('请输入科目名称:')
    sql = 'insert into students(name) values(%s)'
    sql1 = 'insert into subjects(sbname) values(%s)'
    sql2 = 'select id,name from students where id<5'
    msh.execute(sql, [name])
    msh.execute(sql1, [sbname])
    print(msh.all(sql2))

详细的源码请查看资源链接。

程序可能存在部分bug,欢迎交流指正。

你可能感兴趣的:(python)