PyQt5界面多线程多进程爬虫(爬了600w张网页, 出现了一些问题)

还有两个问题没有解决: 一是将运行过程显示在右侧文本框时出现的光标(QTextCursor)报错(这个问题比较容易解决), 二是程序异常退出的问题.
错误代码:
QObject::connect: Cannot queue arguments of type ‘QTextCursor’
(Make sure ‘QTextCursor’ is registered using qRegisterMetaType().)

Process finished with exit code 139 (interrupted by signal 11: SIGSEGV)
原因: 经调试, 意外退出的原因应该是: 在多线程中使用 QTextBrowser 时, 把它当作参数传入了由线程调用的函数; 当某个线程运行结束并销毁时, 连带销毁了传入的 QTextBrowser, 而线程之间数据共享, 所以导致界面意外崩溃.
解决: 采用进程间的队列通信,使得前面的问题得到了解决.
PyQt5界面多线程多进程爬虫(爬了600w张网页, 出现了一些问题)_第1张图片
PyQt5界面多线程多进程爬虫(爬了600w张网页, 出现了一些问题)_第2张图片
这幅图存在上述问题.
代码:(测试过了, 仍有问题: 抓取数据时 CPU 占用率太高(进程/线程切换太耗系统资源); 内存占用太大(变量用完没有删除). 暂时没时间修改, 先存档, 以后有时间再改. 系统还有部分功能未完成, 比如: 数据库查找, 本地txt保存, 多线程多进程启动时QTextBrowser的数据显示问题. 代码本身存在的问题: 健壮性不足, 代码冗余, 链接获取的方式取巧了.)
主运行代码(main.py):

from uI import UserInterface
from moduleFunc import *
from PyQt5.QtWidgets import QApplication
import sys
# Logic dispatch area
def logicalArea(UI):
    """Run the crawl action selected by the three mode check boxes.

    The check boxes read are form rows 5 (save), 7 (multi-thread) and
    11 (multi-process).  Each state is echoed to the log pane before the
    matching function is called:

    ===== ======= ======== ==========================================
    save  threads processes action
    ===== ======= ======== ==========================================
    yes   yes     yes      getAllNovelChapterInfo(..., mulSaveFlag=True)
    yes   yes     no       saveNovelChapterInfo
    yes   no      yes      saveAllNovelInfo
    yes   no      no       saveSimgleArt
    no    yes     yes      getAllNovelChapterInfo
    no    yes     no       getNovelChapterInfo
    no    no      yes      getAllNovelInfo
    no    no      no       simgleArt
    ===== ======= ======== ==========================================

    Args:
        UI: the main window; provides ``uIControl`` and the ``editA`` log pane.
    """
    # Despite its name this is the database handle returned by pymongoInit.
    dbName = pymongoInit(UI)
    UI.editA.clear()
    UI.editA.append('欢迎来到逻辑运行区!')

    save = UI.uIControl[5][1].isChecked()
    multiThread = UI.uIControl[7][1].isChecked()
    multiProcess = UI.uIControl[11][1].isChecked()

    UI.editA.append('逻辑保存' if save else '逻辑只显示')
    UI.editA.append('逻辑开启多线程' if multiThread else '逻辑未开多线程')
    UI.editA.append('逻辑开启多进程' if multiProcess else '逻辑未开多进程')

    if save:
        if multiThread and multiProcess:
            # Save every novel of the site.  The keyword must be mulSaveFlag:
            # getAllNovelChapterInfo has no ``saveFlag`` parameter, so the old
            # call raised TypeError.
            getAllNovelChapterInfo(UI, mulSaveFlag=True, dbName=dbName)
        elif multiThread:
            # Fetch one novel's chapter index and save all its chapters.
            saveNovelChapterInfo(UI, dbName)
        elif multiProcess:
            # Show and save the list of every novel on the site.
            saveAllNovelInfo(UI, dbName)
        else:
            # Show one chapter and save it to the chosen collection.
            saveSimgleArt(UI, dbName)
    else:
        if multiThread and multiProcess:
            # Show every novel of the site with its chapter count.
            getAllNovelChapterInfo(UI)
        elif multiThread:
            # Show the chapter index of one novel.
            getNovelChapterInfo(UI=UI)
        elif multiProcess:
            # Show the name and URL of every novel on the site.
            getAllNovelInfo(UI)
        else:
            # Show a single chapter.
            simgleArt(UI)
def initData(UI):
    """Pre-fill the form with default values for crawling www.xbiquge.la.

    NOTE(review): in the published listing the four regular expressions below
    lost their HTML tags (the page renderer stripped them), leaving string
    literals broken across lines.  The patterns here are reconstructions
    inferred from how moduleFunc consumes the matches (number of capture
    groups, quoting style); confirm them against the target site before use.

    Args:
        UI: the main window whose ``uIControl`` rows are filled in.
    """
    UI.uIControl[0][1].setText('http://www.xbiquge.la/15/15409/')
    # simglePage regex: one group, the chapter body (getPage takes match [0]).
    UI.uIControl[3][1].setText('<div id="content">(.*?)<p>.*?</p>*.?</div>')
    # Name regex: one group, the page title.
    UI.uIControl[6][1].setText('<h1>(.*?)</h1>')
    # mulPage regex: two groups consumed as (link, title) per chapter.
    UI.uIControl[8][1].setText("<dd><a href='(.*?)' >(.*?)</a></dd>")
    UI.uIControl[10][1].setText(str(16))
    UI.uIControl[12][1].setText(str(4))
    # root regex: two groups consumed as (link, title) per novel.
    UI.uIControl[13][1].setText('<li><a href="(.*?)">(.*?)</a></li>')
    UI.uIControl[2][1].setText('道君')
    UI.uIControl[14][1].setText('biquge')


def main():
    """Create the application and window, wire the start button, run the loop."""
    app = QApplication(sys.argv)
    UI = UserInterface()
    initData(UI)
    # Row 4 holds the "start" push button.
    UI.uIControl[4][1].clicked.connect(lambda: logicalArea(UI))
    sys.exit(app.exec_())


if __name__ == '__main__':
    main()

    自定义函数部分(moduleFunc.py):

    import pymongo, requests, re, os
    from PyQt5.QtWidgets import QMessageBox
    from threading import Thread
    from multiprocessing import Process, Pool
    # Open the database connection
    def pymongoInit(UI):
        """Return the MongoDB database named in form row 14.

        The client targets localhost:27017 and is created with
        ``connect=False``.

        Args:
            UI: the main window; row 14 of ``uIControl`` holds the name.

        Returns:
            The database object obtained by indexing the client with that name.
        """
        mongo = pymongo.MongoClient('localhost', 27017, connect=False)
        return mongo[UI.uIControl[14][1].text()]
    # Store one record in a collection (done)
    def pymongeUser(UI=None, biquge=None, name=None, data=None, mulSaveFlag=False):
        """Insert ``data`` into collection ``name`` unless its title is stored.

        When the title already exists and ``data['content']`` is a list, the
        items missing from each stored list-type document are inserted as a
        new document; otherwise the record is reported as already present.

        Progress is reported through ``UI.queue`` as plain text, or printed
        with ANSI colours when ``mulSaveFlag`` is true.

        Args:
            UI: the main window; only ``UI.queue`` is used.
            biquge: database object, indexed by collection name.
            name: collection name.
            data: dict with at least ``'title'`` and ``'content'``.
            mulSaveFlag: print instead of queueing the progress text.

        Returns:
            0 when the collection cannot be obtained, otherwise None.
        """
        def report(plain, colored):
            # One reporting rule for every branch: the queue feeds the log
            # pane (plain text), the console gets the coloured variant.
            if mulSaveFlag:
                print(colored)
            else:
                UI.queue.put(plain)

        try:
            table = biquge[name]
        except Exception:
            report('创建数据库{}失败'.format(name),
                   '\033[31;0m创建数据库{}失败\033[0m'.format(name))
            return 0

        # Duplicate check by query rather than by loading the whole collection
        # into memory on every insert.
        if table.find_one({'title': data['title']}) is None:
            table.insert_one(data)
            report('{0} {1} {2}'.format(name, data['title'], '数据存入成功'),
                   '\033[32;0m{0} {1} {2}\033[0m'.format(name, data['title'], '数据存入成功'))
            return

        if not isinstance(data['content'], list):
            # Previously this branch always queued ANSI-coloured text, even
            # for the GUI log pane.
            report('{0} {1} {2}'.format(name, data['title'], '数据已在数据库'),
                   '\033[33;0m{}\033[0m'.format(name)
                   + '\033[33;0m{}\033[0m'.format(data['title'])
                   + '\033[33;0m数据已在数据库\033[0m')
            return

        # The incoming content is a list: compare it with every stored
        # document whose content is also a list.
        for obj in table.find():
            if not isinstance(obj.get('content'), list):
                continue
            try:
                # Stored items come back as lists, incoming items may be
                # tuples, hence the list() conversion before comparing.
                newlyAdd = [item for item in data['content'] if list(item) not in obj['content']]
            except Exception:
                print('作者修改了一部分文章')
                continue
            if newlyAdd:
                data = {'title': data['title'], 'content': newlyAdd}
                table.insert_one(data)
            else:
                report('{}'.format(name) + '数据库链接更新已完成',
                       '\033[33;0m{}\033[0m'.format(name) + '\033[32;0m数据库链接更新已完成\033[0m')
    # Request settings shared by every page fetch below.
    _HEADERS = {'User-Agent': 'Opera/8.0 (Windows NT 5.1; U; en)', }
    # Seconds before an unanswered request is abandoned; without a timeout a
    # stalled server blocks its worker forever.
    _REQUEST_TIMEOUT = 30


    def _warn(UI, message):
        """Show a warning box on the GUI thread; elsewhere queue the text.

        The crawl functions run both from the GUI (main thread of the main
        process, where main() creates the QApplication) and from worker
        threads/processes.  Opening a QMessageBox outside the GUI thread is
        what crashes the interface, so workers report through ``UI.queue``.
        """
        import multiprocessing
        import threading
        onGuiThread = (threading.current_thread() is threading.main_thread()
                       and multiprocessing.current_process().name == 'MainProcess')
        if onGuiThread:
            QMessageBox.warning(UI, '警告', message)
        else:
            UI.queue.put('警告: ' + message)


    def _workerCount(UI):
        """Read the worker count from form row 12; warn and return 0 if invalid."""
        # NOTE(review): the thread fan-out also reads row 12 (labelled as the
        # process count) rather than row 10 (thread count) - confirm intent.
        raw = UI.uIControl[12][1].text()
        if raw.replace(' ', '') == '':
            _warn(UI, '进程数未填入')
            return 0
        try:
            count = int(raw)
        except ValueError:
            count = 0
        if count <= 0:
            _warn(UI, '进程数填写错误')
            return 0
        return count


    def _split(links, parts):
        """Cut ``links`` into ``parts`` slices; the last takes the remainder."""
        size = len(links) // parts
        return [links[i * size: (i + 1) * size] if i < parts - 1 else links[i * size:]
                for i in range(parts)]


    def _stripAds(text):
        """Remove the site's advertising phrases from a chapter body."""
        return text.replace('小哥哥小姐姐们,推荐,收藏有木有', '').replace(' (三七中文 et)', '')


    # Fetch one chapter; return it without displaying (done)
    def getPage(UI=None, artUrl=None):
        """Download one chapter page and extract its title and body.

        Args:
            UI: the main window; rows 6 and 3 hold the title/body regexes.
            artUrl: chapter URL; its last path segment must start with the
                numeric chapter id (``int()`` is applied to it).

        Returns:
            ``{'title', 'content', 'id', 'link'}`` or 0 on any failure.
        """
        try:
            res = requests.get(artUrl, headers=_HEADERS, timeout=_REQUEST_TIMEOUT)
        except requests.RequestException:
            UI.queue.put('获取文章错误' + '{}'.format(artUrl))
            return 0
        res.encoding = res.apparent_encoding
        try:
            title = re.findall(UI.uIControl[6][1].text(), res.text, re.S)[0]
        except (IndexError, re.error):
            _warn(UI, 'simglePage正则输入或网址错误, 请检查!')
            return 0
        try:
            content = re.findall(UI.uIControl[3][1].text(), res.text, re.M)[0]
            # NOTE(review): the second search string was destroyed in the
            # published listing (rendered as a line break); '<br />' is a
            # reconstruction - confirm against the site's markup.
            content = content.replace(' ', '  ').replace('<br />', '\n')
        except (IndexError, AttributeError, re.error):
            _warn(UI, 'pageName正则或网址错误, 请检查!')
            return 0
        return {'title': title, 'content': content,
                'id': int(artUrl.split('/')[-1].split('.')[0]), 'link': artUrl}


    # Save the chapters assigned to one thread/process
    def saveMulArt(UI, dbName, artUrls, name, mulSaveFlag=False):
        """Save every chapter URL in ``artUrls`` into collection ``name``."""
        for link in artUrls:
            saveSimgleArt(UI, dbName, link, name, mulSaveFlag)


    # Fetch a novel's chapter index and save the whole novel (done)
    def saveNovelChapterInfo(UI, dbName, novUrl=None, mulSaveFlag=False):
        """Store a novel's chapter index, then save its chapters with threads.

        The started threads are appended to ``UI.queueExist``.

        Returns:
            0 on failure, otherwise None.
        """
        info = getNovelChapterInfo(UI=UI, novUrl=novUrl)
        if info == 0:
            return 0
        pymongeUser(UI, dbName, info['title'], info, mulSaveFlag)
        threadNum = _workerCount(UI)
        if threadNum == 0:
            return 0
        links = [link for link, title in info['content']]
        for part in _split(links, threadNum):
            th = Thread(target=saveMulArt,
                        args=(UI, dbName, part, info['title'], mulSaveFlag))
            th.start()
            UI.queueExist.append(th)


    # Fetch every novel's title and URL and save them (done)
    def saveAllNovelInfo(UI, dbName, rootUrl=None):
        """Fetch the site-wide novel list and store it as one record."""
        info = getAllNovelInfo(UI, rootUrl)
        if info == 0:
            return 0
        pymongeUser(UI, dbName, info['title'], info)


    # Fetch one chapter and save it (done)
    def saveSimgleArt(UI, dbName, artUrl=None, name=None, mulSaveFlag=False):
        """Fetch one chapter and store it in collection ``name``.

        ``artUrl`` and ``name`` default to form rows 0 and 2.

        Returns:
            0 on failure or when no collection name is available, else None.
        """
        if artUrl is None:
            artUrl = UI.uIControl[0][1].text()
        if artUrl.replace(' ', '') == '':
            _warn(UI, '网址未输入')
            return 0
        info = simgleArt(UI, artUrl)
        if info == 0:
            return 0
        if name is None:
            name = UI.uIControl[2][1].text()
        if name.replace(' ', '') == '':
            return 0
        pymongeUser(UI, dbName, name, info, mulSaveFlag)


    # Show every novel of a site with per-novel details; with mulSaveFlag=True
    # this saves the whole site instead (work in progress)
    def getAllNovelChapterInfo(UI, rootUrl=None, mulSaveFlag=False, dbName=None):
        """Fan the site's novel list out over worker processes.

        The started processes are appended to ``UI.queueProce``.

        NOTE(review): ``UI`` is handed to each Process as an argument;
        presumably this relies on a fork-style start method - verify on the
        target platform.

        Returns:
            0 on failure, otherwise None.
        """
        info = getAllNovelInfo(UI, rootUrl)
        if info == 0:
            return 0
        mulName = _workerCount(UI)
        if mulName == 0:
            return 0
        regName = UI.uIControl[6][1].text()
        regCont = UI.uIControl[8][1].text()
        if regName.replace(' ', '') == '' or regCont.replace(' ', '') == '':
            _warn(UI, '多进程名称和内容填入错误')
            return 0
        links = [link for link, title in info['content']]
        for part in _split(links, mulName):
            pro = Process(target=getNovelChapterInfo_raw,
                          args=(UI, part, mulSaveFlag, dbName))
            pro.start()
            UI.queueProce.append(pro)


    # mulSaveFlag=False: report each novel's info (done)
    # mulSaveFlag=True: save every novel of the site (work in progress)
    def getNovelChapterInfo_raw(UI, linksList, mulSaveFlag=False, dbName=None):
        """Process worker: report or save each novel in ``linksList``."""
        for link in linksList:
            if mulSaveFlag:
                saveNovelChapterInfo(UI, dbName, novUrl=link, mulSaveFlag=mulSaveFlag)
                continue
            data = getNovelChapterInfo(UI, link, mulProcess=True)
            if data == 0:
                # A failed fetch returns the 0 sentinel, which cannot be
                # indexed; skip it instead of killing the worker.
                continue
            UI.queue.put('小说信息: {0} 共{1}章'.format(data['title'], len(data['content'])))


    # Fetch the chapter index of one novel (done)
    def getNovelChapterInfo(UI=None, novUrl=None, mulProcess=False):
        """Download a novel's index page and build its chapter list.

        Args:
            UI: the main window; rows 6 and 8 hold the title and chapter
                regexes, row 9 the "links repeat the path" check box.
            novUrl: index URL; defaults to form row 0.
            mulProcess: when true, skip queueing the chapter listing.

        Returns:
            ``{'title', 'content', 'link'}`` where content is a list of
            ``(url, chapter title)`` pairs, or 0 on failure.
        """
        if novUrl is None:
            novUrl = UI.uIControl[0][1].text()
        if novUrl.replace(' ', '') == '':
            _warn(UI, '请填写一部小说目录url')
            return 0
        try:
            res = requests.get(novUrl, headers=_HEADERS, timeout=_REQUEST_TIMEOUT)
        except requests.RequestException:
            _warn(UI, '获取整部小说的url失败')
            return 0
        res.encoding = res.apparent_encoding
        try:
            title = re.findall(UI.uIControl[6][1].text(), res.text, re.S)[0]
        except (IndexError, re.error):
            _warn(UI, '获取小说的名称失败')
            return 0
        try:
            content = re.findall(UI.uIControl[8][1].text(), res.text, re.S)
            # The pattern must capture exactly (link, title) per chapter.
            rawLinks = [link for link, _ in content]
            tempTitles = [chapter for _, chapter in content]
        except (ValueError, re.error):
            _warn(UI, '获取小说的章节目录失败')
            return 0
        if UI.uIControl[9][1].isChecked():
            # The hrefs repeat path segments already present in novUrl:
            # drop the shared prefix before appending them to novUrl.
            urlSplit = novUrl.split('/')
            linksPart = ['/'.join([value for value in link.split('/') if value in urlSplit])
                         for link in rawLinks]
            links = [novUrl + link.replace(part + '/', '')
                     for link, part in zip(rawLinks, linksPart)]
        else:
            links = [novUrl + link for link in rawLinks]
        # Normalise the scheme and collapse accidental double slashes.
        links = ['http://' + link.replace('http://', '').replace('//', '/') for link in links]
        content = list(zip(links, tempTitles))
        if not mulProcess:
            UI.queue.put('{0}'.format('\n'.join([chapter + '\n' + link for link, chapter in content])))
            UI.queue.put('整篇小说信息获取成功! {0} 共{1}章'.format(title, len(content)))
        return {'title': title, 'content': content, 'link': novUrl}


    # Show the title and URL of every novel (done)
    def getAllNovelInfo(UI, rootUrl=None):
        """Download the site index and list every novel found on it.

        Args:
            UI: the main window; row 13 holds the list regex, row 14 the
                site name used as the record title.
            rootUrl: index URL; defaults to form row 0.

        Returns:
            ``{'title': site name, 'content': [(link, title), ...]}`` or 0.
        """
        if rootUrl is None:
            rootUrl = UI.uIControl[0][1].text()
        if rootUrl.replace(' ', '') == '':
            _warn(UI, '请填写网站的总目录url')
            return 0
        try:
            res = requests.get(rootUrl, headers=_HEADERS, timeout=_REQUEST_TIMEOUT)
        except requests.RequestException:
            _warn(UI, '提取网站小说总目录错误' + rootUrl)
            return 0
        res.encoding = res.apparent_encoding
        try:
            info = re.findall(UI.uIControl[13][1].text(), res.text, re.S)
            # Keep only links with more than five '/'-separated parts.
            info = [(link, title) for link, title in info if len(link.split('/')) > 5]
        except (ValueError, re.error):
            _warn(UI, '提取网站所有的小说链接错误')
            return 0
        saveName = UI.uIControl[14][1].text()
        if saveName.replace(' ', '') == '':
            _warn(UI, '网站名称未填入')
            return 0
        UI.queue.put('\n'.join([' '.join(pair) for pair in info]))
        try:
            links, novelNames = zip(*info)
        except ValueError:
            # Nothing matched: zip(*[]) yields nothing to unpack.
            _warn(UI, '您确认该网页的符合要求?')
            return 0
        UI.queue.put('成功找到小说{}部'.format(str(len(links))))
        return {'title': saveName, 'content': info}


    # Fetch one chapter (done)
    def simgleArt(UI=None, artUrl=None):
        """Fetch one chapter; display it when the URL comes from the form.

        With ``artUrl`` omitted the URL is read from form row 0, the log pane
        is cleared and the title and body are queued for display.  With an
        explicit ``artUrl`` the chapter is only returned.

        Returns:
            ``{'title', 'content', 'id', 'link'}`` or 0 on failure.
        """
        showResult = artUrl is None
        if showResult:
            artUrl = UI.uIControl[0][1].text()
            if artUrl.replace(' ', '') == '':
                _warn(UI, '网址未输入')
                return 0
        article = getPage(UI, artUrl)
        if article == 0:
            return 0
        if showResult:
            UI.editA.clear()
            UI.queue.put('{}'.format(article['title']))
        article['content'] = _stripAds(article['content'])
        if showResult:
            UI.queue.put(article['content'])
        return {'title': article['title'], 'content': article['content'],
                'id': article['id'], 'link': article['link']}

    UI部分代码(uI.py):

    import os, sys, functools
    #from queue import Queue
    from PyQt5.QtWidgets import (QWidget, QSplitter, QFrame, QFormLayout,
                                 QLabel, QLineEdit, QCheckBox, QPushButton,
                                 QVBoxLayout, QTextBrowser, QDesktopWidget,
                                 QApplication, QFileDialog, QMessageBox)
    from PyQt5.QtCore import Qt, QTimer
    from multiprocessing import Pipe, Queue
    class UserInterface(QWidget):
        """Main crawler window: a settings form on the left, a log pane on the right.

        ``uIControl`` is a tuple of ``(label, widget)`` rows; other modules
        address the widgets by row index.  Text produced by the crawl
        functions arrives through ``queue`` and is copied into the log pane
        ``editA`` by a one-second timer.
        """

        # Upper bound on messages copied to the log pane per timer tick, so a
        # large backlog cannot freeze the window.
        _MAX_MESSAGES_PER_TICK = 200

        def __init__(self, parent=None):
            super(UserInterface, self).__init__(parent)
            self.setWindowTitle('爬虫UI主窗口')
            self.centralScreen()
            self.uIDesign()
            self.operationInit()
            self.initTimer()
            self.show()

        def uIDesign(self):
            """Build the horizontal splitter holding both frames."""
            spliter = QSplitter(Qt.Horizontal, self)
            spliter.resize(self.size())
            self.framea = QFrame()
            self.uIDesignL()
            self.frameb = QFrame()
            self.uIDesignR()
            spliter.addWidget(self.framea)
            spliter.addWidget(self.frameb)

        # Left-hand UI
        def uIDesignL(self):
            """Build the settings form and connect its widget signals."""
            self.framea.setStyleSheet('background-color: #D1EEEE')
            # Integer division: resize() takes ints, and recent PyQt5/Python
            # combinations reject float arguments.
            self.framea.resize(self.size().width() // 2, self.size().height() // 2)
            form = QFormLayout(self.framea)
            self.uIControl = (
                (QLabel('网址:'), QLineEdit()),                    # 0
                (QLabel('保存'), QPushButton('本地保存路径')),       # 1
                (QLabel('小说名称:'), QLineEdit()),                 # 2
                (QLabel('simglePage正则:'), QLineEdit()),          # 3
                (QLabel(), QPushButton('开始执行')),                # 4
                (QLabel('本地保存?'), QCheckBox('是')),             # 5
                (QLabel('Name正则:'), QLineEdit()),                # 6
                (QLabel('开启多线程?'), QCheckBox('是')),            # 7
                (QLabel('mulPage正则:'), QLineEdit()),             # 8
                (QLabel('链接重复:'), QCheckBox('是')),             # 9
                (QLabel('多线程数:'), QLineEdit()),                 # 10
                (QLabel('开启多进程?'), QCheckBox('是')),            # 11
                (QLabel('多进程数:'), QLineEdit()),                 # 12
                (QLabel('root正则'), QLineEdit()),                 # 13
                (QLabel('网站名:'), QLineEdit()),                   # 14
            )
            for row in self.uIControl:
                form.addRow(*row)
            # Widget events
            self.uIControl[1][1].clicked.connect(self.savePath)
            self.uIControl[5][1].stateChanged.connect(self.checkSaveFile)
            self.uIControl[7][1].stateChanged.connect(self.checkMulThread)
            self.uIControl[11][1].stateChanged.connect(self.checkMulProcss)

        # Save location
        def savePath(self):
            """Ask for a save location, create it as a directory, show it on the button."""
            path, _ = QFileDialog.getSaveFileName(self, '保存', '', 'FilterFile (*)')
            if path:
                # exist_ok: choosing an existing directory must not raise.
                os.makedirs(path, exist_ok=True)
                self.uIControl[1][1].setText(path)

        # Right-hand UI
        def uIDesignR(self):
            """Build the log pane."""
            self.frameb.setStyleSheet('background-color: #F0F8FF')
            self.frameb.resize(self.size().width() // 2, self.size().height() // 2)
            layout = QVBoxLayout(self.frameb)
            layout.setContentsMargins(0, 0, 5, 0)
            label = QLabel('运行过程')
            self.editA = QTextBrowser()
            # Kept for compatibility; not read anywhere in this class.
            self.cursor = self.editA.textCursor()
            self.editA.setStyleSheet('background-color: white')
            layout.addWidget(label)
            layout.addWidget(self.editA)

        # Notification message
        def noticeMes(self, message):
            """Show ``message`` in an information box."""
            QMessageBox.information(self, '通知', message)

        # Initial widget state
        def operationInit(self):
            """Whiten the always-editable fields and disable the optional ones."""
            for index in (0, 3, 6, 14):
                self.uIControl[index][1].setStyleSheet('background-color: white')
            for lock in [1, 2, 8, 10, 12, 13]:
                self.uIControl[lock][1].setEnabled(False)

        # Local-save toggle
        def checkSaveFile(self):
            """Enable or disable the save-path button and novel-name field."""
            if self.uIControl[5][1].isChecked():
                # The novel name is only editable in single-chapter mode.
                if not self.uIControl[7][1].isChecked() and not self.uIControl[11][1].isChecked():
                    self.uIControl[2][1].setStyleSheet('background-color: white')
                    self.uIControl[2][1].setEnabled(True)
                    self.uIControl[1][1].setEnabled(True)
                    self.editA.append('开启本地保存')
            else:
                for lock in [1, 2]:
                    self.uIControl[lock][1].setEnabled(False)
                self.editA.append('关闭本地保存')
                self.uIControl[1][1].setText('保存')
                self.uIControl[2][1].setStyleSheet('background-color: #D1EEEE')

        # Multi-thread toggle
        def checkMulThread(self):
            """Enable or disable the chapter-regex and thread-count fields."""
            if self.uIControl[7][1].isChecked():
                self.uIControl[2][1].setEnabled(False)
                self.uIControl[2][1].setStyleSheet('background-color: #D1EEEE')
                for unlock in [8, 10]:
                    self.uIControl[unlock][1].setEnabled(True)
                    self.uIControl[unlock][1].setStyleSheet('background-color: white')
                self.editA.append('开启多线程')
            else:
                if not self.uIControl[11][1].isChecked() and self.uIControl[5][1].isChecked():
                    self.uIControl[2][1].setEnabled(True)
                    self.uIControl[2][1].setStyleSheet('background-color: white')
                for lock in [8, 10]:
                    self.uIControl[lock][1].setEnabled(False)
                    self.uIControl[lock][1].setStyleSheet('background-color: #D1EEEE')
                self.editA.append('关闭多线程')

        # Multi-process toggle
        def checkMulProcss(self):
            """Enable or disable the process-count and root-regex fields."""
            if self.uIControl[11][1].isChecked():
                self.uIControl[2][1].setEnabled(False)
                self.uIControl[2][1].setStyleSheet('background-color: #D1EEEE')
                for unlock in [12, 13]:
                    self.uIControl[unlock][1].setEnabled(True)
                    self.uIControl[unlock][1].setStyleSheet('background-color: white')
                self.editA.append('开启多进程')
            else:
                if self.uIControl[5][1].isChecked() and not self.uIControl[7][1].isChecked():
                    self.uIControl[2][1].setEnabled(True)
                    self.uIControl[2][1].setStyleSheet('background-color: white')
                for lock in [12, 13]:
                    self.uIControl[lock][1].setEnabled(False)
                    self.uIControl[lock][1].setStyleSheet('background-color: #D1EEEE')
                self.editA.append('关闭多进程')

        # Periodic refresh of the QTextBrowser
        def initTimer(self):
            """Create the message queues, worker registries and refresh timer."""
            # Queue read by timeCustomEvent
            self.queue = Queue()
            # Queue reserved for process output (not consumed yet)
            self.queuep = Queue()
            # Threads started by the novel-download feature
            self.queueExist = []
            # Processes started by the site-download feature
            self.queueProce = []
            self.time = QTimer(self)
            self.time.timeout.connect(self.timeCustomEvent)
            self.time.start(1000)

        # Timer slot refreshing the QTextBrowser
        def timeCustomEvent(self):
            """Copy pending queue messages to the log pane.

            Does nothing while the multi-process box is checked.  When saving
            with threads, announces the finished novel once all registered
            threads have ended and the queue has been emptied.
            """
            if self.uIControl[11][1].isChecked():
                return
            outPut = None
            shown = 0
            # Drain a bounded batch per tick; one message per second cannot
            # keep up with the workers and lets the queue grow without limit.
            while shown < self._MAX_MESSAGES_PER_TICK and not self.queue.empty():
                outPut = self.queue.get()
                self.editA.append(outPut)
                shown += 1
            if outPut is None:
                return
            savingWithThreads = (self.uIControl[5][1].isChecked()
                                 and self.uIControl[7][1].isChecked())
            finished = (self.queueExist
                        and not any(th.is_alive() for th in self.queueExist)
                        and self.queue.empty())
            if savingWithThreads and finished:
                # The name is the first word after a '>' when the message has
                # one, otherwise the first word of the message; indexing
                # split('>')[1] unconditionally raised IndexError.
                pieces = outPut.split('>')
                text = pieces[1] if len(pieces) > 1 else outPut
                novelName = text.split(' ')[0]
                self.editA.append('{} 小说下载完毕'.format(novelName))
                # Forget the finished threads so the line is shown only once.
                self.queueExist = []
            # NOTE: output of worker processes (self.queuep / self.queueProce)
            # is not displayed yet; the original only had a disabled draft.

        # Centre the window on the screen
        def centralScreen(self):
            """Move the window to the middle of the desktop."""
            screen = QDesktopWidget().geometry()
            size = self.size()
            # move() takes ints, hence integer division.
            self.move((screen.width() - size.width()) // 2,
                      (screen.height() - size.height()) // 2)

        # Close confirmation
        def closeEvent(self, e):
            """Ask for confirmation before the window closes."""
            reply = QMessageBox.question(self, '确认', '您确认要关闭吗?',
                                         QMessageBox.Yes | QMessageBox.No, QMessageBox.No)
            if reply == QMessageBox.Yes:
                e.accept()
            else:
                e.ignore()
    # Stand-alone preview of the window when uI.py is run directly.
    if __name__ == '__main__':
        application = QApplication(sys.argv)
        # Keep a reference so the window is not garbage-collected.
        window = UserInterface()
        sys.exit(application.exec_())
    

    图片:
    , PyQt5界面多线程多进程爬虫(爬了600w张网页, 出现了一些问题)_第3张图片
    PyQt5界面多线程多进程爬虫(爬了600w张网页, 出现了一些问题)_第4张图片
    PyQt5界面多线程多进程爬虫(爬了600w张网页, 出现了一些问题)_第5张图片
    PyQt5界面多线程多进程爬虫(爬了600w张网页, 出现了一些问题)_第6张图片
    PyQt5界面多线程多进程爬虫(爬了600w张网页, 出现了一些问题)_第7张图片
    最初思路的图片:
    PyQt5界面多线程多进程爬虫(爬了600w张网页, 出现了一些问题)_第8张图片
    PyQt5界面多线程多进程爬虫(爬了600w张网页, 出现了一些问题)_第9张图片

    你可能感兴趣的:(Python)