简易小说爬虫(带有GUI界面)

简易小说爬虫(带有GUI界面)

效果:

简易小说爬虫(带有GUI界面)_第1张图片
简易小说爬虫(带有GUI界面)_第2张图片

特点:

  • 可实现简单的小说搜索功能
  • 可选择下载目录
  • 下载进度可视化
  • 多线程下载

代码部分:

Python 部分:

import random
from threading import Thread
from urllib.parse import quote

from PyQt5.QtCore import QThread, pyqtSignal, QFile, Qt
from PyQt5.QtGui import QIcon, QPalette, QBrush, QPixmap
from PyQt5.QtWidgets import QGridLayout, QLabel, QLineEdit, QPushButton, QListWidget, QProgressBar, QMessageBox, \
    QApplication, QFileDialog, QWidget
from bs4 import BeautifulSoup
import requests
import win # 引入qrc资源文件,代码在后面
from lxml import etree
import sys


def dataGet(url):
    """Download a page and return its source decoded as GBK.

    A User-Agent string is chosen at random from a built-in list and sent
    as a request header. The request is attempted up to four times, each
    attempt with a 4 second timeout.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as text, or None when every attempt raised a
        ``requests.exceptions.RequestException``.
    """
    user_agent_list = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
        'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0',
        'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    ]
    headers = {'User-Agent': random.choice(user_agent_list)}
    for _ in range(4):  # 4 attempts, 4 s timeout each
        try:
            # Pass the headers by keyword: the second positional parameter
            # of requests.get() is ``params`` (the query string), so a
            # positional dict would never reach the request headers.
            response = requests.get(url, headers=headers, timeout=4)
        except requests.exceptions.RequestException:
            continue
        response.encoding = 'gbk'
        return response.text
    # Every attempt failed; callers must be prepared for None.
    return None


def novelSearch(data):
    """Extract the novels listed on a search-result page.

    Every ``<li>`` element of the page is examined for the ``s1``-``s5``
    and ``s7`` spans. A row is used only when all seven fields are present,
    so rows that lack any of them (for example a header row) are skipped
    instead of raising ``IndexError``.

    Args:
        data: HTML source of the search-result page.

    Returns:
        A three-element list ``[names, infos, links]``. ``names`` and
        ``links`` are parallel lists of novel names and novel links, and
        each entry of ``infos`` is
        ``[category, name, link, latest chapter, author, update time, status]``.
    """
    soup = BeautifulSoup(data, features='lxml')
    novelList = []
    novelInfoList = []
    linkList = []
    for li in soup.find_all('li'):
        html = etree.HTML(str(li))
        fields = [
            html.xpath('//span[@class="s1"]/text()'),    # category
            html.xpath('//span[@class="s2"]/a/text()'),  # name
            html.xpath('//span[@class="s2"]/a/@href'),   # link
            html.xpath('//span[@class="s3"]/a/text()'),  # latest chapter
            html.xpath('//span[@class="s4"]/text()'),    # author
            html.xpath('//span[@class="s5"]/text()'),    # update time
            html.xpath('//span[@class="s7"]/text()'),    # status
        ]
        # Each xpath() call returns a list; an empty one means the field is
        # missing and indexing it below would fail.
        if not all(fields):
            continue
        info = [matches[0] for matches in fields]
        novelList.append(info[1])
        novelInfoList.append(info)
        linkList.append(info[2])
    return [novelList, novelInfoList, linkList]


def chapterGet(data):
    """Collect the chapters listed on a novel's table-of-contents page.

    Args:
        data: HTML source of the table-of-contents page.

    Returns:
        A list of ``[chapter name, chapter href]`` pairs taken from the
        ``<a>`` elements found under ``dl/dd``, in document order.
    """
    tree = etree.HTML(data)
    names = tree.xpath('//dl/dd/a/text()')
    links = tree.xpath('//dl/dd/a/@href')
    return [[name, link] for name, link in zip(names, links)]


def contentGet(data):
    """Extract the chapter title and body text from a chapter page.

    Args:
        data: HTML source of a chapter page.

    Returns:
        ``[title, content]``. ``title`` is the first text node of the
        ``<h1>`` inside ``div.bookname`` and ``content`` is the text of
        ``div#content``; either is an empty string when not found.
    """
    # NOTE(review): the two literals originally passed to str.replace()
    # were lost when this listing was published (they were rendered as raw
    # line breaks), so they are presumably <br> tag variants - confirm
    # against the original file.
    string = data
    for tag in ('<br />', '<br/>', '<br>'):
        string = string.replace(tag, '')
    html = etree.HTML(string)
    title = html.xpath('//div[@class="bookname"]/h1/text()')
    content = html.xpath('//div[@id="content"]/text()')
    # Join all text nodes rather than taking only the first one, so no text
    # is dropped if some other element still splits the content.
    return [title[0] if title else '', ''.join(content)]


def Del_line(file_path):
    """Remove blank (whitespace-only) lines from a UTF-8 text file in place.

    Args:
        file_path: Path of the file to rewrite.
    """
    with open(file_path, "r", encoding='utf-8') as f:
        kept = [line for line in f if line.split()]
    with open(file_path, "w", encoding='utf-8') as f:
        f.writelines(kept)


class WorkThread(Thread):
    """Thread that runs ``func(*args)`` and keeps its return value."""

    def __init__(self, func, args=()):
        super(WorkThread, self).__init__()
        self.func = func
        self.args = args
        # Stays None when the thread has not run or ``func`` raised.
        self.result = None

    def run(self):
        self.result = self.func(*self.args)

    def get_result(self):
        """Return the value produced by ``func``, or None if unavailable."""
        return self.result


class SearchThread(QThread):
    """Background thread that searches for a novel by name.

    Signals:
        _signal1(list): names of the novels found.
        _signal2(list): detail records of the novels found.
        _signal3(): emitted when nothing was found or the page could not
            be fetched.
    """
    _signal1 = pyqtSignal(list)
    _signal2 = pyqtSignal(list)
    _signal3 = pyqtSignal()

    def __init__(self):
        super(SearchThread, self).__init__()

    def __del__(self):
        self.wait()

    def set_name(self, string):
        """Set the search keyword used by the next run."""
        self.novelName = string

    def run(self):
        searchURL = 'https://www.52bqg.com/modules/article/search.php?searchkey=' + self.novelName
        url = quote(searchURL, safe=";/?:@&=+$,", encoding="gbk")
        data = dataGet(url)
        if data is None:
            # dataGet() gives up after its retries; report "no result"
            # instead of letting the parser fail on None.
            self._signal3.emit()
            return
        lists = novelSearch(data)
        if lists[0]:
            self._signal1.emit(lists[0])
            self._signal2.emit(lists[1])
        else:
            self._signal3.emit()


class DownThread(QThread):
    """Background thread that downloads every chapter of a novel.

    Signals:
        _signal1(int): progress value; 1000 is sent once the file has been
            written.
        _signal2(): emitted when the download has finished.
        _signal3(): emitted when the table of contents could not be
            fetched.
    """
    _signal1 = pyqtSignal(int)
    _signal2 = pyqtSignal()
    _signal3 = pyqtSignal()

    def __init__(self):
        super(DownThread, self).__init__()

    def __del__(self):
        self.wait()

    def set_link(self, string):
        """Set the URL of the novel's table-of-contents page."""
        self.link = string

    def set_name(self, string):
        """Set the novel name, used as the output file name."""
        self.name = string

    def set_path(self, string):
        """Set the directory the novel is saved to."""
        self.path = string

    def run(self):
        data = dataGet(self.link)
        if data is None:
            self._signal3.emit()
            return
        chapters = chapterGet(data)

        # One worker thread per chapter page.
        workers = [WorkThread(dataGet, args=(self.link + chapter[1],))
                   for chapter in chapters]
        for worker in workers:
            worker.start()

        # Join in chapter order so the pages stay ordered, reporting the
        # progress as each worker is waited for.
        pages = []
        total = len(workers)
        for position, worker in enumerate(workers):
            progress = int(100 * (position / total))
            worker.join()
            self._signal1.emit(progress)
            pages.append(worker.get_result())

        # A page is None when its download failed; such chapters are
        # skipped because there is nothing to parse.
        contents = [contentGet(page) for page in pages if page is not None]

        path = self.path + '/' + self.name + '.txt'
        with open(path, 'a', encoding='utf-8') as f:
            for title, text in contents:
                f.write(title + '\n')
                f.write(text + '\n')
        Del_line(path)
        self._signal1.emit(1000)
        self._signal2.emit()


class MainWin(QWidget):
    """Main window: search box, result list, target directory, progress."""

    def __init__(self):
        super(MainWin, self).__init__()
        # Per-instance state (kept off the class so lists are not shared).
        self.novelList = []      # names shown in the result list
        self.novelInfoList = []  # labelled detail lines, parallel to novelList
        self.linList = []        # novel links, parallel to novelList
        self.dir_path = ''       # chosen download directory
        self.novel_name = ''     # name of the novel chosen for download
        self.link = ''           # link of the novel chosen for download

        self.setWindowTitle("Downloader")
        self.setWindowIcon(QIcon(":/sources/images/icon.png"))
        self.setFixedSize(400, 600)
        self.layout = QGridLayout()
        self.layout.setSpacing(5)

        self.label = QLabel("小说名称:")
        self.layout.addWidget(self.label, 0, 0)
        self.input = QLineEdit()
        self.input.setPlaceholderText("输入小说名称")
        self.layout.addWidget(self.input, 0, 1, 1, 5)
        self.button = QPushButton("搜索")
        self.button.clicked.connect(self.search)
        self.layout.addWidget(self.button, 0, 6)

        self.label1 = QLabel("搜索结果:")
        self.layout.addWidget(self.label1, 1, 0)
        self.list = QListWidget()
        self.list.clicked.connect(self.choice)
        self.layout.addWidget(self.list, 2, 0, 1, 7)

        self.label2 = QLabel("下载地址:")
        self.label2.setObjectName("address")
        self.layout.addWidget(self.label2, 3, 0)
        self.input1 = QLineEdit()
        self.input1.setPlaceholderText("下载地址")
        self.input1.setFocusPolicy(Qt.NoFocus)
        self.layout.addWidget(self.input1, 3, 1, 1, 5)
        self.button1 = QPushButton("...")
        self.button1.clicked.connect(self.path)
        self.layout.addWidget(self.button1, 3, 6)

        self.button2 = QPushButton("开始下载")
        self.button2.clicked.connect(self.download)
        self.layout.addWidget(self.button2, 4, 0, 1, 7)

        self.label3 = QLabel("下载进度:")
        self.label3.setObjectName("pbar")
        self.layout.addWidget(self.label3, 5, 0)
        self.pbar = QProgressBar()
        self.layout.addWidget(self.pbar, 5, 1, 1, 6)
        self.setLayout(self.layout)

        self.thread = SearchThread()
        self.thread._signal1.connect(self.callback_1)
        self.thread._signal2.connect(self.callback_2)
        self.thread._signal3.connect(self.nothing)
        self.thread_1 = DownThread()
        self.thread_1._signal1.connect(self.pbarindex)
        self.thread_1._signal2.connect(self.finish)
        self.thread_1._signal3.connect(self.failed)

        self.palette = QPalette()
        self.palette.setBrush(QPalette.Background, QBrush(QPixmap(":/sources/images/bg.jpg")))
        self.setPalette(self.palette)
        self.show()

    def search(self):
        """Start a search for the keyword typed into the input box."""
        self.pbar.setValue(0)
        name = self.input.text()
        if name == '':
            QMessageBox.information(self, "提示", "请输入关键字", QMessageBox.Ok, QMessageBox.Ok)
            return
        self.thread.set_name(name)
        self.thread.start()

    def addItem(self):
        """Append every name in ``novelList`` to the result list widget."""
        for name in self.novelList:
            self.list.addItem(name)
            QApplication.processEvents()

    def callback_1(self, msg):
        """Replace the displayed results with the names in ``msg``."""
        self.list.clear()
        self.novelList = msg
        self.addItem()

    def callback_2(self, msg):
        """Store the links and labelled detail lines of the new results.

        Both lists are rebuilt on every call so their indices always match
        the rows of the current search.
        """
        labels = ('小说分类:', '小说名称:', '小说链接:', '最新章节:',
                  '小说作者:', '最近更新时间:', '更新状态:')
        self.linList = [info[2] for info in msg]
        self.novelInfoList = [
            [label + value for label, value in zip(labels, info)]
            for info in msg
        ]

    def nothing(self):
        """Tell the user that the search returned no result."""
        QMessageBox.information(self, "提示", "未搜索到任何结果", QMessageBox.Ok, QMessageBox.Ok)

    def choice(self, index):
        """Show the details of the clicked row and remember it on "Yes"."""
        r = index.row()
        string = '\n'.join(self.novelInfoList[r])
        box = QMessageBox.information(self, "详细信息", string, QMessageBox.No | QMessageBox.Yes, QMessageBox.Yes)
        if box == QMessageBox.Yes:
            self.novel_name = self.novelList[r]
            self.link = self.linList[r]

    def path(self):
        """Let the user pick the download directory."""
        self.dir_path = QFileDialog.getExistingDirectory(self, "choose directory", "D:\\")
        self.input1.setText(self.dir_path)

    def download(self):
        """Start downloading the chosen novel into the chosen directory."""
        if self.dir_path == '':
            QMessageBox.information(self, "提示", "未选择下载路径", QMessageBox.Ok, QMessageBox.Ok)
            return
        if self.link == '':
            # No novel has been confirmed in the detail dialog yet.
            QMessageBox.information(self, "提示", "未选择小说", QMessageBox.Ok, QMessageBox.Ok)
            return
        self.thread_1.set_link(self.link)
        self.thread_1.set_name(self.novel_name)
        self.thread_1.set_path(self.dir_path)
        self.thread_1.start()

    def pbarindex(self, msg):
        """Update the progress bar; the value 1000 means completed."""
        if msg == 1000:
            self.pbar.setValue(100)
            return
        self.pbar.setValue(msg)

    def finish(self):
        """Tell the user the download is complete and reset the bar."""
        QMessageBox.information(self, "提示", "下载完成", QMessageBox.Ok, QMessageBox.Ok)
        self.pbar.setValue(0)

    def failed(self):
        """Tell the user the download could not be started."""
        QMessageBox.information(self, "提示", "下载失败", QMessageBox.Ok, QMessageBox.Ok)
        self.pbar.setValue(0)


if __name__ == '__main__':
    app = QApplication(sys.argv)
    # Named ``window`` so the imported resource module ``win`` is not shadowed.
    window = MainWin()
    file = QFile(':/sources/qss/style.css')
    file.open(QFile.ReadOnly)
    qss = str(file.readAll(), encoding='utf-8')
    file.close()
    window.setStyleSheet(qss)
    sys.exit(app.exec_())

QSS 部分 (个人缺少艺术细菌):

/* Labels inside message boxes: black text. */
QMessageBox > QLabel {
    font-size: 17px;
    font-family: 微软雅黑;
    font-weight: bold;
    color: black;
}

/* Default labels: white text. */
QLabel {
    font-size: 17px;
    font-family: 微软雅黑;
    font-weight: bold;
    color: white;
}

/* Label with object name "address": black text. */
QLabel#address {
    font-size: 17px;
    font-family: 微软雅黑;
    font-weight: bold;
    color: black;
}

/* Label with object name "pbar": black text. */
QLabel#pbar {
    font-size: 17px;
    font-family: 微软雅黑;
    font-weight: bold;
    color: black;
}

/* Colours that carry an alpha value use rgba(); rgb() is the three-argument form. */
QLineEdit {
    background-color: rgba(255, 255, 255, 180);
    border: 2px groove gray;
    border-radius: 10px;
    padding: 2px 4px;
}

QLineEdit:hover {
    border: 2px groove gray;
    background-color: rgba(255, 255, 255, 180);
    border-radius: 10px;
    padding: 2px 4px;
}

QPushButton {
    min-width: 50px;
    background-color: rgba(255, 255, 255, 180);
    border: 2px groove gray;
    border-radius: 10px;
    padding: 2px 4px;
}

/* A single background-color: the earlier "gainsboro" declaration was always overridden by #007bff. */
QPushButton:hover {
    min-width: 50px;
    border: 2px groove #007bff;
    background-color: #007bff;
    border-radius: 10px;
    padding: 2px 4px;
}

QPushButton:pressed {
    min-width: 30px;
    border: 2px groove #007bff;
    background-color: azure;
    border-radius: 10px;
    padding: 2px 4px;
}

QListWidget {
    border: 2px groove gray;
    background-color: rgba(255, 255, 255, 180);
    border-radius: 10px;
    padding: 2px 4px;
}

QProgressBar {
    background-color: rgba(255, 255, 255, 180);
    max-height: 15px;
    border: 2px groove gray;
    border-radius: 10px;
    padding: 2px 4px;
}

win.py:
不行,太多了,等下附上文件吧。
点击下载
提取码:cj8q
附上我的目录结构:
简易小说爬虫(带有GUI界面)_第3张图片

你可能感兴趣的:(简易小说爬虫(带有GUI界面))