使用 QWebPage 来下载资源

#coding=utf-8

__author__ = 'ds'

from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtNetwork import *
from PyQt4.QtWebKit import *
import thread
import threading
import os,sys

# Root directory for downloaded images; NetworkManager.onFinish joins the
# per-image sub-directory and file name onto this path.
DEFAULT_DIR = ur'F:\图片\美女'

# Custom network access manager: picks the wanted image replies out of the
# traffic it creates and saves them to disk.
class NetworkManager(QNetworkAccessManager):
    """Network access manager that saves matching image replies to disk.

    Every reply created through this manager with a watched scheme is
    observed.  While a reply whose URL matches ``_IMAGE_URL_PATTERN`` is
    downloading, its data is copied (with ``peek``, so the data stays
    available to whoever else reads the reply) into a per-URL buffer.  When
    the reply finishes, the buffer is written below ``DEFAULT_DIR``.
    """

    # URL fragment identifying the images to keep: "<digits>/<digits>.jpg".
    _IMAGE_URL_PATTERN = u'\\d+/\\d+\\d+\\.jpg'
    # Only replies using one of these URL schemes are observed.
    _WATCHED_SCHEMES = (u'http', u'https')

    def __init__(self, parent = None):
        super(NetworkManager, self).__init__(parent)
        # Maps URL text (unicode) -> QByteArray with the bytes seen so far.
        self._buffer = {}
        # Built once instead of on every readyRead signal.
        self._image_rx = QRegExp(self._IMAGE_URL_PATTERN)

    def createRequest(self, QNetworkAccessManager_Operation, QNetworkRequest, QIODevice_device=None):
        """Create the reply as the base class does and start observing it.

        The parameter names are kept as in the original override; inside
        this method they shadow the Qt classes of the same name.
        """
        reply = super(NetworkManager, self).createRequest(QNetworkAccessManager_Operation, QNetworkRequest, QIODevice_device)

        if unicode(reply.url().scheme()).lower() in self._WATCHED_SCHEMES:
            reply.readyRead.connect(self.onReadReady)
            reply.finished.connect(self.onFinish)

        return reply

    def onReadReady(self):
        """Copy the bytes currently available on a matching reply."""
        reply = self.sender()
        url = reply.url().toString()
        if self._image_rx.indexIn(url) == -1:
            return

        available = reply.bytesAvailable()
        if available <= 0:
            return

        # Replies for different files arrive interleaved, so data is grouped
        # per URL and only written out once the reply has finished.
        # The chunks are collected in an explicitly created QByteArray so
        # that appending works whatever type peek() hands back.
        # NOTE(review): this assumes each chunk is seen exactly once, i.e.
        # that the page's own reader drains the reply between two readyRead
        # signals -- confirm; otherwise peeked data would be duplicated.
        chunks = self._buffer.setdefault(unicode(url), QByteArray())
        chunks.append(reply.peek(available))

    def onFinish(self):
        """Write the buffered data of a finished reply to its target file.

        The target is ``DEFAULT_DIR/<segments[-3]>/<segments[-2]>/<name>``,
        where the segments come from splitting the URL text on ``/``.
        """
        reply = self.sender()
        url = unicode(reply.url().toString())

        # Remove the entry first so that no exit path below can leak it.
        data = self._buffer.pop(url, None)
        if data is None:
            return

        # Do not save truncated files from replies that ended with an error.
        if reply.error() != QNetworkReply.NoError:
            print('下载失败:%s 错误:%s' % (url, unicode(reply.errorString())))
            return

        segments = url.split(u'/')
        file_name = segments[-1]
        file_dir = os.path.join(DEFAULT_DIR, segments[-3], segments[-2])
        if not os.path.isdir(file_dir):
            os.makedirs(file_dir)

        file_fullname = os.path.join(file_dir, file_name)
        f = QFile(QString(file_fullname))
        if not f.open(QFile.WriteOnly):
            print('打开文件失败:%s' % (file_fullname))
            return

        written = f.write(data)
        f.close()
        if written != data.size():
            print('写入文件失败:%s' % (file_fullname))
            return

        print('成功下载文件到:%s 来自:%s' % (file_fullname, url))

# Custom QWebPage: after the current page has finished loading, continue
# with the next page.
class WebPage(QWebPage):
    """Web page that follows the gallery's "next page" link after each load.

    After every successful load the page looks inside the ``#pages`` element
    for an ``a.a1`` link whose text is u'下一页' and loads its target.  A
    failed load is retried at most ``MAX_RELOAD_ATTEMPTS`` times in a row.
    """

    # Upper bound on consecutive reloads of a page that fails to load.
    MAX_RELOAD_ATTEMPTS = 3

    def __init__(self, parent = None):
        super(WebPage, self).__init__(parent)
        # Consecutive failed loads of the current page.
        self._reload_attempts = 0
        # A "next page" href is only followed when it contains this pattern.
        self._next_href_rx = QRegExp(u'/g/\\d+/\\d+')
        self.loadFinished.connect(self.onLoadFinish)

    def onLoadFinish(self, ok):
        """Handle the end of a page load.

        ok -- the flag delivered by the ``loadFinished`` signal; a false
              value triggers a bounded retry of the same URL.
        """
        frame = self.mainFrame()
        last_url = frame.url()
        # Plain URL text for the log messages.  The byte-string messages are
        # formatted with unicode arguments, which relies on the UTF-8 default
        # encoding that the script's main block installs.
        shown_url = unicode(last_url.toString())

        if not ok:
            if self._reload_attempts >= self.MAX_RELOAD_ATTEMPTS:
                # Give up instead of reloading a dead page forever.
                print('页面多次加载失败,放弃:%s' % (shown_url,))
                return
            self._reload_attempts += 1
            print('当前页面加载失败,尝试重新加载:%s' % (shown_url,))
            frame.load(last_url)
            return
        self._reload_attempts = 0

        pager = frame.findFirstElement('#pages')
        if pager.isNull():
            print('当前页面不存在下一页 %s' % (shown_url,))
            return

        links = pager.findAll('a.a1').toList()
        if not links:
            print('找不到上-下页标签%s' % (shown_url,))
            return

        next_link = None
        for link in links:
            if link.toPlainText() == u'下一页':
                next_link = link
                break

        if next_link is None:
            print('找不到下一页:%s' % (shown_url,))
            return

        href = next_link.attribute(u'href')
        print(unicode(href))
        if self._next_href_rx.indexIn(href) == -1:
            print('已经加载到最后')
            return

        # Resolve the href against the current URL so that relative paths,
        # absolute URLs and hrefs carrying a query are all handled.
        next_url = last_url.resolved(QUrl(href))
        frame.load(next_url)
        print('加载下一页地址:%s' % (unicode(next_url.toString()),))

#class NetworkThread(threading.Thread):

if __name__ == '__main__':
    # Python 2 only: make UTF-8 the default encoding so that the byte-string
    # log messages can be combined with unicode values when printing.
    reload(sys)
    sys.setdefaultencoding('utf-8')
    app = QApplication(sys.argv)

    # Gallery ids to crawl; each one gets its own page and its own window.
    URL_TEMPLATE = u'http://www.zngirls.com/g/%s'
    PAGES = ['18214', '16751', '13207', '13206', '13205', '13148','11363']

    # Keep the created objects referenced for the lifetime of the event loop.
    pages = []
    wvs = []
    for p in PAGES:
        page = WebPage()
        # Parent the manager to its page so its lifetime is tied to the page
        # rather than to a temporary Python reference.
        page.setNetworkAccessManager(NetworkManager(page))
        page.mainFrame().load(QUrl(URL_TEMPLATE % (p,)))
        pages.append(page)

        wv = QWebView()
        wv.setWindowTitle(p)
        wv.setPage(page)
        wv.show()
        wvs.append(wv)

    # Run the event loop and hand its result to the shell as the exit code.
    sys.exit(app.exec_())

你可能感兴趣的:(Qt,qtwebkit,python)