#coding=utf-8
"""Gallery image scraper built on PyQt4's QtWebKit (Python 2).

Each gallery page is loaded in a QWebView.  A custom network access manager
watches the page's HTTP traffic and saves every reply whose URL looks like a
gallery image below DEFAULT_DIR, while a custom QWebPage follows the
"next page" link once the current page has finished loading.
"""
__author__ = 'ds'

from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtNetwork import *
from PyQt4.QtWebKit import *
import thread
import threading
import os
import sys

# Root directory for downloaded images.  Same value as the former
# ur'F:\图片\美女' literal, spelled with escaped backslashes.
DEFAULT_DIR = u'F:\\图片\\美女'


# Custom network manager: picks the wanted (image) data out of the traffic
# generated by the web page.
class NetworkManager(QNetworkAccessManager):
    """Network access manager that saves matching image replies to disk."""

    def __init__(self, parent = None):
        super(NetworkManager, self).__init__(parent)
        # Reply URL -> QByteArray holding the bytes collected so far.
        self._buffer = {}

    def createRequest(self, QNetworkAccessManager_Operation, QNetworkRequest, QIODevice_device=None):
        """Create the reply as usual and hook our slots onto HTTP replies.

        The parameter names are kept as they were; note that inside this
        method they shadow the Qt classes of the same name.
        """
        reply = super(NetworkManager, self).createRequest(
            QNetworkAccessManager_Operation, QNetworkRequest, QIODevice_device)
        if reply.url().scheme() == 'http':
            reply.readyRead.connect(self.onReadReady)
            reply.finished.connect(self.onFinish)
        return reply

    def onReadReady(self):
        """Collect the bytes currently available on an image reply."""
        reply = self.sender()
        url = reply.url().toString()
        rx = QRegExp(u'\\d+/\\d+\\d+\\.jpg')
        if url.indexOf(rx) == -1:
            return
        # Replies arrive interleaved (loading is asynchronous), so one shared
        # buffer cannot be used; the data is keyed by its source URL and only
        # written out in onFinish().
        # peek() is used instead of read() so the data stays in the reply.
        chunk = reply.peek(reply.size())
        collected = self._buffer.get(url)
        if collected is None:
            # Always store a QByteArray so append() exists on later chunks,
            # whether the binding hands back a str or a QByteArray.
            self._buffer[url] = QByteArray(chunk)
        else:
            collected.append(chunk)

    def onFinish(self):
        """Write the collected bytes of a finished image reply to disk.

        The target is DEFAULT_DIR/<third-last URL segment>/<second-last URL
        segment>/<last URL segment>; missing directories are created.
        """
        reply = self.sender()
        url = reply.url().toString()
        # Remove the entry up front so it is released on every exit path,
        # including a failed open().
        data = self._buffer.pop(url, None)
        if data is None:
            return
        segments = url.split('/')
        file_name = unicode(segments[-1])
        file_dirname = segments[-3] + os.path.sep + segments[-2]
        file_dir = os.path.join(DEFAULT_DIR, unicode(file_dirname))
        if not os.path.isdir(file_dir):
            os.makedirs(file_dir)
        file_fullname = os.path.join(file_dir, file_name)
        f = QFile(QString(file_fullname))
        if not f.open(QFile.WriteOnly):
            print('打开文件失败:%s' % (file_fullname,))
            return
        f.write(data)
        f.close()
        print('成功下载文件到:%s 来自:%s' % (file_fullname, url))


# Custom QWebPage: once the current page has finished loading, continue with
# the next page.
class WebPage(QWebPage):
    """Web page that follows the gallery's "next page" link automatically."""

    def __init__(self, parent = None):
        super(WebPage, self).__init__(parent)
        self.loadFinished.connect(self.onLoadFinish)

    def onLoadFinish(self, ok):
        """Reload on failure; otherwise locate and load the next page.

        ok -- the boolean delivered by the loadFinished signal.
        """
        frame = self.mainFrame()
        last_url = frame.url()
        if not ok:
            # NOTE: there is no retry limit; a permanently failing page is
            # reloaded over and over.
            print('当前页面加载失败,尝试重新加载:%s' % (last_url.toString(),))
            frame.load(last_url)
            return

        pager = frame.findFirstElement('#pages')
        # A missing element is reported as a null QWebElement, so it has to
        # be tested with isNull() rather than by truthiness.
        if pager.isNull():
            print('当前页面不存在下一页 %s' % (last_url.toString(),))
            return

        links = pager.findAll('a.a1')
        if links.count() == 0:
            print('找不到上-下页标签%s' % (last_url.toString(),))
            return

        next_page = None
        for link in links:
            if link.toPlainText() == u'下一页':
                next_page = link
                break
        if next_page is None:
            print('找不到下一页:%s' % (last_url.toString(),))
            return

        href = next_page.attribute(u'href')
        print(str(href))
        rx = QRegExp(u'/g/\\d+/\\d+')
        if href.indexOf(rx) == -1:
            # The link no longer points at a numbered gallery page.
            print('已经加载到最后')
            return
        last_url.setPath(href)
        frame.load(last_url)
        print('加载下一页地址:%s' % (last_url.toString(),))


if __name__ == '__main__':
    # Python 2 hack: make utf-8 the implicit codec so the byte-string log
    # messages can be combined with unicode values.
    reload(sys)
    sys.setdefaultencoding('utf-8')
    app = QApplication(sys.argv)
    # Python-side references that keep the pages and views alive.
    pages = []
    wvs = []
    PAGES = ['18214', '16751', '13207', '13206', '13205', '13148', '11363']
    for p in PAGES:
        url = QUrl(u'http://www.zngirls.com/g/%s' % (p,))
        page = WebPage()
        # Parent the manager to the page: the page does not take ownership
        # of it, and nothing else holds a reference to it.
        page.setNetworkAccessManager(NetworkManager(page))
        page.mainFrame().load(url)
        pages.append(page)
        wv = QWebView()
        wv.setWindowTitle(p)
        wv.setPage(page)
        wv.show()
        wvs.append(wv)
    app.exec_()