#coding=utf-8
__author__ = 'ds'
from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtNetwork import *
from PyQt4.QtWebKit import *
import thread
import threading
import os,sys
# Root directory under which downloaded images are saved; the per-image
# sub-directories are created below it.
DEFAULT_DIR = ur'F:\图片\美女'
# Custom network manager: picks the wanted data out of the network traffic.
class NetworkManager(QNetworkAccessManager):
    """Network access manager that saves matching JPEG responses to disk.

    Every ``http`` reply created through this manager is observed. When the
    reply URL matches the image pattern, the bytes are collected per URL as
    they arrive and written below ``DEFAULT_DIR`` once the reply finishes.
    """

    # One or more digits, a slash, two or more digits, then ``.jpg``.
    # Built once instead of on every readyRead signal.
    _IMAGE_URL_RX = QRegExp(u'\\d+/\\d+\\d+\\.jpg')

    def __init__(self, parent=None):
        """Create the manager with an empty per-URL download buffer."""
        super(NetworkManager, self).__init__(parent)
        # Maps reply URL -> bytes received so far for that URL.
        self._buffer = {}

    def createRequest(self, QNetworkAccessManager_Operation, QNetworkRequest, QIODevice_device=None):
        """Create the reply via the base class and hook into ``http`` replies.

        The parameters are passed through unchanged to
        ``QNetworkAccessManager.createRequest``; the reply it produces is
        returned.
        """
        reply = super(NetworkManager, self).createRequest(
            QNetworkAccessManager_Operation, QNetworkRequest, QIODevice_device)
        if reply.url().scheme() == 'http':
            reply.readyRead.connect(self.onReadReady)
            reply.finished.connect(self.onFinish)
        return reply

    def onReadReady(self):
        """Collect the bytes currently available on an image reply."""
        reply = self.sender()
        url = reply.url().toString()
        if url.indexOf(self._IMAGE_URL_RX) == -1:
            return
        # Replies load asynchronously and each readyRead may belong to a
        # different file, so the data is cached per URL and only used once
        # the reply has finished.
        # peek() is used instead of read(); presumably so the page itself can
        # still consume the same data -- NOTE(review): confirm.
        chunk = reply.peek(reply.size())
        if url in self._buffer:
            self._buffer[url].append(chunk)
        else:
            self._buffer[url] = chunk

    def onFinish(self):
        """Write the data buffered for a finished reply to its target file.

        The target is ``DEFAULT_DIR/<segment -3>/<segment -2>/<segment -1>``
        where the segments come from splitting the URL on ``/``. Nothing is
        written when the reply ended with an error.
        """
        reply = self.sender()
        url = reply.url().toString()
        if url not in self._buffer:
            return
        # Take the entry out immediately so that no exit path below can leave
        # the downloaded bytes behind in the buffer.
        data = self._buffer.pop(url)
        if reply.error() != QNetworkReply.NoError:
            # A failed or aborted transfer only holds part of the image.
            print('下载出错,已丢弃不完整的数据:%s' % (url,))
            return

        segments = url.split('/')
        file_name = unicode(segments[-1])
        file_dir = os.path.join(
            DEFAULT_DIR, unicode(segments[-3]), unicode(segments[-2]))
        if not os.path.isdir(file_dir):
            os.makedirs(file_dir)
        file_fullname = os.path.join(file_dir, file_name)

        f = QFile(QString(file_fullname))
        if not f.open(QFile.WriteOnly):
            print('打开文件失败:%s' % (file_fullname,))
            return
        written = f.write(data)
        f.close()
        if written != len(data):
            print('写入文件失败:%s' % (file_fullname,))
            return
        print('成功下载文件到:%s 来自:%s' % (file_fullname, url))
# Custom QWebPage: once the current page has finished loading, move on to the next page.
class WebPage(QWebPage):
    """Web page that loads the "next page" link after each finished load.

    After every successful load the main frame is searched for the pager
    element (``#pages``) and, inside it, for an ``a.a1`` anchor whose text is
    "下一页". If its ``href`` looks like ``/g/<digits>/<digits>`` that path is
    loaded next.
    """

    def __init__(self, parent=None):
        """Create the page and subscribe to its own ``loadFinished``."""
        super(WebPage, self).__init__(parent)
        self.loadFinished.connect(self.onLoadFinish)

    def onLoadFinish(self, ok):
        """Handle a finished load: retry on failure, else go to the next page.

        ``ok`` is the success flag delivered by the ``loadFinished`` signal.
        """
        frame = self.mainFrame()
        current_url = frame.url()
        if not ok:
            # NOTE(review): the retry is unbounded; a permanently failing URL
            # is reloaded forever. Consider capping the number of attempts.
            print('当前页面加载失败,尝试重新加载:%s' % (current_url,))
            frame.load(current_url)
            return

        pager = frame.findFirstElement('#pages')
        # A missing element is reported as a null QWebElement, which has to
        # be tested with isNull() rather than with Python truthiness.
        if pager.isNull():
            print('当前页面不存在下一页 %s' % (current_url,))
            return

        links = pager.findAll('a.a1')
        if links.count() == 0:
            print('找不到上-下页标签%s' % (current_url,))
            return

        next_link = None
        for link in links:
            if link.toPlainText() == u'下一页':
                next_link = link
                break
        if next_link is None:
            print('找不到下一页:%s' % (current_url,))
            return

        href = next_link.attribute(u'href')
        print(str(href))
        # Only links of the form /g/<digits>/<digits> are followed; anything
        # else is treated as the end of the sequence.
        if href.indexOf(QRegExp(u'/g/\\d+/\\d+')) == -1:
            print('已经加载到最后')
            return

        # Reuse scheme and host of the current URL and swap in the new path.
        current_url.setPath(href)
        frame.load(current_url)
        print('加载下一页地址:%s' % (current_url.toString(),))
#class NetworkThread(threading.Thread):
if __name__ == '__main__':
    # Python 2 only: make UTF-8 the implicit str<->unicode codec.
    # Presumably needed because the UTF-8 byte-string messages in this file
    # are formatted with unicode values -- NOTE(review): confirm.
    reload(sys)
    sys.setdefaultencoding('utf-8')

    app = QApplication(sys.argv)

    # The lists only exist to keep the Qt objects referenced while the event
    # loop runs. The network managers are retained the same way as the pages
    # and views, so their lifetime does not depend on the binding holding a
    # reference on our behalf.
    managers = []
    pages = []
    wvs = []
    PAGES = ['18214', '16751', '13207', '13206', '13205', '13148', '11363']
    for p in PAGES:
        url = QUrl(u'http://www.zngirls.com/g/%s' % (p,))

        manager = NetworkManager()
        managers.append(manager)

        page = WebPage()
        page.setNetworkAccessManager(manager)
        page.mainFrame().load(url)
        pages.append(page)

        # One window per start page, titled with the page id.
        wv = QWebView()
        wv.setWindowTitle(p)
        wv.setPage(page)
        wv.show()
        wvs.append(wv)

    app.exec_()