I've been learning Python recently and wanted to write some practice projects. Everyone else seems to scrape wallpapers and the like, which felt a bit dull, so I went and scraped AV codes (番号) instead.
If you'd rather not copy the code yourself, you can download it directly:
Click here to download, extraction code: 9nas
The main benefit is that the next time Start.py runs, if the interpreter sees that the Html.py script has not been modified, it skips the compilation step and directly runs the previously generated *.pyc file saved in the __pycache__ folder, which greatly shortens the preparation time before the project runs.
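As a quick illustration (this snippet is not part of the crawler; the file name Html.py simply refers to the module discussed below), you can ask the interpreter where that cached bytecode would live:

import importlib.util
#Print the cached bytecode path for Html.py, e.g. __pycache__\Html.cpython-37.pyc
#(the exact version tag depends on the Python interpreter you run)
print(importlib.util.cache_from_source("Html.py"))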
#1. Web page helper library
import urllib.request,urllib.parse,bs4
class Html(object):
    '''[Web page class]
    [A wrapper around fetching pages via "GET" or "POST", based on the urllib.request library]
    '''
def __init__(self,url,headers,requestMode = "GET",data = None):
        '''[Initialization]
        [Initialize the instance data]
        Arguments:
            url {[str]} -- [URL of the page]
            headers {[dict]} -- [request headers for the site]
        Keyword Arguments:
            requestMode {str} -- [request mode, "GET" or "POST"] (default: {"GET"})
            data {[dict]} -- [form data for POST requests] (default: {None})
        '''
self.url = url
self.headers = headers
self.requestMode = requestMode
self.data = data
        #Fetch the response and turn it into a soup object
self.responseSoup = bs4.BeautifulSoup(self.getResponse(),"html.parser")
def __str__(self):
        '''[Class info]
        [Used when printing the instance]
        '''
        return self.url + "\n" + \
               str(self.headers) + "\n" + \
               str(self.requestMode) + "\n" + \
               str(self.data)
def getResponse(self):
        '''[Get response]
        [Send the request and return the response object]
        Returns:
            [HTTPResponse] -- [response object for the page]
        '''
        #Decide which request mode to use
        if self.requestMode == "GET":
            #Build the request object
            request = urllib.request.Request(self.url,headers = self.headers)
        else:
            #Check the form data required for a "POST" request
            if not self.data:
                #Print the error message
                print("The form data required for a POST request is empty, leaving the function.")
                return
            else:
                #Encode the data form
                data = urllib.parse.urlencode(self.data).encode("utf-8")
                #Build the request carrying the data form
                request = urllib.request.Request(url = self.url,headers = self.headers,data = data)
        #Keep requesting in a loop so that a single failure does not end the function
        while True:
            try:
                #Send the request and get the response
                response = urllib.request.urlopen(request,timeout = 30)
            except Exception as e:
                #Print the error and retry
                print("Failed to fetch the page, retrying. Error: %s"%e)
                continue
            #Check whether the fetch succeeded
            if response.getcode() == 200:
                break
            else:
                #Print the error message and keep looping
                print("Failed to fetch the page, retrying. Status code: %d"%response.getcode())
        #Return the response
        return response
def labelSelect(self,label):
        '''[Select tags]
        [Select tags from the soup object]
        Arguments:
            label {[str]} -- [CSS selector]
        '''
        if self.responseSoup is None:
            #Print the error message
            print("The soup object is empty.")
        else:
            #Return the matching tags
            return self.responseSoup.select(label)
def printResponse(self):
        '''[Print response]
        [Print the response text to the console]
        '''
print(self.responseSoup.prettify())
def downloadImage(imageHtml,filename):
    '''[Download image]
    [Download an image to the given path]
    Arguments:
        imageHtml {[str]} -- [image URL]
        filename {[str]} -- [destination path for the image]
    '''
    try:
        #Download the image
        urllib.request.urlretrieve(imageHtml,filename = filename)
    except Exception as e:
        print("%s--download failed"%imageHtml)
def main():
    '''[Main function]
    [Used for testing the class; not executed when the module is imported]
    '''
if __name__ == '__main__':
main()
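Before moving on to Start.py, here is a minimal usage sketch of the Html class above so its interface is clear; the URL, headers and selector below are placeholders for illustration, not taken from the actual crawler.

import Html
#Placeholder headers and URL purely for demonstration
headers = {"User-Agent": "Mozilla/5.0"}
page = Html.Html("http://example.com",headers)
#Print the prettified HTML of the page
page.printResponse()
#Select all a tags with a CSS selector
links = page.labelSelect("a")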
#1. Custom class module
import Html
#2. Standard library modules
import time,threading,queue,os
#Base URL
baseUrl = "http://nanrenvip.cc"
#Actress works, a queue of dicts
AV = queue.Queue()
#Headers for the first-level listing page
firstHeaders = {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
,'Accept-Language':'zh-CN,zh;q=0.9'
,'Cache-Control':'no-cache'
,'Connection':'keep-alive'
,'Cookie':'fikker-QnSp-g2oO=iKQJO0GhsfKh7h3VS0kRIhGLYXCZJH9F; uv_cookie_126326=1; UBGLAI63GV=ydzyn.1547129688; fikker-vSjQ-xL6G=03JePCRqBCD5DLHAISRLxwNwncVYbCOU; __jclm_cpv_r_61506_cpv_plan_ids=%7C1365%7C%7C2093%7C%7C2013%7C%7C2092%7C%7C1776%7C%7C1780%7C%7C1364%7C; fikker-Ajmv-gZS6=iKSTQXOxMHSpeHwxUMSvCbWX5OSp9hoF; Hm_lvt_474976084829d4090d0d97d377ac5b38=1547017567,1547129535,1547131172,1547133591; Hm_lvt_60852cb607c7b21f13202e5e672131ce=1547017605,1547129688,1547133594; Hm_lpvt_60852cb607c7b21f13202e5e672131ce=1547133594; Hm_lpvt_474976084829d4090d0d97d377ac5b38=1547133594; uqcpvcouplet_fidx=6'
,'Host':'nanrenvip.cc'
,'Pragma':'no-cache'
,'Referer':'http://nanrenvip.cc/olds.html'
,'Upgrade-Insecure-Requests':'1'
,'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6814.400 QQBrowser/10.3.3005.400'
}
#Headers for the works collection page
gatherHeaders = {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
,'Accept-Language':'zh-CN,zh;q=0.9'
,'Cache-Control':'no-cache'
,'Connection':'keep-alive'
,'Cookie':'fikker-QnSp-g2oO=iKQJO0GhsfKh7h3VS0kRIhGLYXCZJH9F; uv_cookie_126326=1; UBGLAI63GV=ydzyn.1547129688; fikker-vSjQ-xL6G=03JePCRqBCD5DLHAISRLxwNwncVYbCOU; __jclm_cpv_r_61506_cpv_plan_ids=%7C1365%7C%7C2093%7C%7C2013%7C%7C2092%7C%7C1776%7C%7C1780%7C%7C1364%7C; fikker-Ajmv-gZS6=iKSTQXOxMHSpeHwxUMSvCbWX5OSp9hoF; Hm_lvt_474976084829d4090d0d97d377ac5b38=1547017567,1547129535,1547131172,1547133591; Hm_lvt_60852cb607c7b21f13202e5e672131ce=1547017605,1547129688,1547133594; Hm_lpvt_60852cb607c7b21f13202e5e672131ce=1547133594; Hm_lpvt_474976084829d4090d0d97d377ac5b38=1547133594; uqcpvcouplet_fidx=6'
,'Host':'nanrenvip.cc'
,'Pragma':'no-cache'
,'Referer':'http://nanrenvip.cc/olds.html'
,'Upgrade-Insecure-Requests':'1'
,'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6814.400 QQBrowser/10.3.3005.400'
}
#Headers for the cover page
coverHeaders = {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
,'Accept-Language':'zh-CN,zh;q=0.9'
,'Cache-Control':'no-cache'
,'Connection':'keep-alive'
,'Cookie':'fikker-QnSp-g2oO=iKQJO0GhsfKh7h3VS0kRIhGLYXCZJH9F; uv_cookie_126326=1; UBGLAI63GV=ydzyn.1547129688; fikker-vSjQ-xL6G=03JePCRqBCD5DLHAISRLxwNwncVYbCOU; __jclm_cpv_r_61506_cpv_plan_ids=%7C1365%7C%7C2093%7C%7C2013%7C%7C2092%7C%7C1776%7C%7C1780%7C%7C1364%7C; fikker-Ajmv-gZS6=iKSTQXOxMHSpeHwxUMSvCbWX5OSp9hoF; Hm_lvt_474976084829d4090d0d97d377ac5b38=1547017567,1547129535,1547131172,1547133591; Hm_lvt_60852cb607c7b21f13202e5e672131ce=1547017605,1547129688,1547133594; Hm_lpvt_60852cb607c7b21f13202e5e672131ce=1547133594; Hm_lpvt_474976084829d4090d0d97d377ac5b38=1547133594; uqcpvcouplet_fidx=6'
,'Host':'nanrenvip.cc'
,'Pragma':'no-cache'
,'Referer':'http://nanrenvip.cc/olds.html'
,'Upgrade-Insecure-Requests':'1'
,'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6814.400 QQBrowser/10.3.3005.400'
}
def getAV():
    '''[Fetch AV data]
    [Put the scraped actress names, AV codes and cover URLs into the queue]
    '''
global AV
    #Read the page number reached on the previous run
    htmlPage = pageRecord("r")
    #Counter for actresses
    AVnumber = 0
    #Temporary dict of works
    tempGatherDict = {}
    #Html object for the first-level listing page
    firstHtml = Html.Html(baseUrl + "/nvyouku/1-0-0-0-0-0-" + str(htmlPage) + ".html",firstHeaders,"GET")
    #Use the presence of a tags to detect the last page
    while len(firstHtml.labelSelect(".avps_ny a")) != 0:
        #Iterate over the actresses
        for a1 in firstHtml.labelSelect(".avps_ny a"):
            #The a tag here is already a bs4 object
            AVname = a1.select("span.fh_bt")[0].get_text()
            AVnumber += 1
            print("-------------------------No.%d: %s-------------------------"%(AVnumber,AVname))
            try:
                #Html object for the works collection
                gatherHtml = Html.Html(baseUrl + a1["href"],gatherHeaders,"GET")
            except Exception as e:
                print("Works collection object -- creation failed")
                continue
            #Iterate over the works collection
            for index,a2 in enumerate(gatherHtml.labelSelect(".avps a")):
                try:
                    #Html object for the cover page
                    coverHtml = Html.Html(baseUrl + a2["href"],coverHeaders,"GET")
                    AVid = coverHtml.labelSelect("h1.heading")[0].get_text()
                except Exception as e:
                    print("Cover object -- creation failed")
                    continue
                #Assemble the temporary work entry {AV code: url}
                tempGatherDict[AVid] = baseUrl + coverHtml.labelSelect("img.lazyload")[0]["data-original"]
                print("Scraped work: %s\n%d covers left"%(AVid,(len(gatherHtml.labelSelect(".avps a")) - index - 1)))
            #Add the actress name and her works
            AV.put({AVname:tempGatherDict})
            #Reset the temporary dict
            tempGatherDict = {}
        #Scrape the actresses on the next page
        htmlPage += 1
        firstHtml = Html.Html(baseUrl + "/nvyouku/1-0-0-0-0-0-" + str(htmlPage) + ".html",firstHeaders,"GET")
    #Scraping finished
    print("Scraping finished")
    AV.put("end")
def pageRecord(mode,page = None):
    '''[Page record]
    [Write to and read from a TXT file to persist the page number]
    Arguments:
        mode {[str]} -- [file access mode, "w" or "r"]
    Keyword Arguments:
        page {[int]} -- [page number to write] (default: {None})
    Returns:
        [int] -- [the page number that was read]
    '''
if mode == "w":
#写入页数
f = open(".\\Page.txt","w")
f.write(str(page))
f.close()
if mode == "r":
#读取页数
f = open(".\\Page.txt","r")
page = int(f.read())
f.close()
return page
def downloadImage(queue):
    '''[Download images]
    [Download the images in the queue to the target path]
    Arguments:
        queue {[Queue]} -- [the download queue]
    '''
    while True:
        print("Waiting for downloads")
        #Get the next work collection to download
        gather = queue.get()
        #Check whether there are any works left
        if gather == "end":
            #End the thread
            print("***All downloads finished***")
            return
        #Iterate over actresses
        for AVname in gather:
            #Create the actress folder if it does not exist yet
            if not os.path.exists(".\\Performer\\" + AVname):
                os.makedirs(".\\Performer\\" + AVname)
            #Iterate over AV codes
            for AVid in gather[AVname]:
                #Skip images that have already been downloaded
                if not os.path.exists(".\\Performer\\" + AVname + "\\" + AVid + gather[AVname][AVid][-4:]):
                    print("Downloading: %s"%gather[AVname][AVid])
                    Html.downloadImage(gather[AVname][AVid],".\\Performer\\" + AVname + "\\" + AVid + gather[AVname][AVid][-4:])
            print("Download finished!: %s"%AVname)
def main():
    '''[Main function]
    [Program entry point]
    '''
print("程序已启动(*^▽^*)")
#启动下载线程
download = threading.Thread(target = downloadImage,args = ([AV]))
download.start()
#将AV添加到下载队列
getAV()
if __name__ == '__main__':
    #Program entry point
main()