前言
- 其实最早知道 Python 爬虫,就是在知乎上接触到的妹子图爬虫,这次算是自己动手写了一个;类库方面基本是自己喜欢哪个就用哪个。
思路
- 首先我们需要一个导航页,在其中找到我们需要的列表之类的元素,从而得到链接集合。
def getPicture():
    """Download every gallery linked from the tag index page.

    Fetches the hard-coded tag page, collects the ``href`` of each anchor
    matched by the gallery-list selector, then passes each link to
    ``downLoad``.
    """
    url = "http://www.mmjpg.com/tag/meixiong"
    r = requests.get(url, headers=getHeaders())
    r.encoding = 'utf8'
    q = PyQuery(r.text)
    # Do not call this variable ``list``: assigning to ``list`` makes it a
    # local name for the whole function, so ``list = list()`` raises
    # UnboundLocalError before anything is downloaded.
    hrefs = [
        each.attr.href
        for each in q('body > div.main.topno > div.pic > ul > li>a').items()
    ]
    for href in hrefs:
        downLoad(href)
- 然后进入每个链接后,我们发现页面是用 JS 加载全部图片的。这里我的处理方式是直接拼出图片地址,因为从第 1 张开始,后面的编号只是依次递增为 2、3、4……而已。
def downLoad(href):
    """Download all images of one gallery into its own directory.

    Args:
        href: URL of the gallery page.

    The image count is read from the pager, the title from the article
    heading, and the image URL prefix from the ``src`` of the image shown on
    the page. Images are saved as ``<n>.jpg`` inside a per-gallery directory
    under the hard-coded base directory.
    """
    # ``headers`` must be passed by keyword: the second positional parameter
    # of requests.get is ``params``, which would put the header dict into
    # the query string instead of sending it as HTTP headers.
    r = requests.get(href, headers=getHeaders())
    r.encoding = 'utf8'
    q = PyQuery(r.text)
    total = int(q('#page > a:nth-child(9)').text())
    title = q('body > div.main > div.article > h2').text()
    # Drop the last five characters of the image URL (presumably "1.jpg")
    # so that "<n>.jpg" can be appended for every image number.
    prefix = q('div.content > a> img').attr.src[0:-5]
    dirName = u"【{}P】{}".format(total, title)
    mkpath = "c:\\mymeizi\\" + dirName + "\\"
    if makedir(mkpath):
        print(mkpath + "目录已创建")
    for x in range(1, total + 1):
        # mkpath already ends with a separator, so no extra "/" is added.
        urllib.urlretrieve(prefix + str(x) + ".jpg", mkpath + "%s.jpg" % x)
完整代码
import requests
from pyquery import PyQuery
import re
import urllib
import os
import sys
# Python 2 only: sys.setdefaultencoding is removed from the sys module at
# interpreter startup, so sys is reloaded to get it back, and the implicit
# str/unicode conversion codec is then switched to UTF-8.
reload(sys)
sys.setdefaultencoding('utf8')
def getPicture():
    """Download every gallery linked from the tag index page.

    Fetches the hard-coded tag page, collects the ``href`` of each anchor
    matched by the gallery-list selector, then passes each link to
    ``downLoad``.
    """
    url = "http://www.mmjpg.com/tag/meixiong"
    r = requests.get(url, headers=getHeaders())
    r.encoding = 'utf8'
    q = PyQuery(r.text)
    # Do not call this variable ``list``: assigning to ``list`` makes it a
    # local name for the whole function, so ``list = list()`` raises
    # UnboundLocalError before anything is downloaded.
    hrefs = [
        each.attr.href
        for each in q('body > div.main.topno > div.pic > ul > li>a').items()
    ]
    for href in hrefs:
        downLoad(href)
def downLoad(href):
    """Download all images of one gallery into its own directory.

    Args:
        href: URL of the gallery page.

    The image count is read from the pager, the title from the article
    heading, and the image URL prefix from the ``src`` of the image shown on
    the page. Images are saved as ``<n>.jpg`` inside a per-gallery directory
    under the hard-coded base directory.
    """
    # ``headers`` must be passed by keyword: the second positional parameter
    # of requests.get is ``params``, which would put the header dict into
    # the query string instead of sending it as HTTP headers.
    r = requests.get(href, headers=getHeaders())
    r.encoding = 'utf8'
    q = PyQuery(r.text)
    total = int(q('#page > a:nth-child(9)').text())
    title = q('body > div.main > div.article > h2').text()
    # Drop the last five characters of the image URL (presumably "1.jpg")
    # so that "<n>.jpg" can be appended for every image number.
    prefix = q('div.content > a> img').attr.src[0:-5]
    dirName = u"【{}P】{}".format(total, title)
    mkpath = "c:\\mymeizi\\" + dirName + "\\"
    if makedir(mkpath):
        print(mkpath + "目录已创建")
    for x in range(1, total + 1):
        # mkpath already ends with a separator, so no extra "/" is added.
        urllib.urlretrieve(prefix + str(x) + ".jpg", mkpath + "%s.jpg" % x)
def makedir(path):
    """Create directory ``path`` (including parents) if it does not exist.

    Surrounding whitespace and trailing backslashes are stripped from
    ``path`` before it is checked.

    Args:
        path: Directory path to create.

    Returns:
        True if the directory was created by this call, False if the path
        already existed.

    Raises:
        OSError: If ``os.makedirs`` fails.
    """
    path = path.strip().rstrip("\\")
    if os.path.exists(path):
        print(path + ' 目录已存在')
        return False
    os.makedirs(path)
    # Report success only after the directory has actually been created;
    # printing first would claim success even when makedirs raises.
    print(path + ' 创建成功')
    return True
def getHeaders():
    """Return the HTTP request headers used for requests to www.mmjpg.com.

    A new dict is built on every call.
    """
    user_agent = (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/59.0.3071.115 Safari/537.36'
    )
    return {
        'Host': 'www.mmjpg.com',
        'User-Agent': user_agent,
        'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
    }
if __name__ == "__main__":
    # Script entry point: start the crawl when the file is run directly.
    getPicture()
总结
- 爬这种图站真的是激动
- 其实只写了 2 个小时,而且很大程度上是边查边写的,各种编码问题真的是要不得啊。