It's me again~~~
Lately I keep seeing people scraping meme stickers (表情包) with Python, so I got the itch and put one together myself.
Third-party libraries used: BeautifulSoup and requests (plus lxml, which BeautifulSoup uses as its parser below):
pip install beautifulsoup4 requests lxml
First, open the site: https://www.doutula.com/photo/list/
Check the total page count: whoa, over 3,000 pages. No panic; click through to the next page and look at how the URL is put together:
There's a clear pattern: each page's URL is just https://www.doutula.com/photo/list/?page= followed by the page number (page 2, for example, is https://www.doutula.com/photo/list/?page=2), which leads to the following code:
# coding: utf-8
from bs4 import BeautifulSoup
from requests import get
from os import mkdir, chdir
from sys import exit
from time import sleep
from codecs import open as open_


class BiaoQingBao(object):
    def __init__(self):
        # Attribute setup goes inside __init__ so that self is defined.
        self.first_page_link = 'https://www.doutula.com/photo/list/?page='
        self.limit_page = 3425
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/80.0.3987.149 Safari/537.36'
        }
(I got lazy here: the total page count is hard-coded, so it has to be updated in the source by hand instead of being fetched from the site at startup.)
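If you'd rather not hand-edit that constant, you could read the total from the pagination bar on page 1 instead. A minimal sketch of such a method for the class; the 'ul.pagination li a' selector is my assumption about the site's markup, so inspect the real page and adjust it:

def detect_limit_page(self):
    # Fetch page 1 and take the largest number among the pagination links.
    # NOTE: the CSS selector here is an unverified assumption.
    resp = get(self.first_page_link + '1', headers=self.headers)
    soup = BeautifulSoup(resp.content, 'lxml')
    numbers = [int(a.get_text()) for a in soup.select('ul.pagination li a')
               if a.get_text().strip().isdigit()]
    return max(numbers) if numbers else self.limit_page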
Now that we have the URL pattern, the most exciting step comes next: finding the addresses of the images themselves.
Right-click an image and choose Inspect:
The page structure is simple. You can see three attributes that each hold an image URL, so which one is the right one?
In fact, I tried them all, and data-backup is the attribute with the image's real URL (try it yourself; with the other two, the URL you scrape ends up changing. Can anyone more experienced explain why?).
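My guess (unverified) is that the page lazy-loads its images, so the plain src is only a placeholder, which would explain the changing URLs. If you want to be defensive about which attribute to read, a tiny helper can try data-backup first and fall back to the others. Everything here other than data-backup ('data-original', 'src') is just what I'd expect from a lazy-loading page, so adjust to the real markup:

def pick_link(tag):
    # Prefer data-backup (the one that worked for me), then fall back to
    # the presumed lazy-load attribute, then the plain src.
    for attr in ('data-backup', 'data-original', 'src'):
        if tag.get(attr):
            return tag[attr]
    return None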
Based on that finding, add the following methods:
    def request(self, url):
        return get(url, headers=self.headers)

    def next_page(self):
        if self.page_num < self.limit_page:
            self.page_num += 1
        else:
            print 'All pages downloaded, exiting...'
            exit()

    def get_picture(self, limit=3425):
        print 'Creating the download directory...'
        try:
            mkdir('d:\\biaoqingbao')
        except WindowsError:
            pass
        for i in xrange(limit):
            print 'Creating the directory for page %d...' % self.page_num
            try:
                mkdir('d:\\biaoqingbao\\' + str(self.page_num))
            except WindowsError:
                pass
            print 'Fetching the page...'
            req = self.request(self.first_page_link + str(self.page_num))
            sleep(1)
            soup = BeautifulSoup(req.content.decode('utf-8'), 'lxml')
            picture_tags = soup.find_all('img', referrerpolicy="no-referrer")
            picture_links = [tag['data-backup'] for tag in picture_tags]
            picture_names = [tag['alt'] for tag in picture_tags]
            picture_dict = dict(zip(picture_names, picture_links))
            for j in picture_dict:
                # Resume support: skip entries until we reach the last
                # recorded title (page_title and write_last_download are
                # added in the next step). Note that dict order is arbitrary
                # in Python 2, so within-page resuming is only approximate.
                if self.page_title is not None and j != self.page_title:
                    continue
                self.page_title = None  # resume point reached, stop skipping
                chdir(u'd:\\表情包下载器\\')
                self.write_last_download(self.page_num, j)
                chdir('d:\\biaoqingbao\\' + str(self.page_num))
                print u'Downloading: %s' % j
                f = open(j + '.png', 'wb')
                req_ = self.request(picture_dict[j])
                sleep(0.5)
                f.write(req_.content)
                f.close()
            print 'Turning the page...'
            self.next_page()
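One caveat about the block above: it saves everything as .png, but the site serves jpg and gif as well. The downloaded bytes keep their real format regardless of the file name; if you want the true extension, a small tweak (sketch) is to take it from the URL:

ext = picture_dict[j].rsplit('.', 1)[-1]  # e.g. 'jpg', 'png', 'gif'
f = open(j + '.' + ext, 'wb')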
Because we know how the URL is built, turning the page is just incrementing the page number by 1 and appending it to the URL prefix from before.
But with this many memes, does the user really have to start over from scratch after quitting?
No, so let's write two more functions that read and write a small settings file:
    @classmethod
    def write_last_download(cls, download_page, download_title):
        f = open_('settings.ini', 'w', 'utf-8')
        f.write(u'last download page: ' + unicode(download_page) + u'\n')
        f.write(u'last download title: ' + unicode(download_title) + u'\n')
        f.close()

    @classmethod
    def load_last_download(cls):
        try:
            f = open_('settings.ini', 'r', 'utf-8')
            lines = f.read().splitlines()
            f.close()
            # 'last download page: 12' -> 12
            # 'last download title: xxx' -> u'xxx'
            page = int(lines[0].split(u': ', 1)[1])
            title = lines[1].split(u': ', 1)[1]
            return page, title
        except (IOError, IndexError, ValueError):
            # No (or corrupt) settings.ini: start from page 1, skip nothing
            return 1, None
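After a run, settings.ini ends up looking like this (values here are just an illustration):

last download page: 12
last download title: doge

load_last_download splits each line on ': ' exactly once, so even a title that itself contains a colon still parses correctly.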
(Please create the 表情包下载器 directory on your D drive yourself, or just edit the path in the source.)
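Alternatively, if you'd rather not create it by hand, the same try/mkdir pattern used in get_picture works here too, e.g. at the top of __init__:

try:
    mkdir(u'd:\\表情包下载器')
except WindowsError:
    pass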
Putting it all together:
# coding: utf-8
from bs4 import BeautifulSoup
from requests import get
from os import mkdir, chdir
from sys import exit
from time import sleep
from codecs import open as open_


class BiaoQingBao(object):
    def __init__(self):
        # Resume from wherever the last run stopped
        self.page_num, self.page_title = self.load_last_download()
        self.first_page_link = 'https://www.doutula.com/photo/list/?page='
        self.limit_page = 3425
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/80.0.3987.149 Safari/537.36'
        }

    @classmethod
    def write_last_download(cls, download_page, download_title):
        f = open_('settings.ini', 'w', 'utf-8')
        f.write(u'last download page: ' + unicode(download_page) + u'\n')
        f.write(u'last download title: ' + unicode(download_title) + u'\n')
        f.close()

    @classmethod
    def load_last_download(cls):
        try:
            f = open_('settings.ini', 'r', 'utf-8')
            lines = f.read().splitlines()
            f.close()
            # 'last download page: 12' -> 12
            # 'last download title: xxx' -> u'xxx'
            page = int(lines[0].split(u': ', 1)[1])
            title = lines[1].split(u': ', 1)[1]
            return page, title
        except (IOError, IndexError, ValueError):
            # No (or corrupt) settings.ini: start from page 1, skip nothing
            return 1, None

    def request(self, url):
        return get(url, headers=self.headers)

    def next_page(self):
        if self.page_num < self.limit_page:
            self.page_num += 1
        else:
            print 'All pages downloaded, exiting...'
            exit()

    def get_picture(self, limit=3425):
        print 'Creating the download directory...'
        try:
            mkdir('d:\\biaoqingbao')
        except WindowsError:
            pass
        for i in xrange(limit):
            print 'Creating the directory for page %d...' % self.page_num
            try:
                mkdir('d:\\biaoqingbao\\' + str(self.page_num))
            except WindowsError:
                pass
            print 'Fetching the page...'
            req = self.request(self.first_page_link + str(self.page_num))
            sleep(1)
            soup = BeautifulSoup(req.content.decode('utf-8'), 'lxml')
            picture_tags = soup.find_all('img', referrerpolicy="no-referrer")
            picture_links = [tag['data-backup'] for tag in picture_tags]
            picture_names = [tag['alt'] for tag in picture_tags]
            picture_dict = dict(zip(picture_names, picture_links))
            for j in picture_dict:
                # Skip entries until we reach the title recorded in
                # settings.ini, then stop skipping
                if self.page_title is not None and j != self.page_title:
                    continue
                self.page_title = None
                chdir(u'd:\\表情包下载器\\')
                self.write_last_download(self.page_num, j)
                chdir('d:\\biaoqingbao\\' + str(self.page_num))
                print u'Downloading: %s' % j
                f = open(j + '.png', 'wb')
                req_ = self.request(picture_dict[j])
                sleep(0.5)
                f.write(req_.content)
                f.close()
            print 'Turning the page...'
            self.next_page()


if __name__ == '__main__':
    biaoqingbao = BiaoQingBao()
    while True:
        print 'How many pages of memes do you want to download?'
        answer = raw_input('>>> ')
        if answer:
            try:
                if 1 <= int(answer) <= 3425:
                    break
                else:
                    print 'Invalid input! Enter an integer between 1 and 3425.'
            except ValueError:
                print 'Please enter an integer!'
        else:
            print 'Input cannot be empty!'
    biaoqingbao.get_picture(int(answer))
    print 'Thanks for using! You can find the results in the biaoqingbao folder on your D drive~~~'
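One last note: all of the code above is Python 2 (print statements, xrange, raw_input, unicode). If you are on Python 3, the rough mapping is:

# Python 2                      ->  Python 3
# print u'...'                  ->  print('...')
# xrange(n)                     ->  range(n)
# raw_input('>>> ')             ->  input('>>> ')
# unicode(x)                    ->  str(x)
# codecs.open(p, 'w', 'utf-8')  ->  open(p, 'w', encoding='utf-8')
# WindowsError                  ->  OSError (or os.makedirs(path, exist_ok=True))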