How to Scrape All the Meme Images from a Meme Site with Python

It's me again~~~
Lately I keep seeing people scraping meme images with Python, and it made my fingers itch, so I put a scraper together myself.

Third-party libraries used: BeautifulSoup, requests. (Note that the code in this post targets Python 2: it uses print statements, raw_input, unicode, and xrange.)

pip install beautifulsoup4 requests

Analysis:

First, open the site: https://www.doutula.com/photo/list/
and take a look at the total page count.
Whoa, 3,000-plus pages. No panic; click through to the next page and look at how the URL is formed.
There's a clear pattern: each page's URL is simply https://www.doutula.com/photo/list/?page= followed by the page number. That gives us the following starting code:

# coding: utf-8

from bs4 import BeautifulSoup
from requests import get
from os import mkdir, chdir
from sys import exit
from time import sleep
from codecs import open as open_

class BiaoQingBao(object):

    def __init__(self):
        # Base URL of the list pages; the page number gets appended to it.
        self.first_page_link = 'https://www.doutula.com/photo/list/?page='
        # Total page count, hardcoded for now (see the note below).
        self.limit_page = 3425
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/80.0.3987.149 Safari/537.36 '
        }
        # page_num / page_title get their real values further down, once
        # resume support is added; sensible defaults for now:
        self.page_num = 1
        self.page_title = None

(I was a bit lazy here: the total page count is hardcoded and has to be updated in the code by hand, instead of being fetched from the site up front.)
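If you do want to fetch it automatically, a rough sketch could look like the following. Note that the 'ul.pagination a' selector is my assumption about the site's pagination markup, not something verified in this post:

# Hypothetical helper: scrape the highest page number from the pagination
# bar instead of hardcoding it. Verify the selector against the real page.
from bs4 import BeautifulSoup
from requests import get

def get_total_pages(headers):
    resp = get('https://www.doutula.com/photo/list/', headers=headers)
    soup = BeautifulSoup(resp.content.decode('utf-8'), 'lxml')
    numbers = [a.get_text().strip() for a in soup.select('ul.pagination a')]
    pages = [int(n) for n in numbers if n.isdigit()]
    return max(pages) if pages else 1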

Alright. With the URL pattern in hand, the most exciting step comes next: finding the image addresses.
Right-click any image and choose Inspect:
The markup is simple. In each <img> tag, three different attributes all appear to store an image URL, so which one is the real one?

In fact, I tried all of them: data-backup is the attribute that holds the usable image URL. With the other two, the address scraped from the raw HTML ends up broken. The likely reason is lazy loading: in the static HTML those attributes point at a placeholder that the site's JavaScript only swaps for the real address once the page renders, while data-backup keeps a stable backup CDN address. (Try it yourself and see.)
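To make the extraction step concrete, here is a minimal standalone sketch; the sample HTML is a reconstruction of what the list page's <img> tags roughly look like, with made-up attribute values:

from bs4 import BeautifulSoup

# Reconstructed example of an <img> tag on the list page.
sample = '''
<img referrerpolicy="no-referrer" alt="example meme"
     src="placeholder.gif"
     data-original="https://example-cdn.com/real.jpg"
     data-backup="https://backup-cdn.example.com/real.jpg">
'''

soup = BeautifulSoup(sample, 'lxml')
tag = soup.find('img', referrerpolicy='no-referrer')
print(tag['data-backup'])  # https://backup-cdn.example.com/real.jpg
print(tag['alt'])          # example meme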

Based on this, we add the following methods to the class:

def request(self, url):
    # All requests go through here so they share the same headers.
    return get(url, headers=self.headers)

def next_page(self):
    if self.page_num < self.limit_page:
        self.page_num += 1
    else:
        print u'All done, exiting...'
        exit()

def get_picture(self, limit=3425):
    print u'Creating the download directory...'
    try:
        mkdir('d:\\biaoqingbao')
    except WindowsError:
        pass  # the directory already exists

    for _ in xrange(limit):
        print u'Creating the directory for page %d...' % self.page_num
        try:
            mkdir('d:\\biaoqingbao\\' + str(self.page_num))
        except WindowsError:
            pass
        print u'Fetching the page...'
        req = self.request(self.first_page_link + str(self.page_num))
        sleep(1)
        soup = BeautifulSoup(req.content.decode('utf-8'), 'lxml')
        picture_tags = soup.find_all('img', referrerpolicy="no-referrer")
        picture_links = [tag['data-backup'] for tag in picture_tags]
        picture_names = [tag['alt'] for tag in picture_tags]
        picture_dict = dict(zip(picture_names, picture_links))
        for j in picture_dict:
            # Resume support: skip entries until the last recorded title
            # is reached, then download normally from there on.
            if self.page_title is not None:
                if j != self.page_title:
                    continue
                self.page_title = None
            chdir(u'd:\\表情包下载器\\')
            self.write_last_download(self.page_num, j)
            chdir('d:\\biaoqingbao\\' + str(self.page_num))
            print u'Downloading: %s' % j
            # Everything is saved with a .png suffix, even though many of
            # the source files are actually jpg or gif.
            f = open(j + '.png', 'wb')
            req_ = self.request(picture_dict[j])
            sleep(0.5)
            f.write(req_.content)
            f.close()
        print u'Turning the page...'
        self.next_page()

Because we worked out how the URLs are composed, flipping to the next page is just a matter of incrementing the page number and appending it to the URL prefix.
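In other words, turning the page is pure string concatenation; here's a quick standalone check:

base = 'https://www.doutula.com/photo/list/?page='
for page in (1, 2, 3):
    print(base + str(page))
# https://www.doutula.com/photo/list/?page=1
# https://www.doutula.com/photo/list/?page=2
# https://www.doutula.com/photo/list/?page=3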

But with this many memes, should the user really have to start over from scratch after quitting?
To avoid that, we write two more functions that read and write a small config file:

@classmethod
def write_last_download(cls, download_page, download_title):
    # Record how far we got, so a later run can pick up from here.
    f = open_('settings.ini', 'w', 'utf-8')
    f.write(u'last download page: ' + unicode(download_page) + u'\n')
    f.write(u'last download title: ' + unicode(download_title) + u'\n')
    f.close()

@classmethod
def load_last_download(cls):
    # Returns (page, title) from settings.ini, or (1, None) when there
    # is no usable record yet.
    try:
        f = open_('settings.ini', 'r', 'utf-8')
        lines = f.read().splitlines()
        f.close()
        if len(lines) >= 2:
            page = int(lines[0].split(u': ', 1)[1])
            title = lines[1].split(u': ', 1)[1]
            return page, title
    except (IOError, ValueError, IndexError):
        pass
    return 1, None

(Please create the 表情包下载器 directory on the D drive yourself, or simply change the paths in the source.)
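For reference, the settings.ini that write_last_download produces looks like this (the values here are made up):

last download page: 12
last download title: some meme title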

Putting it all together, the complete code:

# coding: utf-8

from bs4 import BeautifulSoup
from requests import get
from os import mkdir, chdir
from sys import exit
from time import sleep
from codecs import open as open_


class BiaoQingBao(object):

    def __init__(self):
        # Resume from the last recorded position, if any.
        self.page_num, self.page_title = self.load_last_download()
        self.first_page_link = 'https://www.doutula.com/photo/list/?page='
        self.limit_page = 3425  # total page count, hardcoded
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/80.0.3987.149 Safari/537.36 '
        }

    @classmethod
    def write_last_download(cls, download_page, download_title):
        # Record how far we got, so a later run can pick up from here.
        f = open_('settings.ini', 'w', 'utf-8')
        f.write(u'last download page: ' + unicode(download_page) + u'\n')
        f.write(u'last download title: ' + unicode(download_title) + u'\n')
        f.close()

    @classmethod
    def load_last_download(cls):
        # Returns (page, title) from settings.ini, or (1, None) when
        # there is no usable record yet.
        try:
            f = open_('settings.ini', 'r', 'utf-8')
            lines = f.read().splitlines()
            f.close()
            if len(lines) >= 2:
                page = int(lines[0].split(u': ', 1)[1])
                title = lines[1].split(u': ', 1)[1]
                return page, title
        except (IOError, ValueError, IndexError):
            pass
        return 1, None

    def request(self, url):
        # All requests go through here so they share the same headers.
        return get(url, headers=self.headers)

    def next_page(self):
        if self.page_num < self.limit_page:
            self.page_num += 1
        else:
            print u'All done, exiting...'
            exit()

    def get_picture(self, limit=3425):
        print u'Creating the download directory...'
        try:
            mkdir('d:\\biaoqingbao')
        except WindowsError:
            pass  # the directory already exists

        for _ in xrange(limit):
            print u'Creating the directory for page %d...' % self.page_num
            try:
                mkdir('d:\\biaoqingbao\\' + str(self.page_num))
            except WindowsError:
                pass
            print u'Fetching the page...'
            req = self.request(self.first_page_link + str(self.page_num))
            sleep(1)
            soup = BeautifulSoup(req.content.decode('utf-8'), 'lxml')
            picture_tags = soup.find_all('img', referrerpolicy="no-referrer")
            picture_links = [tag['data-backup'] for tag in picture_tags]
            picture_names = [tag['alt'] for tag in picture_tags]
            picture_dict = dict(zip(picture_names, picture_links))
            for j in picture_dict:
                # Resume support: skip entries until the last recorded
                # title is reached, then download normally from there on.
                if self.page_title is not None:
                    if j != self.page_title:
                        continue
                    self.page_title = None
                chdir(u'd:\\表情包下载器\\')
                self.write_last_download(self.page_num, j)
                chdir('d:\\biaoqingbao\\' + str(self.page_num))
                print u'Downloading: %s' % j
                # Everything is saved with a .png suffix, even though many
                # of the source files are actually jpg or gif.
                f = open(j + '.png', 'wb')
                req_ = self.request(picture_dict[j])
                sleep(0.5)
                f.write(req_.content)
                f.close()
            print u'Turning the page...'
            self.next_page()


if __name__ == '__main__':
    biaoqingbao = BiaoQingBao()
    while True:
        print u'How many pages of memes do you want:'
        answer = raw_input('>>> ')
        if answer:
            try:
                if 1 <= int(answer) <= 3425:
                    break
                else:
                    print u'Invalid input! Enter an integer between 1 and 3425!'
            except ValueError:
                print u'Please enter an integer!'
        else:
            print u'Input cannot be empty!'
    biaoqingbao.get_picture(int(answer))
    print u'Thanks for using! You can find the results in d:\\biaoqingbao~~~'

The results:
[screenshots of the downloaded memes]
If you liked this, leave a like before you go~~~
