python爬取风之动漫---海贼王案例

  • 使用的库
from bs4 import BeautifulSoup
import requests
import os
import re
from pyquery import PyQuery as pq
from urllib3.exceptions import InsecureRequestWarning
import urllib3
urllib3.disable_warnings(InsecureRequestWarning)
  • 步骤思想

1.先是获取漫画主页的集数

2.根据拿取的集数访问固定页面(这个页面有规律,这个很容易看出来)

3.然后就是爬取图片保存到本地

主要拿取的数据为js数据,这里需要单独拿取每一章节的图片地址

直接上代码:

user-agent  如果拿取不到数据,这里需要修改其值

class downIamge(object):
    """Scraper for One Piece chapters on manhua.fzdm.com.

    Workflow: getNumber() lists the most recent chapter paths,
    parseMultiplePages() extracts one page's image URL from the inline
    JavaScript of that page (stored in self.new_mhurl), and downImage()
    saves the image into a local folder named after the chapter.
    """

    # ================================== 抓取多页数据 ==================================
    def parseMultiplePages(self, url, page, page_num):
        """Fetch one comic page and extract the quoted ``.jpg`` URL embedded
        in its second inline <script> element; stores it in ``self.new_mhurl``.

        url      -- full page URL to fetch
        page     -- chapter identifier (used in the Referer header and later
                    as the download folder name)
        page_num -- page index within the chapter (used as the file name)
        """
        self.page = page
        self.page_num = page_num
        self.hread = {
            'Upgrade-Insecure-Requests': '1',
            'Referer': 'https://manhua.fzdm.com/2/' + str(self.page) + '//index_' + str(self.page_num) + '.html',
            # If the site stops returning data, this user-agent value may need updating.
            'user-agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'
        }
        try:
            wb_data = requests.session().get(url, headers=self.hread, verify=False)
        except requests.RequestException as exc:
            # Bug fix: the original bare ``except:`` fell back to a hard-coded
            # recursive retry via an unqualified ``parseMultiplePages(...)``
            # call, which raised NameError. Report the failure instead.
            print('request failed:', exc)
            return
        if wb_data.status_code == 200:
            soup = BeautifulSoup(wb_data.text, 'lxml')
            scripts = soup.find_all('script', type='text/javascript')
            for fragment in scripts[1]:
                # The image address lives in the page's inline JS as a
                # quoted "...jpg" token; pull it out with a regex.
                match = re.search(r'"\S*?\.jpg"', str(fragment))
                if match is not None:  # guard: original crashed when no match
                    self.new_mhurl = match.group()
        else:
            print("超过访问限制")

    def downImage(self):
        """Download the image found by parseMultiplePages() into
        ``海贼王/<page>/<page_num>.jpg``.
        """
        path = "海贼王/" + str(self.page)
        # Strip the surrounding double quotes captured by the regex.
        url = re.sub('"', '', str(self.new_mhurl))
        imageUrl = "http://p17.xiaoshidi.net/" + url
        print(imageUrl)
        wb_data = requests.session().get(imageUrl, headers=self.hread, verify=False)
        # Bug fix: the original only wrote the file in the ``else`` branch
        # (directory already present), so the first image of every chapter
        # was silently dropped. makedirs also creates the 海贼王 parent.
        os.makedirs(path, exist_ok=True)
        print(self.page_num)
        with open(path + '/{}.jpg'.format(self.page_num), 'wb') as f:
            f.write(wb_data.content)

    def getNumber(self):
        """Scrape the chapter index page and return a list of chapter hrefs.

        Returns entries [3:5] of the chapter link list, i.e. the two most
        recent chapters (per the site's current layout — verify if the page
        structure changes).
        """
        self.hread = {
            'Upgrade-Insecure-Requests': '1',
            'Referer': 'https://manhua.fzdm.com/2/',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134'
        }
        url = 'https://manhua.fzdm.com/2/'
        rep_data = requests.get(url, headers=self.hread, verify=False).text
        doc = pq(rep_data)
        data = doc('.pure-g  li a').items()
        chapters = []
        for title in data:
            chapters.append(title.attr('href'))
        return chapters[3:5]  # the two most recent chapters


if __name__ == '__main__':
    # Fetch the latest chapters, then walk every page of each chapter:
    # resolve the page's image URL and save it to disk.
    downloader = downIamge()
    chapters = downloader.getNumber()
    for page in chapters:
        for page_num in range(0, 20):
            url = 'http://manhua.fzdm.com/2/{}index_{}.html'.format(page, page_num)
            downloader.parseMultiplePages(url, page, page_num)
            downloader.downImage()

 

你可能感兴趣的:(python)