from bs4 import BeautifulSoup
import requests
import os
import re
from pyquery import PyQuery as pq
from urllib3.exceptions import InsecureRequestWarning
import urllib3

urllib3.disable_warnings(InsecureRequestWarning)
1. First, get the list of chapter numbers from the manga's main page.
2. Use each chapter number to build the URLs of the fixed-pattern chapter pages (the pattern is easy to spot; see the sketch after this list).
3. Then crawl the images on each page and save them locally.
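For reference, a minimal sketch of that URL pattern, using chapter 935 (which also shows up in the code below) and a made-up page count; only the pattern itself comes from the site:

# Each chapter page follows the same pattern: /2/<chapter>/index_<page>.html
for page_num in range(3):  # placeholder page count, for illustration only
    print('http://manhua.fzdm.com/2/935/index_{}.html'.format(page_num))
# -> .../2/935/index_0.html, .../2/935/index_1.html, .../2/935/index_2.html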
The data we need is embedded in inline JavaScript, so the image URL has to be extracted from the page's script tags separately for every chapter.
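As an illustration, a minimal sketch of that extraction; the script_text value here is a made-up stand-in for what the real pages embed, but the regex is the same one the crawler uses:

import re

# Hypothetical inline-JS snippet; the exact string is invented for illustration
script_text = 'var mhurl="2019/09/935/001.jpg";'
match = re.search(r'"\S*?\.jpg"', script_text)  # grab the first quoted .jpg path
if match:
    print(match.group().strip('"'))  # -> 2019/09/935/001.jpg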
Straight to the code:
user-agent: if you aren't getting any data back, try changing this value.
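For example, a different UA can simply be dropped into the headers dict; the string below is only an example, any current UA copied from your browser's dev tools works:

headers = {
    # replace with the UA string from your own browser's dev tools
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'
}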
class DownImage(object):  # class name fixed from the original "downIamge" typo

    # ================================== Crawl multi-page data ==================================
    def parseMultiplePages(self, url, page, page_num):
        self.page = page
        self.page_num = page_num
        self.headers = {
            'Upgrade-Insecure-Requests': '1',
            # "page" is an href like "935/" and already ends with a slash
            'Referer': 'https://manhua.fzdm.com/2/' + str(self.page) + 'index_' + str(self.page_num) + '.html',
            'user-agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'
        }
        self.new_mhurl = None
        try:
            wb_data = requests.session().get(url, headers=self.headers, verify=False)
            if wb_data.status_code == 200:
                soup = BeautifulSoup(wb_data.text, 'lxml')
                # the image URL sits inside the page's second <script> block
                scripts = soup.find_all('script', type='text/javascript')
                for name in scripts[1]:
                    spl = re.search(r'"\S*?\.jpg"', str(name))  # regex: pull out the quoted .jpg path
                    if spl:
                        self.new_mhurl = spl.group()
            else:
                print("Blocked or missing page, status code:", wb_data.status_code)
        except Exception as e:
            # the original bare except tried to re-crawl a hard-coded chapter here,
            # which was broken (undefined name); just report the failure instead
            print('Request failed:', e)

    def downImage(self):
        if not self.new_mhurl:
            return  # nothing was parsed for this page, skip the download
        path = "海贼王/" + str(self.page)
        url = re.sub('"', '', str(self.new_mhurl))  # strip the surrounding quotes
        imageUrl = "http://p17.xiaoshidi.net/" + url
        print(imageUrl)
        wb_data = requests.session().get(imageUrl, headers=self.headers, verify=False)
        os.makedirs(path, exist_ok=True)  # create 海贼王/<chapter>/ on first use
        with open(path + '/{}.jpg'.format(self.page_num), 'wb') as f:
            f.write(wb_data.content)

    def getNumber(self):
        self.headers = {
            'Upgrade-Insecure-Requests': '1',
            'Referer': 'https://manhua.fzdm.com/2/',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134'
        }
        url = 'https://manhua.fzdm.com/2/'
        rep_data = requests.get(url, headers=self.headers, verify=False).text
        doc = pq(rep_data)
        data = doc('.pure-g li a').items()
        thisset = list()
        for title in data:
            thisset.append(title.attr('href'))  # hrefs look like "935/"
        return thisset[3:5]  # keep only the two most recent chapters


if __name__ == '__main__':
    down = DownImage()
    number = down.getNumber()
    for page in number:
        for page_num in range(0, 20):
            # no slash before "index_": the chapter href already ends with "/"
            url = 'http://manhua.fzdm.com/2/{}index_{}.html'.format(page, page_num)
            down.parseMultiplePages(url, page, page_num)
            down.downImage()
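When run, the script creates one 海贼王/<chapter>/ folder per chapter and saves the pages inside it as 0.jpg, 1.jpg, and so on. The hard-coded range(0, 20) is just an upper bound on pages per chapter; indices past a chapter's real last page fail to parse and are simply skipped.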