python 爬取漫画《黑鹭尸体宅配便》

想看《黑鹭尸体宅配便》这部漫画,全网找了一圈,资源特别少,手机上资源很少,虽然网页能看,但是想用手机看比较耗费流量。就想把漫画下载下来,放在手机上看。

用requests+re访问漫画网站,下载漫画到电脑上。


非常开心 ,该网站只有1-15卷,16-21卷的mobi版从百度云下载了

这部漫画的资源太少了,能找到很幸运了

缺点是太慢了。【后期看看再改改】


import re
import requests
import os
def gethtml(url):
    """Fetch *url* from manhuadb.com and return the requests Response.

    Returns the falsy sentinel '' on any request failure, matching what
    the callers in this script test for.

    Browser-like headers (Referer / User-Agent / Host) are sent so the
    site serves the page instead of rejecting the request.
    """
    headers = {
        'Referer': 'http://www.manhuadb.com/manhua/1001',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
        'Host': 'www.manhuadb.com',
    }
    try:
        # timeout keeps one dead connection from hanging the whole crawl
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r
    except requests.RequestException:
        # Only network/HTTP errors are expected here; a bare `except:`
        # would also hide programming errors and KeyboardInterrupt.
        return ''




def download(jpgurl, path):
    """Download one page image and save it as <volume_dir>/<page_name>.jpg.

    *path* is a [volume_dir, page_name] pair, as built by the callers.
    A failed fetch is reported and skipped instead of crashing the crawl.
    """
    resp = gethtml(jpgurl)
    if not resp:
        # gethtml returns '' on failure; ''.content would raise AttributeError
        print('download failed:', jpgurl)
        return
    # makedirs(..., exist_ok=True) avoids the check-then-create race of
    # os.path.exists + os.mkdir and also works for nested directories
    os.makedirs(path[0], exist_ok=True)
    filename = os.path.join(path[0], path[1] + '.jpg')
    with open(filename, 'wb') as f:
        f.write(resp.content)


def main():
    # Crawl the series index at manhuadb.com/manhua/1001: collect the volume
    # links, then walk every page of every volume and download its image via
    # download() — single-threaded version.
    # NOTE(review): the regular-expression patterns below are empty or
    # truncated (e.g. re.findall('',html)) — they were destroyed when this
    # article was extracted. The original patterns must be restored from the
    # source before this function can run.
    url='http://www.manhuadb.com/manhua/1001'
    html=gethtml(url).text
    urlslist=re.findall('',html)
    for url1 in urlslist:
        url='http://www.manhuadb.com'+url1
        html=gethtml(url).text
        # NOTE(review): the code from here on was collapsed onto one line by
        # the extraction and is not valid Python as shown.
        name1=re.findall('

(.*?)

'
,html)[0] pagelist=re.findall('',html) pages=int(re.findall('共 (\d*) 页',html)[0]) for page in pagelist[:pages]: pageurl='http://www.manhuadb.com'+page[0] path=[name1,page[1]] print(pageurl,page[1]) html=gethtml(pageurl).text jpgurl='http://www.manhuadb.com'+re.findall(r',html)[0] download(jpgurl,path) main()
 
  

更新多线程方式:

import re
import requests
import os
import threading      #加入多线程模块
def gethtml(url):
    """Fetch *url* from manhuadb.com and return the requests Response.

    Returns the falsy sentinel '' on any request failure, matching what
    the callers in this script test for.

    Browser-like headers (Referer / User-Agent / Host) are sent so the
    site serves the page instead of rejecting the request.
    """
    headers = {
        'Referer': 'http://www.manhuadb.com/manhua/1001',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
        'Host': 'www.manhuadb.com',
    }
    try:
        # timeout keeps one dead connection from hanging the whole crawl
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r
    except requests.RequestException:
        # Only network/HTTP errors are expected here; a bare `except:`
        # would also hide programming errors and KeyboardInterrupt.
        return ''




def download(jpgurl, path):
    """Download one page image and save it as <volume_dir>/<page_name>.jpg.

    *path* is a [volume_dir, page_name] pair, as built by the callers.
    A failed fetch is reported and skipped instead of crashing the crawl.
    """
    resp = gethtml(jpgurl)
    if not resp:
        # gethtml returns '' on failure; ''.content would raise AttributeError
        print('download failed:', jpgurl)
        return
    # makedirs(..., exist_ok=True) avoids the check-then-create race of
    # os.path.exists + os.mkdir and also works for nested directories;
    # this matters here because threads in loop() may race on the same dir
    os.makedirs(path[0], exist_ok=True)
    filename = os.path.join(path[0], path[1] + '.jpg')
    with open(filename, 'wb') as f:
        f.write(resp.content)

def loop(page,name1):                          # fetch and save one page image of this volume (thread worker)
    # *page* is a (relative_url, page_name) tuple from main()'s pagelist;
    # *name1* is the volume title used as the output directory.
    pageurl = 'http://www.manhuadb.com' + page[0]
    path = [name1, page[1]]
    print(pageurl, page[1])
    html = gethtml(pageurl).text
    # NOTE(review): the regex below is truncated (unterminated r'...') — the
    # pattern was lost during article extraction and must be restored before
    # this line is valid Python.
    jpgurl = 'http://www.manhuadb.com' + re.findall(r', html)[0]
    download(jpgurl, path)

def main():
    # Same crawl as the single-threaded version, but each page of a volume is
    # fetched by its own thread (target=loop); per the trailing mashed line,
    # the last thread of a volume is join()ed before moving to the next one.

    # NOTE(review): the regular-expression patterns below are empty or
    # truncated — they were destroyed when this article was extracted and
    # must be restored before this function can run.
    url='http://www.manhuadb.com/manhua/1001'
    html=gethtml(url).text
    urlslist=re.findall('',html)
    for url1 in urlslist:
        url='http://www.manhuadb.com'+url1
        html=gethtml(url).text
        # NOTE(review): the code from here on was collapsed onto one line by
        # the extraction and is not valid Python as shown.
        name1=re.findall('

(.*?)

'
,html)[0] pagelist=re.findall('',html) pages=int(re.findall('共 (\d*) 页',html)[0]) i=0 threads = []                                                #本集漫画 for page in pagelist[:pages]: t = threading.Thread(target=loop, args=(page,name1))    #记录爬取漫画一页为一个线程 threads.append(t)                                       #把这一页加入列表 threads[i].start()                                      #开始爬取这一页 i=i+1                                                   #记录第多少集 threads[-1].join()                                          #这一集最后一页爬完再执行,爬下一集 main()

总结一下:昨天2000张漫画用时1个小时,一个是网站本身比较慢,另一个是因为单线程

                今天3000张漫画用时1分钟,太快了


你可能感兴趣的:(python 爬取漫画《黑鹭尸体宅配便》)