# Multithreaded image crawler (多线程图片爬虫)


import os,requests,html5lib,re,threading
from bs4 import BeautifulSoup
def downloadXXOOimage(startComic, endComic):
    """Download comic images for directory pages [startComic, endComic).

    For each directory page, collect the links to the individual image
    pages, then save every image found on those pages under
    d:\\photo\\<page title>.
    """
    pre_url = 'XXX_'  # base URL hidden by the original author
    headers = {"Accept": "text/html,application/xhtml+xml,application/xml;",
               "Accept-Encoding": "gzip",
               "Accept-Language": "zh-CN,zh;q=0.8",
               "Referer": "http://www.example.com/",
               "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36"
               }
    for urlNumber in range(startComic, endComic):
        print('已进入%s页面' % (urlNumber))  # entering directory page
        url = pre_url + str(urlNumber) + '.html'
        # BUG FIX: the second positional argument of requests.get is
        # `params`, not `headers` — the headers were never being sent.
        res1 = requests.get(url, headers=headers)
        res1.raise_for_status()
        soup1 = BeautifulSoup(res1.text, 'html5lib')
        # Links to the individual image pages listed on this directory page.
        comElem1 = soup1.select('div[class="typelist"] > ul li a')
        for content_url in comElem1:
            imgpage_url = 'XXX' + content_url.get('href')  # base URL hidden
            res2 = requests.get(imgpage_url, headers=headers)
            res2.raise_for_status()
            soup2 = BeautifulSoup(res2.text, 'html5lib')
            title = soup2.title.string
            title_name = re.findall('(.*?)-', title)  # strip the site-name suffix after '-'
            if not title_name:
                # Title had no '-' separator; skip instead of crashing on [0].
                continue
            print('已打开%s页面' % title_name[0])  # opened image page
            path_name = os.path.join("d:\\photo", title_name[0])
            # makedirs(exist_ok=True): os.mkdir raised FileExistsError on a
            # re-run or when two pages share a title.
            os.makedirs(path_name, exist_ok=True)
            comElem2 = soup2.select('#view1 img')  # every image on this page
            for test_url in comElem2:
                img_url = test_url.get('src')
                res3 = requests.get(img_url, headers=headers)
                res3.raise_for_status()
                # Write via an absolute path instead of os.chdir():
                # chdir is process-wide and races between worker threads.
                img_path = os.path.join(path_name, os.path.basename(img_url))
                print('正在下载%s张图片' % os.path.basename(img_url))
                # 'with' guarantees the file is closed even if a chunk fails.
                with open(img_path, 'wb') as imgFile:
                    for chunk in res3.iter_content(10000):
                        imgFile.write(chunk)
if __name__ == "__main__":
    # One worker thread per 10-page slice of the site's 160 directory pages.
    downloadThreads = []
    for i in range(1, 160, 10):
        # BUG FIX: the end argument was i + 9, but downloadXXOOimage treats
        # it as an exclusive bound (range(start, end)), so every 10th page
        # (10, 20, ...) was silently skipped. i + 10 covers the full slice.
        downloadThread = threading.Thread(target=downloadXXOOimage,
                                          args=(i, i + 10))
        downloadThreads.append(downloadThread)
        downloadThread.start()
    # Join so the main thread waits for all downloads before exiting.
    for downloadThread in downloadThreads:
        downloadThread.join()


# You may also be interested in: (multithreaded image crawler / 多线程图片爬虫)