Python爬虫实战之利用多线程爬取千图网的素材图片

千图网电商淘宝素材网址:https://www.58pic.com/piccate/3-0-0-p1.html

from  urllib import  request
import urllib
import random
from urllib.error import URLError
from urllib.request import ProxyHandler, build_opener
import re
def get_ip(path='ip.txt'):
    """Return one proxy address picked at random from the proxy pool file.

    Args:
        path: Text file that is read line by line; each non-blank line is
            treated as one candidate. Defaults to ``ip.txt`` in the current
            working directory.

    Returns:
        A randomly chosen line with surrounding whitespace removed.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If the file contains no non-blank lines
            (raised by ``random.choice`` on an empty list).
    """
    # The context manager closes the file even if reading fails.
    with open(path, 'r') as fr:
        # Skip blank lines so an empty string is never handed out as a proxy.
        candidates = [line.strip() for line in fr if line.strip()]
    return random.choice(candidates)
# Pick one proxy from the pool and register it for both HTTP and HTTPS requests.
proxy = get_ip()
proxy_handler = ProxyHandler({
    'http': 'http://' + proxy,
    'https': 'https://' + proxy,
})
opener = build_opener(proxy_handler)
# build_opener() only creates the opener. Installing it as the global default
# is what makes the urllib.request.urlopen()/urlretrieve() calls further down
# actually go through the proxy; without this the opener was never used.
urllib.request.install_opener(opener)
import  threading
class One(threading.Thread):
    """Thread that downloads the images listed on a set of 58pic category pages.

    For every page number it fetches the listing page, extracts each image
    address that follows a ``class="thumb-box"`` marker, and saves the
    1024-wide rendition of that image as a ``.jpg`` file.

    Args:
        pages: Optional iterable of page numbers to crawl. When omitted the
            original range ``range(1, 5, 2)`` (pages 1 and 3) is used.
    """

    # Captures the image address up to (not including) the ".jpg!" suffix.
    # The dot is escaped so that only a literal ".jpg!" terminates the match,
    # and the pattern is compiled once instead of once per page.
    IMAGE_PATTERN = re.compile(r'class="thumb-box".*?src="(.*?)\.jpg!')

    def __init__(self, pages=None):
        threading.Thread.__init__(self)
        self.pages = range(1, 5, 2) if pages is None else pages

    def run(self):
        """Fetch each page, then download every image it lists.

        A ``URLError`` on one page or one image is printed and skipped, so
        the remaining pages and images are still processed.
        """
        for i in self.pages:
            pageurl = 'http://www.58pic.com/piccate/3-0-0-p' + str(i) + '.html'
            try:
                # The context manager closes the HTTP response after reading.
                with urllib.request.urlopen(pageurl) as response:
                    data = response.read().decode('utf-8', 'ignore')
            except URLError as e:
                print(e.reason)
                # Only this page failed; carry on with the next one.
                continue

            for j, image_prefix in enumerate(self.IMAGE_PATTERN.findall(data)):
                # Re-attach the suffix that selects the full-size rendition;
                # the suffix was found by inspecting the site's image URLs.
                this_url = image_prefix + '.jpg!/fw/1024/watermark/url/L2ltYWdlcy93YXRlcm1hcmsveGlhb3R1LnBuZw==/align/center'
                # Target file name is <page number><image index>.jpg.
                file = 'D:/软件(学习)/Python/Test/chapter6/qiantu.photo/' + str(i) + str(j) + '.jpg'
                try:
                    urllib.request.urlretrieve(this_url, file)
                except URLError as e:
                    print(e.reason)
                else:
                    print('第' + str(i) + '页第' + str(j) + '个图片成功')

采用多线程的方法:线程 One 负责爬取第 1、3 页,下面再定义线程 Two 负责爬取第 2、4 页,两个线程同时运行。

class Two(threading.Thread):
    """Thread that downloads the images listed on a set of 58pic category pages.

    For every page number it fetches the listing page, extracts each image
    address that follows a ``class="thumb-box"`` marker, and saves the
    1024-wide rendition of that image as a ``.jpg`` file.

    Args:
        pages: Optional iterable of page numbers to crawl. When omitted the
            original range ``range(2, 5, 2)`` (pages 2 and 4) is used.
    """

    # Captures the image address up to (not including) the ".jpg!" suffix.
    # The dot is escaped so that only a literal ".jpg!" terminates the match,
    # and the pattern is compiled once instead of once per page.
    IMAGE_PATTERN = re.compile(r'class="thumb-box".*?src="(.*?)\.jpg!')

    def __init__(self, pages=None):
        threading.Thread.__init__(self)
        self.pages = range(2, 5, 2) if pages is None else pages

    def run(self):
        """Fetch each page, then download every image it lists.

        A ``URLError`` on one page or one image is printed and skipped, so
        the remaining pages and images are still processed.
        """
        for i in self.pages:
            pageurl = 'http://www.58pic.com/piccate/3-0-0-p' + str(i) + '.html'
            try:
                # The context manager closes the HTTP response after reading.
                with urllib.request.urlopen(pageurl) as response:
                    data = response.read().decode('utf-8', 'ignore')
            except URLError as e:
                print(e.reason)
                # Only this page failed; carry on with the next one.
                continue

            for j, image_prefix in enumerate(self.IMAGE_PATTERN.findall(data)):
                # Re-attach the suffix that selects the full-size rendition.
                this_url = image_prefix + '.jpg!/fw/1024/watermark/url/L2ltYWdlcy93YXRlcm1hcmsveGlhb3R1LnBuZw==/align/center'
                # Target file name is <page number><image index>.jpg.
                file = 'D:/软件(学习)/Python/Test/chapter6/qiantu.photo/' + str(i) + str(j) + '.jpg'
                try:
                    urllib.request.urlretrieve(this_url, file)
                except URLError as e:
                    print(e.reason)
                else:
                    print('第' + str(i) + '页第' + str(j) + '个图片成功')
# Build both worker threads, then launch them so they download concurrently.
one = One()
two = Two()
one.start()
two.start()

Python爬虫实战之利用多线程爬取千图网的素材图片_第1张图片
Python爬虫实战之利用多线程爬取千图网的素材图片_第2张图片
爬取成功!!!

你可能感兴趣的:(爬虫项目)