Python多线程爬取网站图片

多线程执行爬虫避免某个网络资源卡住其他资源下载;

Python线程相关知识点:

import threading  引入线程

t = threading.Thread(target=func, args=())   定义一个线程(target 是线程函数, args 是传给它的参数元组)

t.start() 线程开始

t.setDaemon(False) 默认,设置为前台(非守护)线程,主程序会等待它执行完毕;

t.setDaemon(True) 设置为后台守护线程,主程序退出时它会被直接终止;

t.join (当前程序)等待线程t执行完毕;

lock=threading.RLock() 创建可重入线程锁对象

lock.acquire() 强迫lock获取线程锁,如果被占用则等待

lock.release() 释放锁

import os
import threading
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup
from bs4 import UnicodeDammit

def imageSpider(start_url):
    """Fetch *start_url*, collect every <img> tag, and download each
    unique image URL in its own thread.

    Uses module-level globals:
      headers -- HTTP request headers sent with every request
      count   -- running image counter (names the saved files)
      threads -- list of started Thread objects, joined by the caller
    """
    global threads
    global count
    try:
        urls = []  # de-duplicate image URLs within this page
        req = urllib.request.Request(start_url, headers=headers)
        data = urllib.request.urlopen(req)
        data = data.read()
        # Let UnicodeDammit sniff the page encoding (utf-8 or gbk).
        dammit = UnicodeDammit(data, ["utf-8", "gbk"])
        data = dammit.unicode_markup
        soup = BeautifulSoup(data, "html.parser")
        images = soup.select("img")
        print(images)
        for image in images:
            try:
                src = image["src"]
                # urljoin is public API of urllib.parse; the original
                # called urllib.request.urljoin, which only works via an
                # accidental re-export.
                url = urllib.parse.urljoin(start_url, src)
                if url not in urls:
                    urls.append(url)
                    print(url)
                    count = count + 1
                    T = threading.Thread(target=download, args=(url, count))
                    # Non-daemon thread: the main program's join() loop
                    # waits for every download to finish.
                    # (setDaemon() is deprecated since Python 3.10.)
                    T.daemon = False
                    T.start()
                    threads.append(T)
            except Exception as err:
                # An <img> without "src" (or a malformed URL) must not
                # stop the remaining images on the page.
                print(err)
    except Exception as err:
        print(err)

def download(url, count):
    """Download one image from *url* and save it as images/<count><ext>.

    The extension is taken from the URL when its last 4 characters look
    like ".xxx"; otherwise ".jpg" is assumed.  Any failure is reported
    and swallowed so other download threads keep running.
    """
    try:
        # Guard the length first: on a URL shorter than 4 chars the
        # original url[len(url)-4] silently wrapped to a negative index.
        if len(url) >= 4 and url[-4] == ".":
            ext = url[-4:]
        else:
            ext = ".jpg"
        req = urllib.request.Request(url, headers=headers)
        data = urllib.request.urlopen(req, timeout=100)
        data = data.read()
        # Create the target directory on demand; the original assumed a
        # pre-existing Windows-only "images\\" folder.
        os.makedirs("images", exist_ok=True)
        # "with" guarantees the file is closed even if write() fails;
        # os.path.join keeps the path portable across OSes.
        with open(os.path.join("images", str(count) + ext), "wb") as fobj:
            fobj.write(data)
        print("downloaded" + str(count) + ext)
    except Exception as err:
        print(err)

# ---- script entry: configuration and kickoff ----

# NOTE(review): this URL looks scrubbed/truncated ("www..net") —
# fill in the real site before running.
start_url = "https://www..net/"

# Browser-like User-Agent so the server returns the normal page.
headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}

count = 0     # running image counter, used by download() to name files
threads = []  # threads started by imageSpider, joined below

imageSpider(start_url)

# Wait for every download thread to finish before exiting.
# (The original scrape lost this loop body's indentation.)
for t in threads:
    t.join()

print("the End")

你可能感兴趣的:(python多线程爬取图片)