Python:多种方式爬取网站图片(附源码)

获取图片网址:

import urllib
import requests
from lxml import etree

# Page to scrape and the browser-like User-Agent sent with the request.
url = "http://www.doutula.com/"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
}

# Fetch the page and parse the returned markup into an element tree.
res = requests.get(url, headers=headers)
html = etree.HTML(res.text)

# Collect the ``data-original`` attribute of every <img> element and
# print each value on its own line.
srcs = html.xpath(".//img/@data-original")
for image_url in srcs:
    print(image_url)

获取图片并下载到本地:

import os
import urllib
import requests
from lxml import etree

# Page to scrape; the same headers (referer + browser-like User-Agent) are
# sent with the page request and with every image request.
url = "http://www.doutula.com/"
headers = {
    'referer': 'http://www.doutula.com/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
}

res = requests.get(url, headers=headers)
html = etree.HTML(res.text)
srcs = html.xpath(".//img/@data-original")

# open(..., 'wb') does not create missing directories, so make sure the
# output directory exists before the first write.
os.makedirs('imgs', exist_ok=True)

for i in srcs:
    # Use the last path segment of the image URL as the local file name.
    filename = i.split('/')[-1]
    if not filename:
        # URL ends with '/': there is nothing usable as a file name.
        continue
    # NOTE(review): verify=False disables TLS certificate verification for
    # the image request; kept from the original, confirm it is really needed.
    img = requests.get(i, headers=headers, verify=False)
    with open(os.path.join('imgs', filename), 'wb') as file:
        file.write(img.content)  # raw bytes of the image response
    print(i, filename)

快速分页下载,并按页码存到不同目录下:

import urllib
import requests
import os
import time
from concurrent import futures
# Site root; also sent as the ``referer`` header below.
url = "http://www.doutula.com/"

# Headers attached to every request made by the functions in this script.
headers = {
    'referer': 'http://www.doutula.com/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
}

from lxml import etree

def download_img(src, dirname):
    """Download the image at *src* into ``imgs/<dirname>/``.

    The local file name is the last path segment of *src*.

    Args:
        src: URL of the image to download.
        dirname: Name of the sub-directory of ``imgs/`` to store the file in.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        OSError: If the directory or the file cannot be created.
    """
    filename = src.split('/')[-1]
    if not filename:
        # URL ends with '/': there is nothing usable as a file name.
        return

    dirnames = 'imgs/{}'.format(dirname)
    # exist_ok=True tolerates the directory already existing, including when
    # another worker thread creates it at the same moment, while still
    # letting genuine errors (e.g. permissions) surface.
    os.makedirs(dirnames, exist_ok=True)

    # Request the image itself (``src``), not the module-level site ``url``.
    # NOTE(review): verify=False disables TLS certificate verification; kept
    # from the original, confirm it is really needed.
    img = requests.get(src, headers=headers, verify=False, timeout=30)
    # Do not save an HTTP error page under an image file name.
    img.raise_for_status()

    with open('{}/{}'.format(dirnames, filename), 'wb') as file:
        file.write(img.content)  # raw bytes of the image response
def get_page(url):
    """Fetch one listing page, queue its images for download, return next links.

    The text after the last ``page=`` in *url* is used as the directory name
    passed to ``download_img``. Each image download is submitted to a new
    40-worker thread pool; this function does not wait for them to finish.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        The result of the XPath query ``.//a[@rel="next"]/@href`` on the page
        (empty when the page has no such link).
    """
    response = requests.get(url, headers=headers)
    page_label = url.rpartition('page=')[2]
    print(response, url)

    tree = etree.HTML(response.text)
    image_urls = tree.xpath(".//img/@data-original")

    # Fire-and-forget: downloads keep running after this function returns.
    pool = futures.ThreadPoolExecutor(max_workers=40)
    for image_url in image_urls:
        pool.submit(download_img, image_url, page_label)

    return tree.xpath('.//a[@rel="next"]/@href')
def main(max_pages=4, delay=0.2):
    """Crawl the article list pages in order, starting at page 1.

    Stops after *max_pages* pages, or earlier as soon as ``get_page``
    reports no "next" link for the page it just processed.

    Args:
        max_pages: Maximum number of list pages to fetch.
        delay: Seconds to sleep before each page request.
    """
    next_link_base = "http://www.doutula.com/article/list/?page="
    for page_number in range(1, max_pages + 1):
        # Short pause before every request, including the first one.
        time.sleep(delay)
        if not get_page(next_link_base + str(page_number)):
            break


if __name__ == "__main__":
    main()

 

你可能感兴趣的:(Python)