Getting the image URLs:
import requests
from lxml import etree

url = "http://www.doutula.com/"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
}

res = requests.get(url, headers=headers)
html = etree.HTML(res.text)
# the page lazy-loads its images, so the real URL sits in the data-original attribute
srcs = html.xpath(".//img/@data-original")
for i in srcs:
    print(i)
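Images that are not lazy-loaded have no data-original attribute, so the XPath above silently skips them. If you also want those, you could fall back to the plain src attribute; a minimal sketch, assuming the same page and variables as above:

# sketch: prefer data-original, fall back to src (assumes the html object from above)
imgs = html.xpath(".//img")
for img in imgs:
    src = img.get("data-original") or img.get("src")
    if src:
        print(src)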
Fetching the images and downloading them
import os
import requests
from lxml import etree

url = "http://www.doutula.com/"
headers = {
    'referer': 'http://www.doutula.com/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
}

res = requests.get(url, headers=headers)
html = etree.HTML(res.text)
srcs = html.xpath(".//img/@data-original")

os.makedirs('imgs', exist_ok=True)  # make sure the target directory exists
for i in srcs:
    filename = i.split('/')[-1]  # use the last part of the URL as the file name
    img = requests.get(i, headers=headers, verify=False)
    with open('imgs/' + filename, 'wb') as file:
        file.write(img.content)  # binary content of the image
    print(i, filename)
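img.content loads each file completely into memory before it is written out. For larger files a streamed download writes the response in chunks instead; a minimal sketch of that variant (save_image and the chunk size are my own choices, not part of the original code):

import requests

def save_image(src, path, headers):
    # stream=True: iterate over the response body instead of buffering it all in memory
    with requests.get(src, headers=headers, stream=True, verify=False) as resp:
        with open(path, 'wb') as file:
            for chunk in resp.iter_content(chunk_size=8192):
                file.write(chunk)

# usage, inside the loop above: save_image(i, 'imgs/' + filename, headers)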
Downloading multiple pages quickly and saving each page into its own directory
import os
import time
import requests
from concurrent import futures
from lxml import etree

headers = {
    'referer': 'http://www.doutula.com/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
}

def download_img(src, dirname):
    filename = src.split('/')[-1]
    dirnames = 'imgs/{}'.format(dirname)  # one sub-directory per page
    try:
        if not os.path.exists(dirnames):
            os.makedirs(dirnames)
    except OSError:
        pass
    # fetch the image itself (not the list page)
    img = requests.get(src, headers=headers, verify=False)
    with open('{}/{}'.format(dirnames, filename), 'wb') as file:
        file.write(img.content)  # binary content of the image

def get_page(url):
    res = requests.get(url, headers=headers)
    dirname = url.split('page=')[-1]  # name the directory after the page number
    print(res, url)
    html = etree.HTML(res.text)
    srcs = html.xpath(".//img/@data-original")
    # hand the downloads off to a thread pool instead of downloading them one by one
    ex = futures.ThreadPoolExecutor(max_workers=40)
    for src in srcs:
        ex.submit(download_img, src, dirname)
    # sequential version, for comparison:
    # for src in srcs:
    #     download_img(src, dirname)
    next_link = html.xpath('.//a[@rel="next"]/@href')
    return next_link

def main():
    next_link_base = "http://www.doutula.com/article/list/?page="
    current_num = 0
    next_link = ['http://www.doutula.com']
    while next_link:
        time.sleep(0.2)
        current_num = current_num + 1
        next_link = get_page(next_link_base + str(current_num))
        if current_num >= 4:  # only crawl the first 4 pages
            break

if __name__ == "__main__":
    main()
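Note that get_page creates a fresh pool of 40 threads on every call and never shuts it down, so downloads from one page can still be running while the next page is already being fetched. If you prefer each page to finish (and release its threads) before moving on, the submit loop can be wrapped in a with-block; a minimal sketch using the same names as above:

# inside get_page: wait for this page's downloads before returning
with futures.ThreadPoolExecutor(max_workers=40) as ex:
    for src in srcs:
        ex.submit(download_img, src, dirname)
# leaving the with-block calls ex.shutdown(wait=True), so all images are on disk here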