Python爬取高清图片

坑了这么久,现在填上。
环境:Windows 10,Python 3。
之前爬过一些图片的网站:https://images.pexels.com和https://unsplash.com,根据网上找的一些资料和自己看的,现在贴出代码。

import requests  
import re  
import os  
import time  

def get_url(url):
    """Fetch *url* and return the requests.Response, or None on failure.

    Sends a desktop User-Agent header so the site does not reject the
    request as a bot. On any network/HTTP error this prints a marker and
    implicitly returns None (callers must check for that).
    """
    kw = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'}
    try:
        r = requests.get(url, headers=kw)
        r.raise_for_status()  # turn HTTP 4xx/5xx into an exception
        r.encoding = r.apparent_encoding  # guess text encoding from the body
        return r
    except requests.RequestException:  # was a bare except:, which hid real bugs
        print('wrong!!!!!!!!!!!')

def get_photourl(photo_url):
    """Fetch a single photo URL; return the Response, or the string 'wrong'.

    NOTE(review): the 'wrong' sentinel is inconsistent with get_url's
    None-on-failure, but save() compares against it, so it is kept for
    backward compatibility.
    """
    kw = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'}
    try:
        r = requests.get(photo_url, headers=kw)
        r.raise_for_status()  # turn HTTP 4xx/5xx into an exception
        r.encoding = r.apparent_encoding
        return r
    except requests.RequestException:  # was a bare except:, which hid real bugs
        return 'wrong'

def get_photos(url, new_fpath):
    """Scrape one Pexels search-result page and download every photo on it.

    url       -- the search-result page to scan
    new_fpath -- directory where save() writes the images
    """
    result = get_url(url)
    if result is None:  # get_url failed; original crashed on result.text here
        return
    # Dots escaped: the original pattern's bare '.' matched any character.
    pattern = re.compile(
        r'src="https://images\.pexels\.com/photos/(\d+)/(.*?)\?h=350&auto=compress&cs=tinysrgb"',
        re.S)
    # 真正的下载链接是static,不是images开头
    # (the real download host is static.pexels.com, not images.pexels.com)
    items = re.findall(pattern, result.text)
    for item in items:
        try:
            # Swap the images host for the static host to get the real file.
            photo_url = 'https://static.pexels.com/photos/' + str(item[0]) + '/' + str(item[1])
            print(photo_url)
            save(photo_url, item, new_fpath)
            time.sleep(1)  # be polite to the server
        except Exception:  # best-effort: skip this photo, keep the rest
            continue

def makedir(new_fpath, i, key):
    """Create directory *new_fpath* (and chdir into it) if it does not exist.

    i, key are only used to build the success message (page number / keyword).
    """
    if os.path.exists(new_fpath):
        print('文件夹已存在!')
    else:
        os.makedirs(new_fpath)
        os.chdir(new_fpath)  # NOTE(review): global side effect kept as-is
        print('文件夹' + key + '_page' + str(i + 1) + '创建成功!')

def save(photo_url, item, new_fpath):
    """Download one photo and write it to new_fpath/<id><filename>.

    item is a (photo_id, filename) tuple from the get_photos regex.
    Skips the write when the file already exists.
    """
    Final_fpath = new_fpath + '/' + str(item[0]) + str(item[1])

    print('正在下载图片......')

    result = get_photourl(photo_url)
    # Bug fix: the original printed '下载成功!' before writing anything, and on
    # failure still fell through to result.content ('wrong'.content raised
    # AttributeError, masked by a bare except). Guard and return instead.
    if result == 'wrong' or result is None:
        print('失败')
        return
    print('下载成功!')

    if not os.path.exists(Final_fpath):
        try:
            with open(Final_fpath, 'wb') as f:
                f.write(result.content)
        except OSError:  # disk/permission problems only
            print('下载失败!')
    else:
        print('图片已存在')

def main():
    """Prompt for a keyword and a page count, then download pages 1..num."""
    key = input('请输入搜索关键词(英文):')

    url = 'https://www.pexels.com/search/' + key + '/'

    num = int(input('请输入一共要下载的页数:'))  # 默认从第1页开始下载

    fpath = '*****'  # TODO: replace placeholder with a real base directory
    # Bug fix: range(2, num) started at page 3 and made the else branch dead,
    # contradicting the "从第1页开始" comment above. range(num) covers pages 1..num.
    for i in range(num):
        new_fpath = fpath + '/Photo2.0/' + key + '_page' + str(i + 1)
        makedir(new_fpath, i, key)
        if i >= 1:
            new_url = url + '?page=' + str(i + 1)
            print(new_url)
            get_photos(new_url, new_fpath)
        else:
            get_photos(url, new_fpath)  # page 1 has no ?page= query string
        time.sleep(3)  # throttle between pages

if __name__ == '__main__':  # guard so importing this module does not start a crawl
    main()

爬取https://unsplash.com的时候需要利用selenium模拟下拉操作。需要先 pip install selenium,并下载与本机 Chrome 版本匹配的 chromedriver,放在谷歌浏览器的安装目录(Chrome\Application)下。

from selenium import webdriver#实现自动下拉
from lxml import etree#定位元素(更加高效)
from urllib.parse import urlparse#解析图片的名称
import urllib.request#urlretrieve()下载保存图片
import re
import time


class Unsplash:
    """Scrape photos from unsplash.com by scrolling the page with Selenium.

    Workflow: open the search page, scroll to the bottom repeatedly so the
    lazy-loaded images appear, then regex the page source for image URLs and
    download each one with urllib.
    """

    def __init__(self):
        # Search-result page to scrape.
        self.url = 'https://unsplash.com/search/photos/label'
        # TODO: replace placeholder with a real directory (keep a trailing
        # path separator — save_img concatenates path + filename directly).
        self.save_path = "****"
        self.driver = webdriver.Chrome()

    def do_scroll(self, times):
        """Open self.url, scroll to the bottom *times* times, return page source."""
        driver = self.driver
        driver.get(self.url)
        for i in range(times):
            print('正在下拉' + str(i + 1) + '次:')
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            print('等待' + str(i + 1) + '次网页加载')
            time.sleep(40)  # wait for lazy-loaded images; tune for your connection
        return driver.page_source

    def save_img(self, src, img_name):
        """Download *src* and save it as save_path + img_name."""
        urllib.request.urlretrieve(src, filename=self.save_path + img_name)

    def get_pic(self, html):
        """Extract every photo URL from *html* and download each as <n>.jpg."""
        # Dots escaped: the original pattern's bare '.' matched any character.
        pattern = re.compile(r'img src="https://images\.unsplash\.com/photo(.*?)"', re.S)
        items = re.findall(pattern, html)
        # Bug fix: the original incremented the counter before first use, so
        # the first image was saved as '2.jpg'. enumerate(start=1) names them
        # 1.jpg, 2.jpg, ... in order.
        for count, tail in enumerate(items, start=1):
            url = 'https://images.unsplash.com/photo' + tail
            self.save_img(url, str(count) + '.jpg')

    def main(self):
        """Scroll 20 times, then download everything found on the page."""
        html = self.do_scroll(20)
        print("开始下载图片")
        self.get_pic(html)
if __name__ == '__main__':  # guard so importing this module does not launch a browser
    img = Unsplash()
    img.main()

你可能感兴趣的:(python)