【爬虫】批量下载某壁纸网站的图片

这个网站上有些飞机壁纸很漂亮，一张一张地下载太慢了，于是用 jsoup 写了个小爬虫练手。

https://10wallpaper.com

翠花，上酸菜！错了，上代码：

package net.downPic.downPic_Jsoup;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.sql.Timestamp;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import net.DownloadV2;

//Batch-downloads wallpapers from 10wallpaper.com
/**
 * Small jsoup crawler: reads the album links from a category index page, visits the
 * first pages of every album and hands each picture found there to {@link DownloadV2}.
 */
public class downPicJsoup {

    /** Site root, prepended to root-relative links found in the pages. */
    private static final String SITE_ROOT = "https://10wallpaper.com";

    /** Category index page whose "pics-list" element holds the album links. */
    private static final String START_URL = "https://10wallpaper.com/cn/Military_wallpapers.html";

    /** Advertisement banner that shows up among the pictures and must not be downloaded. */
    private static final String AD_IMAGE_URL = "https://10wallpaper.com/ad/ad-cn.jpg";

    /** Directory (with trailing separator) the pictures are written to. */
    private static final String SAVE_DIR = "F://飞机/";

    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20100101 Firefox/22.0";

    private static final int TIMEOUT_MS = 30000;

    /** How many pages of each album are visited (the album page itself plus /page/2, /page/3). */
    private static final int PAGES_PER_ALBUM = 3;

    /** Text in an image's alt attribute after which only the view counter follows. */
    private static final String VIEW_COUNT_MARKER = "浏览:";

    /**
     * Crawls every album linked from {@link #START_URL}.
     *
     * @param args unused
     * @throws IOException if the index page itself cannot be fetched; failures of
     *                     individual album pages are reported and skipped
     */
    public static void main(String[] args) throws IOException {
        Document index = fetch(START_URL);
        Element albumList = index.getElementById("pics-list");
        if (albumList == null) {
            System.err.println("No element with id 'pics-list' in " + START_URL);
            return;
        }

        for (Element link : albumList.getElementsByTag("a")) {
            String href = link.attr("href");
            if (href.isEmpty()) {
                continue;
            }
            String albumUrl = toAbsoluteUrl(href);
            for (int page = 1; page <= PAGES_PER_ALBUM; page++) {
                String pageUrl = albumPageUrl(albumUrl, page);
                try {
                    getAllPicsInPage(pageUrl);
                } catch (IOException e) {
                    // One unreachable page (e.g. an album shorter than PAGES_PER_ALBUM)
                    // must not abort the rest of the crawl.
                    System.err.println("Skipping " + pageUrl + ": " + e);
                }
            }
        }
    }

    /**
     * Downloads the full-size version of every picture shown on one album page.
     *
     * @param savePath despite its name, the URL of the album page to scrape (this is what
     *                 {@link #main} passes); it is not a filesystem path
     * @throws IOException if the page cannot be fetched
     */
    public static void getAllPicsInPage(String savePath) throws IOException {
        Document doc = fetch(savePath);
        DownloadV2 downloader = new DownloadV2();

        for (Element img : doc.getElementsByTag("img")) {
            String src = img.attr("src");
            if (src.isEmpty()) {
                continue;
            }
            String picUrl = toAbsoluteUrl(src);
            if (AD_IMAGE_URL.equals(picUrl)) {
                continue;
            }

            // Thumbnails live under ".../medium/..."; swap in the high-resolution variant.
            picUrl = picUrl.replace("medium", "1920x1200");

            String fileName = SAVE_DIR + buildFileName(img.attr("alt"), picUrl);
            System.out.println(picUrl);
            downloader.getFile(picUrl, fileName);
        }
    }

    /** Fetches and parses one page with the crawler's user agent and timeout. */
    private static Document fetch(String url) throws IOException {
        return Jsoup.connect(url).userAgent(USER_AGENT).timeout(TIMEOUT_MS).get();
    }

    /** Prefixes the site root to a relative reference; already absolute URLs pass through. */
    private static String toAbsoluteUrl(String ref) {
        if (ref.startsWith("http://") || ref.startsWith("https://")) {
            return ref;
        }
        return SITE_ROOT + ref;
    }

    /**
     * Builds the URL of the given page of an album. Page 1 is the album URL itself; later
     * pages follow the pattern
     * https://10wallpaper.com/cn/list/Aviation_aircraft_photography_HD_wallpaper/page/2.
     */
    private static String albumPageUrl(String albumUrl, int page) {
        if (page <= 1) {
            return albumUrl;
        }
        int suffix = albumUrl.indexOf(".html");
        String base = suffix >= 0 ? albumUrl.substring(0, suffix) : albumUrl;
        return base + "/page/" + page;
    }

    /**
     * Derives the local file name "<caption>_<remote file name>" from an image's alt text
     * and URL, replacing characters that are not allowed in Windows file names.
     */
    private static String buildFileName(String alt, String picUrl) {
        String caption = alt.replace(" ", "_");
        // Cut the trailing view counter, but only if the alt text actually has one.
        int marker = caption.indexOf(VIEW_COUNT_MARKER);
        if (marker >= 0) {
            caption = caption.substring(0, marker);
        }
        String remoteName = picUrl.substring(picUrl.lastIndexOf('/') + 1);
        return (caption + "_" + remoteName).replaceAll("[\\\\/:*?\"<>|]", "_");
    }

}

package net;

/*
 * V2 note (translated from the original author's comment): when the input or the network
 * is unstable, the data obtained from the InputStream can be incomplete and end up padded
 * with zeros, which corrupts the file. getFile writes only the number of bytes that each
 * read call actually returned.
 */

/** Minimal downloader that copies the content behind a URL into a local file. */
public class DownloadV2 {

    /** Connect/read timeout so that a stalled server cannot block a download forever. */
    private static final int TIMEOUT_MS = 30000;

    private static final String USER_AGENT =
            "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)";

    /**
     * Demo entry point: downloads one sample wallpaper to drive F: and exits.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        DownloadV2 dl = new DownloadV2();
        String urlstring = "https://10wallpaper.com/wallpaper/2560x1600/1204/F_35_jet-Military_aircraft_wallpaper_2560x1600.jpg";

        // Strip a query string first so that neither '?' nor a '/' inside it leaks
        // into the file name.
        int query = urlstring.indexOf('?');
        String path = query == -1 ? urlstring : urlstring.substring(0, query);
        String filename = "F://" + path.substring(path.lastIndexOf('/') + 1);

        dl.getFile(urlstring, filename);
        System.exit(0);
    }

    /**
     * Downloads {@code urlstring} and stores the bytes in {@code filename}, creating the
     * destination directory if necessary. I/O failures (malformed URL, unreachable host,
     * unwritable file) are printed to stderr and not propagated.
     *
     * @param urlstring URL to read from; HTTP(S) URLs are requested with GET and a browser
     *                  user agent, other protocols supported by {@link URL} are read as-is
     * @param filename  path of the file to write
     */
    public void getFile(String urlstring, String filename) {
        try {
            URL url = new URL(urlstring);
            // NOTE(review): JVM-wide setting kept from the original; the explicit
            // User-agent header below presumably makes it redundant - confirm before removing.
            System.setProperty("http.agent", "IE/6.0");

            java.net.URLConnection connection = url.openConnection();
            connection.setConnectTimeout(TIMEOUT_MS);
            connection.setReadTimeout(TIMEOUT_MS);
            if (connection instanceof HttpURLConnection) {
                HttpURLConnection http = (HttpURLConnection) connection;
                http.setRequestMethod("GET");
                http.setRequestProperty("User-agent", USER_AGENT);
            }

            // Open the remote stream first: nothing is created locally if the request fails.
            try (InputStream in = new BufferedInputStream(connection.getInputStream())) {
                File file = new File(filename);
                File parent = file.getParentFile();
                if (parent != null && !parent.isDirectory() && !parent.mkdirs()) {
                    throw new IOException("Cannot create directory " + parent);
                }
                try (FileOutputStream out = new FileOutputStream(file)) {
                    System.out.println("开始下载");
                    byte[] buffer = new byte[1024];
                    int readnum;
                    // read() may fill only part of the buffer; write exactly what arrived.
                    while ((readnum = in.read(buffer)) != -1) {
                        out.write(buffer, 0, readnum);
                    }
                    System.out.println("下载完毕");
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

}

【爬虫】批量下载某壁纸网站的图片

你可能感兴趣的:(【爬虫】批量下载某壁纸网站的图片)