Java爬虫--爬取知乎图片

为了熟悉一下Java的网络编程方面的知识,就动手做了一个爬虫,很简单,主要就是根据目标网页链接获取网页源码,再提取出其中的链接。程序主要有五个类
Java爬虫--爬取知乎图片_第1张图片
按顺序来说,Download_links.java类

import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class Download_links {
    // Collects the question/answer page links found in a target page and appends them to a text file.

    /** Generic URL regex (borrowed as-is); compiled once because it never changes between calls. */
    private static final Pattern LINK_PATTERN = Pattern.compile(
            "(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]");

    /**
     * Length of a standard Zhihu answer link. Matches are filtered purely by this length,
     * e.g. https://www.zhihu.com/question/58498720/answer/617768326
     */
    private static final int ANSWER_LINK_LENGTH =
            "https://www.zhihu.com/question/58498720/answer/617768326".length();

    /** Text file the extracted links are appended to. */
    private static final String OUTPUT_FILE = "C:\\Users\\12733\\Desktop\\links\\HtmlLinks.txt";

    /**
     * Fetches the page at {@code URL}, extracts every link whose length equals that of a
     * standard answer link, de-duplicates them and appends them (one per line) to the output file.
     *
     * @param URL address of the page to scan
     * @return a completion message when every link was written, or a failure message when the
     *         page could not be fetched or a write failed (links written before the failure
     *         remain in the file)
     */
    public static String downloadLinks(String URL){
        // Fetch the page source; getMessage returns null when the request fails.
        String targetHTML = getURLMessage.getMessage(URL);
        if (targetHTML == null) {
            // Without this guard Pattern.matcher(null) would throw a NullPointerException.
            System.out.println("写入失败!");
            return "htmlLinks写入失败!";
        }

        // A HashSet keeps each link only once.
        HashSet<String> links = new HashSet<>();
        Matcher matcher = LINK_PATTERN.matcher(targetHTML);
        while (matcher.find()) {
            String link = matcher.group();
            if (link.length() == ANSWER_LINK_LENGTH) {
                links.add(link);
            }
        }

        // Links collected; append them to the text file.
        try {
            for (String link : links) {
                new RW_File().write_txt(OUTPUT_FILE, link + "\n", true);
            }
        } catch (IOException e) {
            System.out.println("写入失败!");
            e.printStackTrace();
            // Do not report success when the write was interrupted.
            return "htmlLinks写入失败!";
        }
        return "htmlLinks写入完成!";
    }
}

调用getURLMessage.getMessage方法获取目标网页的源码,再使用正则表达式提取出其中所有的链接。又因为知乎中同类链接的长度是相等的,所以可以根据链接长度筛选出所需要的链接,然后使用HashSet进行存储以确保唯一性;

getURLMessage.java类

public class getURLMessage {
    // Fetches the HTML of a target link.

    /**
     * Issues an HTTP GET for {@code URL} with a desktop-browser User-Agent and returns the
     * response body decoded as UTF-8.
     *
     * @param URL address of the page to fetch
     * @return the response body, or {@code null} when anything goes wrong while fetching
     *         (the exception is printed to stderr)
     */
    public static String getMessage (String URL){
        String contents=null;
        HttpClient httpClient = null;
        try {
            httpClient = HttpClients.createDefault();// create the client
            HttpGet httpGet = new HttpGet(URL);// build the request
            // Present a browser User-Agent with the request.
            httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763");
            HttpResponse response = httpClient.execute(httpGet);
            contents = EntityUtils.toString(response.getEntity(), "utf-8");// decode as utf-8
        }catch (Exception e){
            e.printStackTrace();
        }finally {
            // A new client is created on every call, so it must be released here or its
            // connections accumulate. The client returned by HttpClients.createDefault() is
            // expected to be Closeable (Apache HttpClient 4.3+ - confirm against the project's
            // dependency version); the instanceof check keeps this safe if it is not.
            if (httpClient instanceof java.io.Closeable) {
                try {
                    ((java.io.Closeable) httpClient).close();
                } catch (java.io.IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return contents;
    }
}

然后就是开始下载图片了。下载图片有两个类:一个是Download_pic类,一个是linkTOpic类。前者可以根据图片的链接直接进行下载;后者集成了前几个类的功能,可以根据目标网页链接提取出其中的图片链接再进行下载。

Download_pic

import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
public class Download_pic {
    // Downloads a picture from a direct image URL.

    /**
     * Downloads the resource at {@code urlList} and saves it as
     * {@code path + <name> + ".jpg"}, where {@code <name>} is characters 30..39 of the URL
     * (a fragment of the image hash for standard zhimg.com picture links).
     * Failures are printed to stderr and otherwise ignored; the success message is only
     * printed once the file has been written completely.
     *
     * @param urlList URL of the picture to download
     * @param path    destination prefix; the file name is appended to it directly, so it
     *                should end with a path separator
     */
    static void downloadPicture(String urlList, String path) {
        try {
            URL url = new URL(urlList);
            String target = path + fileNameFor(urlList) + ".jpg";
            // try-with-resources closes both streams even when the copy fails midway.
            try (DataInputStream dataInputStream = new DataInputStream(url.openStream());
                 FileOutputStream fileOutputStream = new FileOutputStream(new File(target))) {
                // Stream straight to disk instead of buffering the whole picture in memory.
                byte[] buffer = new byte[1024];
                int length;
                while ((length = dataInputStream.read(buffer)) != -1) {
                    fileOutputStream.write(buffer, 0, length);
                }
            }
            System.out.println(target+"---写入成功!");
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Derives the file name (without extension) for a picture URL: characters 30..39 when
     * the URL is long enough, otherwise the hex hash of the URL so that short URLs no longer
     * fail with a StringIndexOutOfBoundsException.
     */
    private static String fileNameFor(String urlList) {
        if (urlList.length() >= 40) {
            return urlList.substring(30, 40);
        }
        return Integer.toHexString(urlList.hashCode());
    }
}

linkTOpic类

import java.util.HashSet;
import java.util.Iterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class linkTOpic {
    //此类是根据网页链接获取图片,并非像Download_pic直接根据图片链接下载图片
    //大致和Download_links差不多,只是这个是下载图片
    public static String linkTOpic(String URL) throws Exception{
        //通过目标链接获取网页源码
        String targetHTML=(getURLMessage.getMessage(URL));
        //开始解析,获取网页内所有的链接。这里的正则表达式属于拿来主义。。
        String patt="(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]";
        Pattern pattern=Pattern.compile(patt);
        Matcher matcher=pattern.matcher(targetHTML);
        HashSet hashSet=new HashSet<>();;
        while (matcher.find()) {
            //结果集筛选,符合长度的字符留下存入hashset确保结果的唯一性
            //知乎网页的标准形式:
            //https://www.zhihu.com/question/58498720/answer/617768326
            //https://pic3.zhimg.com/50/v2-4fd5bfe8b9094f011f7210358449df8a_hd.jpg
            if(matcher.group().length()=="https://pic3.zhimg.com/50/v2-4fd5bfe8b9094f011f7210358449df8a_hd.jpg".length()){
                hashSet.add(matcher.group());
            }
        }
        //图片链接存储获取完毕,开始下载
        //使用迭代器遍历hashset结果集并且下载图片
        Iterator iterator=hashSet.iterator();
        for(int i=0;i

你可能感兴趣的:(Java爬虫)