Implementing a Java crawler with Spring Boot to fetch static pages, images, CSS, and JS

Implementation steps

1.1 Project setup

First, create a Spring Boot project. If you are not sure how to do that, refer to a separate Spring Boot setup tutorial.

1.2 Add dependencies



<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.2.1.RELEASE</version>
        <relativePath/>
    </parent>
    <groupId>com.reptile</groupId>
    <artifactId>demo</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>demo</name>
    <description>Demo project for Spring Boot</description>

    <properties>
        <java.version>1.8</java.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter</artifactId>
        </dependency>
        <!-- jsoup: HTML parsing -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.3</version>
        </dependency>
        <!-- commons-io: file download helpers -->
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.5</version>
        </dependency>
        <!-- httpclient: sending HTTP requests -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.5</version>
        </dependency>
        <!-- junit -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.13-beta-2</version>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
            <exclusions>
                <exclusion>
                    <groupId>org.junit.vintage</groupId>
                    <artifactId>junit-vintage-engine</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>
        </plugins>
    </build>
</project>



1.3 Test code: create a test class of your own and run it

package com.reptile.demo.test;
/**
 * Read a web page's data and analyze it
 * xutao   2018-11-22  09:09
 */
import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;
import java.io.*;
import java.net.*;

public class test {
    // The page to crawl
    static String url1 = "http://www.xuantech.cn/";

    // Print the raw page data to the console, mainly to inspect what comes back
    @Test
    public void findStatic() {
        try {
            // Build the URL object for the target page
            URL url = new URL(url1);
            // Open a connection to the page
            URLConnection conn = url.openConnection();
            // Read the response body as a stream
            InputStream is = conn.getInputStream();
            System.out.println(conn.getContentEncoding());
            // Pages are usually read line by line for analysis, so wrap the byte stream
            // in a buffered character stream (BufferedReader over InputStreamReader).
            // Mind the charset: it is usually GBK or UTF-8 (if the output is garbled, switch to the other).
            BufferedReader br = new BufferedReader(new InputStreamReader(is, "GBK"));
            // Read and print line by line
            String line = null;
            while ((line = br.readLine()) != null) {
                System.out.println(line);
            }
            br.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
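
If you do not want to guess between GBK and UTF-8, the charset can usually be read from the response's Content-Type header (for example "text/html; charset=UTF-8"). A minimal sketch that could be added to the same test class; the helper name detectCharset is my own, not from the original post:

    // Best-effort charset detection from the Content-Type header; falls back to UTF-8
    private static String detectCharset(URLConnection conn) {
        String contentType = conn.getContentType();
        if (contentType != null) {
            for (String part : contentType.split(";")) {
                part = part.trim();
                if (part.toLowerCase().startsWith("charset=")) {
                    return part.substring("charset=".length());
                }
            }
        }
        return "UTF-8";
    }

With this helper, the reader above can be built as new InputStreamReader(is, detectCharset(conn)) instead of hard-coding "GBK".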

 

1.4 Download the full page source


    // Download the static page to a local file
    @Test
    public void downloadStaticInfo(){
        URL url = null;
        try {
            // The url you want to read
            url = new URL("http://www.xuantech.cn/");
            // Where to save the file: pick your own path
            File fp = new File("D:/爬虫demo/staticInfo.html");
            // Output stream for the target file
            OutputStream os = new FileOutputStream(fp);
            // Open the url connection
            URLConnection conn = url.openConnection();
            BufferedReader in = new BufferedReader(new InputStreamReader(conn.getInputStream()));
            StringBuilder page = new StringBuilder();
            String current;
            while ((current = in.readLine()) != null) {
                // Keep the line breaks, otherwise the saved HTML ends up on a single line
                page.append(current).append(System.lineSeparator());
            }
            os.write(page.toString().getBytes());
            os.close();
            in.close();
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
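
Since commons-io is already on the classpath, the whole method above can also be replaced by a single call that copies the URL's bytes straight to disk, which avoids the line-by-line read and any charset handling (and also works for binary resources such as images). A minimal sketch with the same target path; the method name downloadStaticInfoWithCommonsIo is my own:

    // Alternative: let commons-io stream the URL directly into a file
    @Test
    public void downloadStaticInfoWithCommonsIo() throws IOException {
        FileUtils.copyURLToFile(new URL("http://www.xuantech.cn/"),
                new File("D:/爬虫demo/staticInfo.html"));
    }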

1.5 Fetching the page's CSS and JS

  1. Open the page you want to crawl and inspect its source code.
  2. In Chrome, for example, the shortcut is F12 (other browsers have their own shortcuts), and you will see the developer tools panel.
    (Screenshot: browser developer tools)
    Now the key point: whatever you want to fetch (CSS, JS, or an image), you need to find its tag, for example:
    // Download the page's CSS
    @Test
    public void findCss() throws ClientProtocolException, IOException {
        // Create an httpclient instance
        CloseableHttpClient httpclient = HttpClients.createDefault();
        // Create an httpget instance for the page address
        HttpGet httpget = new HttpGet("http://www.xuantech.cn/");
        // Execute the GET request
        CloseableHttpResponse response = httpclient.execute(httpget);
        HttpEntity entity = response.getEntity();
        // Read the response body
        String content = EntityUtils.toString(entity, "UTF-8");
        // Parse the page into a document object
        Document doc = Jsoup.parse(content);
        // Select the link tags inside head
        Elements elements = doc.select("head link");
        for (int i = 0; i < elements.size(); i++) {
            Element element = elements.get(i);
            // Get the href of the link tag
            String url = element.attr("href");
            System.out.println(url);
            // The href is a relative path without the http:// prefix, so prepend the site root manually
            String url2 = "http://www.xuantech.cn/" + url;
            System.out.println(url2);
            HttpGet cssHttpGet = new HttpGet(url2);
            CloseableHttpResponse cssResponse = httpclient.execute(cssHttpGet);
            HttpEntity cssEntity = cssResponse.getEntity();
            InputStream inputStream = cssEntity.getContent();
            // Use commons-io to save the file locally; note that file names must not repeat
            FileUtils.copyToFile(inputStream, new File("D://爬虫demo//Css//" + i + ".css"));
            cssResponse.close(); // close the per-resource response
        }
        response.close(); // close the response
        httpclient.close(); // close the httpclient
    }


  1. Elements elements = doc.select("head link"); finds the link tags inside head.
  2. String url = element.attr("href"); extracts the concrete URL from each of those tags; print it out to check what you got.
  3. FileUtils.copyToFile(inputStream, new File("D://爬虫demo//Css//" + i + ".css")); the folder and file extension here should follow the type of resource you are downloading.

1.6 Fetching images and JS uses the same code as the CSS example above; just change the three points noted above and the save path.
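
For example, images can be fetched by selecting the img tags and reading their src attribute instead of href. A minimal sketch under the same assumptions as the CSS example, added to the same test class; the method name findImages and the target folder are my own choices:

    // Download the page's images: same flow as findCss, different selector, attribute and extension
    @Test
    public void findImages() throws IOException {
        CloseableHttpClient httpclient = HttpClients.createDefault();
        CloseableHttpResponse response = httpclient.execute(new HttpGet("http://www.xuantech.cn/"));
        String content = EntityUtils.toString(response.getEntity(), "UTF-8");
        Document doc = Jsoup.parse(content);
        // Images live in img tags and their address is the src attribute
        Elements elements = doc.select("img");
        for (int i = 0; i < elements.size(); i++) {
            String src = elements.get(i).attr("src");
            // Relative paths need the site root prepended, just like the CSS hrefs
            String imageUrl = src.startsWith("http") ? src : "http://www.xuantech.cn/" + src;
            CloseableHttpResponse imageResponse = httpclient.execute(new HttpGet(imageUrl));
            // Keep the original extension (e.g. .png or .jpg) so the saved file stays viewable
            String extension = src.contains(".") ? src.substring(src.lastIndexOf('.')) : ".png";
            FileUtils.copyToFile(imageResponse.getEntity().getContent(),
                    new File("D://爬虫demo//Images//" + i + extension));
            imageResponse.close();
        }
        response.close();
        httpclient.close();
    }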

Recording my learning journey.
