Java爬虫入门(一)

package 爬虫;

import java.io.*;
import java.net.*;
import java.nio.charset.StandardCharsets;

/**
 * Minimal web-crawler example: downloads a single page and echoes it to standard output.
 *
 * <p>The page is fetched through a {@link URLConnection}, its body is decoded as UTF-8, and
 * every line (blank lines included) is printed unchanged. Before the body, the value of the
 * response's {@code Content-Encoding} header is printed; this is the transfer compression
 * (for example {@code gzip}), not the character set, and is {@code null} when the header is
 * absent.
 */
public class pachong1 {
    /** Page fetched when no URL is supplied on the command line. */
    private static final String DEFAULT_URL = "https://www.w3cschool.cn/java/java-tutorial.html";

    /**
     * Fetches a page and prints its {@code Content-Encoding} header followed by its body.
     *
     * <p>I/O failures (malformed URL, connection or read errors) are reported on standard
     * error and do not propagate to the caller.
     *
     * @param args optional; when {@code args[0]} is present and not blank it is used as the
     *     URL to fetch, otherwise {@link #DEFAULT_URL} is used
     */
    public static void main(String[] args) {
        // Fall back to the built-in page so running with no arguments behaves as before.
        String strurl = DEFAULT_URL;
        if (args != null && args.length > 0 && args[0] != null && !args[0].trim().isEmpty()) {
            strurl = args[0].trim();
        }

        try {
            URL url = new URL(strurl);
            // Open a connection to the page; the request is actually sent when the
            // response stream is first requested below.
            URLConnection connection = url.openConnection();

            // try-with-resources guarantees the stream and reader are closed even when
            // reading or printing fails part-way through.
            try (InputStream body = connection.getInputStream();
                 BufferedReader reader =
                         new BufferedReader(new InputStreamReader(body, StandardCharsets.UTF_8))) {
                System.out.println(connection.getContentEncoding());

                // Echo the page line by line; nothing is filtered out.
                String line;
                while ((line = reader.readLine()) != null) {
                    System.out.println(line);
                }
            }
        } catch (IOException e) {
            // No logging framework is available in this example, so report on stderr.
            System.err.println("Failed to fetch " + strurl);
            e.printStackTrace();
        }
    }

}

你可能感兴趣的:(Java爬虫入门到实战)