Java Crawler with jsoup (Part 1)

A web crawler is a program or script that automatically crawls information from the World Wide Web according to a set of rules.
Getting started
1. First, create a Maven project
Import the dependencies:

        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.2</version>
        </dependency>

        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>1.7.25</version>
        </dependency>

Create a log4j.properties file under resources:

# Send DEBUG-level log output to the console appender defined below
log4j.rootLogger=DEBUG,console

# Console appender settings
log4j.appender.console = org.apache.log4j.ConsoleAppender
log4j.appender.console.Target = System.out
log4j.appender.console.Threshold=DEBUG
log4j.appender.console.layout = org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern= [%c]-%m%n
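
The config above only defines the console appender. If you also want logs written to a file, a minimal log4j 1.x file appender might look like this (the path log/crawler.log is an assumed example; add file to the rootLogger line to enable it):

# Hypothetical file appender; the file path is an example, adjust as needed
# To enable it, change the first line to: log4j.rootLogger=DEBUG,console,file
log4j.appender.file = org.apache.log4j.FileAppender
log4j.appender.file.File = log/crawler.log
log4j.appender.file.Threshold = DEBUG
log4j.appender.file.layout = org.apache.log4j.PatternLayout
log4j.appender.file.layout.ConversionPattern = [%c]-%m%n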


Create a class named FirstTest and add the following code:

package com.wh;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

public class FirstTest {
    public static void main(String[] args) throws IOException {
        // 1. "Open the browser": create an HttpClient object
        CloseableHttpClient httpClient = HttpClients.createDefault();
        // 2. "Enter the address": create an HttpGet object for the target URL
        HttpGet httpGet = new HttpGet("http://jiaoyu.xiangmu.com/");
        // 3. Send the request with the HttpClient object
        CloseableHttpResponse response = httpClient.execute(httpGet);
        // 4. Parse the response and extract the page content
        if (response.getStatusLine().getStatusCode() == 200) {
            HttpEntity httpEntity = response.getEntity();
            String content = EntityUtils.toString(httpEntity, "utf8");
            System.out.println(content);
        }
    }
}
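
One thing the example above skips is cleanup: neither the response nor the client is ever closed. Here is a minimal sketch of the same request using try-with-resources (the class name FirstTestClosing is mine; CloseableHttpClient and CloseableHttpResponse both implement Closeable in HttpClient 4.5.x, so this should release them automatically):

package com.wh;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

public class FirstTestClosing {
    public static void main(String[] args) throws IOException {
        HttpGet httpGet = new HttpGet("http://jiaoyu.xiangmu.com/");
        // try-with-resources closes the client and the response even if parsing throws
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(content);
            }
        }
    }
}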

A crawler has three parts: collection (fetching pages), processing (parsing and extracting data), and storage (persisting the results).
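
As a rough sketch of how those three stages could be kept separate (the interface and method names below are hypothetical, purely for illustration):

// Hypothetical outline of the three crawler stages; names are illustrative only
public interface CrawlerPipeline {
    String collect(String url);       // collection: fetch the raw page (e.g., with HttpClient)
    Object process(String rawHtml);   // processing: parse and extract the data you need
    void store(Object parsedData);    // storage: persist the result (file, database, ...)
}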

HttpClient - GET

package com.wh;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
import java.net.URISyntaxException;

public class HttpGetTest {
    public static void main(String[] args) throws Exception {

        // Create the HttpClient object
        CloseableHttpClient httpClient = HttpClients.createDefault();
        // Create a URIBuilder for the target site
        URIBuilder uriBuilder = new URIBuilder("http://www.vixue.com/");
//        uriBuilder.setParameter("keys", "news");
        uriBuilder.setPath("/news");
        // Create the HttpGet object with the built URI as the request address
        HttpGet httpGet = new HttpGet(uriBuilder.build());

        System.out.println("*****************************");
        System.out.println("Request: " + httpGet);
        System.out.println("*****************************");

        CloseableHttpResponse response = null;
        try {
            // Send the request with the HttpClient and get the response
            response = httpClient.execute(httpGet);
            // Parse the response
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println("*****************************");
                System.out.println(content.length());
                System.out.println("*****************************");
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close the response, then the client; response may be null if execute() failed
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            try {
                httpClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
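
The commented-out setParameter line above is how URIBuilder attaches a query string. A small standalone sketch (the class name UriBuilderDemo is mine; the host, path, and parameter come from the example above):

package com.wh;

import org.apache.http.client.utils.URIBuilder;

public class UriBuilderDemo {
    public static void main(String[] args) throws Exception {
        // setPath and setParameter compose the final URL:
        // http://www.vixue.com/news?keys=news
        URIBuilder uriBuilder = new URIBuilder("http://www.vixue.com/");
        uriBuilder.setPath("/news");
        uriBuilder.setParameter("keys", "news");
        System.out.println(uriBuilder.build());
    }
}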

