爬虫优化(连接超时改进,如何防封)

连接超时改进

/**
 * Executes the given request with explicit timeouts and returns the response body as text.
 *
 * <p>A fresh default client is created for the call; both the client and the response are
 * closed before returning so that no connection is leaked.
 *
 * @param request the request to execute; its config is replaced by the timeout config built here
 * @return the response body decoded as UTF-8, or {@code null} if an {@link IOException} occurred
 */
private static String execute(HttpRequestBase request) {
    RequestConfig requestConfig = RequestConfig.custom()
            .setConnectTimeout(1000 * 10)          // max time allowed to establish the connection
            .setConnectionRequestTimeout(600 * 10) // max time allowed to obtain a connection
            .setSocketTimeout(100 * 1000)          // max time allowed for data transfer
            .build();
    request.setConfig(requestConfig);

    // try-with-resources: the response is closed first, then the client, even on failure.
    try (CloseableHttpClient client = HttpClients.createDefault();
         CloseableHttpResponse response = client.execute(request)) {
        // The entity is fully read into a String before the resources are released.
        return EntityUtils.toString(response.getEntity(), Charset.forName("UTF-8"));
    } catch (IOException e) {
        e.printStackTrace();
    }
    return null;
}

如何避免被发现

/**
 * Builds a GET request for {@code url} carrying browser-like headers.
 *
 * <p>The {@code Host} header is derived from the URL and includes the port when the URL names
 * a non-default one. If no host can be resolved from the URL, the header is left unset rather
 * than being sent empty.
 *
 * @param url the target address
 * @return a GET request with Accept, Accept-Encoding, Accept-Language, Cache-Control, Cookie,
 *         Host (when resolvable) and User-Agent headers set
 */
public static HttpGet getHttpGet(String url) {
    HttpGet httpGet = new HttpGet(url);
    // Mimic a real browser so the request looks like ordinary browser traffic.
    httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
    // NOTE(review): this advertises sdch and br; confirm the executing client can decode
    // those encodings, otherwise the body may come back still compressed.
    httpGet.setHeader("Accept-Encoding", "gzip, deflate, sdch, br");
    httpGet.setHeader("Accept-Language", "zh-CN,zh;q=0.8");
    httpGet.setHeader("Cache-Control", "no-cache");
    httpGet.setHeader("Cookie", "screen=%7B%22w%22%3A1366%2C%22h%22%3A768%2C%22d%22%3A1%7D;");
    // Resolve the Host header value from the URL.
    try {
        URL parsed = new URL(url);
        String host = parsed.getHost();
        if (!host.isEmpty()) {
            int port = parsed.getPort();
            // getHost() drops the port; restore it when it is explicit and non-default.
            if (port != -1 && port != parsed.getDefaultPort()) {
                host = host + ":" + port;
            }
            httpGet.setHeader("Host", host);
        }
    } catch (MalformedURLException e) {
        // Host stays unset in this case instead of being sent as an empty value.
        System.err.println(e.getMessage());
    }
    //TODO load the User-Agent dynamically from a configuration file
    httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36");
    return httpGet;
}

你可能感兴趣的:(爬虫)