使用这个之前需要了解几个知识点:
HTTP报文
http://www.oschina.net/question/565065_81309
HttpWatch工具
httpWatch是强大的网页数据分析工具,安装后将集成到Internet Explorer工具栏中。它不用代理服务器或一些复杂的网络监控工具,就能抓取请求及响应的完整信息,包括Cookies、消息头、查询参数、响应报文等,是Web应用开发人员的必备工具。
MIME类型
http://www.cnblogs.com/zhongcj/archive/2008/11/03/1325293.html
有意思的网站:http://top.51.la/os.htm
详解HttpURLConnection
http://blog.csdn.net/woxueliuyun/article/details/43267365
重要:再在好多网站为了保护自己的数据,查询的结果是异步加载。如果还是像下面这样直接去获取一个网页,往往是得不到真的结果。
这个时候就要 打开浏览器的开发模式,再刷新页面查看“NetWork”里面加载的JS。或者用工具分析: Java抓取网页数据(原网页+Javascript返回数据)
还有一种 好像有一个叫 js渲染的东西。
上代码:
package org.express.util; import java.io.BufferedReader; import java.io.DataOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.UnsupportedEncodingException; import java.net.HttpURLConnection; import java.net.URL; import java.net.URLEncoder; import java.util.HashMap; import java.util.LinkedHashMap; import java.util.Locale; import java.util.Map; import org.apache.log4j.Logger; /** * java程序连接互联网工具类 <p> * 使用HttpURLConnection * * @author wangxinyu * * @see [相关类/方法] * @since [产品/模块版本] */ public class HttpUrlConnUtil { private static final Logger logger = Logger.getLogger(HttpUrlConnUtil.class); public static final String DEF_CHATSET = "UTF-8"; public static final int DEF_CONN_TIMEOUT = 30000; public static final int DEF_READ_TIMEOUT = 30000; public static String userAgent = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36"; /** * * @param strUrl 请求地址 * @param params 请求参数 * @param method 请求方法 GET POST * @return 网络请求字符串 * @throws Exception * @description http://www.oschina.net/question/565065_81309?fromerr=7bNcYdxi */ public static String send(String strUrl, Map<String, Object> params, String method) throws Exception { HttpURLConnection conn = null; BufferedReader reader = null; String rs = null; try { // GET请求,有参数就放在url中 (默认为GET方式) if (method == null & "GET".equals(method) & !params.isEmpty()) { strUrl = strUrl + "?" + urlencode(params); } logger.info("strUrl:"+strUrl); // 设置连接 URL url = new URL(strUrl); conn = (HttpURLConnection) url.openConnection(); /* 请求行 */ if (method == null || method.equals("GET")) { } else { conn.setRequestMethod("POST"); conn.setDoOutput(true); // 将参数要放在http正文内(post必需) } //置是否从httpUrlConnection读入,默认情况下是true; conn.setDoInput(true); /* 请求头 */ //设置 HttpURLConnection的接收的文件类型 conn.setRequestProperty("Accept","image/gif, image/jpeg, image/pjpeg, image/pjpeg, " + "application/x-shockwave-flash, application/xaml+xml, " + "application/vnd.ms-xpsdocument, application/x-ms-xbap, application/x-ms-application, " + "application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, */*"); // 设置 HttpURLConnection的接收语音 conn.setRequestProperty("Accept-Language", Locale.getDefault().toString()); // 指定请求uri的源资源地址(告诉服务器你是从哪儿来的) conn.setRequestProperty("Referer", "你大爷"); // 设置 HttpURLConnection的字符编码 conn.setRequestProperty("Accept-Charset", "UTF-8"); //持续连接 conn.setRequestProperty("Connection", "Keep-Alive"); // 设定传送的内容类型 MIME conn.setRequestProperty("Content-type", "text/html"); //浏览器类型 conn.setRequestProperty("User-agent", userAgent); conn.setUseCaches(false); // 不使用缓存 //超时设置,防止网络异常的情况下,可能会导致程序僵死而不继续往下执行 conn.setConnectTimeout(DEF_CONN_TIMEOUT); // 连接被阻塞时长(连接主机超时),防卡死 conn.setReadTimeout(DEF_READ_TIMEOUT); // 从主机读取数据超时 conn.setInstanceFollowRedirects(false); // 系统不自动处理重定向 //请求头参数配置必须要在connect之前完成, conn.connect(); // 建立连接 /* 请求正文 */ // post方式 发送请求参数 if (params != null && method.equals("POST")) { try (DataOutputStream out = new DataOutputStream(conn.getOutputStream())) { String pendParamsString = urlencode(params); logger.info("POST请求的参数:"+pendParamsString); out.writeBytes(pendParamsString); } } // 获取响应 InputStream is = conn.getInputStream(); // int state = conn.getResponseCode(); // 获取响应状态码 reader = new BufferedReader(new InputStreamReader(is, DEF_CHATSET)); // 获取响应体 StringBuffer sb = new StringBuffer(); String strRead = null; while ((strRead = reader.readLine()) != null) { sb.append(strRead); } rs = sb.toString(); printResponseHeader(conn); } catch (IOException e) { e.printStackTrace(); } finally { if (reader != null) { reader.close(); } if (conn != null) { conn.disconnect(); } } return rs; } /** 打印出响应头 */ private static void printResponseHeader(HttpURLConnection http) throws UnsupportedEncodingException { Map<String, String> header = getHttpResponseHeader(http); for (Map.Entry<String, String> entry : header.entrySet()) { String key = entry.getKey() != null ? entry.getKey() + ":" : ""; logger.info(key + entry.getValue()); } } private static Map<String, String> getHttpResponseHeader(HttpURLConnection http) throws UnsupportedEncodingException { Map<String, String> header = new LinkedHashMap<String, String>(); for (int i = 0;; i++) { String mine = http.getHeaderField(i); if (mine == null) break; header.put(http.getHeaderFieldKey(i), mine); } return header; } /** 将map型转为请求参数型 */ private static String urlencode(Map<String, ?> data) { StringBuilder sb = new StringBuilder(); // map.entrySet() 可遍历K和V // 见:http://blog.csdn.net/wangxy799/article/details/49991023 for (Map.Entry<String, ?> i : data.entrySet()) { try { sb.append(i.getKey()).append("=").append(URLEncoder.encode(i.getValue() + "", "UTF-8")).append("&"); } catch (UnsupportedEncodingException e) { e.printStackTrace(); } } sb.deleteCharAt(sb.length()-1); //删除最后多余的一个& return sb.toString(); } public static void main(String[] args) throws Exception { Map<String,Object> params = new HashMap<String, Object>(); // params.put("wd", "爱奇艺"); // String string = HttpUrlConnUtil.send("https://www.baidu.com/s",params,"GET"); System.out.println("params.size:"+params.size()+"; params.isEmpty:"+params.isEmpty()); String string = HttpUrlConnUtil.send("https://www.hao123.com/",params,"GET"); System.out.println(string); } }