请尊重原创,转载请注明出处:http://my.oschina.net/u/1789904/blog/386576
核心:htmlparser框架
HtmlParser爬取搜狗百科名人数据:
/** * 从百科搜索中获取百科地址 * @param url * @param charset * @param timeOut * @return * @throws IOException */ private Map<String, String> parserBaike(String url, String charset, int timeOut) throws IOException { WebHttpClient util=new WebHttpClient(); String content=util.getWebContentByGet(url,charset,timeOut); if(content == null){ return null; } Map<String, String> map = new HashMap<>(); Map<String, String> subMap = new HashMap<>(); try { //开始解析 Node node = null; /********************* 解析名字 **********************/ // 过滤出class为term的<span>元素 Parser parser = Parser.createParser(content, charset); AndFilter filter = new AndFilter(new TagNameFilter("h1"), new HasAttributeFilter("id","title")); NodeList nodeList = parser.parse(filter); for (int i = 0; i < nodeList.size(); i++) { node = nodeList.elementAt(i); map.put("name", node.toPlainTextString().trim()); } /********************* 解析简介 **********************/ // 过滤出class为start-time的<span>元素 Parser parser2 = Parser.createParser(content, charset); AndFilter filter2 = new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class","abstract")); NodeList nodeList2 = parser2.parse(filter2); for (int i = 0; i < nodeList2.size(); i++) { node = nodeList2.elementAt(i); String name = node.toPlainTextString().trim(); System.out.println("name:" + name); map.put("intro", name); } // 过滤出id为J_SingleEndTimeLabel的<span>元素 Parser parser3 = Parser.createParser(content, charset); AndFilter filter3 = new AndFilter(new TagNameFilter("img"),new HasAttributeFilter("class","")); NodeList nodeList3 = parser3.parse(filter3); for (int i = 0; i < nodeList3.size(); i++) { node = nodeList3.elementAt(i); String imgUrl = findHttp(node.toHtml()); System.out.println("imgUrl:" + imgUrl); map.put("logo", imgUrl); } /********************* 解析表格数据 **********************/ // 过滤出class为box post的<div>元素 Parser parser4 = Parser.createParser(content, charset); //AndFilter andFilter = new AndFilter(new TagNameFilter("table"),new 
HasAttributeFilter("class","abstract_tbl")); AndFilter andFilter = new AndFilter(new TagNameFilter("table"),new HasAttributeFilter("class","abstract_list")); NodeList tableList = parser4.extractAllNodesThatMatch(andFilter); System.out.println("tableList.size:" + tableList.size()); //tableList.size() 有两个tableList for (int i=0; i<tableList.size(); i++) { TableTag table = (TableTag) tableList.elementAt(i); //取得表中的行集 TableRow[] rows = table.getRows(); //遍历每行 for (int r=0; r<rows.length; r++) { TableRow tr = rows[r]; //行中的列和标题 TableColumn[] td = tr.getColumns(); TableHeader[] header =tr.getHeaders(); System.out.println("td.length:" + td.length); for (int c=0; c<td.length; c++) { String head = header[c].toPlainTextString(); String col = td[c].toPlainTextString().trim(); if (head.equals("出生地")) { System.out.println("======出生地:" + col); map.put("home", col); } subMap.put(head, col); System.out.println(head + ":" + col); } } } } catch (ParserException e) { e.printStackTrace(); } map.put("list",subMap.toString()); return map; }
// WebHttpClient.java
package org.jun.utils;

import java.io.BufferedReader;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;

/**
 * Minimal HTTP helper built on {@link HttpURLConnection} that downloads the body
 * of a page as text using GET or a form-encoded POST.
 *
 * <p>All request methods return {@code null} when the URL is null/empty, when the
 * server answers with a status other than 200, or when reading the status line
 * fails. Lines of the response body are joined with {@code "\r\n"}.
 *
 * @author xiejunbo
 */
public class WebHttpClient {

    /** Charset used by the convenience overloads. */
    private static final String DEFAULT_CHARSET = "iso-8859-1";

    /** Timeout in milliseconds used by the convenience overloads. */
    private static final int DEFAULT_TIMEOUT_MILLIS = 5000;

    /** Separator appended after every line of the response body. */
    private static final String LINE_SEPARATOR = "\r\n";

    public WebHttpClient() {
    }

    /**
     * Downloads a page with an HTTP GET request.
     *
     * @param urlString target address; {@code http://} is prepended when no scheme is given
     * @param charset   charset used to decode the response body
     * @param timeout   connect and read timeout in milliseconds
     * @return the response body, or {@code null} for an empty URL, a non-200 status
     *         or a failure while obtaining the status
     * @throws IOException if the URL is malformed or the body cannot be read
     */
    public String getWebContentByGet(String urlString, final String charset, int timeout) throws IOException {
        if (urlString == null || urlString.length() == 0) {
            return null;
        }
        HttpURLConnection conn = (HttpURLConnection) new URL(withScheme(urlString)).openConnection();
        try {
            conn.setRequestMethod("GET");
            // Pretend to be a browser so that the request is not blocked.
            conn.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727)");
            // Only ask for HTML; other media types (images, pdf, */*) could be requested too.
            conn.setRequestProperty("Accept", "text/html");
            applyTimeouts(conn, timeout);
            return readBodyIfOk(conn, charset);
        } finally {
            // Release the connection on every path, including early returns and exceptions.
            conn.disconnect();
        }
    }

    /**
     * Downloads a page with GET using the iso-8859-1 charset and a 5000 ms timeout.
     *
     * @param urlString target address
     * @return the response body, or {@code null} as described in
     *         {@link #getWebContentByGet(String, String, int)}
     * @throws IOException if the URL is malformed or the body cannot be read
     */
    public String getWebContentByGet(String urlString) throws IOException {
        return getWebContentByGet(urlString, DEFAULT_CHARSET, DEFAULT_TIMEOUT_MILLIS);
    }

    /**
     * Sends form data with an HTTP POST request and returns the response body.
     *
     * @param urlString target address; {@code http://} is prepended when no scheme is given
     * @param data      unencoded form data in {@code name=value&name=value} form; names
     *                  and values are URL-encoded (UTF-8) individually before sending
     * @param charset   charset used to decode the response body
     * @param timeout   connect and read timeout in milliseconds
     * @return the response body, or {@code null} for an empty URL, a non-200 status
     *         or a failure while obtaining the status
     * @throws IOException if the URL is malformed, the request body cannot be written
     *                     or the response body cannot be read
     */
    public String getWebContentByPost(String urlString, String data, final String charset, int timeout) throws IOException {
        if (urlString == null || urlString.length() == 0) {
            return null;
        }
        HttpURLConnection connection = (HttpURLConnection) new URL(withScheme(urlString)).openConnection();
        try {
            // The parameters travel in the request body, so output must be enabled.
            connection.setDoOutput(true);
            connection.setDoInput(true);
            connection.setRequestMethod("POST");
            // POST requests must not be served from a cache.
            connection.setUseCaches(false);
            connection.setInstanceFollowRedirects(true);
            connection.setRequestProperty("Content-Type", "application/x-www-form-urlencoded");
            // Pretend to be a browser so that the request is not blocked.
            connection.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows vista)");
            // Ask for XML responses.
            connection.setRequestProperty("Accept", "text/xml");
            applyTimeouts(connection, timeout);
            connection.connect();

            // The encoded form body only contains ASCII characters.
            byte[] payload = encodeFormData(data).getBytes(StandardCharsets.US_ASCII);
            try (DataOutputStream out = new DataOutputStream(connection.getOutputStream())) {
                out.write(payload);
                out.flush();
            }
            // The status may only be queried after the request body has been sent.
            return readBodyIfOk(connection, charset);
        } finally {
            connection.disconnect();
        }
    }

    /**
     * Sends form data with POST using the iso-8859-1 charset and a 5000 ms timeout.
     *
     * @param urlString target address
     * @param data      unencoded form data in {@code name=value&name=value} form
     * @return the response body, or {@code null} as described in
     *         {@link #getWebContentByPost(String, String, String, int)}
     * @throws IOException if the request fails
     */
    public String getWebContentByPost(String urlString, String data) throws IOException {
        return getWebContentByPost(urlString, data, DEFAULT_CHARSET, DEFAULT_TIMEOUT_MILLIS);
    }

    /** Prepends {@code http://} when the address does not start with an http(s) scheme. */
    private static String withScheme(String urlString) {
        if (urlString.startsWith("http://") || urlString.startsWith("https://")) {
            return urlString;
        }
        return "http://" + urlString;
    }

    /**
     * Applies the timeout to both connecting and reading; without a read timeout a
     * server that accepts the connection but never answers would block forever.
     */
    private static void applyTimeouts(HttpURLConnection conn, int timeout) {
        conn.setConnectTimeout(timeout);
        conn.setReadTimeout(timeout);
    }

    /**
     * URL-encodes every name and value of the form data separately, so that the
     * {@code =} and {@code &} separators reach the server unescaped.
     */
    private static String encodeFormData(String data) throws IOException {
        if (data == null || data.isEmpty()) {
            return "";
        }
        StringBuilder encoded = new StringBuilder();
        for (String pair : data.split("&")) {
            if (pair.isEmpty()) {
                continue;
            }
            if (encoded.length() > 0) {
                encoded.append('&');
            }
            int eq = pair.indexOf('=');
            if (eq < 0) {
                encoded.append(URLEncoder.encode(pair, "UTF-8"));
            } else {
                encoded.append(URLEncoder.encode(pair.substring(0, eq), "UTF-8"))
                        .append('=')
                        .append(URLEncoder.encode(pair.substring(eq + 1), "UTF-8"));
            }
        }
        return encoded.toString();
    }

    /**
     * Returns the response body when the status is 200, otherwise {@code null}.
     * A failure while obtaining the status is printed and also reported as {@code null}.
     */
    private static String readBodyIfOk(HttpURLConnection conn, String charset) throws IOException {
        try {
            if (conn.getResponseCode() != HttpURLConnection.HTTP_OK) {
                return null;
            }
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
        StringBuilder body = new StringBuilder();
        // try-with-resources closes the stream even when decoding or reading fails.
        try (InputStream input = conn.getInputStream();
             BufferedReader reader = new BufferedReader(new InputStreamReader(input, charset))) {
            String line;
            while ((line = reader.readLine()) != null) {
                body.append(line).append(LINE_SEPARATOR);
            }
        }
        return body.toString();
    }

    /**
     * Manual smoke test: posts login form data to a local server and prints the answer.
     * A GET variant would be {@code client.getWebContentByGet("http://www.baidu.com")}
     * followed by re-decoding the result as gb2312.
     */
    public static void main(String[] args) throws IOException {
        WebHttpClient client = new WebHttpClient();
        String s = client.getWebContentByPost("http://localhost:8080/Lottery/login.portal", "action=login&loginname=13761083826&password=111111");
        if (s == null) {
            // Non-200 status or connection failure: nothing to decode.
            System.out.println("Request failed or returned a non-200 status.");
            return;
        }
        // The body was decoded as iso-8859-1 (lossless for bytes); re-decode it as UTF-8.
        s = new String(s.getBytes(StandardCharsets.ISO_8859_1), StandardCharsets.UTF_8);
        System.out.println(s);
    }
}