1. Crawling the Page
How do you fetch a web page's content? It sounds impressively mysterious, but it turns out a single HttpClient does the job. The example below, which fetches the Juhuasuan home page (ju.taobao.com), walks through the complete HttpClient workflow in a compact form.
import java.io.IOException;

import org.apache.commons.httpclient.*;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;

public class HttpClientTest {

    public static void main(String[] args) {
        spiderPage("http://ju.taobao.com/");
    }

    private static void spiderPage(String url) {
        // 1. Construct an HttpClient instance
        HttpClient httpClient = new HttpClient();
        // 2. Create a GET method instance for the target URL
        GetMethod getMethod = new GetMethod(url);
        // 3. Use the default retry handler
        getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
                new DefaultHttpMethodRetryHandler());
        try {
            // 4. Execute the GET method
            int statusCode = httpClient.executeMethod(getMethod);
            if (statusCode != HttpStatus.SC_OK) {
                System.err.println("Method failed: " + getMethod.getStatusLine());
            }
            // 5. Read and process the response body (the page is GBK-encoded)
            byte[] responseBody = getMethod.getResponseBody();
            System.out.println(new String(responseBody, "GBK"));
        } catch (HttpException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // 6. Release the connection
            getMethod.releaseConnection();
        }
    }
}
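One caveat about the example: it decodes the response with a hard-coded GBK charset. Commons HttpClient 3.x can report the charset declared in the response's Content-Type header, so a slightly more robust read looks like the sketch below. The GBK fallback is my assumption about this particular site, not part of the HttpClient API.

        // Drop-in replacement for step 5 in spiderPage above.
        // getResponseCharSet() returns the charset from the Content-Type
        // header and defaults to ISO-8859-1 when none is declared, so the
        // GBK fallback below is an assumption about this site.
        String charset = getMethod.getResponseCharSet();
        if ("ISO-8859-1".equalsIgnoreCase(charset)) {
            charset = "GBK"; // assumed site default
        }
        byte[] responseBody = getMethod.getResponseBody();
        System.out.println(new String(responseBody, charset));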
The fetched page is quite long, so the screenshot below captures only the useful portion (leading whitespace stripped for compactness). From it you can see the item name, price, discount, and similar fields. Note also that the <a href> tag carries the item's item_id, from which all of the item's details can be looked up.
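As a quick illustration, a regular expression is enough to pull the item_id out of an href. This is only a sketch: the ItemIdExtractor class is mine, and the assumption that the link carries an item_id query parameter may not match the real URL format.

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class ItemIdExtractor {

    // Assumed link shape: the href carries an item_id query parameter,
    // e.g. "...?item_id=12345". The real URL format may differ.
    private static final Pattern ITEM_ID = Pattern.compile("item_id=(\\d+)");

    /** Returns the item_id embedded in the href, or null if none is found. */
    public static String extractItemId(String href) {
        Matcher m = ITEM_ID.matcher(href);
        return m.find() ? m.group(1) : null;
    }
}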
To keep the demonstration simple, we extract just the list of item names. The item data sits inside a <ul> tag whose class attribute is "clearfix", and that <ul> contains multiple <li> tags. The tag structure looks like this:
<ul class="clearfix">
<li>
<div>
<a href>
<h3 title>
</div>
</li>
<li>
<div>
<a href>
<h3 title>
</div>
</li>
</ul>
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.httpclient.*;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

public class HttpClientTest2 {

    private static String spiderPage(String url) {
        // 1. Construct an HttpClient instance
        HttpClient httpClient = new HttpClient();
        // 2. Create a GET method instance for the target URL
        GetMethod getMethod = new GetMethod(url);
        // 3. Use the default retry handler
        getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
                new DefaultHttpMethodRetryHandler());
        String pageStr = "";
        try {
            // 4. Execute the GET method
            int statusCode = httpClient.executeMethod(getMethod);
            if (statusCode != HttpStatus.SC_OK) {
                System.err.println("Method failed: " + getMethod.getStatusLine());
            }
            // 5. Read the response body (the page is GBK-encoded)
            byte[] responseBody = getMethod.getResponseBody();
            pageStr = new String(responseBody, "GBK");
        } catch (HttpException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // 6. Release the connection
            getMethod.releaseConnection();
        }
        return pageStr;
    }

    private static List<String> processData(String pageStr) {
        List<String> itemTitles = new ArrayList<String>();
        // 1. Create a Parser instance over the fetched HTML
        Parser parser = new Parser();
        try {
            parser.setInputHTML(pageStr);
            // 2. Build an AndFilter: tag name is "ul" AND class is "clearfix"
            AndFilter itemFilter = new AndFilter(
                    new TagNameFilter("ul"),
                    new HasAttributeFilter("class", "clearfix"));
            // 3. Extract every <ul class="clearfix"> node
            NodeList ulList = parser.extractAllNodesThatMatch(itemFilter);
            if (ulList != null && ulList.size() > 0) {
                Tag ulTag = (Tag) ulList.elementAt(0);
                // Collect the <li> tags under the <ul>
                List<Tag> liTags = getChildren(ulTag, "li");
                for (Tag liTag : liTags) {
                    // Walk down level by level: li -> div -> a -> h3
                    List<Tag> divTags = getChildren(liTag, "div");
                    List<Tag> aTags = getChildren(divTags.get(0), "a");
                    String title = getChildren(aTags.get(0), "h3").get(0).getAttribute("title");
                    itemTitles.add(title);
                }
            }
        } catch (ParserException e) {
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return itemTitles;
    }

    // Return the direct child tags of parent whose tag name matches tagname;
    // the cast fails (and is silently skipped) for text and comment nodes.
    public static List<Tag> getChildren(Tag parent, String tagname) {
        List<Tag> list = new ArrayList<Tag>();
        NodeList nList = parent.getChildren();
        if (nList != null) {
            for (int i = 0; i < nList.size(); i++) {
                try {
                    Tag tag = (Tag) nList.elementAt(i);
                    if (tag.getTagName().equalsIgnoreCase(tagname)) {
                        list.add(tag);
                    }
                } catch (Exception e) {
                    // skip non-tag nodes
                }
            }
        }
        return list;
    }

    public static void main(String[] args) {
        String pageStr = spiderPage("http://ju.taobao.com/");
        List<String> titleList = processData(pageStr);
        for (String title : titleList) {
            System.out.println(title);
        }
    }
}
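A design note on processData: the getChildren walk mirrors the tag structure step by step, which is explicit but verbose. HtmlParser's filter combinators can express the same selection declaratively. The following is a sketch, assuming HtmlParser 1.6's HasParentFilter (whose boolean flag makes the match recursive, i.e. against any ancestor rather than only the direct parent); it would replace the extraction logic inside processData:

import org.htmlparser.NodeFilter;
import org.htmlparser.filters.HasParentFilter;

// Select every <h3> that sits anywhere under a <ul class="clearfix">,
// then read its title attribute directly.
NodeFilter ulFilter = new AndFilter(
        new TagNameFilter("ul"),
        new HasAttributeFilter("class", "clearfix"));
NodeFilter h3Filter = new AndFilter(
        new TagNameFilter("h3"),
        new HasParentFilter(ulFilter, true)); // true: match any ancestor
NodeList h3List = parser.extractAllNodesThatMatch(h3Filter);
for (int i = 0; i < h3List.size(); i++) {
    itemTitles.add(((Tag) h3List.elementAt(i)).getAttribute("title"));
}

The trade-off: the filter version tolerates extra wrapper tags appearing between <li> and <h3>, while the explicit walk fails fast whenever the page structure changes.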