41. Page Crawling and Processing

1. Crawling the Page
    How do you fetch the content of a web page? It sounds like magic, but a single HttpClient (Apache Commons HttpClient) does the whole job. The example below, which fetches the Juhuasuan (ju.taobao.com) home page, is a concise but complete demonstration of how to use HttpClient.

import java.io.IOException;
import org.apache.commons.httpclient.*;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;

public class HttpClientTest {
	public static void main(String[] args) {
		spiderPage("http://ju.taobao.com/");
	}

	private static void spiderPage(String url) {
		// 1. Construct an HttpClient instance
		HttpClient httpClient = new HttpClient();
		// 2. Create a GET method instance for the target URL
		GetMethod getMethod = new GetMethod(url);
		// 3. Use the library's default retry handler
		getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER, new DefaultHttpMethodRetryHandler());
		// 4. Execute the GET method
		try {
			int statusCode = httpClient.executeMethod(getMethod);
			if (statusCode != HttpStatus.SC_OK) {
				System.err.println("Method failed: " + getMethod.getStatusLine());
				return;
			}
			// 5. Read the response body and decode it (the page is GBK-encoded)
			byte[] responseBody = getMethod.getResponseBody();
			System.out.println(new String(responseBody, "GBK"));
		} catch (HttpException e) {
			// Protocol-level failure, e.g. a malformed response
			e.printStackTrace();
		} catch (IOException e) {
			// Transport-level failure, e.g. connection refused or timeout
			e.printStackTrace();
		} finally {
			// 6. Release the connection
			getMethod.releaseConnection();
		}
	}
}
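
One detail worth flagging: the example hard-codes GBK because Taobao pages at the time were served as GBK. If you would rather not guess, Commons HttpClient 3.x can report the charset the server declares in the Content-Type response header. Below is a minimal sketch of a helper you could drop in after executeMethod() succeeds (ResponseBodyHelper and readBody are names introduced here for illustration, not part of the original code):

import java.io.IOException;
import org.apache.commons.httpclient.methods.GetMethod;

public class ResponseBodyHelper {
	// Decode the response body using the charset declared by the server.
	// getResponseCharSet() parses the Content-Type header and falls back
	// to ISO-8859-1 when no charset is declared.
	public static String readBody(GetMethod getMethod) throws IOException {
		String charset = getMethod.getResponseCharSet();
		return new String(getMethod.getResponseBody(), charset);
	}
}

Calling readBody(getMethod) in place of the hard-coded new String(responseBody, "GBK") makes the code follow whatever encoding the server actually announces.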
The crawled page source is long; the screenshot below captures just the useful portion (leading whitespace stripped for compactness):

[Figure 1: excerpt of the crawled page source]

As the screenshot shows, we can pull out each item's name, price, discount, and so on. Better still, the <a href> of each item link carries the item's item_id, from which all of the item's details can be retrieved.
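
As a quick illustration of that point, here is a minimal sketch for pulling the item_id out of an href. It assumes the id appears as an item_id query parameter, which is a guess at the page's URL format rather than a documented contract:

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class ItemIdExtractor {
	// Hypothetical URL format: ...?item_id=12345 (an assumption, not verified)
	private static final Pattern ITEM_ID = Pattern.compile("item_id=(\\d+)");

	public static String extractItemId(String href) {
		Matcher m = ITEM_ID.matcher(href);
		return m.find() ? m.group(1) : null;
	}

	public static void main(String[] args) {
		// Prints "12345" for the hypothetical href below
		System.out.println(extractItemId("http://detail.ju.taobao.com/home.htm?item_id=12345"));
	}
}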

To keep the demonstration concrete, let's extract the list of item names. The item data lives inside a <ul> tag whose "class" attribute is "clearfix", and that <ul> contains one <li> per item. The tag structure looks like this:

<ul class="clearfix">
  <li>
    <div>
      <a href>
        <h3 title>
    </div>
  </li>
  <li>
    <div>
      <a href>
        <h3 title>
    </div>
  </li>
</ul>


2. Processing the Data

Step one gave us the data: the page source as a String. The next step is to process that data and split out the pieces we need. This too is simple: all it takes is org.htmlparser.Parser.


import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.httpclient.*;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

public class HttpClientTest2 {

	private static String spiderPage(String url) {
		// 1. Construct an HttpClient instance
		HttpClient httpClient = new HttpClient();
		// 2. Create a GET method instance for the target URL
		GetMethod getMethod = new GetMethod(url);
		// 3. Use the library's default retry handler
		getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER, new DefaultHttpMethodRetryHandler());
		// 4. Execute the GET method
		String pageStr = "";
		try {
			int statusCode = httpClient.executeMethod(getMethod);
			if (statusCode != HttpStatus.SC_OK) {
				System.err.println("Method failed: " + getMethod.getStatusLine());
				return pageStr;
			}
			// 5. Read the response body and decode it (the page is GBK-encoded)
			byte[] responseBody = getMethod.getResponseBody();
			pageStr = new String(responseBody, "GBK");
		} catch (HttpException e) {
			// Protocol-level failure, e.g. a malformed response
			e.printStackTrace();
		} catch (IOException e) {
			// Transport-level failure, e.g. connection refused or timeout
			e.printStackTrace();
		} finally {
			// 6. Release the connection
			getMethod.releaseConnection();
		}

		return pageStr;
	}

	private static List<String> processData(String pageStr) {
		List<String> itemTitles = new ArrayList<String>();
		// 1. Create a Parser instance and feed it the page source
		Parser parser = new Parser();
		try {
			parser.setInputHTML(pageStr);
			// 2. Build an AndFilter: tag name is "ul" AND class attribute is "clearfix"
			AndFilter itemFilter = new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "clearfix"));
			// 3. Extract every <ul> node whose "class" attribute is "clearfix"
			NodeList ulList = parser.extractAllNodesThatMatch(itemFilter);
			if (ulList != null && ulList.size() > 0) {
				Tag ulTag = (Tag) ulList.elementAt(0);
				// Collect the <li> tags under the <ul>
				List<Tag> liTags = getChildren(ulTag, "li");
				for (Tag liTag : liTags) {
					// Walk down one level at a time: li -> div -> a -> h3
					List<Tag> divTags = getChildren(liTag, "div");
					List<Tag> ahrefTags = getChildren(divTags.get(0), "a");
					String title = getChildren(ahrefTags.get(0), "h3").get(0).getAttribute("title");
					itemTitles.add(title);
				}
			}
		} catch (ParserException e1) {
			// The page source could not be parsed
			e1.printStackTrace();
		} catch (Exception e) {
			// The expected li/div/a/h3 structure was not found (a get(0) call failed)
			e.printStackTrace();
		}
		return itemTitles;
	}

	public static List<Tag> getChildren(Tag parent, String tagname) {
		List<Tag> list = new ArrayList<Tag>();

		NodeList nList = parent.getChildren();
		if (nList != null) {
			for (int i = 0; i < nList.size(); i++) {
				// Skip text and comment nodes; keep only tags with the wanted name
				if (nList.elementAt(i) instanceof Tag) {
					Tag tag = (Tag) nList.elementAt(i);
					if (tag.getTagName().equalsIgnoreCase(tagname)) {
						list.add(tag);
					}
				}
			}
		}
		return list;
	}

	public static void main(String[] args) {
		String pageStr = spiderPage("http://ju.taobao.com/");
		List<String> titleList = processData(pageStr);
		for (String title : titleList) {
			System.out.println(title);
		}
	}
}

The program prints the crawled item titles to the console, one per line.

[Screenshot of the program output omitted]
