This blog has moved to:
http://www.micmiu.com
I had a small program that did some simple parsing of Google and Baidu search result pages. A few days ago it suddenly stopped working for Google results, while Baidu kept working fine. After some testing I guessed that Google had added some kind of restriction, so the idea was to first use HttpClient to simulate a browser client and fetch the search result page, and then process that page with HTMLParser.
HttpClient is a subproject under Apache Jakarta Commons that provides an efficient, up-to-date, feature-rich client-side toolkit for the HTTP protocol. Website:
http://hc.apache.org/
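For orientation, the basic GET pattern of the commons-httpclient 3.x API used in this post looks roughly like the sketch below. This is only a minimal sketch with a placeholder class name and URL; the full solution further down uses the same calls plus timeouts and a retry handler.

import java.io.IOException;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;

public class HttpClientGetSketch {
    public static void main(String[] args) throws IOException {
        HttpClient client = new HttpClient();
        GetMethod get = new GetMethod("http://hc.apache.org/");
        try {
            // execute the GET request and check the status code
            int status = client.executeMethod(get);
            if (status == HttpStatus.SC_OK) {
                // fine for small pages; large responses are better read as a stream
                System.out.println(get.getResponseBodyAsString());
            }
        } finally {
            // always return the connection to the connection manager
            get.releaseConnection();
        }
    }
}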
The test code from before and after the fix is shown below.
The original code, TestGoogleSearch.java:
package com.htmlparser;

import java.net.HttpURLConnection;
import java.net.URL;

import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;

/**
 * Test parsing of the Google search results page
 * @author Michael
 */
public class TestGoogleSearch {

    /**
     * @param args
     */
    public static void main(String[] args) {
        TestGoogleSearch test = new TestGoogleSearch();
        String url = "http://www.google.com.hk/search?hl=zh-CN&source=hp&q=nero9%E5%88%BB%E5%BD%95ape&aq=f&aqi=&aql=&oq=&gs_rfai=";
        test.parseLinkFilter(url);
    }

    /**
     * @param url
     */
    private void parseLinkFilter(String url) {
        System.out.println("NodeFilter start...");
        try {
            HttpURLConnection.setFollowRedirects(true);
            URL netUrl = new URL(url);
            HttpURLConnection con = (HttpURLConnection) netUrl.openConnection();
            con.setInstanceFollowRedirects(false);
            con.connect();
            Parser parser = new Parser(con);
            parser.setEncoding(parser.getEncoding());
            // match all <A> tags and print link text plus target URL
            NodeFilter filter = new TagNameFilter("A");
            NodeList list = parser.extractAllNodesThatMatch(filter);
            for (int i = 0; i < list.size(); i++) {
                LinkTag tag = (LinkTag) list.elementAt(i);
                System.out.println(tag.getLinkText() + " -- " + tag.getLink());
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        System.out.println("NodeFilter end");
    }
}
Output:
NodeFilter start...
here -- http://www.google.com.hk/url?sa=p&hl=zh-CN&cki=PREF%3DID%3Db49ef17c9c585188:FF%3D1:LD%3Dzh-CN:NW%3D1:TM%3D1288754215:LM%3D1288754215:S%3DDvpSJRMCSGZqN7mE&q=http://www.google.com.hk/search%3Fhl%3Dzh-CN%26source%3Dhp%26q%3Dnero9%25E5%2588%25BB%25E5%25BD%2595ape%26aq%3Df%26aqi%3D%26aql%3D%26oq%3D%26gs_rfai%3D&ust=1288754245627953&usg=AFQjCNFqlSLKTXZTp3Vk7WcVsOHOkhNeMQ
NodeFilter end
As the output shows, only a single link pointing back to Google itself gets parsed out.
The modified code, TestParserGoogleSearch.java:
package com.htmlparser;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;

/**
 * @author Michael
 */
public class TestParserGoogleSearch {

    /**
     * @param args
     */
    public static void main(String[] args) {
        TestParserGoogleSearch parser = new TestParserGoogleSearch();
        String url = "http://www.google.com.hk/search?hl=zh-CN&source=hp&q=nero9%E5%88%BB%E5%BD%95ape&aq=f&aqi=&aql=&oq=&gs_rfai=";
        String html = parser.getUrlHtmlByHttpClient(url);
        parser.parseHtmlLink(html);
    }

    /**
     * Parse the search-result HTML string
     * @param htmlstr
     */
    private void parseHtmlLink(String htmlstr) {
        try {
            Parser parser = Parser.createParser(htmlstr, "utf-8");
            // create a TagNameFilter instance
            TagNameFilter filter = new TagNameFilter("A");
            // extract all <A> tag nodes
            NodeList nodes = parser.extractAllNodesThatMatch(filter);
            if (nodes != null) {
                System.out.println(nodes.size());
                for (int i = 0; i < nodes.size(); i++) {
                    LinkTag tag = (LinkTag) nodes.elementAt(i);
                    System.out.println(tag.getLinkText() + " -- "
                            + tag.getLink());
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Simulate a browser client to fetch the search-result page
     * @param url
     * @return the page HTML, or null on failure
     */
    private String getUrlHtmlByHttpClient(String url) {
        String searchHtml = null;
        HttpClient httpClient = new HttpClient();
        httpClient.getHttpConnectionManager().getParams()
                .setConnectionTimeout(5000);
        GetMethod getMethod = new GetMethod(url);
        getMethod.getParams().setParameter(HttpMethodParams.SO_TIMEOUT, 5000);
        getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
                new DefaultHttpMethodRetryHandler());
        try {
            int statusCode = httpClient.executeMethod(getMethod);
            if (statusCode != HttpStatus.SC_OK) {
                System.err.println("Method failed: "
                        + getMethod.getStatusLine());
            }
            InputStream bodyIs = getMethod.getResponseBodyAsStream();
            System.out.println("get response body stream:" + bodyIs);
            // if the Chinese text comes out garbled, switch the charset here
            // BufferedReader br = new BufferedReader(
            //         new InputStreamReader(bodyIs, "GBK"));
            BufferedReader br = new BufferedReader(
                    new InputStreamReader(bodyIs));
            StringBuffer sb = new StringBuffer();
            String line = null;
            while ((line = br.readLine()) != null) {
                sb.append(line);
            }
            searchHtml = sb.toString();
            return searchHtml;
        } catch (HttpException e) {
            System.out.println("Please check your http address!");
            e.printStackTrace();
            return null;
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        } finally {
            getMethod.releaseConnection();
        }
    }
}
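A side note on the commented-out GBK reader above: rather than hard-coding a charset, one option with HttpClient 3.x is to use the charset the server declares in the Content-Type header, via getResponseCharSet(). Below is a possible variant of the body-reading step, a sketch meant to be dropped into the class above rather than part of the original code:

    /**
     * Sketch only (not in the original program): read the response body using
     * the charset declared by the server. getResponseCharSet() comes from
     * HttpMethodBase and falls back to ISO-8859-1 when none is declared.
     */
    private String readBody(GetMethod getMethod) throws IOException {
        BufferedReader br = new BufferedReader(new InputStreamReader(
                getMethod.getResponseBodyAsStream(),
                getMethod.getResponseCharSet()));
        StringBuffer sb = new StringBuffer();
        String line = null;
        while ((line = br.readLine()) != null) {
            sb.append(line);
        }
        return sb.toString();
    }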
Output:
get response body stream:org.apache.commons.httpclient.AutoCloseInputStream@193385d
85
图片 -- http://www.google.com.hk/images?hl=zh-cn&q=nero9%E5%88%BB%E5%BD%95ape&um=1&ie=UTF-8&source=og&sa=N&tab=wi
视频 -- http://www.google.com.hk/search?hl=zh-cn&q=nero9%E5%88%BB%E5%BD%95ape&um=1&ie=UTF-8&tbo=u&tbs=vid:1&source=og&sa=N&tab=wv
...... (some of the output omitted)
This way all of the links can be parsed out, and with the appropriate extra parameters the results can be paged through as well, as sketched below.
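For example, Google's web search results of that era could usually be paged with a start offset of 10 results per page. A hypothetical helper (buildPagedUrl is not part of the original program) might look like this:

    /**
     * Hypothetical helper (not in the original code): append a "start" offset
     * to the search URL, assuming 10 results per page.
     */
    private String buildPagedUrl(String baseUrl, int page) {
        return baseUrl + "&start=" + (page * 10);
    }

    // usage: fetch and parse the second page of the same query
    // String page2 = buildPagedUrl(url, 1);
    // parser.parseHtmlLink(parser.getUrlHtmlByHttpClient(page2));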
OK, that is the basic solution process; I hope it helps anyone who needs it.