最近在做全文检索，苦于没有合适的测试数据，很是纠结。没办法，只好去抓取一些行业新闻，于是用到了 HttpClient。
下面贴出代码，和大家分享一下。
// Fetches one news page over HTTP, reads the whole body into a single string
// and extracts the title, body text and keywords from the HTML.
TEbInformationModel model = new TEbInformationModel();
HttpClient httpclient = new DefaultHttpClient();
String html;
try {
    // NOTE(review): only "http.protocol.content-charset" looks like a real
    // HttpClient parameter name; the HTTP.* constants below are header names /
    // charset constants and are presumably ignored as parameter keys — confirm
    // before removing them.
    httpclient.getParams().setParameter("http.protocol.content-charset", HTTP.UTF_8);
    httpclient.getParams().setParameter(HTTP.CONTENT_ENCODING, HTTP.UTF_8);
    httpclient.getParams().setParameter(HTTP.CHARSET_PARAM, HTTP.UTF_8);
    httpclient.getParams().setParameter(HTTP.DEFAULT_PROTOCOL_CHARSET, HTTP.UTF_8);
    httpclient.getParams().setParameter(HTTP.CONTENT_TYPE, HTTP.UTF_8);

    HttpPost httppost = new HttpPost(httpurl);
    httppost.setHeader("Accept-Language", "zh-cn,zh;q=0.5");
    httppost.setHeader("Accept-Charset", "GB2312,utf-8;q=0.7,*;q=0.7");
    httppost.getParams().setParameter("http.protocol.content-charset", HTTP.UTF_8);
    httppost.getParams().setParameter(HTTP.CONTENT_ENCODING, HTTP.UTF_8);
    httppost.getParams().setParameter(HTTP.CHARSET_PARAM, HTTP.UTF_8);
    httppost.getParams().setParameter(HTTP.DEFAULT_PROTOCOL_CHARSET, HTTP.UTF_8);
    httppost.getParams().setParameter(HTTP.CONTENT_TYPE, HTTP.UTF_8);

    HttpResponse response = httpclient.execute(httppost);
    if (response.getEntity() == null) {
        // Fail with a descriptive error instead of a bare NullPointerException.
        throw new java.io.IOException("No response body returned for " + httpurl);
    }
    InputStream is = response.getEntity().getContent();
    // The body is decoded as GBK regardless of the UTF-8 settings above.
    // NOTE(review): confirm the target site really serves GBK.
    BufferedReader br = new BufferedReader(new InputStreamReader(is, "GBK"));
    try {
        StringBuilder sbf = new StringBuilder();
        String line;
        // readLine() strips line terminators, so the page ends up on one line;
        // the extraction regexes use '.', which does not match line breaks.
        while ((line = br.readLine()) != null) {
            sbf.append(line);
        }
        html = sbf.toString();
    } finally {
        /* Release resources: close the reader even if reading fails. */
        br.close();
    }
} finally {
    // Release the connections held by the client, on success and on failure.
    httpclient.getConnectionManager().shutdown();
}

String title = getSubTitle(getStringNoBlank(getTitle(html, "title")));
String context = getSubContext(getStringNoBlank(getTitle(html, "content")));
String key = getSubKey(getStringNoBlank(getTitle(html, "key")));
System.out.println("标题:"+title);
System.out.println("内容:"+context);
System.out.println("关键字:"+key);
下面是正则匹配部分的代码：
/** Matches a run of one or more whitespace characters; compiled once and reused. */
private static final Pattern WHITESPACE_RUN = Pattern.compile("\\s+");

/**
 * Removes every whitespace character (spaces, tabs, carriage returns,
 * line feeds, ...) from the given string.
 *
 * @param str the text to strip; may be {@code null} or empty
 * @return {@code str} with all whitespace removed, or {@code str} itself
 *         when it is {@code null} or empty
 */
private String getStringNoBlank(String str) {
    if (str == null || str.isEmpty()) {
        return str;
    }
    // "\\s+" removes exactly the same characters as the former
    // "\\s*|\t|\r|\n" without the zero-width matches at every position.
    return WHITESPACE_RUN.matcher(str).replaceAll("");
}
/**
 * Returns the text between the first {@code <h1>} and the last
 * {@code </h1>} in the given fragment.
 *
 * @param str HTML fragment to search; may be {@code null}
 * @return the enclosed text, or an empty string when {@code str} is
 *         {@code null} or the markers are missing or out of order
 */
public String getSubTitle(String str){
    if (str == null) {
        return "";
    }
    int open = str.indexOf("<h1>");
    int close = str.lastIndexOf("</h1>");
    // Without this guard a fragment lacking the markers (e.g. "" when nothing
    // matched upstream) would throw StringIndexOutOfBoundsException.
    if (open < 0 || close < open + 4) {
        return "";
    }
    return str.substring(open + 4, close);
}
/**
 * Returns the text between the first {@code <P>} and the last
 * {@code </P>} in the given fragment. The search is case-sensitive, so only
 * upper-case tags are recognised.
 *
 * @param str HTML fragment to search; may be {@code null}
 * @return the enclosed text (which may itself contain inner {@code </P><P>}
 *         pairs), or an empty string when {@code str} is {@code null} or the
 *         markers are missing or out of order
 */
public String getSubContext(String str){
    if (str == null) {
        return "";
    }
    int open = str.indexOf("<P>");
    int close = str.lastIndexOf("</P>");
    // Without this guard a fragment lacking the markers (e.g. "" when nothing
    // matched upstream) would throw StringIndexOutOfBoundsException.
    if (open < 0 || close < open + 3) {
        return "";
    }
    return str.substring(open + 3, close);
}
/**
 * Returns the text between the first {@code </b>} and the last
 * {@code </p>} in the given fragment, i.e. whatever follows the bold label
 * inside the paragraph.
 *
 * @param str HTML fragment to search; may be {@code null}
 * @return the enclosed text, or an empty string when {@code str} is
 *         {@code null} or the markers are missing or out of order
 */
public String getSubKey(String str){
    if (str == null) {
        return "";
    }
    int open = str.indexOf("</b>");
    int close = str.lastIndexOf("</p>");
    // Without this guard a fragment lacking the markers (e.g. "" when nothing
    // matched upstream) would throw StringIndexOutOfBoundsException.
    if (open < 0 || close < open + 4) {
        return "";
    }
    return str.substring(open + 4, close);
}
/**
 * Collects every fragment of the page that matches the pattern selected by
 * {@code type} and returns them concatenated in document order. Despite its
 * name, this method serves the title, the body and the keyword line alike.
 *
 * @param s    the page HTML to search
 * @param type {@code "title"} or {@code "content"}; any other value selects
 *             the keyword-paragraph pattern
 * @return all matches joined together, or an empty string when nothing matches
 */
private String getTitle( String s,String type)
{
    final String expression;
    if ("title".equals(type)) {
        expression = "<div class=\"zz_leftneirong1\">.*?</h1>";
    } else if ("content".equals(type)) {
        expression = "<div class=\"zz_leftneirong4\" id=\"content\" name=\"content\">.*? </div>";
    } else {
        expression = " <p class=\"key\"><b>本文关键词:</b>.*?</p>";
    }

    final Matcher hits = Pattern.compile(expression, Pattern.CANON_EQ).matcher(s);
    final StringBuilder joined = new StringBuilder();
    // Append each match as it is found; '.' does not cross line breaks.
    while (hits.find()) {
        joined.append(hits.group());
    }
    return joined.toString();
}