常常需要爬取百度统计出来的数据,难免要进行百度的模拟登录!现将程序贴出来,供他人也供自己以后使用:
package org.baidu;
import java.util.List;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.CookieStore;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.protocol.ClientContext;
import org.apache.http.entity.BufferedHttpEntity;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager;
import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.protocol.HttpContext;
/**
 * Wrapper around Apache HttpClient that issues GET and POST requests while
 * attaching one shared {@link CookieStore} to every request, so cookies set by
 * one response are sent with later requests.
 *
 * <p>The single instance is held by a private nested class and handed out by
 * {@link #getInstance()}.
 *
 * @author kevin
 */
@SuppressWarnings("deprecation")
public class BaiduConnectService {
    /** Cookie store attached to the context of every request made by this service. */
    private CookieStore cookieStore = new BasicCookieStore();

    private BaiduConnectService() {
    }

    /** Holds the single service instance; initialized when this nested class is first used. */
    private static class BaiduConnectServiceContainer {
        private static BaiduConnectService bc = new BaiduConnectService();
    }

    /**
     * Returns the shared service instance. A log line is printed on every call.
     *
     * @return the single {@code BaiduConnectService}
     */
    public static BaiduConnectService getInstance() {
        System.out.println("初始化:BaiduConnectService.");
        return BaiduConnectServiceContainer.bc;
    }

    /**
     * Executes a GET request.
     *
     * @param url the URL to fetch
     * @return the response, with its body already buffered in memory
     * @throws Exception if the request fails
     */
    public HttpResponse execute(String url) throws Exception {
        return this.execute(url, null);
    }

    /**
     * Executes a request against {@code url}: a form-encoded POST when
     * {@code params} is non-null, otherwise a GET.
     *
     * <p>The response body is read fully into memory before this method returns
     * and the per-call connection manager is shut down, so callers may read the
     * entity (repeatedly) without having to release any connection themselves.
     *
     * @param url    the URL to request
     * @param params form fields for a POST, or {@code null} to issue a GET
     * @return the response, with its body already buffered in memory
     * @throws Exception if the form body cannot be encoded or the request fails
     */
    public HttpResponse execute(String url, List<NameValuePair> params) throws Exception {
        HttpUriRequest request;
        if (params != null) {
            HttpPost httpPost = new HttpPost(url);
            // An encoding failure now propagates instead of silently sending an empty POST.
            httpPost.setEntity(new UrlEncodedFormEntity(params));
            request = httpPost;
        } else {
            request = new HttpGet(url);
        }

        HttpContext localContext = new BasicHttpContext();
        localContext.setAttribute(ClientContext.COOKIE_STORE, cookieStore);

        HttpClient httpClient = new DefaultHttpClient(new ThreadSafeClientConnManager());
        try {
            HttpResponse response = httpClient.execute(request, localContext);
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                // Copy the body into memory so it stays readable after the connection is closed below.
                response.setEntity(new BufferedHttpEntity(entity));
            }
            System.out.println("[HTTP状态码:" + response.getStatusLine().getStatusCode() + "]" + "-->Request URL:" + url);
            return response;
        } finally {
            // A fresh client is created per call; without this its connections were never released.
            httpClient.getConnectionManager().shutdown();
        }
    }

    /**
     * @return the cookie store attached to every request
     */
    public CookieStore getCookieStore() {
        return cookieStore;
    }

    /**
     * Replaces the cookie store attached to subsequent requests.
     *
     * @param cookieStore the cookie store to use from now on
     */
    public void setCookieStore(CookieStore cookieStore) {
        this.cookieStore = cookieStore;
    }
}
package org.baidu;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
/**
 * Logs an account into Baidu through {@link BaiduConnectService}: requests the
 * home page, fetches a login token from the passport API, posts the credentials,
 * and finally re-requests the home page to check whether the login took effect.
 *
 * @author kevin
 */
public class BaiduLoginService {
    private static final String BAIDU_URL = "http://www.baidu.com";
    private static final String TOKEN_GET_URL = "https://passport.baidu.com/v2/api/?getapi&tpl=mn&apiver=v3&class=login&logintype=dialogLogin";
    private static final String LOGIN_POST_URL = "https://passport.baidu.com/v2/api/?login";
    private static final String QUERY_GET_URL="http://index.baidu.com/?tpl=trend&word=%D5%F7%B2%F0";

    /**
     * Charset used to decode a response body only when the response declares none.
     * Without it HttpClient falls back to ISO-8859-1, which can never match the
     * Chinese markers searched for below.
     */
    private static final String DEFAULT_CHARSET = "UTF-8";

    /** Captures the token value from the body returned by {@link #TOKEN_GET_URL}. */
    private static final Pattern TOKEN_PATTERN = Pattern.compile("token\" : \"(.*?)\"");

    private final BaiduConnectService bc = BaiduConnectService.getInstance();
    private final String username;
    private final String password;
    private final String verifycode;
    private final String codestring;
    /** Login token; set by {@link #initToken()} before the credentials are posted. */
    private String token;

    /**
     * @param username   account name, sent as the {@code username} form field
     * @param password   account password, sent as the {@code password} form field
     * @param verifycode value sent as the {@code verifycode} form field (may be empty)
     * @param codestring value sent as the {@code codestring} form field (may be empty)
     */
    public BaiduLoginService(String username,String password,String verifycode,String codestring){
        this.username = username;
        this.password = password;
        this.verifycode = verifycode;
        this.codestring = codestring;
    }

    /**
     * Manual test entry point; replace the placeholder credentials before running.
     */
    public static void main(String[] args) throws Exception{
        // Make sure this account can log in successfully at www.baidu.com first.
        new BaiduLoginService("账号", "密码","","").login();
        // Download from Baidu Wenku:
        // new BaiduDownloadService("http://wenku.baidu.com/view/71ce3ec60c22590102029dd1.html").download();
    }

    /**
     * Runs the whole login flow and prints the outcome to standard output.
     *
     * @throws IllegalStateException if no token can be extracted from the passport response
     * @throws Exception if any of the HTTP requests fails
     */
    public void login() throws Exception{
        System.out.println("准备登录 . Usename:"+username);
        // Pre-login: pick up the cookies needed to obtain the token. The body is
        // not needed, so release it right away instead of leaving it unread.
        EntityUtils.consume(bc.execute(BAIDU_URL).getEntity());
        this.initToken();
        System.out.println("正在登录。");
        HttpResponse response = bc.execute(LOGIN_POST_URL, produceFormEntity());
        String result = EntityUtils.toString(response.getEntity(), DEFAULT_CHARSET);
        String statusCode = substring(result, "error=", "'");
        System.out.println("百度返回的状态码:" + statusCode);
        // Automatic captcha recognition could be plugged in here, e.g. tools.autoCode(codestring).
        System.out.println("--------------------------------");
        if (checkLogin()) {
            System.out.println("登录结果: 登录成功.");
        } else {
            System.out.println("登录异常或频繁,需要验证码,codeString为:" + substring(result, "codestring=", "&"));
            System.out.println("登录结果:" + username + " 登录失败.");
        }
        // Optional follow-up query: this.queryKeywordsUrl();
    }

    /**
     * Fetches {@link #QUERY_GET_URL}, prints the page, and reports whether it
     * contains the "not indexed" marker text.
     */
    private void queryKeywordsUrl() throws Exception {
        System.out.println("获取关键词的百度指数...");
        HttpResponse response = bc.execute(QUERY_GET_URL);
        // NOTE(review): the query string is GBK percent-encoded, so this page may be
        // GBK as well; the UTF-8 fallback only applies if the response declares no charset.
        String str = EntityUtils.toString(response.getEntity(), DEFAULT_CHARSET);
        System.out.println(str);
        // The page says the keyword is "not indexed" when viewing it would require
        // buying the right to create new keywords.
        if (str.contains("未被收录")) {
            System.out.println("关键词未被收录");
        } else {
            System.out.println("关键词已被收录");
        }
    }

    /**
     * Parses {@code content} as HTML and collects the {@code tag} elements whose
     * {@code name} attribute equals {@code name}.
     *
     * @param content HTML text to parse
     * @param tag     tag name to match
     * @param name    required value of the {@code name} attribute
     * @return the matching nodes, or {@code null} if parsing fails (the stack trace is printed)
     */
    public NodeList getNodeByName(String content,String tag,String name){
        Parser parser = Parser.createParser(content, "utf-8");
        AndFilter filter = new AndFilter(new TagNameFilter(tag),new HasAttributeFilter("name",name));
        try {
            return parser.parse(filter);
        } catch (ParserException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Builds the form fields for the login POST.
     *
     * @return the fields in the order they are sent
     */
    private List<NameValuePair> produceFormEntity() {
        List<NameValuePair> list = new ArrayList<NameValuePair>();
        list.add(new BasicNameValuePair("tt", ""+System.currentTimeMillis()));
        list.add(new BasicNameValuePair("tpl", "mn"));
        list.add(new BasicNameValuePair("token", token));
        list.add(new BasicNameValuePair("isPhone", ""));
        list.add(new BasicNameValuePair("username", username));
        list.add(new BasicNameValuePair("password", password));
        list.add(new BasicNameValuePair("verifycode", verifycode));
        list.add(new BasicNameValuePair("codestring", codestring));
        return list;
    }

    /**
     * Requests {@link #TOKEN_GET_URL} and stores the token found in its body.
     *
     * @throws IllegalStateException if the body contains no token, so the login
     *         POST is never sent with a null token
     */
    private void initToken() throws Exception{
        System.out.println("获取百度Token...");
        HttpResponse response = bc.execute(TOKEN_GET_URL);
        String str = EntityUtils.toString(response.getEntity(), DEFAULT_CHARSET);
        Matcher matcher = TOKEN_PATTERN.matcher(str);
        if (!matcher.find()) {
            throw new IllegalStateException("No login token found in response from " + TOKEN_GET_URL);
        }
        token = matcher.group(1);
        System.out.println("Token已获取:"+token);
    }

    /**
     * Re-requests the home page and treats the absence of the text "登录" as
     * proof of being logged in.
     *
     * @return {@code true} if the page does not contain "登录"
     */
    private boolean checkLogin() throws Exception{
        HttpResponse response = bc.execute(BAIDU_URL);
        String content = EntityUtils.toString(response.getEntity(), DEFAULT_CHARSET);
        return !content.contains("登录");
    }

    /**
     * Returns the text located between the first occurrence of {@code s1} and
     * the next occurrence of {@code s2} after it.
     *
     * @param str text to search
     * @param s1  opening delimiter
     * @param s2  closing delimiter
     * @return the text between the delimiters, or {@code str} unchanged when any
     *         argument is {@code null} or either delimiter cannot be found
     */
    public static String substring(String str, String s1, String s2) {
        if (str == null || s1 == null || s2 == null) {
            return str;
        }
        int start = str.indexOf(s1);
        if (start < 0) {
            // Previously a missing s1 yielded an arbitrary slice starting at s1.length() - 1.
            return str;
        }
        start += s1.length();
        int end = str.indexOf(s2, start);
        if (end < 0) {
            return str;
        }
        return str.substring(start, end);
    }
}
运行BaiduLoginService即可实现登录!
项目所需的jar包如下: