常常需要爬取百度统计出来的数据,难免要进行百度的模拟登录!现将程序贴出来,供他人参考,也供自己以后使用:
package org.baidu; import java.util.List; import org.apache.http.HttpEntity; import org.apache.http.HttpResponse; import org.apache.http.NameValuePair; import org.apache.http.client.CookieStore; import org.apache.http.client.HttpClient; import org.apache.http.client.entity.UrlEncodedFormEntity; import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.HttpPost; import org.apache.http.client.methods.HttpUriRequest; import org.apache.http.client.protocol.ClientContext; import org.apache.http.impl.client.BasicCookieStore; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager; import org.apache.http.protocol.BasicHttpContext; import org.apache.http.protocol.HttpContext; /** * HTTPCLIENT请求封装。 * @author kevin */ @SuppressWarnings("deprecation") public class BaiduConnectService { private CookieStore cookieStore = new BasicCookieStore(); private BaiduConnectService(){} private static class BaiduConnectServiceContainer{ private static BaiduConnectService bc = new BaiduConnectService(); } public static BaiduConnectService getInstance(){ System.out.println("初始化:BaiduConnectService."); return BaiduConnectServiceContainer.bc; } public HttpResponse execute(String url) throws Exception{ return this.execute(url,null); } public HttpResponse execute(String url, List<NameValuePair> params) throws Exception{ HttpClient httpClient = new DefaultHttpClient( new ThreadSafeClientConnManager()); HttpResponse response = null; HttpUriRequest request = null; if (params != null) { HttpPost httpPost = new HttpPost(url); try { HttpEntity postBodyEnt = new UrlEncodedFormEntity(params); httpPost.setEntity(postBodyEnt); } catch (Exception e) { e.printStackTrace(); } request = httpPost; } else { HttpGet httpGet = new HttpGet(url); request = httpGet; } HttpContext localContext = new BasicHttpContext(); localContext.setAttribute(ClientContext.COOKIE_STORE, cookieStore); response = httpClient.execute(request, 
localContext); System.out.println("[HTTP状态码:" + response.getStatusLine().getStatusCode() + "]" + "-->Request URL:" + url); return response; } public CookieStore getCookieStore() { return cookieStore; } public void setCookieStore(CookieStore cookieStore) { this.cookieStore = cookieStore; } }
package org.baidu; import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.http.HttpResponse; import org.apache.http.NameValuePair; import org.apache.http.message.BasicNameValuePair; import org.apache.http.util.EntityUtils; import org.htmlparser.Parser; import org.htmlparser.filters.AndFilter; import org.htmlparser.filters.HasAttributeFilter; import org.htmlparser.filters.TagNameFilter; import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; /** * 百度登录SERVICE. * @author kevin */ public class BaiduLoginService { private BaiduConnectService bc = BaiduConnectService.getInstance(); private static final String BAIDU_URL = "http://www.baidu.com"; private static final String TOKEN_GET_URL = "https://passport.baidu.com/v2/api/?getapi&tpl=mn&apiver=v3&class=login&logintype=dialogLogin"; private static final String LOGIN_POST_URL = "https://passport.baidu.com/v2/api/?login"; private static final String QUERY_GET_URL="http://index.baidu.com/?tpl=trend&word=%D5%F7%B2%F0"; private String username; private String password; private String verifycode; private String codestring; private String token; public BaiduLoginService(String username,String password,String verifycode,String codestring){ this.username = username; this.password = password; this.verifycode = verifycode; this.codestring = codestring; } // test public static void main(String[] args) throws Exception{ // 请确保你在www.baidu.com可以登录成功。 new BaiduLoginService("账号", "密码","","").login(); // 下载百度文库。 // new BaiduDownloadService("http://wenku.baidu.com/view/71ce3ec60c22590102029dd1.html").download(); } public void login() throws Exception{ System.out.println("准备登录 . Usename:"+username); // 预登录,获取cookie以便获取token. 
bc.execute(BAIDU_URL); this.initToken(); System.out.println("正在登录。"); HttpResponse response = bc.execute(LOGIN_POST_URL, produceFormEntity()); String result = EntityUtils.toString(response.getEntity()); String statusCode = this.substring(result, "error=", "'"); System.out.println("百度返回的状态码:" + statusCode); // 自动识别验证码。 // tools.autoCode(codestring); EntityUtils.consume(response.getEntity()); System.out.println("--------------------------------"); if(!checkLogin()){ System.out.println("登录异常或频繁,需要验证码,codeString为:" + this.substring(result, "codestring=", "&")); System.out.println("登录结果:" + username + " 登录失败."); }else{ System.out.println("登录结果:" + " 登录成功."); } // this.queryKeywordsUrl(); } private void queryKeywordsUrl() throws Exception { System.out.println("获取关键词的百度指数..."); HttpResponse response = bc.execute(QUERY_GET_URL); String str = EntityUtils.toString(response.getEntity()); System.out.println(str); // 未被收录,如要查看相关数据,您需要购买创建新词的权限。 if(str.contains("未被收录")){ System.out.println("关键词未被收录"); }else{ System.out.println("关键词已被收录"); } // Pattern pattern = Pattern.compile("token\" : \"(.*?)\""); // Matcher matcher = pattern.matcher(str); // if(matcher.find()){ // token = matcher.group(1); // } // System.out.println("Token已获取:"+token); } public NodeList getNodeByName(String content,String tag,String name){ Parser parser = Parser.createParser(content, "utf-8"); AndFilter filter = new AndFilter(new TagNameFilter(tag),new HasAttributeFilter("name",name)); try { return parser.parse(filter); } catch (ParserException e) { e.printStackTrace(); return null; } } // 登录POST参数 private List<NameValuePair> produceFormEntity() throws UnsupportedEncodingException{ List<NameValuePair> list = new ArrayList<NameValuePair>(); list.add(new BasicNameValuePair("tt", ""+System.currentTimeMillis())); list.add(new BasicNameValuePair("tpl", "mn")); list.add(new BasicNameValuePair("token", token)); list.add(new BasicNameValuePair("isPhone", "")); list.add(new BasicNameValuePair("username", username)); 
list.add(new BasicNameValuePair("password", password)); list.add(new BasicNameValuePair("verifycode", verifycode)); list.add(new BasicNameValuePair("codestring", codestring)); return list; } private void initToken() throws Exception{ System.out.println("获取百度Token..."); HttpResponse response = bc.execute(TOKEN_GET_URL); String str = EntityUtils.toString(response.getEntity()); Pattern pattern = Pattern.compile("token\" : \"(.*?)\""); Matcher matcher = pattern.matcher(str); if(matcher.find()){ token = matcher.group(1); } System.out.println("Token已获取:"+token); } private boolean checkLogin() throws Exception{ HttpResponse response = bc.execute(BAIDU_URL); boolean res = false; String content = EntityUtils.toString(response.getEntity()); if(!content.contains("登录")){ res = true; } EntityUtils.consume(response.getEntity()); return res; } public static String substring(String str, String s1, String s2) { // 1、先获得0-s1的字符串,得到新的字符串sb1 // 2、从sb1中开始0-s2获得最终的结果。 try { StringBuffer sb = new StringBuffer(str); String sb1 = sb.substring(sb.indexOf(s1) + s1.length()); return String.valueOf(sb1.substring(0, sb1.indexOf(s2))); } catch (StringIndexOutOfBoundsException e) { return str; } } }
运行BaiduLoginService的main方法(先将其中的"账号"和"密码"替换为真实的百度账号和密码)即可实现登录!
项目所需的jar包如下: