HttpClient模拟登陆人人网,并且爬取日志内容(一)

使用HttpClient最新版本,下载地址Download

注释已经写的比较清楚了,就不再说明了。

爬取到的html内容,既可以用java正则表达式截取,也可以用jsoup(下载地址:Download),目前是使用jsoup,这个用起来非常方便,不用自己去辛苦的写正则表达式。


大家注意了,这种方式有时候会失败,因为用这种方式过于频繁登陆renren.com,可能帐号会被限制,而且会要求输入验证码,所以出现验证码的话,就无法正常登陆renren.com了。

将方法提取了,这样便于阅读。

同时处理了HttpClient的cookie默认策略报警告的异常。

/*
 * ====================================================================
 *
 *  Licensed to the Apache Software Foundation (ASF) under one or more
 *  contributor license agreements.  See the NOTICE file distributed with
 *  this work for additional information regarding copyright ownership.
 *  The ASF licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 * ====================================================================
 *
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Apache Software Foundation.  For more
 * information on the Apache Software Foundation, please see <http://www.apache.org/>.
 * renren.com
 * <input type="hidden" name="origURL" value="http://www.renren.com/home" />
 * <input type="hidden" name="domain" value="renren.com" />
 * <input type="hidden" name="key_id" value="1" />
 * <input type="submit" id="login" class="input-submit login-btn" value="登录人人网" tabindex="5"/>
 * http://s.xnimg.cn/a36853/n/apps/login/login-all.js
 */
package org.apache.http.examples.client;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;

import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.ParseException;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.params.ClientPNames;
import org.apache.http.cookie.ClientCookie;
import org.apache.http.cookie.Cookie;
import org.apache.http.cookie.CookieOrigin;
import org.apache.http.cookie.CookieSpec;
import org.apache.http.cookie.CookieSpecFactory;
import org.apache.http.cookie.MalformedCookieException;
import org.apache.http.cookie.params.CookieSpecPNames;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager;
import org.apache.http.impl.cookie.BasicClientCookie;
import org.apache.http.impl.cookie.BrowserCompatSpec;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.params.HttpParams;
import org.apache.http.protocol.HTTP;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.sun.xml.internal.ws.transport.http.client.HttpCookie;

/**
 * 
 * Purpose:
 * 
 * @author: shihuangzhe.com
 * @since: JDK 1.6
 * @date: 2012-4-28
 * 
 */
public class RrLogin {
	/** 帐号 */
	private static final String userName = "xxxxxx@yahoo.com.cn";
	/** 密码 */
	private static final String password = "xxxxxxxx";
	/** 网域 */
	private static final String domain = "renren.com";
	/** key_id */
	private static final String keyID = "1";
	/** 表单提交url */
	private static String loginURL = "http://www.renren.com/PLogin.do";
	/** 登陆成功后,跳转到我自己的blog日志,人人默认跳转路径为 http://www.renren.com/home */
	private static final String targetUrl = "http://blog.renren.com/blog/84082953/398292611";
	/** 表单域常量(跳转url) */
	private static final String _ORGI_URL = "origURL";
	/** 表单域常量(网域) */
	private static final String _DOMAIN = "domain";
	/** 表单域常量(key_id) */
	private static final String _KEY_ID = "key_id";
	/** 表单域常量(帐号) */
	private static final String _EMAIL = "email";
	/** 表单域常量(密码) */
	private static final String _PASSWORD = "password";
	/** ThreadSafeClientConnManager保证多线程安全 */
	private DefaultHttpClient client = new DefaultHttpClient(
			new ThreadSafeClientConnManager());
	/** response相应 */
	private HttpResponse response;

	/**
	 * Purpose: 登陆renren.com
	 * 
	 * @throws Exception
	 * @return: void
	 */
	private boolean login(String userName, String password) {
		boolean isLogin = false;
		HttpPost httpost = new HttpPost(loginURL);
		// 为请求参数赋值
		List<NameValuePair> nvps = new ArrayList<NameValuePair>();
		nvps.add(new BasicNameValuePair(_ORGI_URL, targetUrl));
		nvps.add(new BasicNameValuePair(_DOMAIN, domain));
		nvps.add(new BasicNameValuePair(_KEY_ID, keyID));
		nvps.add(new BasicNameValuePair(_EMAIL, userName));
		nvps.add(new BasicNameValuePair(_PASSWORD, password));
		try {
			httpost.setEntity(new UrlEncodedFormEntity(nvps, HTTP.UTF_8));
			// 获取请求相应
			response = client.execute(httpost);
			System.out.println(response.getStatusLine());// 返回302
			// 设置cookie,renren.com用于身份验证的cookie有两个,名字分别是p和t.
			// HttpClientParams.setCookiePolicy(client.getParams(),
			// CookiePolicy.BROWSER_COMPATIBILITY);
			// 因为HttpClient 4.0默认cookie策略会报WARN警告,所以手动定制cookie策略
			CookieSpecFactory csf = new CookieSpecFactory() {
				public CookieSpec newInstance(HttpParams params) {
					return new BrowserCompatSpec() {
						@Override
						public void validate(Cookie cookie, CookieOrigin origin)
								throws MalformedCookieException {
							// nothing to do
						}
					};
				}
			};
			client.getCookieSpecs().register("easy", csf);
			client.getParams().setParameter(ClientPNames.COOKIE_POLICY, "easy");
			isLogin = true;
		} catch (UnsupportedEncodingException e) {
			System.err.println("UnsupportedEncodingException!");
		} catch (ClientProtocolException e) {
			System.err.println("ClientProtocolException!");
		} catch (IOException e) {
			System.err.println("IOException!");
		} finally {
			httpost.abort();
		}
		return isLogin;
	}

	/**
	 * Purpose: 获取blog内容
	 * 
	 * @param response
	 * @return: String
	 */
	private void showResult(String userName, String password) {
		try {
			if (!login(userName, password)) {
				System.err.println("登陆失败!");
				System.exit(0);
			}
			/*
			 * 注意,因为renren.com登陆成功后,需要再次经过
			 * http://www.renren.com/callback.do?t=da278e2526f9b2387ea22e57578a85d93
			 * &
			 * origURL=http%3A%2F%2Fblog.renren.com%2Fblog%2F84082953%2F398292611
			 * &needNotify=false 这种方式跳转,所以需要再次处理发一次请求
			 */

			// 打印所有相应头
			Header[] headers = response.getAllHeaders();
			for (int i = 0; i < headers.length; i++) {
				Header header = headers[i];
				System.out.println(header.getName() + ": " + header.getValue());
			}
			// 获取真实跳转路径
			Header locationHeader = response.getFirstHeader("Location");
			// 频繁登陆,failCode = 512,就会要求输入验证码登陆了
			HttpGet httpget = new HttpGet(locationHeader.getValue());
			HttpResponse response2 = client.execute(httpget);
			System.out.println(response2.getStatusLine()); // HTTP/1.1 200 OK
			// 获取Entity
			HttpEntity entity = response2.getEntity();
			// 解析html,拿出blog
			String[] context = htmlToPlainText(EntityUtils.toString(entity));
			System.out.println("---------解析后的内容----------- ");
			System.out.print("Title:  ");
			System.out.println(context[0]);
			System.out.print("Context:  ");
			System.out.println(context[1]);
		} catch (ClientProtocolException e) {
			System.err.println("ClientProtocolException!");
		} catch (ParseException e) {
			System.err.println("ParseException!");
		} catch (IOException e) {
			System.err.println("IOException!");
		} finally {
			// When HttpClient instance is no longer needed,
			// shut down the connection manager to ensure
			// immediate deallocation of all system resources
			client.getConnectionManager().shutdown();
		}
	}

	// /**
	// * Purpose: 正则提取blog内容
	// * @param orgTest
	// * @return: String
	// */
	// private String printBlog(String orgTest) {
	// // 正则匹配规则
	// // String regexp = "<div\\s*id=\"blogContent\"\\s*[^>]*>(.+?)</div>";
	// // String regexp =
	// "(<div id=\"blogContent\" class=\"text-article\")(.+?)( </div>)";
	// Pattern pattern =
	// Pattern.compile("<div\\s*id=\"blogContent\"\\s*[^>]*>(.+?)</div>");
	// Matcher m = pattern.matcher(orgTest);
	// if (!m.find()) {
	// return null;
	// }
	// return m.group(0);
	// }
	/**
	 * Purpose: 使用jsoup解析Html
	 * 
	 * @param html
	 * @return: String[]
	 */
	private static String[] htmlToPlainText(String html) {
		String[] content = new String[] { "", "" };
		Document doc = Jsoup.parse(html);
		// 提取blog标题
		Elements titles = doc.select("h3.title-article>strong");
		for (Element oneSelect : titles)
			content[0] += oneSelect.text();
		// 提取blog内容
		Elements contents = doc.select("div#blogContent");
		for (Element oneSelect : contents)
			content[1] += oneSelect.text();
		return content;
	}

	/**
	 * Purpose: 测试
	 * 
	 * @param args
	 * @return: void
	 */
	public static void main(String[] args) {
		RrLogin renRen = new RrLogin();
		renRen.showResult(userName, password);
	}
}


你可能感兴趣的:(apache,html,String,正则表达式,header,domain)