使用httpclient、htmlcleaner、xpath采集新浪微博3G站点数据

0、背景

原来对新浪微博的采集主要靠对weibo.com这个入口进行,但是最近发现有人使用weibo.cn这个入口操作。因为weibo.cn对应的是微博的3g版本,基本没有广告,页面小,这样下载数据量会小很多,并且3g版本的采集比较简单。于是就有了这个3g版本的采集程序。

写出来,分享给大家,希望对有需要的朋友有所帮助。


使用到的类库:httpclient、htmlcleaner

httpclient负责处理http的get和post请求,下载页面;htmlcleaner负责将下载的页面转化为规范的xml,之后用xpath匹配所需内容。


1、基础的http请求类

package cn.mingyuan.weibo.commons;

import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpRequestBase;
import org.apache.http.impl.client.DefaultHttpClient;

/**
 * Base class for objects that issue HTTP requests.
 * <p>
 * It owns one {@link HttpClient} instance, created when the object is
 * constructed, and offers small helpers for putting headers and cookies on a
 * request.
 * 
 * @author mingyuan
 * 
 */
public abstract class RequestCommons {
	/** Name of the request header that carries cookies. */
	private static final String COOKIE_HEADER = "Cookie";

	/** Name/value pairs written by {@link #setHeader(HttpRequestBase)}, in order. */
	private static final String[][] DEFAULT_HEADERS = {
			{ "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" },
			{ "Accept-Language", "en-us,en;q=0.5" },
			{ "Connection", "keep-alive" },
			{ "User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:13.0) Gecko/20100101 Firefox/13.0.1" } };

	/** Client used for every request; assigned by {@link #initHttpClient()}. */
	protected HttpClient httpclient;

	/**
	 * Creates the instance and immediately initializes its HTTP client.
	 */
	public RequestCommons() {
		initHttpClient();
	}

	/**
	 * Initializes {@link #httpclient} with a new {@link DefaultHttpClient}.
	 */
	protected void initHttpClient() {
		this.httpclient = new DefaultHttpClient();
	}

	/**
	 * @return the HTTP client held by this object
	 */
	protected HttpClient getHttpClient() {
		return this.httpclient;
	}

	/**
	 * Adds a header to the request without replacing headers of the same name.
	 * 
	 * @param request
	 *            the GET or POST request to modify
	 * @param key
	 *            header name
	 * @param value
	 *            header value
	 */
	protected void addHeader(HttpRequestBase request, String key, String value) {
		request.addHeader(key, value);
	}

	/**
	 * Adds a {@code Cookie} header to the request through
	 * {@link #addHeader(HttpRequestBase, String, String)}.
	 * 
	 * @param request
	 *            the GET or POST request to modify
	 * @param cookie
	 *            cookie header value
	 */
	protected void addCookie(HttpRequestBase request, String cookie) {
		addHeader(request, COOKIE_HEADER, cookie);
	}

	/**
	 * Sets the {@code Cookie} header of the request, overwriting an existing one.
	 * 
	 * @param request
	 *            the GET or POST request to modify
	 * @param cookie
	 *            cookie header value
	 */
	protected void setCookie(HttpRequestBase request, String cookie) {
		request.setHeader(COOKIE_HEADER, cookie);
	}

	/**
	 * Sets the default browser-like headers (Accept, Accept-Language,
	 * Connection, User-Agent) on the request.
	 * 
	 * @param request
	 *            the HTTP GET or POST request
	 */
	protected void setHeader(HttpRequestBase request) {
		for (String[] pair : DEFAULT_HEADERS) {
			request.setHeader(pair[0], pair[1]);
		}
	}
}
2、weibo.cn登陆

package cn.mingyuan.weibo.login;

import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;

import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;

import cn.mingyuan.weibo.commons.RequestCommons;
import cn.mingyuan.weibo.until.Constants;

/**
 * Logs in to weibo.cn through the Sina 3G SSO pages and collects the cookies
 * returned along the way.
 * <p>
 * {@link #doLogin(String, String)} runs four steps: fetch the login form, post
 * the credentials, load the page the server points to, then follow the link
 * found on that page.
 * 
 * @author mingyuan
 * 
 */
public class Login extends RequestCommons {
	/** Prefix that the login form's action value is appended to. */
	private static final String SSO_BASE_URL = "http://3g.sina.com.cn/prog/wapsite/sso/";

	/** Address of the page that contains the login form. */
	private static final String LOGIN_PAGE_URL = "http://3g.sina.com.cn/prog/wapsite/sso/login.php?backURL=http%3A%2F%2Fweibo.cn%2F&backTitle=%D0%C2%C0%CB%CE%A2%B2%A9&vt=4&revalid=2&ns=1";

	/**
	 * Evaluates an XPath expression and returns its first match as a string.
	 * 
	 * @param root
	 *            parsed document to search
	 * @param xpath
	 *            XPath expression
	 * @return the first match converted with {@code toString()}, or
	 *         {@code null} when nothing matched
	 * @throws XPatherException
	 *             if the expression cannot be evaluated
	 */
	private static String firstXPathResult(TagNode root, String xpath) throws XPatherException {
		Object[] matches = root.evaluateXPath(xpath);
		if (matches == null || matches.length == 0 || matches[0] == null) {
			return null;
		}
		return matches[0].toString();
	}

	/**
	 * Returns the value of the first header in the array.
	 * 
	 * @param headers
	 *            headers taken from a response; may be empty
	 * @return the first header's value, or {@code null} when there is none
	 */
	private static String firstHeaderValue(Header[] headers) {
		if (headers == null || headers.length == 0) {
			return null;
		}
		return headers[0].getValue();
	}

	/**
	 * Searches Set-Cookie headers for the segment that starts with the given
	 * cookie name.
	 * <p>
	 * Every header is split on {@code ';'} and each segment is trimmed before
	 * the comparison, so a leading space after a separator does not hide a
	 * match.
	 * 
	 * @param setCookieHeaders
	 *            the Set-Cookie headers of a response; may be empty
	 * @param cookieName
	 *            prefix the wanted segment starts with
	 * @return the matching segment; the full value of the first header when no
	 *         segment matches; {@code null} when there are no headers at all
	 */
	private static String pickCookie(Header[] setCookieHeaders, String cookieName) {
		if (setCookieHeaders == null || setCookieHeaders.length == 0) {
			return null;
		}
		for (Header header : setCookieHeaders) {
			String value = header.getValue();
			if (value == null) {
				continue;
			}
			for (String segment : value.split(";")) {
				String trimmed = segment.trim();
				if (trimmed.startsWith(cookieName)) {
					return trimmed;
				}
			}
		}
		return setCookieHeaders[0].getValue();
	}

	/**
	 * Downloads the login page and extracts the values needed to submit the
	 * form.
	 * 
	 * @return a three-element array: the form's action, the name of the
	 *         password input, and the value of the {@code vk} input. An element
	 *         is {@code null} when it could not be extracted.
	 */
	private String[] getLoginParameters() {
		HttpGet get = new HttpGet(LOGIN_PAGE_URL);
		setHeader(get);
		String retAction = null;
		String retPassword = null;
		String retVk = null;
		try {
			HttpResponse response = getHttpClient().execute(get);
			HttpEntity entity = response.getEntity();
			// A response may carry no body; there is nothing to parse then.
			if (entity != null) {
				InputStream content = entity.getContent();
				TagNode tagNode = new HtmlCleaner().clean(content, "utf-8");
				retAction = firstXPathResult(tagNode, "//form/@action");
				retPassword = firstXPathResult(tagNode, "//form//input[@type='password']/@name");
				retVk = firstXPathResult(tagNode, "//form//input[@name='vk']/@value");
				EntityUtils.consume(entity);
			}
		} catch (ClientProtocolException e) {
			System.out.println("获取登陆页面失败,location=" + LOGIN_PAGE_URL);
			e.printStackTrace();
		} catch (IOException e) {
			System.out.println("获取页面内容流失败");
			e.printStackTrace();
		} catch (XPatherException e) {
			System.out.println("解析登陆参数失败");
			e.printStackTrace();
		} finally {
			get.releaseConnection();
		}

		System.out.println("请求页面:" + LOGIN_PAGE_URL);
		System.out.println("提交地址:" + retAction);
		System.out.println("密码输入框名称:" + retPassword);
		System.out.println("vk值:" + retVk);

		return new String[] { retAction, retPassword, retVk };
	}

	/**
	 * Posts the account name and password to the login form.
	 * 
	 * @param postAction
	 *            form action, appended to the SSO base URL
	 * @param userNameValue
	 *            weibo account name
	 * @param passwordValue
	 *            weibo password
	 * @param passwordKey
	 *            name of the password input in the form
	 * @param vkValue
	 *            value of the {@code vk} input
	 * @return a two-element array: the value of the first Set-Cookie header
	 *         and the value of the first Location header. An element is
	 *         {@code null} when the response lacks that header or the request
	 *         failed.
	 */
	private String[] submitPassword(String postAction, String userNameValue, String passwordValue, String passwordKey, String vkValue) {
		String url = SSO_BASE_URL + postAction;
		System.out.println("开始提交账号密码:" + url);
		HttpPost post = new HttpPost(url);
		setHeader(post);
		List<NameValuePair> nvps = new ArrayList<NameValuePair>();
		nvps.add(new BasicNameValuePair("mobile", userNameValue));
		nvps.add(new BasicNameValuePair(passwordKey, passwordValue));
		nvps.add(new BasicNameValuePair("remember", "on"));
		nvps.add(new BasicNameValuePair("vk", vkValue));
		nvps.add(new BasicNameValuePair("backURL", "http://weibo.cn/"));
		nvps.add(new BasicNameValuePair("backTitle", "新浪微博"));
		nvps.add(new BasicNameValuePair("submit", "登录"));
		String cookie = null;
		String location = null;
		try {
			// NOTE(review): no charset is passed here, so the library default
			// is used although the form carries non-ASCII values - confirm
			// which charset the server expects.
			post.setEntity(new UrlEncodedFormEntity(nvps));
			HttpResponse response = getHttpClient().execute(post);
			HttpEntity entity = response.getEntity();

			cookie = firstHeaderValue(response.getHeaders("Set-Cookie"));
			if (cookie != null) {
				System.out.println("获取到Cookie:" + cookie);
			}
			location = firstHeaderValue(response.getHeaders("Location"));
			if (location != null) {
				System.out.println("获取到跳转链接:" + location);
			}
			EntityUtils.consume(entity);
		} catch (UnsupportedEncodingException e) {
			e.printStackTrace();
		} catch (ClientProtocolException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			post.releaseConnection();
		}

		return new String[] { cookie, location };
	}

	/**
	 * Loads the page the login response pointed to and reads the next link
	 * from it.
	 * 
	 * @param redirectUrl
	 *            address of the intermediate page
	 * @return a two-element array: the {@code gsid_CTandWM} cookie segment (or
	 *         the raw first Set-Cookie value when that segment is not found)
	 *         and the first {@code //div/a/@href} link on the page. An element
	 *         is {@code null} when it could not be obtained.
	 */
	private String[] getRedirectPageInfo(String redirectUrl) {
		System.out.println("开始获取跳转链接页面");
		HttpGet get = new HttpGet(redirectUrl);
		setHeader(get);
		String cookie = null;
		String clickHref = null;
		try {
			HttpResponse redirectResponse = getHttpClient().execute(get);
			cookie = pickCookie(redirectResponse.getHeaders("Set-Cookie"), "gsid_CTandWM");

			HttpEntity entity = redirectResponse.getEntity();
			if (entity != null) {
				InputStream content = entity.getContent();
				TagNode tagNode = new HtmlCleaner().clean(content, "utf-8");
				clickHref = firstXPathResult(tagNode, "//div/a/@href");
				if (clickHref != null) {
					System.out.println("获取到跳转链接地址:" + clickHref);
				}
				EntityUtils.consume(entity);
			}
		} catch (ClientProtocolException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} catch (XPatherException e) {
			e.printStackTrace();
		} finally {
			get.releaseConnection();
		}
		return new String[] { cookie, clickHref };
	}

	/**
	 * Requests the final link while sending the cookie obtained so far.
	 * 
	 * @param cookie
	 *            cookie obtained by the previous request
	 * @param redirectUrl
	 *            address to request
	 * @return the {@code _WEIBO_UID} cookie segment of the response (or the
	 *         raw first Set-Cookie value when that segment is not found); the
	 *         {@code cookie} argument unchanged when the response sets no
	 *         cookie or the request fails
	 */
	private String doRedirection(String cookie, String redirectUrl) {
		HttpGet get = new HttpGet(redirectUrl);
		setHeader(get);
		get.setHeader("Cookie", cookie);
		String result = cookie;
		try {
			HttpResponse response = getHttpClient().execute(get);
			HttpEntity entity = response.getEntity();
			Header[] setCookieHeaders = response.getHeaders("Set-Cookie");
			String raw = firstHeaderValue(setCookieHeaders);
			if (raw != null) {
				System.out.println("跳转页面取回的cookie:" + raw);
				result = pickCookie(setCookieHeaders, "_WEIBO_UID");
			}
			EntityUtils.consume(entity);
		} catch (ClientProtocolException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			// Always hand the connection back, also when the request failed.
			get.releaseConnection();
		}
		return result;
	}

	/**
	 * Logs in with the given account and returns the combined cookie string.
	 * 
	 * @param userNameValue
	 *            weibo account name
	 * @param passwordValue
	 *            weibo password
	 * @return the cookie from the intermediate page and the cookie from the
	 *         final request, joined with {@code ';'}
	 * @throws IllegalStateException
	 *             if a step does not yield the value the next step needs
	 *             (form parameters, redirect address, or cookie)
	 */
	public String doLogin(String userNameValue, String passwordValue) {
		// Step 1: read the form parameters from the login page.
		String[] loginParameters = getLoginParameters();
		String postAction = loginParameters[0];
		String passwordKey = loginParameters[1];
		String vkValue = loginParameters[2];
		if (postAction == null || passwordKey == null) {
			throw new IllegalStateException("Could not read the login form parameters from " + LOGIN_PAGE_URL);
		}

		// Step 2: submit the credentials; only the redirect address is used
		// from this response, its cookie is replaced by the one from step 3.
		String[] cookieRedirectLocation = submitPassword(postAction, userNameValue, passwordValue, passwordKey, vkValue);
		String redirectUrl = cookieRedirectLocation[1];
		if (redirectUrl == null) {
			throw new IllegalStateException("Login response carried no Location header");
		}

		// Step 3: load the intermediate page.
		String[] redirectInfo = getRedirectPageInfo(redirectUrl);
		String cookie = redirectInfo[0];
		redirectUrl = redirectInfo[1];
		if (cookie == null || redirectUrl == null) {
			throw new IllegalStateException("Intermediate page yielded no cookie or no follow-up link");
		}

		System.out.println("准备跳转");
		try {
			TimeUnit.SECONDS.sleep(3);
		} catch (InterruptedException e) {
			// Keep the interrupt visible to the caller; the login continues.
			Thread.currentThread().interrupt();
			e.printStackTrace();
		}
		System.out.println("开始跳转");

		// Step 4: follow the link and merge both cookies.
		String cookieOfRedirect = doRedirection(cookie, redirectUrl);
		String finalCookie = cookie + ';' + cookieOfRedirect;
		System.out.println("登陆成功,最终cookie为:" + finalCookie);
		return finalCookie;
	}

	/**
	 * Logs in with the account and password taken from {@link Constants}.
	 * 
	 * @return the login cookie string
	 */
	public String doLogin() {
		return this.doLogin(Constants.LOGIN_USERNAME, Constants.LOGIN_PASSWORD);
	}
}

3、测试

步骤:先登录获取cookie,再把cookie填到请求里,获取页面内容。


package cn.mingyuan.weibo.test;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.util.EntityUtils;

import cn.mingyuan.weibo.commons.RequestCommons;
import cn.mingyuan.weibo.login.Login;
/**
 * Manual test: logs in, then fetches one weibo page with the login cookie and
 * prints its content.
 * 
 * @author mingyuan
 * 
 */
public class WeiboTest extends RequestCommons {

	/**
	 * Prints a stream to standard output line by line, decoding it as UTF-8.
	 * <p>
	 * The stream is deliberately left open; the caller consumes the entity and
	 * releases the connection afterwards.
	 * 
	 * @param in
	 *            stream to print; nothing is printed when it is {@code null}
	 */
	private void printContent(InputStream in) {
		if (in == null) {
			return;
		}
		try {
			BufferedReader reader = new BufferedReader(new InputStreamReader(in, "utf-8"));
			String line;
			while ((line = reader.readLine()) != null) {
				System.out.println(line);
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Requests a fixed weibo page and prints the response body.
	 * 
	 * @param finalCookie
	 *            cookie string to send; no Cookie header is set when it is
	 *            {@code null}
	 */
	private void test(String finalCookie) {
		HttpGet get = new HttpGet("http://weibo.cn/irlucene");
		setHeader(get);
		if (finalCookie != null) {
			get.setHeader("Cookie", finalCookie);
		}
		try {
			HttpResponse response = getHttpClient().execute(get);
			HttpEntity entity = response.getEntity();
			// A response may have no body; skip printing in that case.
			if (entity != null) {
				printContent(entity.getContent());
				EntityUtils.consume(entity);
			}
		} catch (ClientProtocolException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			get.releaseConnection();
		}
	}

	/**
	 * Entry point: logs in with the placeholder credentials below, prints the
	 * cookie, then fetches the test page with it.
	 * 
	 * @param args
	 *            unused
	 */
	public static void main(String[] args) {
		Login login = new Login();
		String userNameValue = "username";
		String passwordValue = "password";
		String cookie = login.doLogin(userNameValue, passwordValue);
		System.out.println("final Cookie=" + cookie);
		new WeiboTest().test(cookie);
	}

}



你可能感兴趣的:(httpclient,httpclient,httpclient,xpath,xpath,HTMLCleaner)