0、背景
原来对新浪微博的采集主要靠对weibo.com这个入口进行,但是最近发现有人使用weibo.cn这个入口操作。因为weibo.cn对应的是微博的3g版本,基本没有广告,页面小,这样下载数据量会小很多,并且3g版本的采集比较简单。于是就有了这个3g版本的采集程序。
写出来,分享给大家,希望对有需要的朋友有所帮助。
使用到的类库:httpclient、htmlcleaner
httpclient负责处理http的get和post请求,下载页面;htmlcleaner负责将下载的页面转化为规范的xml,之后用xpath匹配所需内容。
1、基础的http请求类
- package cn.mingyuan.weibo.commons;
- import org.apache.http.client.HttpClient;
- import org.apache.http.client.methods.HttpRequestBase;
- import org.apache.http.impl.client.DefaultHttpClient;
- /**
- * http请求基类
- *
- * @author mingyuan
- *
- */
- public abstract class RequestCommons {
- protected HttpClient httpclient = null;
- public RequestCommons() {
- initHttpClient();
- }
- /**
- * 初始化httpclient
- */
- protected void initHttpClient() {
- httpclient = new DefaultHttpClient();
- }
- protected HttpClient getHttpClient() {
- return httpclient;
- }
- protected void addHeader(HttpRequestBase request, String key, String value) {
- request.addHeader(key, value);
- }
- protected void addCookie(HttpRequestBase request, String cookie) {
- addHeader(request, "Cookie", cookie);
- }
- protected void setCookie(HttpRequestBase request, String cookie) {
- request.setHeader("Cookie", cookie);
- }
- /**
- * 设置请求的header值
- *
- * @param request
- * http的get或者post请求
- */
- protected void setHeader(HttpRequestBase request) {
- request.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
- request.setHeader("Accept-Language", "en-us,en;q=0.5");
- request.setHeader("Connection", "keep-alive");
- request.setHeader("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:13.0) Gecko/20100101 Firefox/13.0.1");
- }
- }
2、weibo.cn登陆
- package cn.mingyuan.weibo.login;
- import java.io.IOException;
- import java.io.InputStream;
- import java.io.UnsupportedEncodingException;
- import java.util.ArrayList;
- import java.util.List;
- import java.util.concurrent.TimeUnit;
- import org.apache.http.Header;
- import org.apache.http.HttpEntity;
- import org.apache.http.HttpResponse;
- import org.apache.http.NameValuePair;
- import org.apache.http.client.ClientProtocolException;
- import org.apache.http.client.HttpClient;
- import org.apache.http.client.entity.UrlEncodedFormEntity;
- import org.apache.http.client.methods.HttpGet;
- import org.apache.http.client.methods.HttpPost;
- import org.apache.http.message.BasicNameValuePair;
- import org.apache.http.util.EntityUtils;
- import org.htmlcleaner.HtmlCleaner;
- import org.htmlcleaner.TagNode;
- import org.htmlcleaner.XPatherException;
- import cn.mingyuan.weibo.commons.RequestCommons;
- import cn.mingyuan.weibo.until.Constants;
- /**
- * 登陆 获取cookie
- *
- * @author mingyuan
- *
- */
- public class Login extends RequestCommons {
- /**
- * 获取登陆参数。主要有三个值:第一个是表单提交地址、第二个是密码输入框的名字、第三个是vk的值
- *
- * @return 返回登陆参数,string数组,里面的元素:第一个是表单提交地址、第二个是密码输入框的名字、第三个是vk的值
- */
- private String[] getLoginParameters() {
- HttpClient httpClient = getHttpClient();
- String location = "http://3g.sina.com.cn/prog/wapsite/sso/login.php?backURL=http%3A%2F%2Fweibo.cn%2F&backTitle=%D0%C2%C0%CB%CE%A2%B2%A9&vt=4&revalid=2&ns=1";
- HttpGet get = new HttpGet(location);
- setHeader(get);
- HttpResponse response;
- InputStream content;
- String retAction = null;
- String retPassword = null;
- String retVk = null;
- try {
- response = httpClient.execute(get);
- HttpEntity entity = response.getEntity();
- content = entity.getContent();
- // 提取登陆参数
- HtmlCleaner cleaner = new HtmlCleaner();
- TagNode tagNode = cleaner.clean(content, "utf-8");
- Object[] action = tagNode.evaluateXPath("//form/@action");
- if (action.length > 0) {
- retAction = action[0].toString();
- }
- Object[] passwordKey = tagNode.evaluateXPath("//form//input[@type='password']/@name");
- if (passwordKey.length > 0) {
- retPassword = passwordKey[0].toString();
- }
- Object[] vkKey = tagNode.evaluateXPath("//form//input[@name='vk']/@value");
- if (vkKey.length > 0) {
- retVk = vkKey[0].toString();
- }
- EntityUtils.consume(entity);
- } catch (ClientProtocolException e) {
- System.out.println("获取登陆页面失败,location=" + location);
- e.printStackTrace();
- } catch (IOException e) {
- System.out.println("获取页面内容流失败");
- e.printStackTrace();
- } catch (XPatherException e) {
- System.out.println("解析登陆参数失败");
- e.printStackTrace();
- } finally {
- if (get != null) {
- get.releaseConnection();
- }
- }
- System.out.println("请求页面:" + location);
- System.out.println("提交地址:" + retAction);
- System.out.println("密码输入框名称:" + retPassword);
- System.out.println("vk值:" + retVk);
- return new String[] { retAction, retPassword, retVk };
- }
- /**
- * 提交账号密码,开始登陆
- *
- * @param postAction
- * 登陆地址
- * @param userNameValue
- * 微博登陆账号
- * @param passwordValue
- * 微博登陆密码
- * @param passwordKey
- * 微博登陆框的name
- * @param vkValue
- * vk的值
- * @return 返回取到的cookie与跳转地址,组合成一个String数组。第一个元素为cookie,第二个元素为跳转地址
- */
- private String[] submitPassword(String postAction, String userNameValue, String passwordValue, String passwordKey, String vkValue) {
- HttpClient httpclient = getHttpClient();
- String url = "http://3g.sina.com.cn/prog/wapsite/sso/" + postAction;
- System.out.println("开始提交账号密码:" + url);
- HttpPost post = new HttpPost(url);
- setHeader(post);
- List<NameValuePair> nvps = new ArrayList<NameValuePair>();
- nvps.add(new BasicNameValuePair("mobile", userNameValue));
- nvps.add(new BasicNameValuePair(passwordKey, passwordValue));
- nvps.add(new BasicNameValuePair("remember", "on"));
- nvps.add(new BasicNameValuePair("vk", vkValue));
- nvps.add(new BasicNameValuePair("backURL", "http://weibo.cn/"));
- nvps.add(new BasicNameValuePair("backTitle", "新浪微博"));
- nvps.add(new BasicNameValuePair("submit", "登录"));
- HttpResponse response;
- String cookie = null;
- String location = null;
- try {
- post.setEntity(new UrlEncodedFormEntity(nvps));
- response = httpclient.execute(post);
- HttpEntity entity2 = response.getEntity();
- Header[] setCookie = response.getHeaders("Set-Cookie");
- if (setCookie != null) {
- cookie = setCookie[0].getValue();
- System.out.println("获取到Cookie:" + cookie);
- }
- Header[] locations = response.getHeaders("Location");
- if (locations != null) {
- location = locations[0].getValue();
- System.out.println("获取到跳转链接:" + location);
- }
- EntityUtils.consume(entity2);
- } catch (UnsupportedEncodingException e1) {
- e1.printStackTrace();
- } catch (ClientProtocolException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- } finally {
- post.releaseConnection();
- }
- return new String[] { cookie, location };
- }
- /**
- * 获取重定向页面内容
- *
- * @param redirectUrl
- * 获取重定向页面地址
- * @return 获取cookie和要跳转的地址
- */
- private String[] getRedirectPageInfo(String redirectUrl) {
- System.out.println("开始获取跳转链接页面");
- HttpGet get = new HttpGet(redirectUrl);
- setHeader(get);
- HttpResponse redirectResponse;
- String cookie = null;
- String clickHref = null;
- try {
- redirectResponse = httpclient.execute(get);
- Header[] headers = redirectResponse.getHeaders("Set-Cookie");
- if (headers != null) {
- cookie = headers[0].getValue();
- String[] splits = cookie.split(";");
- for (String str : splits) {
- if (str.startsWith("gsid_CTandWM")) {
- cookie = str;
- break;
- }
- }
- }
- HttpEntity entity = redirectResponse.getEntity();
- InputStream content = entity.getContent();
- HtmlCleaner cleaner = new HtmlCleaner();
- TagNode tagNode = cleaner.clean(content, "utf-8");
- Object[] clickHrefs = tagNode.evaluateXPath("//div/a/@href");
- if (clickHrefs != null) {
- clickHref = clickHrefs[0].toString();
- System.out.println("获取到跳转链接地址:" + clickHref);
- }
- EntityUtils.consume(entity);
- } catch (ClientProtocolException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- } catch (XPatherException e) {
- e.printStackTrace();
- } finally {
- get.releaseConnection();
- }
- return new String[] { cookie, clickHref };
- }
- /**
- * 跳转
- *
- * @param cookie
- * 上次请求取到的cookie
- * @param redirectUrl
- * 跳转url
- * @return 返回跳转后取得的cookie
- */
- private String doRedirection(String cookie, String redirectUrl) {
- HttpGet get = new HttpGet(redirectUrl);
- setHeader(get);
- get.setHeader("Cookie", cookie);
- HttpResponse response;
- try {
- response = httpclient.execute(get);
- HttpEntity entity = response.getEntity();
- Header[] headers2 = response.getHeaders("Set-Cookie");
- if (headers2 != null) {
- cookie = headers2[0].getValue();
- System.out.println("跳转页面取回的cookie:" + cookie);
- String[] splits = cookie.split(";");
- for (String str : splits) {
- if (str.startsWith("_WEIBO_UID")) {
- cookie = str;
- break;
- }
- }
- }
- EntityUtils.consume(entity);
- } catch (ClientProtocolException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- return cookie;
- }
- /**
- * 提交账号密码,登陆
- *
- * @param userNameValue
- * 微博账号
- * @param passwordValue
- * 微博密码
- * @return 返回cookie
- */
- public String doLogin(String userNameValue, String passwordValue) {
- // 获取登陆页面的参数
- String[] loginParameters = getLoginParameters();
- String postAction = loginParameters[0];
- String passwordKey = loginParameters[1];
- String vkValue = loginParameters[2];
- // 提交账号密码,获取重定向页面链接与cookie
- String[] cookieRedirectLocation = submitPassword(postAction, userNameValue, passwordValue, passwordKey, vkValue);
- String cookie = cookieRedirectLocation[0];
- String redirectUrl = cookieRedirectLocation[1];
- // 获取重定向页面内容
- String[] redirectInfo = getRedirectPageInfo(redirectUrl);
- cookie = redirectInfo[0];
- redirectUrl = redirectInfo[1];
- System.out.println("准备跳转");
- try {
- TimeUnit.SECONDS.sleep(3);
- } catch (InterruptedException e) {
- e.printStackTrace();
- }
- System.out.println("开始跳转");
- String cookieOfRedirect = doRedirection(cookie, redirectUrl);
- StringBuffer sb = new StringBuffer(cookie);
- sb.append(';').append(cookieOfRedirect);
- System.out.println("登陆成功,最终cookie为:" + sb.toString());
- return sb.toString();
- }
- /**
- * 使用配置的账号、密码登陆
- *
- * @return 返回登陆cookie
- */
- public String doLogin() {
- return this.doLogin(Constants.LOGIN_USERNAME, Constants.LOGIN_PASSWORD);
- }
- }
3、测试
步骤:先登录获取cookie,再把cookie填到请求头里获取页面内容。
- package cn.mingyuan.weibo.test;
- import java.io.BufferedReader;
- import java.io.IOException;
- import java.io.InputStream;
- import java.io.InputStreamReader;
- import org.apache.http.HttpEntity;
- import org.apache.http.HttpResponse;
- import org.apache.http.client.ClientProtocolException;
- import org.apache.http.client.methods.HttpGet;
- import org.apache.http.util.EntityUtils;
- import cn.mingyuan.weibo.commons.RequestCommons;
- import cn.mingyuan.weibo.login.Login;
- /**
- * 测试,取页面内容
- * @author mingyuan
- *
- */
- public class WeiboTest extends RequestCommons {
- /**
- * 打印流
- *
- * @param in
- * InputStream
- */
- private void printContent(InputStream in) {
- BufferedReader reader = null;
- try {
- reader = new BufferedReader(new InputStreamReader(in, "utf-8"));
- String line;
- while ((line = reader.readLine()) != null) {
- System.out.println(line);
- }
- } catch (Exception e) {
- e.printStackTrace();
- } finally {
- // 这里不关闭流,留作他用
- }
- }
- /**
- * 测试,读取一个微博地址,打印出页面内容
- *
- * @param finalCookie
- * cookie
- */
- private void test(String finalCookie) {
- HttpGet get = new HttpGet("http://weibo.cn/irlucene");
- setHeader(get);
- get.setHeader("Cookie", finalCookie.toString());
- HttpResponse response;
- try {
- response = httpclient.execute(get);
- HttpEntity entity = response.getEntity();
- printContent(entity.getContent());
- EntityUtils.consume(entity);
- } catch (ClientProtocolException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- } finally {
- get.releaseConnection();
- }
- }
- public static void main(String[] args) {
- Login login = new Login();
- String userNameValue = "username";
- String passwordValue = "password";
- String cookie = login.doLogin(userNameValue, passwordValue);
- System.out.println("final Cookie=" + cookie);
- new WeiboTest().test(cookie);
- }
- }
转载:http://blog.csdn.net/telnetor/article/details/8582045
https://passport.sina.cn/signup/signup?r=http%3A%2F%2Fmy.sina.cn%2F%3Fpos%3D108%26vt%3D4%26m%3D78fc51068140045a973a3aeab4db2381