0、背景
原来对新浪微博的采集主要靠对weibo.com这个入口进行,但是最近发现有人使用weibo.cn这个入口操作。因为weibo.cn对应的是微博的3g版本,基本没有广告,页面小,这样下载数据量会小很多,并且3g版本的采集比较简单。于是就有了这个3g版本的采集程序。
写出来,分享给大家,希望对有需要的朋友有所帮助。
使用到的类库:httpclient、htmlcleaner
httpclient负责处理http的get和post请求,下载页面;htmlcleaner负责将下载的页面转化为规范的xml,之后用xpath匹配所需内容。
1、基础的http请求类
package cn.mingyuan.weibo.commons;

import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpRequestBase;
import org.apache.http.impl.client.DefaultHttpClient;

/**
 * Base class for objects that issue HTTP requests.
 *
 * <p>Holds one {@link HttpClient} per instance and offers small helpers for
 * putting headers and cookies on a request.
 *
 * @author mingyuan
 */
public abstract class RequestCommons {

    /** Client used for every request made through this instance. */
    protected HttpClient httpclient = null;

    /** Creates the instance and its underlying HTTP client. */
    public RequestCommons() {
        initHttpClient();
    }

    /**
     * Creates the {@link HttpClient} stored in {@link #httpclient}.
     */
    protected void initHttpClient() {
        httpclient = new DefaultHttpClient();
    }

    /**
     * @return the HTTP client held by this instance
     */
    protected HttpClient getHttpClient() {
        return httpclient;
    }

    /**
     * Adds a header to the request via {@code request.addHeader}.
     *
     * @param request
     *            the GET or POST request to modify
     * @param key
     *            header name
     * @param value
     *            header value
     */
    protected void addHeader(HttpRequestBase request, String key, String value) {
        request.addHeader(key, value);
    }

    /**
     * Adds a {@code Cookie} header to the request by delegating to
     * {@link #addHeader(HttpRequestBase, String, String)}.
     *
     * @param request
     *            the GET or POST request to modify
     * @param cookie
     *            cookie header value
     */
    protected void addCookie(HttpRequestBase request, String cookie) {
        addHeader(request, "Cookie", cookie);
    }

    /**
     * Sets the {@code Cookie} header on the request via {@code request.setHeader}.
     *
     * @param request
     *            the GET or POST request to modify
     * @param cookie
     *            cookie header value
     */
    protected void setCookie(HttpRequestBase request, String cookie) {
        request.setHeader("Cookie", cookie);
    }

    /**
     * Sets the fixed set of browser-like request headers (Accept,
     * Accept-Language, Connection, User-Agent).
     *
     * @param request
     *            the GET or POST request to modify
     */
    protected void setHeader(HttpRequestBase request) {
        request.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
        request.setHeader("Accept-Language", "en-us,en;q=0.5");
        request.setHeader("Connection", "keep-alive");
        request.setHeader("User-Agent",
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:13.0) Gecko/20100101 Firefox/13.0.1");
    }
}
2、weibo.cn登陆
package cn.mingyuan.weibo.login;

import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;

import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;

import cn.mingyuan.weibo.commons.RequestCommons;
import cn.mingyuan.weibo.until.Constants;

/**
 * Performs the weibo.cn login sequence and returns the resulting cookie string.
 *
 * <p>The sequence is: fetch the login form, post the credentials, fetch the
 * page named by the {@code Location} header, then follow the first link found
 * on that page.
 *
 * @author mingyuan
 */
public class Login extends RequestCommons {

    /**
     * Returns the value of the first header with the given name.
     *
     * @param response
     *            the response to inspect
     * @param name
     *            header name
     * @return the first header's value, or {@code null} when the response
     *         carries no such header
     */
    private static String firstHeaderValue(HttpResponse response, String name) {
        Header[] headers = response.getHeaders(name);
        // An absent header may be reported as an empty array, so a null check
        // alone is not enough before indexing.
        if (headers == null || headers.length == 0) {
            return null;
        }
        return headers[0].getValue();
    }

    /**
     * Searches the given headers for a {@code name=value} segment that starts
     * with {@code cookieName}.
     *
     * @param headers
     *            the {@code Set-Cookie} headers of a response
     * @param cookieName
     *            prefix the wanted segment must start with
     * @return the matching segment (trimmed); the full value of the first
     *         header when no segment matches; {@code null} when there are no
     *         headers at all
     */
    private static String pickCookie(Header[] headers, String cookieName) {
        if (headers == null || headers.length == 0) {
            return null;
        }
        for (Header header : headers) {
            String value = header.getValue();
            if (value == null) {
                continue;
            }
            for (String segment : value.split(";")) {
                String trimmed = segment.trim();
                if (trimmed.startsWith(cookieName)) {
                    return trimmed;
                }
            }
        }
        // Nothing matched: fall back to the whole first header value.
        return headers[0].getValue();
    }

    /**
     * Downloads the login page and extracts the values needed to submit the form.
     *
     * @return a three-element array: form action, name of the password input,
     *         value of the {@code vk} input. An element is {@code null} when it
     *         could not be extracted.
     */
    private String[] getLoginParameters() {
        HttpClient httpClient = getHttpClient();
        String location = "http://3g.sina.com.cn/prog/wapsite/sso/login.php?backURL=http%3A%2F%2Fweibo.cn%2F&backTitle=%D0%C2%C0%CB%CE%A2%B2%A9&vt=4&revalid=2&ns=1";
        HttpGet get = new HttpGet(location);
        setHeader(get);
        String retAction = null;
        String retPassword = null;
        String retVk = null;
        try {
            HttpResponse response = httpClient.execute(get);
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                InputStream content = entity.getContent();
                // Extract the login parameters from the cleaned page.
                HtmlCleaner cleaner = new HtmlCleaner();
                TagNode tagNode = cleaner.clean(content, "utf-8");
                Object[] action = tagNode.evaluateXPath("//form/@action");
                if (action != null && action.length > 0) {
                    retAction = action[0].toString();
                }
                Object[] passwordKey = tagNode.evaluateXPath("//form//input[@type='password']/@name");
                if (passwordKey != null && passwordKey.length > 0) {
                    retPassword = passwordKey[0].toString();
                }
                Object[] vkKey = tagNode.evaluateXPath("//form//input[@name='vk']/@value");
                if (vkKey != null && vkKey.length > 0) {
                    retVk = vkKey[0].toString();
                }
            }
            EntityUtils.consume(entity);
        } catch (ClientProtocolException e) {
            System.out.println("获取登陆页面失败,location=" + location);
            e.printStackTrace();
        } catch (IOException e) {
            System.out.println("获取页面内容流失败");
            e.printStackTrace();
        } catch (XPatherException e) {
            System.out.println("解析登陆参数失败");
            e.printStackTrace();
        } finally {
            get.releaseConnection();
        }
        System.out.println("请求页面:" + location);
        System.out.println("提交地址:" + retAction);
        System.out.println("密码输入框名称:" + retPassword);
        System.out.println("vk值:" + retVk);
        return new String[] { retAction, retPassword, retVk };
    }

    /**
     * Posts the account name and password to the login form.
     *
     * @param postAction
     *            form action, appended to the fixed SSO base URL
     * @param userNameValue
     *            weibo account name
     * @param passwordValue
     *            weibo password
     * @param passwordKey
     *            name of the password input field
     * @param vkValue
     *            value of the {@code vk} field
     * @return a two-element array: value of the first {@code Set-Cookie}
     *         header, value of the first {@code Location} header. An element
     *         is {@code null} when the header is absent or the request failed.
     */
    private String[] submitPassword(String postAction, String userNameValue, String passwordValue,
            String passwordKey, String vkValue) {
        HttpClient httpclient = getHttpClient();
        String url = "http://3g.sina.com.cn/prog/wapsite/sso/" + postAction;
        System.out.println("开始提交账号密码:" + url);
        HttpPost post = new HttpPost(url);
        setHeader(post);
        List<NameValuePair> nvps = new ArrayList<NameValuePair>();
        nvps.add(new BasicNameValuePair("mobile", userNameValue));
        nvps.add(new BasicNameValuePair(passwordKey, passwordValue));
        nvps.add(new BasicNameValuePair("remember", "on"));
        nvps.add(new BasicNameValuePair("vk", vkValue));
        nvps.add(new BasicNameValuePair("backURL", "http://weibo.cn/"));
        nvps.add(new BasicNameValuePair("backTitle", "新浪微博"));
        nvps.add(new BasicNameValuePair("submit", "登录"));
        String cookie = null;
        String location = null;
        try {
            // NOTE(review): no charset is passed, so the non-ASCII form values
            // are encoded with the library default - confirm this matches what
            // the server expects.
            post.setEntity(new UrlEncodedFormEntity(nvps));
            HttpResponse response = httpclient.execute(post);
            HttpEntity entity2 = response.getEntity();
            cookie = firstHeaderValue(response, "Set-Cookie");
            if (cookie != null) {
                System.out.println("获取到Cookie:" + cookie);
            }
            location = firstHeaderValue(response, "Location");
            if (location != null) {
                System.out.println("获取到跳转链接:" + location);
            }
            EntityUtils.consume(entity2);
        } catch (UnsupportedEncodingException e1) {
            e1.printStackTrace();
        } catch (ClientProtocolException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            post.releaseConnection();
        }
        return new String[] { cookie, location };
    }

    /**
     * Fetches the intermediate redirect page.
     *
     * @param redirectUrl
     *            address of the redirect page
     * @return a two-element array: the {@code gsid_CTandWM} cookie segment (or
     *         the full first {@code Set-Cookie} value when that segment is not
     *         found), and the first {@code //div/a/@href} on the page. An
     *         element is {@code null} when it could not be obtained.
     */
    private String[] getRedirectPageInfo(String redirectUrl) {
        System.out.println("开始获取跳转链接页面");
        HttpGet get = new HttpGet(redirectUrl);
        setHeader(get);
        String cookie = null;
        String clickHref = null;
        try {
            HttpResponse redirectResponse = httpclient.execute(get);
            cookie = pickCookie(redirectResponse.getHeaders("Set-Cookie"), "gsid_CTandWM");
            HttpEntity entity = redirectResponse.getEntity();
            if (entity != null) {
                InputStream content = entity.getContent();
                HtmlCleaner cleaner = new HtmlCleaner();
                TagNode tagNode = cleaner.clean(content, "utf-8");
                Object[] clickHrefs = tagNode.evaluateXPath("//div/a/@href");
                if (clickHrefs != null && clickHrefs.length > 0) {
                    clickHref = clickHrefs[0].toString();
                    System.out.println("获取到跳转链接地址:" + clickHref);
                }
            }
            EntityUtils.consume(entity);
        } catch (ClientProtocolException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (XPatherException e) {
            e.printStackTrace();
        } finally {
            get.releaseConnection();
        }
        return new String[] { cookie, clickHref };
    }

    /**
     * Follows the link taken from the redirect page, sending the cookie
     * obtained so far.
     *
     * @param cookie
     *            cookie obtained by the previous request
     * @param redirectUrl
     *            URL to follow
     * @return the {@code _WEIBO_UID} cookie segment from the response (or the
     *         full first {@code Set-Cookie} value when that segment is not
     *         found); the {@code cookie} argument unchanged when the response
     *         sets no cookie or the request fails
     */
    private String doRedirection(String cookie, String redirectUrl) {
        HttpGet get = new HttpGet(redirectUrl);
        setHeader(get);
        get.setHeader("Cookie", cookie);
        try {
            HttpResponse response = httpclient.execute(get);
            HttpEntity entity = response.getEntity();
            Header[] headers2 = response.getHeaders("Set-Cookie");
            if (headers2 != null && headers2.length > 0) {
                System.out.println("跳转页面取回的cookie:" + headers2[0].getValue());
                cookie = pickCookie(headers2, "_WEIBO_UID");
            }
            EntityUtils.consume(entity);
        } catch (ClientProtocolException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Release the connection as the other request methods do.
            get.releaseConnection();
        }
        return cookie;
    }

    /**
     * Runs the complete login sequence for the given account.
     *
     * @param userNameValue
     *            weibo account name
     * @param passwordValue
     *            weibo password
     * @return the cookie from the redirect page and the cookie from the final
     *         redirection, joined with {@code ';'}
     * @throws IllegalStateException
     *             when a step does not yield the data the next step needs
     */
    public String doLogin(String userNameValue, String passwordValue) {
        // Step 1: read the form parameters from the login page.
        String[] loginParameters = getLoginParameters();
        String postAction = loginParameters[0];
        String passwordKey = loginParameters[1];
        String vkValue = loginParameters[2];
        if (postAction == null || passwordKey == null) {
            throw new IllegalStateException("Login failed: could not read the login form parameters");
        }

        // Step 2: submit the credentials and get the redirect address.
        String[] cookieRedirectLocation = submitPassword(postAction, userNameValue, passwordValue,
                passwordKey, vkValue);
        String redirectUrl = cookieRedirectLocation[1];
        if (redirectUrl == null) {
            throw new IllegalStateException("Login failed: no redirect location returned after submitting credentials");
        }

        // Step 3: fetch the redirect page for its cookie and its link.
        String[] redirectInfo = getRedirectPageInfo(redirectUrl);
        String cookie = redirectInfo[0];
        redirectUrl = redirectInfo[1];
        if (cookie == null || redirectUrl == null) {
            throw new IllegalStateException("Login failed: redirect page did not provide a cookie and a link");
        }

        System.out.println("准备跳转");
        try {
            TimeUnit.SECONDS.sleep(3);
        } catch (InterruptedException e) {
            e.printStackTrace();
            // Restore the interrupt flag so callers can still observe it.
            Thread.currentThread().interrupt();
        }
        System.out.println("开始跳转");

        // Step 4: follow the link and combine both cookies.
        String cookieOfRedirect = doRedirection(cookie, redirectUrl);
        StringBuilder sb = new StringBuilder(cookie);
        sb.append(';').append(cookieOfRedirect);
        System.out.println("登陆成功,最终cookie为:" + sb.toString());
        return sb.toString();
    }

    /**
     * Logs in with the account and password taken from {@link Constants}.
     *
     * @return the login cookie
     */
    public String doLogin() {
        return this.doLogin(Constants.LOGIN_USERNAME, Constants.LOGIN_PASSWORD);
    }
}
步骤:先登录取cookie,再拿cookie填到请求里面取页面内容。
package cn.mingyuan.weibo.test;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.util.EntityUtils;

import cn.mingyuan.weibo.commons.RequestCommons;
import cn.mingyuan.weibo.login.Login;

/**
 * Small test: logs in, then fetches one weibo.cn page and prints it.
 *
 * @author mingyuan
 */
public class WeiboTest extends RequestCommons {

    /**
     * Prints the stream to standard output line by line, decoding it as UTF-8.
     * Any exception raised while reading is printed rather than propagated.
     *
     * @param in
     *            stream to print
     */
    private void printContent(InputStream in) {
        try {
            BufferedReader lines = new BufferedReader(new InputStreamReader(in, "utf-8"));
            for (String text = lines.readLine(); text != null; text = lines.readLine()) {
                System.out.println(text);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        // The stream is deliberately not closed here so it stays available to the caller.
    }

    /**
     * Requests a fixed weibo.cn address with the given cookie and prints the
     * response body.
     *
     * @param finalCookie
     *            cookie obtained from login
     */
    private void test(String finalCookie) {
        HttpGet request = new HttpGet("http://weibo.cn/irlucene");
        setHeader(request);
        request.setHeader("Cookie", finalCookie.toString());
        try {
            HttpResponse reply = httpclient.execute(request);
            HttpEntity body = reply.getEntity();
            InputStream stream = body.getContent();
            printContent(stream);
            EntityUtils.consume(body);
        } catch (ClientProtocolException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            request.releaseConnection();
        }
    }

    /**
     * Logs in with the placeholder credentials, prints the resulting cookie,
     * then runs the page fetch.
     *
     * @param args
     *            unused
     */
    public static void main(String[] args) {
        Login login = new Login();
        String cookie = login.doLogin("username", "password");
        System.out.println("final Cookie=" + cookie);
        WeiboTest tester = new WeiboTest();
        tester.test(cookie);
    }
}