自己写的实时爬取 CSDN 2012 博客之星 88位候选人排名

如果觉得我的技术文章还有点值得列位看官汲取之处,

请给我投上宝贵的一票,以资鼓励呵,多谢,多谢!!

本人ID:m13666368773

投票地址:http://vote.blog.csdn.net/item/blogstar/m13666368773

 

有幸入选 CSDN 2012 博客之星 88位候选人,但是排名不是很靠前,想看看自己距离前面几名还差多少票,遂写了这个粗劣的程序,跑了一下

原理:由于评选页面估计是异步读取的信息,所以只能逐个进入88名候选人的投票页面,获取有用信息:用户名、票数、排名,所以爬虫爬行速度有点慢,需要优化,不过基本上实现了排名。

 

程序如下:

package com.aptech;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.junit.Test;

/**
 * Small crawler that requests the voting page of every candidate listed in
 * {@link #user}, extracts the blog user name, vote count and current rank from
 * the returned HTML, and prints the candidates ordered by rank.
 *
 * <p>Each candidate page is requested separately, so the reported ranks are
 * not an atomic snapshot: two candidates may report the same rank, or a rank
 * may be missing. The printing code therefore sorts what was actually fetched
 * instead of looking up the ranks 1..N one by one.
 */
public class TestPachongUrl {
	/** Results of the last crawl, keyed by the rank text; the value is the {@code name-votes=rank} message. */
	static Map<String, String> messageMap = new HashMap<String, String>();
	/** Prefix of every candidate voting page; the user id is appended to it. */
	static String url = "http://vote.blog.csdn.net/item/blogstar/";
	/** User ids of the candidates to crawl. */
	static String user[] = new String[] { "Testing_is_believing", "t0nsha", "iukey", "yjflinchong", "taomanman", "chinafe", "hliq5399", "dog250", "qinjuning", "cheny_com", "v_JULY_v", "zhmxy555",
			"Purpleendurer", "iihero", "yming0221", "ccanan", "tigerjb", "cheungmine", "hawksoft", "sheismylife", "hfahe", "cyq1984", "littletigerat", "kmyhy", "caimouse", "manoel", "xyz_lmn",
			"hunkcai", "yiyaaixuexi", "norains", "clever101", "leftfist", "xiaominghimi", "niyi0318", "yanghuiliu", "abandonship", "mapdigit", "bill_man", "Augusdi", "LoveLion", "sunboy_2050",
			"kongxx", "21aspnet", "chszs", "thl789", "mylxiaoyi", "akof1314", "yincheng01", "keyboardOTA", "pan_tian", "downmoon", "wangkuifeng0118", "robinson_0612", "bluishglc", "coolbacon",
			"tangcheng_ok", "tianxiaode", "cjjky", "MoreWindows", "mr_raptor", "dojotoolkit", "chelsea", "chgaowei", "teamlet", "IBM_hoojo", "iefreer", "lee576", "jaminwm", "xuhuojun", "linghe301",
			"caolaosanahnu", "ricohzhanglong", "totogo2010", "axman", "ce123", "rabbit729", "nkmnkm", "superdont", "m13666368773", "aomandeshangxiao", "hitlion2008", "siren0203", "feixiaoxing",
			"Poechant", "cloudhsu", "Innost", "yanghua_kobe", "tianlesoftware" };

	// HTML fragments that surround the three values on a candidate page.
	private static final String BLOG_BEGIN = "博客地址:<a href=\"http://blog.csdn.net/";
	private static final String BLOG_END = "\" class=\"red\" target=\"_blank\">";
	private static final String VOTES_BEGIN = "票数:<span class=\"red\">";
	private static final String VOTES_END = "</span> 票</li>";
	private static final String RANK_BEGIN = "当前排名:<span class=\"red\">";
	private static final String RANK_END = "</span> 名</li>";

	/** Connect/read timeout so that one stalled request cannot hang the whole crawl. */
	private static final int TIMEOUT_MILLIS = 10000;

	/** Width (in characters) to which {@link #addBlank(String)} pads. */
	private static final int NAME_COLUMN_WIDTH = 30;
	/** Width used by {@link #addChinaBlank(String)}; every character of the text counts as two columns. */
	private static final int HEADER_COLUMN_WIDTH = 70;

	/** One parsed crawl result. */
	private static final class Candidate {
		final String name;
		final String votes;
		final String rankText;
		final int rank;

		Candidate(String name, String votes, String rankText) {
			this.name = name;
			this.votes = votes;
			this.rankText = rankText;
			this.rank = parseRank(rankText);
		}
	}

	/**
	 * Fetches one candidate voting page and extracts its key figures.
	 *
	 * <p>This is a plain helper, not a JUnit test: it takes a parameter and
	 * returns a value, so it carries no {@code @Test} annotation.
	 *
	 * @param url the candidate voting page
	 * @return a message of the form {@code blogUserName-votes=rank}
	 * @throws IllegalStateException if one of the expected HTML markers is missing
	 * @throws Exception if the connection cannot be opened or read
	 */
	public static String test(URL url) throws Exception {
		HttpURLConnection connection = (HttpURLConnection) url.openConnection();
		try {
			connection.setConnectTimeout(TIMEOUT_MILLIS);
			connection.setReadTimeout(TIMEOUT_MILLIS);
			// The request is sent as a POST with an empty body, as the original crawler did.
			// NOTE(review): a plain GET may work just as well - confirm against the server.
			connection.setDoOutput(true);
			connection.setRequestMethod("POST");
			connection.setRequestProperty("user-agent", "mozilla/4.7 [en] (win98; i)");
			connection.connect();

			OutputStreamWriter out = new OutputStreamWriter(connection.getOutputStream(), "UTF-8");
			try {
				out.flush();
			} finally {
				out.close();
			}

			String page = readAll(connection.getInputStream());
			return between(page, BLOG_BEGIN, BLOG_END) + "-" + between(page, VOTES_BEGIN, VOTES_END) + "=" + between(page, RANK_BEGIN, RANK_END);
		} finally {
			connection.disconnect();
		}
	}

	/**
	 * Reads the whole stream as text, joining the lines with CRLF, and closes it.
	 * The page is decoded as UTF-8 instead of the platform default charset so the
	 * Chinese markers can be found on any machine (assumes the site serves UTF-8 -
	 * TODO confirm).
	 */
	private static String readAll(InputStream stream) throws IOException {
		BufferedReader reader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
		try {
			StringBuilder page = new StringBuilder();
			String line;
			while ((line = reader.readLine()) != null) {
				page.append(line).append("\r\n");
			}
			return page.toString();
		} finally {
			reader.close();
		}
	}

	/**
	 * Returns the text between the first {@code beginMarker} and the first
	 * {@code endMarker} that follows it.
	 */
	private static String between(String page, String beginMarker, String endMarker) {
		int begin = page.indexOf(beginMarker);
		if (begin < 0) {
			throw new IllegalStateException("Marker not found in page: " + beginMarker);
		}
		int from = begin + beginMarker.length();
		// Search only behind the begin marker; the same closing fragment may occur earlier in the page.
		int end = page.indexOf(endMarker, from);
		if (end < 0) {
			throw new IllegalStateException("Marker not found in page: " + endMarker);
		}
		return page.substring(from, end);
	}

	/** Parses a rank; text that is not a number sorts behind every real rank. */
	private static int parseRank(String rankText) {
		try {
			return Integer.parseInt(rankText.trim());
		} catch (NumberFormatException e) {
			return Integer.MAX_VALUE;
		}
	}

	/**
	 * Crawls every candidate page and prints a table ordered by rank.
	 *
	 * @param args unused
	 * @throws Exception if any candidate page cannot be fetched or parsed
	 */
	public static void main(String[] args) throws Exception {
		// Build all URLs first so that a malformed one is reported before any output or request.
		List<URL> pages = new ArrayList<URL>(user.length);
		for (int i = 0; i < user.length; i++) {
			pages.add(new URL(url + user[i]));
		}

		SimpleDateFormat dateformat = new SimpleDateFormat("yyyy年MM月dd日 HH时mm分ss秒 E ");
		String nowTime = dateformat.format(new Date());

		System.out.println("统计时间:" + nowTime);
		System.out.println("候选人数量:" + user.length);
		System.out.println(addChinaBlank("用户名") + addChinaBlank("票数") + "排名");

		List<Candidate> candidates = new ArrayList<Candidate>(pages.size());
		for (int i = 0; i < pages.size(); i++) {
			String message = test(pages.get(i));
			// Votes and rank are the last two fields, so split from the right.
			int equalsAt = message.lastIndexOf('=');
			int dashAt = message.lastIndexOf('-', equalsAt);
			String rankText = message.substring(equalsAt + 1);
			messageMap.put(rankText, message);
			candidates.add(new Candidate(message.substring(0, dashAt), message.substring(dashAt + 1, equalsAt), rankText));
		}

		// Stable sort: candidates that report the same rank keep their crawl order and none is lost.
		Collections.sort(candidates, new Comparator<Candidate>() {
			public int compare(Candidate left, Candidate right) {
				return left.rank < right.rank ? -1 : (left.rank > right.rank ? 1 : 0);
			}
		});

		for (int i = 0; i < candidates.size(); i++) {
			Candidate candidate = candidates.get(i);
			System.out.println(addBlank(candidate.name) + candidate.votes + "\t\t\t" + candidate.rankText);
		}
	}

	/**
	 * Right-pads a user name with spaces to 30 characters.
	 *
	 * @param user the text to pad
	 * @return the padded text; returned unchanged when it already has 30 or more characters
	 */
	public static String addBlank(String user) {
		StringBuilder padded = new StringBuilder(user);
		for (int i = user.length(); i < NAME_COLUMN_WIDTH; i++) {
			padded.append(' ');
		}
		return padded.toString();
	}

	/**
	 * Right-pads a header with spaces, counting every character of the header
	 * as two columns, up to a width of 70 columns.
	 *
	 * @param message the header text to pad
	 * @return the header followed by {@code 70 - 2 * message.length()} spaces (none if that is not positive)
	 */
	public static String addChinaBlank(String message) {
		StringBuilder padded = new StringBuilder(message);
		for (int i = message.length() * 2; i < HEADER_COLUMN_WIDTH; i++) {
			padded.append(' ');
		}
		return padded.toString();
	}
}


运行一下:

统计时间:2012年12月08日 13时31分24秒 星期六 
候选人数量:88
用户名                              票数                        排名
v_JULY_v                      1133			1
yincheng01                    827			2
MoreWindows                   446			3
mr_raptor                     371			4
yiyaaixuexi                   350			5
LoveLion                      298			6
ricohzhanglong                295			7
tianlesoftware                243			8
xiaominghimi                  236			9
taomanman                     225			10
yming0221                     205			11
zhmxy555                      180			12
Poechant                      165			13
aomandeshangxiao              146			14
linghe301                     133			15
hawksoft                      125			16
nkmnkm                        112			17
cjjky                         112			18
niyi0318                      90			19
cyq1984                       87			20
clever101                     84			21
cloudhsu                      79			22
akof1314                      70			23
Testing_is_believing          65			24
cheny_com                     57			25
yanghuiliu                    52			26
lee576                        45			27
manoel                        45			28
bill_man                      43			29
hfahe                         43			30
tangcheng_ok                  41			31
teamlet                       41			32
dojotoolkit                   41			33
cheungmine                    41			34
yjflinchong                   41			35
norains                       40			36
sheismylife                   38			37
m13666368773                  36			38
coolbacon                     36			39
pan_tian                      34			40
sunboy_2050                   34			41
qinjuning                     31			42
Augusdi                       30			43
21aspnet                      29			44
tigerjb                       29			45
axman                         29			46
mapdigit                      29			47
downmoon                      27			48
chgaowei                      27			49
ce123                         26			50
mylxiaoyi                     26			51
dog250                        25			52
t0nsha                        25			53
feixiaoxing                   25			54
thl789                        25			55
kongxx                        25			56
abandonship                   24			57
iukey                         23			58
caimouse                      22			59
caolaosanahnu                 22			60
xyz_lmn                       21			61
robinson_0612                 20			62
IBM_hoojo                     20			63
Innost                        20			64
wangkuifeng0118               19			65
iihero                        19			66
hunkcai                       19			67
rabbit729                     19			68
chelsea                       19			69
totogo2010                    18			70
tianxiaode                    18			71
Purpleendurer                 18			72
yanghua_kobe                  17			73
jaminwm                       16			74
iefreer                       16			75
siren0203                     16			76
ccanan                        15			77
littletigerat                 15			78
kmyhy                         15			79
chszs                         15			80
superdont                     14			81
keyboardOTA                   14			82
leftfist                      14			83
chinafe                       13			84
hliq5399                      12			85
bluishglc                     10			86
hitlion2008                   9			87
xuhuojun                      7			88


 

你可能感兴趣的:(自己写的实时爬取 CSDN 2012 博客之星 88位候选人排名)