【续】自己写的实时爬取 CSDN 2012 博客之星 88位候选人排名

本篇博客撰写说明:

①时代在变,楼主的需求也有了一点更新:自入围《CSDN 2012 博客之星》评选以来,楼主幸运地挤进了前20名,但与第10名的票数还有一定差距,故更新程序,用于查看楼主自己与第十名的票数差距、排名差距。

②有CSDN的朋友对前几天楼主写的《自己写的实时爬取 CSDN 2012 博客之星 88位候选人排名》程序有点兴趣,故将程序 进行优化和重构,并加入了相应的注释,使程序更加具有可读性。

末:由于楼主能力有限,原先发现的该程序爬行88个网页速度过慢,主因系:网速原因,故不再优化。楼主也发现解析各个网页中【用户名、票数、排名】部分有很大的优化空间,如感兴趣的网友,请提供解析部分的优化方案,共同学习哦,亲!

 

如果觉得我的技术文章还有点让列位看官汲取之处,

请给我投上宝贵的一票,以兹鼓励呵,多谢,多谢!!

本人ID:m13666368773

投票地址:http://vote.blog.csdn.net/item/blogstar/m13666368773

凡投票的朋友,

请第一时间在文章下方评论:“当前票数:XXX+已投票+邮箱:xxx@xxx.com”

稍后会将 <Web应用界面设计规范>PPT版本,发给您。

该博客地址:http://blog.csdn.net/m13666368773/article/details/8276810

请稍花点时间,为我投上您手中宝贵的一票,

敬告:我这能看到您的投票“用户名”,请勿虚报!多谢,多谢!!

截止时间:2012-12-30

 

废话不多说:上代码

package com.aptech;

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Fetches the CSDN 2012 "Blog Star" voting page of every candidate, prints the
 * candidates ordered by their current rank and finally compares the blog
 * author's own entry with the candidate currently ranked tenth.
 *
 * <p>Each candidate is carried around as a compact record string of the form
 * {@code userName-votes=rank}; see {@link #test(URL)} and
 * {@link #getRankMessage(String)}.
 */
public class TestPachongUrl {
	private static final String url = "http://vote.blog.csdn.net/item/blogstar/";// common prefix of every voting page URL
	/*
	 * The 88 candidates of the CSDN 2012 Blog Star election, entered by hand.
	 */
	private static final String user[] = new String[] { "Testing_is_believing", "t0nsha", "iukey", "yjflinchong", "taomanman", "chinafe", "hliq5399", "dog250", "qinjuning", "cheny_com", "v_JULY_v",
			"zhmxy555", "Purpleendurer", "iihero", "yming0221", "ccanan", "tigerjb", "cheungmine", "hawksoft", "sheismylife", "hfahe", "cyq1984", "littletigerat", "kmyhy", "caimouse", "manoel",
			"xyz_lmn", "hunkcai", "yiyaaixuexi", "norains", "clever101", "leftfist", "xiaominghimi", "niyi0318", "yanghuiliu", "abandonship", "mapdigit", "bill_man", "Augusdi", "LoveLion",
			"sunboy_2050", "kongxx", "21aspnet", "chszs", "thl789", "mylxiaoyi", "akof1314", "yincheng01", "keyboardOTA", "pan_tian", "downmoon", "wangkuifeng0118", "robinson_0612", "bluishglc",
			"coolbacon", "tangcheng_ok", "tianxiaode", "cjjky", "MoreWindows", "mr_raptor", "dojotoolkit", "chelsea", "chgaowei", "teamlet", "IBM_hoojo", "iefreer", "lee576", "jaminwm", "xuhuojun",
			"linghe301", "caolaosanahnu", "ricohzhanglong", "totogo2010", "axman", "ce123", "rabbit729", "nkmnkm", "superdont", "m13666368773", "aomandeshangxiao", "hitlion2008", "siren0203",
			"feixiaoxing", "Poechant", "cloudhsu", "Innost", "yanghua_kobe", "tianlesoftware" };

	private static final String master = "m13666368773";// user name of the blog author; used to pick the author's record
	private static final String tenthUser = "10";// rank (as text) of the candidate the author is compared with

	// Markers that surround the three fields inside a voting page.
	private static final String NAME_BEGIN = "博客地址:<a href=\"http://blog.csdn.net/";
	private static final String NAME_END = "\" class=\"red\" target=\"_blank\">";
	private static final String VOTES_BEGIN = "票数:<span class=\"red\">";
	private static final String VOTES_END = "</span> 票</li>";
	private static final String RANK_BEGIN = "当前排名:<span class=\"red\">";
	private static final String RANK_END = "</span> 名</li>";

	private static final String COLUMN_GAP = "			   ";// separator printed between the votes and the rank column
	private static final int NAME_COLUMN_WIDTH = 30;
	private static final int HEADER_COLUMN_WIDTH = 70;

	/**
	 * Requests one candidate's voting page (as an empty-bodied POST, exactly as
	 * before) and extracts user name, current votes and current rank from it.
	 *
	 * @param url the voting page of one candidate
	 * @return the record {@code userName-votes=rank}
	 * @throws StringIndexOutOfBoundsException if one of the expected markers is
	 *             missing from the page; the message names the marker
	 * @throws Exception on any connection or I/O failure
	 */
	public static String test(URL url) throws Exception {
		HttpURLConnection connection = (HttpURLConnection) url.openConnection();
		connection.setDoOutput(true);
		connection.setRequestMethod("POST");
		connection.setRequestProperty("user-agent", "mozilla/4.7 [en] (win98; i)");
		connection.connect();

		// The POST carries no form data: open the body stream and close it again.
		OutputStreamWriter out = new OutputStreamWriter(connection.getOutputStream(), "UTF-8");
		try {
			out.flush();
		} finally {
			out.close();
		}

		// Closing the reader (inside readAll) releases this request's resources.
		// disconnect() is deliberately not called: all requests go to the same
		// host and it could tear down a connection that is still reusable.
		String page = readAll(openReader(connection));

		return extract(page, NAME_BEGIN, NAME_END) + "-" + extract(page, VOTES_BEGIN, VOTES_END) + "=" + extract(page, RANK_BEGIN, RANK_END);
	}

	/**
	 * Opens a reader on the response body. The charset declared in the
	 * Content-Type header is used when present and supported; otherwise the
	 * platform default charset is used, which is what the code always did before.
	 */
	private static InputStreamReader openReader(HttpURLConnection connection) throws java.io.IOException {
		InputStream body = connection.getInputStream();
		String declared = declaredCharset(connection.getContentType());
		if (declared != null) {
			try {
				return new InputStreamReader(body, declared);
			} catch (java.io.UnsupportedEncodingException ignored) {
				// Unknown or malformed charset name: fall through to the default.
			}
		}
		return new InputStreamReader(body);
	}

	/** Returns the value of the {@code charset=} parameter of a Content-Type header, or null if there is none. */
	private static String declaredCharset(String contentType) {
		if (contentType == null) {
			return null;
		}
		String key = "charset=";
		for (int i = 0; i + key.length() <= contentType.length(); i++) {
			if (contentType.regionMatches(true, i, key, 0, key.length())) {
				String name = contentType.substring(i + key.length());
				int end = name.indexOf(';');
				if (end >= 0) {
					name = name.substring(0, end);
				}
				name = name.trim();
				if (name.length() >= 2 && name.startsWith("\"") && name.endsWith("\"")) {
					name = name.substring(1, name.length() - 1);
				}
				return name.length() == 0 ? null : name;
			}
		}
		return null;
	}

	/** Reads everything from the reader, terminating every line with CRLF, and always closes it. */
	private static String readAll(InputStreamReader source) throws java.io.IOException {
		BufferedReader reader = new BufferedReader(source);
		try {
			StringBuilder page = new StringBuilder();
			String line;
			while ((line = reader.readLine()) != null) {
				page.append(line).append("\r\n");
			}
			return page.toString();
		} finally {
			reader.close();
		}
	}

	/**
	 * Returns the text between the first occurrence of beginMarker and the first
	 * occurrence of endMarker that follows it. A missing marker is reported
	 * explicitly instead of being turned into a bogus substring index.
	 */
	private static String extract(String page, String beginMarker, String endMarker) {
		int markerAt = page.indexOf(beginMarker);
		if (markerAt < 0) {
			throw new StringIndexOutOfBoundsException("marker not found in page: " + beginMarker);
		}
		int from = markerAt + beginMarker.length();
		int to = page.indexOf(endMarker, from);
		if (to < 0) {
			throw new StringIndexOutOfBoundsException("end marker not found in page: " + endMarker);
		}
		return page.substring(from, to);
	}

	/** Appends count spaces to text; a count of zero or less leaves text unchanged. */
	private static String padRight(String text, int count) {
		if (count <= 0) {
			return text;
		}
		StringBuilder padded = new StringBuilder(text.length() + count);
		padded.append(text);
		for (int i = 0; i < count; i++) {
			padded.append(' ');
		}
		return padded.toString();
	}

	// A record is "userName-votes=rank". Votes and rank are taken from the end of
	// the record so that a '-' or '=' inside the user name cannot shift the split.

	/** Position of the '=' that precedes the rank. */
	private static int rankSeparator(String message) {
		return message.lastIndexOf('=');
	}

	/** Position of the '-' that precedes the votes. */
	private static int voteSeparator(String message) {
		return message.lastIndexOf('-', rankSeparator(message));
	}

	/** User name part of a record. */
	private static String nameOf(String message) {
		return message.substring(0, voteSeparator(message));
	}

	/** Votes part of a record, as text. */
	private static String votesOf(String message) {
		return message.substring(voteSeparator(message) + 1, rankSeparator(message));
	}

	/** Rank part of a record, as text. */
	private static String rankOf(String message) {
		return message.substring(rankSeparator(message) + 1);
	}

	/**
	 * Pads a user name with trailing spaces to a width of 30 characters so the
	 * columns line up. Names of 30 or more characters are returned unchanged.
	 *
	 * @param user the user name
	 * @return the padded user name
	 */
	public static String addBlank(String user) {
		return padRight(user, NAME_COLUMN_WIDTH - user.length());
	}

	/**
	 * Pads a table header with trailing spaces. Every character of the header is
	 * counted as two columns and spaces are added up to a width of 70 columns.
	 *
	 * @param message the header text
	 * @return the padded header
	 */
	public static String addChinaBlank(String message) {
		return padRight(message, HEADER_COLUMN_WIDTH - message.length() * 2);
	}

	/**
	 * Turns one record {@code userName-votes=rank} into an aligned output row:
	 * padded user name, votes, column gap, rank.
	 *
	 * @param message a record as returned by {@link #test(URL)}
	 * @return the formatted row
	 */
	public static String getRankMessage(String message) {
		return addBlank(nameOf(message)) + votesOf(message) + COLUMN_GAP + rankOf(message);
	}

	/**
	 * Crawls all candidates, prints them ordered by rank and prints the gap in
	 * votes and rank between the author and the tenth-ranked candidate.
	 */
	public static void main(String[] args) throws Exception {
		SimpleDateFormat dateformat = new SimpleDateFormat("yyyy年MM月dd日 HH时mm分ss秒 E ");
		String nowTime = dateformat.format(new Date());

		System.out.println("统计时间:" + nowTime);
		System.out.println("候选人数量:" + user.length);
		System.out.println(addChinaBlank("用户名") + addChinaBlank("票数") + "排名");

		// Group the records by reported rank. The pages are fetched one after the
		// other, so ranks may shift in between: two candidates can report the same
		// rank and another rank can be missing altogether.
		Map<String, List<String>> recordsByRank = new HashMap<String, List<String>>();
		for (int i = 0; i < user.length; i++) {
			String record = test(new URL(url + user[i]));
			String rank = rankOf(record);
			List<String> sameRank = recordsByRank.get(rank);
			if (sameRank == null) {
				sameRank = new ArrayList<String>();
				recordsByRank.put(rank, sameRank);
			}
			sameRank.add(record);
		}

		String masterRecord = null;// the author's record
		String tenthRecord = null;// the record of the tenth-ranked candidate
		int printed = 0;
		for (int rank = 1; rank <= user.length; rank++) {
			List<String> sameRank = recordsByRank.get(String.valueOf(rank));
			if (sameRank == null) {
				continue;// nobody reported this rank
			}
			for (String record : sameRank) {
				System.out.println(getRankMessage(record));
				printed++;
				if (master.equals(nameOf(record))) {
					masterRecord = record;
				}
				if (tenthUser.equals(rankOf(record))) {
					tenthRecord = record;
				}
			}
		}
		if (printed < user.length) {
			System.err.println("warning: " + (user.length - printed) + " candidate(s) reported a rank outside 1.." + user.length + " and were not listed");
		}
		if (masterRecord == null || tenthRecord == null) {
			System.err.println("cannot compare: the record of " + master + " or of rank " + tenthUser + " is missing");
			return;
		}

		int piaoshuGap = Integer.parseInt(votesOf(tenthRecord)) - Integer.parseInt(votesOf(masterRecord));// votes the author is behind the tenth
		int paimingGap = Integer.parseInt(rankOf(masterRecord)) - Integer.parseInt(rankOf(tenthRecord));// places the author is behind the tenth

		System.out.println("=============以下对比楼主与第十名用户的信息===============================");
		System.out.println(getRankMessage(tenthRecord));
		System.out.println(getRankMessage(masterRecord));
		System.out.println("========================================================================");
		System.out.println(addBlank("difference tenthUer VS master") + piaoshuGap + COLUMN_GAP + paimingGap);
	}

}


运行一下:

统计时间:2012年12月19日 17时16分34秒 星期三 
候选人数量:88
用户名                             票数                           排名
v_JULY_v                      1347			   1
MoreWindows                   583			   2
yiyaaixuexi                   476			   3
mr_raptor                     435			   4
xiaominghimi                  410			   5
yincheng01                    395			   6
zhmxy555                      391			   7
yming0221                     379			   8
Poechant                      358			   9
ricohzhanglong                346			   10
LoveLion                      322			   11
tianlesoftware                286			   12
taomanman                     282			   13
m13666368773                  217			   14
aomandeshangxiao              216			   15
cheny_com                     176			   16
linghe301                     160			   17
dojotoolkit                   149			   18
hawksoft                      141			   19
cjjky                         123			   20
akof1314                      122			   21
nkmnkm                        120			   22
clever101                     116			   23
yanghuiliu                    103			   24
cyq1984                       103			   25
niyi0318                      101			   26
sheismylife                   96			   27
cloudhsu                      87			   28
coolbacon                     76			   29
Testing_is_believing          71			   30
cheungmine                    56			   31
bill_man                      55			   32
tangcheng_ok                  55			   33
21aspnet                      53			   34
lee576                        53			   35
norains                       51			   36
teamlet                       50			   37
manoel                        48			   38
hfahe                         48			   39
sunboy_2050                   47			   40
yjflinchong                   47			   41
tigerjb                       43			   42
mapdigit                      43			   43
axman                         42			   44
Augusdi                       39			   45
pan_tian                      39			   46
feixiaoxing                   38			   47
mylxiaoyi                     37			   48
t0nsha                        35			   49
thl789                        35			   50
qinjuning                     35			   51
kongxx                        34			   52
caimouse                      32			   53
chgaowei                      32			   54
dog250                        31			   55
ce123                         31			   56
downmoon                      30			   57
xyz_lmn                       29			   58
littletigerat                 28			   59
robinson_0612                 28			   60
iihero                        28			   61
siren0203                     28			   62
Purpleendurer                 28			   63
iukey                         27			   64
tianxiaode                    27			   65
abandonship                   27			   66
Innost                        27			   67
wangkuifeng0118               26			   68
iefreer                       26			   69
caolaosanahnu                 26			   70
hunkcai                       25			   71
chelsea                       25			   72
totogo2010                    24			   73
leftfist                      24			   74
IBM_hoojo                     24			   75
hitlion2008                   24			   76
jaminwm                       23			   77
rabbit729                     23			   78
yanghua_kobe                  23			   79
keyboardOTA                   22			   80
ccanan                        20			   81
hliq5399                      20			   82
kmyhy                         20			   83
superdont                     19			   84
xuhuojun                      19			   85
chszs                         18			   86
chinafe                       17			   87
bluishglc                     14			   88
=============以下对比楼主与第十名用户的信息===============================
ricohzhanglong                346			   10
m13666368773                  217			   14
========================================================================
difference tenthUer VS master 129			   4


 

 

 

你可能感兴趣的:(【续】自己写的实时爬取 CSDN 2012 博客之星 88位候选人排名)