抓取网页页面信息

不借助第三方工具(httpUnit,htmlparse)
想获得一个网站的某个页面的信息,关键是能顺利请求到该页面。某些网站通过加密或其他技术手段防止被抓取,那就很难让你得逞了。
我抓的是51job招聘列表页,问题的关键是怎么请求下一页。51job是通过post方式提交表单来翻页的,这时就要把表单的所有参数查找出来,拼成parameter写入请求体中。

请求连接方法
	/**
	 * Posts the 51job keyword-search form and returns a scanner over the result page.
	 *
	 * <p>The form body is a template captured from the site's own search form; only the
	 * page number, the {@code fromType} field and the two keyword fields are filled in.
	 *
	 * @param i       page number written into the {@code curr_page} form field
	 * @param keyName search keyword; URL-encoded as GBK before being sent
	 * @param link    not used by this method; kept so existing callers still compile
	 * @return a GBK {@link Scanner} over the response body (the caller should close it),
	 *         or {@code null} if anything fails — the failure is logged
	 */
	private Scanner openConnection (int i,String keyName,String link) {
		HttpURLConnection conn = null;
		try {
			URL url = new URL("http://search.51job.com/jobsearch/keyword_search.php");

			// Form template with empty placeholders for curr_page, dis_keyword, keyword and fromType.
			String   parameter   =  "postchannel=0000&stype=2&jobarea=0100&district=&address=&lonlat=&radius=" +
			"&funtype_big=0000&funtype=0000&industrytype=00&issuedate=9&keywordtype=2&dis_keyword=" +
			"&keyword=&workyear=99&providesalary=99&cotype=99&degreefrom=99&jobterm=01&ord_field=0" +
			"&list_type=1&last_list_type=1&curr_page=&last_page=1&nStart=1&start_page=&total_page=86" +
			"&jobid_list=39297991~39298287~39298722~39298729~39297918~39297800~39298262~39297331~39297238~39297080~39296848~39297361~39296644~39296315~39287153~39295409~39295407~39295397~39295396~39295391~39287385~39293469~39287417~39285861~39281595~39281853~39279955~39281274~39280683~38748545~37068616~38130945~39023955~36747022~36493173~39006183~38960955~38960944~38960615~38980334~37888484~37584999~38998054~37585073~37332619~36882505~34976909~37307284~37307262~36999896~36767409~39242127~7369258~35503114~35502793~35496087~35496083~35495350~35494140~35493224~35492320~35487346~35468080~35457510~35457504~35457501~35398467~35380047~35347719~35347637~34991677~20974922~20974918~37441300~35465051~39160193~39029414~38138399~39136977~36632495~39266845~39270060~39266835~39097249~39082877~37663952~37662532~37662480~37663986~37662626~37662589~37662556~37738455~39270625~38433053~38261468~38486743~39057636~34582292~36475553~37257361~37257567~37257262~36741386~36711006~36498218~38914431~38734212~38674569~38787188~39259469~38927584~39024252~39024230~39228632~35252232~38658258~38658243~38625335~39245388~37319651~36852389~39136912~39159440~37456013~39256295~39214509~39253898~37376056~38561452~38295890~39156937~26052225~38711016~39272058~39271701~37777885~38524663~39022301~39063658~37777523~39018693~37897821~37023954~39242449~39242399~36227979~38635974~39100175~39200749~39251242~39197848~39229735~39108206~38520680~38520612~37512047~37373955~36748357~36558807~36553946~36994069~35651002~37645149~35650457~37547299~37547226~37547191~37547135~37325202~38909563~37981021~36518439~38435329~38356348~39225954~38905834~39100737~38753876~38753837~38648131~38909881~38909871~39253871~39139848~37756802~38207471~38715097~38714739~39228968~39109760~39109531~39109511~38412880~39193350~38918885~38443045~38133816~35085561~38011368~"+
			"&jobid_count=2551&schTime=15&statCount=364" +
			"&statData=404|114|45|61|92|99|29|34|80|27|15|29|49|449|1|228|133|0|0|1|1|243|494|5|0|0|1|0|7|232|321|139|26|1|0|152|831|1|1|4|18|8|8|4|3|0|0|0|0|0|0|588|0|1|0|0|0|0|1|13|0|0|0|0|0|0|0|1|0|0|0|0|0|0|2|254|6|6|0|1|1|0|0|0|0|0|0|1|0|0|0|0|2|0|1|0|0|0|0|0|0|0|0|0|0|0|365|14|13|0|5|3|18|9|2|0|1|26|6|2|0|0|3|1|2|3|0|9|32|1|0|6|1|0|0|0|13|209|1|0|3|1|7|32|5|37|1|0|3|0|0|13|2|9|10|0|1|0|5|1|1|0|0|2"+
			"&fromType=";

			// Every placeholder is matched together with its leading '&'. Matching the bare
			// "keyword=" would also hit the tail of "dis_keyword=" and append the keyword to
			// that field a second time.
			String encodedKey = URLEncoder.encode(keyName, "GBK");
			parameter = parameter.replace("&curr_page=", "&curr_page=" + String.valueOf(i));
			parameter = parameter.replace("&fromType=", "&fromType=" + String.valueOf(14));
			parameter = parameter.replace("&dis_keyword=", "&dis_keyword=" + encodedKey);
			parameter = parameter.replace("&keyword=", "&keyword=" + encodedKey);

			// Encode once so the declared length and the bytes actually sent cannot disagree.
			byte[] body = parameter.getBytes("GBK");

			conn = (HttpURLConnection) url.openConnection();
			conn.setDoOutput(true);
			conn.setRequestMethod("POST");
			// Header values copied from a browser request so the request looks like a normal form post.
			conn.setRequestProperty("Host", "search.51job.com");
			conn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded");
			conn.setRequestProperty("Content-Length", Integer.toString(body.length));
			conn.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; GTB5; .NET CLR 1.1.4322; .NET CLR 2.0.50727; Alexa Toolbar; MAXTHON 2.0)");

			OutputStream out = conn.getOutputStream();
			try {
				out.write(body);
				out.flush();
			} finally {
				// Close the request stream even when the write fails.
				out.close();
			}

			// The response is decoded as GBK; the caller reads it line by line.
			return new Scanner(conn.getInputStream(), "GBK");
		} catch (Exception e) {
			log.error(e,e);
			if (conn != null) {
				// Release the connection on failure; nothing is handed back to the caller.
				conn.disconnect();
			}
			return null;
		}
	}

这样就可以顺利获得该关键字第 i 页的列表信息了。
做完这一步,就可以通过解析返回的页面找到想要的信息了,比如公司信息、招聘职位……
// Walk the response line by line and collect every link found on a line that
// contains the job-name cell marker. Uses sc, sp, link, urls and SITE_BASE from
// the surrounding code.
while (sc.hasNextLine()) {
	String line = sc.nextLine();

	// Search each line from its start; a position left over from the previous
	// line has no meaning here.
	sp = line.indexOf("class=\"jobname\" >");
	if (sp == -1) {
		continue;
	}

	sp = line.indexOf("<a href");
	while (sp != -1) {
		// sp + 9 skips over `<a href="` so ep lands on the closing quote.
		int ep = line.indexOf("\"", sp + 9);
		if (ep == -1) {
			// Unterminated href: stop scanning this line. A `continue` here would
			// re-test the unchanged sp and never terminate.
			sp = -1;
			break;
		}
		link = line.substring(sp + 9, ep);
		URL userURL = new URL(SITE_BASE + link);
		urls.add(userURL);
		sp = line.indexOf("<a href", sp + 1);
	}
}


其他网站如果是get方式就更简单了。建议用firefox的firebug查看页面的请求,可以方便地查到请求的方式和提交的参数。

二、查找 百度,谷歌上面的某个关键字的排行也可以用此办法
百度
	/**
	 * Runs a Baidu search for the keyword and returns a scanner over the result page.
	 *
	 * @param i       value written into the {@code pn} query parameter (result offset)
	 * @param keyName search keyword; URL-encoded as GBK
	 * @return a GBK {@link Scanner} over the response, or {@code null} if the request
	 *         fails — the failure is logged
	 */
	private Scanner openConnection (int i,String keyName) {
		try {
			// URL template with empty wd= (keyword) and pn= (offset) placeholders.
			final String template = "http://www.baidu.com/s?lm=0&si=&rn=10&ie=gb2312&ct=0&wd=&pn=&ver=0&cl=3&uim=2&usm=0";
			final String encodedKeyword = URLEncoder.encode(keyName, "GBK");

			final String target = template
					.replace("wd=", "wd=" + encodedKeyword)
					.replace("pn=", "pn=" + i);

			return new Scanner(new URL(target).openStream(), "GBK");
		} catch (Exception e) {
			log.error(e,e);
			return null;
		}
	}


谷歌
	/**
	 * Runs a Google search for the keyword and returns a scanner over the result page.
	 *
	 * @param i       value written into the {@code start} query parameter (result offset)
	 * @param keyName search keyword; URL-encoded as GBK
	 * @param link    not used by this method; kept so existing callers still compile
	 * @return a GBK {@link Scanner} over the response, or {@code null} if the request
	 *         fails — the failure is logged
	 */
	private Scanner openConnection (int i,String keyName,String link) {
		
		try {
			// The local is named searchUrl: declaring another `link` here would clash
			// with the parameter of the same name and fail to compile.
			String searchUrl = "http://www.google.cn/search?hl=zh-CN&newwindow=1&q=&start=&sa=N";

			// NOTE(review): the URL has no ie=/oe= parameters while the keyword and the
			// response are both treated as GBK — confirm that matches the server.
			searchUrl = searchUrl.replace("q=", "q="+ URLEncoder.encode(keyName, "GBK"));
			searchUrl = searchUrl.replace("start=", "start="+String.valueOf(i));
			URL url = new URL(searchUrl);
			URLConnection con = url.openConnection();
			// Send a browser-like User-Agent with the request.
			con.setRequestProperty("User-Agent","Mozilla/4.0 (compatible; MSIE 5.0; Windows XP; DigExt)");
			InputStream in = con.getInputStream();
			Scanner sc = new Scanner(in, "GBK");
			return sc;
		} catch (Exception e) {
			log.error(e,e);
			return null;
		}
	}

你可能感兴趣的:(XP,百度,IE,Firebug,招聘)