不借助第三方工具(HttpUnit、HTMLParser 等)
想获得一个网站的某个页面的信息,关键是能顺利请求到该页面。某些网站通过加密或其他技术手段防止被抓取,那就很难让你得逞了。
我抓的是51job招聘列表页,问题关键是怎么查找下一页。51的是通过post方式提交表单,这时就要把所有参数查找出来,通过parameter写入请求信息中。
请求连接方法
/**
 * Requests one page of the 51job keyword-search result list with an HTTP POST.
 *
 * <p>The form fields are a fixed snapshot of the site's search form; only the page number,
 * the two keyword fields and {@code fromType} vary per call.
 *
 * @param i       page number written into the {@code curr_page} form field
 * @param keyName search keyword; URL-encoded as GBK into {@code dis_keyword} and {@code keyword}
 * @param link    not used by this method; kept so the signature stays unchanged
 * @return a GBK {@link Scanner} over the response body (the caller is responsible for closing
 *         it), or {@code null} if any step failed (the exception is logged)
 */
private Scanner openConnection (int i,String keyName,String link) {
    try {
        URL url = new URL("http://search.51job.com/jobsearch/keyword_search.php");
        // Encode the keyword once; after URL-encoding the whole form body is plain ASCII.
        String encodedKey = URLEncoder.encode(keyName, "GBK");
        // Form fields. The variable values are spliced in directly instead of being patched in
        // with String.replace: replacing "keyword=" also matches the tail of "dis_keyword=",
        // which appended the keyword to dis_keyword a second time.
        // "&degreefrom" was previously corrupted to "°reefrom", fusing it with the cotype field.
        String parameter = "postchannel=0000&stype=2&jobarea=0100&district=&address=&lonlat=&radius=" +
                "&funtype_big=0000&funtype=0000&industrytype=00&issuedate=9&keywordtype=2&dis_keyword=" + encodedKey +
                "&keyword=" + encodedKey + "&workyear=99&providesalary=99&cotype=99&degreefrom=99&jobterm=01&ord_field=0" +
                // curr_page selects the result page
                "&list_type=1&last_list_type=1&curr_page=" + i + "&last_page=1&nStart=1&start_page=&total_page=86" +
                "&jobid_list=39297991~39298287~39298722~39298729~39297918~39297800~39298262~39297331~39297238~39297080~39296848~39297361~39296644~39296315~39287153~39295409~39295407~39295397~39295396~39295391~39287385~39293469~39287417~39285861~39281595~39281853~39279955~39281274~39280683~38748545~37068616~38130945~39023955~36747022~36493173~39006183~38960955~38960944~38960615~38980334~37888484~37584999~38998054~37585073~37332619~36882505~34976909~37307284~37307262~36999896~36767409~39242127~7369258~35503114~35502793~35496087~35496083~35495350~35494140~35493224~35492320~35487346~35468080~35457510~35457504~35457501~35398467~35380047~35347719~35347637~34991677~20974922~20974918~37441300~35465051~39160193~39029414~38138399~39136977~36632495~39266845~39270060~39266835~39097249~39082877~37663952~37662532~37662480~37663986~37662626~37662589~37662556~37738455~39270625~38433053~38261468~38486743~39057636~34582292~36475553~37257361~37257567~37257262~36741386~36711006~36498218~38914431~38734212~38674569~38787188~39259469~38927584~39024252~39024230~39228632~35252232~38658258~38658243~38625335~39245388~37319651~36852389~39136912~39159440~37456013~39256295~39214509~39253898~37376056~38561452~38295890~39156937~26052225~38711016~39272058~39271701~37777885~38524663~39022301~39063658~37777523~39018693~37897821~37023954~39242449~39242399~36227979~38635974~39100175~39200749~39251242~39197848~39229735~39108206~38520680~38520612~37512047~37373955~36748357~36558807~36553946~36994069~35651002~37645149~35650457~37547299~37547226~37547191~37547135~37325202~38909563~37981021~36518439~38435329~38356348~39225954~38905834~39100737~38753876~38753837~38648131~38909881~38909871~39253871~39139848~37756802~38207471~38715097~38714739~39228968~39109760~39109531~39109511~38412880~39193350~38918885~38443045~38133816~35085561~38011368~"+
                "&jobid_count=2551&schTime=15&statCount=364" +
                "&statData=404|114|45|61|92|99|29|34|80|27|15|29|49|449|1|228|133|0|0|1|1|243|494|5|0|0|1|0|7|232|321|139|26|1|0|152|831|1|1|4|18|8|8|4|3|0|0|0|0|0|0|588|0|1|0|0|0|0|1|13|0|0|0|0|0|0|0|1|0|0|0|0|0|0|2|254|6|6|0|1|1|0|0|0|0|0|0|1|0|0|0|0|2|0|1|0|0|0|0|0|0|0|0|0|0|0|365|14|13|0|5|3|18|9|2|0|1|26|6|2|0|0|3|1|2|3|0|9|32|1|0|6|1|0|0|0|13|209|1|0|3|1|7|32|5|37|1|0|3|0|0|13|2|9|10|0|1|0|5|1|1|0|0|2"+
                "&fromType=14";
        // Encode the body once so the declared length and the bytes sent always agree
        // (the length used to be computed with GB2312 while the body was written as GBK).
        byte[] body = parameter.getBytes("GBK");

        // Open the connection and set the request headers.
        HttpURLConnection conn=(HttpURLConnection)url.openConnection();
        conn.setDoOutput(true);
        conn.setRequestMethod("POST");
        // Mimic a browser request.
        conn.setRequestProperty("Host", "search.51job.com");
        conn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded");
        conn.setRequestProperty("Content-Length", Integer.toString(body.length));
        conn.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; GTB5; .NET CLR 1.1.4322; .NET CLR 2.0.50727; Alexa Toolbar; MAXTHON 2.0)");

        // Send the form body; close the stream even if the write fails.
        OutputStream o = conn.getOutputStream();
        try {
            o.write(body);
            o.flush();
        } finally {
            o.close();
        }

        // Hand the response stream to the caller as a GBK text scanner.
        InputStream in = conn.getInputStream();
        return new Scanner(in, "GBK");
    } catch (Exception e) {
        log.error(e,e);
        return null;
    }
}
这样就可以顺利获得 该关键字 在第几页的列表信息了
做完这一步,就可以从返回的页面中解析出要查找的信息了,比如公司信息、招聘职位……
// Read the response line by line; on every line that contains the job-name marker,
// collect the target of each `<a href="...">` as an absolute URL.
while (sc.hasNextLine()) {
    String line = sc.nextLine();
    // Search each line from its start: an offset left over from a previous line
    // has no meaning for this one.
    sp = line.indexOf("class=\"jobname\" >");
    if (sp != -1) {
        sp = line.indexOf("<a href");
        while (sp != -1) {
            // sp + 9 skips `<a href="` (7 characters plus `="`) to the first character of the target.
            int ep = line.indexOf("\"", sp + 9);
            if (ep == -1) {
                // No closing quote on this line: give up on the line.
                // (`continue` here re-tested the unchanged sp and looped forever.)
                sp = -1;
                break;
            }
            link = line.substring(sp + 9, ep);
            URL userURL = new URL(SITE_BASE + link);
            urls.add(userURL);
            // Move on to the next anchor on the same line, if any.
            sp = line.indexOf("<a href", sp + 1);
        }
    }
}
其他网站如果是 get 方式就更简单了。建议用 Firefox 的 Firebug 查看页面源代码和请求,可以方便地查到请求的方式和参数信息。
二、查找 百度,谷歌上面的某个关键字的排行也可以用此办法
百度
/**
 * Fetches one Baidu result page for a keyword with a plain GET.
 *
 * @param i       value written into the {@code pn} query parameter (00, 10, 20, 30 ... per page)
 * @param keyName search keyword; URL-encoded as GBK into the {@code wd} query parameter
 * @return a GBK {@link Scanner} over the response body, or {@code null} if any step failed
 *         (the exception is logged)
 */
private Scanner openConnection (int i,String keyName) {
    try {
        // Query template; the empty wd= and pn= slots are filled in below.
        String template = "http://www.baidu.com/s?lm=0&si=&rn=10&ie=gb2312&ct=0&wd=&pn=&ver=0&cl=3&uim=2&usm=0";
        String encodedKeyword = URLEncoder.encode(keyName, "GBK");
        String address = template
                .replace("wd=", "wd=" + encodedKeyword)   // keyword
                .replace("pn=", "pn=" + i);               // page offset
        InputStream body = new URL(address).openStream();
        return new Scanner(body, "GBK");
    } catch (Exception e) {
        log.error(e,e);
        return null;
    }
}
谷歌
/**
 * Fetches one Google result page for a keyword with a GET request.
 *
 * @param i       value written into the {@code start} query parameter
 * @param keyName search keyword; URL-encoded as GBK into the {@code q} query parameter
 * @param link    not used by this method; kept so the signature stays unchanged
 * @return a GBK {@link Scanner} over the response body (the caller is responsible for closing
 *         it), or {@code null} if any step failed (the exception is logged)
 */
private Scanner openConnection (int i,String keyName,String link) {
    try {
        // The local used to be named `link` as well, which redeclares the parameter and
        // does not compile; it now has its own name.
        // NOTE(review): the keyword is GBK-encoded but the URL carries no input-encoding
        // parameter -- confirm the server decodes it as intended.
        String address = "http://www.google.cn/search?hl=zh-CN&newwindow=1&q=&start=&sa=N";
        address = address.replace("q=", "q="+ URLEncoder.encode(keyName, "GBK"));
        address = address.replace("start=", "start="+String.valueOf(i));
        URL url = new URL(address);
        URLConnection con = url.openConnection();
        // Set a browser-like User-Agent; the original note says Google otherwise
        // does not accept the request.
        con.setRequestProperty("User-Agent","Mozilla/4.0 (compatible; MSIE 5.0; Windows XP; DigExt)");
        InputStream in = con.getInputStream();
        return new Scanner(in, "GBK");
    } catch (Exception e) {
        log.error(e,e);
        return null;
    }
}