After finishing a crawler tutorial I wanted to crawl our school's educational administration system, but the login page turned out to require a CAPTCHA. So I looked into crawling a CAPTCHA-protected site with Jsoup.
The overall approach is as follows (note that __VIEWSTATE changes constantly, so it has to be re-extracted from every page and carried along to the next request):
1. Crawl the site's home page first. Our school's system is built on ASP.NET, so the __VIEWSTATE of every page has to be extracted. Crawling the home page also yields a cookie (ASP.NET_SessionId).
2. Request the CAPTCHA while carrying the __VIEWSTATE and ASP.NET_SessionId. (There are tools that recognize CAPTCHAs automatically; here I simply download the image to disk and ask the user to type it in.) The CAPTCHA request must carry the cookie so the server ties the image to the current session; a CAPTCHA downloaded without the session id will be rejected when submitted.
3. Log in with the student number, password and CAPTCHA. The login request also has to carry a few other form parameters (they must be present even when their values are empty).
4. After logging in, the grades cannot be fetched directly; the crawler first has to request the post-login main page to obtain its __VIEWSTATE.
5. Once that page has been crawled, the grade pages can be fetched. The grades are collected and finally written out to an HTML file.
(Throughout the crawl keep an eye on __VIEWSTATE: it has to be extracted from every page, where it sits in a hidden input field. Also send a Referer header with each request, i.e. which page you came from, to get past the hotlink protection. A minimal sketch of these two details follows.)
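The note above boils down to something like the following Jsoup snippet (a minimal sketch only; the real classes below also carry cookies and POST form data):

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class ViewStateSketch {
    public static void main(String[] args) throws Exception {
        // Fetch a page while sending a Referer header, then read __VIEWSTATE from its hidden input.
        Connection conn = Jsoup.connect("http://newjwc.tyust.edu.cn/")
                .header("Referer", "http://newjwc.tyust.edu.cn/");
        Document doc = conn.get();
        // select(...).first() may be null on a non-ASP.NET page; the real code assumes the field exists
        String viewState = doc.select("input[name='__VIEWSTATE']").first().attr("value");
        System.out.println(viewState);
    }
}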
The code follows:
1. The crawler's entry point
package cn.qlq.craw.JsoupCrawJWXT;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map.Entry;
import java.util.Scanner;

/**
 * Main scheduler of the crawler (entry point for crawling the educational administration system)
 *
 * @author liqiang
 */
public class MainClass {
    public static void main(String[] args) {
        // Read the student number and password
        System.out.print("请输入你要查询学号:");
        Scanner sc = new Scanner(System.in);
        String xuehao = sc.next();
        System.out.print("请输入密码:");
        String password = sc.next();
        // Console con = System.console();
        // String pswd = new String(con.readPassword()); // readPassword() returns a char array, hence new String(...)
        try {
            DownloadLoginfo downloadLoginfo = new DownloadLoginfo();
            LoginClass loginClass = new LoginClass();
            GradeOutput gradeOutput = new GradeOutput();
            // 1. Visit the home page to obtain the CAPTCHA and the __VIEWSTATE
            downloadLoginfo.getLogInfo();
            // 2. Log in
            loginClass.login(downloadLoginfo, xuehao, password);
            for (Entry<String, String> entry : loginClass.getCookies().entrySet()) {
                System.out.println("key:" + entry.getKey() + ";value" + entry.getValue());
            }
            CrawGrade crawGrade = new CrawGrade();
            // 3. Crawl the page that precedes the grade page (the post-login main page)
            crawGrade.crawGradeLastPage(downloadLoginfo.getCookies(), downloadLoginfo.getViewState(), xuehao);
            List<String> condition = geneQueryCondition();
            // 4. Crawl the grades school year by school year
            for (String xuenian : condition) {
                // 4.1 Crawl one grade page
                String html_content = crawGrade.crawGrade(xuenian, "2", downloadLoginfo.getCookies(),
                        downloadLoginfo.getViewState(), xuehao);
                gradeOutput.collectGrade(html_content);
            }
            // 5. Write the collected data to an HTML file
            gradeOutput.outputDatas2Html();
        } catch (IOException e) {
            System.out.println("无法连接学校服务器");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Build the list of school years to query
     *
     * @return
     */
    public static List<String> geneQueryCondition() {
        List<String> condition = new ArrayList<String>();
        condition.add("2014-2015");
        condition.add("2015-2016");
        condition.add("2016-2017");
        condition.add("2017-2018");
        return condition;
    }
}
2. Crawl the school's home page to obtain __VIEWSTATE and the cookie
package cn.qlq.craw.JsoupCrawJWXT;

import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;

import org.jsoup.Connection;
import org.jsoup.Connection.Method;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

/**
 * Visits the home page and saves the login info (__VIEWSTATE and cookie)
 *
 * @author liqiang
 */
public class DownloadLoginfo {
    /**
     * Cookies returned by the first request (inspection shows only one cookie comes back: ASP.NET_SessionId)
     */
    private Map<String, String> cookies = null;
    /**
     * __VIEWSTATE, the validation value the system expects back
     */
    private String viewState = null;

    public DownloadLoginfo() {
        this.cookies = new HashMap<String, String>();
        this.viewState = "";
    }

    /**
     * Obtain the login info: simply visit the main page and grab a __VIEWSTATE and a cookie
     */
    public void getLogInfo() throws Exception {
        String urlLogin = "http://newjwc.tyust.edu.cn/";
        Connection connect = Jsoup.connect(urlLogin);
        // Forge the request headers
        connect.header("Accept", "application/json, text/javascript, */*; q=0.01").header("Accept-Encoding",
                "gzip, deflate");
        connect.header("Accept-Language", "zh-CN,zh;q=0.9").header("Connection", "keep-alive");
        connect.header("Content-Length", "213").header("Content-Type",
                "application/x-www-form-urlencoded; charset=UTF-8");
        connect.header("Host", "newjwc.tyust.edu.cn").header("Referer", "http://newjwc.tyust.edu.cn/");
        connect.header("User-Agent",
                "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36")
                .header("X-Requested-With", "XMLHttpRequest");
        // Execute the request and get the response
        Response res = connect.ignoreContentType(true).method(Method.POST).execute();
        // Save the returned cookies
        this.cookies = res.cookies();
        for (Entry<String, String> entry : cookies.entrySet()) {
            System.out.println(entry.getKey() + "-" + entry.getValue());
        }
        // Get the response body
        String body = res.body();
        // Extract the __VIEWSTATE from it
        this.getViewState(body);
        // Download the CAPTCHA with the helper class
        JsoupDoloadPicture.downloadImg("http://newjwc.tyust.edu.cn/CheckCode.aspx", cookies);
    }

    /**
     * Extract __VIEWSTATE from the page
     *
     * @return
     */
    public String getViewState(String htmlContent) {
        Document document = Jsoup.parse(htmlContent);
        Element ele = document.select("input[name='__VIEWSTATE']").first();
        String value = ele.attr("value");
        // Remember the viewState
        this.viewState = value;
        return value;
    }

    public Map<String, String> getCookies() {
        return cookies;
    }

    public void setCookies(Map<String, String> cookies) {
        this.cookies = cookies;
    }

    public String getViewState() {
        return viewState;
    }

    public void setViewState(String viewState) {
        this.viewState = viewState;
    }
}
3. Request the CAPTCHA (carrying the cookie) and download it to disk
package cn.qlq.craw.JsoupCrawJWXT;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Map;

import org.apache.commons.io.FileUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;

/**
 * Downloads the CAPTCHA to disk with Jsoup, carrying the cookie
 * (the cookie is mandatory, otherwise the downloaded CAPTCHA is useless)
 *
 * @author liqiang
 */
public class JsoupDoloadPicture {
    /**
     * Download the CAPTCHA image, carrying the cookies
     *
     * @param url
     * @param cookies
     * @throws IOException
     */
    public static void downloadImg(String url, Map<String, String> cookies) throws IOException {
        Connection connect = Jsoup.connect(url);
        connect.cookies(cookies); // carry the cookies when fetching the image
        connect.timeout(5 * 10000);
        Connection.Response response = connect.ignoreContentType(true).execute();
        byte[] img = response.bodyAsBytes();
        System.out.println(img.length);
        // Read the target directory from the properties file
        String directory = ResourcesUtil.getValue("path", "file");
        savaImage(img, directory, "yzm.png");
    }

    /**
     * Save the image to disk
     *
     * @param img
     * @param filePath
     * @param fileName
     */
    public static void savaImage(byte[] img, String filePath, String fileName) {
        BufferedOutputStream bos = null;
        FileOutputStream fos = null;
        File file = null;
        File dir = new File(filePath);
        try {
            // If the path exists but is not a directory, delete it, then (re)create the directory
            if (dir.exists() && !dir.isDirectory()) {
                FileUtils.deleteQuietly(dir);
            }
            dir.mkdir();
            file = new File(filePath + "\\" + fileName);
            fos = new FileOutputStream(file);
            bos = new BufferedOutputStream(fos);
            bos.write(img);
            System.out.println("验证码已经下载到:" + filePath);
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (bos != null) {
                try {
                    bos.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (fos != null) {
                try {
                    fos.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}
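As a side note, the stream handling in savaImage could be written more compactly with java.nio.file; a minimal sketch with the same intent (create the directory if needed, then write the bytes), not the version used in the project:

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

public class SaveImageSketch {
    // Write the CAPTCHA bytes to disk; Files.write replaces the manual stream/close bookkeeping.
    public static void saveImage(byte[] img, String filePath, String fileName) throws IOException {
        Path dir = Paths.get(filePath);
        Files.createDirectories(dir);            // also creates missing parent directories
        Files.write(dir.resolve(fileName), img); // creates or overwrites the file
    }
}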
4. The login class
package cn.qlq.craw.JsoupCrawJWXT;

import java.util.Map;
import java.util.Map.Entry;
import java.util.Scanner;

import org.jsoup.Connection;
import org.jsoup.Connection.Method;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;

/**
 * Login class (posts to the login page and records the returned cookies)
 *
 * @author liqiang
 */
public class LoginClass {
    /**
     * Cookies returned by the login request
     */
    private Map<String, String> cookies = null;

    /**
     * Simulate the login to obtain the cookie / session id
     */
    public void login(DownloadLoginfo downloadLoginfo, String xuehao, String mima) throws Exception {
        String urlLogin = "http://newjwc.tyust.edu.cn/default2.aspx";
        Connection connect = Jsoup.connect(urlLogin);
        connect.timeout(5 * 100000);
        // Forge the request headers
        connect.header("Content-Length", "213").header("Content-Type", "application/x-www-form-urlencoded");
        connect.header("Host", "newjwc.tyust.edu.cn").header("Referer",
                "http://newjwc.tyust.edu.cn/xscjcx.aspx?xh=" + xuehao + "&xm=%C7%C7%C0%FB%C7%BF&gnmkdm=N121613");
        connect.header("User-Agent",
                "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36");
        // Ask the user for the CAPTCHA
        System.out.println("-----------请输入验证码---------");
        Scanner sc = new Scanner(System.in);
        String yzm = sc.next();
        sc.close();
        // Attach the login form data (the empty values must still be present)
        connect.data("txtUserName", xuehao).data("__VIEWSTATE", downloadLoginfo.getViewState()).data("TextBox2", mima)
                .data("Textbox1", "").data("RadioButtonList1", "").data("Button1", "").data("lbLanguage", "")
                .data("hidPdrs", "").data("hidsc", "").data("txtSecretCode", yzm);
        connect.cookies(downloadLoginfo.getCookies());
        // Execute the request and get the response
        Response res = connect.ignoreContentType(true).method(Method.POST).execute();
        // Save the returned cookies
        this.cookies = res.cookies();
        for (Entry<String, String> entry : cookies.entrySet()) {
            System.out.println(entry.getKey() + "-" + entry.getValue());
        }
        System.out.println("---------获取的登录之后的页面-----------");
        String body = res.body(); // the page returned after logging in
        System.out.println(body);
    }

    public Map<String, String> getCookies() {
        return cookies;
    }

    public void setCookies(Map<String, String> cookies) {
        this.cookies = cookies;
    }
}
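One thing worth checking here: depending on the server, res.cookies() after the login POST may come back empty, because ASP.NET often keeps using the ASP.NET_SessionId issued on the first visit instead of setting a new cookie. If that happens, one option is to merge the cookies from step 1 with whatever the login response returned and carry the combined map afterwards. A small illustrative helper (not part of the original project):

import java.util.HashMap;
import java.util.Map;

public final class CookieUtil {
    // Combine the cookies from the first visit with whatever the login response returned (which may be nothing).
    public static Map<String, String> merge(Map<String, String> sessionCookies,
                                            Map<String, String> loginCookies) {
        Map<String, String> all = new HashMap<String, String>(sessionCookies);
        if (loginCookies != null) {
            all.putAll(loginCookies); // values from the login response win on key clashes
        }
        return all;
    }
}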
5. Crawl the post-login main page and the grades
package cn.qlq.craw.JsoupCrawJWXT;

import java.io.IOException;
import java.util.Map;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

/**
 * Class that crawls the grades
 *
 * @author liqiang
 */
public class CrawGrade {
    private String viewState;

    /**
     * Helper that extracts the __VIEWSTATE from a page
     *
     * @param html
     * @return
     */
    public String getViewState(String html) {
        Document document = Jsoup.parse(html);
        Element ele = document.select("input[name='__VIEWSTATE']").first();
        String value = ele.attr("value");
        // Remember the viewState
        this.viewState = value;
        return value;
    }

    /**
     * Crawl the page that precedes the grade page (i.e. the page right after login)
     *
     * @param cookies
     * @param viewStata
     * @param xuehao
     * @return
     * @throws IOException
     */
    public String crawGradeLastPage(Map<String, String> cookies, String viewStata, String xuehao) throws IOException {
        String urlLogin = "http://newjwc.tyust.edu.cn/xscjcx.aspx?xh=" + xuehao
                + "&xm=%C7%C7%C0%FB%C7%BF&gnmkdm=N121613";
        Connection connect = Jsoup.connect(urlLogin);
        connect.timeout(5 * 100000);
        // Forge the request headers
        connect.header("Content-Length", "74642").header("Content-Type", "application/x-www-form-urlencoded");
        connect.header("Host", "newjwc.tyust.edu.cn").header("Referer",
                "http://newjwc.tyust.edu.cn/xscjcx.aspx?xh=201420020123&xm=%C7%C7%C0%FB%C7%BF&gnmkdm=N121613");
        connect.header("User-Agent",
                "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36");
        // Attach the form data
        connect.data("xh", "201420020123")
                .data("xm", viewStata)
                .data("hidLanguage", "")
                .data("gnmkdm", "N121613");
        // Set the cookies
        connect.cookies(cookies);
        Document document = connect.post();
        System.out.println("-----------爬到的成绩的上一个页面--------------");
        String html = document.toString();
        System.out.println(html);
        // Refresh the viewState from this page
        this.getViewState(html);
        return html;
    }

    /**
     * Crawl one grade page
     */
    public String crawGrade(String xuenian, String xueqi, Map<String, String> cookies, String viewStata, String xuehao)
            throws IOException {
        String urlLogin = "http://newjwc.tyust.edu.cn/xscjcx.aspx?xh=" + xuehao
                + "&xm=%C7%C7%C0%FB%C7%BF&gnmkdm=N121613";
        Connection connect = Jsoup.connect(urlLogin);
        connect.timeout(5 * 100000);
        // Forge the request headers
        connect.header("Accept",
                "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8")
                .header("Accept-Encoding", "gzip, deflate");
        connect.header("Accept-Language", "zh-CN,zh;q=0.9").header("Connection", "keep-alive");
        connect.header("Content-Length", "74642").header("Content-Type", "application/x-www-form-urlencoded");
        connect.header("Host", "newjwc.tyust.edu.cn").header("Referer",
                "http://newjwc.tyust.edu.cn/xscjcx.aspx?xh=201420020123&xm=%C7%C7%C0%FB%C7%BF&gnmkdm=N121613");
        connect.header("User-Agent",
                "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36");
        // Attach the form data (the year/semester dropdown values plus the current __VIEWSTATE)
        connect.data("__EVENTTARGET", "")
                .data("__EVENTARGUMENT", "")
                .data("__VIEWSTATE", this.viewState)
                .data("hidLanguage", "")
                .data("ddlXN", xuenian)
                .data("ddlXQ", xueqi)
                .data("btn_xn", "")
                .data("ddl_kcxz", "");
        connect.cookies(cookies);
        Document document = connect.post();
        System.out.println("-----------爬到的成绩的页面--------------");
        String html = document.toString();
        // Refresh the viewState
        this.getViewState(html);
        System.out.println(html);
        return html;
    }

    public void setViewState(String viewState) {
        this.viewState = viewState;
    }
}
6. The class that collects the grades
package cn.qlq.craw.JsoupCrawJWXT;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Collects the grades and writes them out
 *
 * @author liqiang
 */
@SuppressWarnings("all")
public class GradeOutput {
    /**
     * Collection that holds the grades
     */
    private List
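The rest of GradeOutput is cut off here; the full class is in the git repository linked at the end. Purely for illustration, grade rows could be pulled out of a result page with Jsoup along these lines (the table id Datagrid1 below is an assumption about the page, not something taken from the original code):

import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class GradeRowSketch {
    // Illustrative only: parse grade rows out of a crawled HTML result page.
    public static List<List<String>> collectRows(String htmlContent) {
        Document doc = Jsoup.parse(htmlContent);
        Elements rows = doc.select("#Datagrid1 tr"); // assumed table id
        List<List<String>> grades = new ArrayList<List<String>>();
        for (Element row : rows) {
            List<String> cells = new ArrayList<String>();
            for (Element td : row.select("td")) {
                cells.add(td.text().trim());
            }
            if (!cells.isEmpty()) {
                grades.add(cells);
            }
        }
        return grades;
    }
}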
path.properties (configures where the CAPTCHA image and the final grade report are written)
#fileToSave
#yzm
file=C:\\Users\\liqiang\\Desktop
The utility class that reads the configuration file above:
package cn.qlq.craw.JsoupCrawJWXT;

import java.io.Serializable;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.ResourceBundle;
import java.util.Set;

/**
 * Utility class for reading resource bundles / properties files
 */
public class ResourcesUtil implements Serializable {
    private static final long serialVersionUID = -7657898714983901418L;

    /**
     * System language, defaults to Chinese (zh)
     */
    public static final String LANGUAGE = "zh";

    /**
     * System country, defaults to China (CN)
     */
    public static final String COUNTRY = "CN";

    private static Locale getLocale() {
        Locale locale = new Locale(LANGUAGE, COUNTRY);
        return locale;
    }

    /**
     * Read a value from a resource bundle for the configured language and country
     *
     * @param baseName resource bundle name
     * @param section  key
     * @return the value
     */
    private static String getProperties(String baseName, String section) {
        String retValue = "";
        try {
            Locale locale = getLocale();
            ResourceBundle rb = ResourceBundle.getBundle(baseName, locale);
            retValue = (String) rb.getObject(section);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return retValue;
    }

    /**
     * Read a value from a resource bundle by key
     *
     * @param fileName resource bundle name
     * @param key      key
     * @return the value for the key
     */
    public static String getValue(String fileName, String key) {
        String value = getProperties(fileName, key);
        return value;
    }

    public static List<String> gekeyList(String baseName) {
        Locale locale = getLocale();
        ResourceBundle rb = ResourceBundle.getBundle(baseName, locale);
        List<String> reslist = new ArrayList<String>();
        Set<String> keyset = rb.keySet();
        for (Iterator<String> it = keyset.iterator(); it.hasNext();) {
            String lkey = (String) it.next();
            reslist.add(lkey);
        }
        return reslist;
    }

    /**
     * Read a value from a resource bundle by key and format it with MessageFormat
     *
     * @param fileName resource bundle name
     * @param key      key
     * @param objs     format arguments
     * @return the formatted value
     */
    public static String getValue(String fileName, String key, Object[] objs) {
        String pattern = getValue(fileName, key);
        String value = MessageFormat.format(pattern, objs);
        return value;
    }

    public static void main(String[] args) {
        System.out.println(getValue("resources.messages", "101", new Object[] { 100, 200 }));
        // Determine the locale from the OS environment
        /*
         * Locale locale = Locale.getDefault();
         * System.out.println(locale.getCountry());  // country code
         * System.out.println(locale.getLanguage()); // language code
         * // Load the i18n resources (messages.properties under the classpath's resources directory;
         * // in a Chinese environment messages_zh_CN.properties is looked up first)
         * ResourceBundle rb = ResourceBundle.getBundle("resources.messages", locale);
         * String retValue = rb.getString("101"); // "101" is a key in messages.properties
         * System.out.println(retValue);
         * // If the value contains {} placeholders, format it with MessageFormat;
         * // the Object[] carries one argument per placeholder
         * String value = MessageFormat.format(retValue, new Object[] { 100, 200 });
         * System.out.println(value);
         */
    }
}
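Usage in this project is then a single call: ResourceBundle resolves the base name "path" to path.properties on the classpath (a path_zh_CN.properties would take precedence if it existed). A tiny demo, assuming it sits in the same package as ResourcesUtil:

public class PathConfigDemo {
    public static void main(String[] args) {
        // Reads the "file" key from path.properties, e.g. C:\Users\liqiang\Desktop as configured above.
        String directory = ResourcesUtil.getValue("path", "file");
        System.out.println("CAPTCHA image and grade report will be written to: " + directory);
    }
}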
Git repository: https://github.com/qiao-zhi/javaCraw