Jsoup爬取带登录验证码的网站

 

  今天学完爬虫之后想着爬一下我们学校的教务系统,可是发现登录的时候有验证码。因此研究了如何用Jsoup爬取带验证码的网站:

 

大体的思路是:(需要注意的是__VIEWSTATE一直变化,所以我们每个页面都需要重新获取并带着爬取下一个页面)

 

  1.先爬取网站的主页,由于我们学校的网站是ASP.NET,所以需要爬到每个网页的__VIEWSTATE。同时爬取主页也可以获得一个cookie(ASP.NET_SessionId)

  2.带着主页返回的cookie(ASP.NET_SessionId)爬取验证码。(网上说有专门识别验证码的软件,在这里我只是把验证码下载到本地,然后由用户手动输入验证码)获取验证码图片的时候必须带着cookie,用来标识这是本次session请求的验证码;如果不带sessionId,下载到的验证码即使输入正确也无效。

  3.输入用户名,密码和验证码登录系统,登录系统需要携带一些其他参数(值为空也需要携带)。

  4.登录之后不能直接爬取成绩,需要先爬取登录成功之后的主页面,获取新的__VIEWSTATE。

  5.爬完登录成功的主页之后就可以进行爬取成绩,将爬到的成绩收集起来,最后输出到html页面中。

 

(在这个爬虫的过程中需要注意__VIEWSTATE,每个页面都需要获取这个值,这个值是放在input隐藏域中。另外爬取过程中请求头需要携带Referer参数(也就是表示你从哪个页面过来的),用于通过服务器的防盗链检查)

 

 

下面是代码:

1.爬虫的入口

package cn.qlq.craw.JsoupCrawJWXT;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map.Entry;
import java.util.Scanner;

/**
 * Entry point of the crawler: drives the whole "log in to the academic
 * affairs system and export the grades" workflow.
 *
 * @author liqiang
 *
 */
public class MainClass {

    /**
     * Reads the student number and password from standard input, logs in,
     * crawls the grade page once per school year and finally writes the
     * collected grades to an HTML file.
     *
     * @param args unused
     */
    public static void main(String[] args) {

        // Read the student number and the password.
        System.out.print("请输入你要查询学号:");
        // Deliberately never closed: closing a Scanner closes System.in, and
        // LoginClass.login still has to read the captcha from it afterwards.
        Scanner sc = new Scanner(System.in);
        String xuehao = sc.next();
        System.out.print("请输入密码:");
        String password = sc.next();

        try {
            DownloadLoginfo downloadLoginfo = new DownloadLoginfo();
            LoginClass loginClass = new LoginClass();
            GradeOutput gradeOutput = new GradeOutput();
            // 1. Visit the home page: obtains the session cookie and __VIEWSTATE
            //    and downloads the captcha image.
            downloadLoginfo.getLogInfo();
            // 2. Log in.
            loginClass.login(downloadLoginfo, xuehao, password);
            // Iterating as Object keeps this loop valid whether getCookies()
            // is declared as a raw Map or as Map<String, String>.
            for (Object item : loginClass.getCookies().entrySet()) {
                Entry<?, ?> entry = (Entry<?, ?>) item;
                System.out.println("key:" + entry.getKey() + ";value" + entry.getValue());
            }
            CrawGrade crawGrade = new CrawGrade();
            // 3. Crawl the page that precedes the grade query.
            crawGrade.crawGradeLastPage(downloadLoginfo.getCookies(), downloadLoginfo.getViewState(), xuehao);
            List<String> condition = geneQueryCondition();
            // 4. Crawl the grades one school year at a time.
            for (String xuenian : condition) {
                String htmlContent = crawGrade.crawGrade(xuenian, "2", downloadLoginfo.getCookies(),
                        downloadLoginfo.getViewState(), xuehao);
                gradeOutput.collectGrade(htmlContent);
            }
            // 5. Write everything that was collected to an HTML file.
            gradeOutput.outputDatas2Html();
        } catch (IOException e) {
            System.out.println("无法连接学校服务器");
            // Keep the cause visible; an IOException may also come from writing the report.
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Builds the list of school years to query.
     *
     * @return the school years, each in {@code "yyyy-yyyy"} form, in query order
     */
    public static List<String> geneQueryCondition() {
        List<String> condition = new ArrayList<String>();
        condition.add("2014-2015");
        condition.add("2015-2016");
        condition.add("2016-2017");
        condition.add("2017-2018");
        return condition;
    }

}

 

2.爬取学校主页获取__VIEWSTATE和cookie

package cn.qlq.craw.JsoupCrawJWXT;

import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;

import org.jsoup.Connection;
import org.jsoup.Connection.Method;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

/**
 * Visits the home page of the academic affairs system to obtain the session
 * cookie and the {@code __VIEWSTATE} value, then triggers the download of the
 * captcha image for that session.
 *
 * @author liqiang
 *
 */
public class DownloadLoginfo {
    /**
     * Cookies returned by the first request (the original author observed a
     * single cookie, ASP.NET_SessionId).
     */
    private Map<String, String> cookies;
    /**
     * Value of the hidden {@code __VIEWSTATE} input of the last parsed page.
     */
    private String viewState;

    public DownloadLoginfo() {
        this.cookies = new HashMap<String, String>();
        this.viewState = "";
    }

    /**
     * Requests the home page, stores the returned cookies and
     * {@code __VIEWSTATE}, and downloads the captcha image with those cookies.
     *
     * @throws Exception if the request or the captcha download fails, or if
     *                   the page carries no {@code __VIEWSTATE} input
     */
    public void getLogInfo() throws Exception {
        String urlLogin = "http://newjwc.tyust.edu.cn/";
        Connection connect = Jsoup.connect(urlLogin);
        // Browser-like request headers. No Content-Length is forged here: this
        // request has no body and the HTTP client computes the length itself.
        connect.header("Accept", "application/json, text/javascript, */*; q=0.01").header("Accept-Encoding",
                "gzip, deflate");
        connect.header("Accept-Language", "zh-CN,zh;q=0.9").header("Connection", "keep-alive");
        connect.header("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8");
        connect.header("Host", "newjwc.tyust.edu.cn").header("Referer", "http://newjwc.tyust.edu.cn/");
        connect.header("User-Agent",
                "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36")
                .header("X-Requested-With", "XMLHttpRequest");

        // Execute the request.
        Response res = connect.ignoreContentType(true).method(Method.POST).execute();
        // Remember the cookies; every later request must send them.
        this.cookies = res.cookies();
        for (Entry<String, String> entry : cookies.entrySet()) {
            System.out.println(entry.getKey() + "-" + entry.getValue());
        }

        // Extract and remember __VIEWSTATE from the response body.
        this.getViewState(res.body());
        // The captcha must be fetched with the same session cookies.
        JsoupDoloadPicture.downloadImg("http://newjwc.tyust.edu.cn/CheckCode.aspx", cookies);
    }

    /**
     * Extracts the {@code __VIEWSTATE} value from a page and stores it.
     *
     * @param htmlContent HTML of the page
     * @return the value of the hidden {@code __VIEWSTATE} input
     * @throws IllegalStateException if the page has no such input
     */
    public String getViewState(String htmlContent) {
        Document document = Jsoup.parse(htmlContent);
        Element ele = document.select("input[name='__VIEWSTATE']").first();
        if (ele == null) {
            throw new IllegalStateException("No __VIEWSTATE hidden input found in the page");
        }
        String value = ele.attr("value");
        this.viewState = value;
        return value;
    }

    public Map<String, String> getCookies() {
        return cookies;
    }

    public void setCookies(Map<String, String> cookies) {
        this.cookies = cookies;
    }

    public String getViewState() {
        return viewState;
    }

    public void setViewState(String viewState) {
        this.viewState = viewState;
    }

}

 

3.带着cookie爬取验证码,并下载到本地

package cn.qlq.craw.JsoupCrawJWXT;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Map;

import org.apache.commons.io.FileUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;

/**
 * Downloads the captcha image with Jsoup and stores it on disk. The request
 * must carry the session cookies; a captcha fetched without them does not
 * belong to the session and is useless for logging in.
 *
 * @author liqiang
 *
 */
public class JsoupDoloadPicture {

    /**
     * Downloads the captcha image using the given cookies and saves it as
     * {@code yzm.png} in the directory configured under key {@code file} of
     * the {@code path} resource bundle.
     *
     * @param url     address of the captcha image
     * @param cookies session cookies to send with the request
     * @throws IOException if the HTTP request fails
     */
    public static void downloadImg(String url, Map<String, String> cookies) throws IOException {
        Connection connect = Jsoup.connect(url);
        connect.cookies(cookies);
        connect.timeout(5 * 10000);
        Connection.Response response = connect.ignoreContentType(true).execute();
        byte[] img = response.bodyAsBytes();
        System.out.println(img.length);
        // Target directory comes from the configuration file.
        String directory = ResourcesUtil.getValue("path", "file");
        savaImage(img, directory, "yzm.png");
    }

    /**
     * Writes the image bytes to {@code filePath/fileName}. Failures are
     * reported on standard error and otherwise ignored (best effort).
     *
     * @param img      raw image bytes
     * @param filePath target directory; created if missing. {@code null} or
     *                 empty means the current working directory
     * @param fileName name of the file inside that directory
     */
    public static void savaImage(byte[] img, String filePath, String fileName) {
        // An empty parent would resolve against a system-dependent default
        // directory, so fall back to the working directory explicitly.
        File dir = (filePath == null || filePath.isEmpty()) ? new File(".") : new File(filePath);
        // A regular file sitting where the directory should be would make
        // directory creation fail, so remove it first.
        if (dir.exists() && !dir.isDirectory()) {
            FileUtils.deleteQuietly(dir);
        }
        // mkdirs (not mkdir) so that missing parent directories are created too.
        dir.mkdirs();
        // File(parent, child) uses the platform separator instead of a hard-coded "\\".
        File file = new File(dir, fileName);
        try (BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(file))) {
            bos.write(img);
        } catch (IOException e) {
            e.printStackTrace();
            return;
        }
        // Only report success once the stream has been flushed and closed.
        System.out.println("验证码已经下载到:" + dir.getPath());
    }
}

 

 

4.登录类

package cn.qlq.craw.JsoupCrawJWXT;

import java.util.Map;
import java.util.Map.Entry;
import java.util.Scanner;

import org.jsoup.Connection;
import org.jsoup.Connection.Method;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;

/**
 * Performs the login request and keeps the cookies returned by it.
 *
 * @author liqiang
 *
 */
public class LoginClass {
    /**
     * Cookies returned by the login response; {@code null} until
     * {@link #login} has completed.
     */
    private Map<String, String> cookies = null;

    /**
     * Asks the user for the captcha on standard input and posts the login
     * form together with the session cookies and {@code __VIEWSTATE} held by
     * {@code downloadLoginfo}.
     *
     * @param downloadLoginfo holder of the session cookies and __VIEWSTATE
     *                        obtained from the home page
     * @param xuehao          student number (user name)
     * @param mima            password
     * @throws Exception if the request fails
     */
    public void login(DownloadLoginfo downloadLoginfo, String xuehao, String mima) throws Exception {
        String urlLogin = "http://newjwc.tyust.edu.cn/default2.aspx";
        Connection connect = Jsoup.connect(urlLogin);
        connect.timeout(5 * 100000);
        // Browser-like request headers. Content-Length is not forged: the HTTP
        // client derives it from the actual form body.
        connect.header("Content-Type", "application/x-www-form-urlencoded");
        // NOTE(review): the Referer points at the grade page and embeds a fixed
        // "xm" value; kept as in the original since the server may depend on it.
        connect.header("Host", "newjwc.tyust.edu.cn").header("Referer",
                "http://newjwc.tyust.edu.cn/xscjcx.aspx?xh=" + xuehao + "&xm=%C7%C7%C0%FB%C7%BF&gnmkdm=N121613");
        connect.header("User-Agent",
                "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36");

        // Read the captcha typed by the user.
        System.out.println("-----------请输入验证码---------");
        // Not closed on purpose: closing the Scanner would close System.in for
        // the rest of the program.
        Scanner sc = new Scanner(System.in);
        String yzm = sc.next();
        // Login form; the empty fields must be sent as well.
        connect.data("txtUserName", xuehao).data("__VIEWSTATE", downloadLoginfo.getViewState()).data("TextBox2", mima)
                .data("Textbox1", "").data("RadioButtonList1", "").data("Button1", "").data("lbLanguage", "")
                .data("hidPdrs", "").data("hidsc", "").data("txtSecretCode", yzm);
        connect.cookies(downloadLoginfo.getCookies());
        // Execute the request.
        Response res = connect.ignoreContentType(true).method(Method.POST).execute();
        // Remember the cookies of the login response.
        this.cookies = res.cookies();
        for (Entry<String, String> entry : cookies.entrySet()) {
            System.out.println(entry.getKey() + "-" + entry.getValue());
        }
        System.out.println("---------获取的登录之后的页面-----------");
        String body = res.body();
        System.out.println(body);
    }

    public Map<String, String> getCookies() {
        return cookies;
    }

    public void setCookies(Map<String, String> cookies) {
        this.cookies = cookies;
    }

}

 

5.爬取登录之后的主页和成绩

package cn.qlq.craw.JsoupCrawJWXT;

import java.io.IOException;
import java.util.Map;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

/**
 * Crawls the grade pages of the academic affairs system.
 *
 * @author liqiang
 *
 */
public class CrawGrade {

    private static final String GRADE_PAGE = "http://newjwc.tyust.edu.cn/xscjcx.aspx";

    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36";

    /** __VIEWSTATE of the most recently parsed page. */
    private String viewState;

    /**
     * Extracts the {@code __VIEWSTATE} value from a page and stores it for
     * the next request.
     *
     * @param html HTML of the page
     * @return the value of the hidden {@code __VIEWSTATE} input
     * @throws IllegalStateException if the page has no such input
     */
    public String getViewState(String html) {
        Document document = Jsoup.parse(html);
        Element ele = document.select("input[name='__VIEWSTATE']").first();
        if (ele == null) {
            throw new IllegalStateException("No __VIEWSTATE hidden input found in the page");
        }
        String value = ele.attr("value");
        this.viewState = value;
        return value;
    }

    /**
     * Crawls the page that precedes the grade query (the page reached right
     * after logging in) and stores its {@code __VIEWSTATE}.
     *
     * @param cookies   session cookies
     * @param viewStata value sent as the {@code xm} form field
     * @param xuehao    student number
     * @return HTML of the page
     * @throws IOException if the request fails
     */
    public String crawGradeLastPage(Map<String, String> cookies, String viewStata, String xuehao) throws IOException {
        String url = gradePageUrl(xuehao);
        Connection connect = Jsoup.connect(url);
        connect.timeout(5 * 100000);
        // Browser-like request headers; Content-Length is left to the HTTP client.
        connect.header("Content-Type", "application/x-www-form-urlencoded");
        // The Referer uses the caller's student number, not a fixed one.
        connect.header("Host", "newjwc.tyust.edu.cn").header("Referer", url);
        connect.header("User-Agent", USER_AGENT);

        // Form data. NOTE(review): "xm" receives the view state exactly as in
        // the original code — confirm this is what the server expects.
        connect.data("xh", xuehao)
            .data("xm", viewStata)
            .data("hidLanguage", "")
            .data("gnmkdm", "N121613");
        connect.cookies(cookies);

        Document document = connect.post();
        System.out.println("-----------爬到的成绩的上一个页面--------------");
        String html = document.toString();
        System.out.println(html);
        // Refresh __VIEWSTATE for the next request.
        this.getViewState(html);
        return html;
    }

    /**
     * Crawls the grade page for one school year and semester and refreshes
     * the stored {@code __VIEWSTATE} from the response.
     *
     * @param xuenian   school year, sent as {@code ddlXN}
     * @param xueqi     semester, sent as {@code ddlXQ}
     * @param cookies   session cookies
     * @param viewStata view state used only when no page has been parsed by
     *                  this instance yet
     * @param xuehao    student number
     * @return HTML of the grade page
     * @throws IOException if the request fails
     */
    public String crawGrade(String xuenian, String xueqi, Map<String, String> cookies, String viewStata,
            String xuehao) throws IOException {
        String url = gradePageUrl(xuehao);
        Connection connect = Jsoup.connect(url);
        connect.timeout(5 * 100000);
        // Browser-like request headers; Content-Length is left to the HTTP client.
        connect.header("Accept",
                "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8")
            .header("Accept-Encoding", "gzip, deflate");
        connect.header("Accept-Language", "zh-CN,zh;q=0.9").header("Connection", "keep-alive");
        connect.header("Content-Type", "application/x-www-form-urlencoded");
        connect.header("Host", "newjwc.tyust.edu.cn").header("Referer", url);
        connect.header("User-Agent", USER_AGENT);

        // Prefer the view state of the last page this instance parsed; fall
        // back to the caller's value so a null is never sent.
        String state = (this.viewState != null) ? this.viewState : viewStata;

        connect.data("__EVENTTARGET", "")
            .data("__EVENTARGUMENT", "")
            .data("__VIEWSTATE", state)
            .data("hidLanguage", "")
            .data("ddlXN", xuenian)
            .data("ddlXQ", xueqi)
            .data("btn_xn", "")
            .data("ddl_kcxz", "");
        connect.cookies(cookies);

        Document document = connect.post();
        System.out.println("-----------爬到的成绩的页面--------------");
        String html = document.toString();
        // Refresh __VIEWSTATE for the next request.
        this.getViewState(html);
        System.out.println(html);
        return html;
    }

    public void setViewState(String viewState) {
        this.viewState = viewState;
    }

    /**
     * Builds the grade page URL for a student number.
     * NOTE(review): the "xm" query value is a fixed URL-encoded string taken
     * from the original code (apparently a student name) — confirm.
     */
    private static String gradePageUrl(String xuehao) {
        return GRADE_PAGE + "?xh=" + xuehao + "&xm=%C7%C7%C0%FB%C7%BF&gnmkdm=N121613";
    }

}

 

6.收集成绩的类

package cn.qlq.craw.JsoupCrawJWXT;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * 收集成绩与输出成绩
 * 
 * @author liqiang
 *
 */
@SuppressWarnings("all")
public class GradeOutput {
    /**
     * 保存成绩的集合
     */
    private List> datas;

    public GradeOutput() {
        this.datas = new ArrayList>();
    }

    /**
     * 收集成绩
     * 
     * @param html
     * @return
     */
    public String collectGrade(String html) {
        // 解析html
        Document document = Jsoup.parse(html);
        // 获取成绩表格
        Element table = document.select("#Datagrid1").first();
        // 选择除表格表头之外的元素
        Elements trs = table.select("tr:gt(0)");
        for (Element ele : trs) {
            Map result = new LinkedHashMap();
            Elements ele0 = ele.select("td:eq(0)");// 找到学年
            result.put("xuenian", ele0.text());
            Elements ele1 = ele.select("td:eq(1)");// 找到学期
            result.put("xueqi", ele1.text());
            Elements ele3 = ele.select("td:eq(3)");// 找到课程名称
            result.put("kecheng", ele3.text());
            Elements ele8 = ele.select("td:eq(8)");// 找到成绩
            result.put("chengji", ele8.text());
            this.datas.add(result);
        }
        return null;
    }

    /**
     * 输出成绩到控制台
     */
    public void outPutGrade() {
        if (this.datas == null || this.datas.size() == 0) {
            return;
        }
        System.out.println("-------下面是提取到的成绩--------");
        for (Map result : datas) {

            System.out.println(result.get("xuenian") + "\t" + result.get("xueqi") + "\t" + result.get("kecheng") + "\t"
                    + result.get("chengji") + "\t");
        }

    }

    /**
     * 最后处理所有的数据,写出到html或者保存数据库
     * 
     * @throws IOException
     */
    public void outputDatas2Html() throws IOException {
        if (datas != null && datas.size() > 0) {
            // 读取文件存储位置
            String directory = ResourcesUtil.getValue("path", "file");
            
            File file = new File(directory+"\\gradeOut.html");
            // 如果文件不存在就创建文件
            if (!file.exists()) {
                file.createNewFile();
            }
            // 构造FileWriter用于向文件中输出信息(此构造方法可以接收file参数,也可以接收fileName参数)
            FileWriter fileWriter = new FileWriter(file);
            // 开始写入数据
            fileWriter.write("");
            fileWriter.write("");
            fileWriter.write("xxx成绩单");
            fileWriter
                    .write("");
            fileWriter.write("");
            fileWriter.write("");
            fileWriter.write("");
            fileWriter.write(
                    "");

            for (Map data : datas) {
                String xuenian = (String) data.get("xuenian");
                String xueqi = (String) data.get("xueqi");
                String kecheng = (String) data.get("kecheng");
                String chengji = (String) data.get("chengji");
                fileWriter.write("");
                fileWriter.write("");
                fileWriter.write("");
                fileWriter.write("");
                fileWriter.write("");
                fileWriter.write("");

            }
            fileWriter.write("
学年学期课程名字成绩
" + xuenian + "" + xueqi + "" + kecheng + "" + chengji + "
"); fileWriter.write(""); fileWriter.write(""); // 关闭文件流 fileWriter.close(); } } public List> getDatas() { return datas; } public void setDatas(List> datas) { this.datas = datas; } }

 

 

 

path.properties (设置验证码图片和最后的成绩单输出到哪个位置)

#fileToSave
#yzm
file=C:\\Users\\liqiang\\Desktop

 

 

读取上述配置文件的工具类:

package cn.qlq.craw.JsoupCrawJWXT;


import java.io.Serializable;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.ResourceBundle;
import java.util.Set;

/**
 * Helper for reading values from resource bundles (.properties files on the
 * classpath) using a fixed zh_CN locale.
 *
 */
public class ResourcesUtil implements Serializable {

    private static final long serialVersionUID = -7657898714983901418L;

    /**
     * Language used for bundle lookup: Chinese ("zh").
     */
    public static final String LANGUAGE = "zh";

    /**
     * Country used for bundle lookup: China ("CN").
     */
    public static final String COUNTRY = "CN";

    private static Locale getLocale() {
        return new Locale(LANGUAGE, COUNTRY);
    }

    /**
     * Looks up one key in a resource bundle.
     *
     * @param baseName
     *            base name of the resource bundle
     *
     * @param section
     *            key to look up
     *
     * @return the value, or an empty string when the bundle or the key cannot
     *         be read (the failure is reported on standard error)
     */
    private static String getProperties(String baseName, String section) {
        String retValue = "";
        try {
            ResourceBundle rb = ResourceBundle.getBundle(baseName, getLocale());
            retValue = rb.getString(section);
        } catch (Exception e) {
            // Best effort by contract: callers receive "" instead of an exception.
            System.err.println("Cannot read key '" + section + "' from resource bundle '" + baseName + "': " + e);
        }
        return retValue;
    }

    /**
     * Reads a value from a resource bundle.
     *
     * @param fileName
     *            base name of the resource bundle
     *
     * @param key
     *            key to look up
     *
     * @return the value for the key, or an empty string if it cannot be read
     */
    public static String getValue(String fileName, String key) {
        return getProperties(fileName, key);
    }

    /**
     * Lists all keys of a resource bundle.
     *
     * @param baseName
     *            base name of the resource bundle
     *
     * @return a new list holding every key of the bundle
     * @throws java.util.MissingResourceException
     *             if the bundle cannot be found
     */
    public static List<String> gekeyList(String baseName) {
        ResourceBundle rb = ResourceBundle.getBundle(baseName, getLocale());
        return new ArrayList<String>(rb.keySet());
    }

    /**
     * Reads a value from a resource bundle and formats it with
     * {@link MessageFormat}.
     *
     * @param fileName
     *            base name of the resource bundle
     *
     * @param key
     *            key to look up
     *
     * @param objs
     *            arguments substituted into the pattern
     *
     * @return the formatted value
     */
    public static String getValue(String fileName, String key, Object[] objs) {
        String pattern = getValue(fileName, key);
        return MessageFormat.format(pattern, objs);
    }

    /**
     * Small demo: formats key "101" of the "resources.messages" bundle with
     * two arguments and prints the result.
     */
    public static void main(String[] args) {
        System.out.println(getValue("resources.messages", "101", new Object[]{100, 200}));
    }
}

 

 

 

 

git地址:https://github.com/qiao-zhi/javaCraw

 

转载于:https://www.cnblogs.com/qlqwjy/p/8899232.html

你可能感兴趣的:(Jsoup爬取带登录验证码的网站)