Android 基于Jsoup的新版正方教务爬虫

1 前言

此代码只适用于新版正方教务系统

本文以代码为主。具体的分析过程在很多爬虫教程中都有讲述:核心是学会使用各种抓包工具,仔细分析提交时的请求头和表单,再用各种方法提取或构造出需要提交的参数。具体步骤可以搜索其他爬虫教程;我个人表达能力不是很好,怕讲不明白或者讲错,所以这里就不详细讲述了。

代码借鉴了许多版本的 Java 正方爬虫。由于 HttpClient 在 Android 端已被 Google 官方弃用,所以网络请求和页面解析都基于 Jsoup 完成。

2 准备工作

Jsoup库用于网络请求和html文本解析
fastjson库用来解析JSON数据

在Android Studio的build.gradle(Module)添加对应依赖:

implementation 'org.jsoup:jsoup:1.11.3'
implementation 'com.alibaba:fastjson:1.1.54.android'

3 代码

3.1 新版正方RSA加密方法

/**
 * Converts between Base64 text and lowercase hexadecimal text.
 *
 * <p>Both directions work on digit characters only: one Base64 character carries 6 bits and
 * one hex character carries 4 bits, so two Base64 characters map to three hex characters.
 */
public class Base64 {
    /**
     * The Base64 alphabet; the index of a character is its 6-bit value.
     * Left non-final so that existing code which touches this public field keeps compiling.
     */
    public static String b64map="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
    /** Padding character appended until the Base64 output length is a multiple of four. */
    private static final char b64pad = '=';
    /** Lowercase hexadecimal digits, indexed by value. */
    private static final String hexCode = "0123456789abcdef";

    /**
     * Returns the lowercase hexadecimal digit for a value.
     *
     * @param a value in the range 0..15
     * @return the matching character of {@code 0123456789abcdef}
     * @throws StringIndexOutOfBoundsException if {@code a} is outside 0..15
     */
    public static char int2char(int a){
        return hexCode.charAt(a);
    }

    /**
     * Converts Base64 text to hexadecimal text.
     *
     * <p>Characters that are not in the alphabet are skipped, and conversion stops at the
     * first padding character.
     *
     * @param s Base64 text
     * @return the decoded bits written as lowercase hex digits
     */
    public static String b64tohex(String s) {
        StringBuilder ret = new StringBuilder();
        int k = 0;      // position inside the current group of four Base64 characters
        int slop = 0;   // bits of the previous character that have not been emitted yet
        for (int i = 0; i < s.length(); ++i) {
            char ch = s.charAt(i);
            if (ch == b64pad) {
                break;
            }
            int v = b64map.indexOf(ch);
            if (v < 0) {
                continue;
            }
            switch (k) {
                case 0:
                    // 6 bits: emit the top 4, keep the low 2.
                    ret.append(int2char(v >> 2));
                    slop = v & 3;
                    k = 1;
                    break;
                case 1:
                    // 2 kept + top 2 of this character form one digit; keep the low 4.
                    ret.append(int2char((slop << 2) | (v >> 4)));
                    slop = v & 0xf;
                    k = 2;
                    break;
                case 2:
                    // The 4 kept bits are a whole digit; then handle this character like case 0.
                    ret.append(int2char(slop));
                    ret.append(int2char(v >> 2));
                    slop = v & 3;
                    k = 3;
                    break;
                default:
                    ret.append(int2char((slop << 2) | (v >> 4)));
                    ret.append(int2char(v & 0xf));
                    k = 0;
                    break;
            }
        }
        if (k == 1) {
            // Input ended after a single character of a group: flush the 2 leftover bits.
            ret.append(int2char(slop << 2));
        }
        return ret.toString();
    }

    /**
     * Converts hexadecimal text to Base64 text.
     *
     * <p>Three hex digits (12 bits) become two Base64 characters. A trailing group of one or
     * two digits is zero-extended on the right, and the result is padded with {@code '='}
     * to a multiple of four characters.
     *
     * @param h hexadecimal text (upper or lower case)
     * @return the Base64 form of {@code h}
     * @throws NumberFormatException if {@code h} contains a non-hexadecimal character
     */
    public static String hex2b64(String h) {
        int i;
        int c;
        StringBuilder ret = new StringBuilder();
        for (i = 0; i + 3 <= h.length(); i += 3) {
            // Integer.parseInt replaces the unqualified parseInt(), which does not exist in Java.
            c = Integer.parseInt(h.substring(i, i + 3), 16);
            ret.append(b64map.charAt(c >> 6));
            ret.append(b64map.charAt(c & 63));
        }
        if (i + 1 == h.length()) {
            c = Integer.parseInt(h.substring(i, i + 1), 16);
            ret.append(b64map.charAt(c << 2));
        } else if (i + 2 == h.length()) {
            c = Integer.parseInt(h.substring(i, i + 2), 16);
            ret.append(b64map.charAt(c >> 2));
            ret.append(b64map.charAt((c & 3) << 4));
        }
        while ((ret.length() & 3) > 0) {
            ret.append(b64pad);
        }
        return ret.toString();
    }
}
/**
 * RSA public-key encryption with PKCS#1 v1.5 (block type 2) padding, producing hex text.
 *
 * <p>The class keeps no key state between calls, so concurrent calls with different keys
 * do not interfere with each other.
 */
public class RSAEncoder {
    /** Source of the random padding bytes; a fixed seed would make every ciphertext identical. */
    private static final java.security.SecureRandom PAD_RANDOM = new java.security.SecureRandom();

    /**
     * Encrypts a string with an RSA public key.
     *
     * @param pwd  text to encrypt; it is encoded as UTF-8 before padding
     * @param nStr RSA modulus as hexadecimal text
     * @param eStr RSA public exponent as hexadecimal text
     * @return the ciphertext as lowercase hex, left-padded with one {@code '0'} when needed
     *         so that the length is even
     * @throws IllegalArgumentException if the UTF-8 form of {@code pwd} does not fit into the
     *         modulus together with the 11 bytes of padding overhead
     * @throws NumberFormatException if {@code nStr} or {@code eStr} is not valid hexadecimal
     */
    public static String RSAEncrypt(String pwd, String nStr, String eStr){
        BigInteger modulus = new BigInteger(nStr, 16);
        BigInteger exponent = new BigInteger(eStr, 16);

        int blockLength = (modulus.bitLength() + 7) >> 3;
        BigInteger cipher = pkcs1pad2(pwd, blockLength).modPow(exponent, modulus);

        String hex = cipher.toString(16);
        if ((hex.length() & 1) != 0) {
            hex = "0" + hex;
        }
        return hex;
    }

    /**
     * Builds the padded block {@code 00 02 <non-zero random bytes> 00 <message>}.
     *
     * @param s message text, encoded as UTF-8
     * @param n block length in bytes (the byte length of the modulus)
     * @return the padded block as a non-negative integer
     * @throws IllegalArgumentException if the message leaves fewer than 8 padding bytes
     */
    private static BigInteger pkcs1pad2(String s, int n){
        byte[] message = s.getBytes(java.nio.charset.StandardCharsets.UTF_8);
        // Measure the encoded bytes, not the chars: non-ASCII text is longer once encoded.
        if (n < message.length + 11) {
            throw new IllegalArgumentException("Message too long for RSAEncoder");
        }

        byte[] block = new byte[n];
        int separator = n - message.length - 1;
        System.arraycopy(message, 0, block, separator + 1, message.length);
        block[separator] = 0;
        for (int i = 2; i < separator; i++) {
            // 1..255: padding bytes must be non-zero so the separator can be found again.
            block[i] = (byte) (1 + PAD_RANDOM.nextInt(255));
        }
        block[1] = 2;
        block[0] = 0;   // leading zero keeps the BigInteger non-negative
        return new BigInteger(block);
    }
}

3.2 爬虫工具类

目前只实现了模拟登录和个人信息查询功能,其他功能的实现思路与查询个人信息相同。代码偏长、尚未简化,使用时可以自行改动精简。

public class JWGLUtils {

    /**
     * 网站关闭返回相应状态
     */
    private static int STATUS_CLOSE = 0;

    /**
     * 账号密码错误返回状态
     */
    private static int STATUS_USERFAULT = -1;

    /**
     * 账号为空状态
     */
    private static int STATUS_USERNULL = -2;

    /**
     * 密码为空状态
     */
    private static int STATUS_PASSNULL = -3;

    /**
     * 登陆成功状态
     */
    private static int STATUS_SUCCEED = 1;

    /**
     * 查询成功状态
     */
    private static int STATUS_INDEX = 2;

    /**
     * 长时间未响应返回状态
     */
    private static int STATUS_RESPOND = 3;

    /**
     * 状态储存
     */
    private static int STATUS_RETURN;

    /**
     * 对应学校教务处网站
     * 

* 可通过Baseurl方法修改 */ private String Baseurl; /** * 记录服务器给出cookies *

* Jsoup已给出拦截方法,使用者无需关注 */ private Map<String, String> cookies = new HashMap<>(); /** * 由服务器给出,结合RSA加密工具类得出加密密码,具体方法已封装,使用者无需关注 */ private String modulus; private String exponent; /** * 需通过解析得到相应参数,使用者无需关注 */ private String csrftoken; /** * Jsoup对应接口 */ private Connection connection; private Connection.Response response; private Document document; /** * 用户登陆账号 *

* 结合相应方法传入参数 */ private String stuNum; /** * 用户登陆密码 *

* 结合相应方法传入参数 */ private String password; /** * 登陆实时时间(毫秒) *

* 用户无需关注 */ private String mNowTime; /** * 设置基础url * * @param Baseurl 学校教务处基础网址 * @return */ public JWGLUtils Baseurl(String Baseurl) { this.Baseurl = Baseurl; return this; } /** * 设置用户登陆账号 * * @param stuNum 教务处账号 * @return */ public JWGLUtils Username(String stuNum) { this.stuNum = stuNum; return this; } /** * 设置用户登陆密码 * * @param password 教务处密码 * @return */ public JWGLUtils Password(String password) { this.password = password; return this; } /** * @return ResponseBody */ public Document GetString() { return this.document; } /** * 解析csrftokoen参数值 * 同时得到相应Cookies * * @return */ private boolean getCsrftoken() { try { mNowTime = String.valueOf(new Date().getTime()); connection = Jsoup.connect(Baseurl + "/jwglxt/xtgl/login_slogin.html?language=zh_CN&_t=" + mNowTime); connection.header("User-Agent", "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10"); connection.header("Cache-Control", "no-cache"); connection.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"); connection.header("Accept-Language", "zh-CN,zh;q=0.9"); connection.header("Upgrade-Insecure-Requests", "1"); connection.header("Connection", "keep-alive"); // connection.header("Host", "jwzx.zjxu.edu.cn"); response = connection.timeout(5000).execute(); cookies = response.cookies(); document = Jsoup.parse(response.body()); csrftoken = document.getElementById("csrftoken").val(); } catch (Exception ex) { ex.printStackTrace(); return false; } return true; } /** * 获取公钥并加密密码 * * @return */ private boolean getRSApublickey() { // throws Exception try { connection = Jsoup.connect(Baseurl + "/jwglxt/xtgl/login_getPublicKey.html?" 
+ "time=" + mNowTime); connection.header("User-Agent", "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10"); connection.header("Cache-Control", "no-cache"); connection.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"); connection.header("Accept-Language", "zh-CN,zh;q=0.9"); connection.header("Upgrade-Insecure-Requests", "1"); connection.header("Connection", "keep-alive"); // connection.header("Host", "jwzx.zjxu.edu.cn"); response = connection.cookies(cookies).ignoreContentType(true).timeout(5000).execute(); JSONObject jsonObject = JSON.parseObject(response.body()); modulus = jsonObject.getString("modulus"); exponent = jsonObject.getString("exponent"); password = RSAEncoder.RSAEncrypt(password, Base64.b64tohex(modulus), Base64.b64tohex(exponent)); password = Base64.hex2b64(password); } catch (IOException e) { e.printStackTrace(); return false; } return true; } /** * 模拟登陆 * 得到重新分配的Cookies * * @return */ private int beginLogin() { // throws Exception try { connection = Jsoup.connect(Baseurl + "/jwglxt/xtgl/login_slogin.html?language=zh_CN&_t=" + mNowTime); connection.header("Content-Type", "application/x-www-form-urlencoded;charset=utf-8"); connection.header("User-Agent", "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10"); connection.header("Cache-Control", "no-cache"); connection.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"); connection.header("Accept-Language", "zh-CN,zh;q=0.9"); connection.header("Upgrade-Insecure-Requests", "1"); connection.header("Connection", "keep-alive"); // connection.header("Host", "jwzx.zjxu.edu.cn"); connection.header("Referer", Baseurl + "/jwglxt/xtgl/login_slogin.html?language=zh_CN&_t=" + mNowTime); connection.header("Proxy-Connection", "keep-alive"); connection.header("Pragma", "no-cache"); 
connection.data("csrftoken", csrftoken); connection.data("yhm", stuNum); connection.data("mm", password); connection.data("mm", password); response = connection.cookies(cookies).ignoreContentType(true) .timeout(5000) .method(Connection.Method.POST).execute(); cookies = response.cookies(); document = Jsoup.parse(response.body()); } catch (IOException e) { e.printStackTrace(); return STATUS_CLOSE; } if (document.getElementById("tips") == null) { return STATUS_SUCCEED; } else { return STATUS_USERFAULT; } } /** * 查询个人信息 * * @return */ public int getStudentInformaction() { // throws Exception try { if (stuNum == null) return STATUS_USERNULL; if (password == null) return STATUS_PASSNULL; if (getCsrftoken() == false) return STATUS_CLOSE; if (getRSApublickey() == false) return STATUS_CLOSE; STATUS_RETURN = beginLogin(); if (STATUS_RETURN == STATUS_CLOSE) { return STATUS_CLOSE; } else if (STATUS_RETURN == STATUS_USERFAULT) { return STATUS_USERFAULT; } else { System.out.println("登陆成功"); } connection = Jsoup.connect(Baseurl + "/jwglxt/xsxxxggl/xsxxwh_cxCkDgxsxx.html?gnmkdm=N100801&su=" + stuNum); connection.cookies(cookies); connection.header("User-Agent", "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10"); connection.header("Cache-Control", "no-cache"); connection.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"); connection.header("Accept-Language", "zh-CN,zh;q=0.9"); connection.header("Upgrade-Insecure-Requests", "1"); connection.header("Connection", "keep-alive"); // connection.header("Host", "jwzx.zjxu.edu.cn"); connection.header("Referer", Baseurl + "/jwglxt/xtgl/login_slogin.html?language=zh_CN&_t=" + mNowTime); connection.header("Proxy-Connection", "keep-alive"); connection.header("Pragma", "no-cache"); response = connection.ignoreContentType(true) .timeout(5000) .method(Connection.Method.GET).execute(); } catch (IOException e) { e.printStackTrace(); return 
STATUS_CLOSE; } document = Jsoup.parse(response.body()); return STATUS_INDEX; } }

3.3 简单使用例子

/**
 * Usage example: queries the student information on a background thread and prints
 * a message for the returned status code.
 */
public class Test {
    /** Starts the worker thread; Android does not allow network requests on the main thread. */
    public void Hello() {
        Runnable query = new Runnable() {
            @Override
            public void run() {
                JWGLUtils jwglUtils = new JWGLUtils();
                jwglUtils.Baseurl("教务处网站");
                jwglUtils.Username("登陆账号");
                jwglUtils.Password("密码");

                int status = jwglUtils.getStudentInformaction();
                if (status == 0) {
                    // Site unreachable or a request failed.
                    System.out.println("教务处网站关闭");
                } else if (status == 2) {
                    // Query succeeded: print the fetched page.
                    System.out.println(jwglUtils.GetString());
                } else if (status == -1) {
                    // Login rejected.
                    System.out.println("账号密码错误请重新输入");
                } else if (status == -2) {
                    // Account missing.
                    System.out.println("账号不得为空");
                } else if (status == -3) {
                    // Password missing.
                    System.out.println("密码不得为空");
                }
            }
        };
        new Thread(query).start();
    }
}

4 个人总结

本人代码水平有限,大部分都是借鉴大神们的代码,为了方便像我一样写Android爬虫的初学者,所以分享自己的一点经验。有疏忽的地方希望各位多提意见。

你可能感兴趣的:(Android 基于Jsoup的新版正方教务爬虫)