实现爬虫江西财经大学

MIT License

Copyright (c) 2017 Jakie Peng

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

Kotlin 版本

package Jufe

import java.math.BigInteger
import java.security.KeyFactory
import java.security.PublicKey
import java.security.spec.RSAPublicKeySpec
import javax.crypto.Cipher

/**
 * Encrypts a password with the fixed RSA public key defined below and renders the
 * ciphertext as a lowercase hexadecimal string.
 *
 * @property password plain-text password to encrypt.
 */
class EncryptionJufe(private val password: String) {
    // RSA modulus written as 256 hexadecimal digits (a 1024-bit value, not 256 bits).
    private val n = "5598e3b75d21a2989274e222fa59ab07d829faa29b544e3a920c4dd287aed9302a657280c23220a35ae985ba157" +
            "400e0502ce8e44570a1513bf7146f372e9c842115fb1b86def80e2ecf9f8e7a586656d12b27529f487e55052e5c31d0836b" +
            "2e8c01c011bca911d983b1541f20b7466c325b4e30b4a79652470e88135113c9d9"

    // RSA public exponent in hexadecimal (0x10001 = 65537).
    private val e = "10001"

    /** Builds the RSA public key from the hexadecimal modulus [n] and exponent [e]. */
    private fun generatePublicKey(n: String, e: String): PublicKey {
        val publicKeySpec = RSAPublicKeySpec(BigInteger(n, 16), BigInteger(e, 16))
        return KeyFactory.getInstance("RSA").generatePublic(publicKeySpec)
    }

    /** Encrypts the UTF-8 bytes of [content] with [publicKey]. */
    private fun encryption(content: String, publicKey: PublicKey): ByteArray {
        // Name mode and padding explicitly: a bare "RSA" lets each security provider pick its
        // own default padding, which is not guaranteed to be PKCS#1 v1.5 on every platform.
        val cipher = Cipher.getInstance("RSA/ECB/PKCS1Padding")
        cipher.init(Cipher.ENCRYPT_MODE, publicKey)
        return cipher.doFinal(content.toByteArray(Charsets.UTF_8))
    }

    /**
     * Renders [content] as two lowercase hex digits per byte.
     *
     * Formatting byte by byte keeps leading zero bytes; going through BigInteger.toString(16)
     * drops them and yields a shorter string whenever the ciphertext starts with a zero nibble.
     */
    private fun bytes2hex(content: ByteArray): String =
            content.joinToString(separator = "") { "%02x".format(it) }

    /** Returns the encrypted password as a hexadecimal string. */
    fun run(): String {
        val publicKey = generatePublicKey(n, e)
        return bytes2hex(encryption(password, publicKey))
    }
}
package Jufe

import okhttp3.*
import org.jsoup.Jsoup
import org.jsoup.nodes.Document

/*
This file performs the actual login to the JUFE websites.
The cookies set by the first GET of the login page must be kept,
and the login form is then POSTed to obtain the final login cookie.
If the cookies are set correctly, the redirects should work as well
(although this part is still not fully understood).
 */

/**
 * Singleton HTTP crawler for the JUFE information portal.
 *
 * Design notes from the original author: the crawler is meant not only to crawl the portal but
 * also to talk to the application's own server, as part of a distributed crawler that crawls as
 * much as possible while doing minimal harm to the target system. The class is restricted to a
 * singleton so that all requests share one [OkHttpClient] and its cookie jar.
 */
class Crawler private constructor(){

    // Shared client; its cookie jar carries the cookies from one request to the next.
    private val client = OkHttpClient.Builder().cookieJar(CookieJarApp()).build()

    // Holder object: the instance is created the first time the holder is accessed.
    private object mHolder {val INSTANCE = Crawler()}

    companion object {
        /** Returns the single [Crawler] instance. */
        fun getInstance(): Crawler{
            return mHolder.INSTANCE
        }
    }

    // Template of the login form. username, password and lt are filled in per login attempt
    // on a copy, so the singleton keeps no credentials between calls.
    private val data: Map<String, String> = mapOf("username" to "",
            "password" to "",
            "errors" to "0",
            "imageCodeName" to " ",
            "_rememberMe" to "on",
            "cryptoType" to "1",
            "lt" to "",
            "_eventId" to "submit")

    // Login URL (CAS login page with the portal as the target service).
    private val loginUrl = "https://ssl.jxufe.edu.cn/cas/login?service=http%3A%2F%2Fecampus.jxufe.edu.cn%2Fc%2Fportal" +
            "%2Flogin%3Fredirect%3D%252Fc"

    /** Converts [data] into a form body, one form field per map entry, in map order. */
    private fun getBodyDataByMap(data: Map<String, String>): FormBody{
        val formData = FormBody.Builder()
        for ((key, value) in data){
            formData.add(key, value)
        }
        return formData.build()
    }

    /** Returns the value of the element named "lt" in [doc], or "" when there is none. */
    private fun getLt(doc: Document): String =
            doc.getElementsByAttributeValue("name", "lt").attr("value")

    /**
     * Reads the body of [response] and parses it as HTML; the response is closed afterwards.
     * A missing body is parsed as an empty document instead of passing null to Jsoup.
     */
    private fun getHtml(response: Response): Document =
            response.use { Jsoup.parse(it.body()?.string().orEmpty()) }

    /** Builds the login form for one attempt: template values plus credentials and the page's lt. */
    private fun processData(username: String, password: String, loginPage: Document): Map<String, String> =
            data + mapOf(
                    "username" to username,
                    "password" to EncryptionJufe(password).run(),
                    "lt" to getLt(loginPage))

    /**
     * Logs in to the JUFE information portal.
     *
     * Fetches the login page (to obtain cookies and the lt token), posts the login form, and
     * decides success by checking whether the resulting page title contains "本科生首页".
     *
     * @return true when the resulting page looks like the portal home page, false otherwise
     *         (including when the page has no title at all).
     */
    fun loginJufe(username: String, password: String): Boolean{
        val loginPage = getHtml(client.newCall(Request.Builder().url(loginUrl).build()).execute())
        val body = getBodyDataByMap(processData(username, password, loginPage))
        val resultPage = getHtml(client.newCall(Request.Builder().url(loginUrl).post(body).build()).execute())

        // Recognise a successful login from the title of the returned page.
        val title = resultPage.getElementsByTag("title").firstOrNull()?.text().orEmpty()
        val loggedIn = title.contains("本科生首页")
        println(if (loggedIn) "containing" else "not containing")
        return loggedIn
    }

    /** Performs a GET on [url] with the shared client. The caller must close the response. */
    fun getResponse(url: String): Response{
        return client.newCall(Request.Builder().url(url).build()).execute()
    }

    /** POSTs [data] as a form to [url] with the shared client. The caller must close the response. */
    fun postResponse(url: String, data: Map<String, String>): Response{
        val body = getBodyDataByMap(data)
        return client.newCall(Request.Builder().url(url).post(body).build()).execute()
    }

}

/**
 * In-memory cookie jar that stores cookies per request host.
 * Cookies are not persisted; they live only as long as this object.
 */
class CookieJarApp: CookieJar{

    // host -> cookies received from that host
    private val cookieStore = HashMap<String, MutableList<Cookie>>()

    /** Returns a copy of the unexpired cookies stored for the host of [url]. */
    override fun loadForRequest(url: HttpUrl): MutableList<Cookie> {
        println("Using Cookies of ${url.host()}")
        val now = System.currentTimeMillis()
        return cookieStore[url.host()]
                ?.filterTo(ArrayList()) { it.expiresAt() > now }
                ?: ArrayList()
    }

    /**
     * Merges [cookies] into the cookies already stored for the host of [url].
     *
     * A new cookie replaces a stored one with the same name, domain and path; all other stored
     * cookies are kept, so cookies set by an earlier response are not lost when a later
     * response sets different ones.
     */
    override fun saveFromResponse(url: HttpUrl, cookies: MutableList<Cookie>) {
        println("Saving Cookies of ${url.host()}")
        val stored = cookieStore.getOrPut(url.host()) { ArrayList() }
        for (cookie in cookies) {
            stored.removeAll {
                it.name() == cookie.name() && it.domain() == cookie.domain() && it.path() == cookie.path()
            }
            stored.add(cookie)
        }
    }
}

/**
 * Entry point: logs in to the portal once.
 *
 * The first argument is the username and the second the password. When an argument is
 * missing, the placeholder value from the original demo is used instead.
 */
fun main(args: Array<String>){
    val username = args.getOrNull(0) ?: "220150"
    val password = args.getOrNull(1) ?: "p"
    Crawler.getInstance().loginJufe(username, password)
}

Python 版本

江西财经大学自动评教系统

引用xNathan项目中的代码,并加以修改。

版本说明

增加的地方如下:

  1. 增加了新版本门户网的登录加密方法
  2. 增加了命令行参数的支持

修改的地方如下:

  1. 将原来默认的分数范围调整至[86, 95]
  2. 将代码迁移至Python3
  3. 重构了代码,使代码更易阅读

使用说明

老司机请优雅的上车---如何下载github上的单个文件

  1. pip install -r requirements.txt 安装必要的包
  2. python AutoEvaluateTeach.py -u 220150**** -p pjd*** 输入相关的信息
  3. 愉快地自动评教了,但是要记住自己登录上去点击结束评教的按钮

操作示例

1.png
实现爬虫江西财经大学_第1张图片
2.png
实现爬虫江西财经大学_第2张图片
3.png

免责声明

原作者免责声明

  1. 本次修改已经邮件和原作者@xNathan取得了联系,同样希望大家在使用的时候本着客观公正的态度去给每个老师进行评价。
  2. 系统给分是随机,所以希望大家认真对待给每个老师的评分。

江西财经大学学生课程容量信息爬虫

自动获取课程容量信息/CourseInfo.py

版本说明

增加的地方如下:

  1. 增加了新版本门户网的登录加密方法
  2. 增加了命令行参数的支持
  3. 增加了列表获取课程信息函数
  4. 使用了pandas库的支持

使用说明

这是第二个关于江西财经大学信息门户网站的小demo,然后本demo本着非Python爱好者用不了的态度,所以用了一些高级的原生的不带的package,目的是为了减少代码量,此外,本人也非CS专业,只是抱着学习的态度在做这些,因此,这个版本一旦写好,以后门户网更新了应该不会再进行相应的更新。

  1. 需要的package:pandas/rsa/beautifulsoup
  2. 依然采用了命令行输入的方式:Python /自动获取课程容量信息/CourseInfo.py -u <username> -p <password> -o <path1> -t <path2>
  3. path1是指存放课程代码的列表的txt文件,path2是指生成的xlsx文件的导出位置。
  4. Example: Python /自动获取课程容量信息/CourseInfo.py -u 220150**** -p ***** -o E:/1.txt -t E:/result.xlsx

存放课程代码txt文件实例

  1. Example中的E:/1.txt文件格式如下图所示。
    实现爬虫江西财经大学_第3张图片
    4.png

每个代码之间用\n隔开。

  1. 导出文件样式


    实现爬虫江西财经大学_第4张图片
    5.png
  2. 操作示例


    实现爬虫江西财经大学_第5张图片
    6.png

免责声明

  1. 由于这个程序是用学生账号对学校服务器进行爬虫操作,因此如果因爬取频率过大而被学校查处,使用者自行解决。
  2. 由于爬虫会使用较大服务器带宽,不建议使用者在学生集中选课时使用,为此造成信息门户系统访问速度变慢是作者非常不提倡的。
  3. 希望使用者能充分利用爬取的文件,不要重复爬取一些无意义的数据。

功能展望

  1. 这个程序可以在校内的电脑完美运行,且我校的选课制度一个不方便的地方在于:学校只能通过QQ群一级一级传达课程容量扩容信息,而通过这个自动获取课程容量信息的代码可以拓展为定时刷新课程容量信息,并及时通知到关注这门课程的同学,以达到信息传达的及时性。
  2. 这个程序可以更改数据信息处理部分,使得这段程序可以爬虫课表,从而更加方便的使用课表信息。
  3. 这个程序可以继续处理相关信息,从而爬取相关课程的上课的同学,达到自动查询相应课程本班人数的目的。
  4. 通过爬取相应的课表,可以整理出相应的无课表信息,从而导出一定数目人的无课表信息,达到班级安排的高效率。
# coding: utf-8
# @ Author: xNathan
# @ GitHub: https://github.com/xNathan
# @ Date: 2015-12-11 23:24

# @modified by Peng Jidong on 2017-12-16 15:58:53
# - 适应了新版门户网的加密
# - 使用了命令行解析功能
# - [expectation] add multi thread to work quicker
"""Description
对本学期已选课程进行自动评教,简化复杂的填表过程。
适用对象:江西财经大学的学生
可以自由设定分值下限和上限,所有分数均为随机生成,作者不对评价客观真实性做保证。
本软件只供学习和参考,如果因为使用此软件而造成任何法律后果,作者不承担任何责任。
"""

import rsa
import binascii
import optparse
import requests
from bs4 import BeautifulSoup
import sys
from random import randint


class Original:
    """
    Course-evaluation routines from the original author, collected into one class
    (with slightly changed names).

    Args:
        session: HTTP session used for every request; it must provide ``get`` and ``post``.
            The caller is expected to have logged in with it already.
        min_grade: lowest score that may be submitted (inclusive).
        max_grade: highest score that may be submitted (inclusive).
    """
    def __init__(self, session, min_grade=86, max_grade=95):
        self.min_grade = min_grade
        self.max_grade = max_grade
        self.login_url = 'http://xfz.jxufe.edu.cn/portal/main.xsp/page/-1'
        self.base_url = 'http://xfz.jxufe.edu.cn/portal/main.xsp/page/-1/?.a.p=aT0lMkZ4Znpwb3J0YWwlMkZwZ25ldyZ0PXImcz1ub3JtYWwmZXM9ZGV0YWNoJm09dmlldw==&mlinkf='
        self.post_url = self.base_url + 'pg/pg1.jsp'
        self.index_url = self.base_url + 'pg/index.jsp'

        self.post_data = {
            'courseName': '',  # course name
            'teacherName': '',  # teacher name
            'courseCode': '',  # course code
            'classNO': '',  # class number
            'teachattitude': '',  # teaching attitude
            'teachmethod': '',  # teaching level
            'teacheffect': '',  # teaching effect
            'stmemo': '',  # leaving early / praise / suggestions
            'teachcontent': '',  # courseware rating
            'coursepleased': '',  # course value
            'teachjc': '',  # textbook rating
            'jcmemo': '',  # free-text comment on the textbook
            'coursememo': '',  # free-text comment on the course design
        }
        self.session = session  # basis of every request made by this class

    def login(self):
        """Report whether the session is logged in.

        The login is checked by the caller before this class is used, so no request
        is made here and the method always returns True.
        """
        return True

    def _random_grade(self):
        """Return a random score in [min_grade, max_grade]; randint includes both ends."""
        return randint(self.min_grade, self.max_grade)

    def get_list(self):
        """Fetch the evaluation index page and return the course table.

        Returns:
            A list of rows, each a list with the text of the row's ``td`` cells;
            the first (header) row is dropped. An empty list is returned when the
            page has no ``table`` element with class ``Table``.
        """
        page = self.session.get(self.index_url).text
        print('-------评教首页--------')
        print("首页长度为:", len(page))
        soup = BeautifulSoup(page, 'lxml')
        table = soup.find('table', class_='Table')
        if table is None:
            # Without the table there is nothing to parse (for example when the
            # page returned is not the evaluation index).
            print('course table not found on the evaluation index page')
            return []
        rows = [[cell.get_text() for cell in row.findAll('td')]
                for row in table.findAll('tr')]
        return rows[1:]  # drop the header row

    def get_evaluate_list(self):
        """Return the courses that still wait for an evaluation."""
        # item[-2] holds data once a course was evaluated and is empty otherwise
        return [item for item in self.get_list() if not item[-2]]

    def evaluate(self, courseCode, classNO,
                 courseName, teacherName):
        """Submit an evaluation with random scores for one course.

        Args:
            courseCode: course code
            classNO: class number
            courseName: course name
            teacherName: teacher name
        Returns:
            bool: True when the response text contains '操作成功', else False.
        """
        self.post_data['courseCode'] = courseCode
        self.post_data['classNO'] = classNO
        self.post_data['courseName'] = courseName
        self.post_data['teacherName'] = teacherName

        for field in ('teachattitude', 'teachmethod', 'teacheffect',
                      'teachcontent', 'coursepleased', 'teachjc'):
            self.post_data[field] = self._random_grade()
        self.post_data['stmemo'] = u'都有'
        self.post_data['jcmemo'] = u'教材适用'
        self.post_data['coursememo'] = u'课程设置合理,易于接受'

        res = self.session.post(self.post_url, data=self.post_data)
        return u'操作成功' in res.text

    def main(self):
        """Evaluate every pending course and print the outcome of each submission."""
        if not self.login():
            print('Login error')
            return

        evaluate_list = self.get_evaluate_list()  # fetched once and reused below
        if not evaluate_list:
            print('No course to evaluate')
            return

        for item in evaluate_list:
            courseCode = item[0]
            classNO = item[1]
            courseName = item[3]
            teacherName = item[4]
            print(courseCode, classNO, courseName, teacherName)
            if self.evaluate(courseCode, classNO, courseName, teacherName):
                print('-----Success------\n')
            else:
                print('------Error-----\n')

class EvaluateTeach:
    """Log in through the CAS login page and run the automatic course evaluation.

    Args:
        student_num: username / student number sent as the ``username`` form field.
        password: plain-text password; it is RSA-encrypted before being sent.
    """

    def __init__(self, student_num, password):
        self.student_num = student_num
        self.password = password
        self.headers = {
            'Host': 'ssl.jxufe.edu.cn',
            'Origin': 'https://ssl.jxufe.edu.cn',
            # Adjacent literals are joined without extra characters; a backslash
            # continuation inside one literal would embed the next line's indentation
            # in the header value.
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
                          '51.0.2704.84 Safari/537.36',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4',
            'Referer': ""
        }
        self.data = {
            "username": self.student_num,
            "password": "",  # filled in by run()
            "errors": "0",
            "imageCodeName": "",
            "_rememberMe": "on",
            "cryptoType": "1",
            # placeholder; replaced by the value read from the login page in _valid_login()
            "lt": "_c01C8161C-458D-5ABA-AD27-55438D2520B1_kA3BE3356-EFCD-0846-620D-74F8AB074C99",
            "_eventId": "submit"
        }
        self.login_url = "https://ssl.jxufe.edu.cn/cas/login?service=http%3A%2F%2Fecampus.jxufe.edu.cn%2Fc%2Fportal" \
                         "%2Flogin%3Fredirect%3D%252Fc"

    def _crypto(self):
        """RSA-encrypt the password and return the ciphertext as hexadecimal bytes."""
        # modulus: 256 hexadecimal digits
        n = '5598e3b75d21a2989274e222fa59ab07d829faa29b544e3a920c4dd287aed9302a657280c23220a35ae985ba157400e0502ce8e445' \
            '70a1513bf7146f372e9c842115fb1b86def80e2ecf9f8e7a586656d12b27529f487e55052e5c31d0836b2e8c01c011bca911d983b1' \
            '541f20b7466c325b4e30b4a79652470e88135113c9d9'
        e = '10001'  # public exponent, hexadecimal

        public_key = rsa.PublicKey(int(n, 16), int(e, 16))
        encrypted_password = rsa.encrypt(self.password.encode(), public_key)
        return binascii.b2a_hex(encrypted_password)  # bytes -> hexadecimal bytes

    def _valid_login(self):
        """Perform the login and check whether it succeeded.

        Returns:
            The logged-in ``requests`` session on success, otherwise ``False``
            (also when the login page lacks the expected ``lt`` input or ``fm1`` form).
        """
        s = requests.session()
        login = s.get(self.login_url, headers=self.headers)
        bsObj_login = BeautifulSoup(login.text, 'lxml')

        lt_inputs = bsObj_login.find_all('input', {'name': 'lt'})
        forms = bsObj_login.find_all("form", {"id": "fm1"})
        if not lt_inputs or not forms:
            # The page does not look like the expected login form; report a failed
            # login instead of raising IndexError.
            return False
        self.data['lt'] = lt_inputs[0].get('value')  # update the lt in the data
        form_url = 'https://ssl.jxufe.edu.cn' + forms[0].get("action")

        # post
        # NOTE(review): the Referer is stored in self.headers, but the POST below does not
        # pass headers=, so it is not sent. Left unchanged on purpose - confirm whether the
        # header (together with the fixed Host header) should accompany the POST.
        self.headers['Referer'] = 'https://ssl.jxufe.edu.cn/cas/login?service=http%3A%2F%2Fecampus.jxufe.edu.cn' \
                                  '%2Fc%2Fportal%2Flogin%3Fredirect%3D%252Fc'
        s.post(form_url, data=self.data, allow_redirects=True)

        # Request a page that needs authorisation; the login counts as successful when
        # the final URL still uses the http scheme.
        result = s.get('http://xfz.jxufe.edu.cn/portal/main.xsp/page/-1')
        if result.url.split(":")[0] == "http":
            return s
        return False

    def _auto_evaluate(self):
        """Placeholder; the evaluation itself is done by the original author's code."""
        pass

    def run(self):
        """Encrypt the password, log in, and evaluate all pending courses.

        Exits the process with status 1 when the login fails.
        """
        self.data['password'] = self._crypto()
        s = self._valid_login()
        if s:
            print('[+] login successfully!')
        else:
            print('[-] failed to login! And system will exit soon!')
            sys.exit(1)  # non-zero status so callers can detect the failure

        # run the automatic evaluation
        original = Original(s)
        original.main()


def main():
    """Parse the command line, ask for confirmation, and start the evaluation.

    Both ``-u/--username`` and ``-p/--password`` are required; when one is missing the
    parser prints the usage and exits with status 2.
    """
    parser = optparse.OptionParser('Usage %prog ' + '-u <username> -p <password>')
    parser.add_option('-u', '--username', dest='username', type='string', help='enter your username/student number...')
    parser.add_option('-p', '--password', dest='password', type='string', help='enter your password, enter to compete...')
    opts, args = parser.parse_args()
    if not opts.username or not opts.password:
        # Without this check None would be passed on and fail later in the encryption step.
        parser.error('both -u/--username and -p/--password are required')

    print('your information entered as follows:\n  --username: %s\n  --password: %s' %
          (opts.username, opts.password))
    prompt = input('enter y to continue, n to cancel: ')
    if prompt == 'y':
        print('go on to execute the program...')
        e = EvaluateTeach(opts.username, opts.password)
        e.run()
    elif prompt == 'n':
        print('exit...')
        sys.exit()
    else:
        print('invalid arguments...\n the system will exit soon...')
        sys.exit()



# Run the command-line entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()

你可能感兴趣的:(实现爬虫江西财经大学)