python 爬教学系统获取成绩单

注:本程序源码用于爬取本校方正管理系统的在校成绩,著作权归作者所有。商业转载请联系作者获得授权,非商业转载请注明出处。

欢迎关注微信公众号:计算机小白成长历程

python 爬教学系统获取成绩单_第1张图片

import re
import requests
import urllib
from lxml import etree
import operator as op

import xlwt

1.获取前端动态码

def get___VIEWSTATE(session):
    """Return the ``__VIEWSTATE`` hidden-field value of the site's index page.

    Args:
        session: object whose ``get(url, headers=...)`` returns a response
            exposing the page HTML as ``.text``.

    Returns:
        The text captured from the first ``name="__VIEWSTATE" value="..."``
        occurrence in the page.

    Raises:
        IndexError: if the page contains no such field.
    """
    # The request uses the module-level ``headers`` dict.
    response = session.get('http://210.44.159.4', headers=headers)
    candidates = re.findall(r'name="__VIEWSTATE" value="(.*?)"', response.text)
    return candidates[0]


def get___VIEWSTATE_by_url(session, index_url):
    """Fetch ``index_url`` and return its ``__VIEWSTATE`` hidden-field value.

    Args:
        session: object whose ``get(url, headers=...)`` returns a response
            exposing the page HTML as ``.text``.
        index_url: address of the page to download.

    Returns:
        The first captured ``__VIEWSTATE`` value found in the page.

    Raises:
        IndexError: if the page contains no such field.
    """
    # The request uses the module-level ``headers`` dict.
    page_source = session.get(index_url, headers=headers).text
    field_re = re.compile(r'name="__VIEWSTATE" value="(.*?)"')
    return field_re.findall(page_source)[0]


def get___VIEWSTATE_by_html(session, html_text):
    """Return the ``__VIEWSTATE`` hidden-field value found in ``html_text``.

    Args:
        session: not used by this function; accepted so the signature
            parallels the other ``get___VIEWSTATE*`` helpers.
        html_text: HTML source to search.

    Returns:
        The text captured from the first ``name="__VIEWSTATE" value="..."``
        occurrence.

    Raises:
        IndexError: if ``html_text`` contains no such field.
    """
    values = re.findall(r'name="__VIEWSTATE" value="(.*?)"', html_text)
    # Only the first occurrence matters; an empty result raises IndexError.
    return values[0]


def get___VIEWSTATEGENERATOR(session):
    """Return the ``__VIEWSTATEGENERATOR`` hidden-field value of the index page.

    Args:
        session: object whose ``get(url, headers=...)`` returns a response
            exposing the page HTML as ``.text``.

    Returns:
        The text captured from the first
        ``name="__VIEWSTATEGENERATOR" value="..."`` occurrence in the page.

    Raises:
        IndexError: if the page contains no such field.
    """
    # The request uses the module-level ``headers`` dict.
    response = session.get('http://210.44.159.4', headers=headers)
    found = re.findall(r'name="__VIEWSTATEGENERATOR" value="(.*?)"',
                       response.text)
    return found[0]


def get___VIEWSTATEGENERATOR_by_url(session, index_url):
    """Fetch ``index_url`` and return its ``__VIEWSTATEGENERATOR`` field value.

    Args:
        session: object whose ``get(url, headers=...)`` returns a response
            exposing the page HTML as ``.text``.
        index_url: address of the page to download.

    Returns:
        The first captured ``__VIEWSTATEGENERATOR`` value found in the page.

    Raises:
        IndexError: if the page contains no such field.
    """
    # The request uses the module-level ``headers`` dict.
    page_source = session.get(index_url, headers=headers).text
    field_re = re.compile(r'name="__VIEWSTATEGENERATOR" value="(.*?)"')
    return field_re.findall(page_source)[0]


def get___VIEWSTATEGENERATOR_by_html(session, html_text):
    """Return the ``__VIEWSTATEGENERATOR`` hidden-field value in ``html_text``.

    Args:
        session: not used by this function; accepted so the signature
            parallels the other ``get___VIEWSTATEGENERATOR*`` helpers.
        html_text: HTML source to search.

    Returns:
        The text captured from the first
        ``name="__VIEWSTATEGENERATOR" value="..."`` occurrence.

    Raises:
        IndexError: if ``html_text`` contains no such field.
    """
    pattern = r'name="__VIEWSTATEGENERATOR" value="(.*?)"'
    matches = re.findall(pattern, html_text)
    # Only the first occurrence matters; an empty result raises IndexError.
    return matches[0]


2.获取验证码

def get_CheckCode(session):
    """Download the CAPTCHA image and return the code typed in by the user.

    The image bytes are written to ``code.gif`` in the current working
    directory, then the user is prompted on the console for its text.

    Args:
        session: object whose ``get(url, stream=True)`` returns a response
            exposing the raw image bytes as ``.content``.

    Returns:
        The string entered at the prompt.
    """
    # Fetch the whole image first so the file is only opened once the
    # download has succeeded.
    captcha_bytes = session.get('http://210.44.159.4/CheckCode.aspx?',
                                stream=True).content

    with open("code.gif", "wb") as image_file:
        image_file.write(captcha_bytes)

    return input('验证码是:')

3. 提取用户名

def getUserName(home_text):
    """Extract the student's name from the HTML returned after login.

    Args:
        home_text: HTML source of the page returned by the login request.

    Returns:
        The part of the first text node of the element with id ``xhxm``
        that precedes the first ``'同学'``; the whole text if that marker
        does not occur.

    Raises:
        IndexError: if no element with id ``xhxm`` has a text node.
    """
    tree = etree.HTML(home_text)
    greeting = tree.xpath('//*[@id="xhxm"]/text()')[0]
    # Keep only what comes before the '同学' marker.
    name, _, _ = greeting.partition('同学')
    return name

4. 提取成绩单网页源码

def getReportCard(session, home_text, usernumber, username):
    """Request the full in-school transcript page and return its HTML.

    The function GETs the score page for the given student, then POSTs the
    query form with the score range 0-100 and the
    '在校学习成绩查询' button value.

    Side effects: sets ``Referer`` in the module-level ``headers`` dict and
    adds ``txtQSCJ``, ``txtZZCJ`` and ``Button2`` to the module-level
    ``postdata`` dict; the POST sends that whole dict.

    Args:
        session: logged-in session object providing ``get`` and ``post``.
        home_text: HTML of the post-login page. No longer used; kept so
            existing callers keep working.
        usernumber: student number, placed in the ``xh`` query parameter.
        username: student name; GBK-encoded and percent-quoted into the
            ``xm`` query parameter.

    Returns:
        The response text of the POST request.
    """
    # Imported here because the file only has ``import urllib``, which does
    # not by itself guarantee that the ``urllib.parse`` submodule is loaded.
    from urllib.parse import quote

    score_url = ('http://210.44.159.4/xscj.aspx?xh=' + usernumber
                 + '&xm=' + quote(username.encode("gbk"))
                 + '&gnmkdm=N121605')

    headers['Referer'] = score_url
    # Preliminary GET kept from the original flow; its body is not used.
    # The page's own __VIEWSTATE / __VIEWSTATEGENERATOR are not extracted:
    # the original code never put them into the form (those lines were
    # commented out as unnecessary).
    session.get(score_url, headers=headers)

    postdata['txtQSCJ'] = '0'
    postdata['txtZZCJ'] = '100'
    postdata['Button2'] = '在校学习成绩查询'.encode("gb2312")

    return session.post(score_url, data=postdata, headers=headers).text

5. 提取成绩单信息到list

def getReportCardList(html):
    """Parse the transcript table into a list of rows of cell strings.

    Args:
        html: HTML source of the transcript page.

    Returns:
        One list per ``<tr>`` directly under the table with id
        ``DataGrid1``, in document order. Each inner list holds the
        whitespace-stripped text of that row's ``<td>`` children. The result
        is empty when no such rows are found.
    """
    page = etree.HTML(html)
    table_rows = page.findall('.//table[@id="DataGrid1"]/tr')

    # ``cell.text`` is None when a <td> has no leading text (for example an
    # empty cell); treat that as an empty string instead of failing on
    # ``None.strip()``.
    return [
        [(cell.text or '').strip() for cell in table_row.findall('td')]
        for table_row in table_rows
    ]

6.  统计输出学分情况

def printCreditsByList(report_list):
    """Print failed-course records and the credits earned per course type.

    The first item of ``report_list`` is skipped as a header. For every other
    row the score is read from ``row[-6]``, the second score from ``row[-4]``
    (ignored when it is the empty string), the credits from ``row[-2]``, the
    course name from ``row[1]`` and the course type from ``row[2]``.
    A row counts as passed when either score is at least 60; its credits are
    then added to the total and to the bucket matching its course type.
    Rows that are not passed are listed as failed. Rows that raise while
    being processed are reported with the error and skipped.

    Args:
        report_list: iterable of rows (sequences of strings).

    Returns:
        None. All results are printed.
    """
    # Credit buckets keyed by the course-type text found in row[2].
    earned = {'必修课': 0, '通选课': 0, '选修课': 0, '开放实验': 0}
    overall = 0
    failed_rows = []
    seen_courses = {}

    rows = iter(report_list)
    next(rows, None)  # drop the header row

    for record in rows:
        try:
            mark = float(record[-6])
            credit = float(record[-2])

            second_text = record[-4]
            second_mark = -1 if second_text == '' else float(second_text)

            seen_courses[record[1]] = seen_courses.get(record[1], 0) + 1

            passed = mark >= 60 or second_mark >= 60
            if not passed:
                failed_rows.append(record)
                continue

            overall += credit
            category = record[2]
            if category in earned:
                earned[category] += credit
        except Exception as error:
            # Best effort: report the row that could not be counted, go on.
            print(error)
            print('未统计:', record)

    print('\n不及格科目历史记录:')
    for record in failed_rows:
        print(record)

    print('\n')
    print('必修课:', earned['必修课'])
    print('通选课:', earned['通选课'])
    print('选修课:', earned['选修课'])
    print('开放实验:', earned['开放实验'])

    print('总学分:', overall)

7. 将成绩列表输出到txt文件中

def writeReportListToFile(filename, report_card_list):
    """Write each row of the transcript to a text file, one row per line.

    Every item is written as ``str(item)`` followed by a newline. The file is
    opened in text mode ``'w'`` with the platform default encoding, so any
    existing file of that name is overwritten.

    Args:
        filename: path of the file to create.
        report_card_list: iterable of rows to write.
    """
    # The context manager closes (and flushes) the file even if a write
    # fails; the original left the handle open.
    with open(filename, 'w') as out:
        out.writelines(str(row) + '\n' for row in report_card_list)

8. 将成绩列表导出到 Excel 文件中

def exportExcelFromReportList(filename, report_card_list):
    """Save the transcript rows to an Excel workbook through ``xlwt``.

    Each row of ``report_card_list`` becomes one row of a sheet named
    ``'sheet 1'``, each item of the row one cell, both starting at index 0.
    A confirmation line is printed after the workbook is saved.

    Args:
        filename: path the workbook is saved to.
        report_card_list: iterable of rows, each an iterable of cell values.
    """
    workbook = xlwt.Workbook()
    worksheet = workbook.add_sheet('sheet 1')

    for row_number, cells in enumerate(report_card_list):
        for column_number, value in enumerate(cells):
            worksheet.write(row_number, column_number, value)

    workbook.save(filename)

    print('成功导出文件:', filename)

9.运行主程序

##############################################################################
#                               Run the script
##############################################################################

# Enter the student number and password here.
usernumber = '123456'  # student number
password = '123455'  # password

# Request headers. The helper functions in this file read this module-level
# dict, and getReportCard also writes to it.
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
headers = {
    'User-Agent': user_agent,
}

# The context manager closes the session on every exit path, including a
# failed login where getUserName raises.
with requests.session() as session:
    __VIEWSTATE = get___VIEWSTATE(session)
    __VIEWSTATEGENERATOR = get___VIEWSTATEGENERATOR(session)

    post_url = 'http://210.44.159.4/default2.aspx'

    # Login form. getReportCard later adds its own fields to this same dict.
    # The CAPTCHA is fetched (and the user prompted) while the dict is built,
    # i.e. after the two hidden-field requests above.
    postdata = {
        '__VIEWSTATE': __VIEWSTATE,
        '__VIEWSTATEGENERATOR': __VIEWSTATEGENERATOR,
        'txtUserName': usernumber,
        'TextBox2': password,
        # 'RadioButtonList1': '%D1%A7%C9%FA',
        # 'RadioButtonList1': u"学生".encode('gb2312', 'replace'),  # not needed
        "Button1": "",
        "lbLanguage": "",
        'txtSecretCode': get_CheckCode(session),
        'hidPdrs': '',
        'hidsc': '',
    }

    # Send the login request.
    login_page = session.post(post_url, data=postdata, headers=headers)
    login_code = login_page.text

    # If the login succeeded, the student's name can be extracted.
    username = getUserName(login_code)

    print('#######################################################################################')
    print('                               用户名:',username,'')
    print('#######################################################################################')

    # Fetch the transcript (the '在校学习成绩查询' button), then write the
    # text file, print the credit statistics and export the Excel file.
    report_card_code = getReportCard(session, login_code, usernumber, username)  # transcript HTML
    report_card_list = getReportCardList(report_card_code)  # parsed rows

    if report_card_list:
        writeReportListToFile(username + '.txt', report_card_list)
        printCreditsByList(report_card_list)
        exportExcelFromReportList(username + '.xls', report_card_list)
    else:
        print ('成绩表为空')

你可能感兴趣的:(计算机编程,Python,爬虫)