# coding:utf-8
from pip._vendor.distlib.compat import raw_input
from spider_mb import url_manager, html_downloader, html_parser, html_outputer, xh_generate
import time
class SpiderMain(object):
    """Console front-end for the teaching-management-system crawler.

    Wires together the URL manager, downloader, parser, outputer and
    xh generator, then drives one login + data-export session.
    """

    def __init__(self):
        # Collaborators come from project-local modules (spider_mb package).
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.output = html_outputer.HtmlOutputer()
        self.xh = xh_generate.Xh_generate()

    def craw(self, root_url):
        """Prompt for credentials, log in at *root_url*, scrape and export data.

        :param root_url: login page URL of the management system.
        """
        # BUGFIX: use the builtin input() instead of raw_input imported from
        # pip._vendor.distlib.compat — that is a private vendored module of
        # pip and may disappear between releases; input() is the Python 3 way.
        input("欢迎使用高校现代化教学管理系统")
        tbYHM = input("请输入学号:")
        tbPSW = input("请输入密码:")
        ddlSF = input("请输入身份:")
        xsmainfs, session = self.downloader.jw_login(root_url, tbYHM, tbPSW, ddlSF)
        alert = self.parser.login(xsmainfs)
        if not alert:  # empty alert list == the login page reported no error
            xsxx, xscj, djkscjcx1, ryxk, xstop = self.downloader.jw_info(session, tbYHM)
            xsxx_list = self.parser.parser_xsxx(xsxx)
            self.output.output_data(xsxx_list)
            top = self.parser.xstop(xstop)
            shang, zhong, xia = self.parser.parser_xscj(xscj)
            zhengshu = self.parser.djkscjcx1(djkscjcx1)
            xuanxiu = self.parser.ryxk(ryxk)
            self.output.output_cj(tbYHM, top, shang, zhong, xia, zhengshu, xuanxiu, xsxx_list)
            print("-->学生信息:", xsxx_list)
        else:
            print("-->登录失败:", tbYHM, alert[0])
# Script entry point: build the spider and crawl from the portal root URL.
if __name__ == "__main__":
    entry_url = "http://域名"
    spider = SpiderMain()
    spider.craw(entry_url)
爬虫 HTML 下载器(html_downloader 模块)
# coding:utf-8
import requests
from spider_mb import html_parser
from requests.adapters import HTTPAdapter
class HtmlDownloader(object):
    """HTTP downloader for the teaching-management-system crawler."""

    def __init__(self):
        # Parser is used to pull the hidden __VIEWSTATE token out of the
        # ASP.NET login page before posting the credentials back.
        self.parser = html_parser.HtmlParser()

    def download(self, url, headers):
        """Fetch *url* and return the response body as text.

        :param url: target URL; ``None`` yields ``None`` immediately.
        :param headers: dict of HTTP request headers.
        :return: response text on HTTP 200, otherwise ``None``.
        """
        if url is None:
            return None
        # BUGFIX: close the session when done — the original created a new
        # Session per call and never closed it, leaking connection pools.
        with requests.session() as session:
            response = session.get(url, headers=headers)
            if response.status_code != 200:
                return None
            return response.text

    def jw_login(self, root_url, tbYHM, tbPSW, ddlSF):
        """Log into the portal and return ``(post_login_html, session)``.

        :param root_url: login page URL (GET for __VIEWSTATE, POST to log in).
        :param tbYHM: student id.
        :param tbPSW: password.
        :param ddlSF: identity/role; encoded as GBK for the form post.
        :return: tuple of the post-login HTML text and the live session, or
            implicitly ``None`` when any request fails (see note below).
        """
        try:
            session = requests.session()
            # Heavy retry budget: the campus server drops connections often.
            session.mount('http://', HTTPAdapter(max_retries=120))
            html_cont = session.get(root_url, timeout=5)
            # ASP.NET requires the hidden __VIEWSTATE token echoed back.
            __VIEWSTATE = self.parser.parse(root_url, html_cont.content)
            login = {
                "__VIEWSTATE": __VIEWSTATE,
                "tbYHM": tbYHM,
                "tbPSW": tbPSW,
                "ddlSF": ddlSF.encode("gbk"),
                # Click coordinates of the image submit button; the server
                # expects these fields to be present.
                "imgDL.x": "22",
                "imgDL.y": "16",
            }
            xsmainfs = session.post(root_url, data=login, timeout=5).text
            return xsmainfs, session
        except Exception as e:
            # NOTE(review): on failure this prints and implicitly returns
            # None, so the caller's tuple-unpack raises TypeError. Kept as-is
            # to preserve the existing interface; consider re-raising instead.
            print("异常信息:", e)

    def jw_info(self, session, tbYHM):
        """Download the five data pages for student *tbYHM*, then close *session*.

        :param session: authenticated session returned by :meth:`jw_login`.
        :param tbYHM: student id interpolated into the query URLs.
        :return: tuple ``(xsxx, xscj, djkscjcx1, ryxk, xstop)`` of raw HTML.
        """
        # BUGFIX: close the session even when a request raises — the original
        # only closed it on the all-success path.
        try:
            xstop = session.get("域名", timeout=30).text
            xsxx = session.get("域名" + tbYHM + "&xh1=" + tbYHM,
                               timeout=5).text
            xscj = session.get("域名" + tbYHM, timeout=30).text
            djkscjcx1 = session.get("域名" + tbYHM, timeout=30).text
            ryxk = session.get("域名" + tbYHM, timeout=30).text
        finally:
            session.close()
        return xsxx, xscj, djkscjcx1, ryxk, xstop
爬虫解析器(html_parser 模块)
解析器涉及系统内部数据,此处暂不展示;如需了解实现思路请自行检索相关资料。本人使用 BeautifulSoup 库,搭配 html.parser 解析器。