作者接下来大三,可能准备考研了。想先做一下考前准备,看看哪些学校可以报名、分别考哪些科目、招多少人、学校在什么地方等等这些信息。
做这个工具的原因:
所以作者做了这个工具,可以将所有信息整合到Excel当中,方便进行数据筛选。给大家看看示意图:首先我们来进行抓包:
代码编写:
首先导入所需的库
import tkinter
import re
from tkinter import ttk
import requests
from lxml import etree
import time
import threading
from queue import Queue
import xlwt
界面制作
# Create the main application window and fix its size to 450x350.
win = tkinter.Tk()
win.geometry('450x350')

# Caption displayed above the region combobox.
l = tkinter.Label(win, text="地区")
l.pack()
下拉框0 所在城市选择
# Region filter. Each entry carries a numeric code in parentheses, which
# infos() extracts with a regular expression; "不限" means no restriction.
comvalue0 = tkinter.StringVar()
comboxlist0 = ttk.Combobox(win, textvariable=comvalue0)
comboxlist0["values"] = (
    "不限",
    "(11)北京市", "(12)天津市", "(13)河北省", "(14)山西省",
    "(15)内蒙古自治区", "(21)辽宁省", "(22)吉林省", "(23)黑龙江省",
    "(31)上海市", "(32)江苏省", "(33)浙江省", "(34)安徽省",
    "(35)福建省", "(36)江西省", "(37)山东省", "(41)河南省",
    "(42)湖北省", "(43)湖南省", "(44)广东省", "(45)广西壮族自治区",
    "(46)海南省", "(50)重庆市", "(51)四川省", "(52)贵州省",
    "(53)云南省", "(54)西藏自治区", "(61)陕西省", "(62)甘肃省",
    "(63)青海省", "(64)宁夏回族自治区", "(65)新疆维吾尔自治区",
)
comboxlist0.pack()

# Caption displayed above the discipline-category combobox.
l1 = tkinter.Label(win, text="门类类别")
l1.pack()
下拉框1 专硕学硕专业门类选择
# Discipline-category filter: "专硕" or one of the thirteen numbered
# academic categories.
comvalue1 = tkinter.StringVar()
comboxlist1 = ttk.Combobox(win, textvariable=comvalue1)
comboxlist1["values"] = (
    "专硕",
    "(01)哲学", "(02)经济学", "(03)法学", "(04)教育学", "(05)文学",
    "(06)历史学", "(07)理学", "(08)工学", "(09)农学", "(10)医学",
    "(11)军事学", "(12)管理学", "(13)艺术学",
)
# Repopulate the specialty combobox whenever a category is picked. The
# virtual event name has to be spelled out in full: an empty "<>" sequence
# is rejected by Tk, so the callback would never be registered.
# NOTE: zyly must already be defined when this statement executes.
comboxlist1.bind("<<ComboboxSelected>>", zyly)
comboxlist1.pack()

# Caption displayed above the specialty combobox.
l2 = tkinter.Label(win, text="专业领域")
l2.pack()
下拉框2 专业领域下拉框显示
这里只显示下拉框,下拉框的内容显示要靠下拉框1(学硕专硕专业门类)的选择而改变
# Specialty combobox. It starts without any options; zyly() assigns its
# "values" according to the category chosen in comboxlist1.
comvalue2 = tkinter.StringVar()
comboxlist2 = ttk.Combobox(win, textvariable=comvalue2)
comboxlist2.pack()

# Caption displayed above the study-mode combobox.
l3 = tkinter.Label(win, text="学习方式")
l3.pack()
下拉框3 全日制和非全日制选择
# Study-mode filter: unrestricted, full-time or part-time.
comvalue3 = tkinter.StringVar()
comboxlist3 = ttk.Combobox(win, textvariable=comvalue3)
comboxlist3["values"] = ("不限", "全日制", "非全日制")
comboxlist3.pack()

# Label without any text; presumably it only provides vertical spacing.
l4 = tkinter.Label(win, width=20, height=1)
l4.pack()

# Status banner. Its text is driven by var2, which infos() and
# get_infos_spider() update while a query runs.
var2 = tkinter.StringVar(value="查询结果显示")
l5 = tkinter.Label(
    win,
    textvariable=var2,
    bg="red",
    font=('Arial', 12),
    width=25,
    height=2,
)
l5.pack()

# Button that starts the query and the export to a local spreadsheet.
b = tkinter.Button(win, text="查询到本地", width=8, height=1, command=infos)
b.pack()

# Hand control over to the Tk event loop.
win.mainloop()
根据不同专硕学硕专业门类的选项,专业领域的下拉框内容会做出对应的变化
# Specialty options offered for every entry of the category combobox.
# Every value is a real tuple (note the trailing comma on one-element
# entries) so the combobox always receives a sequence of options.
ZYLY_OPTIONS = {
    "专硕": (
        "(0251)金融", "(0252)应用统计", "(0253)税务", "(0254)国际商务",
        "(0255)保险", "(0256)资产评估", "(0257)审计", "(0351)法律",
        "(0352)社会工作", "(0353)警务", "(0451)教育", "(0452)体育",
        "(0453)汉语国际教育", "(0454)应用心理", "(0551)翻译",
        "(0552)新闻与传播", "(0553)出版", "(0651)文物与博物馆",
        "(0851)建筑学", "(0853)城市规划", "(0854)电子信息", "(0855)机械",
        "(0856)材料与化工", "(0857)资源与环境", "(0858)能源动力",
        "(0859)土木水利", "(0860)生物与医药", "(0861)交通运输",
        "(0951)农业", "(0952)兽医", "(0953)风景园林", "(0954)林业",
        "(1051)临床医学", "(1052)口腔医学", "(1053)公共卫生", "(1054)护理",
        "(1055)药学", "(1056)中药学", "(1057)中医", "(1151)军事",
        "(1251)工商管理", "(1252)公共管理", "(1253)会计", "(1254)旅游管理",
        "(1255)图书情报", "(1256)工程管理", "(1351)艺术",
    ),
    "(01)哲学": ("(0101)哲学",),
    "(02)经济学": ("(0201)理论经济学", "(0202)应用经济学", "(0270)统计学"),
    "(03)法学": (
        "(0301)法学", "(0302)政治学", "(0303)社会学", "(0304)民族学",
        "(0305)马克思主义理论", "(0306)公安学",
    ),
    "(04)教育学": ("(0401)教育学", "(0402)心理学", "(0403)体育学", "(0471)"),
    "(05)文学": ("(0501)中国语言文学", "(0502)外国语言文学", "(0503)新闻传播学"),
    "(06)历史学": ("(0601)考古学", "(0602)中国史", "(0603)世界史"),
    "(07)理学": (
        "(0701)数学", "(0702)物理学", "(0703)化学", "(0704)天文学",
        "(0705)地理学", "(0706)大气科学", "(0707)海洋科学",
        "(0708)地球物理学", "(0709)地质学", "(0710)生物学",
        "(0711)系统科学", "(0712)科学技术史", "(0713)生态学",
        "(0714)统计学", "(0771)心理学", "(0772)力学",
        "(0773)材料科学与工程", "(0774)电子科学与技术",
        "(0775)计算机科学与技术", "(0776)环境科学与工程",
        "(0777)生物医学工程", "(0778)基础医学",
        "(0779)公共卫生与预防医学", "(0780)药学", "(0781)中药学",
        "(0782)医学技术", "(0783)护理学", "(0784)", "(0785)", "(0786)",
    ),
    "(08)工学": (
        "(0801)力学", "(0802)机械工程", "(0803)光学工程",
        "(0804)仪器科学与技术", "(0805)材料科学与工程", "(0806)冶金工程",
        "(0807)动力工程及工程热物理", "(0808)电气工程",
        "(0809)电子科学与技术", "(0810)信息与通信工程",
        "(0811)控制科学与工程", "(0812)计算机科学与技术", "(0813)建筑学",
        "(0814)土木工程", "(0815)水利工程", "(0816)测绘科学与技术",
        "(0817)化学工程与技术", "(0818)地质资源与地质工程",
        "(0819)矿业工程", "(0820)石油与天然气工程",
        "(0821)纺织科学与工程", "(0822)轻工技术与工程",
        "(0823)交通运输工程", "(0824)船舶与海洋工程",
        "(0825)航空宇航科学与技术", "(0826)兵器科学与技术",
        "(0827)核科学与技术", "(0828)农业工程", "(0829)林业工程",
        "(0830)环境科学与工程", "(0831)生物医学工程",
        "(0832)食品科学与工程", "(0833)城乡规划学", "(0834)风景园林学",
        "(0835)软件工程", "(0836)生物工程", "(0837)安全科学与工程",
        "(0838)公安技术", "(0839)网络空间安全", "(0870)科学技术史",
        "(0871)管理科学与工程", "(0872)设计学",
    ),
    "(09)农学": (
        "(0901)作物学", "(0902)园艺学", "(0903)农业资源与环境",
        "(0904)植物保护", "(0905)畜牧学", "(0906)兽医学", "(0907)林学",
        "(0908)水产", "(0909)草学", "(0970)科学技术史",
        "(0971)环境科学与工程", "(0972)食品科学与工程", "(0973)风景园林学",
    ),
    "(10)医学": (
        "(1001)基础医学", "(1002)临床医学", "(1003)口腔医学",
        "(1004)公共卫生与预防医学", "(1005)中医学", "(1006)中西医结合",
        "(1007)药学", "(1008)中药学", "(1009)特种医学", "(1010)医学技术",
        "(1011)护理学", "(1071)科学技术史", "(1072)生物医学工程",
        "(1073)", "(1074)",
    ),
    "(11)军事学": (
        "(1101)军事思想及军事历史", "(1102)战略学", "(1103)战役学",
        "(1104)战术学", "(1105)军队指挥学", "(1106)军事管理学",
        "(1107)军队政治工作学", "(1108)军事后勤学", "(1109)军事装备学",
        "(1110)军事训练学",
    ),
    "(12)管理学": (
        "(1201)管理科学与工程", "(1202)工商管理", "(1203)农林经济管理",
        "(1204)公共管理", "(1205)图书情报与档案管理",
    ),
    "(13)艺术学": (
        "(1301)艺术学理论", "(1302)音乐与舞蹈学", "(1303)戏剧与影视学",
        "(1304)美术学", "(1305)设计学",
    ),
}


def zyly(*args):
    """Fill the specialty combobox to match the selected category.

    Reads the current text of ``comboxlist1`` and, when it is one of the
    known categories, assigns the matching tuple to the ``values`` option
    of ``comboxlist2``. Any other text leaves ``comboxlist2`` untouched.

    Args:
        *args: Ignored; accepted so the function can be used directly as
            an event-binding callback.
    """
    options = ZYLY_OPTIONS.get(comboxlist1.get())
    if options is not None:
        comboxlist2["values"] = options
信息获取相关代码
def _subject_column_text(tbodys, cell_xpath):
    """Join one exam-subject column across all alternative subject groups."""
    texts = []
    for tbody in tbodys:
        cells = tbody.xpath(cell_xpath)
        # A group with no text in this column contributes an empty entry
        # instead of aborting the whole export with an IndexError.
        texts.append(cells[0].strip() if cells else "")
    return " or ".join(texts)


def _first_node_text(html, node_xpath):
    """Return the text of the first node matching node_xpath, or "" if none."""
    nodes = html.xpath(node_xpath)
    return nodes[0].text if nodes else ""


def downing_infos(info_links):
    """Download every detail page and export the results to an .xls file.

    For each entry the detail page is fetched and parsed, one spreadsheet
    row is written, and finally the workbook is saved in the current
    directory under a name built from the text of ``comboxlist2`` and the
    current timestamp.

    Args:
        info_links: Sequence of ``[location, detail_url]`` pairs, as built
            by ``get_infos_link``.
    """
    print(info_links)
    f = xlwt.Workbook(encoding="utf8")
    sheet01 = f.add_sheet(u'sheet1', cell_overwrite_ok=True)

    # Header row; the same titles double as keys of the per-row data dict.
    columns = (
        '招生单位', '院系所', '专业', '考试方式', '研究方向', '拟招人数',
        '学习方式', '政治', '外语', '业务课一', '业务课二', '所在地',
    )
    for col, title in enumerate(columns):
        sheet01.write(0, col, title)

    # NOTE(review): the Cookie is a hard-coded browser session; it will
    # presumably expire and need to be refreshed - confirm before relying on it.
    header = {
        "Cookie": "JSESSIONID=00915157320298F6A4E463EF80F4934F; _ga=GA1.3.1896669375.1599321681; _gid=GA1.3.1337310662.1599321681; zg_did=%7B%22did%22%3A%20%221745f002bbb14a-02130d69b55226-3323766-144000-1745f002bbc633%22%7D; aliyungf_tc=AQAAAOozbxkz6g0AIyMDcAwjIVeG9IbH; CHSICC_CLIENTFLAGZSML=fdc0c72e078135f955e18f6745458ca4; zg_adfb574f9c54457db21741353c3b0aa7=%7B%22sid%22%3A%201599353606922%2C%22updated%22%3A%201599356575353%2C%22info%22%3A%201599321680839%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22%22%2C%22landHref%22%3A%20%22https%3A%2F%2Fyz.chsi.com.cn%2Fzsml%2FqueryAction.do%22%7D",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36"
    }

    # enumerate() supplies the row number directly. Looking it up with
    # list.index() is quadratic and maps repeated entries to the same row.
    for row, info_link in enumerate(info_links, start=1):
        # A timeout keeps one stalled request from hanging the export forever.
        resp = requests.get(url=info_link[1], headers=header, timeout=30)
        HTML = etree.HTML(resp.text)

        # Each matching tbody is one alternative set of exam subjects;
        # the alternatives are joined with " or " column by column.
        tbodys = HTML.xpath("//tbody[@class='zsml-res-items']")
        data = {
            "招生单位": _first_node_text(HTML, "//tbody/tr[1]/td[2]"),
            "院系所": _first_node_text(HTML, "//tbody/tr[2]/td[2]"),
            "专业": _first_node_text(HTML, "//tbody/tr[3]/td[2]"),
            "考试方式": _first_node_text(HTML, "//tbody/tr[1]/td[4]"),
            "研究方向": _first_node_text(HTML, "//tbody/tr[4]/td[2]"),
            "拟招人数": _first_node_text(HTML, "//table/tbody/tr[5]/td[2]"),
            "学习方式": _first_node_text(HTML, "//table/tbody/tr[3]/td[4]"),
            "政治": _subject_column_text(tbodys, ".//td/text()"),
            "外语": _subject_column_text(tbodys, ".//td[2]/text()"),
            "业务课一": _subject_column_text(tbodys, ".//td[3]/text()"),
            "业务课二": _subject_column_text(tbodys, ".//td[4]/text()"),
            "所在地": info_link[0],
        }
        print(data, row - 1)
        for col, title in enumerate(columns):
            sheet01.write(row, col, data[title])

    f.save(r"{}{}.xls".format(comboxlist2.get(), time.time()))
控制按钮程序部分:
def get_infos_spider(data):  # driver invoked by the query button
    """Run the whole query: collect school pages, detail links, then export.

    Posts the search form once to read the pager, requests every result
    page through ``get_shcool_link``, gathers the detail links of each
    school through ``get_infos_link``, shows the number of hits in ``var2``
    and passes the links to ``downing_infos``.

    Args:
        data: Form fields for the search request. The dict is modified in
            place: its ``pageno`` key is set for every page requested.
    """
    url = "https://yz.chsi.com.cn/zsml/queryAction.do"
    # NOTE(review): the Cookie is a hard-coded browser session; it will
    # presumably expire and need to be refreshed - confirm before relying on it.
    header = {
        "Cookie": "JSESSIONID=00915157320298F6A4E463EF80F4934F; _ga=GA1.3.1896669375.1599321681; _gid=GA1.3.1337310662.1599321681; zg_did=%7B%22did%22%3A%20%221745f002bbb14a-02130d69b55226-3323766-144000-1745f002bbc633%22%7D; aliyungf_tc=AQAAAOozbxkz6g0AIyMDcAwjIVeG9IbH; CHSICC_CLIENTFLAGZSML=fdc0c72e078135f955e18f6745458ca4; zg_adfb574f9c54457db21741353c3b0aa7=%7B%22sid%22%3A%201599353606922%2C%22updated%22%3A%201599356575353%2C%22info%22%3A%201599321680839%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22%22%2C%22landHref%22%3A%20%22https%3A%2F%2Fyz.chsi.com.cn%2Fzsml%2FqueryAction.do%22%7D",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36"
    }
    resp = requests.post(url, headers=header, data=data, timeout=30)
    HTML = etree.HTML(resp.text)

    # Collect the page numbers shown in the pager. Items whose first text
    # node holds no digits (arrows, ellipses, ...) are skipped.
    num_pages = []
    for li in HTML.xpath("//ul[@class='ch-page']/li"):
        texts = li.xpath(".//text()")
        if not texts:
            continue
        digits = re.findall(r"\d+", texts[0])
        if digits:
            num_pages.append(int(digits[0]))

    # Without a pager there is still the first result page to read.
    last_page = max(num_pages, default=1)
    print(num_pages, last_page)

    school_links = []
    for num_page in range(1, last_page + 1):
        data['pageno'] = num_page
        get_shcool_link(data, school_links)
    print(school_links)

    info_links = []
    for school_link in school_links:
        get_infos_link(info_links, school_link)
    print(info_links)

    var2.set("老板,一共搜到{}条信息".format(len(info_links)))
    downing_infos(info_links)
考研信息链接
def get_infos_link(info_links, school_link):
    """Collect the detail-page links listed on one school's result page.

    Args:
        info_links: List extended in place with ``[location, detail_url]``
            pairs.
        school_link: ``[location, school_page_url]`` pair, as built by
            ``get_shcool_link``.
    """
    from urllib.parse import urljoin

    url = school_link[1]
    print(url)
    # NOTE(review): the Cookie is a hard-coded browser session; it will
    # presumably expire and need to be refreshed - confirm before relying on it.
    headers = {
        "Cookie": "JSESSIONID=00915157320298F6A4E463EF80F4934F; _ga=GA1.3.1896669375.1599321681; _gid=GA1.3.1337310662.1599321681; zg_did=%7B%22did%22%3A%20%221745f002bbb14a-02130d69b55226-3323766-144000-1745f002bbc633%22%7D; aliyungf_tc=AQAAAOozbxkz6g0AIyMDcAwjIVeG9IbH; CHSICC_CLIENTFLAGZSML=fdc0c72e078135f955e18f6745458ca4; zg_adfb574f9c54457db21741353c3b0aa7=%7B%22sid%22%3A%201599353606922%2C%22updated%22%3A%201599356575353%2C%22info%22%3A%201599321680839%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22%22%2C%22landHref%22%3A%20%22https%3A%2F%2Fyz.chsi.com.cn%2Fzsml%2FqueryAction.do%22%7D",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36"
    }
    resp = requests.get(url=url, headers=headers, timeout=30)
    HTML = etree.HTML(resp.text)
    for tr in HTML.xpath("//tbody/tr"):
        hrefs = tr.xpath("./td/a[@target='_blank']/@href")
        if not hrefs:
            # Rows without a detail link carry nothing to download.
            continue
        # urljoin copes with hrefs with or without a leading slash, so the
        # resulting URL never contains a doubled "//" after the host.
        info_links.append(
            [school_link[0], urljoin("https://yz.chsi.com.cn/", hrefs[0])]
        )
学校界面信息链接
def get_shcool_link(data, school_links):
    """Collect the school links listed on one page of search results.

    Args:
        data: Form fields posted to the search endpoint, including the
            page number to request.
        school_links: List extended in place with
            ``[location, school_page_url]`` pairs.
    """
    from urllib.parse import urljoin

    url = "https://yz.chsi.com.cn/zsml/queryAction.do"
    # NOTE(review): the Cookie is a hard-coded browser session; it will
    # presumably expire and need to be refreshed - confirm before relying on it.
    headers = {
        "Cookie": "JSESSIONID=00915157320298F6A4E463EF80F4934F; _ga=GA1.3.1896669375.1599321681; _gid=GA1.3.1337310662.1599321681; zg_did=%7B%22did%22%3A%20%221745f002bbb14a-02130d69b55226-3323766-144000-1745f002bbc633%22%7D; aliyungf_tc=AQAAAOozbxkz6g0AIyMDcAwjIVeG9IbH; CHSICC_CLIENTFLAGZSML=fdc0c72e078135f955e18f6745458ca4; zg_adfb574f9c54457db21741353c3b0aa7=%7B%22sid%22%3A%201599353606922%2C%22updated%22%3A%201599356575353%2C%22info%22%3A%201599321680839%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22%22%2C%22landHref%22%3A%20%22https%3A%2F%2Fyz.chsi.com.cn%2Fzsml%2FqueryAction.do%22%7D",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36"
    }
    resp = requests.post(url, headers=headers, data=data, timeout=30)
    HTML = etree.HTML(resp.text)
    for tr in HTML.xpath("//tbody/tr"):
        hrefs = tr.xpath('.//a/@href')
        if not hrefs:
            # A row without any link (e.g. an empty result set) holds no
            # school to follow; skip it instead of raising IndexError.
            continue
        cities = tr.xpath('./td[2]/text()')
        city = cities[0] if cities else ""
        # urljoin yields a valid URL whether or not the href starts with "/".
        school_links.append([city, urljoin("https://yz.chsi.com.cn/", hrefs[0])])
取最大值函数
def max2(list):
    """Return the largest element of a non-empty sequence.

    Args:
        list: Non-empty sequence of mutually comparable items. The name
            shadows the builtin but is kept so existing callers still work.

    Returns:
        The maximum element.

    Raises:
        IndexError: If the sequence is empty.
    """
    best = list[0]
    for item in list[1:]:
        # Compare against the running maximum, not the first element;
        # otherwise the last item larger than list[0] would win.
        if item > best:
            best = item
    return best
按钮触发执行的函数
def infos():
    """Read the four comboboxes, build the search form and start the query.

    The numeric code inside the parentheses of each selection is used as
    the form value. A missing region or study mode is treated as
    unrestricted; a missing category or specialty is reported in ``var2``
    and the query is not started.

    NOTE: this runs synchronously as the button callback, so the window
    does not process events until ``get_infos_spider`` returns.
    """
    print(comvalue0.get(), comvalue1.get(), comvalue2.get(), comvalue3.get())

    def first_number(text):
        # Code between the parentheses, or "" when the text holds no digits.
        found = re.findall(r'\d+', text)
        return found[0] if found else ""

    # Region code: empty for "不限" and for an unselected combobox.
    if comvalue0.get() == "不限":
        city = ""
    else:
        city = first_number(comboxlist0.get())
    ssdm = city

    # Category code: professional degrees use the fixed token "zyxw".
    if comvalue1.get() == "专硕":
        Category = "zyxw"
    else:
        Category = first_number(comboxlist1.get())
    if not Category:
        var2.set("请先选择门类类别")
        return
    mldm = Category

    # Specialty code.
    Professional_field = first_number(comboxlist2.get())
    if not Professional_field:
        var2.set("请先选择专业领域")
        return
    yjxkdm = Professional_field

    # Study mode: 1 = full-time, 2 = part-time, "" = unrestricted
    # (also used when nothing was selected).
    learning_style = {"全日制": 1, "非全日制": 2}.get(comvalue3.get(), "")
    xxfs = learning_style

    print(ssdm, mldm, yjxkdm, xxfs)
    var2.set(" ".join(str(part) for part in (ssdm, mldm, yjxkdm, xxfs)))
    data = {
        "ssdm": ssdm,
        "dwmc": "",
        "mldm": mldm,
        "mlmc": "",
        "yjxkdm": yjxkdm,
        "zymc": "",
        "xxfs": xxfs,
    }
    get_infos_spider(data)
先给大家看看成品效果示意图:
界面示意图:
结果示意图:
好了,完成!点个赞吧!!!