鉴于小编本科专业为遥感科学与技术,所以从全国高校信息综合查询系统中爬取了截至目前开设遥感科学与技术专业的所有院校。
爬取网址:https://gkcx.eol.cn/soudaxue/querySchoolSpecialty.html?&argspecialtyname=%E9%81%A5%E6%84%9F%E7%A7%91%E5%AD%A6%E4%B8%8E%E6%8A%80%E6%9C%AF&argzycengci=%E6%9C%AC%E7%A7%91
所需Python包:
1. BeautifulSoup
2. selenium
3. csv
#!/usr/bin/python
# -*- coding: utf-8 -*-
# author:zhoulong_GISER
# -*- coding: utf-8 -*-
# blog:https://blog.csdn.net/qq_33356563
from bs4 import BeautifulSoup
from selenium import webdriver
def main():
    """Scrape the school list for the remote-sensing major and print it.

    Loads result pages 1-4 of the gkcx.eol.cn specialty query in a PhantomJS
    browser, takes the first data cell of every table row as the school
    name, and prints each distinct name once, in first-seen order.
    """
    driver_path = r'E:\spiter\data\phantomjs.exe\phantomjs-2.1.1-windows\bin\phantomjs.exe'
    base_url = 'https://gkcx.eol.cn/soudaxue/querySchoolSpecialty.html?&argspecialtyname=%E9%81%A5%E6%84%9F%E7%A7%91%E5%AD%A6%E4%B8%8E%E6%8A%80%E6%9C%AF&page='
    # Table header labels and the bare newline text nodes between cells
    # carry no data and are skipped.
    skipped_texts = {"学校名称", "专业名称", "重点专业", "院校属性", "高校对比", '\n'}

    school_names = []
    # NOTE(review): webdriver.PhantomJS is deprecated in recent Selenium
    # releases -- confirm the installed version still provides it.
    driver = webdriver.PhantomJS(executable_path=driver_path)
    try:
        for page in range(1, 5):
            driver.get(base_url + str(page))
            soup = BeautifulSoup(driver.page_source, 'lxml')
            for row in soup.find_all('tr'):
                cells = []
                # Iterating a row yields its direct children: the cells and
                # the text nodes between them.
                for cell in row:
                    text = cell.string
                    if text in skipped_texts:
                        continue
                    if text is not None and text.endswith("..."):
                        # The page shortens long names with "..."; the
                        # script assumes the cut-off name ends in "学".
                        text = str(text[:-3]) + "学"
                    # NOTE(review): text is None when a cell has more than
                    # one child node; it is kept as-is, so "None" may be
                    # printed for such a row.
                    cells.append(text)
                if cells:
                    # Only the first remaining cell of a row is kept.
                    school_names.append(cells[0])
    finally:
        # Always shut the headless browser down, even if a page load fails,
        # so no PhantomJS process is left running.
        driver.quit()

    # Remove duplicates while keeping first-seen order.
    seen = set()
    for name in school_names:
        if name in seen:
            continue
        seen.add(name)
        print(name)
# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()