# Python 源代码 (Python source code)
from bs4 import BeautifulSoup
import xlwt
import requests
def ask_url(url):
    """Fetch *url* and return its body decoded as UTF-8 text, or "" on failure.

    Parameters
    ----------
    url : str
        The URL to request.

    Returns
    -------
    str
        The response body, or the empty string when the request fails
        (network error, timeout, or non-2xx HTTP status).
    """
    head = {
        # Present a desktop-browser UA so the site does not reject the scraper.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36 Edg/87.0.664.47"
    }
    try:
        r = requests.get(url, headers=head, timeout=30)
        r.raise_for_status()
        # Force UTF-8 decoding regardless of the server-declared charset.
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException:
        # Narrowed from a bare `except:` so programming errors and
        # KeyboardInterrupt/SystemExit are no longer swallowed silently.
        return ""
def get_data(base_url):
    """Scrape all 43 listing pages and return the collected school records.

    Parameters
    ----------
    base_url : str
        URL prefix ending in "?start="; the row offset (page * 20) is appended.

    Returns
    -------
    list[dict]
        One dict per school with keys 'name', 'double_first_class',
        'location', 'college_affiliation'.
    """
    data_list = []
    # Renamed from `i`: the original inner row loop reused `i`, shadowing
    # the page index inside the loop body.
    for page in range(43):
        print('Begin page = ' + str(page + 1))
        start = page * 20  # the site paginates 20 rows per page
        html = ask_url(base_url + str(start))
        if html == "":
            # Fetch failed: skip this page rather than abort the whole run.
            continue
        soup = BeautifulSoup(html, 'html.parser')
        trs = soup.find_all('tr')
        # trs[0] is the table header row; iterate the data rows directly
        # instead of indexing with range(1, len(trs)).
        for item in trs[1:]:
            data = {}
            name = item.contents[1].a.string.strip()
            data['name'] = name
            print(name)
            # The "double first-class" badge is optional in the markup.
            data['double_first_class'] = ""
            if item.contents[1].span:
                data['double_first_class'] = item.contents[1].span.string.strip()
            data['location'] = item.contents[3].string.strip()
            data['college_affiliation'] = item.contents[5].string.strip()
            data_list.append(data)
    return data_list
def save_data(data_list, save_path):
    """Persist the scraped records to an .xls workbook at *save_path*.

    Parameters
    ----------
    data_list : list[dict]
        Records produced by get_data(); each dict carries the four keys
        listed in ``en_col`` below.
    save_path : str
        Destination path for the workbook.

    Returns
    -------
    str
        Always the empty string (kept for interface compatibility).
    """
    workbook = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = workbook.add_sheet("院校信息", cell_overwrite_ok=True)
    col = ('院校名称', '双一流', '所在地', '院校隶属')
    en_col = ('name', 'double_first_class', 'location', 'college_affiliation')
    total = len(data_list)
    print('共计' + str(total) + '条数据')
    # Header row.
    for col_no, title in enumerate(col):
        sheet.write(0, col_no, title)
    # Data rows, with an in-place progress indicator on one console line.
    for row_no, record in enumerate(data_list):
        print("\r存储数据进度:{:.2f}%".format((row_no + 1) * 100 / total), end="")
        for col_no, key in enumerate(en_col):
            sheet.write(row_no + 1, col_no, record[key])
    workbook.save(save_path)
    return ""
if __name__ == "__main__":
    # Entry point: scrape every listing page, then write the collected
    # records out as a spreadsheet.
    base_url = "https://yz.chsi.com.cn/sch/?start="
    save_path = "院校信息数据集.xls"
    save_data(get_data(base_url), save_path)
# 数据集 (dataset) — output written to 院校信息数据集.xls