# Scrape the world university rankings with the requests and pandas libraries and save the results.
import requests
import re
from bs4 import BeautifulSoup
import bs4
import pandas as pd
# Page that is scraped and the CSV file the parsed table is written to.
ARWU_URL = "http://www.zuihaodaxue.com/ARWU2018.html"
OUTPUT_CSV = '2018世界大学排名.csv'

# Column headers of the output CSV, in the order the cells are read from
# each table row.
COLUMNS = ["世界排名", "学校名称", "国家/地区", "国家排名", "总分"]

# Fixed text assumed to surround the country/region name in a flag link's
# ``title`` attribute (i.e. "查看<name>大学排名") -- TODO confirm against the
# live page.
_TITLE_PREFIX = "查看"
_TITLE_SUFFIX = "大学排名"


def country_from_title(title):
    """Return *title* without the leading "查看" and trailing "大学排名".

    ``str.lstrip``/``str.rstrip`` remove any run of the given *characters*,
    not a fixed substring, so using them here would also eat characters that
    belong to the name itself (e.g. the final "大" of "加拿大").  The affixes
    are therefore removed only when they match exactly; a title without them
    is returned unchanged.
    """
    if title.startswith(_TITLE_PREFIX):
        title = title[len(_TITLE_PREFIX):]
    if title.endswith(_TITLE_SUFFIX):
        title = title[:-len(_TITLE_SUFFIX)]
    return title


def fetch_html(url, timeout=30):
    """Download *url* and return its body decoded with the detected encoding.

    Raises ``requests.RequestException`` on connection problems, timeouts,
    or a 4xx/5xx response.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # Use the encoding guessed from the body rather than the one taken from
    # the HTTP headers.
    response.encoding = response.apparent_encoding
    return response.text


def parse_rankings(html):
    """Extract one tuple per table row from the first ``<tbody>`` in *html*.

    Each tuple holds, in order: the text of cell 0, the link text of cell 1,
    the country/region name taken from the link title of cell 2, and the
    text of cells 3 and 4.

    Raises ``ValueError`` if the document contains no ``<tbody>``.
    """
    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find("tbody")
    if tbody is None:
        raise ValueError("no <tbody> element found in the downloaded page")

    rows = []
    for tr in tbody.children:
        # ``children`` also yields whitespace/text nodes between the rows.
        if not isinstance(tr, bs4.element.Tag):
            continue
        tds = tr.find_all('td')
        rows.append((
            tds[0].string,
            tds[1].a.string,
            country_from_title(tds[2].a['title']),
            tds[3].string,
            tds[4].string,
        ))
    return rows


def main():
    """Download the ranking page, write it to ``OUTPUT_CSV``, return the table."""
    try:
        html = fetch_html(ARWU_URL)
    except requests.RequestException as exc:
        # Stop with a readable message instead of continuing without a page.
        raise SystemExit(
            "Failed to download {}: {}".format(ARWU_URL, exc)
        ) from exc

    u_info = pd.DataFrame(parse_rankings(html), columns=COLUMNS)
    u_info.to_csv(OUTPUT_CSV, index=False)
    return u_info


if __name__ == "__main__":
    u_info = main()