import csv
import bs4
from urllib import request
from bs4 import BeautifulSoup
def getHTMLText(url):
    """Step (1): fetch a web page and return its body decoded as UTF-8.

    Args:
        url: Address handed to ``urllib.request.urlopen``.

    Returns:
        The decoded page text, or ``""`` when the URL is malformed, the
        request fails, or the body is not valid UTF-8.
    """
    # Imported locally so the block does not rely on a new file-level import.
    from http.client import HTTPException

    try:
        # The context manager closes the connection even if read() fails.
        with request.urlopen(url) as resp:
            return resp.read().decode('utf-8')
    except (OSError, ValueError, HTTPException):
        # OSError covers urllib.error.URLError/HTTPError and socket errors;
        # ValueError covers malformed URLs and UnicodeDecodeError.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return ""
def fillUnivList(ulist, html):
    """Step (2): parse the page and extract the table rows into *ulist*.

    Every tag that is a direct child of the first ``<tbody>`` is treated as
    a table row; the ``.string`` of its first four ``<td>`` cells is appended
    to *ulist* as one 4-item list.

    Args:
        ulist: List that is extended in place with one entry per row.
        html: HTML source text. An empty string or a page without a
            ``<tbody>`` leaves *ulist* unchanged.
    """
    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find('tbody')
    if tbody is None:
        # Nothing to parse: empty download or unexpected page layout.
        return
    for tr in tbody.children:
        # .children also yields text nodes (whitespace); keep real tags only.
        if not isinstance(tr, bs4.element.Tag):
            continue
        tds = tr('td')  # calling a Tag is shorthand for find_all()
        if len(tds) < 4:
            # Rows without four cells cannot supply the four expected fields.
            continue
        ulist.append([td.string for td in tds[:4]])
def printUnivList(ulist, num):
    """Step (3): print the first *num* rows of *ulist* as a formatted table.

    A header line is always printed, followed by at most *num* rows. Each
    row supplies four values that are centred in 10-character columns
    separated by tabs.

    Args:
        ulist: Sequence of rows, each with at least four items.
        num: Maximum number of rows to print. Values larger than
            ``len(ulist)`` print every row; zero or negative values print
            only the header.
    """
    tplt = "{0:^10}\t{1:{4}^10}\t{2:^10}\t{3:^10}"
    # chr(12288) is U+3000 (ideographic space), used as the fill character
    # of the name column -- presumably so CJK names line up.
    pad = chr(12288)
    print(tplt.format("排名", "学校名称", "省市", "总分", pad))
    # Slicing (instead of indexing range(num)) tolerates short lists;
    # max() stops a negative num from being read as "all but the last n".
    for u in ulist[:max(num, 0)]:
        # None cells cannot take a format spec, so render them as "".
        cells = ["" if value is None else str(value) for value in u[:4]]
        print(tplt.format(*cells, pad))
def write_to_file2(items):
    """Append one record to ``savecollage.csv`` in the working directory.

    Args:
        items: Mapping with the keys ``pm``, ``sn``, ``area`` and ``score``;
            its values are written as a single CSV row in that column
            order. No header row is written.
    """
    column_order = ["pm", "sn", "area", "score"]
    with open('savecollage.csv', mode='a', newline='',
              encoding='utf_8_sig') as csv_file:
        row_writer = csv.DictWriter(csv_file, fieldnames=column_order)
        row_writer.writerow(items)
if __name__ == '__main__':
    # Scrape the ranking page, print the table, then append each printed
    # row to the CSV file via write_to_file2.
    uinfo = []
    url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2019.html'
    html = getHTMLText(url)
    fillUnivList(uinfo, html)
    # Cap at the number of rows actually scraped so that a short or failed
    # download does not raise IndexError.
    top_n = min(100, len(uinfo))
    printUnivList(uinfo, top_n)
    # Column keys are loop-invariant, so build them once.
    fieldnames = ["pm", "sn", "area", "score"]
    for row in uinfo[:top_n]:
        a = dict(zip(fieldnames, row))
        print(a)
        write_to_file2(a)
import matplotlib.pyplot as plt
import pandas as pd
import pylab as pl
# Shared plotting configuration and data used by the chart functions.
colors1 = "#6D6D6D"  # colour passed to chart titles and value labels

plt.rcParams.update({
    # SimHei is a CJK font -- presumably chosen so Chinese labels render.
    'font.sans-serif': ['SimHei'],
    'font.family': ['sans-serif'],
    'axes.unicode_minus': False,
})
plt.style.use('ggplot')
fig = plt.figure(figsize=(8, 5))

# The CSV has no header row, so column names are supplied explicitly and
# the 'pm' column becomes the index.
cloumns = ['pm', 'sn', 'area', 'score']
df = pd.read_csv(
    'savecollage.csv',
    names=cloumns,
    header=None,
    index_col='pm',
    encoding="utf-8",
)
def annlysis1():
    """Show a bar chart of the (up to) ten highest ``score`` rows of ``df``.

    Rows of the module-level ``df`` are sorted by ``score`` in descending
    order; the first ten supply the bar heights (``score``) and tick labels
    (``sn``). Each bar is annotated with its score rounded to one decimal,
    and the chart is displayed with ``plt.show()``.
    """
    top_rows = df.sort_values('score', ascending=False).head(10)
    names = list(top_rows.sn)
    scores = list(top_rows.score)
    # Derive the x positions from the data: a hard-coded range(10) does not
    # match the bar heights when the CSV holds fewer than ten rows.
    plt.bar(range(len(scores)), scores, tick_label=names)
    plt.ylim(10, 100)
    plt.title("全国大学排名", color=colors1)
    plt.xlabel("大学名称")
    plt.ylabel("总分")
    for x, y in enumerate(scores):
        # Place the value label just above the top of each bar.
        plt.text(x, y + 0.01, '%s' % round(y, 1), ha="center", color=colors1)
    # Rotate the tick labels so long names do not overlap.
    plt.xticks(rotation=270)
    plt.tight_layout()
    plt.show()
def annlysis2():
    """Show a bar chart of how many rows of ``df`` fall in each ``area``.

    The module-level ``df`` is grouped by ``area``, the ``sn`` entries are
    counted per group, and the counts are plotted in descending order with
    each bar annotated by its count. ``df`` itself is printed to stdout
    before plotting.
    """
    per_area = df.groupby('area')['sn'].count()
    area_count = per_area.sort_values(ascending=False)
    print(df)
    area_count.plot.bar(color='#4652B1')
    pl.xticks(rotation=0)
    for position, total in enumerate(area_count.values):
        # Label sits half a unit above the bar top.
        label = str(round(total, 1))
        plt.text(position, total + 0.5, label, ha='center', color=colors1)
    plt.title('各城市学校数量排名')
    plt.xlabel('城市')
    plt.ylabel('数量(所)')
    plt.show()
if __name__ == "__main__":
    # Display the two charts in order; each call ends in plt.show().
    annlysis1()
    annlysis2()