Python爬虫与数据图表的实现
1. 参考教材实例20,编写Python爬虫程序,获取江西省所有高校的大学排名数据记录,并打印输出。
2. 使用numpy和matplotlib等库分析数据,并绘制南昌大学、华东交通大学、江西理工大学三个高校的总分排名、生源质量(新生高考成绩得分)、培养结果(毕业生就业率)、顶尖成果(高被引论文·篇)等四个指标构成的多指标柱形图。
3. 对江西各高校的顶尖成果(高被引论文数量)进行分析,使用matplotlib绘制各高校顶尖成果数构成的饼状图,并突出江西理工大学所在的饼状块。
由于目标网站不需要登录验证,所以这个爬虫写起来比较简单。
一个好玩的爬虫:
# Created by carryon on 18-12-24.
"""Scrape a university ranking page and chart the Jiangxi universities.

The script downloads the ranking page, reads the rows of the ranking table,
and draws a pie chart of the "top results" column for every row whose
province cell is "江西".  ``paint`` can additionally draw a grouped bar chart
for the rows returned by ``beautiful``.
"""
import requests
from bs4 import BeautifulSoup
from prettytable import PrettyTable
from test4.Wtsql import Wtsql
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import matplotlib


RANKING_URL = 'http://zuihaodaxue.cn/zuihaodaxuepaiming2018.html'
# Placeholder value: replace it with your own browser's User-Agent string.
USER_AGENT = 'XXX'

# Positions of the table cells this script reads from each ranking row.
_COL_NAME = 1
_COL_PROVINCE = 2
_COL_TOTAL = 3
_COL_INTAKE = 4
_COL_OUTCOME = 5
_COL_TOP_RESULTS = 9

# Universities collected for the grouped bar chart.
BAR_CHART_SCHOOLS = ("江西理工大学", "南昌大学", "华东交通大学")
# University whose pie slice is pulled out further than the others.
HIGHLIGHT_SCHOOL = "江西理工大学"


def get(url=RANKING_URL, user_agent=USER_AGENT, timeout=10):
    """Download the ranking page and return its decoded text.

    Args:
        url: Page to download.
        user_agent: Value sent in the ``User-Agent`` request header.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text, decoded with the encoding that
        ``requests`` detects from the content.

    Raises:
        requests.RequestException: On connection problems, timeouts or a
            non-success HTTP status.
    """
    res = requests.get(
        url=url,
        headers={'User-Agent': user_agent},
        timeout=timeout,
    )
    # Fail here instead of handing an error page to the HTML parser.
    res.raise_for_status()
    res.encoding = res.apparent_encoding
    return res.text


def beautiful(text, cur, db, ls):
    """Parse the ranking table and draw the Jiangxi pie chart.

    Args:
        text: HTML of the ranking page.
        cur: Database cursor.  Currently unused; kept so existing callers
            keep working.
        db: Database connection.  Currently unused.
        ls: ``Wtsql`` helper.  Currently unused.

    Returns:
        A list with one ``[name, total, intake, outcome, top_results]`` entry
        for every row whose name is in ``BAR_CHART_SCHOOLS``; it can be passed
        to ``paint``.

    Raises:
        ValueError: If the ranking table body is not present in ``text``, or
            a numeric cell cannot be converted to ``float``.
    """
    soup = BeautifulSoup(text, 'lxml')
    body = soup.find(
        name='tbody',
        attrs={'class': 'hidden_zhpm', 'style': 'text-align: center;'},
    )
    if body is None:
        raise ValueError('ranking table body not found in the page')

    paints = []
    draw_1 = []
    draw_2 = []
    for row in body.find_all('tr'):
        pan = [td.text for td in row.find_all('td')]
        if len(pan) <= _COL_TOP_RESULTS:
            # Row does not have every cell that is read below; skip it.
            continue
        if pan[_COL_NAME] in BAR_CHART_SCHOOLS:
            paints.append([
                pan[_COL_NAME],
                float(pan[_COL_TOTAL]),
                float(pan[_COL_INTAKE]),
                # This cell carries a trailing percent sign.
                float(pan[_COL_OUTCOME].strip().rstrip('%')),
                float(pan[_COL_TOP_RESULTS]),
            ])
        if pan[_COL_PROVINCE] == "江西":
            draw_1.append(pan[_COL_NAME])
            draw_2.append(float(pan[_COL_TOP_RESULTS]))

    drawbing(draw_1, draw_2)
    return paints


def _use_chinese_font():
    """Configure matplotlib so that the Chinese labels render."""
    matplotlib.rcParams['font.sans-serif'] = ['SimHei']
    matplotlib.rcParams['font.family'] = 'sans-serif'
    # Keep the minus sign from being drawn as an empty box.
    matplotlib.rcParams['axes.unicode_minus'] = False


def paint(line):
    """Draw a grouped bar chart with one bar series per university.

    Args:
        line: Sequence of ``[name, v1, v2, v3, v4]`` rows, for example
            ``[["南昌大学", 33.5, 52.4, 86.50, 108], ...]``.  Any number of
            rows is accepted.

    Raises:
        ValueError: If ``line`` is empty.
    """
    if not line:
        raise ValueError('paint() needs at least one row to plot')

    _use_chinese_font()

    n_groups = 4
    bar_width = 0.15
    opacity = 0.99
    colors = ('b', 'g', 'r', 'c', 'm', 'y', 'k')

    plt.subplots()
    index = np.arange(n_groups)
    for i, row in enumerate(line):
        plt.bar(index + i * bar_width, tuple(row[1:1 + n_groups]), bar_width,
                alpha=opacity,
                color=colors[i % len(colors)],
                label=row[0])

    plt.xlabel('江西省部分大学')
    plt.ylabel('总评比')
    plt.title('江西省部分大学总评比')
    # Put each tick under the middle of its group of bars.
    plt.xticks(index + bar_width * (len(line) - 1) / 2,
               ("总分排名", "生源质量", "就业率", "顶尖成果"))
    plt.legend()

    plt.tight_layout()
    plt.show()


def drawbing(line, ans, highlight=HIGHLIGHT_SCHOOL):
    """Draw a pie chart with one slice per university.

    Args:
        line: Slice labels (university names).
        ans: Slice sizes, in the same order as ``line``.
        highlight: Label of the slice to pull out further than the rest.

    Raises:
        ValueError: If there is nothing to plot or the two sequences differ
            in length.
    """
    if not line or len(line) != len(ans):
        raise ValueError('drawbing() needs equally sized, non-empty labels and values')

    _use_chinese_font()

    # Offset of each slice from the centre, derived from the labels so it
    # always matches the number of slices; the highlighted one sits further out.
    explode = [0.5 if name == highlight else 0.2 for name in line]
    plt.axes(aspect=1)  # equal aspect so the pie is drawn as a circle
    # Percentages are shown with two decimals.
    plt.pie(x=ans, labels=line, autopct='%.2f%%', explode=explode,
            shadow=False)
    plt.show()


if __name__ == '__main__':
    ls = Wtsql()
    cur, db = ls.login()
    try:
        text = get()
        # The returned rows can be passed to paint() to get the bar chart.
        beautiful(text, cur, db, ls)
    finally:
        # Release the database connection even if scraping or plotting fails.
        db.close()
还有连接数据库的代码(即上面爬虫中导入的 test4/Wtsql.py):
# Created by carryon on 18-12-24.
import pymysql


class Wtsql:
    """Small helper around pymysql for storing ranking rows in ``jiangxi``."""

    # Columns of the ``jiangxi`` table, in the order the row values are given.
    _COLUMNS = (
        "pm", "xxmc", "ss", "zf", "syzl", "pyjg", "shsy",
        "kygm", "kyzl", "djcg", "djrc", "kjfw", "cgzh", "xsgjh",
    )

    def login(self, host="localhost", user="root", password="", db=""):
        """Open a MySQL connection and return ``(cursor, connection)``.

        Args:
            host: Server host name.
            user: Account name.
            password: Account password.
            db: Name of the database to use.

        Returns:
            A ``(cur, db)`` tuple: a cursor and the connection it belongs to.
            The caller is responsible for closing the connection.
        """
        conn = pymysql.connect(host=host, user=user, password=password,
                               database=db)
        return conn.cursor(), conn

    def insert(self, lists, cur, db):
        """Insert one row into ``jiangxi`` and commit it.

        Args:
            lists: Sequence holding at least 14 values, one per column in
                ``_COLUMNS`` order; extra items are ignored.
            cur: Cursor used to run the statement.
            db: Connection that is committed, or rolled back on failure.

        Raises:
            IndexError: If ``lists`` has fewer than 14 items.
        """
        values = [lists[i] for i in range(len(self._COLUMNS))]
        column_list = ", ".join("`{}`".format(name) for name in self._COLUMNS)
        placeholders = ", ".join(["%s"] * len(self._COLUMNS))
        # Values are passed separately so the driver escapes them; building
        # the statement with str.format broke on quotes and allowed injection.
        sql = "insert into jiangxi({}) values ({})".format(column_list, placeholders)
        try:
            cur.execute(sql, values)
            db.commit()
        except pymysql.MySQLError as e:
            # Report the database error and undo the failed statement.
            print(e)
            db.rollback()