一、爬虫定制部分
import requests
import lxml.html
import chardet
import pandas as pd
import numpy as np
def get_page(url, headers, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        headers: HTTP request headers passed to ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may wait indefinitely on a stalled server.

    Returns:
        The decoded page text, or ``None`` if the request failed (the error
        is printed).
    """
    try:
        r = requests.get(url, headers=headers, timeout=timeout)
        r.raise_for_status()
        # Decode with the encoding detected from the response body.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException as e:
        # Best effort: report the failure and let the caller see None.
        print(e)
        return None
# Request headers passed to get_page; carries a desktop Chrome User-Agent.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/81.0.4044.122 Safari/537.36'
    ),
}
二、爬取并解析网页
# Download the score table page and collect one record per table row.
url = "https://yz.chsi.com.cn/kyzx/fsfsx34/201703/20170306/1589083359.html"
page = get_page(url, headers)
# get_page prints the error and returns None on failure; stop with a clear
# message instead of an obscure error from the HTML parser.
if page is None:
    raise RuntimeError('failed to download {}'.format(url))
selector = lxml.html.fromstring(page)
sample = []
# Rows 2-12 of the first table (XPath positions are 1-based; row 1 is
# presumably the header row - verify against the page).
# NOTE(review): the XPath contains "tbody"; it only matches if the raw HTML
# really has a tbody element (browsers add one, the parser may not) - confirm.
for i in range(2, 13):
    # Text of every p element inside the cells of row i.
    subject = selector.xpath('//*[@id="article_dnull"]/table[1]/tbody/tr[{}]/td/p/text()'.format(i))
    # Keep the first two values and the last one (code, name, total score).
    sample.append([subject[0], subject[1], subject[-1]])
三、保存数据
# One row per subject: code, name and total score, stored with object dtype.
df = pd.DataFrame(data=sample, dtype='object', columns=['学科代码', '名称', '总分'])
# The page scraped for `sample` is the one the full listing stores as the
# 2016 table, so the file is named 2016 (it was mislabelled 2017).
df.to_csv('C:/Users/David/Desktop/东南大学2016年初试学术学位成绩.csv', header=True, index=False, encoding='utf-8')
四、绘图分析
'''
绘图部分
'''
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
# Grouped bar chart: one group per subject, one bar per year.
labels = df['名称']
grade_2019 = df['2019年总分']
grade_2018 = df['2018年总分']
grade_2017 = df['2017年总分']
grade_2016 = df['2016年总分']
x = np.arange(len(labels))  # one tick position per subject
width = 0.15  # width of a single bar
fig, ax = plt.subplots()
# Offsets of -1.5, -0.5, +0.5 and +1.5 bar widths keep the four-bar group
# centred on its tick; the previous offsets shifted the group to the right.
rects1 = ax.bar(x - 1.5 * width, grade_2016, width, label='2016')
rects2 = ax.bar(x - 0.5 * width, grade_2017, width, label='2017')
rects3 = ax.bar(x + 0.5 * width, grade_2018, width, label='2018')
rects4 = ax.bar(x + 1.5 * width, grade_2019, width, label='2019')
# NOTE(review): the Chinese labels need a CJK-capable font configured in
# matplotlib, otherwise they may render as empty boxes - confirm locally.
ax.set_ylabel('分数')
ax.set_title('东南大学2016-2019年初试学术学位成绩')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()
plt.show()
五、全代码
import requests
import lxml.html
import chardet
import pandas as pd
import numpy as np
def get_page(url, headers, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        headers: HTTP request headers passed to ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may wait indefinitely on a stalled server.

    Returns:
        The decoded page text, or ``None`` if the request failed (the error
        is printed).
    """
    try:
        r = requests.get(url, headers=headers, timeout=timeout)
        r.raise_for_status()
        # Decode with the encoding detected from the response body.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException as e:
        # Best effort: report the failure and let the caller see None.
        print(e)
        return None
# Request headers passed to get_page; carries a desktop Chrome User-Agent.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/81.0.4044.122 Safari/537.36'
    ),
}
# 2016 table: download the page, collect one record per row, save as CSV.
url = "https://yz.chsi.com.cn/kyzx/fsfsx34/201703/20170306/1589083359.html"
page = get_page(url, headers)
# get_page prints the error and returns None on failure; stop with a clear
# message instead of an obscure error from the HTML parser.
if page is None:
    raise RuntimeError('failed to download {}'.format(url))
selector = lxml.html.fromstring(page)
sample = []
# Rows 2-12 of the first table (XPath positions are 1-based; row 1 is
# presumably the header row - verify against the page).
for i in range(2, 13):
    # Text of every p element inside the cells of row i.
    subject = selector.xpath('//*[@id="article_dnull"]/table[1]/tbody/tr[{}]/td/p/text()'.format(i))
    # Keep the first two values and the last one (code, name, total score).
    sample.append([subject[0], subject[1], subject[-1]])
df = pd.DataFrame(data=sample, dtype='object', columns=['学科代码', '名称', '总分'])
df.to_csv('C:/Users/David/Desktop/东南大学2016年初试学术学位成绩.csv', header=True, index=False, encoding='utf8')
# 2017 table: download the page, collect one record per row, save as CSV.
url = "https://yz.chsi.com.cn/kyzx/fsfsx34/201703/20170306/1589085174.html"
page = get_page(url, headers)
# get_page prints the error and returns None on failure; stop with a clear
# message instead of an obscure error from the HTML parser.
if page is None:
    raise RuntimeError('failed to download {}'.format(url))
selector = lxml.html.fromstring(page)
sample = []
# Rows 2-12 of the first table (XPath positions are 1-based).
for i in range(2, 13):
    row = '//*[@id="article_dnull"]/table[1]/tbody/tr[{}]'.format(i)
    # `code` replaces the former name `id`, which shadowed the builtin.
    code = selector.xpath(row + '/td[1]/span/text()')
    subject = selector.xpath(row + '/td[2]/text()')
    grade = selector.xpath(row + '/td[7]/div/span/text()')
    # First text node of columns 1, 2 and 7: code, name, total score.
    sample.append([code[0], subject[0], grade[0]])
df = pd.DataFrame(data=sample, dtype='object', columns=['学科代码', '名称', '总分'])
df.to_csv('C:/Users/David/Desktop/东南大学2017年初试学术学位成绩.csv', header=True, index=False, encoding='utf-8')
# 2018 table: download the page, collect one record per row, save as CSV.
url = "https://yz.chsi.com.cn/kyzx/fsfsx34/201803/20180305/1664240306.html"
page = get_page(url, headers)
# get_page prints the error and returns None on failure; stop with a clear
# message instead of an obscure error from the HTML parser.
if page is None:
    raise RuntimeError('failed to download {}'.format(url))
selector = lxml.html.fromstring(page)
sample = []
# Rows 2-12 of the table inside the first center element (1-based positions).
for i in range(2, 13):
    row = '//*[@id="article_dnull"]/center[1]/table/tbody/tr[{}]'.format(i)
    # `code` replaces the former name `id`, which shadowed the builtin.
    code = selector.xpath(row + '/td[1]/text()')
    subject = selector.xpath(row + '/td[2]/text()')
    grade = selector.xpath(row + '/td[7]/text()')
    # First text node of columns 1, 2 and 7: code, name, total score.
    sample.append([code[0], subject[0], grade[0]])
df = pd.DataFrame(data=sample, dtype='object', columns=['学科代码', '名称', '总分'])
df.to_csv('C:/Users/David/Desktop/东南大学2018年初试学术学位成绩.csv', header=True, index=False, encoding='utf-8')
# 2019 table: download the page, collect one record per row, save as CSV.
url = "https://yz.chsi.com.cn/kyzx/fsfsx34/201903/20190306/1770746646.html"
page = get_page(url, headers)
# get_page prints the error and returns None on failure; stop with a clear
# message instead of an obscure error from the HTML parser.
if page is None:
    raise RuntimeError('failed to download {}'.format(url))
selector = lxml.html.fromstring(page)
sample = []
# Rows 2-12 of the first table (XPath positions are 1-based).
for i in range(2, 13):
    row = '//*[@id="article_dnull"]/table[1]/tbody/tr[{}]'.format(i)
    # `code` replaces the former name `id`, which shadowed the builtin.
    code = selector.xpath(row + '/td[1]/text()')
    subject = selector.xpath(row + '/td[2]/text()')
    grade = selector.xpath(row + '/td[7]/text()')
    # First text node of columns 1, 2 and 7: code, name, total score.
    sample.append([code[0], subject[0], grade[0]])
df = pd.DataFrame(data=sample, dtype='object', columns=['学科代码', '名称', '总分'])
df.to_csv('C:/Users/David/Desktop/东南大学2019年初试学术学位成绩.csv', header=True, index=False, encoding='utf-8')
'''
绘图准备部分
'''
import pandas as pd
# Load the per-year tables written by the scraping steps.
df_2019 = pd.read_csv("C:/Users/David/Desktop/东南大学2019年初试学术学位成绩.csv")
df_2018 = pd.read_csv("C:/Users/David/Desktop/东南大学2018年初试学术学位成绩.csv")
df_2017 = pd.read_csv("C:/Users/David/Desktop/东南大学2017年初试学术学位成绩.csv")
df_2016 = pd.read_csv("C:/Users/David/Desktop/东南大学2016年初试学术学位成绩.csv")
# Start from the 2019 table; rename returns a new frame, so df_2019 itself
# keeps its original column name.
df = df_2019.rename(columns={"总分": "2019年总分"})
# NOTE(review): the yearly columns are matched by row index, which assumes
# every CSV lists the same subjects in the same order - confirm.
df['2018年总分'] = df_2018['总分']
df['2017年总分'] = df_2017['总分']
df['2016年总分'] = df_2016['总分']
df.to_csv('C:/Users/David/Desktop/东南大学2016-2019年初试学术学位成绩.csv', header=True, index=False, encoding='utf-8')
'''
绘图部分
'''
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
# Grouped bar chart: one group per subject, one bar per year.
labels = df['名称']
grade_2019 = df['2019年总分']
grade_2018 = df['2018年总分']
grade_2017 = df['2017年总分']
grade_2016 = df['2016年总分']
x = np.arange(len(labels))  # one tick position per subject
width = 0.15  # width of a single bar
fig, ax = plt.subplots()
# Offsets of -1.5, -0.5, +0.5 and +1.5 bar widths keep the four-bar group
# centred on its tick; the previous offsets shifted the group to the right.
rects1 = ax.bar(x - 1.5 * width, grade_2016, width, label='2016')
rects2 = ax.bar(x - 0.5 * width, grade_2017, width, label='2017')
rects3 = ax.bar(x + 0.5 * width, grade_2018, width, label='2018')
rects4 = ax.bar(x + 1.5 * width, grade_2019, width, label='2019')
# NOTE(review): the Chinese labels need a CJK-capable font configured in
# matplotlib, otherwise they may render as empty boxes - confirm locally.
ax.set_ylabel('分数')
ax.set_title('东南大学2016-2019年初试学术学位成绩')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()
plt.show()