在交管局网站上,看到每月的考试信息都有工具。
想想,可以汇总下,按照考试人数排个名(群众的选择大部分时候都是有道理的)
1.数据先抓取下来
#encoding=utf8
import urllib
import urllib.request
import http.cookiejar
from bs4 import BeautifulSoup
# 导入SQLite驱动:
import sqlite3
User_Agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0'
header = {}
header['User-Agent'] = User_Agent
vyear='201601'
urlFile = urllib.request.urlopen('https://sn.122.gov.cn/publicitypage?size=20&fzjg=%E9%99%95A&page=1&tjyf='+vyear)
data = urlFile.read()
urlFile.close()
data = data.decode('utf-8',errors='ignore')
print("get page success")
pre = "<div id='pagination'><span class=''>共"
index1 = data.find(pre) + len(pre)
index2 = data.find('条记录', index1)
ebi=data[index1 : index2];
print("总条数:"+ebi)
totalPage="";
if (int(data[index1 : index2]))%20>0:
totalPage=str(int(int(data[index1 : index2])/20+1))
else:
totalPage=str(int(int(data[index1 : index2])/20))
print("totalpage:"+totalPage);
def getList(pageId):
hosturl = 'https://sn.122.gov.cn/publicitypage?size=20&fzjg=%E9%99%95A&tjyf='+vyear+'&page='+pageId
print("getcontent:"+hosturl);
request = urllib.request.Request(hosturl, None, header)
response = urllib.request.urlopen(request)
htmldata = response.read()
soup = BeautifulSoup(htmldata,"html.parser")
trs = soup.findAll('tr')
for x in range(1,len(trs)):
tr = trs[x]
tds = tr.findAll("td")
savejiaxiao(tds[0].contents[0],tds[1].contents[0],tds[2].contents[0],tds[3].contents[0],tds[4].contents[0],tds[5].contents[0]);
def savejiaxiao(t_date, t_name,t1,t2,t3,t4):
conn = sqlite3.connect('jiaxiao.db')
cursor = conn.cursor()
# 执行一条SQL语句,创建user表:
cursor.execute('create table IF NOT EXISTS kaoshi (id INTEGER PRIMARY KEY, t_date varchar(100),t_name varchar(100),t1 varchar(100),t2 varchar(100),t3 varchar(100),t4 varchar(100) )')
cursor.execute('select * from kaoshi where t_date=\''+t_date+'\' and t_name=\''+t_name+'\' and t1=\''+t1+'\' and t2=\''+t2+'\' and t3=\''+t3+'\' and t4=\''+t4+'\' ')
values=cursor.fetchall()
if len(values) > 0:#以前就存在
print('已经存在:'+t_date+','+t_name)
else:
try:
cursor.execute('insert into kaoshi (t_date, t_name,t1,t2,t3,t4) values(?,?,?,?,?,?)',(t_date,t_name,t1,t2,t3,t4))
print("save success."+t_date+','+t_name)
except Exception as e:
print('except:', e)
# 关闭Cursor:
cursor.close()
# 提交事务:
conn.commit()
# 关闭Connection:
conn.close()
for x in range(1,int(totalPage)+1): #代表从1到totalPage+1(不包含totalPage)
errorLink=[]
try:
getList(str(x))
except Exception as e:
print('except:', e)
errorLink.append(x)
print("errorLink:"+str(errorLink));
2.把数据导出到excel里
# -*- coding:utf-8 -*-
import xlwt
# 导入SQLite驱动:
import sqlite3
def export():
conn = sqlite3.connect('jiaxiao2016.db')
cursor = conn.cursor()
cursor.execute('select * from kaoshi')
values = cursor.fetchall()
i=0
book=xlwt.Workbook()
sheet1=book.add_sheet('驾校')
for line in values:
row = sheet1.row(i)
row.write(0,line[0])#id
row.write(1,line[1])#title
row.write(2,line[2])#link
row.write(3,line[3])
row.write(4,line[4])
row.write(5,line[5])
row.write(6,line[6])
sheet1.col(1).width = 5000
i=i+1
book.save('驾校2016.xls')
cursor.close()
conn.commit()
conn.close()
export()
3.在excel里做个透视图,按月汇总并排序下,得到结果如下
西安西高驾校科二考场 1870 1241 1158 1095 5364
北所科目一考场 1593 1227 1088 863 4771
郊县分所蓝田县科三考试场 3887 210 343 256 4696
郊县长安科目一考场 2332 1844 15 282 4473
西安支队尚稷路科三考场 2263 764 834 567 4428
西安科目三咸阳吉祥考场 868 928 766 1488 4050
东所科目一考场 2664 300 230 159 3353
西安小型车科目三铜川考场 1424 574 557 588 3143
西安支队户县科目三考场 2107 186 343 338 2974
南所科目一考场 1397 473 511 433 2814
西安天平驾校科二考场 1457 456 890 2803
西安博安驾校科目二考场 842 313 935 417 2507
西安华津驾校科二考场 1494 120 422 291 2327
西所科目一考场 1560 262 193 208 2223
郊县阎良科目一考场 1708 226 1934
西安小型车科目三咸阳西郊考场 5 186 544 783 1518
郊县户县科目一考场 1188 247 21 33 1489