学会了Requests库的基本用法,接下来我想利用Requests来抓取火车票数据。
基本用法:
Python利用Requests库写爬虫(一)
根据观察,数据接口如下:
https://kyfw.12306.cn/otn/lcxxcx/query?purpose_codes=ADULT&queryDate=2015-05-23&from_station=NCG&to_station=CZQ
返回的是2015-5-23南昌到郴州的火车票信息,格式为json。
返回的数据如下(只截取了一部分):
{"validateMessagesShowId":"_validatorMessage","status":true,"httpstatus":200,"data":{"datas":[{"train_no":"5u000G140101","station_train_code":"G1401","start_station_telecode":"NXG","start_station_name":"南昌西","end_station_telecode":"IZQ","end_station_name":"广州南","from_station_telecode":"NXG","from_station_name":"南昌西","to_station_telecode":"ICQ","to_station_name":"郴州西","start_time":"07:29","arrive_time":"10:42","day_difference":"0","train_class_name":"","lishi":"03:13","canWebBuy":"Y","lishiValue":"193","yp_info":"O030850182M0507000009097450000","control_train_day":"20991231","start_train_date":"20150523","seat_feature":"O3M393","yp_ex":"O0M090","train_seat_feature":"3","seat_types":"OM9","location_code":"G2","from_station_no":"01","to_station_no":"11","control_day":59,"sale_time":"0930","is_support_card":"1","note":"","gg_num":"--","gr_num":"--","qt_num":"--","rw_num":"--","rz_num":"--","tz_num":"--","wz_num":"--","yb_num":"--","yw_num":"--","yz_num":"--","ze_num":"182","zy_num":"无","swz_num":"无"}}
看着很乱,我们稍加整理:
{
"validateMessagesShowId":"_validatorMessage",
"status":true,"httpstatus":200,
"data":{
"datas":[
{
"train_no":"5u000G140101",
"station_train_code":"G1401",
"start_station_telecode":"NXG",
"start_station_name":"南昌西",
"end_station_telecode":"IZQ",
"end_station_name":"广州南",
"from_station_telecode":"NXG",
"from_station_name":"南昌西",
"to_station_telecode":"ICQ",
"to_station_name":"郴州西",
"start_time":"07:29",
"arrive_time":"10:42",
"day_difference":"0",
...
"swz_num":"无"
},
{
...
}
]
}
}
这样就比较清晰了,代码如下,提取自己需要的信息。
#-*- coding:utf-8 -*-
import requests
import json
class trainTicketsSprider:
    """Fetch train listings from the 12306 ``lcxxcx/query`` JSON endpoint."""

    def getTicketsInfo(self, purpose_codes, queryDate, from_station, to_station):
        """Query the endpoint and return the list of train records.

        Args:
            purpose_codes: Value for the ``purpose_codes`` query parameter
                (e.g. ``'ADULT'``).
            queryDate: Travel date string, e.g. ``'2015-05-23'``.
            from_station: Departure station code, e.g. ``'NCG'``.
            to_station: Arrival station code, e.g. ``'CZQ'``.

        Returns:
            The value stored under ``["data"]["datas"]`` in the decoded JSON
            response, or ``None`` when the request or the decoding fails
            (the error is printed, not raised).

        Side effects:
            Sets ``self.url``, ``self.headers``, ``self.TicketSession``,
            ``self.resp_json`` and ``self.ticketsDatas``.
        """
        self.url = (
            'https://kyfw.12306.cn/otn/lcxxcx/query'
            '?purpose_codes=%s&queryDate=%s&from_station=%s&to_station=%s'
            % (purpose_codes, queryDate, from_station, to_station)
        )
        self.headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;",
            "Accept-Encoding": "gzip",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36"
        }
        self.TicketSession = requests.Session()
        # Disable HTTPS certificate verification.
        # NOTE(review): this removes protection against man-in-the-middle
        # attacks; confirm it is still required for this host.
        self.TicketSession.verify = False
        self.TicketSession.headers = self.headers
        try:
            self.resp_json = self.TicketSession.get(self.url)
            self.ticketsDatas = json.loads(self.resp_json.text)["data"]["datas"]
            return self.ticketsDatas
        except Exception as e:
            # Best-effort: report the failure and fall through to return None.
            # ``as e`` / ``print(e)`` are valid on both Python 2.6+ and 3,
            # unlike the former ``except Exception,e`` / ``print e``.
            print(e)
def isZero(num):
    """Return a printable remaining-ticket count for one seat-class field.

    Args:
        num: Raw field value from a train record, e.g. ``'182'``, ``'--'``
            or ``'无'``.

    Returns:
        ``'0'`` when ``num`` is one of the placeholders ``'--'`` or ``'无'``;
        otherwise ``num`` unchanged.
    """
    # The previous test ``num == '--' or '无'`` was always true, because the
    # non-empty literal '无' is truthy on its own, so every field became '0'.
    # Unicode literals are used because json.loads yields unicode text on
    # Python 2; they are equally valid on Python 3.
    if num in (u'--', u'无'):
        return '0'
    return num
def main():
    """Query one hard-coded route and date, then print a summary per train."""
    purpose_codes = 'ADULT'
    queryDate = '2015-05-23'
    from_station = 'NCG'
    to_station = 'CZQ'
    TicketSprider = trainTicketsSprider()
    res = TicketSprider.getTicketsInfo(purpose_codes, queryDate, from_station, to_station)
    # getTicketsInfo returns None when the request or JSON decoding fails;
    # iterate over an empty list in that case instead of raising TypeError.
    for ticketInfo in res or []:
        print(u"车次:%s" % ticketInfo["station_train_code"])
        # NOTE(review): this prints the train's origin (start_station_name)
        # while the destination line uses the queried stop (to_station_name);
        # confirm whether from_station_name was intended here.
        print(u"起始站:%s" % ticketInfo["start_station_name"])
        print(u"目的地:%s" % ticketInfo["to_station_name"])
        print(u"开车时间:%s" % ticketInfo["start_time"])
        print(u"到达时间:%s" % ticketInfo["arrive_time"])
        print(u"二等座还剩:%s张票" % isZero(ticketInfo["ze_num"]))
        print(u"硬座还剩:%s张票" % isZero(ticketInfo["yz_num"]))
        print(u"硬卧还剩:%s张票" % isZero(ticketInfo["yw_num"]))
        print(u"无座还剩:%s张票" % isZero(ticketInfo["wz_num"]))
        print(u"是否有票:%s" % ticketInfo["canWebBuy"])
        print("**********************************")


# Run the demo query only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Github地址