本文讲解的是通过Python语言实现12306火车票信息爬取的实例
主要思路为:通过查询接口获取网页信息 → 找出信息中的规律 → 对信息进行处理(主要是对字符串的处理) → 提炼相关信息 → 输出相关信息
在这里,相关接口有两类:
1、https://kyfw.12306.cn/otn/resources/js/framework/station_name.js?station_version=1.9006,在这个网页上可以获取所有车站对应的代码信息;
2、https://kyfw.12306.cn/otn/leftTicket/query?leftTicketDTO.train_date=2018-06-18&leftTicketDTO.from_station=XAY&leftTicketDTO.to_station=ZSY&purpose_codes=ADULT,在这个网页上可以获取相关车次的信息(车次、站点、时间、票种等)。其中,链接中的2018-06-18、XAY、ZSY、ADULT均可更改,分别表示出发日期、出发站、到达站、乘客类型(ADULT表示成人,学生用0X00表示)。由于参数的不同,网页信息也会不同,以下为三种情况:
①正常情况下:
{"validateMessagesShowId":"_validatorMessage","status":true,"httpstatus":200,"data":{"result":["TrzzTVI8faB5h%2Fl%2FbI7bI8kzLbyZkbSR1gUom0cuZ3hSUgPHzhUNvUCiEeTL0hRvvVd2EaebUhSo%0AfwwAVDSUC2DylMiWbx4Ymjz9IcTu0KcEJXFkvw%2FsdEMO4ePApiK%2BhejO5TzmpCS0HyO6ZkDNTsTX%0AQQzKalxJ0WrJ9i3%2BY5Z4PLKt%2Bj4Py4XGK3qruwD1rCj4tB%2FenF5QaaiVALz7yoPlF3BWeC6JWOxs%0AXtB%2FoWeG3A4NxkywoieelcjHTgo0bZTR6w%3D%3D|预订|410000K6260S|K626|XAY|HAN|XAY|ZSY|08:33|10:20|01:47|Y|zLCzbUTvWTf2xEVVJz0KoBnJivTWR3C5Vlb%2BhKByvpXbr%2FbcIQRGHqCoAeE%3D|20180618|3|Y2|01|03|0|0||||2|||有||有|有|||||10401030|1413|0","Oh%2BkGpiSKiINjNP0UqBfRkU1iJHmw4h34knFjzias4jsK6f9OVjz%2F7TS3qhYJmqisfNJRzB9%2F9yy%0A1k%2BCYvh6J2h%2FdjCI3zhV3CCiTEWnuJmQXRwDM9LbFZ583APNpJS%2B%2FZJL9wpbGyH8REMevM5bwXPq%0AF0pHnTfyJMO5SvY8DDTolp48Oqp5brZpPjjcYXlUzvUEJaDjkEQEDs1sZGIkRrZsMRfS4XQAJ0%2F6%0ASRWrXr613t0QYK40hY8sCdLAP0tnJkDKng%3D%3D|预订|41000K10320V|K1032|XAY|GIW|XAY|ZSY|18:30|20:20|01:50|Y|J4oPbdkn2DaSdC%2BgZ8ZdFV%2Bm9D2YV%2BjUtom8C9PH6PKsmZ8S2EshXr8eMgE%3D|20180618|3|Y2|01|03|0|0||||无|||有||有|有|||||10401030|1413|1"],"flag":"1","map":{"ZSY":"柞水","XAY":"西安"}},"messages":[],"validateMessages":{}}
②无直达车次:
{"validateMessagesShowId":"_validatorMessage","status":true,"httpstatus":200,"data":{"result":[],"flag":"1","map":{}},"messages":[],"validateMessages":{}}
③查询日期不在预售日期范围内:
{"validateMessagesShowId":"_validatorMessage","status":true,"httpstatus":200,"messages":["选择的查询日期不在预售日期范围内"],"validateMessages":{}}
在这里仅考虑可以查询到相关信息的情况,以2018-06-18从西安到柞水为例,网页信息如下:
{"validateMessagesShowId":"_validatorMessage","status":true,"httpstatus":200,"data":{"result":["JpEXEoUdIb544ps6AtjuprGOho8W4Bp5C65ccpT%2B9K1%2FZTb%2FRUxCiswpOGmZCcL0mTrTbp5obNwQ%0AiECpdH%2B5lHRYHZt2MXBvGD8E4EYc4vgVNBMWu6vwUO53QvuWCbiO1PA3k9hrYxGfOTNXMP5xTp3e%0ACUlFtW6WeUjYoFm2SKn8OMgdTgeKFmmDei1AtNAd2GmAhkYuQloG8qN9uG7miwKANv0dp%2BBhpT0L%0AX3LYpq3W5FWbjGTaaDgrq57ZnQE%2BimVxIw%3D%3D|预订|410000K6260S|K626|XAY|HAN|XAY|ZSY|08:33|10:20|01:47|Y|m1Je80fs7wMFkAVoP2gOJ1qyJ5VDrxiEIT0Ggn2kAG8KsB%2BrE566zP3LQS4%3D|20180618|3|Y2|01|03|0|0||||3|||有||有|有|||||10401030|1413|0","TRuC25LaE%2FLJ341OrQ8kC5g0rgs3jkw5OAVizuKAPSGWf1abFk%2Fjm%2FiZYwNHFlndFsN9ab8sO2Cb%0AL2ucgWTPZQgMGYS9BgUonG7qQMl0mFtPAb28YuTykuyacPjBWq2MuLJpg7sY0Qlp7i2xi15nTtKe%0Af9ZQZgLfyCPxkZFUIER44%2FTLofSt5UFQYXhI0zv3ayP27nbYUoO9wwSF4Yo4mlS8AnqtCq%2FuODcD%0AP3MnajuvBA9afOs2GfcahuOOkMohIOSl7A%3D%3D|预订|41000K10320V|K1032|XAY|GIW|XAY|ZSY|18:30|20:20|01:50|Y|vPLERj7PtbOdfMccILSQhpyN8l6pDbmq9ROzfVwhfuOb2ChkSj4kxxGodQM%3D|20180618|3|Y2|01|03|0|0||||无|||有||有|有|||||10401030|1413|1"],"flag":"1","map":{"ZSY":"柞水","XAY":"西安"}},"messages":[],"validateMessages":{}}
单看这段字符,不难看出这段字符串的排列类似一个字典,字典中又含有字典。列车的相关信息蕴含在以下一类字符串中:
"JpEXEoUdIb544ps6AtjuprGOho8W4Bp5C65ccpT%2B9K1%2FZTb%2FRUxCiswpOGmZCcL0mTrTbp5obNwQ%0AiECpdH%2B5lHRYHZt2MXBvGD8E4EYc4vgVNBMWu6vwUO53QvuWCbiO1PA3k9hrYxGfOTNXMP5xTp3e%0ACUlFtW6WeUjYoFm2SKn8OMgdTgeKFmmDei1AtNAd2GmAhkYuQloG8qN9uG7miwKANv0dp%2BBhpT0L%0AX3LYpq3W5FWbjGTaaDgrq57ZnQE%2BimVxIw%3D%3D|预订|410000K6260S|K626|XAY|HAN|XAY|ZSY|08:33|10:20|01:47|Y|m1Je80fs7wMFkAVoP2gOJ1qyJ5VDrxiEIT0Ggn2kAG8KsB%2BrE566zP3LQS4%3D|20180618|3|Y2|01|03|0|0||||3|||有||有|有|||||10401030|1413|0"分析该字符串得出:不同信息之间均以字符“|”分开,这段字符串中共有36个“|”,其中第三第四个“|”之间包含的是车次、第四第五个“|”之间包含的是始发站……第三十三第三十四个“|”之间包含的是动卧的信息。了解了这些规律,就可以通过对字符串的操作提取出相关信息并存储。
信息提取完之后的输出效果如下:
火车票信息爬取源码如下:
# encoding:utf-8
import re
from datetime import datetime

import requests
from bs4 import BeautifulSoup
# Request headers sent with every HTTP call made by this script.
kv = {'user-agent': 'Mozilla/5.0'}
# JavaScript resource from which the station name / station code pairs are extracted.
url1 = 'https://kyfw.12306.cn/otn/resources/js/framework/station_name.js?station_version=1.9006'
# A timeout keeps the script from hanging forever at start-up if the server never answers.
response1 = requests.get(url1, headers=kv, timeout=10)
# Capture every "<Chinese name>|<UPPERCASE code>" pair.  The pattern is a raw
# string because '\|' is not a valid escape in a normal string literal; the
# \uXXXX escapes are still understood, as the `re` module interprets them itself.
stations = re.findall(r'([\u4e00-\u9fa5]+)\|([A-Z]+)', response1.text)
sta_cod = dict(stations)  # station name -> station code
cod_sta = {v: k for k, v in sta_cod.items()}  # station code -> station name
# Ticket query endpoint; the query-string parameters are supplied separately.
QUERY_URL = 'https://kyfw.12306.cn/otn/leftTicket/query'

# Text shown when a seat-class field of a record is blank.
NO_SUCH_SEAT = '--'

# Index of each value inside one '|'-separated result record
# (index n is the text between the n-th and (n+1)-th '|').
TRAIN_FIELDS = {
    'train_no': 3,
    'origin': 4,
    'terminus': 5,
    'depart_station': 6,
    'arrive_station': 7,
    'depart_time': 8,
    'arrive_time': 9,
    'duration': 10,
    'bookable': 11,
    'date': 13,
    'deluxe_soft_sleeper': 21,
    'soft_sleeper': 23,
    'soft_seat': 24,
    'no_seat': 26,
    'hard_sleeper': 28,
    'hard_seat': 29,
    'second_class': 30,
    'first_class': 31,
    'business_class': 32,
    'emu_sleeper': 33,
}

# Fields holding station codes that are translated to station names.
STATION_FIELDS = ('origin', 'terminus', 'depart_station', 'arrive_station')

# Fields holding seat availability; blank values become NO_SUCH_SEAT.
SEAT_FIELDS = (
    'deluxe_soft_sleeper', 'soft_sleeper', 'soft_seat', 'no_seat',
    'hard_sleeper', 'hard_seat', 'second_class', 'first_class',
    'business_class', 'emu_sleeper',
)

# One output row: 14 centred, tab-separated cells.
ROW_TEMPLATE = "\t".join(["{:^5}"] * 14)

HEADER_LABELS = ("车次", "车站", "时间", "历时", "商务座", "一等座", "二等座",
                 "高级软卧", "软卧", "动卧", "硬卧", "软座", "硬座", "无座")


def parse_travel_date(raw):
    """Convert a 'YYYYMMDD' string to 'YYYY-MM-DD'.

    Returns None when the text is not exactly eight digits or does not name
    a real calendar date (e.g. day 00, June 31, Feb 29 of a non-leap year).
    """
    raw = raw.strip()
    if len(raw) != 8 or not raw.isdigit():
        return None
    try:
        # strptime performs the month-length and leap-year checks.
        return datetime.strptime(raw, '%Y%m%d').date().isoformat()
    except ValueError:
        return None


def resolve_station(name, name_to_code):
    """Return the station code for *name*, or None when it is unknown.

    A trailing '站' typed by the user is tolerated ('西安站' -> '西安').
    """
    name = name.strip()
    if name in name_to_code:
        return name_to_code[name]
    if name.endswith('站'):
        return name_to_code.get(name[:-1])
    return None


def parse_train_record(record, code_to_name):
    """Split one '|'-separated result record into a dict keyed by TRAIN_FIELDS.

    Station codes are replaced by names when *code_to_name* knows them (the
    code itself is kept otherwise), blank seat fields become NO_SUCH_SEAT, and
    fields missing from a short record are treated as blank.
    """
    fields = record.split('|')
    train = {
        key: fields[index] if index < len(fields) else ''
        for key, index in TRAIN_FIELDS.items()
    }
    for key in STATION_FIELDS:
        train[key] = code_to_name.get(train[key], train[key])
    for key in SEAT_FIELDS:
        if not train[key].strip():
            train[key] = NO_SUCH_SEAT
    return train


def format_train_rows(train):
    """Return the two display lines (departure row, arrival row) for a train."""
    departure = ROW_TEMPLATE.format(
        train['train_no'], train['depart_station'], train['depart_time'],
        train['duration'], train['business_class'], train['first_class'],
        train['second_class'], train['deluxe_soft_sleeper'],
        train['soft_sleeper'], train['emu_sleeper'], train['hard_sleeper'],
        train['soft_seat'], train['hard_seat'], train['no_seat'])
    # The second line only carries the arrival station and arrival time.
    arrival = ROW_TEMPLATE.format(
        "", train['arrive_station'], train['arrive_time'], *([""] * 11))
    return departure, arrival


def prompt_query(name_to_code):
    """Ask for date, origin and destination until all three are valid.

    Returns a tuple (travel_date, from_code, to_code) where travel_date is
    'YYYY-MM-DD' and the other two are station codes.  All three values are
    re-validated from scratch on every attempt.
    """
    while True:
        raw_date = input("请输入出发时间(格式:20180131):")
        raw_from = input("请输入出发站:")
        raw_to = input("请输入到达站:")
        travel_date = parse_travel_date(raw_date)
        from_code = resolve_station(raw_from, name_to_code)
        to_code = resolve_station(raw_to, name_to_code)
        if travel_date is None:
            print('出发日期输入错误!')
        if from_code is None:
            print('出发站输入错误!')
        if to_code is None:
            print('到达站输入错误!')
        if None not in (travel_date, from_code, to_code):
            return travel_date, from_code, to_code


def main():
    """Repeatedly prompt for a journey, query the endpoint and print the trains.

    Uses the module-level ``kv`` headers and the ``sta_cod`` / ``cod_sta``
    station tables.  The loop ends on end-of-input or Ctrl-C.
    """
    while True:  # allow several queries in one run
        try:
            travel_date, from_code, to_code = prompt_query(sta_cod)
        except (EOFError, KeyboardInterrupt):
            print("")
            break

        # Parameter order is kept the same as in the original query URL.
        params = {
            'leftTicketDTO.train_date': travel_date,
            'leftTicketDTO.from_station': from_code,
            'leftTicketDTO.to_station': to_code,
            'purpose_codes': 'ADULT',
        }
        try:
            response2 = requests.get(QUERY_URL, params=params, headers=kv, timeout=10)
            # Decode the body as JSON; never eval() text received from the network.
            payload = response2.json()
        except (requests.RequestException, ValueError):
            print('查询失败,请稍后重试!\n')
            continue
        if not isinstance(payload, dict):
            print('查询失败,请稍后重试!\n')
            continue

        if payload.get('messages'):
            print('选择的查询日期不在预售日期范围内!\n')
            continue
        records = (payload.get('data') or {}).get('result') or []
        if not records:
            print('很抱歉,按您的查询条件,当前未找到从 {:} 到 {:} 的列车!\n'.format(
                cod_sta.get(from_code, from_code), cod_sta.get(to_code, to_code)))
            continue

        print(ROW_TEMPLATE.format(*HEADER_LABELS))
        for record in records:
            for line in format_train_rows(parse_train_record(record, cod_sta)):
                print(line)
        print("")


if __name__ == "__main__":
    main()
初学Python,如有错误,欢迎指出!
如需转载请联系,谢谢!