对12306车票数据的提取

前言

之前写了12306的爬虫,当时可以运行,最近发现不行,返回的数据不对,可能是cookie有问题(可能需要某些参数,我也不确定,但没有cookie,数据请求不到。。。),修改之后,成功。

对于返回的数据进行了提取,并再次发送请求,得到车票的其他数据。

1,获得火车的经过的站及相关数据。

2,获得火车的票价。

本来觉得应该写窗体的,自己实力不行,有待学习,以后再来完成。

开始

得到数据

发送请求

该写的都写上,cookie,headers之类的。直接给出代码

# a 传入时间(格式如 2023-01-10)
# b,出发地的代号
# c 目的地代号,
url = f'https://kyfw.12306.cn/otn/leftTicket/queryZ?leftTicketDTO.train_date={a}&leftTicketDTO.from_station={b}&leftTicketDTO.to_station={c}&purpose_codes=ADULT'
            self.headers = {
                'Cookie': f'_jc_save_toStation={b}',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.76',
            }
            r = requests.get(url=url, headers=self.headers)
            results = r.json()['data']['result']
#  result 就是数据
#  下面进行分析

解释

参数

需要提供的参数有

1,出发时间

2,出发地代号

3,目的地代号

参数的获取
    """
城市代码
url=https://kyfw.12306.cn/otn/resources/js/framework/station_name.js?station_version=1.9002
怎么获得
1,打开开发者工具(中文)
2,源代码
3,在js中
    """

获取到我保存到csv文件中,也可以放到数据库中。

用pandas提取csv中的数据,即参数

返回的数据

返回的数据是以"|"进行分割的,如下

data="vdeVLhSURm4Q1RP66du87ohMRCQ3MLwq0kfa3TJCnHalBU63NmdSl3%2Bj0rKw2GIMFpDmXzicLXbe%0Arj7VwfiSslzB2E330A4AhBXW%2FSiLQW29lNCn2ZoamKWMVLFSt9f7vxwjtgMqvoqMpleoviO0aZOG%0AX75YLCGxZv%2Bj8obzGYdP%2FjDdI1CxxnTonzzXG7GI0pM0YEwsrGSYPgTnlXRAjzA6WrqZn%2BVHxaQF%0AgL0vdZ2sxItfSR2yGSAYacvObVlZXkCJoHAJjrCXT25HnBhsUCrO%2FRQ%2F97qKbBcN5F3JnPxWZZzw%0AfwVu%2F0DAqHBn69up|预订|76000G219505|G2195|ICW|AOH|ICW|CWQ|06:10|13:22|07:12|Y|dHywp8bnO4BIQNw1cH%2FaVsWHeAoP7nmv6%2FF4lR7Tvkkl2UtE|20230114|3|W2|01|14|1|0|||||||||||有|5|2||90M0O0|9MO|1|0||9178550002M095200005O057850021|0|||||1|0#0#0#0#z||"

用split进行分开,得到一个列表,里面有49个元素

data.split('|')

配合enumerate 函数找到有用的数据,经过多次分析,其中有20条数据是有用的

给出数据和对应的含义

"""
#t就是data
                train_no = t[2]   #火车参数
                che = t[3]  # 车次
                startcode = t[4] #出发地代号
                endcode = t[5]  # 目的地代号
                from_station_no = t[16] # 出发站在该车次中的站序(示例数据中为 01),查询票价时使用
                to_station_no = t[17]  # 到达站在该车次中的站序(示例数据中为 14),查询票价时使用
                seat_types = t[35] # 座位类型
                starttime = t[8]  # 出发时间
                endtime = t[9] # 到站时间
                duration_time = t[10]  # 持续时间
                special_shop_seat = t[32] or t[25] or '--' # 商务座/特等座,二者数据所处位置不一样
                first_seat = t[31] or '--' # 一等座
                second_seat = t[30] or '--' # 二等座
                high_sleep = t[21] or '--' # 高级软卧
                soft_sleep = t[23] or '--' # 软卧
                dong_sleep = t[33] or '--' # 动卧
                hard_sleep = t[28] or '--' # 硬卧
                sort_seat = t[24] or '--' # 软座
                hart_seat = t[29] or '--' # 硬座
                no_seat = t[26] or '--' # 站票
"""

把数据用prettytable进行展示,并且把这些数据存到一个新的列表中,为后来的操作提供需要的参数。

# import prettytable as pt
# tb=pt.PrettyTable() 
# 列名如下,和注释可能不一样,不重要,数据一样的。
# 新增加了序号,好选择火车
# Column titles, one per value and in the same order as the row that is added
# for each train: ..., sleeper (动卧), hard sleeper (硬卧), soft seat (软座),
# hard seat (硬座), standing (站票).
tb.field_names = ['序号', '火车参数', '车次', '出发地代号', '终点代号', '出发地代码', '终点代码', '座位信息', '开始时', '结束时', '持续时间', '商务座/特等座', '一等座', '二等座', '高级软卧', '软卧', '动卧', '硬卧', '软座', '硬座', '站票']
# 如图
对12306车票数据的提取_第1张图片

数据太多,没对齐。。。

获取路途的站及相关数据

发送请求

# a 火车参数
# b 起点站的代号
# c 终点站代号
# d 时间
 url_1 = f'https://kyfw.12306.cn/otn/czxx/queryByTrainNo?train_no={a}&from_station_telecode={b}&to_station_telecode={c}&depart_date={d}'
resp = requests.get(url=url_1, headers=self.headers)
data = resp.json()['data']['data']

返回了数据,进行提取,用prettytable来表示

tb.field_names = (['到站时间', '站点', '站数', '停留时间', '出站时间'])

没有多说的。唯一要说的就是代号,是火车发车的起点代号。

展示一下

对12306车票数据的提取_第2张图片

获得票价

发送请求

#  a 火车参数
#  b 出发地代号
#  c 目的地代号
#  d  座位的类型
#  e 时间
#  注意代号的不同
url = f'https://kyfw.12306.cn/otn/leftTicket/queryTicketPrice?train_no={a}&from_station_no={b}&to_station_no={c}&seat_types={d}&train_date={e}'
        r = requests.get(url=url, headers=self.headers)

数据的分析

不同车次的座位类型不同,票价不一样,返回的数据(字段)也不同。

下面按座位类型 d(即前面提取的 seat_types)分别处理:

        if d == '1341':
            tb.field_names = (['软卧价格', '硬卧价格', '硬座价格', '无座价格'])
            soft_sleep_price = r.json()['data']['A4']
            hard_sleep_price = r.json()['data']['A3']
            hart_seat_price = r.json()['data']['A1']
            no_seat_price = r.json()['data']['WZ']
            tb.add_row([soft_sleep_price, hard_sleep_price, hart_seat_price, no_seat_price])
            return tb
        if d == '1346':
            tb.field_names = (['高级软卧软卧价格', '硬卧价格', '硬座价格', '无座价格'])
            high_soft_sleep_price = r.json()['data']['A6']
            soft_sleep_price = r.json()['data']['A4']
            hard_sleep_price = r.json()['data']['A3']
            hard_seat_price = r.json()['data']['A1']
            tb.add_row([high_soft_sleep_price, soft_sleep_price, hard_sleep_price, hard_seat_price])
            return tb
        if d == '134':
            tb.field_names = (['软卧价格', '硬卧价格', '硬座价格'])
            soft_sleep_price = r.json()['data']['A4']
            hard_sleep_price = r.json()['data']['A3']
            hart_seat_price = r.json()['data']['A1']
            tb.add_row([soft_sleep_price, hard_sleep_price, hart_seat_price])
            return tb

        if d == '9MO' or 'OM9':
            tb.field_names = (['商务座价格', '一等座', '二等座'])
            special_price = r.json()['data']['A9']
            first_seat_price = r.json()['data']['M']
            second_seat = r.json()['data']['O']
            tb.add_row([special_price, first_seat_price, second_seat])
            return tb
        if d == 'MOO':
            tb.field_names = (['一等座', '二等座'])
            first_seat_price = r.json()['data']['M']
            second_seat = r.json()['data']['O']
            tb.add_row([first_seat_price, second_seat])
            return tb
        if d=='FOO':
            tb.field_names = (['动卧', '二等座','无座'])
            dong_sleep = r.json()['data']['F']
            second_seat = r.json()['data']['O']
            no_seat = r.json()['data']['O']
            tb.add_row([dong_sleep, second_seat,no_seat])
            return tb
        if d=='FO':
            tb.field_names = (['动卧', '二等座'])
            dong_sleep = r.json()['data']['F']
            second_seat = r.json()['data']['O']
            tb.add_row([dong_sleep, second_seat])
            return tb
        if d=='F':
            tb.field_names = (['动卧'])
            dong_sleep = r.json()['data']['F']
            tb.add_row([dong_sleep])
            return tb

        if d=='MOP':
            tb.field_names = (['特等座', '二等座','一等座'])
            special_seat = r.json()['data']['P']
            second_seat = r.json()['data']['O']
            first_seat = r.json()['data']['M']
            tb.add_row([special_seat, second_seat,first_seat])
            return tb
        if d=='IJO':
            tb.field_names = (['二等座', '软卧','硬卧'])
            soft_sleep = r.json()['data']['AI']
            second_seat = r.json()['data']['O']
            hard_sleep = r.json()['data']['AJ']
            tb.add_row([second_seat, soft_sleep,hard_sleep])
            return tb

d有很多种,会随实际情况发生变化,我感觉还没有全部遇到,还有其他情况。。。

而且这样写,重复的代码实在太多,有待修改,有些情况应该还没遇到,还有其他座位类型。

展示一下,有待修改。

对12306车票数据的提取_第3张图片

操作一下

视频

不知道怎么把视频传到csdn上,传到b站上了,顺便开始当up主,哈哈哈哈哈哈

视频

总结

有待修改,还要结合pyqt5,不然终究感觉少了什么

源码

import requests
import prettytable as pt
import pandas as pd
import datetime

# Single module-level table shared by get_data(), price() and road(); the
# callers (success() and main()) call tb.clear() after each view is printed.
tb = pt.PrettyTable()


def today(a: int):
    """Return the local calendar date shifted by *a* days (0 = current date)."""
    shift = datetime.timedelta(days=a)
    return datetime.date.today() + shift


class ottzs:
    """Interactive console client for 12306 ticket queries.

    ``main`` asks for a date and two place names, lists the trains returned
    by the left-ticket query, and then lets the user print a train's stop
    list or its ticket prices. Everything is rendered through the
    module-level PrettyTable ``tb``.
    """

    # Seconds to wait for 12306 before giving up, so that a stalled
    # connection cannot block the prompt forever.
    REQUEST_TIMEOUT = 10

    # seat_types code -> (column titles, keys read from the price payload),
    # both in display order.
    _PRICE_LAYOUTS = {
        '1341': (['软卧价格', '硬卧价格', '硬座价格', '无座价格'],
                 ['A4', 'A3', 'A1', 'WZ']),
        '1346': (['高级软卧价格', '软卧价格', '硬卧价格', '硬座价格'],
                 ['A6', 'A4', 'A3', 'A1']),
        '134': (['软卧价格', '硬卧价格', '硬座价格'],
                ['A4', 'A3', 'A1']),
        '9MO': (['商务座价格', '一等座', '二等座'],
                ['A9', 'M', 'O']),
        'OM9': (['商务座价格', '一等座', '二等座'],
                ['A9', 'M', 'O']),
        'MOO': (['一等座', '二等座'],
                ['M', 'O']),
        # NOTE(review): the original code also read key 'O' for the
        # standing-ticket column; confirm whether 'WZ' was intended.
        'FOO': (['动卧', '二等座', '无座'],
                ['F', 'O', 'O']),
        'FO': (['动卧', '二等座'],
               ['F', 'O']),
        'F': (['动卧'],
              ['F']),
        'MOP': (['特等座', '二等座', '一等座'],
                ['P', 'O', 'M']),
        'IJO': (['二等座', '软卧', '硬卧'],
                ['O', 'AI', 'AJ']),
    }

    def __init__(self):
        # Request headers; filled in by get_data() and reused by price()/road().
        self.headers = None
        # Travel date chosen in main(); passed to the price and route queries.
        self.times = None

    def get_time(self):
        """Return the selectable dates keyed by menu number.

        Keys 0-14 map to ``today(0)`` .. ``today(14)``; key 15 maps to the
        string ``'-1'``, which ``main`` treats as "quit".
        """
        options = {offset: today(offset) for offset in range(15)}
        options[15] = '-1'
        return options

    def choose_time(self):
        """Show the date menu and return the entry the user picked.

        The menu is generated from ``get_time()`` so the two cannot drift
        apart. Non-numeric or out-of-range input re-prompts instead of
        raising ``ValueError``/``KeyError``.

        Returns:
            A ``datetime.date``, or the string ``'-1'`` for the quit entry.
        """
        options = self.get_time()
        lines = ['需要输入需要查询的车票的时间,序号如下.', '请选择序号', '序号----时间']
        for number, value in options.items():
            if value == '-1':
                lines.append(f'{number}-----退出')
            else:
                lines.append(f'{number}----{value}')
        lines.append('请输入序号:\n')
        prompt = '\n'.join(lines)
        while True:
            answer = input(prompt)
            try:
                return options[int(answer)]
            except (ValueError, KeyError):
                print('序号无效,请重新输入')

    # Station codes can be taken from
    # https://kyfw.12306.cn/otn/resources/js/framework/station_name.js?station_version=1.9002
    def get_code(self, a):
        """Look up the station code for place name *a* in the CSV at ``path``.

        The CSV is indexed by its '地点' column and the value of the '代码'
        column is returned. pandas raises ``KeyError`` when *a* is not in
        the index.
        """
        # NOTE(review): `path` is not defined in the visible part of this
        # file; it must be provided at module level before this is called.
        s = pd.read_csv(path, index_col='地点')
        return s.loc[str(a), '代码']

    @staticmethod
    def _parse_train(num, raw):
        """Turn one '|'-separated result string into a 21-item display row.

        *num* is the 1-based menu number shown to the user. Empty seat
        fields are replaced by '--'.
        """
        t = raw.split('|')
        return [
            num,
            t[2],    # train_no
            t[3],    # train number shown to passengers
            t[4],    # startcode
            t[5],    # endcode
            t[16],   # from_station_no
            t[17],   # to_station_no
            t[35],   # seat_types
            t[8],    # departure time
            t[9],    # arrival time
            t[10],   # duration
            t[32] or t[25] or '--',  # business / premier seat (two positions)
            t[31] or '--',           # first class
            t[30] or '--',           # second class
            t[21] or '--',           # deluxe soft sleeper
            t[23] or '--',           # soft sleeper
            t[33] or '--',           # sleeper (动卧)
            t[28] or '--',           # hard sleeper
            t[24] or '--',           # soft seat
            t[29] or '--',           # hard seat
            t[26] or '--',           # standing
        ]

    def get_data(self, a, b, c):
        """Query remaining tickets and print them as a table.

        Args:
            a: travel date, interpolated into the URL as-is.
            b: departure station code.
            c: destination station code.

        Returns:
            A list of 21-item rows (see ``_parse_train``) on success.
            On any failure the caught exception object is returned instead
            of being raised; ``main`` treats every non-list result as
            "no data".
        """
        try:
            url = f'https://kyfw.12306.cn/otn/leftTicket/queryZ?leftTicketDTO.train_date={a}&leftTicketDTO.from_station={b}&leftTicketDTO.to_station={c}&purpose_codes=ADULT'
            self.headers = {
                'Cookie': f'_jc_save_toStation={b}',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.76',
            }
            r = requests.get(url=url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
            results = r.json()['data']['result']
            acc = [self._parse_train(num, raw) for num, raw in enumerate(results, start=1)]

            # Titles are listed in exactly the order _parse_train emits values.
            tb.field_names = ['序号 ', '火车参数     ', '车次   ', '出发地代号   ', '终点代号  ', '出发地代码  ', '终点代码  ', '座位信息 ', '开始时   ', '结束时  ', '  持续时间  ', '商务座/特等座   ', '一等座   ', '二等座 ', '高级软卧   ', '软卧   ', '动卧  ', '硬卧   ', '软座   ', '硬座  ', '站票   ']
            for row in acc:
                # Hand the table its own copy so `acc` stays independent of it.
                tb.add_row(list(row))
            print(tb)
            return acc
        except Exception as e:
            # Deliberate best-effort boundary: the caller only checks
            # isinstance(result, list).
            return e

    @classmethod
    def _price_columns(cls, d, data):
        """Return ``(titles, values)`` for seat-type code *d* from price dict *data*.

        Known codes use ``_PRICE_LAYOUTS``. An unknown code falls back to
        showing every key/value of *data* unchanged, so new seat
        combinations still display something instead of failing.

        Raises:
            KeyError: a known layout names a key that *data* does not have.
        """
        layout = cls._PRICE_LAYOUTS.get(d)
        if layout is None:
            return list(data), list(data.values())
        titles, keys = layout
        return list(titles), [data[key] for key in keys]

    def price(self, a, b, c, d, e):
        """Fetch ticket prices for one train and load them into ``tb``.

        Args:
            a: train_no.
            b: from_station_no.
            c: to_station_no.
            d: seat_types code; selects which price columns are shown.
            e: travel date.

        Returns:
            The shared table ``tb`` holding a single row of prices.
        """
        url = f'https://kyfw.12306.cn/otn/leftTicket/queryTicketPrice?train_no={a}&from_station_no={b}&to_station_no={c}&seat_types={d}&train_date={e}'
        r = requests.get(url=url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        data = r.json()['data']  # parse the body once, not once per field
        titles, values = self._price_columns(d, data)
        if titles:
            tb.field_names = titles
            tb.add_row(values)
        return tb

    def road(self, a, b, c, d):
        """Fetch the stop list of one train and load it into ``tb``.

        Args:
            a: train_no.
            b: from_station_telecode.
            c: to_station_telecode.
            d: departure date.

        Returns:
            The shared table ``tb`` with one row per stop.
        """
        tb.field_names = (['到站时间', '站点', '站数', '停留时间', '出站时间'])

        url_1 = f'https://kyfw.12306.cn/otn/czxx/queryByTrainNo?train_no={a}&from_station_telecode={b}&to_station_telecode={c}&depart_date={d}'
        resp = requests.get(url=url_1, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        columns = ('arrive_time', 'station_name', 'station_no', 'stopover_time', 'start_time')
        for stop in resp.json()['data']['data']:
            tb.add_row([stop[key] for key in columns])
        return tb

    def _pick_train(self, e):
        """Ask for a 1-based train number; return that row of *e* or None.

        Rejects non-numeric and out-of-range input (including 0, which
        would otherwise index the last train through ``e[-1]``).
        """
        answer = input('请输入火车的序号:')
        try:
            position = int(answer) - 1
        except ValueError:
            position = -1
        if not 0 <= position < len(e):
            print('序号无效')
            return None
        return e[position]

    def success(self, e):
        """Run the per-result menu until the user enters '-1'.

        Args:
            e: the row list returned by ``get_data``.
        """
        while True:
            print(
                '0,重新选择\n'
                '1,查看价格\n'
                '2,火车路程\n'
            )
            f = input('请输入(-1退出):')
            if f == '-1':
                break
            if f not in ('1', '2'):
                # '0' and anything unrecognised just show the menu again.
                continue
            xunhao = self._pick_train(e)
            if xunhao is None:
                continue
            try:
                if f == '2':
                    table = self.road(xunhao[1], xunhao[3], xunhao[4], self.times)
                else:
                    table = self.price(xunhao[1], xunhao[5], xunhao[6], xunhao[7], self.times)
                print(table)
            except (requests.RequestException, KeyError, ValueError) as err:
                # Network failure, timeout, or a payload without the expected
                # keys: report it and keep the menu alive.
                print(f'查询失败:{err!r}')
            finally:
                # Always reset the shared table, even when the lookup failed.
                tb.clear()

    def main(self):
        """Top-level loop: pick a date, enter both places, query, browse."""
        while True:
            self.times = self.choose_time()
            if self.times == '-1':
                break
            a = input('输入出发地:')
            b = input('输入终点:')
            try:
                c = self.get_code(a)
                d = self.get_code(b)
            except KeyError:
                # The place name is not in the station CSV.
                print('没有找到对应的站点代码,请重新输入')
                continue
            e = self.get_data(self.times, c, d)
            tb.clear()
            if isinstance(e, list):
                self.success(e)
                continue
            print('没有返回数据!!!')
            choose = input('是否再次尝试\n1,是\n2,算了吧\n')
            if choose != '1':
                break

你可能感兴趣的:(python,开发语言)