# Crawl nationwide bus routes with Python --- the bs4 approach
#
# Demonstrates BeautifulSoup (bs4) parsing and JSON output, using the
# 8684.cn site as an example to scrape bus routes across China.

import requests
import time
from bs4 import BeautifulSoup
import json
from xpinyin import Pinyin

# Browser-like User-Agent so 8684.cn serves the normal HTML instead of
# rejecting the request as an obvious bot.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
}


# Level 1: the city index page.
def first_get(url, fp):
    """Fetch the city index page and visit every route-group link on it.

    Parameters:
        url: the city root URL (e.g. ``https://xian.8684.cn/``).
        fp: open file handle, passed through for the final JSON output.
    """
    response = requests.get(url=url, headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')
    # Two navigation bars: routes grouped by leading digit, and routes
    # grouped by pinyin initial. Walk both.
    anchors = soup.select('.bus_kt_r1 > a') + soup.select('.bus_kt_r2 > a')
    base = url.rstrip('/')
    for anchor in anchors:
        # Each href is site-relative; descend into the listing page.
        two_get(base + anchor['href'], fp, url)


# Level 2: a route-listing page.
def two_get(two_url, fp, url):
    """Fetch a second-level listing page and visit every bus route on it.

    Parameters:
        two_url: the listing-page URL to fetch.
        fp: open file handle, passed through to ``three_get`` for output.
        url: the city root URL used to build absolute route URLs.
    """
    second = requests.get(url=two_url, headers=headers)
    soup = BeautifulSoup(second.text, 'lxml')
    # Each anchor under .stie_list is one bus route's detail page.
    # (".stie_list" is the site's own class name, not a typo here.)
    bus_list = soup.select('.stie_list > a')
    # rstrip, not strip: only the trailing slash should go. The original's
    # strip('/') would also eat a leading '/', and was inconsistent with
    # first_get. Hoisted out of the loop since it never changes.
    base = url.rstrip('/')
    for href in bus_list:
        three_get(base + href['href'], fp)


def three_get(three_url, fp):
    # 访问页面
    three = requests.get(url=three_url, headers=headers)
    # 生成soup对象
    soup = BeautifulSoup(three.text, 'lxml')
    # 线路名称
    way_name = soup.select('.bus_i_t1 > h1')[0].text.strip(' ')
    # 运行时间
    bus_time = soup.select('.bus_i_content > .bus_i_t4')[0].text.strip('运行时间:')
    # 票价信息
    bus_price = soup.select('.bus_i_content > .bus_i_t4')[1].text.strip('票价信息:')
    # 公交公司
    bus_company = soup.select('.bus_i_content > .bus_i_t4 > a')[0].text
    # 上行总站数
    up_number = soup.select('.bus_line_top > .bus_line_no')[0].text.strip('共站').strip()

    # 获取上行总站牌
    up_number_name1 = soup.select('.bus_site_layer')
    up_number_name_list = []
    for i in range(0, len(up_number_name1) // 2):
        up_number_name2 = up_number_name1[i].select('a')
        # 将站牌存入list列表中
        for name in up_number_name2:
            up_number_name_list.append(name.text)
    try:
        # 下行总站数
        down_number = soup.select('.bus_line_top > .bus_line_no')[1].text.strip('共站').strip()
        # 获取下行总站牌
        down_number_name1 = soup.select(' .bus_site_layer')
        down_number_name_list = []
        for j in range(len(up_number_name1) // 2, len(up_number_name1)):
            down_number_name2 = down_number_name1[j].select('a')
            # 将站牌存入list列表中
            for name in down_number_name2:
                down_number_name_list.append(name.text)
    except Exception as s:
        down_number = '无下行线路'
        down_number_name_list = []
    print("正在抓取%s......" % way_name)
    item = {
        '线路名称': way_name,
        '运行时间': bus_time,
        '票价信息': bus_price,
        '公交公司': bus_company,
        '上行总站数': up_number,
        '获取上行总站牌': up_number_name_list,
        '下行总站数': down_number,
        '获取下行总站牌': down_number_name_list
    }
    string = json.dumps(item, ensure_ascii=False)
    fp.write(string + '\n')
    print("结束抓取%s......" % way_name)
    time.sleep(2)


def main():
    """Prompt for a Chinese city name, build its 8684.cn URL, and crawl it."""
    pin = Pinyin()
    # xpinyin renders e.g. 石家庄 as 'shi-jia-zhuang'.
    pinyin = pin.get_pinyin(input("输入所要查询的城市:"))
    # Join ALL syllables. The original kept only the first two
    # (split('-')[0] + split('-')[1]), which broke three-syllable cities
    # (石家庄 -> 'shijia' instead of 'shijiazhuang') and raised IndexError
    # for single-syllable input.
    city_name = pinyin.replace('-', '')
    url = 'https://{}.8684.cn/'.format(city_name)
    # NOTE(review): the output filename is hard-coded to 西安 regardless of
    # the city entered — kept for compatibility; consider naming it per city.
    # `with` guarantees the file is flushed and closed (the original leaked
    # the handle on any exception).
    with open('西安公交.txt', 'w', encoding='utf8') as fp:
        # Enter the level-1 index page.
        first_get(url, fp)


if __name__ == '__main__':
    main()

# (Blog footer: you may also be interested in — python web crawlers)