用 requests 爬取公交

import requests
import time
from bs4 import BeautifulSoup
import json

# Desktop Chrome User-Agent sent with every request so the site does not
# reject the crawler as an obvious bot.
headers = {
   'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36',
}

def parse_first_page(url):
   """Fetch the site index and collect the category-page links.

   The index groups bus routes by leading digit (.bus_kt_r1) and
   leading letter (.bus_kt_r2); this returns the absolute URL of
   every such category page, digits first.
   """
   response = requests.get(url, headers=headers)
   soup = BeautifulSoup(response.text, 'lxml')
   # Digit-indexed anchors first, then letter-indexed ones.
   anchors = soup.select('.bus_kt_r1 > a') + soup.select('.bus_kt_r2 > a')
   # The hrefs are site-relative; join them onto the root URL.
   base = url.rstrip('/')
   return [base + anchor['href'] for anchor in anchors]

def parse_second_page(url, href):
   """Fetch one category page and collect every route-detail link.

   `href` is the category page to download; `url` is the site root
   used to absolutize the relative hrefs found there.
   """
   response = requests.get(url=href, headers=headers)
   soup = BeautifulSoup(response.text, 'lxml')
   base = url.rstrip('/')
   # Each anchor under #con_site_1 points at one bus route's detail page.
   return [base + anchor['href'] for anchor in soup.select('#con_site_1 > a')]

def _strip_label(text, label):
   """Remove `label` from the start of `text` if it is an exact prefix.

   str.lstrip(label) treats its argument as a *character set*, not a
   prefix, so it could also eat leading characters of the value itself
   (e.g. a value starting with ':' or any character of the label).
   """
   if text and text.startswith(label):
      return text[len(label):]
   return text

def parse_third_page(href, fp):
   """Scrape one route-detail page and append its record to `fp`.

   Writes one JSON object per line (Chinese keys, non-ASCII preserved)
   containing the route name, operating hours, fare, company, update
   time, and the up/down direction stop lists.
   """
   r = requests.get(href, headers=headers)
   soup = BeautifulSoup(r.text, 'lxml')
   # Route name
   route_name = soup.select('.bus_i_t1 > h1')[0].string
   print('正在爬取---%s---...' % route_name)
   info_list = soup.select('.bus_i_content > p')
   # Operating hours — BUG FIX: was .lstrip('运行时间:'), which strips a
   # character set rather than the exact label prefix.
   run_time = _strip_label(info_list[0].string, '运行时间:')
   # Fare information
   price_info = _strip_label(info_list[1].string, '票价信息:')
   # Operating company
   company = soup.select('.bus_i_content > p > a')[0].string
   # Last update time
   update_time = _strip_label(info_list[-1].string, '最后更新:')
   # Stop count for the "up" direction, e.g. '共23站' -> '23'
   # (strip('共站') is safe here: both characters sit at the ends only).
   up_total = soup.select('.bus_line_top > span')[0].string.strip('共站').strip()
   number = int(up_total)
   # The first `number` stop anchors belong to the up direction,
   # the remainder to the down direction.
   stop_anchors = soup.select('.bus_site_layer > div > a')
   up_name_list = [anchor.string for anchor in stop_anchors[:number]]
   down_anchors = stop_anchors[number:]
   down_total = len(down_anchors)
   down_name_list = [anchor.string for anchor in down_anchors]

   # Collect everything into one record and write it as a JSON line.
   item = {
      '线路名称': route_name,
      '运行时间': run_time,
      '票价信息': price_info,
      '公交公司': company,
      '更新时间': update_time,
      '上行个数': up_total,
      '上行站牌': up_name_list,
      '下行个数': down_total,
      '下行站牌': down_name_list,
   }
   fp.write(json.dumps(item, ensure_ascii=False) + '\n')
   print('结束爬取---%s---' % route_name)

def main():
   """Crawl every Beijing bus route and save one JSON line per route.

   Output goes to '北京.txt' in the working directory, UTF-8 encoded.
   """
   url = 'http://beijing.8684.cn/'
   number_char_list = parse_first_page(url)
   # `with` guarantees the output file is closed even if a request
   # raises mid-crawl (the original leaked the handle on exceptions).
   with open('北京.txt', 'w', encoding='utf8') as fp:
      # Visit every digit-/letter-indexed category page.
      for href in number_char_list:
         bus_href_list = parse_second_page(url, href)
         # Visit each route-detail page and record its info.
         for href_detail in bus_href_list:
            parse_third_page(href_detail, fp)

# Run the crawler only when executed as a script, not on import.
if __name__ == '__main__':
   main()

你可能感兴趣的:(用 requests 爬取公交)