无论是出于工作需要,还是想在业余时间对网上的数据进行分析,爬虫都是我们必须要过的一道基本关。本次任务源于工作需要:我们要对整个XX市的公交站点进行爬取,获取其经纬度信息,便于后续数据的OD分析。按理说公交公司应该有这份数据,但其历史数据涵盖的信息并不全,只好寄希望于百度地图能多给我们提供一些信息。话不多说,开始今天的简易爬虫教程。
本次爬虫需要用到的工具:
百度api申请步骤:
1. 打开网页,输入百度地图开放平台,点击进入
2. 注册百度账号,并且滑动到页面最下方,点击申请密钥
3. 点击创建应用,应用类型选择浏览器端,其余信息可以随便填写,白名单填写*(英文半角)
4. 完成以上步骤便申请成功,查看应用,就能看到你所创建应用的ak了
import requests
import json
import pymysql
import time
# Search area, given by two opposite corners in the Baidu coordinate system.
left_bottom = [106.283337, 29.263947]  # lower-left corner of the area
right_top = [106.753618, 29.760811]  # upper-right corner of the area
part_n = 10  # the area is cut into a part_n x part_n grid (10 x 10)
url0 = 'http://api.map.baidu.com/place/v2/search?'
# Size of one grid cell along each of the two coordinate axes.
x_item, y_item = [(hi - lo) / part_n for lo, hi in zip(left_bottom, right_top)]
query = '公交站'  # search keyword
ak = '***********************'  # Baidu Maps API key (masked)
# Request page k of the keyword search restricted to the current slice.
# Each corner is written into `bounds` with element [1] before element [0].
bounds = ','.join(str(v) for v in (left_bottom_part[1], left_bottom_part[0],
                                   right_top_part[1], right_top_part[0]))
url = (url0 + 'query=' + query + '&page_size=20&page_num=' + str(k)
       + '&scope=1&bounds=' + bounds + '&output=json&ak=' + ak)
response = requests.get(url)
hjson = json.loads(response.text)
time.sleep(1)  # wait one second after every API call
if hjson['message'] == 'ok':
    results = hjson['results']
    # Copy the fields of every returned record into the column lists.
    for item in results:
        fileData['name'].append(item['name'])
        fileData['lat'].append(item['location']['lat'])
        fileData['lng'].append(item['location']['lng'])
        fileData['address'].append(item['address'])
        fileData['detail'].append(item['detail'])
        fileData['uid'].append(item['uid'])
def write_to_mysql(contents):
    """Insert the collected bus-station records into MySQL as one batch.

    Connection settings are hard-coded: server ``localhost:3306``, user
    ``root`` with an empty password, database ``busstation``.  Rows go into
    the ``bus_station_info`` table.

    Args:
        contents: dict of column lists keyed by 'name', 'lat', 'lng',
            'address', 'detail' and 'uid'.  Row ``i`` is built from element
            ``i`` of every list; the number of rows is ``len(contents['name'])``.

    Any exception raised while building or inserting the rows is printed and
    swallowed; in that case nothing is committed.  Connection errors from
    ``pymysql.Connect`` propagate to the caller.
    """
    columns = ('name', 'lat', 'lng', 'address', 'detail', 'uid')
    sql = "insert into bus_station_info(name,lat,lng,address,detail,uid) values (%s,%s,%s,%s,%s,%s)"
    con = pymysql.Connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='',
        db='busstation',
        charset='utf8')
    cur = con.cursor()
    try:
        rows = [[contents[col][i] for col in columns]
                for i in range(len(contents['name']))]
        # executemany reports the total number of affected rows, whereas
        # cur.rowcount after a loop of execute() calls only covers the last
        # statement.  An empty batch is skipped and counted as 0.
        inserted = cur.executemany(sql, rows) if rows else 0
    except Exception as e:
        print('加载sql语句出错', e)
    else:
        con.commit()
        print('事务处理成功', inserted)
    finally:
        cur.close()
        con.close()
中间可能有些小bug,导致收集到的数据大量重复,数据爬取不是很完整(可能与api开放的数据量有关),有待改进,但功能完成,能正常运行,以下是完整代码。
# -*- coding: utf-8 -*-
# Python 3.6
# 提取公交站点信息,并存储到文件和MySQL中
import requests
import json
import pymysql
import time
#从第373个切片就已经不允许爬了,调用api当天配额已经用光了
def main(chip_start=373):
    """Crawl bus stops from the Baidu Place API slice by slice and store them.

    The rectangle spanned by ``left_bottom`` and ``right_top`` is cut into a
    ``part_n`` x ``part_n`` grid.  For every slice the keyword search is
    requested page by page, the returned records are collected into column
    lists, and the batch is handed to ``write_to_mysql``.

    Args:
        chip_start: number of leading slices to skip; slices are numbered
            from 1 in crawl order.  The default of 373 resumes a crawl that
            was cut off at that slice when the daily API quota ran out.
            Pass 0 to crawl every slice.

    Network and JSON decoding errors from ``requests`` propagate to the caller.
    """
    left_bottom = [106.283337, 29.263947]  # lower-left corner (Baidu coordinates)
    right_top = [106.753618, 29.760811]  # upper-right corner (Baidu coordinates)
    part_n = 20  # grid resolution: part_n x part_n slices (20 x 20)
    url0 = 'http://api.map.baidu.com/place/v2/search'
    page_size = 20  # records requested per page
    max_pages = 20  # upper bound on pages requested per slice
    x_item = (right_top[0] - left_bottom[0]) / part_n
    y_item = (right_top[1] - left_bottom[1]) / part_n
    query = '公交站'  # search keyword
    ak = '********************'  # Baidu Maps API key (masked)
    n = 0  # slice counter
    for i in range(part_n):
        for j in range(part_n):
            n += 1
            if n <= chip_start:
                continue
            left_bottom_part = [left_bottom[0] + i * x_item,
                                left_bottom[1] + j * y_item]
            # The upper-right corner is one cell away from the slice's own
            # lower-left corner.  Offsetting the global right_top instead made
            # every slice as large as the whole area, so neighbouring slices
            # overlapped and returned the same stations again and again.
            right_top_part = [left_bottom_part[0] + x_item,
                              left_bottom_part[1] + y_item]
            bounds = '{},{},{},{}'.format(
                left_bottom_part[1], left_bottom_part[0],
                right_top_part[1], right_top_part[0])
            fileData = {
                'name': [],
                'lat': [],
                'lng': [],
                'address': [],
                'detail': [],
                'uid': [],
            }
            api_ok = True
            for k in range(max_pages):
                params = {
                    'query': query,
                    'page_size': page_size,
                    'page_num': k,
                    'scope': 1,
                    'bounds': bounds,
                    'output': 'json',
                    'ak': ak,
                }
                # params= lets requests do the URL encoding; the timeout keeps
                # a stalled connection from blocking the crawl forever.
                response = requests.get(url0, params=params, timeout=10)
                hjson = response.json()
                time.sleep(1)  # wait one second after every API call
                if hjson.get('message') != 'ok':
                    # Stop paging this slice instead of spending more calls
                    # on a request the API has already rejected.
                    api_ok = False
                    print('第', str(n), '个切片请求失败:', hjson.get('message'))
                    break
                results = hjson.get('results') or []
                if not results:
                    break  # no more pages for this slice
                for item in results:
                    location = item.get('location')
                    if not location:
                        continue  # a record without coordinates is useless here
                    fileData['name'].append(item.get('name'))
                    fileData['lat'].append(location.get('lat'))
                    fileData['lng'].append(location.get('lng'))
                    fileData['address'].append(item.get('address'))
                    fileData['detail'].append(item.get('detail'))
                    fileData['uid'].append(item.get('uid'))
            # Persist whatever was collected; this call was commented out, so
            # results were discarded while the script still reported success.
            if fileData['name']:
                write_to_mysql(fileData)
            if api_ok:
                print('第', str(n), '个切片入库成功')
def write_to_file(content):
    """Append *content* to ``busStation.txt`` as one JSON document per line.

    Args:
        content: any JSON-serialisable object, e.g. one result record.

    Raises:
        TypeError: if *content* cannot be serialised to JSON (raised before
            the file is opened).
    """
    # ensure_ascii=False keeps non-ASCII text (station names) readable in the
    # UTF-8 file instead of turning it into \uXXXX escapes.
    line = json.dumps(content, ensure_ascii=False)
    # The with-block closes the file; no explicit close() is needed.
    with open('busStation.txt', 'a', encoding='utf-8') as f:
        f.write(line + '\n')
def write_to_mysql(contents):
    """Insert the collected bus-station records into MySQL as one batch.

    Connection settings are hard-coded: server ``localhost:3306``, user
    ``root`` with an empty password, database ``busstation``.  Rows go into
    the ``bus_station_info`` table.

    Args:
        contents: dict of column lists keyed by 'name', 'lat', 'lng',
            'address', 'detail' and 'uid'.  Row ``i`` is built from element
            ``i`` of every list; the number of rows is ``len(contents['name'])``.

    Any exception raised while building or inserting the rows is printed and
    swallowed; in that case nothing is committed.  Connection errors from
    ``pymysql.Connect`` propagate to the caller.
    """
    columns = ('name', 'lat', 'lng', 'address', 'detail', 'uid')
    sql = "insert into bus_station_info(name,lat,lng,address,detail,uid) values (%s,%s,%s,%s,%s,%s)"
    con = pymysql.Connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='',
        db='busstation',
        charset='utf8')
    cur = con.cursor()
    try:
        rows = [[contents[col][i] for col in columns]
                for i in range(len(contents['name']))]
        # executemany reports the total number of affected rows, whereas
        # cur.rowcount after a loop of execute() calls only covers the last
        # statement.  An empty batch is skipped and counted as 0.
        inserted = cur.executemany(sql, rows) if rows else 0
    except Exception as e:
        print('加载sql语句出错', e)
    else:
        con.commit()
        print('事务处理成功', inserted)
    finally:
        cur.close()
        con.close()
# Start the crawl only when the file is run as a script, not when imported.
if __name__ == "__main__":
    main()