通过高德地图抓起所有城市的银行信息

目标:通过高德地图的搜索接口,抓取每个城市的所有银行的分行信息

思路:1. 在本地mysql中存储有全国各城市名称、编码

  1. 将城市编码读取到数组中

  2. 按照数据读取每个编码,组拼URL,通过POST请求访问接口

  3. 获取xml后解析出我们需要的数据,插入到mysql中

第一步,定义访问接口的基本参数

file_name='result.txt'          # write result to this file

url_header='http://restapi.amap.com/v3/place/text?&keyword=&types=160100&'

url_end='&citylimit=true&&output=xml&offset=20&page=1&key=c787ae8e49424a657127c3ed64cfe053&extensions=base'

url_amap='city='

each_page_rec=20          # results that displays in one page

xml_file='tmp.xml'           # xml filen name

第二步,建立本地数据库访问请求,获取数据库中的所有城市编码
首先,在本地mysql中建立一张region表,可以从网上down一份全国各地省市区编码表,结构如下图:

通过高德地图抓起所有城市的银行信息_第1张图片
省市区编码表.png

为了方便大家,我将表的结构及数据导出为sql语句,直接复制到mysql中执行即可,链接如下 http://www.jianshu.com/p/0b9b0e3cda5f

def getallcity():
    cityarr = []
    connection = pymysql.connect(host='127.0.0.1', user='root', passwd='123456', port=3306,
                                 db='icoachu', charset="utf8")
    cursor = connection.cursor()
    sql = "select * from region where parent_id in (select id from region where parent_id=0)"
    try:
        cursor.execute(sql)
        rows = cursor.fetchall()
        for row in rows:
            cityarr.append(row[0])
        return cityarr
    finally:
        cursor.close()
        connection.close()
    return cityarr

关于如何访问本地mysql的,比较简单,此处不做说明,需要强调的是在try 语句中,一定要在finally中关闭cursor及connection。

第三步,通过接口访问获取html数据,并将数据写入到文件中

# get html by url and save the data to xml file
def gethtml(url):
    page = urllib.request.urlopen(url)
    html = page.read()
    # print(html)

    try:
        # open xml file and save data to it
        with open(xml_file, 'wb+') as xml_file_handle:
            xml_file_handle.write(html)
    except IOError as err:
        print
        "IO error: " + str(err)
        return -1
    return 0

第四步,获取xml格式的数据之后,解析相关字段,并插入到mysql中

# phrase data from xml
def parsexml():
    total_rec = 1  # record number

    # open xml file and get data record
    try:
        with open(file_name, 'a') as file_handle:
            dom = minidom.parse(xml_file)
            root = dom.getElementsByTagName("response")  # The function getElementsByTagName returns NodeList.

            for node in root:
                total_rec = node.getElementsByTagName('count')[0].childNodes[0].nodeValue

                pois = node.getElementsByTagName("pois")
                for poi in pois[0].getElementsByTagName('poi'):
                    branch_id = poi.getElementsByTagName("id")[0].childNodes[0].nodeValue
                    branch_name = poi.getElementsByTagName("name")[0].childNodes[0].nodeValue
                    branch_type = poi.getElementsByTagName("type")[0].childNodes[0].nodeValue
                    bank_type = poi.getElementsByTagName("typecode")[0].childNodes[0].nodeValue
                    pname = poi.getElementsByTagName("pname")[0].childNodes[0].nodeValue
                    cityname = poi.getElementsByTagName("cityname")[0].childNodes[0].nodeValue
                    aname = poi.getElementsByTagName("adname")[0].childNodes[0].nodeValue
                    # address = poi.getElementsByTagName("address")[0].childNodes[0].nodeValue
                    # biz_type = poi.getElementsByTagName("biz_type")[0].childNodes[0].nodeValue
                    # tel = poi.getElementsByTagName("tel")[0].childNodes[0].nodeValue
                    # distance = poi.getElementsByTagName("distance")[0].childNodes[0].nodeValue
                    arr = branch_type.split(';')
                    bank_name = arr[-1]
                    sql = "insert into bankinfo(branch_id, branch_name, branch_type, bank_name, bank_type, pname, cityname, aname) values('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" % (
                    branch_id, branch_name.replace('(', '').replace(')', ''), branch_type, bank_name, bank_type, pname, cityname, aname)

                    connection = pymysql.connect(host='127.0.0.1', user='root', passwd='123456', port=3306,
                                                 db='icoachu', charset="utf8")
                    cursor = connection.cursor()
                    try:
                        print(sql)
                        cursor.execute(sql)
                        connection.commit()
                        if cursor.rowcount != 1:
                            raise Exception("数据插入失败%s", sql)
                    finally:
                        connection.close()
                        cursor.close()

    except IOError as err:
        print
        "IO error: " + str(err)

    return total_rec

第五步,在主函数中实现处理相关调用

if __name__ == '__main__':
    cityarr = getallcity()
    for cityId in cityarr:
        url = r'%scity=%s%s' % (url_header, cityId, url_end)
        if gethtml(url) == 0:
            total_record_str = parsexml()
            total_record = int(str(total_record_str))
            if (total_record % each_page_rec) != 0:
                page_number = total_record / each_page_rec + 2
            else:
                page_number = total_record / each_page_rec + 1

            for each_page in frange(2, float(page_number)):
                print
                'parsing page ' + str(each_page) + ' ... ...'
                url = url.replace('page=' + str(each_page - 1), 'page=' + str(each_page))
                print(url)
                gethtml(url)
                parsexml()
        else:
            print
            'error: fail to get xml from amap'

完整的代码如下

# coding:utf-8


# 目标:通过高德地图的搜索接口,抓取每个城市的所有银行的分行信息
# 思路:1. 在本地mysql中存储有全国各城市名称、编码
#      2. 将城市编码读取到数组中
#      3. 按照数据读取每个编码,组拼URL,通过POST请求访问接口
#      4. 获取xml后解析出我们需要的数据,插入到mysql中


import urllib
import xml.dom.minidom as minidom
import string
import urllib.request
import pymysql

file_name = 'result.txt'  # write result to this file
url_header = 'http://restapi.amap.com/v3/place/text?&keyword=&types=160100&'
url_end = '&citylimit=true&&output=xml&offset=20&page=1&key=c787ae8e49424a657127c3ed64cfe053&extensions=base'
url_amap = 'city='
each_page_rec = 20  # results that displays in one page
xml_file = 'tmp.xml'  # xml filen name


# get html by url and save the data to xml file
def gethtml(url):
    page = urllib.request.urlopen(url)
    html = page.read()
    # print(html)

    try:
        # open xml file and save data to it
        with open(xml_file, 'wb+') as xml_file_handle:
            xml_file_handle.write(html)
    except IOError as err:
        print
        "IO error: " + str(err)
        return -1
    return 0


# phrase data from xml
def parsexml():
    total_rec = 1  # record number

    # open xml file and get data record
    try:
        with open(file_name, 'a') as file_handle:
            dom = minidom.parse(xml_file)
            root = dom.getElementsByTagName("response")  # The function getElementsByTagName returns NodeList.

            for node in root:
                total_rec = node.getElementsByTagName('count')[0].childNodes[0].nodeValue

                pois = node.getElementsByTagName("pois")
                for poi in pois[0].getElementsByTagName('poi'):
                    branch_id = poi.getElementsByTagName("id")[0].childNodes[0].nodeValue
                    branch_name = poi.getElementsByTagName("name")[0].childNodes[0].nodeValue
                    branch_type = poi.getElementsByTagName("type")[0].childNodes[0].nodeValue
                    bank_type = poi.getElementsByTagName("typecode")[0].childNodes[0].nodeValue
                    pname = poi.getElementsByTagName("pname")[0].childNodes[0].nodeValue
                    cityname = poi.getElementsByTagName("cityname")[0].childNodes[0].nodeValue
                    aname = poi.getElementsByTagName("adname")[0].childNodes[0].nodeValue
                    # address = poi.getElementsByTagName("address")[0].childNodes[0].nodeValue
                    # biz_type = poi.getElementsByTagName("biz_type")[0].childNodes[0].nodeValue
                    # tel = poi.getElementsByTagName("tel")[0].childNodes[0].nodeValue
                    # distance = poi.getElementsByTagName("distance")[0].childNodes[0].nodeValue
                    arr = branch_type.split(';')
                    bank_name = arr[-1]
                    sql = "insert into bankinfo(branch_id, branch_name, branch_type, bank_name, bank_type, pname, cityname, aname) values('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" % (
                    branch_id, branch_name.replace('(', '').replace(')', ''), branch_type, bank_name, bank_type, pname, cityname, aname)

                    connection = pymysql.connect(host='127.0.0.1', user='root', passwd='123456', port=3306,
                                                 db='icoachu', charset="utf8")
                    cursor = connection.cursor()
                    try:
                        print(sql)
                        cursor.execute(sql)
                        connection.commit()
                        if cursor.rowcount != 1:
                            raise Exception("数据插入失败%s", sql)
                    finally:
                        connection.close()
                        cursor.close()

    except IOError as err:
        print
        "IO error: " + str(err)

    return total_rec


def frange(start, stop, step=1):
    i = start
    while i < stop:
        yield i
        i += step


def getallcity():
    cityarr = []
    connection = pymysql.connect(host='127.0.0.1', user='root', passwd='123456', port=3306,
                                 db='icoachu', charset="utf8")
    cursor = connection.cursor()
    sql = "select * from region where parent_id in (select id from region where parent_id=0)"
    try:
        cursor.execute(sql)
        rows = cursor.fetchall()
        for row in rows:
            cityarr.append(row[0])
        return cityarr
    finally:
        cursor.close()
        connection.close()
    return cityarr


if __name__ == '__main__':
    cityarr = getallcity()
    for cityId in cityarr:
        url = r'%scity=%s%s' % (url_header, cityId, url_end)
        if gethtml(url) == 0:
            total_record_str = parsexml()
            total_record = int(str(total_record_str))
            if (total_record % each_page_rec) != 0:
                page_number = total_record / each_page_rec + 2
            else:
                page_number = total_record / each_page_rec + 1

            for each_page in frange(2, float(page_number)):
                print
                'parsing page ' + str(each_page) + ' ... ...'
                url = url.replace('page=' + str(each_page - 1), 'page=' + str(each_page))
                print(url)
                gethtml(url)
                parsexml()
        else:
            print
            'error: fail to get xml from amap'

数据库中数据如下:

通过高德地图抓起所有城市的银行信息_第2张图片
查询结果.png

你可能感兴趣的:(通过高德地图抓起所有城市的银行信息)