1.简介

case study: openstreetmap

available for download
xml
human edited
relatable

2.迭代解析

找到数据的所有顶层标签
遍历数据集，创建一个字典

解析方式:

树形解析：
将数据读入内存，将它当成树结构上的节点来处理
sax解析器/迭代解析
一次解析一个标签
每次看到一个标签时，都把它当成事件来处理

Your task is to use the iterative parsing to process the map file and
find out not only what tags are there, but also how many.
Fill out the count_tags function. It should return a dictionary with the
tag name as the key and number of times this tag can be encountered in
the map as value.

import xml.etree.cElementTree as ET
import pprint

def count_tags(filename):
    """Count how many times each tag name occurs in an XML file.

    The file is parsed iteratively; every element is tallied once, when its
    opening tag is reported by the parser.

    Args:
        filename: Source handed straight to ``ET.iterparse``.

    Returns:
        dict mapping each tag name to its number of occurrences.
    """
    counts = {}
    for _, node in ET.iterparse(filename, events=('start',)):
        counts[node.tag] = counts.get(node.tag, 0) + 1
    return counts

2.循环访问道路标签

def audit():
    """Collect and pretty-print the unexpected street types found on ways.

    Reads the module-level ``filename`` and ``street_types`` and relies on
    ``is_street_name`` / ``audit_street_type`` for the actual checks.
    NOTE(review): ``filename`` is not defined in the visible part of this
    file -- confirm it is set before this function is called.
    """
    # Listen for 'end' events: a 'start' event only guarantees the element's
    # own attributes, so its <tag> children may not have been parsed yet.
    for _, elem in ET.iterparse(filename, events=('end',)):
        if elem.tag == 'way':
            for tag in elem.iter('tag'):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
    pprint.pprint(dict(street_types))

3.标签类型

import xml.etree.cElementTree as ET
import pprint
import re
"""
Your task is to check the "k" value for each "<tag>" and see if there are any potential problems.

We have provided you with 3 regular expressions to check for certain patterns
in the tags. 

Please complete the function 'key_type', such that we have a count of each of
four tag categories in a dictionary:
  "lower", for tags that contain only lowercase letters and are valid,
  "lower_colon", for otherwise valid tags with a colon in their names,
  "problemchars", for tags with problematic characters, and
  "other", for other tags that do not fall into the other three categories.
"""

# Keys made up solely of lowercase letters and underscores.
lower = re.compile(r'^([a-z]|_)*$')
# Same alphabet, with exactly one colon separating two parts.
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# Any single character from the set treated as problematic in a key.
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


def key_type(element, keys):
    """Classify the ``k`` attribute of a ``tag`` element and bump its counter.

    The key is tested against ``lower``, ``lower_colon`` and ``problemchars``
    in that order; the first pattern that matches decides the category, and
    a key matching none of them is counted as ``"other"``.

    Args:
        element: XML element; anything whose tag is not ``"tag"`` is ignored.
        keys: dict of counters with the keys ``"lower"``, ``"lower_colon"``,
            ``"problemchars"`` and ``"other"``. Updated in place.

    Returns:
        The same ``keys`` dict.
    """
    if element.tag != "tag":
        return keys

    key = element.attrib['k']
    ordered_patterns = (
        ("lower", lower),
        ("lower_colon", lower_colon),
        ("problemchars", problemchars),
    )
    for category, pattern in ordered_patterns:
        if pattern.search(key):
            break
    else:
        category = "other"

    keys[category] += 1
    return keys



def process_map(filename):
    """Tally the key categories of every element in an XML file.

    Each parsed element is passed to ``key_type`` together with the running
    counters.

    Args:
        filename: Source handed to ``ET.iterparse``.

    Returns:
        dict with the counters ``"lower"``, ``"lower_colon"``,
        ``"problemchars"`` and ``"other"``.
    """
    counters = dict.fromkeys(("lower", "lower_colon", "problemchars", "other"), 0)
    for _, node in ET.iterparse(filename):
        counters = key_type(node, counters)
    return counters

正则表达式匹配:match = re.search(pattern, string)

4.探索用户

import xml.etree.cElementTree as ET
import pprint
import re
"""
The task is to find out how many unique users
have contributed to the map in this particular area!
The function process_map should return a set of unique user IDs ("uid")
"""

def get_user(element):
    """Return the element's ``uid`` attribute, or ``None`` if it has none."""
    return element.attrib.get('uid')

def process_map(filename):
    """Collect the distinct user ids found in an XML file.

    Args:
        filename: Source handed to ``ET.iterparse``.

    Returns:
        set of the truthy values returned by ``get_user`` over all elements.
    """
    user_ids = (get_user(element) for _, element in ET.iterparse(filename))
    return {uid for uid in user_ids if uid}

def test():
    """Self-check: 'example.osm' must contain exactly six unique users."""
    unique_users = process_map('example.osm')
    pprint.pprint(unique_users)
    assert len(unique_users) == 6

# Run the self-check only when this file is executed as a script.
if __name__ == "__main__":
    test()

5.审查街道名

import xml.etree.cElementTree as ET
import pprint
import re
from collections import defaultdict

osm_file = 'example.xml'
# Extracts (matches) the last word of a street name.
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
street_types = defaultdict(set)

expected = ['Street', 'Avenue', 'Boulevard', 'Drive', 'Court', 'Place']


def audit_street_type(street_types, street_name):
    """Record ``street_name`` under its street type if that type is unexpected.

    The street type is whatever ``street_type_re`` matches in the name. Names
    without a match, or whose type is listed in ``expected``, are ignored.

    Args:
        street_types: mapping of street type -> set of street names; updated
            in place (indexed without a prior membership check).
        street_name: the street name to audit.
    """
    found = street_type_re.search(street_name)
    if found is None:
        return
    kind = found.group()
    if kind in expected:
        return
    street_types[kind].add(street_name)

def is_street_name(elem):
    """Tell whether the element's ``k`` attribute is ``addr:street``."""
    key = elem.attrib['k']
    return key == 'addr:street'

def audit():
    """Audit the street names on every way of ``osm_file`` and print the result.

    Street-name tags of each ``way`` element are passed to
    ``audit_street_type``, which fills the module-level ``street_types``;
    that mapping is pretty-printed once parsing is complete.
    """
    for _, elem in ET.iterparse(osm_file):
        if elem.tag != 'way':
            continue
        street_tags = (tag for tag in elem.iter('tag') if is_street_name(tag))
        for street_tag in street_tags:
            audit_street_type(street_types, street_tag.attrib['v'])
    pprint.pprint(dict(street_types))

6.完善街道名

"""
Your task in this exercise has two steps:

- audit the OSMFILE and change the variable 'mapping' to reflect the changes needed to fix 
    the unexpected street types to the appropriate ones in the expected list.
    You have to add mappings only for the actual problems you find in this OSMFILE,
    not a generalized solution, since that may and will depend on the particular area you are auditing.
- write the update_name function, to actually fix the street name.
    The function takes a string with street name as an argument and should return the fixed name
    We have provided a simple test so that you see what exactly is expected
"""
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint

OSMFILE = "example.osm"
# Matches the last word of a street name.
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)


# Street types that need no correction.
expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road",
            "Trail", "Parkway", "Commons"]

# Abbreviated street type -> full street type.
mapping = {"St.": "Street",
           "Ave": "Avenue",
           "Rd.": "Road"
           }


def audit_street_type(street_types, street_name):
    """File ``street_name`` under its street type when the type is unexpected.

    Args:
        street_types: mapping of street type -> set of street names; updated
            in place (indexed without a prior membership check).
        street_name: the street name to inspect. Names in which
            ``street_type_re`` finds nothing, or whose type appears in
            ``expected``, leave ``street_types`` untouched.
    """
    hit = street_type_re.search(street_name)
    street_type = hit.group() if hit else None
    if street_type is not None and street_type not in expected:
        street_types[street_type].add(street_name)


def is_street_name(elem):
    """Return True if the element's ``k`` attribute equals ``addr:street``."""
    return "addr:street" == elem.attrib['k']


def audit(osmfile):
    """Group the unexpected street names of an OSM file by street type.

    Args:
        osmfile: Path of the OSM XML file to read.

    Returns:
        defaultdict(set) filled by ``audit_street_type`` from the
        street-name tags of every ``node`` and ``way`` element.
    """
    street_types = defaultdict(set)
    # ``with`` closes the file even if parsing raises; binary mode hands the
    # raw bytes to the XML parser instead of decoding them first.
    with open(osmfile, "rb") as osm_file:
        # 'end' events: at 'start' an element's children may not be parsed
        # yet, so iterating its <tag> children there can miss them.
        for _, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag in ("node", "way"):
                for tag in elem.iter("tag"):
                    if is_street_name(tag):
                        audit_street_type(street_types, tag.attrib['v'])
    return street_types


def update_name(name, mapping):
    """Expand an abbreviated street type at the end of a street name.

    Args:
        name: The street name to fix.
        mapping: dict of abbreviated suffix -> replacement text.

    Returns:
        ``name`` with its trailing abbreviation replaced by the mapped text
        (first matching entry in ``mapping`` wins), or ``name`` unchanged
        when no entry matches.
    """
    for abbreviation, full_form in mapping.items():
        # Skip empty keys: every string "ends with" the empty string.
        if abbreviation and name.endswith(abbreviation):
            # Swap only the suffix; str.replace would also rewrite earlier
            # occurrences of the abbreviation inside the name.
            return name[:len(name) - len(abbreviation)] + full_form
    return name

7.为数据集做准备

import csv
import codecs
import pprint
import re
import xml.etree.cElementTree as ET

import cerberus

import schema

# Input OSM XML file.
OSM_PATH = "example.osm"

# Output csv files, one per table.
NODES_PATH = "nodes.csv"
NODE_TAGS_PATH = "nodes_tags.csv"
WAYS_PATH = "ways.csv"
WAY_NODES_PATH = "ways_nodes.csv"
WAY_TAGS_PATH = "ways_tags.csv"

# Key beginning with lowercase/underscore characters, a colon, then more
# lowercase/underscore characters (no end anchor: only the prefix is checked).
LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
# Any single character from the set treated as problematic in a key.
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

# Validation schema taken from the imported ``schema`` module.
SCHEMA = schema.schema

# Make sure the fields order in the csvs matches the column order in the sql table schema
NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']


def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
                  problem_chars=PROBLEMCHARS, default_tag_type='regular'):
    """Clean and shape node or way XML element to Python dict.

    Args:
        element: XML element to convert.
        node_attr_fields: attribute names copied from a ``node`` element.
        way_attr_fields: attribute names copied from a ``way`` element.
        problem_chars: pattern (compiled or string); a secondary tag whose
            key contains a match anywhere is dropped.
        default_tag_type: ``type`` given to tags whose key has no
            ``LOWER_COLON`` prefix.

    Returns:
        ``{'node': ..., 'node_tags': [...]}`` for a node,
        ``{'way': ..., 'way_nodes': [...], 'way_tags': [...]}`` for a way,
        and ``None`` for any other element.

    Raises:
        KeyError: if the element lacks one of the requested attributes, or a
            child lacks its ``k`` / ``v`` / ``ref`` attribute.
    """
    if element.tag == 'node':
        node_attribs = {field: element.attrib[field] for field in node_attr_fields}
        tags = _shape_tags(element, problem_chars, default_tag_type)
        return {'node': node_attribs, 'node_tags': tags}

    if element.tag == 'way':
        way_attribs = {field: element.attrib[field] for field in way_attr_fields}
        way_id = element.attrib['id']
        # ``position`` is the 0-based order of the <nd> reference in the way.
        way_nodes = [
            {'id': way_id, 'node_id': nd.attrib['ref'], 'position': position}
            for position, nd in enumerate(element.iter('nd'))
        ]
        tags = _shape_tags(element, problem_chars, default_tag_type)
        return {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': tags}

    return None


def _shape_tags(element, problem_chars, default_tag_type):
    """Build the secondary-tag dicts shared by node and way elements."""
    element_id = element.attrib['id']
    shaped = []
    for tag in element.iter('tag'):
        key = tag.attrib['k']
        # search, not match: a problematic character can sit anywhere in the
        # key, while match would only ever look at its first character.
        if re.search(problem_chars, key):
            continue
        if LOWER_COLON.match(key):
            # Text before the first colon becomes the type, the rest the key.
            tag_type, key = key.split(':', 1)
        else:
            tag_type = default_tag_type
        shaped.append({
            'id': element_id,
            'key': key,
            'value': tag.attrib['v'],
            'type': tag_type,
        })
    return shaped


# ================================================== #
#               Helper Functions                     #
# ================================================== #
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield each completely parsed element whose tag is listed in ``tags``.

    The first parse event supplies the root element; after the consumer is
    done with a yielded element the root is cleared.
    """
    parse_events = ET.iterparse(osm_file, events=('start', 'end'))
    _, root = next(parse_events)
    for kind, node in parse_events:
        if kind != 'end' or node.tag not in tags:
            continue
        yield node
        root.clear()


def validate_element(element, validator, schema=SCHEMA):
    """Raise Exception if element does not match schema.

    Args:
        element: the shaped element to check.
        validator: object exposing ``validate(element, schema)`` and an
            ``errors`` mapping.
        schema: schema passed to ``validator.validate``.

    Raises:
        Exception: when validation does not return ``True``; the message
            holds the first entry of ``validator.errors``.
    """
    if validator.validate(element, schema) is not True:
        # items() exists on both Python 2 and 3; iteritems() is Python 2 only.
        field, errors = next(iter(validator.errors.items()))
        message_string = "\nElement of type '{0}' has the following errors:\n{1}"
        error_string = pprint.pformat(errors)

        raise Exception(message_string.format(field, error_string))


class UnicodeDictWriter(csv.DictWriter, object):
    """Extend csv.DictWriter to handle Unicode input.

    On Python 2 unicode values are encoded to UTF-8 byte strings before being
    written; on Python 3 rows are passed through unchanged.
    """

    def writerow(self, row):
        """Write one row dict, encoding unicode values first on Python 2."""
        if str is bytes:  # true on Python 2 only
            text_type = type(u'')
            row = {
                k: (v.encode('utf-8') if isinstance(v, text_type) else v)
                for k, v in row.items()
            }
        super(UnicodeDictWriter, self).writerow(row)

    def writerows(self, rows):
        """Write every row through ``writerow`` so each one gets encoded."""
        for row in rows:
            self.writerow(row)


# ================================================== #
#               Main Function                        #
# ================================================== #
def process_map(file_in, validate):
    """Iteratively process each XML element and write to csv(s).

    Args:
        file_in: OSM XML source handed to ``get_element``.
        validate: shaped elements are passed to ``validate_element`` only
            when this is exactly ``True``.
    """
    with codecs.open(NODES_PATH, 'w') as nodes_file, \
         codecs.open(NODE_TAGS_PATH, 'w') as nodes_tags_file, \
         codecs.open(WAYS_PATH, 'w') as ways_file, \
         codecs.open(WAY_NODES_PATH, 'w') as way_nodes_file, \
         codecs.open(WAY_TAGS_PATH, 'w') as way_tags_file:

        nodes_writer = UnicodeDictWriter(nodes_file, NODE_FIELDS)
        node_tags_writer = UnicodeDictWriter(nodes_tags_file, NODE_TAGS_FIELDS)
        ways_writer = UnicodeDictWriter(ways_file, WAY_FIELDS)
        way_nodes_writer = UnicodeDictWriter(way_nodes_file, WAY_NODES_FIELDS)
        way_tags_writer = UnicodeDictWriter(way_tags_file, WAY_TAGS_FIELDS)

        # Every csv starts with its header row.
        all_writers = (nodes_writer, node_tags_writer, ways_writer,
                       way_nodes_writer, way_tags_writer)
        for writer in all_writers:
            writer.writeheader()

        validator = cerberus.Validator()

        for element in get_element(file_in, tags=('node', 'way')):
            shaped = shape_element(element)
            if not shaped:
                continue

            if validate is True:
                validate_element(shaped, validator)

            if element.tag == 'node':
                nodes_writer.writerow(shaped['node'])
                node_tags_writer.writerows(shaped['node_tags'])
            elif element.tag == 'way':
                ways_writer.writerow(shaped['way'])
                way_nodes_writer.writerows(shaped['way_nodes'])
                way_tags_writer.writerows(shaped['way_tags'])


# Script entry point: convert OSM_PATH to the csv files with validation on.
if __name__ == '__main__':
    # Note: Validation is ~ 10X slower. For the project consider using a small
    # sample of the map when validating.
    process_map(OSM_PATH, validate=True)

案例研究：OpenStreetMap数据(SQL)