数据分析成长之路

Udacity-P5-OpenStreetMap数据集清洗

2017-11-04  本文已影响69人  闪亮的日子hp

整理 OpenStreetMap 数据

采用的地图

拉斯维加斯是世界著名赌城,对该城市的地图数据比较好奇,所以选择此城市。

地图中存在的问题

邮政编码的格式不统一

from lxml import etree
osm_file = open('sample.osm', 'r')

def is_postcode(elem):
    """Return True when *elem* is a <tag> element whose k attribute is addr:postcode.

    Uses attrib.get() so elements without a 'k' attribute return False
    instead of raising KeyError (the original indexed attrib['k'] directly,
    which crashes on <tag> elements that lack a k attribute).
    """
    return elem.tag == 'tag' and elem.attrib.get('k') == 'addr:postcode'

def audit():
    """Print the raw value of every addr:postcode tag in the sample file.

    NOTE(review): Python 2 only (print statement). Also relies on the
    module-level `osm_file` handle opened above, which is never explicitly
    closed — prefer a `with` block and a filename parameter.
    """
    for event, elem in etree.iterparse(osm_file):
        if is_postcode(elem):
            print elem.attrib['v']

if __name__ == '__main__':
    audit()

运行上述代码段后,发现 Postcode 的值有如下三种格式。

<tag k="addr:postcode" v="89119"/>
<tag k="addr:postcode" v="NV 89149"/>
<tag k="addr:postcode" v="89108-7049"/>

数据清洗

把类似于 'NV 89031' 和 '89109-1907' 格式的邮政编码全部转换为 '89119' 这样的格式。

import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint

OSMFILE = "sample.osm"
postcode_NV_re = re.compile(r'[A-Z]+')
postcode_re = re.compile(r'-')


def audit_postcode_type(pt_types, pt_name):
    """Bucket an anomalous postcode into *pt_types*.

    Checks the state-prefix pattern first, then the dash pattern; the first
    pattern that matches supplies the bucket key (the matched text) and the
    postcode is added to that bucket's set. Well-formed postcodes match
    neither pattern and are ignored.
    """
    for pattern in (postcode_NV_re, postcode_re):
        match = re.search(pattern, pt_name)
        if match:
            pt_types[match.group()].add(pt_name)
            return

def is_postcode(elem):
    """Return True when *elem* carries k="addr:postcode".

    Uses attrib.get() so elements without a 'k' attribute return False
    instead of raising KeyError (the original indexed attrib['k'] directly).
    """
    return elem.attrib.get('k') == "addr:postcode"


def audit(osmfile):
    """Parse *osmfile* and collect anomalous postcodes.

    Returns a defaultdict(set) mapping anomaly key (e.g. 'NV' or '-') to the
    set of raw postcode strings exhibiting that anomaly.
    """
    pt_types = defaultdict(set)
    # `with` guarantees the handle is closed even if iterparse raises;
    # the original open()/close() pair leaked the file on error.
    with open(osmfile, "r") as osm_file:
        for event, elem in ET.iterparse(osm_file, events=("start",)):
            # NOTE(review): at a "start" event an element's children may not
            # be parsed yet; tags are still caught because each <tag> element
            # fires its own start event and iter("tag") includes the element
            # itself — confirm against a full run.
            for tag in elem.iter("tag"):
                if is_postcode(tag):
                    audit_postcode_type(pt_types, tag.attrib['v'])
    return pt_types


def update_name(name):
    """Normalize a postcode string to the bare 5-digit form.

    Handles the three observed formats:
      '89119'      -> '89119'  (already clean)
      'NV 89149'   -> '89149'  (state prefix stripped)
      '89108-7049' -> '89108'  (ZIP+4 suffix dropped)

    Unlike the original if/elif version, a value carrying BOTH anomalies
    (e.g. 'NV 89108-7049') is now fully cleaned to '89108' instead of being
    left with the dash suffix. Patterns are inlined so the function does not
    depend on module-level regex objects.
    """
    # Strip any run of capital letters (state abbreviation such as 'NV').
    name = re.sub(r'[A-Z]+', '', name).strip()
    # Drop the ZIP+4 extension after a dash, if present.
    return name.split('-')[0]


def test():
    """Audit the sample file, then spot-check update_name on known values.

    NOTE(review): Python 2 only — uses dict.iteritems() and the print
    statement; port to .items() / print() for Python 3.
    """
    postcode_types = audit(OSMFILE)
    pprint.pprint(dict(postcode_types))

    for postcode_type, postcode in postcode_types.iteritems():
        for name in postcode:
            better_name = update_name(name)
            print name, "=>", better_name
            if name == "89108-7049":
                assert better_name == "89108"
            if name == "NV 89149":
                assert better_name == "89149"


if __name__ == '__main__':
    test()

修正后的格式如下:
89108-7049 => 89108
NV 89149 => 89149

把修正后的数据写入到 CSV 文件中

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import csv
import codecs
import pprint
import re
import xml.etree.cElementTree as ET

import cerberus

import schema

OSM_PATH = "las-vegas_nevada.osm"

NODES_PATH = "nodes.csv"
NODE_TAGS_PATH = "nodes_tags.csv"
WAYS_PATH = "ways.csv"
WAY_NODES_PATH = "ways_nodes.csv"
WAY_TAGS_PATH = "ways_tags.csv"
RELATION_TAGS_PATH = "relation_tags.csv"
RELATION_PATH = "relations.csv"

LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
postcode_NV_re = re.compile(r'[A-Z]+')
postcode_re = re.compile(r'-')

SCHEMA = schema.schema

# Make sure the fields order in the csvs matches the column order in the sql table schema
NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']
RELATION_TAGS_FIELDS = ['id', 'key', 'value', 'type']
RELATION_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']

def update_name(name):
    """Normalize a postcode string to the bare 5-digit form.

    Handles the three observed formats:
      '89119'      -> '89119'  (already clean)
      'NV 89149'   -> '89149'  (state prefix stripped)
      '89108-7049' -> '89108'  (ZIP+4 suffix dropped)

    Unlike the original if/elif version, a value carrying BOTH anomalies
    (e.g. 'NV 89108-7049') is now fully cleaned to '89108' instead of being
    left with the dash suffix. Patterns are inlined so the function does not
    depend on module-level regex objects.
    """
    # Strip any run of capital letters (state abbreviation such as 'NV').
    name = re.sub(r'[A-Z]+', '', name).strip()
    # Drop the ZIP+4 extension after a dash, if present.
    return name.split('-')[0]

def shape_tag(element):
    """Shape the child <tag> elements of *element* into row dicts.

    Each row carries the parent element's id plus key/value/type derived
    from the tag's k attribute: keys containing problem characters are
    skipped entirely; lowercase colon-separated keys are split into
    type:key; everything else gets type 'regular'. addr:postcode values
    are cleaned through update_name().
    """
    shaped = []
    parent_id = element.attrib['id']
    for tag_elem in element:
        if tag_elem.tag != 'tag':
            continue
        raw_key = tag_elem.attrib['k']
        raw_value = tag_elem.attrib['v']
        # Drop keys with problematic characters outright.
        if re.search(PROBLEMCHARS, raw_key):
            continue
        entry = {'id': parent_id}
        if re.match(LOWER_COLON, raw_key):
            tag_type, tag_key = raw_key.split(':', 1)
            entry['key'] = tag_key
            entry['type'] = tag_type
            # Postcodes get normalized; every other value passes through.
            entry['value'] = update_name(raw_value) if raw_key == 'addr:postcode' else raw_value
        else:
            entry['key'] = raw_key
            entry['value'] = raw_value
            entry['type'] = 'regular'
        shaped.append(entry)
    return shaped

def shape_way_nodes(element):
    """Shape the <nd> children of a way element into row dicts.

    Each row records the way's id, the referenced node id, and the 0-based
    position of the <nd> child within the way.
    """
    nd_children = [child for child in element if child.tag == 'nd']
    return [
        {'id': element.attrib['id'], 'node_id': nd.attrib['ref'], 'position': pos}
        for pos, nd in enumerate(nd_children)
    ]


def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
                  re_attr_fields=RELATION_FIELDS, problem_chars=PROBLEMCHARS, default_tag_type='regular'):
    """Clean and shape a node, way, or relation XML element into row dicts.

    Returns a dict keyed by element kind (plus its tag/nd rows), or None
    implicitly for any other element tag. Secondary <tag> children are
    handled identically for all three kinds via shape_tag().
    """
    kind = element.tag
    if kind == 'node':
        attribs = {field: element.attrib[field] for field in node_attr_fields}
        return {'node': attribs, 'node_tags': shape_tag(element)}
    if kind == 'way':
        attribs = {field: element.attrib[field] for field in way_attr_fields}
        return {'way': attribs,
                'way_nodes': shape_way_nodes(element),
                'way_tags': shape_tag(element)}
    if kind == 'relation':
        attribs = {field: element.attrib[field] for field in re_attr_fields}
        return {'relation': attribs, 'relation_tags': shape_tag(element)}


# ================================================== #
#               Helper Functions                     #
# ================================================== #
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield each completed element whose tag is in *tags*.

    Clears the root after every yield so already-processed elements are
    released, keeping memory flat while streaming a large OSM file.
    """
    context = ET.iterparse(osm_file, events=('start', 'end'))
    _, root = next(context)
    for event, elem in context:
        if event != 'end' or elem.tag not in tags:
            continue
        yield elem
        root.clear()


def validate_element(element, validator, schema=SCHEMA):
    """Raise ValidationError if element does not match schema.

    element   -- shaped dict from shape_element()
    validator -- a cerberus.Validator instance
    schema    -- validation schema (defaults to the module-level SCHEMA)
    """
    if validator.validate(element, schema) is not True:
        # next(iter(d.items())) works on both Python 2 and 3;
        # the original dict.iteritems() was removed in Python 3.
        field, errors = next(iter(validator.errors.items()))
        message_string = "\nElement of type '{0}' has the following errors:\n{1}"
        error_string = pprint.pformat(errors)

        raise Exception(message_string.format(field, error_string))


class UnicodeDictWriter(csv.DictWriter, object):
    """Extend csv.DictWriter to handle Unicode input"""

    def writerow(self, row):
        # Encode every unicode value to UTF-8 bytes before delegating to the
        # byte-oriented Python 2 csv writer.
        # NOTE(review): Python 2 only — `unicode` and dict.iteritems() do not
        # exist on Python 3, where csv writes str natively and this class
        # becomes unnecessary.
        super(UnicodeDictWriter, self).writerow({
            k: (v.encode('utf-8') if isinstance(v, unicode) else v) for k, v in row.iteritems()
        })

    def writerows(self, rows):
        # Delegate row-by-row so every row gets the same encoding treatment.
        for row in rows:
            self.writerow(row)


# ================================================== #
#               Main Function                        #
# ================================================== #
def process_map(file_in, validate):
    """Iteratively process each XML element and write to csv(s).

    file_in  -- path to the OSM XML extract.
    validate -- when True, validate each shaped element against SCHEMA
                before writing (roughly 10x slower).

    NOTE(review): codecs.open(..., 'w') feeding UTF-8-encoding writers is a
    Python 2 pattern; on Python 3 use open(..., 'w', newline='') instead.
    """

    # Open all seven output files for the lifetime of one streaming pass.
    with codecs.open(NODES_PATH, 'w') as nodes_file, \
         codecs.open(NODE_TAGS_PATH, 'w') as nodes_tags_file, \
         codecs.open(WAYS_PATH, 'w') as ways_file, \
         codecs.open(WAY_NODES_PATH, 'w') as way_nodes_file, \
         codecs.open(WAY_TAGS_PATH, 'w') as way_tags_file, \
         codecs.open(RELATION_PATH, 'w') as relation_file, \
         codecs.open(RELATION_TAGS_PATH, 'w') as relation_tags_file:

        # One writer per output, each bound to the matching SQL column order.
        nodes_writer = UnicodeDictWriter(nodes_file, NODE_FIELDS)
        node_tags_writer = UnicodeDictWriter(nodes_tags_file, NODE_TAGS_FIELDS)
        ways_writer = UnicodeDictWriter(ways_file, WAY_FIELDS)
        way_nodes_writer = UnicodeDictWriter(way_nodes_file, WAY_NODES_FIELDS)
        way_tags_writer = UnicodeDictWriter(way_tags_file, WAY_TAGS_FIELDS)
        relation_tags_writer = UnicodeDictWriter(relation_tags_file, RELATION_TAGS_FIELDS)
        relation_writer = UnicodeDictWriter(relation_file, RELATION_FIELDS)
        
        nodes_writer.writeheader()
        node_tags_writer.writeheader()
        ways_writer.writeheader()
        way_nodes_writer.writeheader()
        way_tags_writer.writeheader()
        relation_writer.writeheader()
        relation_tags_writer.writeheader()

        validator = cerberus.Validator()

        # Stream elements one at a time; shape_element returns None for
        # anything that is not a node/way/relation.
        for element in get_element(file_in, tags=('node', 'way', 'relation')):
            el = shape_element(element)
            if el:
                if validate is True:
                    validate_element(el, validator)

                # Route the shaped rows to the files for this element kind.
                if element.tag == 'node':
                    nodes_writer.writerow(el['node'])
                    node_tags_writer.writerows(el['node_tags'])
                elif element.tag == 'way':
                    ways_writer.writerow(el['way'])
                    way_nodes_writer.writerows(el['way_nodes'])
                    way_tags_writer.writerows(el['way_tags'])
                elif element.tag == 'relation':
                    relation_writer.writerow(el['relation'])
                    relation_tags_writer.writerows(el['relation_tags'])


if __name__ == '__main__':
    # Entry point: convert the OSM extract into the CSV files defined above.
    # Note: Validation is ~ 10X slower. For the project consider using a small
    # sample of the map when validating.
    process_map(OSM_PATH, validate=False)

把 CSV 文件导入到数据库中

查询数据库中的邮编格式是否正确

SELECT key, value FROM nodes_tags 
WHERE key == 'postcode' AND value == '89108'
UNION 
SELECT key, value FROM relation_tags 
WHERE key == 'postcode' AND value == '89149'
postcode  89108
postcode  89149

数据概述

文件大小

import os
def get_size(file):
    """Print *file*'s size in MB, name left-padded with dots for alignment."""
    size = os.path.getsize(file) / (1024.0 * 1024)
    convert2MB = format(size, '0.2f') + " MB"
    # print() with two arguments emits the same space-separated line that
    # the original Python 2 print statement produced, and keeps this
    # snippet runnable on Python 3.
    print(file.ljust(30, '.'), convert2MB)
get_size('las-vegas_nevada.osm')
get_size('sample.osm')
get_size('data_wrangling_schema.db')
get_size('nodes.csv')
get_size('nodes_tags.csv')
get_size('ways.csv')
get_size('ways_nodes.csv')
get_size('ways_tags.csv')
get_size('relation_tags.csv')
get_size('relations.csv')
las-vegas_nevada.osm.......... 219.04 MB
data_wrangling_schema.db...... 496.64 MB
nodes.csv..................... 84.73 MB
nodes_tags.csv................ 2.28 MB
ways.csv...................... 6.54 MB
ways_nodes.csv................ 29.08 MB
ways_tags.csv................. 14.19 MB
relation_tags.csv............. 0.10 MB
relations.csv................. 0.03 MB

唯一用户的数量

SELECT COUNT(DISTINCT(e.uid)) FROM 
(SELECT uid FROM nodes UNION 
SELECT uid FROM ways UNION 
SELECT uid FROM relations) e;
1117

nodes 数量

SELECT COUNT(*) FROM nodes;
1063059

ways 数量

SELECT COUNT(*) FROM ways;
115082

警察局的数量

SELECT COUNT(*) FROM nodes_tags 
WHERE value == 'police';
16

中国餐馆的数量

SELECT COUNT(*) FROM nodes_tags 
WHERE key == 'cuisine' and value == 'chinese';
21

店面最多的 10 种咖啡店

SELECT value , COUNT(*)
FROM nodes_tags 
JOIN (SELECT DISTINCT id FROM nodes_tags WHERE value="cafe") nodes_ids
ON nodes_tags.id=nodes_ids.id
WHERE key="name" 
GROUP BY value 
ORDER BY COUNT(*) DESC  
LIMIT 10;
Starbucks,37
"Dunkin' Donuts",3
"Coffee Bean & Tea Leaf",2
"Starbucks Coffee",2
"Baga Hookah",1
"Brooklyn Bagels",1
"Cafe Bellagio",1
"Cafe Belle Madeleine",1
"Cafe Pan",1
"Café Berlin",1

星巴克的分店最多,同时发现一个问题,星巴克的名称出现了 'Starbucks' 和 'Starbucks Coffee'两种写法。

改进数据建议

在分析过程中发现,更多的数据其本身并没有错误,更多的是数据的格式不统一,比如本次分析中发现的邮编格式,星巴克的店名,深入分析后发现,电话号码的格式也不统一。

益处:

会提高用户在使用地图时的用户体验,提高用户使用率。

预期的问题

因为该建议会增加数据提交者在提交前修改数据的次数,可能会降低提交者的积极性,使得数据贡献者的人数减少。

结论

通过本次项目,让我熟悉了数据清洗的基本流程,了解了 SQL 的基本使用,同时在清洗邮政编码格式时,
更加熟悉了 Python 的基础知识。但是由于本次采用的是国外的地图,由于文化的差异,对地图中的一些信息了解的不是很清楚,
对项目的进行有一定的影响。

上一篇下一篇

猜你喜欢

热点阅读