Udacity-P5-OpenStreetMap数据集清洗
2017-11-04 本文已影响69人
闪亮的日子hp
整理 OpenStreetMap 数据
采用的地图
- https://www.openstreetmap.org/relation/165473
- https://mapzen.com/data/metro-extracts/metro/las-vegas_nevada/
拉斯维加斯是世界著名赌城,对该城市的地图数据比较好奇,所以选择此城市。
地图中存在的问题
邮政编码的格式不统一
from lxml import etree
osm_file = open('sample.osm', 'r')
def is_postcode(elem):
'''判断 elem 的标签是否为 tag 以及 k 属性 是否包含 addr:postcode
若满足上述条件,函数返回 True'''
return (elem.tag == 'tag') and (elem.attrib['k'] == 'addr:postcode')
def audit():
for event, elem in etree.iterparse(osm_file):
if is_postcode(elem):
print elem.attrib['v']
if __name__ == '__main__':
audit()
运行上述代码断后,发现 Postcode 的值 有如下三种格式。
<tag k="addr:postcode" v="89119"/>
<tag k="addr:postcode" v="NV 89149"/>
<tag k="addr:postcode" v="89108-7049"/>
数据清洗
把类似于 'NV 89031' 和 '89109-1907' 格式的邮政编码全部转换为 '89119' 这样的格式。
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint
OSMFILE = "sample.osm"
postcode_NV_re = re.compile(r'[A-Z]+')
postcode_re = re.compile(r'-')
def audit_postcode_type(pt_types, pt_name):
m = re.search(postcode_NV_re, pt_name)
n = re.search(postcode_re, pt_name)
if m:
pt_type = m.group()
pt_types[pt_type].add(pt_name)
elif n:
pt_type = n.group()
pt_types[pt_type].add(pt_name)
def is_postcode(elem):
return (elem.attrib['k'] == "addr:postcode")
def audit(osmfile):
osm_file = open(osmfile, "r")
pt_types = defaultdict(set)
for event, elem in ET.iterparse(osm_file, events=("start",)):
for tag in elem.iter("tag"):
if is_postcode(tag):
audit_postcode_type(pt_types, tag.attrib['v'])
osm_file.close()
return pt_types
def update_name(name):
m = re.search(postcode_NV_re, name)
n = re.search(postcode_re, name)
if m:
name = (re.sub(postcode_NV_re, '', name)).strip()
elif n:
name = name.split('-')[0]
return name
def test():
postcode_types = audit(OSMFILE)
pprint.pprint(dict(postcode_types))
for postcode_type, postcode in postcode_types.iteritems():
for name in postcode:
better_name = update_name(name)
print name, "=>", better_name
if name == "89108-7049":
assert better_name == "89108"
if name == "NV 89149":
assert better_name == "89149"
if __name__ == '__main__':
test()
修正后的格式如下:
89108-7049 => 89108
NV 89149 => 89149
把修正后的数据写入到 CSV 文件中
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import csv
import codecs
import pprint
import re
import xml.etree.cElementTree as ET
import cerberus
import schema
OSM_PATH = "las-vegas_nevada.osm"
NODES_PATH = "nodes.csv"
NODE_TAGS_PATH = "nodes_tags.csv"
WAYS_PATH = "ways.csv"
WAY_NODES_PATH = "ways_nodes.csv"
WAY_TAGS_PATH = "ways_tags.csv"
RELATION_TAGS_PATH = "relation_tags.csv"
RELATION_PATH = "relations.csv"
LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
postcode_NV_re = re.compile(r'[A-Z]+')
postcode_re = re.compile(r'-')
SCHEMA = schema.schema
# Make sure the fields order in the csvs matches the column order in the sql table schema
NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']
RELATION_TAGS_FIELDS = ['id', 'key', 'value', 'type']
RELATION_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
def update_name(name):
m = re.search(postcode_NV_re, name)
n = re.search(postcode_re, name)
if m:
name = (re.sub(postcode_NV_re, '', name)).strip()
elif n:
name = name.split('-')[0]
return name
def shape_tag(element):
tags = []
for child in element:
if child.tag == 'tag':
tags_attrib = {}
tags_attrib['id'] = element.attrib['id']
problem = re.search(PROBLEMCHARS, child.attrib['k'])
colon = re.match(LOWER_COLON, child.attrib['k'])
if problem:
continue
else:
if colon:
k_value = child.attrib['k'].split(':', 1)
tags_attrib['key'] = k_value[1]
if child.attrib['k'] == 'addr:postcode':
tags_attrib['value'] = update_name(child.attrib['v'])
else:
tags_attrib['value'] = child.attrib['v']
tags_attrib['type'] = k_value[0]
tags.append(tags_attrib)
else:
tags_attrib['key'] = child.attrib['k']
tags_attrib['value'] = child.attrib['v']
tags_attrib['type'] = 'regular'
tags.append(tags_attrib)
return tags
def shape_way_nodes(element):
way_nodes = []
counter = 0
for ch in element:
if ch.tag == 'nd':
way_nodes_attrib = {}
way_nodes_attrib['id'] = element.attrib['id']
way_nodes_attrib['node_id'] = ch.attrib['ref']
way_nodes_attrib['position'] = counter
counter += 1
way_nodes.append(way_nodes_attrib)
return way_nodes
def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
re_attr_fields=RELATION_FIELDS,problem_chars=PROBLEMCHARS, default_tag_type='regular'):
"""Clean and shape node or way XML element to Python dict"""
node_attribs = {}
way_attribs = {}
relation_attribs = {}
# Handle secondary tags the same way for both node and way elements
if element.tag == 'node':
node_attribs = {i: element.attrib[i] for i in node_attr_fields}
tags = shape_tag(element)
return {'node': node_attribs, 'node_tags': tags}
if element.tag == 'way':
way_attribs = {i_way: element.attrib[i_way] for i_way in way_attr_fields}
way_nodes = shape_way_nodes(element)
tags = shape_tag(element)
return {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': tags}
if element.tag == 'relation':
relation_attribs = {i_re: element.attrib[i_re] for i_re in re_attr_fields}
tags = shape_tag(element)
return {'relation': relation_attribs, 'relation_tags': tags}
# ================================================== #
# Helper Functions #
# ================================================== #
def get_element(osm_file, tags=('node', 'way', 'relation')):
"""Yield element if it is the right type of tag"""
context = ET.iterparse(osm_file, events=('start', 'end'))
_, root = next(context)
for event, elem in context:
if event == 'end' and elem.tag in tags:
yield elem
root.clear()
def validate_element(element, validator, schema=SCHEMA):
"""Raise ValidationError if element does not match schema"""
if validator.validate(element, schema) is not True:
field, errors = next(validator.errors.iteritems())
message_string = "\nElement of type '{0}' has the following errors:\n{1}"
error_string = pprint.pformat(errors)
raise Exception(message_string.format(field, error_string))
class UnicodeDictWriter(csv.DictWriter, object):
"""Extend csv.DictWriter to handle Unicode input"""
def writerow(self, row):
super(UnicodeDictWriter, self).writerow({
k: (v.encode('utf-8') if isinstance(v, unicode) else v) for k, v in row.iteritems()
})
def writerows(self, rows):
for row in rows:
self.writerow(row)
# ================================================== #
# Main Function #
# ================================================== #
def process_map(file_in, validate):
"""Iteratively process each XML element and write to csv(s)"""
with codecs.open(NODES_PATH, 'w') as nodes_file, \
codecs.open(NODE_TAGS_PATH, 'w') as nodes_tags_file, \
codecs.open(WAYS_PATH, 'w') as ways_file, \
codecs.open(WAY_NODES_PATH, 'w') as way_nodes_file, \
codecs.open(WAY_TAGS_PATH, 'w') as way_tags_file, \
codecs.open(RELATION_PATH, 'w') as relation_file, \
codecs.open(RELATION_TAGS_PATH, 'w') as relation_tags_file:
nodes_writer = UnicodeDictWriter(nodes_file, NODE_FIELDS)
node_tags_writer = UnicodeDictWriter(nodes_tags_file, NODE_TAGS_FIELDS)
ways_writer = UnicodeDictWriter(ways_file, WAY_FIELDS)
way_nodes_writer = UnicodeDictWriter(way_nodes_file, WAY_NODES_FIELDS)
way_tags_writer = UnicodeDictWriter(way_tags_file, WAY_TAGS_FIELDS)
relation_tags_writer = UnicodeDictWriter(relation_tags_file, RELATION_TAGS_FIELDS)
relation_writer = UnicodeDictWriter(relation_file, RELATION_FIELDS)
nodes_writer.writeheader()
node_tags_writer.writeheader()
ways_writer.writeheader()
way_nodes_writer.writeheader()
way_tags_writer.writeheader()
relation_writer.writeheader()
relation_tags_writer.writeheader()
validator = cerberus.Validator()
for element in get_element(file_in, tags=('node', 'way', 'relation')):
el = shape_element(element)
if el:
if validate is True:
validate_element(el, validator)
if element.tag == 'node':
nodes_writer.writerow(el['node'])
node_tags_writer.writerows(el['node_tags'])
elif element.tag == 'way':
ways_writer.writerow(el['way'])
way_nodes_writer.writerows(el['way_nodes'])
way_tags_writer.writerows(el['way_tags'])
elif element.tag == 'relation':
relation_writer.writerow(el['relation'])
relation_tags_writer.writerows(el['relation_tags'])
if __name__ == '__main__':
# Note: Validation is ~ 10X slower. For the project consider using a small
# sample of the map when validating.
process_map(OSM_PATH, validate=False)
把 CSV 文件导入到数据库中
查询数据库中的邮编格式是否正确
SELECT key, value FROM nodes_tags
WHERE key == 'postcode' AND value == '89108'
UNION
SELECT key, value FROM relation_tags
WHERE key == 'postcode' AND value == '89149'
postcode 89108
postcode 89149
数据概述
文件大小
import os
def get_size(file):
size = os.path.getsize(file)/ (1024.0 * 1024)
convert2MB = format(size, '0.2f') + " MB"
print file.ljust(30,'.'),convert2MB
get_size('las-vegas_nevada.osm')
get_size('sample.osm')
get_size('data_wrangling_schema.db')
get_size('nodes.csv')
get_size('nodes_tags.csv')
get_size('ways.csv')
get_size('ways_nodes.csv')
get_size('ways_tags.csv')
get_size('relation_tags.csv')
get_size('relations.csv')
las-vegas_nevada.osm.......... 219.04 MB
data_wrangling_schema.db...... 496.64 MB
nodes.csv..................... 84.73 MB
nodes_tags.csv................ 2.28 MB
ways.csv...................... 6.54 MB
ways_nodes.csv................ 29.08 MB
ways_tags.csv................. 14.19 MB
relation_tags.csv............. 0.10 MB
relations.csv................. 0.03 MB
唯一用户的数量
SELECT COUNT(DISTINCT(e.uid)) FROM
(SELECT uid FROM nodes UNION
SELECT uid FROM ways UNION
SELECT uid FROM relations) e;
1117
nodes 数量
SELECT COUNT(*) FROM nodes;
1063059
ways 数量
SELECT COUNT(*) FROM ways;
115082
警察局的数量
SELECT COUNT(*) FROM nodes_tags
WHERE value == 'police';
16
中国餐馆的数量
SELECT COUNT(*) FROM nodes_tags
WHERE key == 'cuisine' and value == 'chinese';
21
店面最多的10中咖啡店
SELECT value , COUNT(*)
FROM nodes_tags
JOIN (SELECT DISTINCT id FROM nodes_tags WHERE value="cafe") nodes_ids
ON nodes_tags.id=nodes_ids.id
WHERE key="name"
GROUP BY value
ORDER BY COUNT(*) DESC
LIMIT 10;
Starbucks,37
"Dunkin' Donuts",3
"Coffee Bean & Tea Leaf",2
"Starbucks Coffee",2
"Baga Hookah",1
"Brooklyn Bagels",1
"Cafe Bellagio",1
"Cafe Belle Madeleine",1
"Cafe Pan",1
"Café Berlin",1
星巴克的分店最多,同时发现一个问题,星巴克的名称出现了 'Starbucks' 和 'Starbucks Coffee'两种写法。
改进数据建议
在分析过程中发现,更多的数据其本身并没有错误,更多的是数据的格式不统一,比如本次分析中发现的邮编格式,星巴克的店名,深入分析后发现,电话号码的格式也不统一。
益处:
会提高用户在使用地图时的用户体验,提高用户使用率。
预期的问题
因为该建议会增加提交数据者在提交前修改数据的次数。可能会较低提交者的积极性,使得数据贡献者的人数减少。
结论
通过本次项目,让我熟悉了数据清洗的基本流程,了解了 SQL 的基本使用,同时在清洗邮政编码格式时,
更加熟悉了 Python 的基础知识。但是由于本次采用的是国外的地图,由于文化的差异,对地图中的一些信息了解的不是很清楚,
对项目的进行有一定的影响。