Repo created
This commit is contained in:
parent
4af19165ec
commit
68073add76
12458 changed files with 12350765 additions and 2 deletions
211
tools/python/planet_checks/validate_big_cities.py
Normal file
211
tools/python/planet_checks/validate_big_cities.py
Normal file
|
|
@ -0,0 +1,211 @@
|
|||
"""
|
||||
Current version of reference city list can be found at
|
||||
http://testdata.mapsme.cloud.devmail.ru/planet_checks/osm_big_cities_full_data.geojson
|
||||
|
||||
The script verifies that OSM dump contains all cities from a geojson-formatted
|
||||
reference list. This file is usually obtained with the overpass-query:
|
||||
node[place=city](if:t["population"] >= 200000 || t["capital"]=='yes');
|
||||
|
||||
Thus, the result doesn't contain some cities which, during extraction:
|
||||
1) have had 'town' status;
|
||||
2) have been [temporarily] lacking the "place=*" tag at all.
|
||||
|
||||
On the other hand, the generated reference list contains even small cities
|
||||
with poorly formatted population like "3,565" or "3 565". However this
|
||||
reference list helps to avoid losing cities that were already identified.
|
||||
|
||||
The reference list is compared with cities from the filtered planet dump:
|
||||
|
||||
/path/to/osmctools/osmfilter "${PLANET}" \
|
||||
--keep="( place=city OR place=town ) AND ( capital=yes OR capital=2 OR \
|
||||
population>=200000 )" \
|
||||
--keep-tags="all place= capital= name= name:en= population=" \
|
||||
--ignore-dependencies \
|
||||
--drop-version \
|
||||
--out-osm \
|
||||
-o="big_cities-filtered.osm"
|
||||
|
||||
Note that in the result there would not be cities with non-integer population
|
||||
tag, but we will be able to fix this tag in OSM once such cities are detected
|
||||
with the help of the script.
|
||||
|
||||
TODO:
|
||||
*) inform about big cities found in OSM but not in reference list, to
|
||||
augment reference list with cities that were broken or have had no population.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import functools
|
||||
import itertools
|
||||
import json
|
||||
import logging
|
||||
import math
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
|
||||
try:
|
||||
from lxml import etree
|
||||
except ImportError:
|
||||
import xml.etree.ElementTree as etree
|
||||
|
||||
|
||||
PLACE_TAGS = ('place', 'name', 'name:en')
|
||||
|
||||
|
||||
class ValidationError(Exception):
    """Signals that validation of the cities has failed."""
|
||||
|
||||
|
||||
def big_cities_generator(filename):
    """Yield the place nodes found in an OSM XML file, one dict per node.

    The file is parsed incrementally with ``etree.iterparse``. Only ``node``
    elements that carry a ``place`` tag are reported.

    Args:
        filename: Source handed to ``etree.iterparse``.

    Yields:
        dict with the keys:
            'id': the element id prefixed with the first letter of the
                element type (e.g. 'n123' for a node);
            'tags': dict keyed by PLACE_TAGS; a tag missing on the node
                has the value None;
            'position': [lon, lat] as floats.
    """
    for _event, element in etree.iterparse(filename):
        feature_type = element.tag
        # TODO: include also ways/relations. Being got with osmfilter they
        # don't contain coordinates. Try osmconvert --all-to-nodes.
        if feature_type != 'node':
            # Do not clear here: child <tag> elements are read below when
            # the enclosing node is processed.
            continue

        tags = dict.fromkeys(PLACE_TAGS)
        for child in element:
            if child.tag == 'tag':
                tag_name = child.get('k')
                if tag_name in tags:
                    tags[tag_name] = child.get('v')

        if tags['place'] is not None:
            yield {
                'id': f"{feature_type[0]}{element.get('id')}",
                'tags': tags,
                'position': [float(element.get(c)) for c in ('lon', 'lat')],
            }

        # We don't need the xml document tree, so clear every processed
        # node (including the ones skipped for lacking a 'place' tag)
        # to save memory.
        element.clear()
|
||||
|
||||
|
||||
def read_reference_cities(reference_geojson_filename):
    """Load the reference cities from a geojson file.

    Args:
        reference_geojson_filename: Path to the geojson file with the
            reference city list.

    Returns:
        The value of the top-level 'features' member: a list of cities as
        objects with fields 'id', 'properties', 'geometry' (as geojson
        features generated by overpass-api).

    Raises:
        KeyError: if the document has no 'features' member.
    """
    # GeoJSON is UTF-8 encoded (RFC 7946); do not depend on the locale's
    # default encoding, city names are frequently non-ASCII.
    with open(reference_geojson_filename, encoding='utf-8') as f:
        geojson = json.load(f)
    return geojson['features']
|
||||
|
||||
|
||||
def extract_osm_cities(big_cities_osm_filename):
    """Index the cities of an OSM file by their names.

    Each city produced by ``big_cities_generator`` is registered under its
    'name' and, when it differs, under its 'name:en' as well. Cities that
    have neither name are reported with a warning and skipped.

    Returns:
        defaultdict(list) mapping a name to the list of cities bearing it.
    """
    index = defaultdict(list)
    for osm_city in big_cities_generator(big_cities_osm_filename):
        city_tags = osm_city['tags']
        local_name = city_tags.get('name')
        english_name = city_tags.get('name:en')

        if not (local_name or english_name):
            logging.warning(f"City without name and name:en {osm_city['id']}")
            continue

        if local_name:
            index[local_name].append(osm_city)
        # Avoid registering the same city twice under an identical key.
        if english_name and english_name != local_name:
            index[english_name].append(osm_city)
    return index
|
||||
|
||||
|
||||
# Compares two coordinate values given in degrees: they are considered close
# when they differ by at most 0.25 (on top of math.isclose's default relative
# tolerance). With 1 degree being about 100 km this is roughly 25 km.
coord_isclose = functools.partial(math.isclose, abs_tol=0.25) # 1deg~100 km
|
||||
|
||||
|
||||
def are_locations_close(position1, position2):
    """Tell whether two positions are close in both of their coordinates.

    The first two items of each position are compared pairwise with
    ``coord_isclose``; the second pair is examined only if the first one
    is close.
    """
    if not coord_isclose(position1[0], position2[0]):
        return False
    return coord_isclose(position1[1], position2[1])
|
||||
|
||||
|
||||
def get_city_names(city):
    """Return the non-empty names of a reference city.

    The 'name:en' and 'name' properties are looked up in that order;
    missing or falsy (None, '') values are left out.
    """
    properties = city['properties']
    names = []
    for tag in ('name:en', 'name'):
        value = properties.get(tag)
        if value:
            names.append(value)
    return names
|
||||
|
||||
|
||||
def find_matching_cities_for_reference(reference_city, osm_cities_by_name):
    """Find the OSM cities that correspond to a reference city.

    An OSM city matches when it is registered under one of the names of the
    reference city and its position is close to the reference coordinates.

    Args:
        reference_city: geojson feature with 'properties' and 'geometry'.
        osm_cities_by_name: mapping of a name to a list of OSM cities, each
            a dict with 'id' and 'position'. The mapping is not modified.

    Returns:
        List of matching OSM cities, each city at most once.
    """
    matching_cities = []
    seen_ids = set()
    for name in get_city_names(reference_city):
        # Use .get() rather than indexing: indexing a defaultdict would
        # insert an empty list for every unknown reference name, and a
        # plain dict would raise KeyError.
        for city in osm_cities_by_name.get(name, ()):
            # A city may be registered under several of the names.
            if city['id'] in seen_ids:
                continue
            if are_locations_close(
                reference_city['geometry']['coordinates'],
                city['position']
            ):
                matching_cities.append(city)
                seen_ids.add(city['id'])
    return matching_cities
|
||||
|
||||
|
||||
def validate_big_cities(big_cities_osm_filename, reference_geojson_filename):
    """Check that every reference city is present in the OSM data.

    Each city of the reference file is looked up among the OSM cities.
    A reference city matched by several OSM cities is reported as a
    warning; a reference city with no match is reported as critical.

    Raises:
        ValidationError: if a reference city has no name tags, or if at
            least one reference city was not found in the OSM data.
    """
    reference_cities = read_reference_cities(reference_geojson_filename)
    osm_cities_by_name = extract_osm_cities(big_cities_osm_filename)

    has_missing_city = False
    for ref_city in reference_cities:
        names = get_city_names(ref_city)
        if not names:
            raise ValidationError(f"Reference city {ref_city['id']} "
                                  f"without name tags")
        display_name = names[0]

        matches = find_matching_cities_for_reference(
            ref_city,
            osm_cities_by_name
        )
        if len(matches) == 1:
            # Exactly one counterpart: nothing to report.
            continue

        if matches:
            logging.warning(f"More than one city {display_name} at one "
                            f"place: {[x['id'] for x in matches]}")
        else:
            has_missing_city = True
            logging.critical(f"Not found city {display_name} ({ref_city['id']})"
                             f" {ref_city['geometry']['coordinates']}")

    if has_missing_city:
        raise ValidationError("Not all cities found in OSM")
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Command-line entry point: exits with status 1 when validation fails
    # or any unexpected error occurs.
    parser = argparse.ArgumentParser()

    # Standard logging level names, listed explicitly instead of being read
    # from the private ``logging._levelToName`` mapping.
    log_levels = ['critical', 'error', 'warning', 'info', 'debug']
    parser.add_argument('-L', '--log-level', choices=log_levels,
                        default='critical', help='log level')
    parser.add_argument('-c', '--big-cities-current', required=True,
                        help='Path to *.osm with big cities')
    parser.add_argument('-r', '--big-cities-reference', required=True,
                        help='Path to geojson with required cities')
    options = parser.parse_args()

    log_level_name = options.log_level.upper()
    logging.basicConfig(level=getattr(logging, log_level_name),
                        format='%(levelname)-8s %(message)s')

    try:
        validate_big_cities(
            options.big_cities_current,
            options.big_cities_reference
        )
    except ValidationError as e:
        logging.critical(e)
        sys.exit(1)
    except Exception:
        # Unexpected failure: log the traceback at CRITICAL level so it is
        # visible even with the default log level.
        logging.critical("", exc_info=True)
        sys.exit(1)
|
||||
Loading…
Add table
Add a link
Reference in a new issue