Repo created
This commit is contained in:
parent 4af19165ec
commit 68073add76
12458 changed files with 12350765 additions and 2 deletions
0    tools/python/post_generation/__init__.py    Normal file

158  tools/python/post_generation/__main__.py    Normal file
@@ -0,0 +1,158 @@
import argparse
import json
import os
import sys

from post_generation.hierarchy_to_countries import (
    hierarchy_to_countries as hierarchy_to_countries_,
)
from post_generation.inject_promo_ids import inject_promo_ids
from post_generation.localads_mwm_to_csv import create_csv


class PostGeneration:
    def __init__(self):
        parser = argparse.ArgumentParser(
            description="Post-generation instruments",
            usage="""post_generation <command> [<args>]
The post_generation commands are:
    localads_mwm_to_csv     Prepares CSV files for uploading to the localads database from mwm files.
    hierarchy_to_countries  Produces countries.txt from hierarchy.txt.
    inject_promo_ids        Injects promo OSM ids into countries.txt.
    """,
        )
        parser.add_argument("command", help="Subcommand to run")
        args = parser.parse_args(sys.argv[1:2])
        if not hasattr(self, args.command):
            print(f"Unrecognized command {args.command}")
            parser.print_help()
            exit(1)
        # Dispatch to the static method named after the subcommand.
        getattr(self, args.command)()

    @staticmethod
    def localads_mwm_to_csv():
        parser = argparse.ArgumentParser(
            description="Prepares CSV files for uploading to the localads database "
            "from mwm files."
        )
        parser.add_argument("mwm", help="path to mwm files")
        parser.add_argument(
            "--osm2ft", help="path to osm2ft files (default is the same as mwm)"
        )
        parser.add_argument(
            "--output", default=".", help="path to generated files ('.' by default)"
        )
        types_default = os.path.join(
            os.path.dirname(__file__), "..", "..", "..", "data", "types.txt"
        )
        parser.add_argument(
            "--types", default=types_default, help="path to omim/data/types.txt"
        )
        parser.add_argument(
            "--threads", type=int, default=1, help="number of threads to process files"
        )
        parser.add_argument(
            "--mwm_version", type=int, required=True, help="Mwm version"
        )
        args = parser.parse_args(sys.argv[2:])
        if not args.osm2ft:
            args.osm2ft = args.mwm

        create_csv(
            args.output,
            args.mwm,
            args.osm2ft,
            args.mwm_version,
            args.threads,
        )

    @staticmethod
    def hierarchy_to_countries():
        parser = argparse.ArgumentParser(
            description="Produces countries.txt from hierarchy.txt."
        )
        parser.add_argument("--target", required=True, help="Path to mwm files")
        parser.add_argument(
            "--hierarchy", required=True, default="hierarchy.txt", help="Hierarchy file"
        )
        parser.add_argument("--old", required=True, help="old_vs_new.csv file")
        parser.add_argument("--osm", required=True, help="borders_vs_osm.csv file")
        parser.add_argument(
            "--countries_synonyms", required=True, help="countries_synonyms.csv file"
        )
        parser.add_argument(
            "--mwm_version", type=int, required=True, help="Mwm version"
        )
        parser.add_argument(
            "-o",
            "--output",
            required=True,
            help="Output countries.txt file (default is stdout)",
        )
        args = parser.parse_args(sys.argv[2:])
        countries = hierarchy_to_countries_(
            args.old,
            args.osm,
            args.countries_synonyms,
            args.hierarchy,
            args.target,
            args.mwm_version,
        )
        if args.output:
            with open(args.output, "w") as f:
                json.dump(countries, f, ensure_ascii=False, indent=1)
        else:
            print(json.dumps(countries, ensure_ascii=False, indent=1))

    @staticmethod
    def inject_promo_ids():
        parser = argparse.ArgumentParser(
            description="Injects promo cities OSM ids into countries.txt"
        )
        parser.add_argument("--mwm", required=True, help="path to mwm files")
        parser.add_argument(
            "--types", required=True, help="path to omim/data/types.txt"
        )
        parser.add_argument(
            "--promo_cities", required=True, help="Path to promo cities file"
        )
        parser.add_argument(
            "--promo_countries", required=True, help="Path to promo countries file"
        )
        parser.add_argument(
            "--osm2ft", help="path to osm2ft files (default is the same as mwm)"
        )
        parser.add_argument(
            "--countries",
            help="path to countries.txt file (default is countries.txt in the mwm directory)",
        )
        parser.add_argument(
            "--output",
            help="Output countries.txt file (default is countries.txt in the mwm directory)",
        )
        args = parser.parse_args(sys.argv[2:])

        if not args.osm2ft:
            args.osm2ft = args.mwm
        if not args.countries:
            args.countries = os.path.join(args.mwm, "countries.txt")
        if not args.output:
            args.output = os.path.join(args.mwm, "countries.txt")

        with open(args.countries) as f:
            countries = json.load(f)

        inject_promo_ids(
            countries,
            args.promo_cities,
            args.promo_countries,
            args.mwm,
            args.types,
            args.osm2ft,
        )

        with open(args.output, "w") as f:
            json.dump(countries, f, ensure_ascii=False, indent=1)


PostGeneration()
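For context when reading the entry point above: the subcommands would typically be run through the package's __main__ module. The invocations below are illustrative only; they assume tools/python is on PYTHONPATH, and every path and version number is a placeholder rather than a value taken from this commit.

python3 -m post_generation localads_mwm_to_csv /path/to/mwm --mwm_version 210000 --threads 4
python3 -m post_generation hierarchy_to_countries --target /path/to/mwm --hierarchy hierarchy.txt --old old_vs_new.csv --osm borders_vs_osm.csv --countries_synonyms countries_synonyms.csv --mwm_version 210000 -o countries.txt
python3 -m post_generation inject_promo_ids --mwm /path/to/mwm --types data/types.txt --promo_cities promo_cities.json --promo_countries promo_countries.json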
188  tools/python/post_generation/hierarchy_to_countries.py    Executable file
@@ -0,0 +1,188 @@
# Produces countries.txt from hierarchy.txt
#
# hierarchy.txt format:
#
# Sample lines:
# Iran;Q794;ir;fa
#  Iran_South;Q794-South
#
# The number of leading spaces gives the hierarchy depth. In the above case, Iran_South is inside Iran.
# Then follows a semicolon-separated list:
# 1. MWM file name without extension
# 2. Region name template using Wikidata Qxxx codes and predefined strings
# 3. Country ISO code (used for flags in the legacy format)
# 4. Comma-separated list of language ISO codes for the region

import base64
import hashlib
import json
import os.path
import re


class CountryDict(dict):
    def __init__(self, *args, **kwargs):
        dict.__init__(self, *args, **kwargs)
        self.order = ["id", "n", "v", "c", "s", "sha1_base64", "rs", "g"]

    def __iter__(self):
        # Yield known keys in a fixed order first, then any remaining keys.
        for key in self.order:
            if key in self:
                yield key
        for key in dict.__iter__(self):
            if key not in self.order:
                yield key

    def iteritems(self):
        for key in self.__iter__():
            yield (key, self.__getitem__(key))


def get_mwm_hash(path, name):
    filename = os.path.join(path, f"{name}.mwm")
    h = hashlib.sha1()
    with open(filename, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            h.update(chunk)
    return str(base64.b64encode(h.digest()), "utf-8")


def get_mwm_size(path, name):
    filename = os.path.join(path, f"{name}.mwm")
    return os.path.getsize(filename)


def collapse_single(root):
    for i in range(len(root["g"])):
        if "g" in root["g"][i]:
            if len(root["g"][i]["g"]) == 1:
                # Replace a single-child group by its leaf.
                if "c" in root["g"][i]:
                    root["g"][i]["g"][0]["c"] = root["g"][i]["c"]
                root["g"][i] = root["g"][i]["g"][0]
            else:
                collapse_single(root["g"][i])


def get_name(leaf):
    if "n" in leaf:
        return leaf["n"].lower()
    else:
        return leaf["id"].lower()


def sort_tree(root):
    root["g"].sort(key=get_name)
    for leaf in root["g"]:
        if "g" in leaf:
            sort_tree(leaf)


def parse_old_vs_new(old_vs_new_csv_path):
    oldvs = {}
    if not old_vs_new_csv_path:
        return oldvs

    with open(old_vs_new_csv_path) as f:
        for line in f:
            m = re.match(r"(.+?)\t(.+)", line.strip())
            assert m
            if m.group(2) in oldvs:
                oldvs[m.group(2)].append(m.group(1))
            else:
                oldvs[m.group(2)] = [m.group(1)]
    return oldvs


def parse_borders_vs_osm(borders_vs_osm_csv_path):
    vsosm = {}
    if not borders_vs_osm_csv_path:
        return vsosm

    with open(borders_vs_osm_csv_path) as f:
        for line in f:
            m = re.match(r"(.+)\t(\d)\t(.+)", line.strip())
            assert m
            if m.group(1) in vsosm:
                vsosm[m.group(1)].append(m.group(3))
            else:
                vsosm[m.group(1)] = [m.group(3)]
    return vsosm


def parse_countries_synonyms(countries_synonyms_csv_path):
    countries_synonyms = {}
    if not countries_synonyms_csv_path:
        return countries_synonyms

    with open(countries_synonyms_csv_path) as f:
        for line in f:
            m = re.match(r"(.+)\t(.+)", line.strip())
            assert m
            if m.group(1) in countries_synonyms:
                countries_synonyms[m.group(1)].append(m.group(2))
            else:
                countries_synonyms[m.group(1)] = [m.group(2)]
    return countries_synonyms


def hierarchy_to_countries(
    old_vs_new_csv_path,
    borders_vs_osm_csv_path,
    countries_synonyms_csv_path,
    hierarchy_path,
    target_path,
    version,
):
    def fill_last(last, stack):
        name = last["id"]
        if not os.path.exists(os.path.join(target_path, f"{name}.mwm")):
            return
        last["s"] = get_mwm_size(target_path, name)
        last["sha1_base64"] = get_mwm_hash(target_path, name)
        if last["s"] >= 0:
            stack[-1]["g"].append(last)

    oldvs = parse_old_vs_new(old_vs_new_csv_path)
    vsosm = parse_borders_vs_osm(borders_vs_osm_csv_path)
    countries_synonyms = parse_countries_synonyms(countries_synonyms_csv_path)
    stack = [CountryDict(v=int(version), id="Countries", g=[])]
    last = None
    with open(hierarchy_path) as f:
        for line in f:
            m = re.match("( *)(.+)", line)
            assert m
            depth = len(m.group(1))
            if last is not None:
                lastd = last["d"]
                del last["d"]
                if lastd < depth:
                    # last is a group
                    last["g"] = []
                    stack.append(last)
                else:
                    fill_last(last, stack)
                    while depth < len(stack) - 1:
                        # group ended, add it to the higher group
                        g = stack.pop()
                        if len(g["g"]) > 0:
                            stack[-1]["g"].append(g)
            items = m.group(2).split(";")
            last = CountryDict({"id": items[0], "d": depth})
            if items[0] in oldvs:
                last["old"] = oldvs[items[0]]
            if items[0] in vsosm:
                last["affiliations"] = vsosm[items[0]]
            if items[0] in countries_synonyms:
                last["country_name_synonyms"] = countries_synonyms[items[0]]

    # the last line is always a file
    del last["d"]
    fill_last(last, stack)
    while len(stack) > 1:
        g = stack.pop()
        if len(g["g"]) > 0:
            stack[-1]["g"].append(g)

    collapse_single(stack[-1])
    return stack[-1]
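A minimal sketch of driving this module directly, mirroring what the hierarchy_to_countries subcommand in __main__.py does; every file path and the version number are placeholders, not values from this commit:

from post_generation.hierarchy_to_countries import hierarchy_to_countries
import json

# Hypothetical inputs; the CSV files are tab-separated, as expected by the parsers above.
countries = hierarchy_to_countries(
    "old_vs_new.csv",          # old vs new map-name pairs
    "borders_vs_osm.csv",      # border-to-OSM affiliation records
    "countries_synonyms.csv",  # country name synonyms
    "hierarchy.txt",
    "/path/to/mwm",            # directory containing the generated .mwm files
    210000,                    # mwm version
)
with open("countries.txt", "w") as f:
    json.dump(countries, f, ensure_ascii=False, indent=1)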
146  tools/python/post_generation/inject_promo_ids.py    Normal file
@@ -0,0 +1,146 @@
import json
import logging
import os
import re
import sys
from multiprocessing import Pool

from mwm import Mwm
from mwm.ft2osm import read_osm2ft


class PromoIds(object):
    def __init__(self, countries, cities, mwm_path, types_path, osm2ft_path):
        self.countries = countries
        self.cities = cities
        self.mwm_path = mwm_path
        self.types_path = types_path
        self.osm2ft_path = osm2ft_path

    def inject_into_country(self, country):
        nodes = self._get_nodes(country)
        with Pool() as pool:
            proposed_ids = pool.map(self._find, (n["id"] for n in nodes), chunksize=1)

        countries_ids = [
            ids for node_ids in proposed_ids for ids in node_ids["countries"]
        ]
        if countries_ids:
            country["top_countries_geo_ids"] = countries_ids

        for idx, node_ids in enumerate(proposed_ids):
            if not node_ids["cities"]:
                continue
            node = nodes[idx]
            best = self._choose_best_city(node_ids["cities"])
            node["top_city_geo_id"] = best["id"]
            if best["id"] < 0:
                # Re-encode negative osm ids as unsigned 64-bit values.
                node["top_city_geo_id"] += 1 << 64

    def _find(self, leaf_id):
        result = {"countries": [], "cities": []}
        ft2osm = load_osm2ft(self.osm2ft_path, leaf_id)

        for feature in Mwm(os.path.join(self.mwm_path, leaf_id + ".mwm")):
            osm_id = ft2osm.get(feature.index(), None)
            types = feature.readable_types()

            if "sponsored-promo_catalog" in types and osm_id in self.cities:
                city = self._get_city(osm_id, types)
                result["cities"].append(city)

            if "place-country" in types and osm_id in self.countries:
                result["countries"].append(osm_id)

        return result

    @staticmethod
    def _get_nodes(root):
        def __get_nodes(node, mwm_nodes):
            if "g" in node:
                for item in node["g"]:
                    __get_nodes(item, mwm_nodes)
            else:
                mwm_nodes.append(node)

        mwm_nodes = []
        __get_nodes(root, mwm_nodes)
        return mwm_nodes

    def _get_city(self, osm_id, types):
        city = {"id": osm_id, "count_of_guides": self.cities[osm_id], "types": []}

        for t in types:
            if t.startswith("place"):
                city["types"].append(t)

        if not city["types"]:
            logging.error(
                f"Incorrect types for sponsored-promo_catalog "
                f"feature osm_id {osm_id}"
            )
            sys.exit(3)

        return city

    def _choose_best_city(self, proposed_cities):
        def key_compare(city):
            return city["count_of_guides"], self._score_city_types(city["types"])

        return max(proposed_cities, key=key_compare)

    def _score_city_types(self, types):
        return max(self._city_type_to_int(t) for t in types)

    @staticmethod
    def _city_type_to_int(t):
        # Towns score 1, cities 2, capitals 14 - admin_level (so lower admin levels win).
        if t == "place-town":
            return 1
        if t == "place-city":
            return 2

        m = re.match(r"^place-city-capital?(-(?P<admin_level>\d+)|)$", t)
        if m:
            admin_level = int(m.groupdict("1")["admin_level"])
            if 1 <= admin_level <= 12:
                return 14 - admin_level
        return 0


def load_promo_ids(path):
    with open(path) as f:
        root = json.load(f)

    ids = {}
    for item in root["data"]:
        ids[item["osmid"]] = item["paid_bundles_count"]

    return ids


def load_osm2ft(osm2ft_path, mwm_id):
    osm2ft_name = os.path.join(osm2ft_path, mwm_id + ".mwm.osm2ft")
    if not os.path.exists(osm2ft_name):
        logging.error(f"Cannot find {osm2ft_name}")
        sys.exit(3)
    with open(osm2ft_name, "rb") as f:
        return read_osm2ft(f, ft2osm=True, tuples=False)


def inject_promo_ids(
    countries_json,
    promo_cities_path,
    promo_countries_path,
    mwm_path,
    types_path,
    osm2ft_path,
):
    promo_ids = PromoIds(
        load_promo_ids(promo_countries_path),
        load_promo_ids(promo_cities_path),
        mwm_path,
        types_path,
        osm2ft_path,
    )
    for country in countries_json["g"]:
        promo_ids.inject_into_country(country)
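For reference, load_promo_ids above expects JSON with a top-level "data" array; the sketch below shows that assumed shape with invented values:

# Hypothetical promo cities/countries file contents; the osm ids and counts are invented.
example = {
    "data": [
        {"osmid": 1234567, "paid_bundles_count": 3},
        {"osmid": -7654321, "paid_bundles_count": 1},
    ]
}
# load_promo_ids(path) on such a file returns {1234567: 3, -7654321: 1}.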
1    tools/python/post_generation/requirements.txt    Normal file
@@ -0,0 +1 @@
omim-mwm
1    tools/python/post_generation/requirements_dev.txt    Normal file
@@ -0,0 +1 @@
-r ../mwm/requirements_dev.txt
31   tools/python/post_generation/setup.py    Normal file
@@ -0,0 +1,31 @@
#!/usr/bin/env python3
import os
import sys

import setuptools

module_dir = os.path.abspath(os.path.dirname(__file__))
sys.path.insert(0, os.path.join(module_dir, "..", "..", ".."))

from pyhelpers.setup import chdir
from pyhelpers.setup import get_version
from pyhelpers.setup import get_requirements


with chdir(os.path.abspath(os.path.dirname(__file__))):
    setuptools.setup(
        name="omim-post_generation",
        version=str(get_version()),
        author="CoMaps",
        author_email="info@comaps.app",
        description="This package is a library for post-processing the generated maps.",
        url="https://codeberg.org/comaps",
        package_dir={"post_generation": ""},
        packages=["post_generation"],
        classifiers=[
            "Programming Language :: Python :: 3",
            "License :: OSI Approved :: Apache Software License",
        ],
        python_requires=">=3.6",
        install_requires=get_requirements(),
    )
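Assuming the pyhelpers import above resolves (it is pulled in from three directories up in the repository tree), the package would be installed from its own directory in the usual setuptools way; the line below is a sketch of intent, not a workflow documented in this commit:

cd tools/python/post_generation && pip install .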