Repo created

Fr4nz D13trich 2025-11-22 13:58:55 +01:00
parent 4af19165ec
commit 68073add76
12458 changed files with 12350765 additions and 2 deletions

@@ -0,0 +1,158 @@
import argparse
import json
import os
import sys
from post_generation.hierarchy_to_countries import (
hierarchy_to_countries as hierarchy_to_countries_,
)
from post_generation.inject_promo_ids import inject_promo_ids
from post_generation.localads_mwm_to_csv import create_csv
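# Example invocations, mirroring the usage string in PostGeneration below
# (all paths and the mwm version are placeholders, not project defaults):
#
#   post_generation hierarchy_to_countries \
#       --target /path/to/mwm --hierarchy hierarchy.txt --old old_vs_new.csv \
#       --osm borders_vs_osm.csv --countries_synonyms countries_synonyms.csv \
#       --mwm_version 230101 -o countries.txt
#
#   post_generation localads_mwm_to_csv /path/to/mwm --mwm_version 230101 --threads 4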
class PostGeneration:
def __init__(self):
parser = argparse.ArgumentParser(
description="Post-generation instruments",
usage="""post_generation <command> [<args>]
The post_generation commands are:
    localads_mwm_to_csv       Prepares CSV files for uploading to the localads database from mwm files.
    hierarchy_to_countries    Produces countries.txt from hierarchy.txt.
    inject_promo_ids          Injects promo city and country osm ids into countries.txt.
""",
)
parser.add_argument("command", help="Subcommand to run")
args = parser.parse_args(sys.argv[1:2])
if not hasattr(self, args.command):
print(f"Unrecognized command {args.command}")
parser.print_help()
exit(1)
getattr(self, args.command)()
@staticmethod
def localads_mwm_to_csv():
parser = argparse.ArgumentParser(
description="Prepares CSV files for uploading to localads database "
"from mwm files."
)
parser.add_argument("mwm", help="path to mwm files")
parser.add_argument(
"--osm2ft", help="path to osm2ft files (default is the same as mwm)"
)
parser.add_argument(
"--output", default=".", help="path to generated files ('.' by default)"
)
types_default = os.path.join(
os.path.dirname(__file__), "..", "..", "..", "data", "types.txt"
)
parser.add_argument(
"--types", default=types_default, help="path to omim/data/types.txt"
)
parser.add_argument(
"--threads", type=int, default=1, help="number of threads to process files"
)
parser.add_argument(
"--mwm_version", type=int, required=True, help="Mwm version"
)
args = parser.parse_args(sys.argv[2:])
if not args.osm2ft:
args.osm2ft = args.mwm
create_csv(
args.output,
args.mwm,
args.osm2ft,
args.mwm_version,
args.threads,
)
@staticmethod
def hierarchy_to_countries():
parser = argparse.ArgumentParser(
description="Produces countries.txt from hierarchy.txt."
)
parser.add_argument("--target", required=True, help="Path to mwm files")
parser.add_argument(
"--hierarchy", required=True, default="hierarchy.txt", help="Hierarchy file"
)
parser.add_argument("--old", required=True, help="old_vs_new.csv file")
parser.add_argument("--osm", required=True, help="borders_vs_osm.csv file")
parser.add_argument(
"--countries_synonyms", required=True, help="countries_synonyms.csv file"
)
parser.add_argument(
"--mwm_version", type=int, required=True, help="Mwm version"
)
parser.add_argument(
"-o",
"--output",
required=True,
help="Output countries.txt file (default is stdout)",
)
args = parser.parse_args(sys.argv[2:])
countries = hierarchy_to_countries_(
args.old,
args.osm,
args.countries_synonyms,
args.hierarchy,
args.target,
args.mwm_version,
)
if args.output:
with open(args.output, "w") as f:
json.dump(countries, f, ensure_ascii=False, indent=1)
else:
print(json.dumps(countries, ensure_ascii=False, indent=1))
@staticmethod
def inject_promo_ids():
parser = argparse.ArgumentParser(
description="Injects promo cities osm ids into countries.txt"
)
parser.add_argument("--mwm", required=True, help="path to mwm files")
parser.add_argument(
"--types", required=True, help="path to omim/data/types.txt"
)
parser.add_argument(
"--promo_cities", required=True, help="Path to promo cities file"
)
parser.add_argument(
"--promo_countries", required=True, help="Path to promo countries file"
)
parser.add_argument(
"--osm2ft", help="path to osm2ft files (default is the same as mwm)"
)
parser.add_argument(
"--countries",
help="path to countries.txt file (default is countries.txt file into mwm directory)",
)
parser.add_argument(
"--output",
help="Output countries.txt file (default is countries.txt file into mwm directory)",
)
args = parser.parse_args(sys.argv[2:])
if not args.osm2ft:
args.osm2ft = args.mwm
if not args.countries:
args.countries = os.path.join(args.mwm, "countries.txt")
if not args.output:
args.output = os.path.join(args.mwm, "countries.txt")
with open(args.countries) as f:
countries = json.load(f)
inject_promo_ids(
countries,
args.promo_cities,
args.promo_countries,
args.mwm,
args.types,
args.osm2ft,
)
with open(args.output, "w") as f:
json.dump(countries, f, ensure_ascii=False, indent=1)
PostGeneration()

@@ -0,0 +1,188 @@
# Produces countries.txt from hierarchy.txt
#
# Hierarchy.txt format:
#
# Sample lines:
# Iran;Q794;ir;fa
# Iran_South;Q794-South
#
# The number of leading spaces denotes the hierarchy depth. In the above case, Iran_South is inside Iran.
# Then follows a semicolon-separated list:
# 1. MWM file name without extension
# 2. Region name template using wikidata Qxxx codes and predefined strings
# 3. Country ISO code (used for flags in the legacy format)
# 4. Comma-separated list of language ISO codes for the region
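#
# Applied to the first sample line: "Iran" is the mwm file name, "Q794" the
# name template, "ir" the ISO code and "fa" the language list. Note that this
# script only consumes the mwm file name (field 1); the remaining fields are
# presumably read by other tools.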
import base64
import hashlib
import json
import os.path
import re
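# CountryDict is a plain dict that yields its keys in a fixed order ("id",
# "n", "v", ... first, then any remaining keys); iteritems() mirrors that
# order. This looks like a leftover hook for keeping the key order of the
# dumped countries.txt stable.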
class CountryDict(dict):
def __init__(self, *args, **kwargs):
dict.__init__(self, *args, **kwargs)
self.order = ["id", "n", "v", "c", "s", "sha1_base64", "rs", "g"]
def __iter__(self):
for key in self.order:
if key in self:
yield key
for key in dict.__iter__(self):
if key not in self.order:
yield key
def iteritems(self):
for key in self.__iter__():
yield (key, self.__getitem__(key))
def get_mwm_hash(path, name):
filename = os.path.join(path, f"{name}.mwm")
h = hashlib.sha1()
with open(filename, "rb") as f:
for chunk in iter(lambda: f.read(4096), b""):
h.update(chunk)
return str(base64.b64encode(h.digest()), "utf-8")
def get_mwm_size(path, name):
filename = os.path.join(path, f"{name}.mwm")
return os.path.getsize(filename)
def collapse_single(root):
for i in range(len(root["g"])):
if "g" in root["g"][i]:
if len(root["g"][i]["g"]) == 1:
# replace group by a leaf
if "c" in root["g"][i]:
root["g"][i]["g"][0]["c"] = root["g"][i]["c"]
root["g"][i] = root["g"][i]["g"][0]
else:
collapse_single(root["g"][i])
def get_name(leaf):
if "n" in leaf:
return leaf["n"].lower()
else:
return leaf["id"].lower()
def sort_tree(root):
root["g"].sort(key=get_name)
for leaf in root["g"]:
if "g" in leaf:
sort_tree(leaf)
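# The three parsers below read simple tab-separated files; the formats are
# inferred from the regular expressions they use:
#   old_vs_new.csv:          old_name<TAB>new_name
#   borders_vs_osm.csv:      region<TAB>digit<TAB>osm_affiliation
#   countries_synonyms.csv:  country_name<TAB>synonym
# Each returns a dict mapping a name to the list of matched values (old
# names, affiliations or synonyms respectively).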
def parse_old_vs_new(old_vs_new_csv_path):
oldvs = {}
if not old_vs_new_csv_path:
return oldvs
with open(old_vs_new_csv_path) as f:
for line in f:
m = re.match(r"(.+?)\t(.+)", line.strip())
assert m
if m.group(2) in oldvs:
oldvs[m.group(2)].append(m.group(1))
else:
oldvs[m.group(2)] = [m.group(1)]
return oldvs
def parse_borders_vs_osm(borders_vs_osm_csv_path):
vsosm = {}
if not borders_vs_osm_csv_path:
return vsosm
with open(borders_vs_osm_csv_path) as f:
for line in f:
m = re.match(r"(.+)\t(\d)\t(.+)", line.strip())
assert m
if m.group(1) in vsosm:
vsosm[m.group(1)].append(m.group(3))
else:
vsosm[m.group(1)] = [m.group(3)]
return vsosm
def parse_countries_synonyms(countries_synonyms_csv_path):
countries_synonyms = {}
if not countries_synonyms_csv_path:
return countries_synonyms
with open(countries_synonyms_csv_path) as f:
for line in f:
m = re.match(r"(.+)\t(.+)", line.strip())
assert m
if m.group(1) in countries_synonyms:
countries_synonyms[m.group(1)].append(m.group(2))
else:
countries_synonyms[m.group(1)] = [m.group(2)]
return countries_synonyms
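# Builds the countries.txt tree from hierarchy.txt. The result is a nested
# CountryDict, roughly of the form (illustrative sketch, values made up):
#
#   {"v": 230101, "id": "Countries", "g": [
#       {"id": "Iran", "g": [
#           {"id": "Iran_South", "s": <mwm size>, "sha1_base64": <mwm hash>},
#           ...]},
#       ...]}
#
# Leaves carry the size ("s") and hash ("sha1_base64") of the corresponding
# .mwm file; groups carry their children in "g".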
def hierarchy_to_countries(
old_vs_new_csv_path,
borders_vs_osm_csv_path,
countries_synonyms_csv_path,
hierarchy_path,
target_path,
version,
):
def fill_last(last, stack):
name = last["id"]
if not os.path.exists(os.path.join(target_path, f"{name}.mwm")):
return
last["s"] = get_mwm_size(target_path, name)
last["sha1_base64"] = get_mwm_hash(target_path, name)
if last["s"] >= 0:
stack[-1]["g"].append(last)
oldvs = parse_old_vs_new(old_vs_new_csv_path)
vsosm = parse_borders_vs_osm(borders_vs_osm_csv_path)
countries_synonyms = parse_countries_synonyms(countries_synonyms_csv_path)
stack = [CountryDict(v=int(version), id="Countries", g=[])]
last = None
with open(hierarchy_path) as f:
for line in f:
m = re.match("( *)(.+)", line)
assert m
depth = len(m.group(1))
if last is not None:
lastd = last["d"]
del last["d"]
if lastd < depth:
# last is a group
last["g"] = []
stack.append(last)
else:
fill_last(last, stack)
while depth < len(stack) - 1:
# group ended, add it to higher group
g = stack.pop()
if len(g["g"]) > 0:
stack[-1]["g"].append(g)
items = m.group(2).split(";")
last = CountryDict({"id": items[0], "d": depth})
if items[0] in oldvs:
last["old"] = oldvs[items[0]]
if items[0] in vsosm:
last["affiliations"] = vsosm[items[0]]
if items[0] in countries_synonyms:
last["country_name_synonyms"] = countries_synonyms[items[0]]
# the last line is always a file
del last["d"]
fill_last(last, stack)
while len(stack) > 1:
g = stack.pop()
if len(g["g"]) > 0:
stack[-1]["g"].append(g)
collapse_single(stack[-1])
return stack[-1]

@@ -0,0 +1,146 @@
import json
import logging
import os
import re
import sys
from multiprocessing import Pool
from mwm import Mwm
from mwm.ft2osm import read_osm2ft
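# PromoIds walks one top-level entry of the countries.txt tree: it scans the
# .mwm file behind every leaf node, collects the promo catalog country ids it
# finds into the entry's "top_countries_geo_ids", and stores the best promo
# city per leaf in that leaf's "top_city_geo_id".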
class PromoIds:
def __init__(self, countries, cities, mwm_path, types_path, osm2ft_path):
self.countries = countries
self.cities = cities
self.mwm_path = mwm_path
self.types_path = types_path
self.osm2ft_path = osm2ft_path
def inject_into_country(self, country):
nodes = self._get_nodes(country)
with Pool() as pool:
proposed_ids = pool.map(self._find, (n["id"] for n in nodes), chunksize=1)
countries_ids = [
ids for node_ids in proposed_ids for ids in node_ids["countries"]
]
if countries_ids:
country["top_countries_geo_ids"] = countries_ids
for idx, node_ids in enumerate(proposed_ids):
if not node_ids["cities"]:
continue
node = nodes[idx]
best = self._choose_best_city(node_ids["cities"])
node["top_city_geo_id"] = best["id"]
if best["id"] < 0:
node["top_city_geo_id"] += 1 << 64
def _find(self, leaf_id):
result = {"countries": [], "cities": []}
ft2osm = load_osm2ft(self.osm2ft_path, leaf_id)
for feature in Mwm(os.path.join(self.mwm_path, leaf_id + ".mwm")):
osm_id = ft2osm.get(feature.index(), None)
types = feature.readable_types()
if "sponsored-promo_catalog" in types and osm_id in self.cities:
city = self._get_city(osm_id, types)
result["cities"].append(city)
if "place-country" in types and osm_id in self.countries:
result["countries"].append(osm_id)
return result
@staticmethod
def _get_nodes(root):
def __get_nodes(node, mwm_nodes):
if "g" in node:
for item in node["g"]:
__get_nodes(item, mwm_nodes)
else:
mwm_nodes.append(node)
mwm_nodes = []
__get_nodes(root, mwm_nodes)
return mwm_nodes
def _get_city(self, osm_id, types):
city = {"id": osm_id, "count_of_guides": self.cities[osm_id], "types": []}
for t in types:
if t.startswith("place"):
city["types"].append(t)
if not city["types"]:
logging.error(
f"Incorrect types for sponsored-promo_catalog "
f"feature osm_id {osm_id}"
)
sys.exit(3)
return city
def _choose_best_city(self, proposed_cities):
def key_compare(city):
return city["count_of_guides"], self._score_city_types(city["types"])
return max(proposed_cities, key=key_compare)
def _score_city_types(self, types):
return max(self._city_type_to_int(t) for t in types)
@staticmethod
def _city_type_to_int(t):
if t == "place-town":
return 1
if t == "place-city":
return 2
m = re.match(r"^place-city-capital?(-(?P<admin_level>\d+)|)$", t)
if m:
admin_level = int(m.groupdict("1")["admin_level"])
if 1 <= admin_level <= 12:
return 14 - admin_level
return 0
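# load_promo_ids expects a JSON file of the shape (inferred from the loop
# below; the numbers are illustrative):
#   {"data": [{"osmid": 123, "paid_bundles_count": 4}, ...]}
# and returns a dict mapping osm id -> paid bundles count.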
def load_promo_ids(path):
with open(path) as f:
root = json.load(f)
ids = {}
for item in root["data"]:
ids[item["osmid"]] = item["paid_bundles_count"]
return ids
def load_osm2ft(osm2ft_path, mwm_id):
osm2ft_name = os.path.join(osm2ft_path, mwm_id + ".mwm.osm2ft")
if not os.path.exists(osm2ft_name):
logging.error(f"Cannot find {osm2ft_name}")
sys.exit(3)
with open(osm2ft_name, "rb") as f:
return read_osm2ft(f, ft2osm=True, tuples=False)
def inject_promo_ids(
countries_json,
promo_cities_path,
promo_countries_path,
mwm_path,
types_path,
osm2ft_path,
):
promo_ids = PromoIds(
load_promo_ids(promo_countries_path),
load_promo_ids(promo_cities_path),
mwm_path,
types_path,
osm2ft_path,
)
for country in countries_json["g"]:
promo_ids.inject_into_country(country)

@@ -0,0 +1 @@
omim-mwm

@@ -0,0 +1 @@
-r ../mwm/requirements_dev.txt

@@ -0,0 +1,31 @@
#!/usr/bin/env python3
import os
import sys
import setuptools
module_dir = os.path.abspath(os.path.dirname(__file__))
sys.path.insert(0, os.path.join(module_dir, "..", "..", ".."))
from pyhelpers.setup import chdir
from pyhelpers.setup import get_version
from pyhelpers.setup import get_requirements
with chdir(module_dir):
setuptools.setup(
name="omim-post_generation",
version=str(get_version()),
author="CoMaps",
author_email="info@comaps.app",
description="This package is a library for post-processing the generated maps.",
url="https://codeberg.org/comaps",
package_dir={"post_generation": ""},
packages=["post_generation"],
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: Apache Software License",
],
python_requires=">=3.6",
install_requires=get_requirements(),
)