Repo created

Fr4nz D13trich 2025-11-22 13:58:55 +01:00
parent 4af19165ec
commit 68073add76
12458 changed files with 12350765 additions and 2 deletions

@@ -0,0 +1,158 @@
import argparse
import json
import os
import sys
from post_generation.hierarchy_to_countries import (
hierarchy_to_countries as hierarchy_to_countries_,
)
from post_generation.inject_promo_ids import inject_promo_ids
from post_generation.localads_mwm_to_csv import create_csv
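# Example invocations, mirroring the usage string in PostGeneration below
# (all paths and the mwm version are placeholders, not project defaults):
#
#   post_generation hierarchy_to_countries \
#       --target /path/to/mwm --hierarchy hierarchy.txt --old old_vs_new.csv \
#       --osm borders_vs_osm.csv --countries_synonyms countries_synonyms.csv \
#       --mwm_version 230101 -o countries.txt
#
#   post_generation localads_mwm_to_csv /path/to/mwm --mwm_version 230101 --threads 4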
class PostGeneration:
def __init__(self):
parser = argparse.ArgumentParser(
description="Post-generation instruments",
usage="""post_generation <command> [<args>]
The post_generation commands are:
    localads_mwm_to_csv       Prepares CSV files for uploading to the localads database from mwm files.
    hierarchy_to_countries    Produces countries.txt from hierarchy.txt.
    inject_promo_ids          Injects promo city and country osm ids into countries.txt.
""",
)
parser.add_argument("command", help="Subcommand to run")
args = parser.parse_args(sys.argv[1:2])
if not hasattr(self, args.command):
print(f"Unrecognized command {args.command}")
parser.print_help()
exit(1)
getattr(self, args.command)()
@staticmethod
def localads_mwm_to_csv():
parser = argparse.ArgumentParser(
description="Prepares CSV files for uploading to localads database "
"from mwm files."
)
parser.add_argument("mwm", help="path to mwm files")
parser.add_argument(
"--osm2ft", help="path to osm2ft files (default is the same as mwm)"
)
parser.add_argument(
"--output", default=".", help="path to generated files ('.' by default)"
)
types_default = os.path.join(
os.path.dirname(__file__), "..", "..", "..", "data", "types.txt"
)
parser.add_argument(
"--types", default=types_default, help="path to omim/data/types.txt"
)
parser.add_argument(
"--threads", type=int, default=1, help="number of threads to process files"
)
parser.add_argument(
"--mwm_version", type=int, required=True, help="Mwm version"
)
args = parser.parse_args(sys.argv[2:])
if not args.osm2ft:
args.osm2ft = args.mwm
create_csv(
args.output,
args.mwm,
args.osm2ft,
args.mwm_version,
args.threads,
)
@staticmethod
def hierarchy_to_countries():
parser = argparse.ArgumentParser(
description="Produces countries.txt from hierarchy.txt."
)
parser.add_argument("--target", required=True, help="Path to mwm files")
parser.add_argument(
"--hierarchy", required=True, default="hierarchy.txt", help="Hierarchy file"
)
parser.add_argument("--old", required=True, help="old_vs_new.csv file")
parser.add_argument("--osm", required=True, help="borders_vs_osm.csv file")
parser.add_argument(
"--countries_synonyms", required=True, help="countries_synonyms.csv file"
)
parser.add_argument(
"--mwm_version", type=int, required=True, help="Mwm version"
)
parser.add_argument(
"-o",
"--output",
required=True,
help="Output countries.txt file (default is stdout)",
)
args = parser.parse_args(sys.argv[2:])
countries = hierarchy_to_countries_(
args.old,
args.osm,
args.countries_synonyms,
args.hierarchy,
args.target,
args.mwm_version,
)
if args.output:
with open(args.output, "w") as f:
json.dump(countries, f, ensure_ascii=False, indent=1)
else:
print(json.dumps(countries, ensure_ascii=False, indent=1))
@staticmethod
def inject_promo_ids():
parser = argparse.ArgumentParser(
description="Injects promo cities osm ids into countries.txt"
)
parser.add_argument("--mwm", required=True, help="path to mwm files")
parser.add_argument(
"--types", required=True, help="path to omim/data/types.txt"
)
parser.add_argument(
"--promo_cities", required=True, help="Path to promo cities file"
)
parser.add_argument(
"--promo_countries", required=True, help="Path to promo countries file"
)
parser.add_argument(
"--osm2ft", help="path to osm2ft files (default is the same as mwm)"
)
parser.add_argument(
"--countries",
help="path to countries.txt file (default is countries.txt file into mwm directory)",
)
parser.add_argument(
"--output",
help="Output countries.txt file (default is countries.txt file into mwm directory)",
)
args = parser.parse_args(sys.argv[2:])
if not args.osm2ft:
args.osm2ft = args.mwm
if not args.countries:
args.countries = os.path.join(args.mwm, "countries.txt")
if not args.output:
args.output = os.path.join(args.mwm, "countries.txt")
with open(args.countries) as f:
countries = json.load(f)
inject_promo_ids(
countries,
args.promo_cities,
args.promo_countries,
args.mwm,
args.types,
args.osm2ft,
)
with open(args.output, "w") as f:
json.dump(countries, f, ensure_ascii=False, indent=1)
PostGeneration()

@@ -0,0 +1,188 @@
# Produces countries.txt from hierarchy.txt
#
# Hierarchy.txt format:
#
# Sample lines:
# Iran;Q794;ir;fa
# Iran_South;Q794-South
#
# The number of leading spaces denotes the hierarchy depth. In the above case, Iran_South is inside Iran.
# Then follows a semicolon-separated list:
# 1. MWM file name without extension
# 2. Region name template using wikidata Qxxx codes and predefined strings
# 3. Country ISO code (used for flags in the legacy format)
# 4. Comma-separated list of language ISO codes for the region
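#
# Applied to the first sample line: "Iran" is the mwm file name, "Q794" the
# name template, "ir" the ISO code and "fa" the language list. Note that this
# script only consumes the mwm file name (field 1); the remaining fields are
# presumably read by other tools.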
import base64
import hashlib
import json
import os.path
import re
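# CountryDict is a plain dict that yields its keys in a fixed order ("id",
# "n", "v", ... first, then any remaining keys); iteritems() mirrors that
# order. This looks like a leftover hook for keeping the key order of the
# dumped countries.txt stable.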
class CountryDict(dict):
def __init__(self, *args, **kwargs):
dict.__init__(self, *args, **kwargs)
self.order = ["id", "n", "v", "c", "s", "sha1_base64", "rs", "g"]
def __iter__(self):
for key in self.order:
if key in self:
yield key
for key in dict.__iter__(self):
if key not in self.order:
yield key
def iteritems(self):
for key in self.__iter__():
yield (key, self.__getitem__(key))
def get_mwm_hash(path, name):
filename = os.path.join(path, f"{name}.mwm")
h = hashlib.sha1()
with open(filename, "rb") as f:
for chunk in iter(lambda: f.read(4096), b""):
h.update(chunk)
return str(base64.b64encode(h.digest()), "utf-8")
def get_mwm_size(path, name):
filename = os.path.join(path, f"{name}.mwm")
return os.path.getsize(filename)
def collapse_single(root):
for i in range(len(root["g"])):
if "g" in root["g"][i]:
if len(root["g"][i]["g"]) == 1:
# replace group by a leaf
if "c" in root["g"][i]:
root["g"][i]["g"][0]["c"] = root["g"][i]["c"]
root["g"][i] = root["g"][i]["g"][0]
else:
collapse_single(root["g"][i])
def get_name(leaf):
if "n" in leaf:
return leaf["n"].lower()
else:
return leaf["id"].lower()
def sort_tree(root):
root["g"].sort(key=get_name)
for leaf in root["g"]:
if "g" in leaf:
sort_tree(leaf)
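# The three parsers below read simple tab-separated files; the formats are
# inferred from the regular expressions they use:
#   old_vs_new.csv:          old_name<TAB>new_name
#   borders_vs_osm.csv:      region<TAB>digit<TAB>osm_affiliation
#   countries_synonyms.csv:  country_name<TAB>synonym
# Each returns a dict mapping a name to the list of matched values (old
# names, affiliations or synonyms respectively).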
def parse_old_vs_new(old_vs_new_csv_path):
oldvs = {}
if not old_vs_new_csv_path:
return oldvs
with open(old_vs_new_csv_path) as f:
for line in f:
m = re.match(r"(.+?)\t(.+)", line.strip())
assert m
if m.group(2) in oldvs:
oldvs[m.group(2)].append(m.group(1))
else:
oldvs[m.group(2)] = [m.group(1)]
return oldvs
def parse_borders_vs_osm(borders_vs_osm_csv_path):
vsosm = {}
if not borders_vs_osm_csv_path:
return vsosm
with open(borders_vs_osm_csv_path) as f:
for line in f:
m = re.match(r"(.+)\t(\d)\t(.+)", line.strip())
assert m
if m.group(1) in vsosm:
vsosm[m.group(1)].append(m.group(3))
else:
vsosm[m.group(1)] = [m.group(3)]
return vsosm
def parse_countries_synonyms(countries_synonyms_csv_path):
countries_synonyms = {}
if not countries_synonyms_csv_path:
return countries_synonyms
with open(countries_synonyms_csv_path) as f:
for line in f:
m = re.match(r"(.+)\t(.+)", line.strip())
assert m
if m.group(1) in countries_synonyms:
countries_synonyms[m.group(1)].append(m.group(2))
else:
countries_synonyms[m.group(1)] = [m.group(2)]
return countries_synonyms
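# Builds the countries.txt tree from hierarchy.txt. The result is a nested
# CountryDict, roughly of the form (illustrative sketch, values made up):
#
#   {"v": 230101, "id": "Countries", "g": [
#       {"id": "Iran", "g": [
#           {"id": "Iran_South", "s": <mwm size>, "sha1_base64": <mwm hash>},
#           ...]},
#       ...]}
#
# Leaves carry the size ("s") and hash ("sha1_base64") of the corresponding
# .mwm file; groups carry their children in "g".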
def hierarchy_to_countries(
old_vs_new_csv_path,
borders_vs_osm_csv_path,
countries_synonyms_csv_path,
hierarchy_path,
target_path,
version,
):
def fill_last(last, stack):
name = last["id"]
if not os.path.exists(os.path.join(target_path, f"{name}.mwm")):
return
last["s"] = get_mwm_size(target_path, name)
last["sha1_base64"] = get_mwm_hash(target_path, name)
if last["s"] >= 0:
stack[-1]["g"].append(last)
oldvs = parse_old_vs_new(old_vs_new_csv_path)
vsosm = parse_borders_vs_osm(borders_vs_osm_csv_path)
countries_synonyms = parse_countries_synonyms(countries_synonyms_csv_path)
stack = [CountryDict(v=int(version), id="Countries", g=[])]
last = None
with open(hierarchy_path) as f:
for line in f:
m = re.match("( *)(.+)", line)
assert m
depth = len(m.group(1))
if last is not None:
lastd = last["d"]
del last["d"]
if lastd < depth:
# last is a group
last["g"] = []
stack.append(last)
else:
fill_last(last, stack)
while depth < len(stack) - 1:
# group ended, add it to higher group
g = stack.pop()
if len(g["g"]) > 0:
stack[-1]["g"].append(g)
items = m.group(2).split(";")
last = CountryDict({"id": items[0], "d": depth})
if items[0] in oldvs:
last["old"] = oldvs[items[0]]
if items[0] in vsosm:
last["affiliations"] = vsosm[items[0]]
if items[0] in countries_synonyms:
last["country_name_synonyms"] = countries_synonyms[items[0]]
# the last line is always a file
del last["d"]
fill_last(last, stack)
while len(stack) > 1:
g = stack.pop()
if len(g["g"]) > 0:
stack[-1]["g"].append(g)
collapse_single(stack[-1])
return stack[-1]

@@ -0,0 +1,146 @@
import json
import logging
import os
import re
import sys
from multiprocessing import Pool
from mwm import Mwm
from mwm.ft2osm import read_osm2ft
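# PromoIds walks one top-level entry of the countries.txt tree: it scans the
# .mwm file behind every leaf node, collects the promo catalog country ids it
# finds into the entry's "top_countries_geo_ids", and stores the best promo
# city per leaf in that leaf's "top_city_geo_id".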
class PromoIds:
def __init__(self, countries, cities, mwm_path, types_path, osm2ft_path):
self.countries = countries
self.cities = cities
self.mwm_path = mwm_path
self.types_path = types_path
self.osm2ft_path = osm2ft_path
def inject_into_country(self, country):
nodes = self._get_nodes(country)
with Pool() as pool:
proposed_ids = pool.map(self._find, (n["id"] for n in nodes), chunksize=1)
countries_ids = [
ids for node_ids in proposed_ids for ids in node_ids["countries"]
]
if countries_ids:
country["top_countries_geo_ids"] = countries_ids
for idx, node_ids in enumerate(proposed_ids):
if not node_ids["cities"]:
continue
node = nodes[idx]
best = self._choose_best_city(node_ids["cities"])
node["top_city_geo_id"] = best["id"]
if best["id"] < 0:
node["top_city_geo_id"] += 1 << 64
def _find(self, leaf_id):
result = {"countries": [], "cities": []}
ft2osm = load_osm2ft(self.osm2ft_path, leaf_id)
for feature in Mwm(os.path.join(self.mwm_path, leaf_id + ".mwm")):
osm_id = ft2osm.get(feature.index(), None)
types = feature.readable_types()
if "sponsored-promo_catalog" in types and osm_id in self.cities:
city = self._get_city(osm_id, types)
result["cities"].append(city)
if "place-country" in types and osm_id in self.countries:
result["countries"].append(osm_id)
return result
@staticmethod
def _get_nodes(root):
def __get_nodes(node, mwm_nodes):
if "g" in node:
for item in node["g"]:
__get_nodes(item, mwm_nodes)
else:
mwm_nodes.append(node)
mwm_nodes = []
__get_nodes(root, mwm_nodes)
return mwm_nodes
def _get_city(self, osm_id, types):
city = {"id": osm_id, "count_of_guides": self.cities[osm_id], "types": []}
for t in types:
if t.startswith("place"):
city["types"].append(t)
if not city["types"]:
logging.error(
f"Incorrect types for sponsored-promo_catalog "
f"feature osm_id {osm_id}"
)
sys.exit(3)
return city
def _choose_best_city(self, proposed_cities):
def key_compare(city):
return city["count_of_guides"], self._score_city_types(city["types"])
return max(proposed_cities, key=key_compare)
def _score_city_types(self, types):
return max(self._city_type_to_int(t) for t in types)
@staticmethod
def _city_type_to_int(t):
if t == "place-town":
return 1
if t == "place-city":
return 2
m = re.match(r"^place-city-capital?(-(?P<admin_level>\d+)|)$", t)
if m:
admin_level = int(m.groupdict("1")["admin_level"])
if 1 <= admin_level <= 12:
return 14 - admin_level
return 0
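# load_promo_ids expects a JSON file of the shape (inferred from the loop
# below; the numbers are illustrative):
#   {"data": [{"osmid": 123, "paid_bundles_count": 4}, ...]}
# and returns a dict mapping osm id -> paid bundles count.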
def load_promo_ids(path):
with open(path) as f:
root = json.load(f)
ids = {}
for item in root["data"]:
ids[item["osmid"]] = item["paid_bundles_count"]
return ids
def load_osm2ft(osm2ft_path, mwm_id):
osm2ft_name = os.path.join(osm2ft_path, mwm_id + ".mwm.osm2ft")
if not os.path.exists(osm2ft_name):
logging.error(f"Cannot find {osm2ft_name}")
sys.exit(3)
with open(osm2ft_name, "rb") as f:
return read_osm2ft(f, ft2osm=True, tuples=False)
def inject_promo_ids(
countries_json,
promo_cities_path,
promo_countries_path,
mwm_path,
types_path,
osm2ft_path,
):
promo_ids = PromoIds(
load_promo_ids(promo_countries_path),
load_promo_ids(promo_cities_path),
mwm_path,
types_path,
osm2ft_path,
)
for country in countries_json["g"]:
promo_ids.inject_into_country(country)

@@ -0,0 +1 @@
omim-mwm

@@ -0,0 +1 @@
-r ../mwm/requirements_dev.txt

@@ -0,0 +1,31 @@
#!/usr/bin/env python3
import os
import sys
import setuptools
module_dir = os.path.abspath(os.path.dirname(__file__))
sys.path.insert(0, os.path.join(module_dir, "..", "..", ".."))
from pyhelpers.setup import chdir
from pyhelpers.setup import get_version
from pyhelpers.setup import get_requirements
with chdir(module_dir):
setuptools.setup(
name="omim-post_generation",
version=str(get_version()),
author="CoMaps",
author_email="info@comaps.app",
description="This package is a library for post-processing the generated maps.",
url="https://codeberg.org/comaps",
package_dir={"post_generation": ""},
packages=["post_generation"],
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: Apache Software License",
],
python_requires=">=3.6",
install_requires=get_requirements(),
)