Repo created

This commit is contained in:
Fr4nz D13trich 2025-11-22 13:58:55 +01:00
parent 4af19165ec
commit 68073add76
12458 changed files with 12350765 additions and 2 deletions

View file

@ -0,0 +1,63 @@
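"""Command-line entry point for downloading wiki pages (see parse_args for the options)."""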
import argparse
import itertools
import logging
import os

import wikipediaapi

from descriptions.descriptions_downloader import check_and_get_checker
from descriptions.descriptions_downloader import download_from_wikidata_tags
from descriptions.descriptions_downloader import download_from_wikipedia_tags
from descriptions.descriptions_downloader import log


def parse_args():
    parser = argparse.ArgumentParser(
        description="Download wiki pages.",
        usage="python3 -m descriptions "
        "--output_dir ~/maps_build/descriptions "
        "--wikipedia ~/maps_build/wiki_urls.txt "
        "--wikidata ~/maps_build/id_to_wikidata.csv "
        "--langs en de fr es ru tr",
    )
    parser.add_argument(
        "--output_dir", metavar="PATH", type=str, help="Output dir for saving pages."
    )
    parser.add_argument(
        "--popularity", metavar="PATH", type=str,
        help="File with ids of popular objects whose wikipedia data should be "
        "downloaded. If not given, all objects are downloaded.",
    )
    parser.add_argument(
        "--wikipedia", metavar="PATH", type=str, required=True,
        help="Input file with wikipedia urls.",
    )
    parser.add_argument(
        "--wikidata", metavar="PATH", type=str, help="Input file with wikidata ids."
    )
    parser.add_argument(
        "--langs", metavar="LANGS", type=str, nargs="+", action="append",
        help="Languages for pages. If left blank, pages in all available "
        "languages will be loaded.",
    )
    return parser.parse_args()


def main():
    log.setLevel(logging.WARNING)
    wikipediaapi.log.setLevel(logging.DEBUG)

    args = parse_args()

    wikipedia_file = args.wikipedia
    wikidata_file = args.wikidata
    output_dir = args.output_dir
    popularity_file = args.popularity
    # --langs is optional; None means "download all available languages".
    langs = list(itertools.chain.from_iterable(args.langs)) if args.langs else None

    os.makedirs(output_dir, exist_ok=True)

    checker = check_and_get_checker(popularity_file)
    download_from_wikipedia_tags(wikipedia_file, output_dir, langs, checker)
    if wikidata_file is None:
        log.warning("Wikidata file not set.")
    elif os.path.exists(wikidata_file):
        download_from_wikidata_tags(wikidata_file, output_dir, langs, checker)
    else:
        log.warning(f"Wikidata ({wikidata_file}) file not found.")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,318 @@
import json
import logging
import os
import random
import time
import types
import urllib.error
import urllib.parse
import http.client
from concurrent.futures import ThreadPoolExecutor
import htmlmin
import requests
import wikipediaapi
from bs4 import BeautifulSoup
from wikidata.client import Client
from descriptions.exceptions import GettingError, ParseError
"""
This script downloads Wikipedia pages for different languages.
"""
log = logging.getLogger(__name__)
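# Tuning knobs: number of download threads, retry attempts per request,
# and the upper bound (in seconds) of the random pause between retries.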
WORKERS = 80
REQUEST_ATTEMPTS = 8
ATTEMPTS_PAUSE_SECONDS = 4.0
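# HTML heading tags (h1-h6), used to detect section boundaries.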
HEADERS = {f"h{x}" for x in range(1, 7)}
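# Per-language section titles (references, external links, etc.) that are
# stripped from the saved pages.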
BAD_SECTIONS = {
"en": [
"External links",
"Sources",
"See also",
"Bibliography",
"Further reading",
"References",
],
"de": [
"Einzelnachweise",
"Weblinks",
"Literatur",
"Siehe auch",
"Anmerkungen",
"Anmerkungen und Einzelnachweise",
"Filme",
"Einzelbelege",
],
"fr": [
"Bibliographie",
"Lien externe",
"Voir aussi",
"Liens externes",
"Références",
"Notes et références",
"Articles connexes",
],
"es": ["Vínculos de interés", "Véase también", "Enlaces externos", "Referencias"],
"ru": ["Литература", "Ссылки", "См. также", "Библиография", "Примечания"],
"tr": ["Kaynakça", "Ayrıca bakınız", "Dış bağlantılar", "Notlar", "Dipnot"],
}
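# Read an attribute (or call a method) of obj, retrying transient network and
# JSON errors up to REQUEST_ATTEMPTS times with a random pause between attempts.
# Raises GettingError when the value cannot be obtained.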
def try_get(obj, prop, *args, **kwargs):
attempts = REQUEST_ATTEMPTS
while attempts != 0:
try:
attr = getattr(obj, prop)
is_method = isinstance(attr, types.MethodType)
return attr(*args, **kwargs) if is_method else attr
except (
requests.exceptions.ConnectionError,
requests.exceptions.ReadTimeout,
json.decoder.JSONDecodeError,
http.client.HTTPException,
) as e:
log.debug(e)
except urllib.error.HTTPError as e:
if e.code == 404:
raise GettingError(f"Page not found {e.msg}")
except KeyError:
raise GettingError(f"Getting {prop} field failed. {prop} not found.")
except urllib.error.URLError:
raise GettingError(f"URLError: {obj}, {prop}, {args}, {kwargs}")
time.sleep(random.uniform(0.0, ATTEMPTS_PAUSE_SECONDS))
attempts -= 1
raise GettingError(f"Getting {prop} field failed")
def read_popularity(path):
"""
    :param path: path to the popularity file; each row is '<id>,<rank>'.
    :return: a set of popular object ids
"""
ids = set()
    with open(path) as file:
        for line in file:
            try:
                ident = int(line.split(",", maxsplit=1)[0])
            except (AttributeError, IndexError, ValueError):
                continue
            ids.add(ident)
return ids
def should_download_page(popularity_set):
def wrapped(ident):
return popularity_set is None or ident in popularity_set
return wrapped
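# Walk the parsed document in order: when a heading whose text is in the
# per-language BAD_SECTIONS list is found, remove it together with everything
# up to the next heading of the same level.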
def remove_bad_sections(soup, lang):
if lang not in BAD_SECTIONS:
return soup
it = iter(soup.find_all())
current = next(it, None)
current_header_level = None
while current is not None:
if current.name in HEADERS and current.text.strip() in BAD_SECTIONS[lang]:
current_header_level = current.name
current.extract()
current = next(it, None)
while current is not None:
if current.name == current_header_level:
break
current.extract()
current = next(it, None)
else:
current = next(it, None)
return soup
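# Drop empty elements and unwanted sections, then prettify and minify the HTML.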
def beautify_page(html, lang):
soup = BeautifulSoup(html, "html.parser")
for x in soup.find_all():
if len(x.text.strip()) == 0:
x.extract()
soup = remove_bad_sections(soup, lang)
html = str(soup.prettify())
html = htmlmin.minify(html, remove_empty_space=True)
return html
def need_lang(lang, langs):
return lang in langs if langs else True
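# Extract the language (subdomain) and page name (last path component) from a
# Wikipedia URL, e.g. https://en.wikipedia.org/wiki/Berlin -> ("en", "Berlin").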
def get_page_info(url):
url = urllib.parse.unquote(url)
parsed = urllib.parse.urlparse(url)
try:
lang = parsed.netloc.split(".", maxsplit=1)[0]
except (AttributeError, IndexError):
raise ParseError(f"{parsed.netloc} is incorrect.")
try:
page_name = parsed.path.rsplit("/", maxsplit=1)[-1]
except (AttributeError, IndexError):
raise ParseError(f"{parsed.path} is incorrect.")
return lang, page_name
def get_wiki_page(lang, page_name):
wiki = wikipediaapi.Wikipedia(
language=lang, extract_format=wikipediaapi.ExtractFormat.HTML
)
return wiki.page(page_name)
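# Download one Wikipedia page to <directory>/<lang>.html. Returns the page
# text, or None when the page is skipped or cannot be fetched.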
def download(directory, url):
try:
lang, page_name = get_page_info(url)
except ParseError:
log.exception(f"Parsing failed. {url} is incorrect.")
return None
path = os.path.join(directory, f"{lang}.html")
if os.path.exists(path):
log.debug(f"{path} already exists.")
return None
page = get_wiki_page(lang, page_name)
try:
text = try_get(page, "text")
except GettingError as e:
log.exception(f"Error: page {page_name} is not downloaded for lang {lang} and url {url} ({e}).")
return None
page_size = len(text)
if page_size > 0:
os.makedirs(directory, exist_ok=True)
text = beautify_page(text, lang)
log.info(f"Save to {path} {lang} {page_name} {page_size}.")
with open(path, "w") as file:
file.write(text)
else:
log.warning(f"Page {url} is empty. It has not been saved.")
return text
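# Return (language, url) pairs for all language versions of the page,
# including the original URL itself.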
def get_wiki_langs(url):
lang, page_name = get_page_info(url)
page = get_wiki_page(lang, page_name)
curr_lang = [(lang, url)]
try:
langlinks = try_get(page, "langlinks")
return (
list(zip(langlinks.keys(), [link.fullurl for link in langlinks.values()]))
+ curr_lang
)
except GettingError as e:
log.exception(f"Error: no languages for page {page_name} with url {url} ({e}).")
return curr_lang
def download_all_from_wikipedia(path, url, langs):
try:
available_langs = get_wiki_langs(url)
except ParseError:
log.exception("Parsing failed. {url} is incorrect.")
return
available_langs = filter(lambda x: need_lang(x[0], langs), available_langs)
for lang in available_langs:
download(path, lang[1])
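# Worker for one line of the wikipedia urls file; each line is expected to be
# "<mwm path>\t<object id>\t<wikipedia url>".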
def wikipedia_worker(output_dir, checker, langs):
def wrapped(line):
if not line.strip():
return
try:
            # The first field is the mwm path that produced this entry; it is not needed here.
_, ident, url = line.split("\t")
ident = int(ident)
if not checker(ident):
return
url = url.strip()
except (AttributeError, ValueError):
log.exception(f"{line} is incorrect.")
return
parsed = urllib.parse.urlparse(url)
path = os.path.join(output_dir, parsed.netloc, parsed.path[1:])
download_all_from_wikipedia(path, url, langs)
return wrapped
def download_from_wikipedia_tags(input_file, output_dir, langs, checker):
with open(input_file) as file:
_ = file.readline() # skip header
with ThreadPoolExecutor(WORKERS) as pool:
pool.map(wikipedia_worker(output_dir, checker, langs), file)
def get_wikidata_urls(entity, langs):
try:
keys = entity.data["sitelinks"].keys()
except (KeyError, AttributeError):
log.exception(f"Sitelinks not found for {entity.id}.")
return None
return [
entity.data["sitelinks"][k]["url"]
for k in keys
        if not langs or any(k.startswith(lang) for lang in langs)
]
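# Worker for one line of the wikidata ids file; each line is expected to be
# "<object id>\t<wikidata id>".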
def wikidata_worker(output_dir, checker, langs):
def wrapped(line):
if not line.strip():
return
try:
ident, wikidata_id = line.split("\t")
ident = int(ident)
wikidata_id = wikidata_id.strip()
if not checker(ident):
return
except (AttributeError, ValueError):
log.exception(f"{line} is incorrect.")
return
client = Client()
try:
entity = try_get(client, "get", wikidata_id, load=True)
except GettingError:
log.exception(f"Error: page is not downloaded {wikidata_id}.")
return
urls = get_wikidata_urls(entity, langs)
if not urls:
return
path = os.path.join(output_dir, wikidata_id)
for url in urls:
download(path, url)
return wrapped
def download_from_wikidata_tags(input_file, output_dir, langs, checker):
wikidata_output_dir = os.path.join(output_dir, "wikidata")
os.makedirs(wikidata_output_dir, exist_ok=True)
with open(input_file) as file:
with ThreadPoolExecutor(WORKERS) as pool:
pool.map(wikidata_worker(wikidata_output_dir, checker, langs), file)
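# Build the popularity filter: the returned predicate accepts every id when no
# popularity file is available, and only popular ids otherwise.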
def check_and_get_checker(popularity_file):
popularity_set = None
if popularity_file is None:
log.warning(f"Popularity file not set.")
elif os.path.exists(popularity_file):
popularity_set = read_popularity(popularity_file)
log.info(f"Popularity set size: {len(popularity_set)}.")
else:
log.error(f"Popularity file ({popularity_file}) not found.")
return should_download_page(popularity_set)

View file

@ -0,0 +1,10 @@
class DescriptionError(Exception):
    pass


class ParseError(DescriptionError):
    pass


class GettingError(DescriptionError):
    pass

View file

@ -0,0 +1,5 @@
htmlmin2==0.1.13
requests>=2.31.0
beautifulsoup4==4.9.1
wikidata==0.6.1
wikipedia-api==0.5.4

View file

@ -0,0 +1,32 @@
#!/usr/bin/env python3
import os
import sys
import setuptools
module_dir = os.path.abspath(os.path.dirname(__file__))
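# Make the shared pyhelpers package (three directories up) importable.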
sys.path.insert(0, os.path.join(module_dir, "..", "..", ".."))
from pyhelpers.setup import chdir
from pyhelpers.setup import get_version
from pyhelpers.setup import get_requirements
with chdir(module_dir):
setuptools.setup(
name="omim-descriptions",
version=str(get_version()),
author="CoMaps",
author_email="info@comaps.app",
description="This package is a library that provides descriptions "
"(such as those from Wikipedia) to geographic objects.",
url="https://codeberg.org/comaps",
package_dir={"descriptions": ""},
packages=["descriptions"],
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: Apache Software License",
],
python_requires=">=3.6",
install_requires=get_requirements(),
)