Repo created

This commit is contained in:
parent 4af19165ec
commit 68073add76

12458 changed files with 12350765 additions and 2 deletions

0 tools/python/descriptions/__init__.py Normal file

63 tools/python/descriptions/__main__.py Normal file

@@ -0,0 +1,63 @@
import argparse
import itertools
import logging
import os

import wikipediaapi

from descriptions.descriptions_downloader import check_and_get_checker
from descriptions.descriptions_downloader import download_from_wikidata_tags
from descriptions.descriptions_downloader import download_from_wikipedia_tags
from descriptions.descriptions_downloader import log


def parse_args():
    parser = argparse.ArgumentParser(
        description="Download wiki pages.",
        usage="python3 -m descriptions "
              "--output_dir ~/maps_build/descriptions "
              "--wikipedia ~/maps_build/wiki_urls.txt "
              "--wikidata ~/maps_build/id_to_wikidata.csv "
              "--langs en de fr es ru tr"
    )
    parser.add_argument(
        "--output_dir", metavar="PATH", type=str, help="Output dir for saving pages."
    )
    parser.add_argument(
        "--popularity", metavar="PATH", type=str,
        help="File with popular object ids with wikipedia data to download. If not given, download all objects.",
    )
    parser.add_argument(
        "--wikipedia", metavar="PATH", type=str, required=True, help="Input file with wikipedia urls.",
    )
    parser.add_argument(
        "--wikidata", metavar="PATH", type=str, help="Input file with wikidata ids."
    )
    parser.add_argument(
        "--langs", metavar="LANGS", type=str, nargs="+", action="append",
        help="Languages for pages. If left blank, pages in all available languages will be loaded.",
    )
    return parser.parse_args()


def main():
    log.setLevel(logging.WARNING)
    wikipediaapi.log.setLevel(logging.DEBUG)

    args = parse_args()
    wikipedia_file = args.wikipedia
    wikidata_file = args.wikidata
    output_dir = args.output_dir
    popularity_file = args.popularity
    langs = list(itertools.chain.from_iterable(args.langs or []))

    os.makedirs(output_dir, exist_ok=True)
    checker = check_and_get_checker(popularity_file)
    download_from_wikipedia_tags(wikipedia_file, output_dir, langs, checker)

    if wikidata_file is None:
        log.warning("Wikidata file not set.")
    elif os.path.exists(wikidata_file):
        download_from_wikidata_tags(wikidata_file, output_dir, langs, checker)
    else:
        log.warning(f"Wikidata ({wikidata_file}) file not found.")


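# Note: this is the package's __main__ module (run via `python3 -m descriptions`),
# so main() is invoked unconditionally at module load; no __name__ guard is used.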
main()

318 tools/python/descriptions/descriptions_downloader.py Normal file

@@ -0,0 +1,318 @@
import json
import logging
import os
import random
import time
import types
import urllib.error
import urllib.parse
import http.client

from concurrent.futures import ThreadPoolExecutor

import htmlmin
import requests
import wikipediaapi
from bs4 import BeautifulSoup
from wikidata.client import Client

from descriptions.exceptions import GettingError, ParseError

"""
This script downloads Wikipedia pages for different languages.
"""
log = logging.getLogger(__name__)

WORKERS = 80
REQUEST_ATTEMPTS = 8
ATTEMPTS_PAUSE_SECONDS = 4.0

HEADERS = {f"h{x}" for x in range(1, 7)}
BAD_SECTIONS = {
    "en": [
        "External links",
        "Sources",
        "See also",
        "Bibliography",
        "Further reading",
        "References",
    ],
    "de": [
        "Einzelnachweise",
        "Weblinks",
        "Literatur",
        "Siehe auch",
        "Anmerkungen",
        "Anmerkungen und Einzelnachweise",
        "Filme",
        "Einzelbelege",
    ],
    "fr": [
        "Bibliographie",
        "Lien externe",
        "Voir aussi",
        "Liens externes",
        "Références",
        "Notes et références",
        "Articles connexes",
    ],
    "es": ["Vínculos de interés", "Véase también", "Enlaces externos", "Referencias"],
    "ru": ["Литература", "Ссылки", "См. также", "Библиография", "Примечания"],
    "tr": ["Kaynakça", "Ayrıca bakınız", "Dış bağlantılar", "Notlar", "Dipnot"],
}


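# try_get: fetch attribute `prop` from `obj`, calling it with *args/**kwargs when it is a
# method. Transient network/JSON/HTTP-client errors are retried up to REQUEST_ATTEMPTS times
# with a random pause; 404s, missing keys and URL errors raise GettingError immediately.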
def try_get(obj, prop, *args, **kwargs):
    attempts = REQUEST_ATTEMPTS
    while attempts != 0:
        try:
            attr = getattr(obj, prop)
            is_method = isinstance(attr, types.MethodType)
            return attr(*args, **kwargs) if is_method else attr
        except (
            requests.exceptions.ConnectionError,
            requests.exceptions.ReadTimeout,
            json.decoder.JSONDecodeError,
            http.client.HTTPException,
        ) as e:
            log.debug(e)
        except urllib.error.HTTPError as e:
            if e.code == 404:
                raise GettingError(f"Page not found {e.msg}")
        except KeyError:
            raise GettingError(f"Getting {prop} field failed. {prop} not found.")
        except urllib.error.URLError:
            raise GettingError(f"URLError: {obj}, {prop}, {args}, {kwargs}")

        time.sleep(random.uniform(0.0, ATTEMPTS_PAUSE_SECONDS))
        attempts -= 1

    raise GettingError(f"Getting {prop} field failed")


def read_popularity(path):
    """
    :param path: path of the popularity file. The file contains '<id>,<rank>' rows.
    :return: a set of popular object ids
    """
    ids = set()
    for line in open(path):
        try:
            ident = int(line.split(",", maxsplit=1)[0])
        except (AttributeError, IndexError):
            continue
        ids.add(ident)
    return ids


def should_download_page(popularity_set):
    def wrapped(ident):
        return popularity_set is None or ident in popularity_set

    return wrapped


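# remove_bad_sections: drop "service" sections (references, external links, etc., as listed
# in BAD_SECTIONS for the page language) together with everything below them, up to the next
# header of the same level.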
def remove_bad_sections(soup, lang):
    if lang not in BAD_SECTIONS:
        return soup
    it = iter(soup.find_all())
    current = next(it, None)
    current_header_level = None
    while current is not None:
        if current.name in HEADERS and current.text.strip() in BAD_SECTIONS[lang]:
            current_header_level = current.name
            current.extract()
            current = next(it, None)
            while current is not None:
                if current.name == current_header_level:
                    break
                current.extract()
                current = next(it, None)
        else:
            current = next(it, None)
    return soup


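# beautify_page: strip elements with no visible text, remove unwanted sections for the given
# language, then prettify and minify the resulting HTML.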
def beautify_page(html, lang):
    soup = BeautifulSoup(html, "html.parser")
    for x in soup.find_all():
        if len(x.text.strip()) == 0:
            x.extract()
    soup = remove_bad_sections(soup, lang)
    html = str(soup.prettify())
    html = htmlmin.minify(html, remove_empty_space=True)
    return html


def need_lang(lang, langs):
    return lang in langs if langs else True


def get_page_info(url):
    url = urllib.parse.unquote(url)
    parsed = urllib.parse.urlparse(url)
    try:
        lang = parsed.netloc.split(".", maxsplit=1)[0]
    except (AttributeError, IndexError):
        raise ParseError(f"{parsed.netloc} is incorrect.")
    try:
        page_name = parsed.path.rsplit("/", maxsplit=1)[-1]
    except (AttributeError, IndexError):
        raise ParseError(f"{parsed.path} is incorrect.")
    return lang, page_name


def get_wiki_page(lang, page_name):
    wiki = wikipediaapi.Wikipedia(
        language=lang, extract_format=wikipediaapi.ExtractFormat.HTML
    )
    return wiki.page(page_name)


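# download: resolve language and page name from a Wikipedia URL, skip the page if
# <directory>/<lang>.html already exists, otherwise fetch the HTML extract, clean it with
# beautify_page and write it to disk. Returns the page text, or None on skip/failure.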
def download(directory, url):
    try:
        lang, page_name = get_page_info(url)
    except ParseError:
        log.exception(f"Parsing failed. {url} is incorrect.")
        return None

    path = os.path.join(directory, f"{lang}.html")
    if os.path.exists(path):
        log.debug(f"{path} already exists.")
        return None

    page = get_wiki_page(lang, page_name)
    try:
        text = try_get(page, "text")
    except GettingError as e:
        log.exception(f"Error: page {page_name} is not downloaded for lang {lang} and url {url} ({e}).")
        return None

    page_size = len(text)
    if page_size > 0:
        os.makedirs(directory, exist_ok=True)
        text = beautify_page(text, lang)
        log.info(f"Save to {path} {lang} {page_name} {page_size}.")
        with open(path, "w") as file:
            file.write(text)
    else:
        log.warning(f"Page {url} is empty. It has not been saved.")

    return text


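# get_wiki_langs: return (lang, url) pairs for every language edition of the page, obtained
# from the page's langlinks, always including the originally requested URL.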
def get_wiki_langs(url):
    lang, page_name = get_page_info(url)
    page = get_wiki_page(lang, page_name)

    curr_lang = [(lang, url)]
    try:
        langlinks = try_get(page, "langlinks")
        return (
            list(zip(langlinks.keys(), [link.fullurl for link in langlinks.values()]))
            + curr_lang
        )
    except GettingError as e:
        log.exception(f"Error: no languages for page {page_name} with url {url} ({e}).")
        return curr_lang


def download_all_from_wikipedia(path, url, langs):
    try:
        available_langs = get_wiki_langs(url)
    except ParseError:
        log.exception(f"Parsing failed. {url} is incorrect.")
        return
    available_langs = filter(lambda x: need_lang(x[0], langs), available_langs)
    for lang in available_langs:
        download(path, lang[1])


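# wikipedia_worker: build a per-line callback for the wikipedia url file. Each line is
# "<mwm_path>\t<id>\t<url>"; ids rejected by the popularity checker are skipped, and accepted
# urls are downloaded (in every requested language) under <output_dir>/<netloc>/<path>.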
def wikipedia_worker(output_dir, checker, langs):
    def wrapped(line):
        if not line.strip():
            return
        try:
            # The first field is the mwm_path that produced this line entry.
            _, ident, url = line.split("\t")
            ident = int(ident)
            if not checker(ident):
                return
            url = url.strip()
        except (AttributeError, ValueError):
            log.exception(f"{line} is incorrect.")
            return
        parsed = urllib.parse.urlparse(url)
        path = os.path.join(output_dir, parsed.netloc, parsed.path[1:])
        download_all_from_wikipedia(path, url, langs)

    return wrapped


def download_from_wikipedia_tags(input_file, output_dir, langs, checker):
    with open(input_file) as file:
        _ = file.readline()  # skip header
        with ThreadPoolExecutor(WORKERS) as pool:
            pool.map(wikipedia_worker(output_dir, checker, langs), file)


def get_wikidata_urls(entity, langs):
    try:
        keys = entity.data["sitelinks"].keys()
    except (KeyError, AttributeError):
        log.exception(f"Sitelinks not found for {entity.id}.")
        return None
    return [
        entity.data["sitelinks"][k]["url"]
        for k in keys
        if any([k.startswith(lang) for lang in langs])
    ]


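# wikidata_worker: build a per-line callback for the wikidata id file. Each line is
# "<id>\t<wikidata_id>"; the entity is fetched from Wikidata and every sitelink matching the
# requested languages is downloaded under <output_dir>/<wikidata_id>.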
def wikidata_worker(output_dir, checker, langs):
    def wrapped(line):
        if not line.strip():
            return
        try:
            ident, wikidata_id = line.split("\t")
            ident = int(ident)
            wikidata_id = wikidata_id.strip()
            if not checker(ident):
                return
        except (AttributeError, ValueError):
            log.exception(f"{line} is incorrect.")
            return
        client = Client()
        try:
            entity = try_get(client, "get", wikidata_id, load=True)
        except GettingError:
            log.exception(f"Error: page is not downloaded {wikidata_id}.")
            return
        urls = get_wikidata_urls(entity, langs)
        if not urls:
            return
        path = os.path.join(output_dir, wikidata_id)
        for url in urls:
            download(path, url)

    return wrapped


def download_from_wikidata_tags(input_file, output_dir, langs, checker):
    wikidata_output_dir = os.path.join(output_dir, "wikidata")
    os.makedirs(wikidata_output_dir, exist_ok=True)
    with open(input_file) as file:
        with ThreadPoolExecutor(WORKERS) as pool:
            pool.map(wikidata_worker(wikidata_output_dir, checker, langs), file)


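# check_and_get_checker: build the id filter. With no popularity file every id is accepted;
# with a valid file only the ids listed there pass; a missing file is logged as an error and
# everything is accepted.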
def check_and_get_checker(popularity_file):
    popularity_set = None
    if popularity_file is None:
        log.warning("Popularity file not set.")
    elif os.path.exists(popularity_file):
        popularity_set = read_popularity(popularity_file)
        log.info(f"Popularity set size: {len(popularity_set)}.")
    else:
        log.error(f"Popularity file ({popularity_file}) not found.")
    return should_download_page(popularity_set)
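
For reference (not part of the commit): the downloader's entry points can also be exercised directly from Python. A minimal sketch, assuming the descriptions package is importable; the file names, id and URL below are made up for illustration:

    import os

    from descriptions.descriptions_downloader import (
        check_and_get_checker,
        download_from_wikipedia_tags,
    )

    os.makedirs("sample_build", exist_ok=True)
    with open("sample_build/wiki_urls.txt", "w") as f:
        f.write("mwm_path\tid\turl\n")  # first line is treated as a header and skipped
        f.write("World.mwm\t42\thttps://en.wikipedia.org/wiki/Brandenburg_Gate\n")

    checker = check_and_get_checker(None)  # no popularity file: download every object
    download_from_wikipedia_tags(
        "sample_build/wiki_urls.txt", "sample_build/descriptions", ["en", "de"], checker
    )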

10 tools/python/descriptions/exceptions.py Normal file

@@ -0,0 +1,10 @@
class DescriptionError(Exception):
    pass


class ParseError(DescriptionError):
    pass


class GettingError(DescriptionError):
    pass

5 tools/python/descriptions/requirements.txt Normal file

@@ -0,0 +1,5 @@
htmlmin2==0.1.13
requests>=2.31.0
beautifulsoup4==4.9.1
wikidata==0.6.1
wikipedia-api==0.5.4

5 tools/python/descriptions/requirements_dev.txt Normal file

@@ -0,0 +1,5 @@
htmlmin2==0.1.13
requests>=2.31.0
beautifulsoup4==4.9.1
wikidata==0.6.1
wikipedia-api==0.5.4

32 tools/python/descriptions/setup.py Executable file

@@ -0,0 +1,32 @@
#!/usr/bin/env python3
import os
import sys

import setuptools

module_dir = os.path.abspath(os.path.dirname(__file__))
sys.path.insert(0, os.path.join(module_dir, "..", "..", ".."))

from pyhelpers.setup import chdir
from pyhelpers.setup import get_version
from pyhelpers.setup import get_requirements


with chdir(os.path.abspath(os.path.dirname(__file__))):
    setuptools.setup(
        name="omim-descriptions",
        version=str(get_version()),
        author="CoMaps",
        author_email="info@comaps.app",
        description="This package is a library that provides descriptions "
                    "(such as those from Wikipedia) for geographic objects.",
        url="https://codeberg.org/comaps",
        package_dir={"descriptions": ""},
        packages=["descriptions"],
        classifiers=[
            "Programming Language :: Python :: 3",
            "License :: OSI Approved :: Apache Software License",
        ],
        python_requires=">=3.6",
        install_requires=get_requirements(),
    )