Skip to content

Commit

Permalink
[mod] replace utils.match_language by locales.match_locale
Browse files Browse the repository at this point in the history
This patch replaces the *full of magic* ``utils.match_language`` function by a
``locales.match_locale``.  The ``locales.match_locale`` function is based on the
``locales.build_engine_locales`` introduced in 9ae409a [1].

In the past SearXNG did only support a search by a language but not in a region.
This has been changed a long time ago and regions have been added to SearXNG
core but not to the engines.  The ``utils.match_language`` was the function to
handle the different aspects of language/regions in SearXNG core and the
supported *languages* in the engine.  The ``utils.match_language`` did it with
some magic and works good for most use cases but fails in some edge case.

To replace the concurrence of languages and regions in the SearXNG core the
``locales.build_engine_locales`` was introduced in 9ae409a [1].  With the last
patches all engines has been migrated to a ``fetch_traits`` and a
language/region concept that is based on ``locales.build_engine_locales``.

To summarize: there is no longer a need for the ``locales.match_language``.

[1] searxng#1652

Signed-off-by: Markus Heiser <[email protected]>
  • Loading branch information
return42 committed Mar 24, 2023
1 parent 4d4aa13 commit 16f0db4
Show file tree
Hide file tree
Showing 6 changed files with 240 additions and 138 deletions.
108 changes: 107 additions & 1 deletion searx/locales.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"""Initialize :py:obj:`LOCALE_NAMES`, :py:obj:`RTL_LOCALES`.
"""

from typing import Set
from typing import Set, Optional, List
import os
import pathlib

Expand Down Expand Up @@ -177,6 +177,17 @@ def language_tag(locale: babel.Locale) -> str:
return sxng_lang


def get_locale(locale_tag: str) -> Optional[babel.Locale]:
"""Returns a :py:obj:`babel.Locale` object parsed from argument
``locale_tag``"""
try:
locale = babel.Locale.parse(locale_tag, sep='-')
return locale

except babel.core.UnknownLocaleError:
return None


def get_offical_locales(
territory: str, languages=None, regional: bool = False, de_facto: bool = True
) -> Set[babel.Locale]:
Expand Down Expand Up @@ -363,3 +374,98 @@ def get_engine_locale(searxng_locale, engine_locales, default=None):
engine_locale = default

return default


def match_locale(searxng_locale: str, locale_tag_list: List[str], fallback: Optional[str] = None) -> Optional[str]:
"""Return tag from ``locale_tag_list`` that best fits to ``searxng_locale``.
:param str searxng_locale: SearXNG's internal representation of locale (de,
de-DE, fr-BE, zh, zh-CN, zh-TW ..).
:param list locale_tag_list: The list of locale tags to select from
:param str fallback: fallback locale tag (if unset --> ``None``)
The rules to find a match are implemented in :py:obj:`get_engine_locale`,
the ``engine_locales`` is build up by :py:obj:`build_engine_locales`.
.. hint::
The *SearXNG locale* string and the members of ``locale_tag_list`` has to
be known by babel! The :py:obj:`ADDITIONAL_TRANSLATIONS` are used in the
UI and are not known by babel --> will be ignored.
"""

# searxng_locale = 'es'
# locale_tag_list = ['es-AR', 'es-ES', 'es-MX']

if not searxng_locale:
return fallback

locale = get_locale(searxng_locale)
if locale is None:
return fallback

# normalize to a SearXNG locale that can be passed to get_engine_locale

searxng_locale = language_tag(locale)
if locale.territory:
searxng_locale = region_tag(locale)

# clean up locale_tag_list

tag_list = []
for tag in locale_tag_list:
if tag in ('all', 'auto') or tag in ADDITIONAL_TRANSLATIONS:
continue
tag_list.append(tag)

# emulate fetch_traits
engine_locales = build_engine_locales(tag_list)
return get_engine_locale(searxng_locale, engine_locales, default=fallback)


def build_engine_locales(tag_list: List[str]):
"""From a list of locale tags a dictionary is build that can be passed by
argument ``engine_locales`` to :py:obj:`get_engine_locale`. This function
is mainly used by :py:obj:`match_locale` and is similar to what the
``fetch_traits(..)`` function of engines do.
If there are territory codes in the ``tag_list`` that have a *script code*
additional keys are added to the returned dictionary.
.. code:: python
>>> import locales
>>> engine_locales = locales.build_engine_locales(['en', 'en-US', 'zh', 'zh-CN', 'zh-TW'])
>>> engine_locales
{
'en': 'en', 'en-US': 'en-US',
'zh': 'zh', 'zh-CN': 'zh-CN', 'zh_Hans': 'zh-CN',
'zh-TW': 'zh-TW', 'zh_Hant': 'zh-TW'
}
>>> get_engine_locale('zh-Hans', engine_locales)
'zh-CN'
This function is a good example to understand the language/region model
of SearXNG:
SearXNG only distinguishes between **search languages** and **search
regions**, by adding the *script-tags*, languages with *script-tags* can
be assigned to the **regions** that SearXNG supports.
"""
engine_locales = {}

for tag in tag_list:
locale = get_locale(tag)
if locale is None:
logger.warn("build_engine_locales: skip locale tag %s / unknown by babel", tag)
continue
if locale.territory:
engine_locales[region_tag(locale)] = tag
if locale.script:
engine_locales[language_tag(locale)] = tag
else:
engine_locales[language_tag(locale)] = tag
return engine_locales
88 changes: 0 additions & 88 deletions searx/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,6 @@

from lxml import html
from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError, _ElementStringResult, _ElementUnicodeResult
from babel.core import get_global


from searx import settings
from searx.data import USER_AGENTS, data_dir
Expand Down Expand Up @@ -365,92 +363,6 @@ def is_valid_lang(lang) -> Optional[Tuple[bool, str, str]]:
return None


def _get_lang_to_lc_dict(lang_list: List[str]) -> Dict[str, str]:
key = str(lang_list)
value = _LANG_TO_LC_CACHE.get(key, None)
if value is None:
value = {}
for lang in lang_list:
value.setdefault(lang.split('-')[0], lang)
_LANG_TO_LC_CACHE[key] = value
return value


# babel's get_global contains all sorts of miscellaneous locale and territory related data
# see get_global in: https://github.com/python-babel/babel/blob/master/babel/core.py
def _get_from_babel(lang_code: str, key):
match = get_global(key).get(lang_code.replace('-', '_'))
# for some keys, such as territory_aliases, match may be a list
if isinstance(match, str):
return match.replace('_', '-')
return match


def _match_language(lang_code: str, lang_list=[], custom_aliases={}) -> Optional[str]: # pylint: disable=W0102
"""auxiliary function to match lang_code in lang_list"""
# replace language code with a custom alias if necessary
if lang_code in custom_aliases:
lang_code = custom_aliases[lang_code]

if lang_code in lang_list:
return lang_code

# try to get the most likely country for this language
subtags = _get_from_babel(lang_code, 'likely_subtags')
if subtags:
if subtags in lang_list:
return subtags
subtag_parts = subtags.split('-')
new_code = subtag_parts[0] + '-' + subtag_parts[-1]
if new_code in custom_aliases:
new_code = custom_aliases[new_code]
if new_code in lang_list:
return new_code

# try to get the any supported country for this language
return _get_lang_to_lc_dict(lang_list).get(lang_code)


def match_language( # pylint: disable=W0102
locale_code, lang_list=[], custom_aliases={}, fallback: Optional[str] = 'en-US'
) -> Optional[str]:
"""get the language code from lang_list that best matches locale_code"""
# try to get language from given locale_code
language = _match_language(locale_code, lang_list, custom_aliases)
if language:
return language

locale_parts = locale_code.split('-')
lang_code = locale_parts[0]

# if locale_code has script, try matching without it
if len(locale_parts) > 2:
language = _match_language(lang_code + '-' + locale_parts[-1], lang_list, custom_aliases)
if language:
return language

# try to get language using an equivalent country code
if len(locale_parts) > 1:
country_alias = _get_from_babel(locale_parts[-1], 'territory_aliases')
if country_alias:
language = _match_language(lang_code + '-' + country_alias[0], lang_list, custom_aliases)
if language:
return language

# try to get language using an equivalent language code
alias = _get_from_babel(lang_code, 'language_aliases')
if alias:
language = _match_language(alias, lang_list, custom_aliases)
if language:
return language

if lang_code != locale_code:
# try to get language from given language without giving the country
language = _match_language(lang_code, lang_list, custom_aliases)

return language or fallback


def load_module(filename: str, module_dir: str) -> types.ModuleType:
modname = splitext(filename)[0]
modpath = join(module_dir, filename)
Expand Down
30 changes: 18 additions & 12 deletions searx/webapp.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,6 @@
html_to_text,
gen_useragent,
dict_subset,
match_language,
)
from searx.version import VERSION_STRING, GIT_URL, GIT_BRANCH
from searx.query import RawTextQuery
Expand Down Expand Up @@ -117,6 +116,7 @@
RTL_LOCALES,
localeselector,
locales_initialize,
match_locale,
)

# renaming names from searx imports ...
Expand Down Expand Up @@ -227,7 +227,7 @@ def _get_browser_language(req, lang_list):
if '-' in lang:
lang_parts = lang.split('-')
lang = "{}-{}".format(lang_parts[0], lang_parts[-1].upper())
locale = match_language(lang, lang_list, fallback=None)
locale = match_locale(lang, lang_list, fallback=None)
if locale is not None:
return locale
return 'en'
Expand Down Expand Up @@ -407,7 +407,7 @@ def get_client_settings():


def render(template_name: str, **kwargs):

# pylint: disable=too-many-statements
kwargs['client_settings'] = str(
base64.b64encode(
bytes(
Expand Down Expand Up @@ -445,10 +445,13 @@ def render(template_name: str, **kwargs):

if locale in RTL_LOCALES and 'rtl' not in kwargs:
kwargs['rtl'] = True

if 'current_language' not in kwargs:
kwargs['current_language'] = match_language(
request.preferences.get_value('language'), settings['search']['languages']
)
_locale = request.preferences.get_value('language')
if _locale in ('auto', 'all'):
kwargs['current_language'] = _locale
else:
kwargs['current_language'] = match_locale(_locale, settings['search']['languages'])

# values from settings
kwargs['search_formats'] = [x for x in settings['search']['formats'] if x != 'html']
Expand Down Expand Up @@ -810,6 +813,13 @@ def search():
)
)

if search_query.lang in ('auto', 'all'):
current_language = search_query.lang
else:
current_language = match_locale(
search_query.lang, settings['search']['languages'], fallback=request.preferences.get_value("language")
)

# search_query.lang contains the user choice (all, auto, en, ...)
# when the user choice is "auto", search.search_query.lang contains the detected language
# otherwise it is equals to search_query.lang
Expand All @@ -832,12 +842,8 @@ def search():
result_container.unresponsive_engines
),
current_locale = request.preferences.get_value("locale"),
current_language = match_language(
search_query.lang,
settings['search']['languages'],
fallback=request.preferences.get_value("language")
),
search_language = match_language(
current_language = current_language,
search_language = match_locale(
search.search_query.lang,
settings['search']['languages'],
fallback=request.preferences.get_value("language")
Expand Down
8 changes: 4 additions & 4 deletions searxng_extra/update/update_engine_descriptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@
from lxml.html import fromstring

from searx.engines import wikidata, set_loggers
from searx.utils import extract_text, match_language
from searx.locales import LOCALE_NAMES, locales_initialize
from searx.utils import extract_text
from searx.locales import LOCALE_NAMES, locales_initialize, match_locale
from searx import searx_dir
from searx.utils import gen_useragent, detect_language
import searx.search
Expand Down Expand Up @@ -225,9 +225,9 @@ def fetch_website_description(engine_name, website):
fetched_lang, desc = get_website_description(website, lang, WIKIPEDIA_LANGUAGES[lang])
if fetched_lang is None or desc is None:
continue
matched_lang = match_language(fetched_lang, LANGUAGES, fallback=None)
matched_lang = match_locale(fetched_lang, LANGUAGES, fallback=None)
if matched_lang is None:
fetched_wikipedia_lang = match_language(fetched_lang, WIKIPEDIA_LANGUAGES.values(), fallback=None)
fetched_wikipedia_lang = match_locale(fetched_lang, WIKIPEDIA_LANGUAGES.values(), fallback=None)
matched_lang = wikipedia_languages_r.get(fetched_wikipedia_lang)
if matched_lang is not None:
update_description(engine_name, matched_lang, desc, website, replace=False)
Expand Down
Loading

0 comments on commit 16f0db4

Please sign in to comment.