Suggest name, when available, for unknown codes

When parsing the CLDR data, we only handle language, script and territory (which we call country) codes if they are known to our enumdata.py tables. When reporting the rest as unknown, in the context of an actual locale definition (not the likely subtag data), check whether en.xml can resolve the code for us; if it can, report the full name it provides, as a hint to whoever's running the script that an update to enumdata.py may be in order. Change-Id: I9ca1d6922a91d45bc436f4b622e5557261897d7f Reviewed-by: Thiago Macieira <[email protected]> Reviewed-by: Lars Knoll <[email protected]> Reviewed-by: Konstantin Ritt <[email protected]>
dougcooper · May 20, 2019 · b7d8169 · b7d8169
1 parent 248b675
commit b7d8169
Show file tree

Hide file tree

Showing 2 changed files with 57 additions and 5 deletions.
diff --git a/util/locale_database/cldr2qlocalexml.py b/util/locale_database/cldr2qlocalexml.py
@@ -95,6 +95,34 @@ def skip_repeating_pattern(x):
         result.append(pattern)
     return result
 
+def raiseUnknownCode(code, form, cache={}):
+    """Check whether an unknown code could be supported.
+
+    We declare a language, script or country code unknown if it's not
+    known to enumdata.py; however, if it's present in main/en.xml's
+    mapping of codes to names, we have the option of adding support.
+    This caches the necessary look-up (so we only read main/en.xml
+    once) and reports the name we could use if we do add support.
+
+    First parameter, code, is the unknown code.  Second parameter,
+    form, is one of 'language', 'script' or 'country' to select the
+    type of code to look up.  Do not pass further parameters (the next
+    will deprive you of the cache).
+
+    Always raises xpathlite.Error with a suitable message, that
+    includes the unknown code's full name if found.
+
+    Relies on global cldr_dir being set before it's called; see tail
+    of this file.
+    """
+    if not cache:  # first call: fill the default-argument dict shared by all calls
+        cache.update(xpathlite.codeMapsFromFile(os.path.join(cldr_dir, 'en.xml')))
+    name = cache[form].get(code)
+    msg = 'unknown %s code "%s"' % (form, code)
+    if name:
+        msg += ' - could use "%s"' % name
+    raise xpathlite.Error(msg)
+
 def parse_list_pattern_part_format(pattern):
     # This is a very limited parsing of the format for list pattern part only.
     return pattern.replace("{0}", "%1").replace("{1}", "%2").replace("{2}", "%3")
@@ -193,18 +221,18 @@ def _generateLocaleInfo(path, language_code, script_code, country_code, variant_
 
     language_id = enumdata.languageCodeToId(language_code)
     if language_id <= 0:
-        raise xpathlite.Error('unknown language code "%s"' % language_code)
+        raiseUnknownCode(language_code, 'language')
 
     script_id = enumdata.scriptCodeToId(script_code)
     if script_id == -1:
-        raise xpathlite.Error('unknown script code "%s"' % script_code)
+        raiseUnknownCode(script_code, 'script')
 
     # we should handle fully qualified names with the territory
     if not country_code:
         return {}
     country_id = enumdata.countryCodeToId(country_code)
     if country_id <= 0:
-        raise xpathlite.Error('unknown country code "%s"' % country_code)
+        raiseUnknownCode(country_code, 'country')
 
     # So we say we accept only those values that have "contributed" or
     # "approved" resolution. see http://www.unicode.org/cldr/process.html

diff --git a/util/locale_database/xpathlite.py b/util/locale_database/xpathlite.py
@@ -78,14 +78,38 @@ def findChild(parent, tag_name, arg_name=None, arg_value=None, draft=None):
         return node
     return False
 
+def codeMapsFromFile(file):
+    """Extract mappings of language, script and country codes to names.
+
+    The file shall typically be common/main/en.xml, which contains a
+    localeDisplayNames element with children languages, scripts and
+    territories; each element in each of these has a code as its type
+    attribute and its name as element content.  This returns a mapping
+    with the keys 'language', 'script' and 'country', each of which
+    has, as value, a mapping of the relevant codes to names.
+    """
+    parent = findChild(findChild(parseDoc(file), 'ldml'), 'localeDisplayNames')
+    keys, result = {'languages': 'language', 'scripts': 'script', 'territories': 'country'}, {}
+    for src, dst in keys.items():
+        child = findChild(parent, src)
+        data = result[dst] = {}
+        for elt in child.childNodes:
+            if elt.attributes and elt.attributes.has_key('type'):  # skip nodes lacking a type
+                key, value = elt.attributes['type'].value, elt.childNodes[0].wholeText
+                # Don't over-write previously-read data for an alt form:
+                if elt.attributes.has_key('alt') and data.has_key(key):
+                    continue
+                data[key] = value
+
+    return result
+
 def findTagsInFile(file, path):
     doc = parseDoc(file)
 
     elt = doc.documentElement
     tag_spec_list = path.split("/")
     last_entry = None
-    for i in range(len(tag_spec_list)):
-        tag_spec = tag_spec_list[i]
+    for tag_spec in tag_spec_list:
         tag_name = tag_spec
         arg_name = 'type'
         arg_value = ''