// encodings/lang_enc.h

// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// This file is for i18n. It contains two enums, namely Language and
// Encoding, where Language is the linguistic convention, and Encoding
// contains information on both language encoding and character set.
//
// The language and encoding are both based on Teragram's conventions,
// except for some common ISO-8859 encodings that are not detected by
// Teragram but might be in the future.
//
// This file also declares functions that map among Language/Encoding
// enums, language/encoding string names (typically the output of the
// language/encoding identifier), language codes (ISO 639), and
// two-letter country codes (ISO 3166).
//
// NOTE: Both Language and Encoding enums should always start from
// zero value. This assumption has been made and used.
//

#ifndef ENCODINGS_LANG_ENC_H__
#define ENCODINGS_LANG_ENC_H__

#include "languages/public/languages.h"
#include "encodings/public/encodings.h"


// EncodingsForLanguage
// --------------------
//
// Given a language, returns a pointer to an array of the encodings
// that language supports. Typically, the array has at least one
// element: UNKNOWN_ENCODING, which is always the last element of the
// array. The first element is the default encoding of the language.
// Returns NULL if the input is invalid.
//
// Note: The returned array does not include ASCII_7BIT, UTF8 or
// UNICODE, which are good for all languages. TODO: Find out whether
// it is better to include ASCII_7BIT, UTF8 and UNICODE or leave them
// as special cases.
//
// NOTE(review): ownership of the returned array is not stated here;
// presumably it is static data the caller must not free or modify --
// confirm against the implementation.
//
const Encoding* EncodingsForLanguage(Language lang);


// DefaultEncodingForLanguage
// --------------------------
//
// Given a language, stores the default encoding for that language in
// *encoding.
//
// Returns true if the input lang is valid. Otherwise returns false
// and sets *encoding to UNKNOWN_ENCODING.
//
bool DefaultEncodingForLanguage(Language lang,
                                Encoding *encoding);

// LanguagesForEncoding
// --------------------
//
// Given an encoding, returns a pointer to an array of the languages
// that encoding supports. Typically, the array has at least one
// element: UNKNOWN_LANGUAGE, which is always the last element of the
// array. The first language in the array is the most popular
// language for that encoding. Returns NULL if the input is invalid.
//
// Note: For ASCII_7BIT, UNICODE and UTF8, only ENGLISH and
// UNKNOWN_LANGUAGE are returned. TODO: Find out whether to return all
// the languages or to treat these encodings as special cases.
//
// For other known encodings, ENGLISH is always included. This is
// because English (Latin) characters are included in each encoding.
//
const Language* LanguagesForEncoding(Encoding enc);

// DefaultLanguageForEncoding
// --------------------------
//
// Given an encoding, stores the default language for that encoding in
// *language.
//
// Returns true if the input enc is valid. Otherwise returns false
// and sets *language to UNKNOWN_LANGUAGE.
//
// Note: this function is most useful for encodings that have only
// one corresponding language, e.g. shift_jis => Japanese. When
// multiple languages share the same encoding, the default language
// is an arbitrary choice among them.
//
bool DefaultLanguageForEncoding(Encoding enc, Language* language);

//
// IsLangEncCompatible
// -------------------
//
// Determines whether the input language and encoding are compatible.
// For example, FRENCH and LATIN1 are compatible, but FRENCH and GB
// are not.
//
// The rules are:
//  - If either lang or enc is invalid, return false.
//  - If lang is unknown, return true.
//    (e.g. we can detect a page's encoding as Latin1 from metatag
//     info, but cannot derive its language, since more than one
//     language is encoded in Latin1.)
//  - If lang is known but enc is unknown, return false.
//    (Returning true would do us no good, since we cannot convert to
//     UTF8 anyway.)
//  - If enc is unicode or utf8, return true.
//  - Otherwise, check whether lang is supported by enc and enc is
//    supported by lang.
//
bool IsLangEncCompatible(Language lang, Encoding enc);

//
// DominantLanguageFromEncoding
// ----------------------------
//
// Determines whether there exists a dominant language for the input
// encoding, and returns it. For example, the encoding GB has a
// dominant language (Chinese), but Latin1 does not.
//
// The word "dominant" is used here because English characters are
// included in each encoding.
//
// If there is no dominant language for the encoding, as with Latin1,
// UNKNOWN_LANGUAGE is returned.
//
Language DominantLanguageFromEncoding(Encoding enc);

// LanguageCode
// ------------
//
// Given a Language and an Encoding, returns the language code,
// including dialect (>= 2 letters). The encoding is necessary to
// disambiguate between Simplified and Traditional Chinese.
//
// See the note on Chinese Language Codes in
// i18n/languages/public/languages.h
// for the details.
//
const char* LanguageCode(Language lang, Encoding enc);

//
// IsEncodingWithSupportedLanguage()
// ---------------------------------
//
// Some encodings are listed here just because they are commonly
// used. There is no interface language for them yet. They are not
// detected by Teragram, but can be detected from the meta info of the
// HTML page.
//
// For example, we list ARABIC_ENCODING but there is no Arabic in
// the Language enum. If the user inputs an Arabic query from the
// Google main page, Netscape will just send the raw bytes to GWS, and
// GWS will treat them as Latin1. Therefore, there is no use in
// detecting ARABIC_ENCODING for indexing, since such pages will never
// match the queries, which are treated as Latin1 by GWS. On the
// contrary, if we treat pages with ARABIC_ENCODING as
// UNKNOWN_ENCODING, Google will let them fall through as Latin1 at
// indexing time, and there might be a match for some Arabic queries,
// which are also treated as Latin1 by GWS. In fact, some people are
// relying on this feature to do Arabic searches.
//
// Thus, for these types of encodings, until we have UI support for
// their language and a reasonably comprehensive language/encoding
// identification quality, it is better to treat them as
// UNKNOWN_ENCODING.
//
// This function checks whether the input encoding is one with
// an interface language.
bool IsEncodingWithSupportedLanguage(Encoding enc);


//
// LangsFromCountryCode and EncFromCountryCode
// -------------------------------------------
//
// These two functions return the possible languages and encodings,
// respectively, according to the input country code, which is a
// 2-letter string. The country code is usually specified in the url
// of a document.
//
//

// LangsFromCountryCode
// --------------------
//
// Takes a string of arbitrary length and treats its first 2 bytes as
// the country code, as defined in ISO 3166-1993 (E). It returns, via
// the output arguments, an array of the languages that are popular in
// that country, roughly in order of popularity (*lang_arry), together
// with the size of that array (*num_langs).
//
// Returns true if we have language information for country_code.
// Otherwise, returns false.
//
// NOTE(review): the values left in *lang_arry and *num_langs when
// false is returned are not specified here -- confirm against the
// implementation before relying on them.
//
bool LangsFromCountryCode(const char* country_code,
                          const Language** lang_arry,
                          int* num_langs);


//
// EncFromCountryCode
// ------------------
//
// Takes a string of arbitrary length and treats its first 2 bytes as
// the country code, as defined in ISO 3166-1993 (E). It sets *enc to
// the encoding that is most often used for the languages spoken in
// that country.
//
// Returns true if we have encoding information for country_code.
// Otherwise, returns false and sets *enc to UNKNOWN_ENCODING.
//
bool EncFromCountryCode(const char* country_code, Encoding* enc);


// VisualType
// ----------
//
// Documents in right-to-left scripts may be stored in logical order or
// in visual order. Visual-order documents are converted to logical
// order before processing; this enum names the kinds of visual
// document we can encounter.
//
// Only some documents in Hebrew/Arabic/Persian etc. are visual. The
// remaining documents in those languages, and every document in a
// non-RTL language, are NOT_VISUAL_DOCUMENT.
enum VisualType {
  NOT_VISUAL_DOCUMENT = 0,
  VISUAL_HEBREW_HTML  = 1,  // HTML documents in the legacy visual order.
  CONVERTED_RTL_PDF   = 2   // Converted RTL PDFs, which are always visual.
};

// default_visualtype
// ------------------
//
// Returns the default VisualType.
// NOTE(review): the actual value is defined in the implementation and
// is not visible from this header; presumably NOT_VISUAL_DOCUMENT --
// confirm.
VisualType default_visualtype();

// VisualTypeName
// --------------
//
// Given a visual type, returns a string name for it that is useful
// for debug output.
// NOTE(review): the result for a value outside the VisualType enum,
// and the lifetime of the returned string, are not specified here --
// confirm against the implementation.
const char* VisualTypeName(VisualType visualtype);


// InitLangEnc
// -----------
//
// Ensures the LangEnc module has been initialized. Normally this
// happens during InitGoogle, but this function gives access to
// scripts that don't support InitGoogle. InitLangEnc calls
// InitEncodings (see i18n/encodings/public/encodings.h) and also
// initializes the data structures used in lang_enc.cc.
//
void InitLangEnc();

#endif  // ENCODINGS_LANG_ENC_H__