Skip to content

Commit

Permalink
Intl.Collator for Apple platforms
Browse files Browse the repository at this point in the history
Summary:
Implement Intl.Collator on top of NSLocale. There isn't a perfect 1:1
mapping between Collator's options and what NSLocale supports, so this
uses a few workarounds:
1. ignorePunctuation is implemented by stripping punctuation from the
string before doing the comparison, since there's no way to directly
ignore punctuation.
2. Sensitivity is implemented by setting specific flags on the
comparison to simulate the desired effect.
3. There is no way to directly specify "usage", so we instead supply
"search" as a collation in the event that no other collation is
specified.

It also does not check the validity of flags passed in, since we don't
have direct access to the underlying LocaleData. This is something we
can implement later, by enumerating the allowed flag values in this
file and checking against them.

Reviewed By: dmitryrykun

Differential Revision: D34675880

fbshipit-source-id: 19a8807e89116bd0b8a40a9f9ead3e61492b5448
  • Loading branch information
neildhar authored and facebook-github-bot committed Mar 25, 2022
1 parent f4a9677 commit e533a51
Show file tree
Hide file tree
Showing 6 changed files with 254 additions and 9 deletions.
3 changes: 2 additions & 1 deletion include/hermes/Platform/Intl/BCP47Parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
#ifndef HERMES_BCP47_PARSER_H
#define HERMES_BCP47_PARSER_H

#include "llvh/ADT/ArrayRef.h"
#include "llvh/ADT/Optional.h"

#include <map>
Expand All @@ -19,6 +18,8 @@
namespace hermes {
namespace platform_intl {

/// \return true if the given string has the shape of a Unicode extension
/// type, i.e. alphanum{3,8} (sep alphanum{3,8})*.
bool isUnicodeExtensionType(const std::u16string &);

struct ParsedLocaleIdentifier {
// Parses \p localeId and returns ParsedLocaleIdentifier if it is a
// structurally valid language tag.
Expand Down
7 changes: 7 additions & 0 deletions lib/Platform/Intl/BCP47Parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@

#include "hermes/Platform/Intl/BCP47Parser.h"

#include "llvh/ADT/STLExtras.h"

namespace hermes {
namespace platform_intl {
namespace {
Expand Down Expand Up @@ -91,6 +93,11 @@ std::vector<std::u16string> splitIntoSubtags(const std::u16string &locale) {
}
} // namespace

/// Checks whether \p str has the shape of a Unicode extension type:
///   alphanum{3,8} (sep alphanum{3,8})*
/// \return true only if every subtag of \p str is accepted by
/// isUnicodeExtensionKeyTypeItem.
bool isUnicodeExtensionType(const std::u16string &str) {
  for (const auto &subtag : splitIntoSubtags(str)) {
    if (!isUnicodeExtensionKeyTypeItem(subtag))
      return false;
  }
  return true;
}

class LanguageTagParser {
public:
LanguageTagParser(const std::u16string &localeId);
Expand Down
195 changes: 192 additions & 3 deletions lib/Platform/Intl/PlatformIntlApple.mm
Original file line number Diff line number Diff line change
Expand Up @@ -525,7 +525,15 @@ ResolvedLocale resolveLocale(
}

// State of a Collator. The fields are filled in by Collator::initialize() and
// read by Collator::resolvedOptions() and Collator::compare().
struct Collator::Impl {
// Locale handed to NSString's comparison; built from the resolved data locale
// plus any "-u-co-"/"-u-kf-" extensions.
NSLocale *nsLocale;
// Comparison flags derived from the "numeric" and "sensitivity" settings.
NSStringCompareOptions nsCompareOptions;
// Resolved locale ([[Locale]]).
std::u16string locale;
// "sort" or "search" ([[Usage]]).
std::u16string usage;
// Resolved collation, or "default" when none was resolved ([[Collation]]).
std::u16string collation;
// Resolved "kf" value, or "false" when none was resolved ([[CaseFirst]]).
std::u16string caseFirst;
// "base", "accent", "case" or "variant" ([[Sensitivity]]).
std::u16string sensitivity;
// True when the resolved "kn" value is "true" ([[Numeric]]).
bool numeric;
// Value of the "ignorePunctuation" option ([[IgnorePunctuation]]).
bool ignorePunctuation;
};

/// Creates a Collator that owns a newly allocated Impl; the Impl is populated
/// later by initialize().
Collator::Collator() : impl_{std::make_unique<Impl>()} {}
Expand All @@ -546,25 +554,206 @@ ResolvedLocale resolveLocale(
return supportedLocales(availableLocales, *requestedLocalesRes);
}

/// Implements InitializeCollator:
/// https://402.ecma-international.org/8.0/#sec-initializecollator
///
/// Reads the collator options in spec order, resolves the locale, records the
/// resolved settings on impl_, and finally derives the NSStringCompareOptions
/// and NSLocale that are stored for use with the Obj-C comparison APIs.
///
/// \param runtime passed to the option readers and used to raise a RangeError
///   for a malformed "collation" option.
/// \param locales the requested locales; canonicalized in step 1.
/// \param options the options bag the collator is being created with.
/// \return ExecutionStatus::EXCEPTION if canonicalization or any string option
///   lookup reports an exception, the result of raiseRangeError() for a
///   malformed collation, and ExecutionStatus::RETURNED otherwise.
vm::ExecutionStatus Collator::initialize(
    vm::Runtime &runtime,
    const std::vector<std::u16string> &locales,
    const Options &options) noexcept {
  // 1. Let requestedLocales be ? CanonicalizeLocaleList(locales).
  auto requestedLocalesRes = canonicalizeLocaleList(runtime, locales);
  if (LLVM_UNLIKELY(requestedLocalesRes == vm::ExecutionStatus::EXCEPTION))
    return vm::ExecutionStatus::EXCEPTION;
  // 2. Set options to ? CoerceOptionsToObject(options).
  // 3. Let usage be ? GetOption(options, "usage", "string", « "sort", "search"
  // », "sort").
  auto usageRes = getOptionString(
      runtime, options, u"usage", {u"sort", u"search"}, {u"sort"});
  if (LLVM_UNLIKELY(usageRes == vm::ExecutionStatus::EXCEPTION))
    return vm::ExecutionStatus::EXCEPTION;
  // 4. Set collator.[[Usage]] to usage.
  impl_->usage = std::move(**usageRes);
  // 5. If usage is "sort", then
  // a. Let localeData be %Collator%.[[SortLocaleData]].
  // 6. Else,
  // a. Let localeData be %Collator%.[[SearchLocaleData]].
  // 7. Let opt be a new Record.
  std::unordered_map<std::u16string, std::u16string> opt;
  // 8. Let matcher be ? GetOption(options, "localeMatcher", "string", «
  // "lookup", "best fit" », "best fit").
  // 9. Set opt.[[localeMatcher]] to matcher.
  // We only implement lookup matcher, so we don't need to record this. The
  // option is still read so that an invalid value is reported.
  auto localeMatcherRes = getOptionString(
      runtime,
      options,
      u"localeMatcher",
      {u"lookup", u"best fit"},
      u"best fit");
  if (LLVM_UNLIKELY(localeMatcherRes == vm::ExecutionStatus::EXCEPTION))
    return vm::ExecutionStatus::EXCEPTION;
  // 10. Let collation be ? GetOption(options, "collation", "string", undefined,
  // undefined).
  auto collationRes = getOptionString(runtime, options, u"collation", {}, {});
  if (LLVM_UNLIKELY(collationRes == vm::ExecutionStatus::EXCEPTION))
    return vm::ExecutionStatus::EXCEPTION;
  // 11. If collation is not undefined, then
  // a. If collation does not match the Unicode Locale Identifier type
  // nonterminal, throw a RangeError exception.
  // 12. Set opt.[[co]] to collation.
  if (auto &collationOpt = *collationRes) {
    if (!isUnicodeExtensionType(*collationOpt)) {
      return runtime.raiseRangeError(
          vm::TwineChar16("Invalid collation: ") +
          vm::TwineChar16(collationOpt->c_str()));
    }
    opt.emplace(u"co", std::move(*collationOpt));
  }
  // 13. Let numeric be ? GetOption(options, "numeric", "boolean", undefined,
  // undefined).
  auto numericOpt = getOptionBool(runtime, options, u"numeric", {});
  // 14. If numeric is not undefined, then
  // a. Let numeric be ! ToString(numeric).
  // 15. Set opt.[[kn]] to numeric.
  if (numericOpt)
    opt.emplace(u"kn", *numericOpt ? u"true" : u"false");
  // 16. Let caseFirst be ? GetOption(options, "caseFirst", "string", « "upper",
  // "lower", "false" », undefined).
  auto caseFirstRes = getOptionString(
      runtime, options, u"caseFirst", {u"upper", u"lower", u"false"}, {});
  if (LLVM_UNLIKELY(caseFirstRes == vm::ExecutionStatus::EXCEPTION))
    return vm::ExecutionStatus::EXCEPTION;
  // 17. Set opt.[[kf]] to caseFirst.
  // The option value is not needed again, so hand it over to opt.
  if (auto &caseFirstOpt = *caseFirstRes)
    opt.emplace(u"kf", std::move(*caseFirstOpt));
  // 18. Let relevantExtensionKeys be %Collator%.[[RelevantExtensionKeys]].
  std::vector<std::u16string> relevantExtensionKeys = {u"co", u"kn", u"kf"};
  // 19. Let r be ResolveLocale(%Collator%.[[AvailableLocales]],
  // requestedLocales, opt,relevantExtensionKeys, localeData).
  auto r = resolveLocale(
      getAvailableLocales(), *requestedLocalesRes, opt, relevantExtensionKeys);
  // 20. Set collator.[[Locale]] to r.[[locale]].
  impl_->locale = std::move(r.locale);
  // 21. Let collation be r.[[co]].
  auto coIt = r.extensions.find(u"co");
  // 22. If collation is null, let collation be "default".
  // 23. Set collator.[[Collation]] to collation.
  if (coIt == r.extensions.end())
    impl_->collation = u"default";
  else
    impl_->collation = std::move(coIt->second);
  // 24. If relevantExtensionKeys contains "kn", then
  // a. Set collator.[[Numeric]] to ! SameValue(r.[[kn]], "true").
  auto knIt = r.extensions.find(u"kn");
  if (knIt == r.extensions.end())
    impl_->numeric = false;
  else
    impl_->numeric = (knIt->second == u"true");

  // 25. If relevantExtensionKeys contains "kf", then
  // a. Set collator.[[CaseFirst]] to r.[[kf]].
  auto kfIt = r.extensions.find(u"kf");
  if (kfIt == r.extensions.end())
    impl_->caseFirst = u"false";
  else
    impl_->caseFirst = std::move(kfIt->second);

  // 26. Let sensitivity be ? GetOption(options, "sensitivity", "string", «
  // "base", "accent", "case", "variant" », undefined).
  auto sensitivityRes = getOptionString(
      runtime,
      options,
      u"sensitivity",
      {u"base", u"accent", u"case", u"variant"},
      {});
  if (LLVM_UNLIKELY(sensitivityRes == vm::ExecutionStatus::EXCEPTION))
    return vm::ExecutionStatus::EXCEPTION;
  // 27. If sensitivity is undefined, then
  // a. If usage is "sort", then
  // i. Let sensitivity be "variant".
  // b. Else,
  // i. Let dataLocale be r.[[dataLocale]].
  // ii. Let dataLocaleData be localeData.[[<dataLocale>]].
  // iii. Let sensitivity be dataLocaleData.[[sensitivity]].
  // 28. Set collator.[[Sensitivity]] to sensitivity.
  // No locale data is consulted here: "variant" is the fallback for both
  // usages.
  if (auto &sensitivityOpt = *sensitivityRes)
    impl_->sensitivity = std::move(*sensitivityOpt);
  else
    impl_->sensitivity = u"variant";

  // 29. Let ignorePunctuation be ? GetOption(options, "ignorePunctuation",
  // "boolean", undefined,false).
  auto ignorePunctuationOpt =
      getOptionBool(runtime, options, u"ignorePunctuation", false);
  // 30. Set collator.[[IgnorePunctuation]] to ignorePunctuation.
  impl_->ignorePunctuation = *ignorePunctuationOpt;

  // Set up the state for calling into Obj-C APIs.
  // Sensitivity is simulated with comparison flags:
  //   base    -> ignore both diacritics and case
  //   accent  -> ignore case only
  //   case    -> ignore diacritics only
  //   variant -> no extra flags
  NSStringCompareOptions cmpOpts = 0;
  if (impl_->numeric)
    cmpOpts |= NSNumericSearch;
  if (impl_->sensitivity == u"base")
    cmpOpts |= (NSDiacriticInsensitiveSearch | NSCaseInsensitiveSearch);
  else if (impl_->sensitivity == u"accent")
    cmpOpts |= NSCaseInsensitiveSearch;
  else if (impl_->sensitivity == u"case")
    cmpOpts |= NSDiacriticInsensitiveSearch;
  impl_->nsCompareOptions = cmpOpts;

  // Usage cannot be specified directly, so "search" is supplied as the
  // collation when no other collation was resolved.
  std::u16string nsLocaleExtensions;
  if (impl_->collation != u"default")
    nsLocaleExtensions.append(u"-co-").append(impl_->collation);
  else if (impl_->usage == u"search")
    nsLocaleExtensions.append(u"-co-search");
  if (impl_->caseFirst != u"false")
    nsLocaleExtensions.append(u"-kf-").append(impl_->caseFirst);
  auto nsLocaleIdentifier = r.dataLocale;
  if (!nsLocaleExtensions.empty())
    nsLocaleIdentifier.append(u"-u").append(nsLocaleExtensions);
  impl_->nsLocale = [NSLocale
      localeWithLocaleIdentifier:u16StringToNSString(nsLocaleIdentifier)];
  // 31. Return collator.
  return vm::ExecutionStatus::RETURNED;
}

/// https://402.ecma-international.org/8.0/#sec-intl.collator.prototype.resolvedoptions
///
/// \return an Options map holding the settings recorded on impl_ under the
/// keys "locale", "usage", "sensitivity", "ignorePunctuation", "collation",
/// "numeric" and "caseFirst".
Options Collator::resolvedOptions() noexcept {
  Options options;
  options.emplace(u"locale", Option(impl_->locale));
  options.emplace(u"usage", Option(impl_->usage));
  options.emplace(u"sensitivity", Option(impl_->sensitivity));
  options.emplace(u"ignorePunctuation", Option(impl_->ignorePunctuation));
  options.emplace(u"collation", Option(impl_->collation));
  // Each key is emplaced exactly once: emplace() does not replace an existing
  // entry, so an earlier placeholder for "numeric" would hide the real value.
  options.emplace(u"numeric", Option(impl_->numeric));
  options.emplace(u"caseFirst", Option(impl_->caseFirst));
  return options;
}

/// https://402.ecma-international.org/8.0/#sec-intl.collator.prototype.compare
///
/// Compares \p x and \p y with the NSLocale and NSStringCompareOptions stored
/// on impl_. When ignorePunctuation is set, punctuation and whitespace are
/// removed from both strings before comparing.
/// \return the result of NSString's compare:options:range:locale: converted
/// to double.
double Collator::compare(
    const std::u16string &x,
    const std::u16string &y) noexcept {
  NSString *nsX = u16StringToNSString(x);
  NSString *nsY = u16StringToNSString(y);
  if (impl_->ignorePunctuation) {
    // Unfortunately, NSLocale does not provide a way to specify alternate
    // handling, so we simulate it by manually stripping punctuation and
    // whitespace.
    auto *ignored = [NSMutableCharacterSet punctuationCharacterSet];
    [ignored formUnionWithCharacterSet:[NSCharacterSet
                                           whitespaceAndNewlineCharacterSet]];
    // Split on the ignored characters and glue the pieces back together.
    // This keeps every remaining UTF-16 code unit intact, unlike formatting
    // each unichar with "%c", which only handles 8-bit characters.
    auto removePunctuation = [&](NSString *nsStr) -> NSString * {
      return [[nsStr componentsSeparatedByCharactersInSet:ignored]
          componentsJoinedByString:@""];
    };
    nsX = removePunctuation(nsX);
    nsY = removePunctuation(nsY);
  }
  return [nsX compare:nsY
              options:impl_->nsCompareOptions
                range:NSMakeRange(0, nsX.length)
               locale:impl_->nsLocale];
}

struct DateTimeFormat::Impl {
Expand Down
1 change: 1 addition & 0 deletions lib/VM/JSLib/Intl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,7 @@ const OptionData kCollatorOptions[] = {
{u"caseFirst", platform_intl::Option::Kind::String, 0},
{u"sensitivity", platform_intl::Option::Kind::String, 0},
{u"ignorePunctuation", platform_intl::Option::Kind::Bool, 0},
{u"collation", platform_intl::Option::Kind::String, 0},
{nullptr, platform_intl::Option::Kind::Bool, 0},
};

Expand Down
42 changes: 42 additions & 0 deletions test/hermes/intl/collator.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
/**
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/

// RUN: %hermes %s | %FileCheck --match-full-lines %s
// REQUIRES: intl

print("Intl.Collator Test")
// CHECK-LABEL: Intl.Collator Test

// Locale-specific ordering: German places 'ä' right after 'a', Swedish places
// it after 'z'. The caseFirst option controls whether 'Z' precedes 'z'.
print(['Z', 'a', 'z', 'ä'].sort(new Intl.Collator('de').compare));
// CHECK-NEXT: a,ä,z,Z
print(['Z', 'a', 'z', 'ä'].sort(new Intl.Collator('sv').compare));
// CHECK-NEXT: a,z,Z,ä
print(['Z', 'a', 'z', 'ä'].sort(new Intl.Collator('sv', {caseFirst: 'upper'}).compare));
// CHECK-NEXT: a,Z,z,ä
print(['Z', 'a', 'z', 'ä'].sort(new Intl.Collator('de', { caseFirst: 'upper' } ).compare));
// CHECK-NEXT: a,ä,Z,z

// Collation can be requested through the "-u-co-" locale extension or through
// the "collation" option; both select the German phonebook ordering here.
print(["of", "öf"].sort(new Intl.Collator('de-DE', { } ).compare));
// CHECK-NEXT: of,öf
print(["of", "öf"].sort(new Intl.Collator('de-DE-u-co-phonebk', { } ).compare));
// CHECK-NEXT: öf,of
print(["of", "öf"].sort(new Intl.Collator('de-DE', { collation: "phonebk" } ).compare));
// CHECK-NEXT: öf,of

// With numeric: true, digit runs are ordered by value, so test2 sorts before
// test10.
print(['test1', 'test2', 'test10'].sort(new Intl.Collator('en', { } ).compare));
// CHECK-NEXT: test1,test10,test2
print(['test1', 'test2', 'test10'].sort(new Intl.Collator('en', { numeric: true } ).compare));
// CHECK-NEXT: test1,test2,test10

// A collation value that is not a well-formed Unicode extension type is
// rejected with a RangeError.
try{ new Intl.Collator('en', { collation: "banananana" } ) } catch (e) { print(e) }
// CHECK-NEXT: RangeError: Invalid collation: banananana

// ignorePunctuation: punctuation and whitespace do not take part in the
// comparison, so e.g. "a,b", "a\nb" and "a b" all sort alongside "ab".
var puncList = ["aa", "ab", "a,b", "a\nb", "a\na", "a a", "a b", "...", "", ".a.a"];
print(JSON.stringify([...puncList].sort(new Intl.Collator("en", ).compare)));
// CHECK-NEXT: ["","...",".a.a","a\na","a\nb","a a","a b","a,b","aa","ab"]
print(JSON.stringify([...puncList].sort(new Intl.Collator("en", { ignorePunctuation: true }).compare)));
// CHECK-NEXT: ["...","","aa","a\na","a a",".a.a","ab","a,b","a\nb","a b"]
15 changes: 10 additions & 5 deletions utils/testsuite/testsuite_skiplist.py
Original file line number Diff line number Diff line change
Expand Up @@ -1028,6 +1028,16 @@
# CoreFoundation. See T24545708 for a discussion.
"test262/test/built-ins/String/prototype/toLocaleLowerCase/special_casing_conditional.js",
"test262/test/built-ins/String/prototype/toLowerCase/special_casing_conditional.js",
# Intl
"test262/test/intl402/Collator/subclassing.js",
"test262/test/intl402/Collator/unicode-ext-value-collation.js",
"test262/test/intl402/Collator/ignore-invalid-unicode-ext-values.js",
"test262/test/intl402/Collator/proto-from-ctor-realm.js",
"test262/test/intl402/Collator/prototype/resolvedOptions/order.js",
"test262/test/intl402/String/prototype/toLocaleLowerCase/special_casing_Lithuanian.js",
"test262/test/intl402/String/prototype/toLocaleLowerCase/special_casing_Azeri.js",
"test262/test/intl402/String/prototype/toLocaleLowerCase/special_casing_Turkish.js",
"test262/test/intl402/String/prototype/toLocaleUpperCase/special_casing_Lithuanian.js",
# Unicode 13.0
"test262/test/language/identifiers/part-unicode-13.0.0-escaped.js",
"test262/test/language/identifiers/part-unicode-13.0.0.js",
Expand Down Expand Up @@ -1800,7 +1810,6 @@
"test262/test/built-ins/Atomics/",
"test262/test/built-ins/AsyncGeneratorFunction/",
"test262/test/built-ins/SharedArrayBuffer/",
"test262/test/intl402/Collator/",
"test262/test/intl402/Date/",
"test262/test/intl402/DateTimeFormat/",
"test262/test/intl402/DisplayNames/",
Expand All @@ -1814,10 +1823,6 @@
"test262/test/intl402/NumberFormat/",
"test262/test/intl402/PluralRules/",
"test262/test/intl402/Segmenter/",
"test262/test/intl402/String/prototype/toLocaleLowerCase/special_casing_Lithuanian.js",
"test262/test/intl402/String/prototype/toLocaleLowerCase/special_casing_Azeri.js",
"test262/test/intl402/String/prototype/toLocaleLowerCase/special_casing_Turkish.js",
"test262/test/intl402/String/prototype/toLocaleUpperCase/special_casing_Lithuanian.js",
"test262/test/intl402/RelativeTimeFormat/",
"mjsunit/asm/",
"mjsunit/regress/wasm/",
Expand Down

0 comments on commit e533a51

Please sign in to comment.