Skip to content

Commit 5a35a80

Browse files
author
Robert Fancsik
committed
Implement String.prototype.normalize
JerryScript-DCO-1.0-Signed-off-by: Robert Fancsik [email protected]
1 parent 42523bd commit 5a35a80

14 files changed

+277
-40
lines changed

.github/workflows/gh-actions.yml

+8
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,8 @@ jobs:
8585
Conformance_Tests_ES2015:
8686
runs-on: ubuntu-latest
8787
steps:
88+
- run: sudo apt update
89+
- run: sudo apt install libicu-dev
8890
- uses: actions/checkout@v2
8991
- run: $RUNNER --test262-es2015=update
9092
- run: $RUNNER --test262-es2015=update --build-debug
@@ -99,6 +101,8 @@ jobs:
99101
Conformance_Tests_ESNext:
100102
runs-on: ubuntu-latest
101103
steps:
104+
- run: sudo apt update
105+
- run: sudo apt install libicu-dev
102106
- uses: actions/checkout@v2
103107
- run: $RUNNER --test262-esnext=update
104108
- uses: actions/upload-artifact@v2
@@ -111,6 +115,8 @@ jobs:
111115
Conformance_Tests_ESNext_Debug_A:
112116
runs-on: ubuntu-latest
113117
steps:
118+
- run: sudo apt update
119+
- run: sudo apt install libicu-dev
114120
- uses: actions/checkout@v2
115121
- run: $RUNNER --test262-esnext=update --build-debug --test262-test-list=built-ins,annexB,harness,intl402
116122
- uses: actions/upload-artifact@v2
@@ -123,6 +129,8 @@ jobs:
123129
Conformance_Tests_ESNext_Debug_B:
124130
runs-on: ubuntu-latest
125131
steps:
132+
- run: sudo apt update
133+
- run: sudo apt install libicu-dev
126134
- uses: actions/checkout@v2
127135
- run: $RUNNER --test262-esnext=update --build-debug --test262-test-list=language
128136
- uses: actions/upload-artifact@v2

jerry-core/CMakeLists.txt

+22
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ set(JERRY_ERROR_MESSAGES OFF CACHE BOOL "Enable error mess
2525
set(JERRY_EXTERNAL_CONTEXT OFF CACHE BOOL "Enable external context?")
2626
set(JERRY_PARSER ON CACHE BOOL "Enable javascript-parser?")
2727
set(JERRY_FUNCTION_TO_STRING OFF CACHE BOOL "Enable function toString operation?")
28+
set(JERRY_ICU OFF CACHE BOOL "Enable ICU support?")
2829
set(JERRY_LINE_INFO OFF CACHE BOOL "Enable line info?")
2930
set(JERRY_LOGGING OFF CACHE BOOL "Enable logging?")
3031
set(JERRY_MEM_STATS OFF CACHE BOOL "Enable memory statistics?")
@@ -78,13 +79,24 @@ if(JERRY_MEM_STATS OR JERRY_PARSER_DUMP_BYTE_CODE OR JERRY_REGEXP_DUMP_BYTE_CODE
7879
set(JERRYRE_LOGGING_MESSAGE " (FORCED BY STATS OR DUMP)")
7980
endif()
8081

82+
# ICU
# ICU support is optional: if the library cannot be located the feature is
# switched off and JERRY_ICU_MESSAGE records why.
if(JERRY_ICU)
  # Deliberately not REQUIRED: with REQUIRED a missing package stops the
  # configure step inside find_package(), so the fallback below would never run.
  find_package(ICU COMPONENTS uc)

  if(NOT ICU_FOUND)
    set(JERRY_ICU OFF)
    set(JERRY_ICU_MESSAGE " (FORCED BY MISSING LIBRARY)")
  endif()
endif()
91+
8192
# Status messages
8293
message(STATUS "JERRY_CPOINTER_32_BIT " ${JERRY_CPOINTER_32_BIT} ${JERRY_CPOINTER_32_BIT_MESSAGE})
8394
message(STATUS "JERRY_DEBUGGER " ${JERRY_DEBUGGER})
8495
message(STATUS "JERRY_ERROR_MESSAGES " ${JERRY_ERROR_MESSAGES})
8596
message(STATUS "JERRY_EXTERNAL_CONTEXT " ${JERRY_EXTERNAL_CONTEXT})
8697
message(STATUS "JERRY_PARSER " ${JERRY_PARSER})
8798
message(STATUS "JERRY_FUNCTION_TO_STRING " ${JERRY_FUNCTION_TO_STRING})
99+
message(STATUS "JERRY_ICU " ${JERRY_ICU} ${JERRY_ICU_MESSAGE})
88100
message(STATUS "JERRY_LINE_INFO " ${JERRY_LINE_INFO})
89101
message(STATUS "JERRY_LOGGING " ${JERRY_LOGGING} ${JERRY_LOGGING_MESSAGE})
90102
message(STATUS "JERRY_MEM_STATS " ${JERRY_MEM_STATS})
@@ -641,6 +653,12 @@ if(JERRY_VALGRIND)
641653
set(INCLUDE_CORE_PRIVATE ${INCLUDE_CORE_PRIVATE} ${INCLUDE_THIRD_PARTY_VALGRIND})
642654
endif()
643655

656+
# ICU
657+
jerry_add_define01(JERRY_ICU)
658+
if(JERRY_ICU)
659+
set(INCLUDE_CORE_PRIVATE ${INCLUDE_CORE_PRIVATE} ${ICU_INCLUDE_DIRS})
660+
endif()
661+
644662
# Enable VM execution stop callback
645663
jerry_add_define01(JERRY_VM_HALT)
646664

@@ -766,6 +784,10 @@ else()
766784
endif()
767785
endif()
768786

787+
if(JERRY_ICU)
788+
target_link_libraries (${JERRY_CORE_NAME} ${ICU_LIBRARIES})
789+
endif()
790+
769791
separate_arguments(EXTERNAL_LINK_LIBS)
770792
foreach(EXT_LIB ${EXTERNAL_LINK_LIBS})
771793
target_link_libraries(${JERRY_CORE_NAME} ${EXT_LIB})

jerry-core/ecma/base/ecma-error-messages.inc.h

+6-5
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,9 @@ ECMA_ERROR_DEF (ECMA_ERR_INVALID_REGEXP_FLAGS, "Invalid RegExp flags")
7070
#if JERRY_BUILTIN_JSON
7171
ECMA_ERROR_DEF (ECMA_ERR_JSON_STRINGIFY_ERROR, "JSON stringify error")
7272
#endif /* JERRY_BUILTIN_JSON */
73+
#if JERRY_BUILTIN_STRING && JERRY_ESNEXT
74+
ECMA_ERROR_DEF (ECMA_ERR_NORMALIZATION_FAILED, "Normalization failed")
75+
#endif /* JERRY_BUILTIN_STRING && JERRY_ESNEXT */
7376
#if JERRY_BUILTIN_REGEXP
7477
ECMA_ERROR_DEF (ECMA_ERR_STACK_LIMIT_EXCEEDED, "Stack limit exceeded")
7578
#endif /* JERRY_BUILTIN_REGEXP */
@@ -203,6 +206,9 @@ ECMA_ERROR_DEF (ECMA_ERR_EXPECTED_A_FUNCTION_OBJECT, "Expected a function object
203206
#if JERRY_BUILTIN_TYPEDARRAY
204207
ECMA_ERROR_DEF (ECMA_ERR_INVALID_ARRAYBUFFER_LENGTH, "Invalid ArrayBuffer length")
205208
#endif /* JERRY_BUILTIN_TYPEDARRAY */
209+
#if JERRY_BUILTIN_STRING && JERRY_ESNEXT
210+
ECMA_ERROR_DEF (ECMA_ERR_INVALID_NORMALIZATION_FORM, "Invalid normalization form")
211+
#endif /* JERRY_BUILTIN_STRING && JERRY_ESNEXT */
206212
#if !(JERRY_MODULE_SYSTEM)
207213
ECMA_ERROR_DEF (ECMA_ERR_MODULE_NOT_SUPPORTED, "Module support is disabled")
208214
#endif /* !(JERRY_MODULE_SYSTEM) */
@@ -547,11 +553,6 @@ ECMA_ERROR_DEF (ECMA_ERR_CONSTRUCTOR_UINT32_ARRAY_REQUIRES_NEW, "Constructor Uin
547553
#if JERRY_ESNEXT
548554
ECMA_ERROR_DEF (ECMA_ERR_GENERATOR_IS_CURRENTLY_UNDER_EXECUTION, "Generator is currently under execution")
549555
ECMA_ERROR_DEF (ECMA_ERR_ITERATOR_RETURN_RESULT_IS_NOT_OBJECT, "Iterator 'return' result is not object")
550-
#endif /* JERRY_ESNEXT */
551-
#if JERRY_BUILTIN_TYPEDARRAY
552-
ECMA_ERROR_DEF (ECMA_ERR_RETURNED_ARRAYBUFFER_HAS_BEEN_DETACHED, "Returned ArrayBuffer has been detached")
553-
#endif /* JERRY_BUILTIN_TYPEDARRAY */
554-
#if JERRY_ESNEXT
555556
ECMA_ERROR_DEF (ECMA_ERR_SEARCH_STRING_CANNOT_BE_OF_TYPE_REGEXP, "Search string can't be of type: RegExp")
556557
ECMA_ERROR_DEF (ECMA_ERR_VALUE_RECEIVED_BY_YIELD_IS_NOT_OBJECT, "Value received by yield* is not object")
557558
#endif /* JERRY_ESNEXT */

jerry-core/ecma/base/ecma-error-messages.ini

+2-1
Original file line numberDiff line numberDiff line change
@@ -213,7 +213,6 @@ ECMA_ERR_RESOLVE_MUST_BE_UNDEFINED = "Resolve must be undefined"
213213
ECMA_ERR_RESULT_OF_DEFAULTVALUE_IS_INVALID = "Result of [[DefaultValue]] is invalid"
214214
ECMA_ERR_RETURN_VALUE_IS_NOT_AN_ARRAYBUFFER_OBJECT = "Return value is not an ArrayBuffer object"
215215
ECMA_ERR_RETURN_VALUE_OF_EXEC_MUST_BE_AN_OBJECT_OR_NULL = "Return value of 'exec' must be an object or null"
216-
ECMA_ERR_RETURNED_ARRAYBUFFER_HAS_BEEN_DETACHED = "Returned ArrayBuffer has been detached"
217216
ECMA_ERR_RIGHT_VALUE_OF_IN_MUST_BE_AN_OBJECT = "Right value of 'in' must be an object"
218217
ECMA_ERR_RIGHT_VALUE_OF_INSTANCEOF_MUST_BE_AN_OBJECT = "Right value of 'instanceof' must be an object"
219218
ECMA_ERR_SEARCH_STRING_CANNOT_BE_OF_TYPE_REGEXP = "Search string can't be of type: RegExp"
@@ -333,3 +332,5 @@ ECMA_ERR_PRIVATE_METHOD_IS_NOT_WRITABLE = "Private method is not writable"
333332
ECMA_ERR_PRIVATE_FIELD_WAS_DEFINED_WITHOUT_A_SETTER = "Private field was defined without a setter"
334333
ECMA_ERR_CANNOT_READ_PRIVATE_MEMBER_TO_AN_OBJECT_WHOSE_CLASS_DID_NOT_DECLARE_IT = "Cannot read private member to an object whose class did not declare it"
335334
ECMA_ERR_PRIVATE_FIELD_WAS_DEFINED_WITHOUT_A_GETTER = "Private field was defined without a getter"
335+
ECMA_ERR_INVALID_NORMALIZATION_FORM = "Invalid normalization form"
336+
ECMA_ERR_NORMALIZATION_FAILED = "Normalization failed"

jerry-core/ecma/base/ecma-helpers-string.c

+52
Original file line numberDiff line numberDiff line change
@@ -2805,6 +2805,58 @@ ecma_op_advance_string_index (ecma_string_t *str_p, /**< input string */
28052805
} /* ecma_op_advance_string_index */
28062806
#endif /* JERRY_ESNEXT */
28072807

2808+
#if JERRY_ICU
2809+
/**
2810+
* Copy the string's data into a newly allocated UTF16 encoded buffer
2811+
*
2812+
* @return pointer to the allocated buffer
2813+
*/
2814+
uint16_t *
2815+
ecma_string_cesu8_to_utf16 (ecma_string_t *str_p, /**< input string */
2816+
lit_utf8_size_t *utf16_length_p) /**< [out] utf16 buffer size */
2817+
{
2818+
lit_utf8_size_t utf8_size;
2819+
lit_utf8_size_t utf8_length;
2820+
uint8_t flags = ECMA_STRING_FLAG_EMPTY;
2821+
const lit_utf8_byte_t *utf8_buffer_p = ecma_string_get_chars (str_p, &utf8_size, &utf8_length, NULL, &flags);
2822+
const lit_utf8_byte_t *utf8_buffer_end_p = utf8_buffer_p + utf8_size;
2823+
2824+
*utf16_length_p = utf8_length;
2825+
uint16_t *utf16_buff_p = (uint16_t *) jmem_heap_alloc_block (*utf16_length_p * sizeof (uint16_t));
2826+
uint16_t *utf16_buff_iter_p = utf16_buff_p;
2827+
2828+
while (utf8_buffer_p < utf8_buffer_end_p)
2829+
{
2830+
*utf16_buff_iter_p++ = (uint16_t) lit_cesu8_read_next (&utf8_buffer_p);
2831+
}
2832+
2833+
if (flags & ECMA_STRING_FLAG_MUST_BE_FREED)
2834+
{
2835+
jmem_heap_free_block ((void *) utf8_buffer_p, utf8_size);
2836+
}
2837+
2838+
return utf16_buff_p;
2839+
} /* ecma_string_cesu8_to_utf16 */
2840+
2841+
/**
2842+
* Allocate a new string from UTF16 encoded buffer
2843+
*
2844+
* @return pointer to the allocated string
2845+
*/
2846+
ecma_string_t *
2847+
ecma_new_ecma_string_from_utf16 (uint16_t *utf16_buff_p, lit_utf8_size_t utf16_length)
2848+
{
2849+
ecma_stringbuilder_t builder = ecma_stringbuilder_create ();
2850+
2851+
while (utf16_length--)
2852+
{
2853+
ecma_stringbuilder_append_codepoint (&builder, *utf16_buff_p++);
2854+
}
2855+
2856+
return ecma_stringbuilder_finalize (&builder);
2857+
} /* ecma_new_ecma_string_from_utf16 */
2858+
#endif /* JERRY_ICU */
2859+
28082860
/**
28092861
* @}
28102862
* @}

jerry-core/ecma/base/ecma-helpers.h

+4
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,10 @@ ecma_string_t *ecma_new_symbol_from_descriptor_string (ecma_value_t string_desc)
282282
bool ecma_prop_name_is_symbol (ecma_string_t *string_p);
283283
ecma_length_t ecma_op_advance_string_index (ecma_string_t *str_p, ecma_length_t index_num, bool is_unicode);
284284
#endif /* JERRY_ESNEXT */
285+
#if JERRY_ICU
286+
uint16_t *ecma_string_cesu8_to_utf16 (ecma_string_t *str_p, lit_utf8_size_t *utf16_length_p);
287+
ecma_string_t *ecma_new_ecma_string_from_utf16 (uint16_t *utf16_buff_p, lit_utf8_size_t utf16_length);
288+
#endif /* JERRY_ICU */
285289
#if JERRY_BUILTIN_CONTAINER
286290
ecma_string_t *ecma_new_map_key_string (ecma_value_t value);
287291
bool ecma_prop_name_is_map_key (ecma_string_t *string_p);

jerry-core/ecma/builtin-objects/ecma-builtin-string-prototype.c

+145
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,10 @@
3838
#include "ecma-regexp-object.h"
3939
#endif /* JERRY_BUILTIN_REGEXP */
4040

41+
#if JERRY_ICU
42+
#include "unicode/unorm2.h"
43+
#endif /* JERRY_ICU */
44+
4145
#if JERRY_BUILTIN_STRING
4246

4347
#define ECMA_BUILTINS_INTERNAL
@@ -80,6 +84,7 @@ enum
8084

8185
ECMA_STRING_PROTOTYPE_SUBSTR,
8286

87+
ECMA_STRING_PROTOTYPE_NORMALIZE,
8388
ECMA_STRING_PROTOTYPE_REPEAT,
8489
ECMA_STRING_PROTOTYPE_CODE_POINT_AT,
8590
ECMA_STRING_PROTOTYPE_PAD_START,
@@ -1226,6 +1231,141 @@ ecma_builtin_string_prototype_object_trim (ecma_string_t *original_string_p) /**
12261231

12271232
#if JERRY_ESNEXT
12281233

1234+
/**
1235+
* ICU string normalizer instance callback
1236+
*/
1237+
typedef const UNormalizer2 *(*icu_string_normalizer_instance_cb_t) (UErrorCode *);
1238+
1239+
/**
1240+
* Normalization form descriptor
1241+
*/
1242+
typedef struct
1243+
{
1244+
lit_magic_string_id_t kind; /**< kind */
1245+
icu_string_normalizer_instance_cb_t instance_cb; /**< normalizer instance callback */
1246+
} icu_string_form_normalizer_t;
1247+
1248+
/**
1249+
* Helper macro to register form normalizer entries
1250+
*/
1251+
#if JERRY_ICU
1252+
#define FORM_ENTRY(id, instance_cb) \
1253+
{ \
1254+
id, instance_cb \
1255+
}
1256+
#else /* !JERRY_ICU */
1257+
#define FORM_ENTRY(id, instance_cb) \
1258+
{ \
1259+
(id, NULL) \
1260+
}
1261+
#endif /* JERRY_ICU */
1262+
1263+
/**
1264+
* List of normalization forms
1265+
*/
1266+
static const icu_string_form_normalizer_t icu_string_normalize_forms[] = {
1267+
FORM_ENTRY (LIT_MAGIC_STRING_NFC_U, unorm2_getNFCInstance),
1268+
FORM_ENTRY (LIT_MAGIC_STRING_NFD_U, unorm2_getNFDInstance),
1269+
FORM_ENTRY (LIT_MAGIC_STRING_NFKC_U, unorm2_getNFKCInstance),
1270+
FORM_ENTRY (LIT_MAGIC_STRING_NFKD_U, unorm2_getNFKDInstance)
1271+
};
1272+
1273+
#undef FORM_ENTRY
1274+
1275+
/**
1276+
* The String.prototype object's 'normalize' routine
1277+
*
1278+
* See also:
1279+
* ECMA-262 v12, 22.1.3.13
1280+
*
1281+
* @return ecma value
1282+
* Returned value must be freed with ecma_free_value.
1283+
*/
1284+
static ecma_value_t
1285+
ecma_builtin_string_prototype_object_normalize (ecma_string_t *original_string_p, /**< this argument */
1286+
ecma_value_t form_value) /**< normalization from */
1287+
{
1288+
icu_string_normalizer_instance_cb_t normalizer_instance_cb = unorm2_getNFCInstance;
1289+
1290+
if (!ecma_is_value_undefined (form_value))
1291+
{
1292+
ecma_string_t *form_p = ecma_op_to_string (form_value);
1293+
1294+
if (JERRY_UNLIKELY (form_p == NULL))
1295+
{
1296+
return ECMA_VALUE_ERROR;
1297+
}
1298+
1299+
size_t forms_size = sizeof (icu_string_normalize_forms) / sizeof (icu_string_normalize_forms[0]);
1300+
uint32_t form_idx = 0;
1301+
1302+
for (; form_idx < forms_size; form_idx++)
1303+
{
1304+
if (ecma_compare_ecma_string_to_magic_id (form_p, icu_string_normalize_forms[form_idx].kind))
1305+
{
1306+
normalizer_instance_cb = icu_string_normalize_forms[form_idx].instance_cb;
1307+
break;
1308+
}
1309+
}
1310+
1311+
ecma_deref_ecma_string (form_p);
1312+
1313+
if (form_idx >= forms_size)
1314+
{
1315+
return ecma_raise_range_error (ECMA_ERR_INVALID_NORMALIZATION_FORM);
1316+
}
1317+
}
1318+
1319+
#if JERRY_ICU
1320+
JERRY_ASSERT (normalizer_instance_cb != NULL);
1321+
size_t string_size = ecma_string_get_size (original_string_p);
1322+
1323+
if (string_size == 0)
1324+
{
1325+
#endif /* JERRY_ICU */
1326+
ecma_ref_ecma_string (original_string_p);
1327+
return ecma_make_string_value (original_string_p);
1328+
#if JERRY_ICU
1329+
}
1330+
#endif /* JERRY_ICU */
1331+
1332+
UErrorCode status = U_ZERO_ERROR;
1333+
const UNormalizer2 *normalizer_cb = normalizer_instance_cb (&status);
1334+
1335+
if (!U_FAILURE (status))
1336+
{
1337+
ecma_value_t result = ECMA_VALUE_ERROR;
1338+
1339+
lit_utf8_size_t length;
1340+
uint16_t *buffer_p = ecma_string_cesu8_to_utf16 (original_string_p, &length);
1341+
int32_t norm_length = unorm2_normalize (normalizer_cb, buffer_p, (int32_t) length, NULL, 0, &status);
1342+
1343+
if (!U_FAILURE (status) || status == U_BUFFER_OVERFLOW_ERROR)
1344+
{
1345+
uint16_t *norm_buff_p = (uint16_t *) jmem_heap_alloc_block ((uint32_t) norm_length * sizeof (uint16_t));
1346+
1347+
status = U_ZERO_ERROR;
1348+
norm_length = unorm2_normalize (normalizer_cb, buffer_p, (int32_t) length, norm_buff_p, norm_length, &status);
1349+
1350+
if (!U_FAILURE (status))
1351+
{
1352+
result = ecma_make_string_value (ecma_new_ecma_string_from_utf16 (norm_buff_p, (uint32_t) norm_length));
1353+
}
1354+
1355+
jmem_heap_free_block (norm_buff_p, (uint32_t) norm_length * sizeof (uint16_t));
1356+
}
1357+
1358+
jmem_heap_free_block (buffer_p, length * sizeof (uint16_t));
1359+
1360+
if (!ECMA_IS_VALUE_ERROR (result))
1361+
{
1362+
return result;
1363+
}
1364+
}
1365+
1366+
return ecma_raise_type_error (ECMA_ERR_NORMALIZATION_FAILED);
1367+
} /* ecma_builtin_string_prototype_object_normalize */
1368+
12291369
/**
12301370
* The String.prototype object's 'repeat' routine
12311371
*
@@ -1570,6 +1710,11 @@ ecma_builtin_string_prototype_dispatch_routine (uint8_t builtin_routine_id, /**<
15701710
}
15711711
#endif /* JERRY_BUILTIN_ANNEXB */
15721712
#if JERRY_ESNEXT
1713+
case ECMA_STRING_PROTOTYPE_NORMALIZE:
1714+
{
1715+
ret_value = ecma_builtin_string_prototype_object_normalize (string_p, arg1);
1716+
break;
1717+
}
15731718
case ECMA_STRING_PROTOTYPE_REPEAT:
15741719
{
15751720
ret_value = ecma_builtin_string_prototype_object_repeat (string_p, arg1);

jerry-core/ecma/builtin-objects/ecma-builtin-string-prototype.inc.h

+1
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ ROUTINE (LIT_MAGIC_STRING_SUBSTR, ECMA_STRING_PROTOTYPE_SUBSTR, 2, 2)
6868
#endif /* JERRY_BUILTIN_ANNEXB */
6969

7070
#if JERRY_ESNEXT
71+
ROUTINE (LIT_MAGIC_STRING_NORMALIZE, ECMA_STRING_PROTOTYPE_NORMALIZE, 1, 0)
7172
ROUTINE (LIT_MAGIC_STRING_REPEAT, ECMA_STRING_PROTOTYPE_REPEAT, 1, 1)
7273
ROUTINE (LIT_MAGIC_STRING_STARTS_WITH, ECMA_STRING_PROTOTYPE_STARTS_WITH, 2, 1)
7374
ROUTINE (LIT_MAGIC_STRING_INCLUDES, ECMA_STRING_PROTOTYPE_INCLUDES, 2, 1)

0 commit comments

Comments
 (0)