diff --git a/jerry-core/ecma/builtin-objects/ecma-builtin-string-prototype.cpp b/jerry-core/ecma/builtin-objects/ecma-builtin-string-prototype.cpp index bfb6c7e8aa..7ca031e5ae 100644 --- a/jerry-core/ecma/builtin-objects/ecma-builtin-string-prototype.cpp +++ b/jerry-core/ecma/builtin-objects/ecma-builtin-string-prototype.cpp @@ -27,6 +27,7 @@ #include "ecma-try-catch-macro.h" #include "jrt.h" #include "jrt-libc-includes.h" +#include "lit-char-helpers.h" #ifndef CONFIG_ECMA_COMPACT_PROFILE_DISABLE_STRING_BUILTIN @@ -507,6 +508,164 @@ ecma_builtin_string_prototype_object_substring (ecma_value_t this_arg, /**< this return ret_value; } /* ecma_builtin_string_prototype_object_substring */ +/** + * Helper function to convert a string to upper or lower case. + * + * The this value is checked for coercibility and converted to string first; + * the result is a new string built from the converted characters. + * Surrogate pairs are copied to the output without case conversion. + * + * @return completion value + * Returned value must be freed with ecma_free_completion_value. + */ +static ecma_completion_value_t +ecma_builtin_string_prototype_object_conversion_helper (ecma_value_t this_arg, /**< this argument */ + bool lower_case) /**< convert to lower (true) + * or upper (false) case */ +{ + ecma_completion_value_t ret_value = ecma_make_empty_completion_value (); + + /* 1. Check that the this value is object coercible. */ + ECMA_TRY_CATCH (check_coercible_val, + ecma_op_check_object_coercible (this_arg), + ret_value); + + /* 2. Convert the this value to string. */ + ECMA_TRY_CATCH (to_string_val, + ecma_op_to_string (this_arg), + ret_value); + + /* 3. Copy the bytes of the string into a local buffer. */ + ecma_string_t *input_string_p = ecma_get_string_from_value (to_string_val); + lit_utf8_size_t input_size = ecma_string_get_size (input_string_p); + + MEM_DEFINE_LOCAL_ARRAY (input_start_p, + input_size, + lit_utf8_byte_t); + + ecma_string_to_utf8_string (input_string_p, + input_start_p, + (ssize_t) (input_size)); + + /* + * The conversion has two major phases: first we compute + * the length of the converted string, then we encode it. 
+ */ + + lit_utf8_size_t output_length = 0; + lit_utf8_iterator_t input_iterator = lit_utf8_iterator_create (input_start_p, input_size); + + while (!lit_utf8_iterator_is_eos (&input_iterator)) + { + ecma_char_t character = lit_utf8_iterator_read_next (&input_iterator); + ecma_char_t character_buffer[LIT_MAXIMUM_OTHER_CASE_LENGTH]; + lit_utf8_byte_t utf8_byte_buffer[LIT_UTF8_MAX_BYTES_IN_CODE_POINT]; + lit_utf8_size_t character_length; + + /* + * We need to keep surrogate pairs. Surrogates are never converted, + * regardless of whether they form a valid pair or not. + * + * NOTE(review): lit_utf8_iterator_peek_next is called without an + * end-of-string check; confirm this is safe when the input ends + * with a high surrogate. + */ + if (lit_is_code_unit_high_surrogate (character)) + { + ecma_char_t next_character = lit_utf8_iterator_peek_next (&input_iterator); + + if (lit_is_code_unit_low_surrogate (next_character)) + { + lit_code_point_t surrogate_code_point = lit_convert_surrogate_pair_to_code_point (character, next_character); + output_length += lit_code_point_to_utf8 (surrogate_code_point, utf8_byte_buffer); + lit_utf8_iterator_incr (&input_iterator); + continue; + } + } + + if (lower_case) + { + character_length = lit_char_to_lower_case (character, + character_buffer, + LIT_MAXIMUM_OTHER_CASE_LENGTH); + } + else + { + character_length = lit_char_to_upper_case (character, + character_buffer, + LIT_MAXIMUM_OTHER_CASE_LENGTH); + } + + JERRY_ASSERT (character_length >= 1 && character_length <= LIT_MAXIMUM_OTHER_CASE_LENGTH); + /* Only the encoded size is accumulated in this phase; utf8_byte_buffer is scratch space. */ + for (lit_utf8_size_t i = 0; i < character_length; i++) + { + output_length += lit_code_unit_to_utf8 (character_buffer[i], utf8_byte_buffer); + } + } + + /* Second phase: allocate the output buffer and fill it. */ + + MEM_DEFINE_LOCAL_ARRAY (output_start_p, + output_length, + lit_utf8_byte_t); + + lit_utf8_byte_t *output_char_p = output_start_p; + + /* Encoding the output. 
*/ + lit_utf8_iterator_seek_bos (&input_iterator); + + while (!lit_utf8_iterator_is_eos (&input_iterator)) + { + ecma_char_t character = lit_utf8_iterator_read_next (&input_iterator); + ecma_char_t character_buffer[LIT_MAXIMUM_OTHER_CASE_LENGTH]; + lit_utf8_size_t character_length; + + /* + * We need to keep surrogate pairs. Surrogates are never converted, + * regardless of whether they form a valid pair or not. + */ + if (lit_is_code_unit_high_surrogate (character)) + { + ecma_char_t next_character = lit_utf8_iterator_peek_next (&input_iterator); + + if (lit_is_code_unit_low_surrogate (next_character)) + { + lit_code_point_t surrogate_code_point = lit_convert_surrogate_pair_to_code_point (character, next_character); + output_char_p += lit_code_point_to_utf8 (surrogate_code_point, output_char_p); + lit_utf8_iterator_incr (&input_iterator); + continue; + } + } + + if (lower_case) + { + character_length = lit_char_to_lower_case (character, + character_buffer, + LIT_MAXIMUM_OTHER_CASE_LENGTH); + } + else + { + character_length = lit_char_to_upper_case (character, + character_buffer, + LIT_MAXIMUM_OTHER_CASE_LENGTH); + } + + JERRY_ASSERT (character_length >= 1 && character_length <= LIT_MAXIMUM_OTHER_CASE_LENGTH); + /* NOTE(review): the first phase measured these code units with lit_code_unit_to_utf8; confirm lit_code_point_to_utf8 yields the same size. */ + for (lit_utf8_size_t i = 0; i < character_length; i++) + { + output_char_p += lit_code_point_to_utf8 (character_buffer[i], output_char_p); + } + } + /* The number of bytes written must match the length computed in the first phase. */ + JERRY_ASSERT (output_start_p + output_length == output_char_p); + + ecma_string_t *output_string_p = ecma_new_ecma_string_from_utf8 (output_start_p, output_length); + + ret_value = ecma_make_normal_completion_value (ecma_make_string_value (output_string_p)); + + MEM_FINALIZE_LOCAL_ARRAY (output_start_p); + MEM_FINALIZE_LOCAL_ARRAY (input_start_p); + + ECMA_FINALIZE (to_string_val); + ECMA_FINALIZE (check_coercible_val); + + return ret_value; +} /* ecma_builtin_string_prototype_object_conversion_helper */ + /** * The String.prototype object's 'toLowerCase' routine * @@ -519,7 +678,7 @@ 
ecma_builtin_string_prototype_object_substring (ecma_value_t this_arg, /**< this static ecma_completion_value_t ecma_builtin_string_prototype_object_to_lower_case (ecma_value_t this_arg) /**< this argument */ { - ECMA_BUILTIN_CP_UNIMPLEMENTED (this_arg); + return ecma_builtin_string_prototype_object_conversion_helper (this_arg, true); /* lower_case = true */ } /* ecma_builtin_string_prototype_object_to_lower_case */ /** @@ -534,7 +693,7 @@ ecma_builtin_string_prototype_object_to_lower_case (ecma_value_t this_arg) /**< static ecma_completion_value_t ecma_builtin_string_prototype_object_to_locale_lower_case (ecma_value_t this_arg) /**< this argument */ { - ECMA_BUILTIN_CP_UNIMPLEMENTED (this_arg); + return ecma_builtin_string_prototype_object_conversion_helper (this_arg, true); /* same call as toLowerCase: the helper takes no locale input */ } /* ecma_builtin_string_prototype_object_to_locale_lower_case */ /** @@ -549,7 +708,7 @@ ecma_builtin_string_prototype_object_to_locale_lower_case (ecma_value_t this_arg static ecma_completion_value_t ecma_builtin_string_prototype_object_to_upper_case (ecma_value_t this_arg) /**< this argument */ { - ECMA_BUILTIN_CP_UNIMPLEMENTED (this_arg); + return ecma_builtin_string_prototype_object_conversion_helper (this_arg, false); /* lower_case = false, i.e. convert to upper case */ } /* ecma_builtin_string_prototype_object_to_upper_case */ /** @@ -564,7 +723,7 @@ ecma_builtin_string_prototype_object_to_upper_case (ecma_value_t this_arg) /**< static ecma_completion_value_t ecma_builtin_string_prototype_object_to_locale_upper_case (ecma_value_t this_arg) /**< this argument */ { - ECMA_BUILTIN_CP_UNIMPLEMENTED (this_arg); + return ecma_builtin_string_prototype_object_conversion_helper (this_arg, false); /* same call as toUpperCase: the helper takes no locale input */ } /* ecma_builtin_string_prototype_object_to_locale_upper_case */ /** diff --git a/jerry-core/lit/lit-char-helpers.cpp b/jerry-core/lit/lit-char-helpers.cpp index 6e0ed9b31f..76b2a2a216 100644 --- a/jerry-core/lit/lit-char-helpers.cpp +++ b/jerry-core/lit/lit-char-helpers.cpp @@ -328,3 +328,79 @@ lit_char_is_word_char (ecma_char_t c) /**< code unit */ || (c >= 
LIT_CHAR_ASCII_DIGITS_BEGIN && c <= LIT_CHAR_ASCII_DIGITS_END) || c == LIT_CHAR_UNDERSCORE); } /* lit_char_is_word_char */ + +/** + * Returns the lowercase character sequence of an ecma character. + * + * Only the ASCII letters A-Z and U+0130 are converted by the current + * implementation; every other character is returned unchanged. + * + * Note: output_buffer_p must be able to hold at least LIT_MAXIMUM_OTHER_CASE_LENGTH characters. + * + * @return the length of the lowercase character sequence + * which is always between 1 and LIT_MAXIMUM_OTHER_CASE_LENGTH. + */ +lit_utf8_size_t +lit_char_to_lower_case (ecma_char_t character, /**< input character value */ + ecma_char_t *output_buffer_p, /**< buffer for the result characters */ + size_t buffer_size) /**< buffer size (number of ecma_char_t elements) */ +{ + TODO ("Needs a proper lower case implementation. See issue #323."); + + JERRY_ASSERT (buffer_size >= LIT_MAXIMUM_OTHER_CASE_LENGTH); + /* ASCII A-Z: shift into the a-z range. */ + if (character >= LIT_CHAR_UPPERCASE_A && character <= LIT_CHAR_UPPERCASE_Z) + { + output_buffer_p[0] = (ecma_char_t) (character + (LIT_CHAR_LOWERCASE_A - LIT_CHAR_UPPERCASE_A)); + return 1; + } + /* U+0130 (capital I with dot above) maps to a small i followed by U+0307 (combining dot above). */ + if (character == 0x130) + { + output_buffer_p[0] = LIT_CHAR_LOWERCASE_I; + output_buffer_p[1] = 0x307; + return 2; + } + /* No mapping implemented for this character: return it unchanged. */ + output_buffer_p[0] = character; + return 1; +} /* lit_char_to_lower_case */ + +/** + * Returns the uppercase character sequence of an ecma character. + * + * Only the ASCII letters a-z, U+00DF and U+1FD7 are converted by the current + * implementation; every other character is returned unchanged. + * + * Note: output_buffer_p must be able to hold at least LIT_MAXIMUM_OTHER_CASE_LENGTH characters. + * + * @return the length of the uppercase character sequence + * which is always between 1 and LIT_MAXIMUM_OTHER_CASE_LENGTH. + */ +lit_utf8_size_t +lit_char_to_upper_case (ecma_char_t character, /**< input character value */ + ecma_char_t *output_buffer_p, /**< buffer for the result characters */ + size_t buffer_size) /**< buffer size (number of ecma_char_t elements) */ +{ + TODO ("Needs a proper upper case implementation. 
See issue #323."); + + JERRY_ASSERT (buffer_size >= LIT_MAXIMUM_OTHER_CASE_LENGTH); + /* ASCII a-z: shift into the A-Z range. */ + if (character >= LIT_CHAR_LOWERCASE_A && character <= LIT_CHAR_LOWERCASE_Z) + { + output_buffer_p[0] = (ecma_char_t) (character - (LIT_CHAR_LOWERCASE_A - LIT_CHAR_UPPERCASE_A)); + return 1; + } + /* U+00DF (small sharp s) maps to the two-letter sequence SS. */ + if (character == 0xdf) + { + output_buffer_p[0] = LIT_CHAR_UPPERCASE_S; + output_buffer_p[1] = LIT_CHAR_UPPERCASE_S; + return 2; + } + /* U+1FD7 (Greek small iota with dialytika and perispomeni) maps to capital iota (U+0399) followed by combining diaeresis (U+0308) and combining perispomeni (U+0342). */ + if (character == 0x1fd7) + { + output_buffer_p[0] = 0x399; + output_buffer_p[1] = 0x308; + output_buffer_p[2] = 0x342; + return 3; + } + /* No mapping implemented for this character: return it unchanged. */ + output_buffer_p[0] = character; + return 1; +} /* lit_char_to_upper_case */ diff --git a/jerry-core/lit/lit-char-helpers.h b/jerry-core/lit/lit-char-helpers.h index e832406415..dc6f487310 100644 --- a/jerry-core/lit/lit-char-helpers.h +++ b/jerry-core/lit/lit-char-helpers.h @@ -220,4 +220,16 @@ extern uint32_t lit_char_hex_to_int (ecma_char_t); */ extern bool lit_char_is_word_char (ecma_char_t); +/* + * Utility functions for uppercasing / lowercasing + */ + +/** + * Minimum output buffer size (number of ecma_char_t elements) for the lit_char_to_lower_case / lit_char_to_upper_case functions. 
+ */ +#define LIT_MAXIMUM_OTHER_CASE_LENGTH (3) + /* Both functions take the input character, the output buffer and the buffer size, and return the length of the character sequence stored in the buffer. NOTE(review): the neighbouring declarations in this header are marked extern while these two are not; confirm this is intentional. */ +lit_utf8_size_t lit_char_to_lower_case (ecma_char_t, ecma_char_t *, size_t); +lit_utf8_size_t lit_char_to_upper_case (ecma_char_t, ecma_char_t *, size_t); + #endif /* LIT_CHAR_HELPERS_H */ diff --git a/jerry-core/lit/lit-strings.cpp b/jerry-core/lit/lit-strings.cpp index 230e4173f6..9f1c446225 100644 --- a/jerry-core/lit/lit-strings.cpp +++ b/jerry-core/lit/lit-strings.cpp @@ -753,7 +753,7 @@ lit_code_point_to_utf8 (lit_code_point_t code_point, /**< code point */ buf[3] = LIT_UTF8_EXTRA_BYTE_MARKER | fourth_byte_bits; return 4; } -} /* lit_code_unit_to_utf8 */ +} /* lit_code_point_to_utf8 */ /** * Convert surrogate pair to code point diff --git a/tests/jerry/string-upper-lower-case-conversion.js b/tests/jerry/string-upper-lower-case-conversion.js new file mode 100644 index 0000000000..d687aacc2f --- /dev/null +++ b/tests/jerry/string-upper-lower-case-conversion.js @@ -0,0 +1,55 @@ +// Copyright 2015 University of Szeged +// Copyright 2015 Samsung Electronics Co., Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Conversion of ASCII letters and of the special-cased characters. + +assert ("0123456789abcdefghijklmnopqrstuvwxzyABCDEFGHIJKLMNOPQRSTUVWXYZ".toLowerCase() + == "0123456789abcdefghijklmnopqrstuvwxzyabcdefghijklmnopqrstuvwxyz"); +assert ("0123456789abcdefghijklmnopqrstuvwxzyABCDEFGHIJKLMNOPQRSTUVWXYZ".toUpperCase() + == "0123456789ABCDEFGHIJKLMNOPQRSTUVWXZYABCDEFGHIJKLMNOPQRSTUVWXYZ"); + +assert ("\u0130".toLowerCase() == "i\u0307"); +assert ("\xdf".toUpperCase() == "SS"); +assert ("\u1fd7".toUpperCase() == "\u0399\u0308\u0342"); + +assert ("H\u0130-+".toLowerCase() == "hi\u0307-+"); +assert ("\xdf\u1fd7\xdf".toUpperCase() == "SS\u0399\u0308\u0342SS"); +assert ("\u0130\u0130\u0130".toLowerCase() == "i\u0307i\u0307i\u0307"); + +// Although code points 0x10400 and 0x10428 form an upper/lowercase pair, +// they must not be converted in JavaScript. Stray surrogates must +// be left unchanged as well. + +assert ("\ud801\ud801\udc00\udc00".toLowerCase() == "\ud801\ud801\udc00\udc00"); +assert ("\ud801\ud801\udc28\udc28".toUpperCase() == "\ud801\ud801\udc28\udc28"); + +// Conversion of non-string objects. + +assert (String.prototype.toUpperCase.call(true) == "TRUE"); +assert (String.prototype.toLowerCase.call(-23) == "-23"); + +var object = { toString : function() { return ""; } }; +assert (String.prototype.toUpperCase.call(object) == ""); +assert (String.prototype.toLowerCase.call(object) == ""); +// A null this value must be rejected with a TypeError. +try +{ + String.prototype.toUpperCase.call(null); + assert(false); +} +catch (e) +{ + assert (e instanceof TypeError); +}