Skip to content

Implement toLowerCase and toUpperCase built-in functions. #365

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
167 changes: 163 additions & 4 deletions jerry-core/ecma/builtin-objects/ecma-builtin-string-prototype.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include "ecma-try-catch-macro.h"
#include "jrt.h"
#include "jrt-libc-includes.h"
#include "lit-char-helpers.h"

#ifndef CONFIG_ECMA_COMPACT_PROFILE_DISABLE_STRING_BUILTIN

Expand Down Expand Up @@ -507,6 +508,164 @@ ecma_builtin_string_prototype_object_substring (ecma_value_t this_arg, /**< this
return ret_value;
} /* ecma_builtin_string_prototype_object_substring */

/**
 * Helper function to convert a string to upper or lower case.
 *
 * Shared by the toLowerCase, toLocaleLowerCase, toUpperCase and
 * toLocaleUpperCase routines of the String.prototype object.
 *
 * The conversion makes two passes over the input: the first pass only
 * computes the size of the converted string, the second pass writes the
 * converted characters into a buffer of exactly that size.
 *
 * @return completion value
 *         Returned value must be freed with ecma_free_completion_value.
 */
static ecma_completion_value_t
ecma_builtin_string_prototype_object_conversion_helper (ecma_value_t this_arg, /**< this argument */
                                                        bool lower_case) /**< convert to lower (true)
                                                                          *   or upper (false) case */
{
  ecma_completion_value_t ret_value = ecma_make_empty_completion_value ();

  /* 1. */
  ECMA_TRY_CATCH (check_coercible_val,
                  ecma_op_check_object_coercible (this_arg),
                  ret_value);

  /* 2. */
  ECMA_TRY_CATCH (to_string_val,
                  ecma_op_to_string (this_arg),
                  ret_value);

  /* 3. */
  ecma_string_t *input_string_p = ecma_get_string_from_value (to_string_val);
  lit_utf8_size_t input_size = ecma_string_get_size (input_string_p);

  MEM_DEFINE_LOCAL_ARRAY (input_start_p,
                          input_size,
                          lit_utf8_byte_t);

  ecma_string_to_utf8_string (input_string_p,
                              input_start_p,
                              (ssize_t) (input_size));

  /*
   * The case conversion has two major phases: first we compute
   * the size of the converted string, then we encode it.
   */

  /* First phase: compute the size of the output. */

  lit_utf8_size_t output_length = 0;
  lit_utf8_iterator_t input_iterator = lit_utf8_iterator_create (input_start_p, input_size);

  while (!lit_utf8_iterator_is_eos (&input_iterator))
  {
    ecma_char_t character = lit_utf8_iterator_read_next (&input_iterator);
    ecma_char_t character_buffer[LIT_MAXIMUM_OTHER_CASE_LENGTH];
    lit_utf8_byte_t utf8_byte_buffer[LIT_UTF8_MAX_BYTES_IN_CODE_POINT];
    lit_utf8_size_t character_length;

    /*
     * We need to keep surrogate pairs. Surrogates are never converted,
     * regardless they form a valid pair or not.
     *
     * A high surrogate that ends the string has no successor to peek at,
     * so it falls through and is handled as a stray surrogate.
     */
    if (lit_is_code_unit_high_surrogate (character)
        && !lit_utf8_iterator_is_eos (&input_iterator))
    {
      ecma_char_t next_character = lit_utf8_iterator_peek_next (&input_iterator);

      if (lit_is_code_unit_low_surrogate (next_character))
      {
        lit_code_point_t surrogate_code_point = lit_convert_surrogate_pair_to_code_point (character, next_character);
        output_length += lit_code_point_to_utf8 (surrogate_code_point, utf8_byte_buffer);
        lit_utf8_iterator_incr (&input_iterator);
        continue;
      }
    }

    if (lower_case)
    {
      character_length = lit_char_to_lower_case (character,
                                                 character_buffer,
                                                 LIT_MAXIMUM_OTHER_CASE_LENGTH);
    }
    else
    {
      character_length = lit_char_to_upper_case (character,
                                                 character_buffer,
                                                 LIT_MAXIMUM_OTHER_CASE_LENGTH);
    }

    JERRY_ASSERT (character_length >= 1 && character_length <= LIT_MAXIMUM_OTHER_CASE_LENGTH);

    for (lit_utf8_size_t i = 0; i < character_length; i++)
    {
      /*
       * Encode into the scratch buffer with the same routine the second
       * phase uses, so the computed size always matches the bytes written.
       */
      output_length += lit_code_point_to_utf8 (character_buffer[i], utf8_byte_buffer);
    }
  }

  /* Second phase: encode the output. */

  MEM_DEFINE_LOCAL_ARRAY (output_start_p,
                          output_length,
                          lit_utf8_byte_t);

  lit_utf8_byte_t *output_char_p = output_start_p;

  /* Restart the iteration from the beginning of the input. */
  lit_utf8_iterator_seek_bos (&input_iterator);

  while (!lit_utf8_iterator_is_eos (&input_iterator))
  {
    ecma_char_t character = lit_utf8_iterator_read_next (&input_iterator);
    ecma_char_t character_buffer[LIT_MAXIMUM_OTHER_CASE_LENGTH];
    lit_utf8_size_t character_length;

    /*
     * Same surrogate handling as in the first phase: pairs are copied
     * unchanged, and a trailing high surrogate is never peeked past.
     */
    if (lit_is_code_unit_high_surrogate (character)
        && !lit_utf8_iterator_is_eos (&input_iterator))
    {
      ecma_char_t next_character = lit_utf8_iterator_peek_next (&input_iterator);

      if (lit_is_code_unit_low_surrogate (next_character))
      {
        lit_code_point_t surrogate_code_point = lit_convert_surrogate_pair_to_code_point (character, next_character);
        output_char_p += lit_code_point_to_utf8 (surrogate_code_point, output_char_p);
        lit_utf8_iterator_incr (&input_iterator);
        continue;
      }
    }

    if (lower_case)
    {
      character_length = lit_char_to_lower_case (character,
                                                 character_buffer,
                                                 LIT_MAXIMUM_OTHER_CASE_LENGTH);
    }
    else
    {
      character_length = lit_char_to_upper_case (character,
                                                 character_buffer,
                                                 LIT_MAXIMUM_OTHER_CASE_LENGTH);
    }

    JERRY_ASSERT (character_length >= 1 && character_length <= LIT_MAXIMUM_OTHER_CASE_LENGTH);

    for (lit_utf8_size_t i = 0; i < character_length; i++)
    {
      output_char_p += lit_code_point_to_utf8 (character_buffer[i], output_char_p);
    }
  }

  /* Both phases must agree on the size of the output. */
  JERRY_ASSERT (output_start_p + output_length == output_char_p);

  ecma_string_t *output_string_p = ecma_new_ecma_string_from_utf8 (output_start_p, output_length);

  ret_value = ecma_make_normal_completion_value (ecma_make_string_value (output_string_p));

  MEM_FINALIZE_LOCAL_ARRAY (output_start_p);
  MEM_FINALIZE_LOCAL_ARRAY (input_start_p);

  ECMA_FINALIZE (to_string_val);
  ECMA_FINALIZE (check_coercible_val);

  return ret_value;
} /* ecma_builtin_string_prototype_object_conversion_helper */

/**
* The String.prototype object's 'toLowerCase' routine
*
Expand All @@ -519,7 +678,7 @@ ecma_builtin_string_prototype_object_substring (ecma_value_t this_arg, /**< this
static ecma_completion_value_t
ecma_builtin_string_prototype_object_to_lower_case (ecma_value_t this_arg) /**< this argument */
{
  /* The conversion itself is done by the shared case conversion helper (true selects lower case). */
  return ecma_builtin_string_prototype_object_conversion_helper (this_arg, true);
} /* ecma_builtin_string_prototype_object_to_lower_case */

/**
Expand All @@ -534,7 +693,7 @@ ecma_builtin_string_prototype_object_to_lower_case (ecma_value_t this_arg) /**<
static ecma_completion_value_t
ecma_builtin_string_prototype_object_to_locale_lower_case (ecma_value_t this_arg) /**< this argument */
{
  /* No locale data is consulted: this routine shares the conversion used by toLowerCase. */
  return ecma_builtin_string_prototype_object_conversion_helper (this_arg, true);
} /* ecma_builtin_string_prototype_object_to_locale_lower_case */

/**
Expand All @@ -549,7 +708,7 @@ ecma_builtin_string_prototype_object_to_locale_lower_case (ecma_value_t this_arg
static ecma_completion_value_t
ecma_builtin_string_prototype_object_to_upper_case (ecma_value_t this_arg) /**< this argument */
{
  /* The conversion itself is done by the shared case conversion helper (false selects upper case). */
  return ecma_builtin_string_prototype_object_conversion_helper (this_arg, false);
} /* ecma_builtin_string_prototype_object_to_upper_case */

/**
Expand All @@ -564,7 +723,7 @@ ecma_builtin_string_prototype_object_to_upper_case (ecma_value_t this_arg) /**<
static ecma_completion_value_t
ecma_builtin_string_prototype_object_to_locale_upper_case (ecma_value_t this_arg) /**< this argument */
{
  /* No locale data is consulted: this routine shares the conversion used by toUpperCase. */
  return ecma_builtin_string_prototype_object_conversion_helper (this_arg, false);
} /* ecma_builtin_string_prototype_object_to_locale_upper_case */

/**
Expand Down
76 changes: 76 additions & 0 deletions jerry-core/lit/lit-char-helpers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -328,3 +328,79 @@ lit_char_is_word_char (ecma_char_t c) /**< code unit */
|| (c >= LIT_CHAR_ASCII_DIGITS_BEGIN && c <= LIT_CHAR_ASCII_DIGITS_END)
|| c == LIT_CHAR_UNDERSCORE);
} /* lit_char_is_word_char */

/**
* Returns the lowercase character sequence of an ecma character.
*
* Note: output_buffer_p must be able to hold at least LIT_MAXIMUM_OTHER_CASE_LENGTH characters.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@zherczeg, could you please add a buffer_size argument to the function and check that condition with JERRY_ASSERT?

*
* @return the length of the lowercase character sequence
* which is always between 1 and LIT_MAXIMUM_OTHER_CASE_LENGTH.
*/
lit_utf8_size_t
lit_char_to_lower_case (ecma_char_t character, /**< input character value */
                        ecma_char_t *output_buffer_p, /**< buffer for the result characters */
                        size_t buffer_size) /**< buffer size */
{
  TODO ("Needs a proper lower case implementation. See issue #323.");

  JERRY_ASSERT (buffer_size >= LIT_MAXIMUM_OTHER_CASE_LENGTH);

  /* U+0130 is the only character handled here that expands to two characters: 'i' followed by U+0307. */
  if (character == 0x130)
  {
    output_buffer_p[0] = LIT_CHAR_LOWERCASE_I;
    output_buffer_p[1] = 0x307;
    return 2;
  }

  /* ASCII upper case letters are shifted into the lower case range, anything else is copied unchanged. */
  const bool is_ascii_upper_case = (character >= LIT_CHAR_UPPERCASE_A
                                    && character <= LIT_CHAR_UPPERCASE_Z);

  output_buffer_p[0] = (is_ascii_upper_case
                        ? (ecma_char_t) (character + (LIT_CHAR_LOWERCASE_A - LIT_CHAR_UPPERCASE_A))
                        : character);
  return 1;
} /* lit_char_to_lower_case */

/**
* Returns the uppercase character sequence of an ecma character.
*
* Note: output_buffer_p must be able to hold at least LIT_MAXIMUM_OTHER_CASE_LENGTH characters.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@zherczeg, could you please update this function in the same way?

*
* @return the length of the uppercase character sequence
* which is always between 1 and LIT_MAXIMUM_OTHER_CASE_LENGTH.
*/
lit_utf8_size_t
lit_char_to_upper_case (ecma_char_t character, /**< input character value */
                        ecma_char_t *output_buffer_p, /**< buffer for the result characters */
                        size_t buffer_size) /**< buffer size */
{
  TODO ("Needs a proper upper case implementation. See issue #323.");

  JERRY_ASSERT (buffer_size >= LIT_MAXIMUM_OTHER_CASE_LENGTH);

  /* Characters whose upper case form is longer than one character. */
  switch (character)
  {
    case 0xdf:
    {
      /* U+00DF becomes two 'S' characters. */
      output_buffer_p[0] = LIT_CHAR_UPPERCASE_S;
      output_buffer_p[1] = LIT_CHAR_UPPERCASE_S;
      return 2;
    }
    case 0x1fd7:
    {
      /* U+1FD7 becomes the three character sequence U+0399 U+0308 U+0342. */
      output_buffer_p[0] = 0x399;
      output_buffer_p[1] = 0x308;
      output_buffer_p[2] = 0x342;
      return 3;
    }
    default:
    {
      break;
    }
  }

  /* ASCII lower case letters are shifted into the upper case range, anything else is copied unchanged. */
  const bool is_ascii_lower_case = (character >= LIT_CHAR_LOWERCASE_A
                                    && character <= LIT_CHAR_LOWERCASE_Z);

  output_buffer_p[0] = (is_ascii_lower_case
                        ? (ecma_char_t) (character - (LIT_CHAR_LOWERCASE_A - LIT_CHAR_UPPERCASE_A))
                        : character);
  return 1;
} /* lit_char_to_upper_case */
12 changes: 12 additions & 0 deletions jerry-core/lit/lit-char-helpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -220,4 +220,16 @@ extern uint32_t lit_char_hex_to_int (ecma_char_t);
*/
extern bool lit_char_is_word_char (ecma_char_t);

/*
* Utility functions for uppercasing / lowercasing
*/

/**
 * Maximum number of characters a single character can expand to when it is
 * converted to its other case. Buffers passed to lit_char_to_lower_case /
 * lit_char_to_upper_case must be able to hold at least this many characters.
 */
#define LIT_MAXIMUM_OTHER_CASE_LENGTH (3)

/*
 * Arguments: input character, output buffer, output buffer size (in characters).
 * Both functions return the number of characters written to the output buffer.
 */
lit_utf8_size_t lit_char_to_lower_case (ecma_char_t, ecma_char_t *, size_t);
lit_utf8_size_t lit_char_to_upper_case (ecma_char_t, ecma_char_t *, size_t);

#endif /* LIT_CHAR_HELPERS_H */
2 changes: 1 addition & 1 deletion jerry-core/lit/lit-strings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -753,7 +753,7 @@ lit_code_point_to_utf8 (lit_code_point_t code_point, /**< code point */
buf[3] = LIT_UTF8_EXTRA_BYTE_MARKER | fourth_byte_bits;
return 4;
}
} /* lit_code_unit_to_utf8 */
} /* lit_code_point_to_utf8 */

/**
* Convert surrogate pair to code point
Expand Down
55 changes: 55 additions & 0 deletions tests/jerry/string-upper-lower-case-conversion.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
// Copyright 2015 University of Szeged
// Copyright 2015 Samsung Electronics Co., Ltd.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Conversion

var alphanumerics = "0123456789abcdefghijklmnopqrstuvwxzyABCDEFGHIJKLMNOPQRSTUVWXYZ";

// Each entry is [input string, conversion method, expected result].
var conversions = [
  [alphanumerics, "toLowerCase", "0123456789abcdefghijklmnopqrstuvwxzyabcdefghijklmnopqrstuvwxyz"],
  [alphanumerics, "toUpperCase", "0123456789ABCDEFGHIJKLMNOPQRSTUVWXZYABCDEFGHIJKLMNOPQRSTUVWXYZ"],

  // Single characters whose converted form is longer than the input.
  ["\u0130", "toLowerCase", "i\u0307"],
  ["\xdf", "toUpperCase", "SS"],
  ["\u1fd7", "toUpperCase", "\u0399\u0308\u0342"],

  // The same characters mixed with others and repeated.
  ["H\u0130-+", "toLowerCase", "hi\u0307-+"],
  ["\xdf\u1fd7\xdf", "toUpperCase", "SS\u0399\u0308\u0342SS"],
  ["\u0130\u0130\u0130", "toLowerCase", "i\u0307i\u0307i\u0307"],

  // Code points 0x10400 and 0x10428 form an upper / lower case pair, but
  // JavaScript must not convert them. Stray surrogates must also be left
  // untouched.
  ["\ud801\ud801\udc00\udc00", "toLowerCase", "\ud801\ud801\udc00\udc00"],
  ["\ud801\ud801\udc28\udc28", "toUpperCase", "\ud801\ud801\udc28\udc28"]
];

for (var index = 0; index < conversions.length; index++)
{
  var entry = conversions[index];
  assert (entry[0][entry[1]]() == entry[2]);
}

// Conversion of non-string objects.

var toUpper = String.prototype.toUpperCase;
var toLower = String.prototype.toLowerCase;

assert (toUpper.call(true) == "TRUE");
assert (toLower.call(-23) == "-23");

var object = { toString : function() { return "<sTr>"; } };
assert (toUpper.call(object) == "<STR>");
assert (toLower.call(object) == "<str>");

// A null 'this' value is not coercible to an object, so a TypeError is expected.
var caught = null;

try
{
  toUpper.call(null);
}
catch (e)
{
  caught = e;
}

assert (caught instanceof TypeError);