Implement toLowerCase and toUpperCase built-in functions.

zherczeg · zherczeg · commit fd0d85bf4c7b · 2015-07-10T05:54:01.000-07:00
JerryScript-DCO-1.0-Signed-off-by: Zoltan Herczeg zherczeg@inf.u-szeged.hu
diff --git a/jerry-core/ecma/builtin-objects/ecma-builtin-string-prototype.cpp b/jerry-core/ecma/builtin-objects/ecma-builtin-string-prototype.cpp
@@ -27,6 +27,7 @@
 #include "ecma-try-catch-macro.h"
 #include "jrt.h"
 #include "jrt-libc-includes.h"
+#include "lit-char-helpers.h"
 
 #ifndef CONFIG_ECMA_COMPACT_PROFILE_DISABLE_STRING_BUILTIN
 
@@ -507,6 +508,157 @@ ecma_builtin_string_prototype_object_substring (ecma_value_t this_arg, /**< this
   return ret_value;
 } /* ecma_builtin_string_prototype_object_substring */
 
+/**
+ * Common implementation of toLowerCase / toUpperCase (also called by the locale variants):
+ * checks that 'this' is object-coercible, converts it to string and case-maps it code unit by code unit.
+ * @return completion value holding the converted string when both checks succeed.
+ *         Returned value must be freed with ecma_free_completion_value.
+ */
+static ecma_completion_value_t
+ecma_builtin_string_prototype_object_conversion_helper (ecma_value_t this_arg, /**< this argument */
+                                                        bool lower_case) /**< true: to lower case, false: to upper case */
+{
+  ecma_completion_value_t ret_value = ecma_make_empty_completion_value ();
+
+  /* 1. The this value must be object-coercible. */
+  ECMA_TRY_CATCH (check_coercible_val,
+                  ecma_op_check_object_coercible (this_arg),
+                  ret_value);
+
+  /* 2. Convert the this value to string. */
+  ECMA_TRY_CATCH (to_string_val,
+                  ecma_op_to_string (this_arg),
+                  ret_value);
+
+  JERRY_ASSERT (ecma_is_value_string (to_string_val));
+
+  /* 3. Get the utf-8 bytes of the string into a local buffer so that it can be iterated. */
+  ecma_string_t *input_string_p = ecma_get_string_from_value (to_string_val);
+  lit_utf8_size_t input_size = ecma_string_get_size (input_string_p);
+
+  MEM_DEFINE_LOCAL_ARRAY (input_start_p,
+                          input_size,
+                          lit_utf8_byte_t);
+
+  ecma_string_to_utf8_string (input_string_p,
+                              input_start_p,
+                              (ssize_t) (input_size));
+
+  /*
+   * The conversion has two major phases: first we compute the byte
+   * length of the converted string, then we encode it into a buffer of that size.
+   */
+
+  lit_utf8_size_t output_length = 0;
+  lit_utf8_iterator_t input_iterator = lit_utf8_iterator_create (input_start_p, input_size);
+
+  while (!lit_utf8_iterator_is_eos (&input_iterator))
+  {
+    ecma_char_t character = lit_utf8_iterator_read_next (&input_iterator);
+    ecma_char_t character_buffer[MAXIMUM_OTHERCASE_LENGTH];
+    lit_utf8_byte_t utf8_byte_buffer[LIT_UTF8_MAX_BYTES_IN_CODE_POINT];
+    lit_utf8_size_t character_length;
+
+    /*
+     * Surrogates are never case-converted, paired or not; a valid pair is re-encoded as one code point.
+     * NOTE(review): peek_next runs without an EOS check - confirm it is safe after a trailing high surrogate.
+     */
+    if (lit_is_code_unit_high_surrogate (character))
+    {
+      ecma_char_t next_character = lit_utf8_iterator_peek_next (&input_iterator);
+
+      if (lit_is_code_unit_low_surrogate (next_character))
+      {
+        lit_code_point_t surrogate_code_point = lit_convert_surrogate_pair_to_code_point (character, next_character);
+        output_length += lit_code_point_to_utf8 (surrogate_code_point, utf8_byte_buffer);
+        lit_utf8_iterator_incr (&input_iterator); /* skip the low surrogate peeked above */
+        continue;
+      }
+    }
+
+    if (lower_case)
+    {
+      character_length = lit_char_to_lower_case (character, character_buffer);
+    }
+    else
+    {
+      character_length = lit_char_to_upper_case (character, character_buffer);
+    }
+
+    JERRY_ASSERT (character_length >= 1 && character_length <= MAXIMUM_OTHERCASE_LENGTH);
+
+    for (lit_utf8_size_t i = 0; i < character_length; i++)
+    {
+      output_length += lit_code_unit_to_utf8 (character_buffer[i], utf8_byte_buffer); /* only the size is kept */
+    }
+  }
+
+  /* Second phase: define the output buffer with the size computed above. */
+
+  MEM_DEFINE_LOCAL_ARRAY (output_start_p,
+                          output_length,
+                          lit_utf8_byte_t);
+
+  lit_utf8_byte_t *output_char_p = output_start_p;
+
+  /* Encoding the output: restart from the beginning of the input and write the bytes this time. */
+  lit_utf8_iterator_seek_bos (&input_iterator);
+
+  while (!lit_utf8_iterator_is_eos (&input_iterator))
+  {
+    ecma_char_t character = lit_utf8_iterator_read_next (&input_iterator);
+    ecma_char_t character_buffer[MAXIMUM_OTHERCASE_LENGTH];
+    lit_utf8_size_t character_length;
+
+    /*
+     * Surrogates are kept exactly as in the first phase. NOTE(review): the loop below encodes with
+     * lit_code_point_to_utf8 while the first phase measured with lit_code_unit_to_utf8 - confirm the sizes agree.
+     */
+    if (lit_is_code_unit_high_surrogate (character))
+    {
+      ecma_char_t next_character = lit_utf8_iterator_peek_next (&input_iterator);
+
+      if (lit_is_code_unit_low_surrogate (next_character))
+      {
+        lit_code_point_t surrogate_code_point = lit_convert_surrogate_pair_to_code_point (character, next_character);
+        output_char_p += lit_code_point_to_utf8 (surrogate_code_point, output_char_p);
+        lit_utf8_iterator_incr (&input_iterator); /* skip the low surrogate peeked above */
+        continue;
+      }
+    }
+
+    if (lower_case)
+    {
+      character_length = lit_char_to_lower_case (character, character_buffer);
+    }
+    else
+    {
+      character_length = lit_char_to_upper_case (character, character_buffer);
+    }
+
+    JERRY_ASSERT (character_length >= 1 && character_length <= MAXIMUM_OTHERCASE_LENGTH);
+
+    for (lit_utf8_size_t i = 0; i < character_length; i++)
+    {
+      output_char_p += lit_code_point_to_utf8 (character_buffer[i], output_char_p);
+    }
+  }
+
+  JERRY_ASSERT (output_start_p + output_length == output_char_p); /* both phases must agree on the size */
+
+  ecma_string_t *output_string_p = ecma_new_ecma_string_from_utf8 (output_start_p, output_length);
+
+  ret_value = ecma_make_normal_completion_value (ecma_make_string_value (output_string_p));
+
+  MEM_FINALIZE_LOCAL_ARRAY (output_start_p);
+  MEM_FINALIZE_LOCAL_ARRAY (input_start_p);
+
+  ECMA_FINALIZE (to_string_val);
+  ECMA_FINALIZE (check_coercible_val);
+
+  return ret_value;
+} /* ecma_builtin_string_prototype_object_conversion_helper */
+
 /**
  * The String.prototype object's 'toLowerCase' routine
  *
@@ -519,7 +671,7 @@ ecma_builtin_string_prototype_object_substring (ecma_value_t this_arg, /**< this
 static ecma_completion_value_t
 ecma_builtin_string_prototype_object_to_lower_case (ecma_value_t this_arg) /**< this argument */
 {
-  ECMA_BUILTIN_CP_UNIMPLEMENTED (this_arg);
+  return ecma_builtin_string_prototype_object_conversion_helper (this_arg, true); /* lower_case = true */
 } /* ecma_builtin_string_prototype_object_to_lower_case */
 
 /**
@@ -534,7 +686,7 @@ ecma_builtin_string_prototype_object_to_lower_case (ecma_value_t this_arg) /**<
 static ecma_completion_value_t
 ecma_builtin_string_prototype_object_to_locale_lower_case (ecma_value_t this_arg) /**< this argument */
 {
-  ECMA_BUILTIN_CP_UNIMPLEMENTED (this_arg);
+  return ecma_builtin_string_prototype_object_conversion_helper (this_arg, true); /* same call as toLowerCase */
 } /* ecma_builtin_string_prototype_object_to_locale_lower_case */
 
 /**
@@ -549,7 +701,7 @@ ecma_builtin_string_prototype_object_to_locale_lower_case (ecma_value_t this_arg
 static ecma_completion_value_t
 ecma_builtin_string_prototype_object_to_upper_case (ecma_value_t this_arg) /**< this argument */
 {
-  ECMA_BUILTIN_CP_UNIMPLEMENTED (this_arg);
+  return ecma_builtin_string_prototype_object_conversion_helper (this_arg, false); /* lower_case = false */
 } /* ecma_builtin_string_prototype_object_to_upper_case */
 
 /**
@@ -564,7 +716,7 @@ ecma_builtin_string_prototype_object_to_upper_case (ecma_value_t this_arg) /**<
 static ecma_completion_value_t
 ecma_builtin_string_prototype_object_to_locale_upper_case (ecma_value_t this_arg) /**< this argument */
 {
-  ECMA_BUILTIN_CP_UNIMPLEMENTED (this_arg);
+  return ecma_builtin_string_prototype_object_conversion_helper (this_arg, false); /* same call as toUpperCase */
 } /* ecma_builtin_string_prototype_object_to_locale_upper_case */
 
 /**
diff --git a/jerry-core/lit/lit-char-helpers.cpp b/jerry-core/lit/lit-char-helpers.cpp
@@ -328,3 +328,71 @@ lit_char_is_word_char (ecma_char_t c) /**< code unit */
           || (c >= LIT_CHAR_ASCII_DIGITS_BEGIN && c <= LIT_CHAR_ASCII_DIGITS_END)
           || c == LIT_CHAR_UNDERSCORE);
 } /* lit_char_is_word_char */
+
+/**
+ * Returns the lowercase character sequence of an ecma character.
+ * Currently only A-Z and U+0130 are mapped; every other code unit is copied to the output unchanged.
+ * Note: output_buffer_p must be able to hold at least MAXIMUM_OTHERCASE_LENGTH characters.
+ *
+ * @return the length of the lowercase character sequence (number of code units written to output_buffer_p)
+ *         which is always between 1 and MAXIMUM_OTHERCASE_LENGTH.
+ */
+lit_utf8_size_t
+lit_char_to_lower_case (ecma_char_t character, ecma_char_t *output_buffer_p)
+{
+  TODO ("Needs a proper lower case implementation.");
+
+  if (character >= LIT_CHAR_UPPERCASE_A && character <= LIT_CHAR_UPPERCASE_Z)
+  {
+    output_buffer_p[0] = (ecma_char_t) (character + (LIT_CHAR_LOWERCASE_A - LIT_CHAR_UPPERCASE_A));
+    return 1;
+  }
+
+  if (character == 0x130) /* U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE */
+  {
+    output_buffer_p[0] = LIT_CHAR_LOWERCASE_I;
+    output_buffer_p[1] = 0x307; /* U+0307 COMBINING DOT ABOVE */
+    return 2;
+  }
+
+  output_buffer_p[0] = character; /* no mapping implemented: pass the code unit through */
+  return 1;
+} /* lit_char_to_lower_case */
+
+/**
+ * Returns the uppercase character sequence of an ecma character.
+ * Currently only a-z, U+00DF and U+1FD7 are mapped; every other code unit is copied to the output unchanged.
+ * Note: output_buffer_p must be able to hold at least MAXIMUM_OTHERCASE_LENGTH characters.
+ *
+ * @return the length of the uppercase character sequence (number of code units written to output_buffer_p)
+ *         which is always between 1 and MAXIMUM_OTHERCASE_LENGTH.
+ */
+lit_utf8_size_t
+lit_char_to_upper_case (ecma_char_t character, ecma_char_t *output_buffer_p)
+{
+  TODO ("Needs a proper upper case implementation.");
+
+  if (character >= LIT_CHAR_LOWERCASE_A && character <= LIT_CHAR_LOWERCASE_Z)
+  {
+    output_buffer_p[0] = (ecma_char_t) (character - (LIT_CHAR_LOWERCASE_A - LIT_CHAR_UPPERCASE_A));
+    return 1;
+  }
+
+  if (character == 0xdf) /* U+00DF LATIN SMALL LETTER SHARP S */
+  {
+    output_buffer_p[0] = LIT_CHAR_UPPERCASE_S;
+    output_buffer_p[1] = LIT_CHAR_UPPERCASE_S;
+    return 2;
+  }
+
+  if (character == 0x1fd7) /* U+1FD7 GREEK SMALL LETTER IOTA WITH DIALYTIKA AND PERISPOMENI */
+  {
+    output_buffer_p[0] = 0x399; /* U+0399 GREEK CAPITAL LETTER IOTA */
+    output_buffer_p[1] = 0x308; /* U+0308 COMBINING DIAERESIS */
+    output_buffer_p[2] = 0x342; /* U+0342 COMBINING GREEK PERISPOMENI */
+    return 3;
+  }
+
+  output_buffer_p[0] = character; /* no mapping implemented: pass the code unit through */
+  return 1;
+} /* lit_char_to_upper_case */
diff --git a/jerry-core/lit/lit-char-helpers.h b/jerry-core/lit/lit-char-helpers.h
@@ -220,4 +220,13 @@ extern uint32_t lit_char_hex_to_int (ecma_char_t);
  */
 extern bool lit_char_is_word_char (ecma_char_t);
 
+/*
+ * Utility functions for uppercasing / lowercasing a single code unit
+ */
+
+#define MAXIMUM_OTHERCASE_LENGTH (3) /* minimum capacity (in ecma_char_t) of the output buffers below */
+
+lit_utf8_size_t lit_char_to_lower_case (ecma_char_t, ecma_char_t *); /* returns the number of units written */
+lit_utf8_size_t lit_char_to_upper_case (ecma_char_t, ecma_char_t *); /* returns the number of units written */
+
 #endif /* LIT_CHAR_HELPERS_H */
diff --git a/jerry-core/lit/lit-strings.cpp b/jerry-core/lit/lit-strings.cpp
@@ -753,7 +753,7 @@ lit_code_point_to_utf8 (lit_code_point_t code_point, /**< code point */
     buf[3] = LIT_UTF8_EXTRA_BYTE_MARKER | fourth_byte_bits;
     return 4;
   }
-} /* lit_code_unit_to_utf8 */
+} /* lit_code_point_to_utf8 */
 
 /**
  * Convert surrogate pair to code point
diff --git a/tests/jerry/string-upper-lower-case-conversion.js b/tests/jerry/string-upper-lower-case-conversion.js
@@ -0,0 +1,55 @@
+// Copyright 2015 University of Szeged
+// Copyright 2015 Samsung Electronics Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Conversion of ASCII letters; digits and letters already in the target case are kept.
+
+assert ("0123456789abcdefghijklmnopqrstuvwxzyABCDEFGHIJKLMNOPQRSTUVWXYZ".toLowerCase()
+        == "0123456789abcdefghijklmnopqrstuvwxzyabcdefghijklmnopqrstuvwxyz");
+assert ("0123456789abcdefghijklmnopqrstuvwxzyABCDEFGHIJKLMNOPQRSTUVWXYZ".toUpperCase()
+        == "0123456789ABCDEFGHIJKLMNOPQRSTUVWXZYABCDEFGHIJKLMNOPQRSTUVWXYZ");
+
+assert ("\u0130".toLowerCase() == "i\u0307"); // one code unit expands to two
+assert ("\xdf".toUpperCase() == "SS"); // one code unit expands to two
+assert ("\u1fd7".toUpperCase() == "\u0399\u0308\u0342"); // one code unit expands to three
+
+assert ("H\u0130-+".toLowerCase() == "hi\u0307-+"); // expansion mixed with other characters
+assert ("\xdf\u1fd7\xdf".toUpperCase() == "SS\u0399\u0308\u0342SS"); // consecutive expansions
+assert ("\u0130\u0130\u0130".toLowerCase() == "i\u0307i\u0307i\u0307");
+
+// Although code points 0x10400 and 0x10428 are an upper/lowercase pair,
+// they must not be converted in JavaScript. Stray surrogates must be
+// left untouched as well.
+
+assert ("\ud801\ud801\udc00\udc00".toLowerCase() == "\ud801\ud801\udc00\udc00");
+assert ("\ud801\ud801\udc28\udc28".toUpperCase() == "\ud801\ud801\udc28\udc28");
+
+// Conversion of non-string this values: their string form is converted.
+
+assert (String.prototype.toUpperCase.call(true) == "TRUE");
+assert (String.prototype.toLowerCase.call(-23) == "-23");
+
+var object = { toString : function() { return "<sTr>"; } };
+assert (String.prototype.toUpperCase.call(object) == "<STR>");
+assert (String.prototype.toLowerCase.call(object) == "<str>");
+
+try
+{
+  String.prototype.toUpperCase.call(null); // expected to throw
+  assert(false); // reached only if the call above did not throw
+}
+catch (e)
+{
+  assert (e instanceof TypeError);
+}

Original file line number	Diff line number	Diff line change
`@@ -753,7 +753,7 @@ lit_code_point_to_utf8 (lit_code_point_t code_point, /**< code point */`
`753`	`753`	`buf[3] = LIT_UTF8_EXTRA_BYTE_MARKER \| fourth_byte_bits;`
`754`	`754`	`return 4;`
`755`	`755`	`}`
`756`		`-} /* lit_code_unit_to_utf8 */`
	`756`	`+} /* lit_code_point_to_utf8 */`
`757`	`757`
`758`	`758`	`/**`
`759`	`759`	`* Convert surrogate pair to code point`