jerryscript-project
diff --git a/‎jerry-core/ecma/base/ecma-globals.h
Lines changed: 2 additions & 2 deletions b/‎jerry-core/ecma/base/ecma-globals.h
Lines changed: 2 additions & 2 deletions
diff --git a/‎jerry-core/ecma/base/ecma-helpers-string.cpp
Lines changed: 15 additions & 11 deletions b/‎jerry-core/ecma/base/ecma-helpers-string.cpp
Lines changed: 15 additions & 11 deletions
diff --git a/‎jerry-core/ecma/base/ecma-helpers.h
Lines changed: 1 addition & 1 deletion b/‎jerry-core/ecma/base/ecma-helpers.h
Lines changed: 1 addition & 1 deletion
diff --git a/‎jerry-core/ecma/base/ecma-init-finalize.cpp
Lines changed: 0 additions & 1 deletion b/‎jerry-core/ecma/base/ecma-init-finalize.cpp
Lines changed: 0 additions & 1 deletion
diff --git a/‎jerry-core/ecma/base/ecma-lcache.cpp
Lines changed: 4 additions & 4 deletions b/‎jerry-core/ecma/base/ecma-lcache.cpp
Lines changed: 4 additions & 4 deletions
diff --git a/‎jerry-core/ecma/builtin-objects/ecma-builtin-global.cpp
Lines changed: 10 additions & 13 deletions b/‎jerry-core/ecma/builtin-objects/ecma-builtin-global.cpp
Lines changed: 10 additions & 13 deletions
diff --git a/‎jerry-core/ecma/builtin-objects/ecma-builtin-string.cpp
Lines changed: 7 additions & 6 deletions b/‎jerry-core/ecma/builtin-objects/ecma-builtin-string.cpp
Lines changed: 7 additions & 6 deletions
diff --git a/‎jerry-core/lit/lit-globals.h
Lines changed: 54 additions & 5 deletions b/‎jerry-core/lit/lit-globals.h
Lines changed: 54 additions & 5 deletions
diff --git a/‎jerry-core/lit/lit-literal-storage.h
Lines changed: 3 additions & 4 deletions b/‎jerry-core/lit/lit-literal-storage.h
Lines changed: 3 additions & 4 deletions
@@ -712,7 +712,7 @@ typedef struct
   mem_cpointer_t next_chunk_cp;
 
   /** Characters */
-  uint8_t data[ sizeof (uint64_t) - sizeof (mem_cpointer_t) ];
+  lit_utf8_byte_t data[ sizeof (uint64_t) - sizeof (mem_cpointer_t) ];
 } ecma_collection_chunk_t;
 
 /**
@@ -755,7 +755,7 @@ typedef struct ecma_string_t
   uint8_t container;
 
   /** Hash of the string (calculated from two last characters of the string) */
-  ecma_string_hash_t hash;
+  lit_string_hash_t hash;
 
   /**
    * Actual data or identifier of it's place in container (depending on 'container' field)
 
@@ -102,6 +102,10 @@ ecma_new_chars_collection (const lit_utf8_byte_t chars_buffer[], /**< utf-8 char
 /**
  * Get length of a collection of ecma-chars
  *
+ * NOTE:
+ *   While chars collection holds a string in utf-8 encoding, this function acts as if the string was encoded in
+ *   UTF-16 and returns number of 16-bit characters (code units) required for string representation in this format.
+ *
  * @return number of UTF-16 code units in a collecton
  */
 static ecma_length_t
@@ -151,7 +155,7 @@ ecma_get_chars_collection_length (const ecma_collection_header_t *header_p) /**<
   JERRY_ASSERT (char_index == chars_number);
 
   return length;
-} /* ecma_compare_chars_collection */
+} /* ecma_get_chars_collection_length */
 
 /**
  * Compare two collection of ecma-chars.
@@ -446,7 +450,7 @@ ecma_new_ecma_string_from_utf8 (const lit_utf8_byte_t *string_p, /**< utf-8 stri
 ecma_string_t *
 ecma_new_ecma_string_from_code_unit (ecma_char_t code_unit) /**< code unit */
 {
-  lit_utf8_byte_t lit_utf8_bytes[MAX_BYTES_IN_CODE_UNIT];
+  lit_utf8_byte_t lit_utf8_bytes[LIT_UTF8_MAX_BYTES_IN_CODE_UNIT];
   lit_utf8_size_t bytes_size = lit_code_unit_to_utf8 (code_unit, lit_utf8_bytes);
 
   return ecma_new_ecma_string_from_utf8 (lit_utf8_bytes, bytes_size);
@@ -472,7 +476,7 @@ ecma_new_ecma_string_from_uint32 (uint32_t uint32_number) /**< UInt32-represente
   FIXME (/* Use digit to char conversion routine */);
   const lit_utf8_byte_t digits[10] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' };
   const bool is_one_char_or_more = (uint32_number >= 10);
-  const lit_utf8_byte_t last_chars[ECMA_STRING_HASH_LAST_CHARS_COUNT] =
+  const lit_utf8_byte_t last_chars[LIT_STRING_HASH_LAST_BYTES_COUNT] =
   {
     is_one_char_or_more ? digits[digit_pl] : digits[digit_l],
     is_one_char_or_more ? digits[digit_l] : (lit_utf8_byte_t) '\0'
@@ -657,22 +661,22 @@ ecma_concat_ecma_strings (ecma_string_t *string1_p, /**< first ecma-string */
   ECMA_SET_NON_NULL_POINTER (string_desc_p->u.concatenation.string1_cp, string1_p);
   ECMA_SET_NON_NULL_POINTER (string_desc_p->u.concatenation.string2_cp, string2_p);
 
-  if (str2_size >= ECMA_STRING_HASH_LAST_CHARS_COUNT)
+  if (str2_size >= LIT_STRING_HASH_LAST_BYTES_COUNT)
   {
     string_desc_p->hash = string2_p->hash;
   }
   else
   {
-    JERRY_STATIC_ASSERT (ECMA_STRING_HASH_LAST_CHARS_COUNT == 2);
+    JERRY_STATIC_ASSERT (LIT_STRING_HASH_LAST_BYTES_COUNT == 2);
     JERRY_ASSERT (str2_size == 1);
 
-    lit_utf8_byte_t bytes_buf[ECMA_STRING_HASH_LAST_CHARS_COUNT] =
+    lit_utf8_byte_t bytes_buf[LIT_STRING_HASH_LAST_BYTES_COUNT] =
     {
       ecma_string_get_byte_at_pos (string1_p, str1_size - 1u),
       ecma_string_get_byte_at_pos (string2_p, 0)
     };
 
-    string_desc_p->hash = lit_utf8_string_calc_hash_last_bytes (bytes_buf, ECMA_STRING_HASH_LAST_CHARS_COUNT);
+    string_desc_p->hash = lit_utf8_string_calc_hash_last_bytes (bytes_buf, LIT_STRING_HASH_LAST_BYTES_COUNT);
   }
 
   return string_desc_p;
@@ -1465,7 +1469,7 @@ ecma_string_get_size (const ecma_string_t *string_p) /**< ecma-string */
   {
     const uint32_t uint32_number = string_p->u.uint32_number;
     const int32_t max_uint32_len = 10;
-    const uint32_t nums_with_ascending_length[10] =
+    const uint32_t nums_with_ascending_length[max_uint32_len] =
     {
       1u,
       10u,
@@ -1717,7 +1721,7 @@ ecma_is_ex_string_magic (const ecma_string_t *string_p, /**< ecma-string */
  *
  * @return calculated hash
  */
-ecma_string_hash_t
+lit_string_hash_t
 ecma_string_hash (const ecma_string_t *string_p) /**< ecma-string to calculate hash for */
 
 {
@@ -1741,7 +1745,7 @@ ecma_string_substr (const ecma_string_t *string_p, /**< pointer to an ecma strin
 #endif
 
   const ecma_length_t span = (start_pos > end_pos) ? 0 : end_pos - start_pos;
-  const lit_utf8_size_t utf8_str_size = MAX_BYTES_IN_CODE_UNIT * span;
+  const lit_utf8_size_t utf8_str_size = LIT_UTF8_MAX_BYTES_IN_CODE_UNIT * span;
 
   if (utf8_str_size)
   {
@@ -1765,7 +1769,7 @@ ecma_string_substr (const ecma_string_t *string_p, /**< pointer to an ecma strin
     {
       ecma_char_t code_unit = lit_utf8_string_code_unit_at (utf8_str_p, buffer_size, start_pos + idx);
 
-      JERRY_ASSERT (utf8_str_size >= utf8_substr_buffer_offset + MAX_BYTES_IN_CODE_UNIT);
+      JERRY_ASSERT (utf8_str_size >= utf8_substr_buffer_offset + LIT_UTF8_MAX_BYTES_IN_CODE_UNIT);
       utf8_substr_buffer_offset += lit_code_unit_to_utf8 (code_unit, utf8_substr_buffer + utf8_substr_buffer_offset);
     }
 
 
@@ -146,7 +146,7 @@ extern ecma_string_t* ecma_get_magic_string_ex (lit_magic_string_ex_id_t id);
 extern bool ecma_is_string_magic (const ecma_string_t *string_p, lit_magic_string_id_t *out_id_p);
 extern bool ecma_is_ex_string_magic (const ecma_string_t *string_p, lit_magic_string_ex_id_t *out_id_p);
 
-extern ecma_string_hash_t ecma_string_hash (const ecma_string_t *string_p);
+extern lit_string_hash_t ecma_string_hash (const ecma_string_t *string_p);
 extern ecma_string_t *ecma_string_substr (const ecma_string_t *string_p, ecma_length_t, ecma_length_t);
 
 /* ecma-helpers-number.c */
 
@@ -20,7 +20,6 @@
 #include "ecma-lcache.h"
 #include "ecma-lex-env.h"
 #include "ecma-stack.h"
-#include "lit-magic-strings.h"
 #include "mem-allocator.h"
 
 /** \addtogroup ecma ECMA
 
@@ -50,7 +50,7 @@ JERRY_STATIC_ASSERT (sizeof (ecma_lcache_hash_entry_t) == sizeof (uint64_t));
 /**
  * LCache hash value length, in bits
  */
-#define ECMA_LCACHE_HASH_BITS (sizeof (ecma_string_hash_t) * JERRY_BITSINBYTE)
+#define ECMA_LCACHE_HASH_BITS (sizeof (lit_string_hash_t) * JERRY_BITSINBYTE)
 
 /**
  * Number of rows in LCache's hash table
@@ -164,7 +164,7 @@ ecma_lcache_insert (ecma_object_t *object_p, /**< object */
 #ifndef CONFIG_ECMA_LCACHE_DISABLE
   prop_name_p = ecma_copy_or_ref_ecma_string (prop_name_p);
 
-  ecma_string_hash_t hash_key = ecma_string_hash (prop_name_p);
+  lit_string_hash_t hash_key = ecma_string_hash (prop_name_p);
 
   if (prop_p != NULL)
   {
@@ -243,7 +243,7 @@ ecma_lcache_lookup (ecma_object_t *object_p, /**< object */
                                                  *         then the output parameter is not set */
 {
 #ifndef CONFIG_ECMA_LCACHE_DISABLE
-  ecma_string_hash_t hash_key = ecma_string_hash (prop_name_p);
+  lit_string_hash_t hash_key = ecma_string_hash (prop_name_p);
 
   unsigned int object_cp;
   ECMA_SET_NON_NULL_POINTER (object_cp, object_p);
@@ -333,7 +333,7 @@ ecma_lcache_invalidate (ecma_object_t *object_p, /**< object */
   ECMA_SET_NON_NULL_POINTER (object_cp, object_p);
   ECMA_SET_POINTER (prop_cp, prop_p);
 
-  ecma_string_hash_t hash_key = ecma_string_hash (prop_name_p);
+  lit_string_hash_t hash_key = ecma_string_hash (prop_name_p);
 
   /* Property's name has was computed.
    * Given (object, property name) pair should be in the row corresponding to computed hash.
 
@@ -580,18 +580,16 @@ ecma_builtin_global_object_decode_uri_helper (ecma_value_t uri __attr_unused___,
   lit_utf8_size_t input_size = ecma_string_get_size (input_string_p);
 
   MEM_DEFINE_LOCAL_ARRAY (input_start_p,
-                          input_size + 1,
+                          input_size,
                           lit_utf8_byte_t);
 
-  input_start_p[input_size] = LIT_BYTE_NULL;
-
   ecma_string_to_utf8_string (input_string_p,
                               input_start_p,
                               (ssize_t) (input_size));
 
   lit_utf8_byte_t *input_char_p = input_start_p;
   lit_utf8_byte_t *input_end_p = input_start_p + input_size;
-  lit_utf8_size_t output_size = 1;
+  lit_utf8_size_t output_size = 0;
 
   /*
    * The URI decoding has two major phases: first we validate the input,
@@ -780,10 +778,9 @@ ecma_builtin_global_object_decode_uri_helper (ecma_value_t uri __attr_unused___,
       }
     }
 
-    *output_char_p = '\0';
-    JERRY_ASSERT (output_start_p + output_size == output_char_p + 1);
+    JERRY_ASSERT (output_start_p + output_size == output_char_p);
 
-    ecma_string_t *output_string_p = ecma_new_ecma_string_from_utf8 (output_start_p, output_size - 1);
+    ecma_string_t *output_string_p = ecma_new_ecma_string_from_utf8 (output_start_p, output_size);
 
     ret_value = ecma_make_normal_completion_value (ecma_make_string_value (output_string_p));
 
@@ -881,12 +878,12 @@ ecma_builtin_global_object_encode_uri_helper (ecma_value_t uri, /**< uri argumen
    * and compute the length of the output, then we encode the input.
    */
 
-  lit_utf8_buffer_iterator iter = lit_create_utf8_buffer_iterator (input_start_p, input_size);
+  lit_utf8_iterator iter = lit_utf8_iterator_create (input_start_p, input_size);
   lit_utf8_size_t output_length = 1;
-  while (!lit_reached_buffer_end (&iter))
+  while (!lit_utf8_iterator_reached_buffer_end (&iter))
   {
     /* Input validation. */
-    lit_code_point character = lit_next_code_unit_from_buffer (&iter);
+    lit_code_point_t character = lit_utf8_iterator_read_code_unit_and_increment (&iter);
 
     if (character <= 0x7f)
     {
@@ -932,12 +929,12 @@ ecma_builtin_global_object_encode_uri_helper (ecma_value_t uri, /**< uri argumen
                             output_length,
                             lit_utf8_byte_t);
 
-    lit_utf8_buffer_iterator iter = lit_create_utf8_buffer_iterator (input_start_p, input_size);
+    lit_utf8_iterator iter = lit_utf8_iterator_create (input_start_p, input_size);
     lit_utf8_byte_t *output_char_p = output_start_p;
-    while (!lit_reached_buffer_end (&iter))
+    while (!lit_utf8_iterator_reached_buffer_end (&iter))
     {
       /* Input decode. */
-      lit_code_point character = lit_next_code_unit_from_buffer (&iter);
+      lit_code_point_t character = lit_utf8_iterator_read_code_unit_and_increment (&iter);
 
       if (character <= 0x7f)
       {
 
@@ -66,11 +66,12 @@ ecma_builtin_string_object_from_char_code (ecma_value_t this_arg __attr_unused__
     return ecma_make_normal_completion_value (ecma_make_string_value (ret_str_p));
   }
 
-  lit_utf8_size_t utf8_buf_size = args_number * MAX_BYTES_IN_CODE_UNIT;
+  lit_utf8_size_t utf8_buf_size = args_number * LIT_UTF8_MAX_BYTES_IN_CODE_UNIT;
+  ecma_string_t *ret_str_p;
+  MEM_DEFINE_LOCAL_ARRAY (utf8_buf_p, utf8_buf_size, lit_utf8_byte_t);
+
   lit_utf8_size_t utf8_buf_used = 0;
 
-  lit_utf8_byte_t *utf8_buf_p = (lit_utf8_byte_t*) mem_heap_alloc_block (utf8_buf_size,
-                                                                         MEM_HEAP_ALLOC_SHORT_TERM);
   FIXME ("Support surrogate pairs");
   for (ecma_length_t arg_index = 0;
        arg_index < args_number;
@@ -81,7 +82,7 @@ ecma_builtin_string_object_from_char_code (ecma_value_t this_arg __attr_unused__
     uint32_t uint32_char_code = ecma_number_to_uint32 (arg_num);
     ecma_char_t code_unit = (uint16_t) uint32_char_code;
 
-    JERRY_ASSERT (utf8_buf_used <= utf8_buf_size - MAX_BYTES_IN_CODE_UNIT);
+    JERRY_ASSERT (utf8_buf_used <= utf8_buf_size - LIT_UTF8_MAX_BYTES_IN_CODE_UNIT);
     utf8_buf_used += lit_code_unit_to_utf8 (code_unit, utf8_buf_p + utf8_buf_used);
     JERRY_ASSERT (utf8_buf_used <= utf8_buf_size);
 
@@ -97,9 +98,9 @@ ecma_builtin_string_object_from_char_code (ecma_value_t this_arg __attr_unused__
     JERRY_ASSERT (ecma_is_completion_value_empty (ret_value));
   }
 
-  ecma_string_t *ret_str_p = ecma_new_ecma_string_from_utf8 (utf8_buf_p, utf8_buf_used);
+  ret_str_p = ecma_new_ecma_string_from_utf8 (utf8_buf_p, utf8_buf_used);
 
-  mem_heap_free_block (utf8_buf_p);
+  MEM_FINALIZE_LOCAL_ARRAY (utf8_buf_p);
 
   return ecma_make_normal_completion_value (ecma_make_string_value (ret_str_p));
 } /* ecma_builtin_string_object_from_char_code */
 
@@ -18,14 +18,63 @@
 
 #include "jrt.h"
 
+/**
+ * The ECMAScript standard defines the terms "code unit" and "character" as a 16-bit unsigned value
+ * used to represent a 16-bit unit of text; this is the same as a code unit in UTF-16 (See ECMA-262 5.1 Chapter 6).
+ *
+ * The term "code point" or "Unicode character" is used to refer to a single Unicode scalar value (may be longer
+ * than 16 bits: 0x0 - 0x10FFFF). One code point could be represented with one or two 16-bit code units.
+ *
+ * According to the standard all strings and source text are assumed to be a sequence of code units.
+ * Length of a string equals to number of code units in the string, which is not the same as number of Unicode
+ * characters in a string.
+ *
+ * Internally JerryScript engine uses UTF-8 representation of strings to reduce memory overhead. Unicode character
+ * occupies from one to four bytes in UTF-8 representation.
+ *
+ * Unicode scalar value   | Bytes in UTF-8             | Bytes in UTF-16
+ *                        | (internal representation)  |
+ * ----------------------------------------------------------------------
+ *  0x0     - 0x7F        |  1 byte                    |  2 bytes
+ *  0x80    - 0x7FF       |  2 bytes                   |  2 bytes
+ *  0x800   - 0xFFFF      |  3 bytes                   |  2 bytes
+ *  0x10000 - 0x10FFFF    |  4 bytes                   |  4 bytes
+ *
+ * Scalar values from 0xD800 to 0xDFFF are permanently reserved by Unicode standard to encode high and low
+ * surrogates in UTF-16 (Code points 0x10000 - 0x10FFFF are encoded via pair of surrogates in UTF-16).
+ * Despite that the official Unicode standard says that no UTF forms can encode these code points, we allow
+ * them to be encoded inside strings. The reason for that is compatibility with ECMA standard.
+ *
+ * For example, assume a string which consists of one Unicode character: 0x1D700 (Mathematical Italic Small Epsilon).
+ * It has the following representation in UTF-16: 0xD835 0xDF00.
+ *
+ * ECMA standard allows extracting a substring from this string:
+ * > var str = String.fromCharCode (0xD835, 0xDF00); // Create a string containing one character: 0x1D700
+ * > str.length; // 2
+ * > var str1 = str.substring (0, 1);
+ * > str1.length; // 1
+ * > str1.charCodeAt (0); // 55349 (this equals to 0xD835)
+ *
+ * Internally original string would be represented in UTF-8 as the following byte sequence: 0xF0 0x9D 0x9C 0x80.
+ * After substring extraction high surrogate 0xD835 should be encoded via UTF-8: 0xED 0xA0 0xB5.
+ *
+ * A pair of high and low surrogates encoded separately should never occur in the internal string representation;
+ * it should be encoded as a single code point and occupy 4 bytes. So, when constructing a string from two surrogates,
+ * it should be processed gracefully:
+ * > var str1 = String.fromCharCode (0xD835); // 0xED 0xA0 0xB5 - internal representation
+ * > var str2 = String.fromCharCode (0xDF00); // 0xED 0xBC 0x80 - internal representation
+ * > var str = str1 + str2; // 0xF0 0x9D 0x9C 0x80 - internal representation,
+ *                          // !!! not 0xED 0xA0 0xB5 0xED 0xBC 0x80
+ */
+
 /**
  * Description of an ecma-character, which represents 16-bit code unit,
  * which is equal to UTF-16 character (see Chapter 6 from ECMA-262 5.1)
  */
 typedef uint16_t ecma_char_t;
 
 /**
- * Null character (zt-string end marker)
+ * Null character
  */
 #define ECMA_CHAR_NULL  ((ecma_char_t) '\0')
 
@@ -42,7 +91,7 @@ typedef ecma_char_t *ecma_char_ptr_t;
 /**
  * Max bytes needed to represent a code unit (utf-16 char) via utf-8 encoding
  */
-#define MAX_BYTES_IN_CODE_UNIT (3)
+#define LIT_UTF8_MAX_BYTES_IN_CODE_UNIT (3)
 
 /**
  * A byte of utf-8 string
@@ -57,16 +106,16 @@ typedef uint32_t lit_utf8_size_t;
 /**
  * Unicode code point
  */
-typedef uint32_t lit_code_point;
+typedef uint32_t lit_code_point_t;
 
 /**
  * ECMA string hash
  */
-typedef uint8_t ecma_string_hash_t;
+typedef uint8_t lit_string_hash_t;
 
 /**
  * Number of string's last characters to use for hash calculation
  */
-#define ECMA_STRING_HASH_LAST_CHARS_COUNT (2)
+#define LIT_STRING_HASH_LAST_BYTES_COUNT (2)
 
 #endif /* LIT_GLOBALS_H */
@@ -97,10 +97,10 @@ class lit_charset_record_t : public rcs_record_t
    *
    * @return hash value of the string (the value of the 'hash' field in the header)
    */
-  ecma_string_hash_t
+  lit_string_hash_t
   get_hash () const
   {
-    return (ecma_string_hash_t) get_field (_hash_field_pos, _hash_field_width);
+    return (lit_string_hash_t) get_field (_hash_field_pos, _hash_field_width);
   } /* get_hash */
 
   /**
@@ -149,7 +149,7 @@ class lit_charset_record_t : public rcs_record_t
    * Set record's hash (the value of the 'hash' field in the header)
    */
   void
-  set_hash (ecma_string_hash_t hash) /**< hash value */
+  set_hash (lit_string_hash_t hash) /**< hash value */
   {
     set_field (_hash_field_pos, _hash_field_width, hash);
   } /* set_hash */
@@ -241,7 +241,6 @@ class lit_magic_record_t : public rcs_record_t
   magic_string_id_t get_magic_str_id () const
   {
     uint32_t id = get_field (magic_field_pos, magic_field_width);
-    // JERRY_ASSERT (id < LIT_MAGIC_STRING__COUNT);
     return (magic_string_id_t) id;
   } /* get_magic_str_id */