Skip to content

Commit 2939973

Browse files
committed
Fix notes.
JerryScript-DCO-1.0-Signed-off-by: Andrey Shitov [email protected]
1 parent 627c8e5 commit 2939973

14 files changed

+340
-213
lines changed

jerry-core/ecma/base/ecma-globals.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -712,7 +712,7 @@ typedef struct
712712
mem_cpointer_t next_chunk_cp;
713713

714714
/** Characters */
715-
uint8_t data[ sizeof (uint64_t) - sizeof (mem_cpointer_t) ];
715+
lit_utf8_byte_t data[ sizeof (uint64_t) - sizeof (mem_cpointer_t) ];
716716
} ecma_collection_chunk_t;
717717

718718
/**
@@ -755,7 +755,7 @@ typedef struct ecma_string_t
755755
uint8_t container;
756756

757757
/** Hash of the string (calculated from the last two bytes of the string) */
758-
ecma_string_hash_t hash;
758+
lit_string_hash_t hash;
759759

760760
/**
761761
* Actual data or identifier of its place in container (depending on 'container' field)

jerry-core/ecma/base/ecma-helpers-string.cpp

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,10 @@ ecma_new_chars_collection (const lit_utf8_byte_t chars_buffer[], /**< utf-8 char
102102
/**
103103
* Get length of a collection of ecma-chars
104104
*
105+
* NOTE:
106+
* While chars collection holds a string in utf-8 encoding, this function acts as if the string was encoded in
107+
* UTF-16 and returns number of 16-bit characters (code units) required for string representation in this format.
108+
*
105109
* @return number of UTF-16 code units in a collection
106110
*/
107111
static ecma_length_t
@@ -151,7 +155,7 @@ ecma_get_chars_collection_length (const ecma_collection_header_t *header_p) /**<
151155
JERRY_ASSERT (char_index == chars_number);
152156

153157
return length;
154-
} /* ecma_compare_chars_collection */
158+
} /* ecma_get_chars_collection_length */
155159

156160
/**
157161
* Compare two collection of ecma-chars.
@@ -446,7 +450,7 @@ ecma_new_ecma_string_from_utf8 (const lit_utf8_byte_t *string_p, /**< utf-8 stri
446450
ecma_string_t *
447451
ecma_new_ecma_string_from_code_unit (ecma_char_t code_unit) /**< code unit */
448452
{
449-
lit_utf8_byte_t lit_utf8_bytes[MAX_BYTES_IN_CODE_UNIT];
453+
lit_utf8_byte_t lit_utf8_bytes[LIT_UTF8_MAX_BYTES_IN_CODE_UNIT];
450454
lit_utf8_size_t bytes_size = lit_code_unit_to_utf8 (code_unit, lit_utf8_bytes);
451455

452456
return ecma_new_ecma_string_from_utf8 (lit_utf8_bytes, bytes_size);
@@ -472,7 +476,7 @@ ecma_new_ecma_string_from_uint32 (uint32_t uint32_number) /**< UInt32-represente
472476
FIXME (/* Use digit to char conversion routine */);
473477
const lit_utf8_byte_t digits[10] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' };
474478
const bool is_one_char_or_more = (uint32_number >= 10);
475-
const lit_utf8_byte_t last_chars[ECMA_STRING_HASH_LAST_CHARS_COUNT] =
479+
const lit_utf8_byte_t last_chars[LIT_STRING_HASH_LAST_BYTES_COUNT] =
476480
{
477481
is_one_char_or_more ? digits[digit_pl] : digits[digit_l],
478482
is_one_char_or_more ? digits[digit_l] : (lit_utf8_byte_t) '\0'
@@ -657,22 +661,22 @@ ecma_concat_ecma_strings (ecma_string_t *string1_p, /**< first ecma-string */
657661
ECMA_SET_NON_NULL_POINTER (string_desc_p->u.concatenation.string1_cp, string1_p);
658662
ECMA_SET_NON_NULL_POINTER (string_desc_p->u.concatenation.string2_cp, string2_p);
659663

660-
if (str2_size >= ECMA_STRING_HASH_LAST_CHARS_COUNT)
664+
if (str2_size >= LIT_STRING_HASH_LAST_BYTES_COUNT)
661665
{
662666
string_desc_p->hash = string2_p->hash;
663667
}
664668
else
665669
{
666-
JERRY_STATIC_ASSERT (ECMA_STRING_HASH_LAST_CHARS_COUNT == 2);
670+
JERRY_STATIC_ASSERT (LIT_STRING_HASH_LAST_BYTES_COUNT == 2);
667671
JERRY_ASSERT (str2_size == 1);
668672

669-
lit_utf8_byte_t bytes_buf[ECMA_STRING_HASH_LAST_CHARS_COUNT] =
673+
lit_utf8_byte_t bytes_buf[LIT_STRING_HASH_LAST_BYTES_COUNT] =
670674
{
671675
ecma_string_get_byte_at_pos (string1_p, str1_size - 1u),
672676
ecma_string_get_byte_at_pos (string2_p, 0)
673677
};
674678

675-
string_desc_p->hash = lit_utf8_string_calc_hash_last_bytes (bytes_buf, ECMA_STRING_HASH_LAST_CHARS_COUNT);
679+
string_desc_p->hash = lit_utf8_string_calc_hash_last_bytes (bytes_buf, LIT_STRING_HASH_LAST_BYTES_COUNT);
676680
}
677681

678682
return string_desc_p;
@@ -1465,7 +1469,7 @@ ecma_string_get_size (const ecma_string_t *string_p) /**< ecma-string */
14651469
{
14661470
const uint32_t uint32_number = string_p->u.uint32_number;
14671471
const int32_t max_uint32_len = 10;
1468-
const uint32_t nums_with_ascending_length[10] =
1472+
const uint32_t nums_with_ascending_length[max_uint32_len] =
14691473
{
14701474
1u,
14711475
10u,
@@ -1717,7 +1721,7 @@ ecma_is_ex_string_magic (const ecma_string_t *string_p, /**< ecma-string */
17171721
*
17181722
* @return calculated hash
17191723
*/
1720-
ecma_string_hash_t
1724+
lit_string_hash_t
17211725
ecma_string_hash (const ecma_string_t *string_p) /**< ecma-string to calculate hash for */
17221726

17231727
{
@@ -1741,7 +1745,7 @@ ecma_string_substr (const ecma_string_t *string_p, /**< pointer to an ecma strin
17411745
#endif
17421746

17431747
const ecma_length_t span = (start_pos > end_pos) ? 0 : end_pos - start_pos;
1744-
const lit_utf8_size_t utf8_str_size = MAX_BYTES_IN_CODE_UNIT * span;
1748+
const lit_utf8_size_t utf8_str_size = LIT_UTF8_MAX_BYTES_IN_CODE_UNIT * span;
17451749

17461750
if (utf8_str_size)
17471751
{
@@ -1765,7 +1769,7 @@ ecma_string_substr (const ecma_string_t *string_p, /**< pointer to an ecma strin
17651769
{
17661770
ecma_char_t code_unit = lit_utf8_string_code_unit_at (utf8_str_p, buffer_size, start_pos + idx);
17671771

1768-
JERRY_ASSERT (utf8_str_size >= utf8_substr_buffer_offset + MAX_BYTES_IN_CODE_UNIT);
1772+
JERRY_ASSERT (utf8_str_size >= utf8_substr_buffer_offset + LIT_UTF8_MAX_BYTES_IN_CODE_UNIT);
17691773
utf8_substr_buffer_offset += lit_code_unit_to_utf8 (code_unit, utf8_substr_buffer + utf8_substr_buffer_offset);
17701774
}
17711775

jerry-core/ecma/base/ecma-helpers.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,7 @@ extern ecma_string_t* ecma_get_magic_string_ex (lit_magic_string_ex_id_t id);
146146
extern bool ecma_is_string_magic (const ecma_string_t *string_p, lit_magic_string_id_t *out_id_p);
147147
extern bool ecma_is_ex_string_magic (const ecma_string_t *string_p, lit_magic_string_ex_id_t *out_id_p);
148148

149-
extern ecma_string_hash_t ecma_string_hash (const ecma_string_t *string_p);
149+
extern lit_string_hash_t ecma_string_hash (const ecma_string_t *string_p);
150150
extern ecma_string_t *ecma_string_substr (const ecma_string_t *string_p, ecma_length_t, ecma_length_t);
151151

152152
/* ecma-helpers-number.c */

jerry-core/ecma/base/ecma-init-finalize.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020
#include "ecma-lcache.h"
2121
#include "ecma-lex-env.h"
2222
#include "ecma-stack.h"
23-
#include "lit-magic-strings.h"
2423
#include "mem-allocator.h"
2524

2625
/** \addtogroup ecma ECMA

jerry-core/ecma/base/ecma-lcache.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ JERRY_STATIC_ASSERT (sizeof (ecma_lcache_hash_entry_t) == sizeof (uint64_t));
5050
/**
5151
* LCache hash value length, in bits
5252
*/
53-
#define ECMA_LCACHE_HASH_BITS (sizeof (ecma_string_hash_t) * JERRY_BITSINBYTE)
53+
#define ECMA_LCACHE_HASH_BITS (sizeof (lit_string_hash_t) * JERRY_BITSINBYTE)
5454

5555
/**
5656
* Number of rows in LCache's hash table
@@ -164,7 +164,7 @@ ecma_lcache_insert (ecma_object_t *object_p, /**< object */
164164
#ifndef CONFIG_ECMA_LCACHE_DISABLE
165165
prop_name_p = ecma_copy_or_ref_ecma_string (prop_name_p);
166166

167-
ecma_string_hash_t hash_key = ecma_string_hash (prop_name_p);
167+
lit_string_hash_t hash_key = ecma_string_hash (prop_name_p);
168168

169169
if (prop_p != NULL)
170170
{
@@ -243,7 +243,7 @@ ecma_lcache_lookup (ecma_object_t *object_p, /**< object */
243243
* then the output parameter is not set */
244244
{
245245
#ifndef CONFIG_ECMA_LCACHE_DISABLE
246-
ecma_string_hash_t hash_key = ecma_string_hash (prop_name_p);
246+
lit_string_hash_t hash_key = ecma_string_hash (prop_name_p);
247247

248248
unsigned int object_cp;
249249
ECMA_SET_NON_NULL_POINTER (object_cp, object_p);
@@ -333,7 +333,7 @@ ecma_lcache_invalidate (ecma_object_t *object_p, /**< object */
333333
ECMA_SET_NON_NULL_POINTER (object_cp, object_p);
334334
ECMA_SET_POINTER (prop_cp, prop_p);
335335

336-
ecma_string_hash_t hash_key = ecma_string_hash (prop_name_p);
336+
lit_string_hash_t hash_key = ecma_string_hash (prop_name_p);
337337

338338
/* Property's name hash was computed.
339339
* Given (object, property name) pair should be in the row corresponding to computed hash.

jerry-core/ecma/builtin-objects/ecma-builtin-global.cpp

Lines changed: 10 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -580,18 +580,16 @@ ecma_builtin_global_object_decode_uri_helper (ecma_value_t uri __attr_unused___,
580580
lit_utf8_size_t input_size = ecma_string_get_size (input_string_p);
581581

582582
MEM_DEFINE_LOCAL_ARRAY (input_start_p,
583-
input_size + 1,
583+
input_size,
584584
lit_utf8_byte_t);
585585

586-
input_start_p[input_size] = LIT_BYTE_NULL;
587-
588586
ecma_string_to_utf8_string (input_string_p,
589587
input_start_p,
590588
(ssize_t) (input_size));
591589

592590
lit_utf8_byte_t *input_char_p = input_start_p;
593591
lit_utf8_byte_t *input_end_p = input_start_p + input_size;
594-
lit_utf8_size_t output_size = 1;
592+
lit_utf8_size_t output_size = 0;
595593

596594
/*
597595
* The URI decoding has two major phases: first we validate the input,
@@ -780,10 +778,9 @@ ecma_builtin_global_object_decode_uri_helper (ecma_value_t uri __attr_unused___,
780778
}
781779
}
782780

783-
*output_char_p = '\0';
784-
JERRY_ASSERT (output_start_p + output_size == output_char_p + 1);
781+
JERRY_ASSERT (output_start_p + output_size == output_char_p);
785782

786-
ecma_string_t *output_string_p = ecma_new_ecma_string_from_utf8 (output_start_p, output_size - 1);
783+
ecma_string_t *output_string_p = ecma_new_ecma_string_from_utf8 (output_start_p, output_size);
787784

788785
ret_value = ecma_make_normal_completion_value (ecma_make_string_value (output_string_p));
789786

@@ -881,12 +878,12 @@ ecma_builtin_global_object_encode_uri_helper (ecma_value_t uri, /**< uri argumen
881878
* and compute the length of the output, then we encode the input.
882879
*/
883880

884-
lit_utf8_buffer_iterator iter = lit_create_utf8_buffer_iterator (input_start_p, input_size);
881+
lit_utf8_iterator iter = lit_utf8_iterator_create (input_start_p, input_size);
885882
lit_utf8_size_t output_length = 1;
886-
while (!lit_reached_buffer_end (&iter))
883+
while (!lit_utf8_iterator_reached_buffer_end (&iter))
887884
{
888885
/* Input validation. */
889-
lit_code_point character = lit_next_code_unit_from_buffer (&iter);
886+
lit_code_point_t character = lit_utf8_iterator_read_code_unit_and_increment (&iter);
890887

891888
if (character <= 0x7f)
892889
{
@@ -932,12 +929,12 @@ ecma_builtin_global_object_encode_uri_helper (ecma_value_t uri, /**< uri argumen
932929
output_length,
933930
lit_utf8_byte_t);
934931

935-
lit_utf8_buffer_iterator iter = lit_create_utf8_buffer_iterator (input_start_p, input_size);
932+
lit_utf8_iterator iter = lit_utf8_iterator_create (input_start_p, input_size);
936933
lit_utf8_byte_t *output_char_p = output_start_p;
937-
while (!lit_reached_buffer_end (&iter))
934+
while (!lit_utf8_iterator_reached_buffer_end (&iter))
938935
{
939936
/* Input decode. */
940-
lit_code_point character = lit_next_code_unit_from_buffer (&iter);
937+
lit_code_point_t character = lit_utf8_iterator_read_code_unit_and_increment (&iter);
941938

942939
if (character <= 0x7f)
943940
{

jerry-core/ecma/builtin-objects/ecma-builtin-string.cpp

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -66,11 +66,12 @@ ecma_builtin_string_object_from_char_code (ecma_value_t this_arg __attr_unused__
6666
return ecma_make_normal_completion_value (ecma_make_string_value (ret_str_p));
6767
}
6868

69-
lit_utf8_size_t utf8_buf_size = args_number * MAX_BYTES_IN_CODE_UNIT;
69+
lit_utf8_size_t utf8_buf_size = args_number * LIT_UTF8_MAX_BYTES_IN_CODE_UNIT;
70+
ecma_string_t *ret_str_p;
71+
MEM_DEFINE_LOCAL_ARRAY (utf8_buf_p, utf8_buf_size, lit_utf8_byte_t);
72+
7073
lit_utf8_size_t utf8_buf_used = 0;
7174

72-
lit_utf8_byte_t *utf8_buf_p = (lit_utf8_byte_t*) mem_heap_alloc_block (utf8_buf_size,
73-
MEM_HEAP_ALLOC_SHORT_TERM);
7475
FIXME ("Support surrogate pairs");
7576
for (ecma_length_t arg_index = 0;
7677
arg_index < args_number;
@@ -81,7 +82,7 @@ ecma_builtin_string_object_from_char_code (ecma_value_t this_arg __attr_unused__
8182
uint32_t uint32_char_code = ecma_number_to_uint32 (arg_num);
8283
ecma_char_t code_unit = (uint16_t) uint32_char_code;
8384

84-
JERRY_ASSERT (utf8_buf_used <= utf8_buf_size - MAX_BYTES_IN_CODE_UNIT);
85+
JERRY_ASSERT (utf8_buf_used <= utf8_buf_size - LIT_UTF8_MAX_BYTES_IN_CODE_UNIT);
8586
utf8_buf_used += lit_code_unit_to_utf8 (code_unit, utf8_buf_p + utf8_buf_used);
8687
JERRY_ASSERT (utf8_buf_used <= utf8_buf_size);
8788

@@ -97,9 +98,9 @@ ecma_builtin_string_object_from_char_code (ecma_value_t this_arg __attr_unused__
9798
JERRY_ASSERT (ecma_is_completion_value_empty (ret_value));
9899
}
99100

100-
ecma_string_t *ret_str_p = ecma_new_ecma_string_from_utf8 (utf8_buf_p, utf8_buf_used);
101+
ret_str_p = ecma_new_ecma_string_from_utf8 (utf8_buf_p, utf8_buf_used);
101102

102-
mem_heap_free_block (utf8_buf_p);
103+
MEM_FINALIZE_LOCAL_ARRAY (utf8_buf_p);
103104

104105
return ecma_make_normal_completion_value (ecma_make_string_value (ret_str_p));
105106
} /* ecma_builtin_string_object_from_char_code */

jerry-core/lit/lit-globals.h

Lines changed: 54 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,63 @@
1818

1919
#include "jrt.h"
2020

21+
/**
22+
* ECMAScript standard defines terms "code unit" and "character" as 16-bit unsigned value
23+
* used to represent 16-bit unit of text, this is the same as code unit in UTF-16 (See ECMA-262 5.1 Chapter 6).
24+
*
25+
* The term "code point" or "Unicode character" is used to refer to a single Unicode scalar value (may be longer
26+
* than 16 bits: 0x0 - 0x10FFFF). One code point could be represented with one or two 16-bit code units.
27+
*
28+
* According to the standard all strings and source text are assumed to be a sequence of code units.
29+
* Length of a string equals to number of code units in the string, which is not the same as number of Unicode
30+
* characters in a string.
31+
*
32+
* Internally JerryScript engine uses UTF-8 representation of strings to reduce memory overhead. Unicode character
33+
* occupies from one to four bytes in UTF-8 representation.
34+
*
35+
* Unicode scalar value | Bytes in UTF-8 | Bytes in UTF-16
36+
* | (internal representation) |
37+
* ----------------------------------------------------------------------
38+
* 0x0 - 0x7F | 1 byte | 2 bytes
39+
* 0x80 - 0x7FF | 2 bytes | 2 bytes
40+
* 0x800 - 0xFFFF | 3 bytes | 2 bytes
41+
* 0x10000 - 0x10FFFF | 4 bytes | 4 bytes
42+
*
43+
* Scalar values from 0xD800 to 0xDFFF are permanently reserved by Unicode standard to encode high and low
44+
* surrogates in UTF-16 (Code points 0x10000 - 0x10FFFF are encoded via pair of surrogates in UTF-16).
45+
* Despite that the official Unicode standard says that no UTF forms can encode these code points, we allow
46+
* them to be encoded inside strings. The reason for that is compatibility with ECMA standard.
47+
*
48+
* For example, assume a string which consists of one Unicode character: 0x1D700 (Mathematical Italic Small Epsilon).
49+
* It has the following representation in UTF-16: 0xD835 0xDF00.
50+
*
51+
* ECMA standard allows extracting a substring from this string:
52+
* > var str = String.fromCharCode (0xD835, 0xDF00); // Create a string containing one character: 0x1D700
53+
* > str.length; // 2
54+
* > var str1 = str.substring (0, 1);
55+
* > str1.length; // 1
56+
* > str1.charCodeAt (0); // 55349 (this equals to 0xD835)
57+
*
58+
* Internally original string would be represented in UTF-8 as the following byte sequence: 0xF0 0x9D 0x9C 0x80.
59+
* After substring extraction high surrogate 0xD835 should be encoded via UTF-8: 0xED 0xA0 0xB5.
60+
*
61+
* Pair of low and high surrogates encoded separately should never occur in internal string representation,
62+
* it should be encoded as any code point and occupy 4 bytes. So, when constructing a string from two surrogates,
63+
* it should be processed gracefully;
64+
* > var str1 = String.fromCharCode (0xD835); // 0xED 0xA0 0xB5 - internal representation
65+
* > var str2 = String.fromCharCode (0xDF00); // 0xED 0xBC 0x80 - internal representation
66+
* > var str = str1 + str2; // 0xF0 0x9D 0x9C 0x80 - internal representation,
67+
* // !!! not 0xED 0xA0 0xB5 0xED 0xBC 0x80
68+
*/
69+
2170
/**
2271
* Description of an ecma-character, which represents 16-bit code unit,
2372
* which is equal to UTF-16 character (see Chapter 6 from ECMA-262 5.1)
2473
*/
2574
typedef uint16_t ecma_char_t;
2675

2776
/**
28-
* Null character (zt-string end marker)
77+
* Null character
2978
*/
3079
#define ECMA_CHAR_NULL ((ecma_char_t) '\0')
3180

@@ -42,7 +91,7 @@ typedef ecma_char_t *ecma_char_ptr_t;
4291
/**
4392
* Max bytes needed to represent a code unit (utf-16 char) via utf-8 encoding
4493
*/
45-
#define MAX_BYTES_IN_CODE_UNIT (3)
94+
#define LIT_UTF8_MAX_BYTES_IN_CODE_UNIT (3)
4695

4796
/**
4897
* A byte of utf-8 string
@@ -57,16 +106,16 @@ typedef uint32_t lit_utf8_size_t;
57106
/**
58107
* Unicode code point
59108
*/
60-
typedef uint32_t lit_code_point;
109+
typedef uint32_t lit_code_point_t;
61110

62111
/**
63112
* ECMA string hash
64113
*/
65-
typedef uint8_t ecma_string_hash_t;
114+
typedef uint8_t lit_string_hash_t;
66115

67116
/**
68117
* Number of string's last bytes to use for hash calculation
69118
*/
70-
#define ECMA_STRING_HASH_LAST_CHARS_COUNT (2)
119+
#define LIT_STRING_HASH_LAST_BYTES_COUNT (2)
71120

72121
#endif /* LIT_GLOBALS_H */

jerry-core/lit/lit-literal-storage.h

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -97,10 +97,10 @@ class lit_charset_record_t : public rcs_record_t
9797
*
9898
* @return hash value of the string (the value of the 'hash' field in the header)
9999
*/
100-
ecma_string_hash_t
100+
lit_string_hash_t
101101
get_hash () const
102102
{
103-
return (ecma_string_hash_t) get_field (_hash_field_pos, _hash_field_width);
103+
return (lit_string_hash_t) get_field (_hash_field_pos, _hash_field_width);
104104
} /* get_hash */
105105

106106
/**
@@ -149,7 +149,7 @@ class lit_charset_record_t : public rcs_record_t
149149
* Set record's hash (the value of the 'hash' field in the header)
150150
*/
151151
void
152-
set_hash (ecma_string_hash_t hash) /**< hash value */
152+
set_hash (lit_string_hash_t hash) /**< hash value */
153153
{
154154
set_field (_hash_field_pos, _hash_field_width, hash);
155155
} /* set_hash */
@@ -241,7 +241,6 @@ class lit_magic_record_t : public rcs_record_t
241241
magic_string_id_t get_magic_str_id () const
242242
{
243243
uint32_t id = get_field (magic_field_pos, magic_field_width);
244-
// JERRY_ASSERT (id < LIT_MAGIC_STRING__COUNT);
245244
return (magic_string_id_t) id;
246245
} /* get_magic_str_id */
247246

0 commit comments

Comments
 (0)