diff --git a/jerry-core/ecma/builtin-objects/ecma-builtin-global.cpp b/jerry-core/ecma/builtin-objects/ecma-builtin-global.cpp index e2ecde51b5..b6131a48ec 100644 --- a/jerry-core/ecma/builtin-objects/ecma-builtin-global.cpp +++ b/jerry-core/ecma/builtin-objects/ecma-builtin-global.cpp @@ -880,10 +880,10 @@ ecma_builtin_global_object_encode_uri_helper (ecma_value_t uri, /**< uri argumen lit_utf8_iterator_t iter = lit_utf8_iterator_create (input_start_p, input_size); lit_utf8_size_t output_length = 1; - while (!lit_utf8_iterator_reached_buffer_end (&iter)) + while (!lit_utf8_iterator_is_eos (&iter)) { /* Input validation. */ - lit_code_point_t character = lit_utf8_iterator_read_code_unit_and_increment (&iter); + lit_code_point_t character = lit_utf8_iterator_read_next (&iter); if (character <= 0x7f) { @@ -931,10 +931,10 @@ ecma_builtin_global_object_encode_uri_helper (ecma_value_t uri, /**< uri argumen lit_utf8_iterator_t iter = lit_utf8_iterator_create (input_start_p, input_size); lit_utf8_byte_t *output_char_p = output_start_p; - while (!lit_utf8_iterator_reached_buffer_end (&iter)) + while (!lit_utf8_iterator_is_eos (&iter)) { /* Input decode. 
*/ - lit_code_point_t character = lit_utf8_iterator_read_code_unit_and_increment (&iter); + lit_code_point_t character = lit_utf8_iterator_read_next (&iter); if (character <= 0x7f) { diff --git a/jerry-core/lit/lit-globals.h b/jerry-core/lit/lit-globals.h index 76bfeb8c68..e35c56e078 100644 --- a/jerry-core/lit/lit-globals.h +++ b/jerry-core/lit/lit-globals.h @@ -88,6 +88,11 @@ typedef ecma_char_t *ecma_char_ptr_t; */ #define LIT_UTF8_MAX_BYTES_IN_CODE_UNIT (3) +/** + * Max bytes needed to represent a code point (Unicode character) via utf-8 encoding + */ +#define LIT_UTF8_MAX_BYTES_IN_CODE_POINT (4) + /** * A byte of utf-8 string */ diff --git a/jerry-core/lit/lit-strings.cpp b/jerry-core/lit/lit-strings.cpp index b442f1c453..609db1e6ee 100644 --- a/jerry-core/lit/lit-strings.cpp +++ b/jerry-core/lit/lit-strings.cpp @@ -17,54 +17,6 @@ #include "jrt-libc-includes.h" -/** - * For the formal definition of Unicode transformation formats (UTF) see Section 3.9, Unicode Encoding Forms in The - * Unicode Standard (http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G7404, tables 3-6, 3-7). 
- */ -#define LIT_UNICODE_CODE_POINT_NULL (0x0) -#define LIT_UNICODE_CODE_POINT_MAX (0x10FFFF) - -#define LIT_UTF16_CODE_UNIT_MAX (0xFFFF) -#define LIT_UTF16_FIRST_SURROGATE_CODE_POINT (0x10000) -#define LIT_UTF16_LOW_SURROGATE_MARKER (0xDC00) -#define LIT_UTF16_HIGH_SURROGATE_MARKER (0xD800) -#define LIT_UTF16_HIGH_SURROGATE_MIN (0xD800) -#define LIT_UTF16_HIGH_SURROGATE_MAX (0xDBFF) -#define LIT_UTF16_LOW_SURROGATE_MIN (0xDC00) -#define LIT_UTF16_LOW_SURROGATE_MAX (0xDFFF) -#define LIT_UTF16_BITS_IN_SURROGATE (10) -#define LIT_UTF16_LAST_10_BITS_MASK (0x3FF) - -#define LIT_UTF8_1_BYTE_MARKER (0x00) -#define LIT_UTF8_2_BYTE_MARKER (0xC0) -#define LIT_UTF8_3_BYTE_MARKER (0xE0) -#define LIT_UTF8_4_BYTE_MARKER (0xF0) -#define LIT_UTF8_EXTRA_BYTE_MARKER (0x80) - -#define LIT_UTF8_1_BYTE_MASK (0x80) -#define LIT_UTF8_2_BYTE_MASK (0xE0) -#define LIT_UTF8_3_BYTE_MASK (0xF0) -#define LIT_UTF8_4_BYTE_MASK (0xF8) -#define LIT_UTF8_EXTRA_BYTE_MASK (0xC0) - -#define LIT_UTF8_LAST_7_BITS_MASK (0x7F) -#define LIT_UTF8_LAST_6_BITS_MASK (0x3F) -#define LIT_UTF8_LAST_5_BITS_MASK (0x1F) -#define LIT_UTF8_LAST_4_BITS_MASK (0x0F) -#define LIT_UTF8_LAST_3_BITS_MASK (0x07) -#define LIT_UTF8_LAST_2_BITS_MASK (0x03) -#define LIT_UTF8_LAST_1_BIT_MASK (0x01) - -#define LIT_UTF8_BITS_IN_EXTRA_BYTES (6) - -#define LIT_UTF8_1_BYTE_CODE_POINT_MAX (0x7F) -#define LIT_UTF8_2_BYTE_CODE_POINT_MIN (0x80) -#define LIT_UTF8_2_BYTE_CODE_POINT_MAX (0x7FF) -#define LIT_UTF8_3_BYTE_CODE_POINT_MIN (0x800) -#define LIT_UTF8_3_BYTE_CODE_POINT_MAX (LIT_UTF16_CODE_UNIT_MAX) -#define LIT_UTF8_4_BYTE_CODE_POINT_MIN (0x1000) -#define LIT_UTF8_4_BYTE_CODE_POINT_MAX (LIT_UNICODE_CODE_POINT_MAX) - /** * Validate utf-8 string * @@ -175,18 +127,80 @@ lit_utf8_iterator_create (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string * lit_utf8_size_t buf_size) /**< string size */ { JERRY_ASSERT (utf8_buf_p || !buf_size); + JERRY_ASSERT (lit_is_utf8_string_valid (utf8_buf_p, buf_size)); lit_utf8_iterator_t buf_iter = { - 
0, - buf_size, utf8_buf_p, - 0, + buf_size, + { + 0, + false + } }; return buf_iter; } /* lit_utf8_iterator_create */ +/** + * Reset iterator to point to the beginning of a string + */ +void +lit_utf8_iterator_seek_bos (lit_utf8_iterator_t *iter_p) /**< iterator to reset */ +{ + iter_p->buf_pos.offset = 0; + iter_p->buf_pos.is_non_bmp_middle = false; +} /* lit_utf8_iterator_seek_bos */ + +/** + * Reset iterator to point to the end of a string + */ +void +lit_utf8_iterator_seek_eos (lit_utf8_iterator_t *iter_p) /**< iterator to reset */ +{ + iter_p->buf_pos.offset = iter_p->buf_size & LIT_ITERATOR_OFFSET_MASK; + iter_p->buf_pos.is_non_bmp_middle = false; +} /* lit_utf8_iterator_seek_eos */ + +/** + * Save iterator's position to restore it later + * + * @return current position of the iterator + */ +lit_utf8_iterator_pos_t +lit_utf8_iterator_get_pos (const lit_utf8_iterator_t *iter_p) /**< utf-8 string iterator */ +{ + return iter_p->buf_pos; +} /* lit_utf8_iterator_get_pos */ + +/** + * Restore previously saved position of the iterator (the end-of-string position is accepted as well) + */ +void +lit_utf8_iterator_seek (lit_utf8_iterator_t *iter_p, /**< utf-8 string iterator */ + lit_utf8_iterator_pos_t iter_pos) /**< position to restore */ +{ + JERRY_ASSERT (iter_pos.offset <= iter_p->buf_size); +#ifndef JERRY_NDEBUG + lit_utf8_byte_t byte = (iter_pos.offset < iter_p->buf_size) ? *(iter_p->buf_p + iter_pos.offset) : 0; + JERRY_ASSERT ((byte & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER); + JERRY_ASSERT (!iter_pos.is_non_bmp_middle || ((byte & LIT_UTF8_4_BYTE_MASK) == LIT_UTF8_4_BYTE_MARKER)); +#endif + + iter_p->buf_pos = iter_pos; +} /* lit_utf8_iterator_seek */ + +/** + * Get offset (in code units) of the iterator + * + * @return current offset of the iterator in code units + */ +ecma_length_t +lit_utf8_iterator_get_index (const lit_utf8_iterator_t *iter_p) /**< utf-8 string iterator */ +{ + return lit_utf8_string_length (iter_p->buf_p, iter_p->buf_pos.offset) + iter_p->buf_pos.is_non_bmp_middle; +} /* lit_utf8_iterator_get_index */ + /** * Represents code point (>0xFFFF) as surrogate pair and returns 
its lower part * @@ -221,59 +235,225 @@ convert_code_point_to_high_surrogate (lit_code_point_t code_point) /**< code poi } /* convert_code_point_to_low_surrogate */ /** - * Get next code unit form the iterated string and increment iterator to point to next code unit + * Get next code unit from the iterated string * * @return next code unit */ ecma_char_t -lit_utf8_iterator_read_code_unit_and_increment (lit_utf8_iterator_t *buf_iter_p) /**< @in-out: utf-8 string iterator */ +lit_utf8_iterator_peek_next (const lit_utf8_iterator_t *iter_p) /**< @in: utf-8 string iterator */ +{ + JERRY_ASSERT (!lit_utf8_iterator_is_eos (iter_p)); + + lit_code_point_t code_point; + lit_read_code_point_from_utf8 (iter_p->buf_p + iter_p->buf_pos.offset, + iter_p->buf_size - iter_p->buf_pos.offset, + &code_point); + + if (code_point <= LIT_UTF16_CODE_UNIT_MAX) + { + JERRY_ASSERT (!iter_p->buf_pos.is_non_bmp_middle); + return (ecma_char_t) code_point; + } + else + { + if (iter_p->buf_pos.is_non_bmp_middle) + { + return convert_code_point_to_low_surrogate (code_point); + } + else + { + return convert_code_point_to_high_surrogate (code_point); + } + } +} /* lit_utf8_iterator_peek_next */ + +/** + * Get previous code unit from the iterated string + * + * @return previous code unit + */ +ecma_char_t +lit_utf8_iterator_peek_prev (const lit_utf8_iterator_t *iter_p) /**< @in: utf-8 string iterator */ +{ + JERRY_ASSERT (!lit_utf8_iterator_is_bos (iter_p)); + + lit_code_point_t code_point; + lit_utf8_size_t offset = iter_p->buf_pos.offset; + + if (iter_p->buf_pos.is_non_bmp_middle) + { + lit_read_code_point_from_utf8 (iter_p->buf_p + iter_p->buf_pos.offset, + iter_p->buf_size - iter_p->buf_pos.offset, + &code_point); + return convert_code_point_to_high_surrogate (code_point); + } + + do + { + JERRY_ASSERT (offset != 0); + offset--; + } + while ((iter_p->buf_p[offset] & LIT_UTF8_EXTRA_BYTE_MASK) == LIT_UTF8_EXTRA_BYTE_MARKER); + + JERRY_ASSERT (iter_p->buf_pos.offset - offset <= 
LIT_UTF8_MAX_BYTES_IN_CODE_POINT); + + lit_read_code_point_from_utf8 (iter_p->buf_p + offset, + iter_p->buf_size - offset, + &code_point); + + if (code_point <= LIT_UTF16_CODE_UNIT_MAX) + { + return (ecma_char_t) code_point; + } + else + { + return convert_code_point_to_low_surrogate (code_point); + } +} /* lit_utf8_iterator_peek_prev */ + +/** + * Increment iterator to point to next code unit + */ +void +lit_utf8_iterator_incr (lit_utf8_iterator_t *iter_p) /**< @in-out: utf-8 string iterator */ +{ + lit_utf8_iterator_read_next (iter_p); +} /* lit_utf8_iterator_incr */ + +/** + * Decrement iterator to point to previous code unit + */ +void +lit_utf8_iterator_decr (lit_utf8_iterator_t *iter_p) /**< @in-out: utf-8 string iterator */ { - JERRY_ASSERT (!lit_utf8_iterator_reached_buffer_end (buf_iter_p)); + lit_utf8_iterator_read_prev (iter_p); +} /* lit_utf8_iterator_decr */ - if (buf_iter_p->code_point) +/** + * Skip specified number of code units + */ +void +lit_utf8_iterator_advance (lit_utf8_iterator_t *iter_p, /**< in-out: iterator */ + ecma_length_t chars_count) /**< number of code units to skip */ +{ + while (chars_count--) { - ecma_char_t code_unit = convert_code_point_to_low_surrogate (buf_iter_p->code_point); - buf_iter_p->code_point = 0; - return code_unit; + lit_utf8_iterator_incr (iter_p); } +} /* lit_utf8_iterator_advance */ + +/** + * Get next code unit from the iterated string and increment iterator to point to next code unit + * + * @return next code unit + */ +ecma_char_t +lit_utf8_iterator_read_next (lit_utf8_iterator_t *iter_p) /**< @in-out: utf-8 string iterator */ +{ + JERRY_ASSERT (!lit_utf8_iterator_is_eos (iter_p)); lit_code_point_t code_point; - buf_iter_p->buf_offset += lit_read_code_point_from_utf8 (buf_iter_p->buf_p + buf_iter_p->buf_offset, - buf_iter_p->buf_size - buf_iter_p->buf_offset, - &code_point); + lit_utf8_size_t utf8_char_size = lit_read_code_point_from_utf8 (iter_p->buf_p + iter_p->buf_pos.offset, + iter_p->buf_size - 
iter_p->buf_pos.offset, + &code_point); if (code_point <= LIT_UTF16_CODE_UNIT_MAX) { + JERRY_ASSERT (!iter_p->buf_pos.is_non_bmp_middle); + iter_p->buf_pos.offset = (iter_p->buf_pos.offset + utf8_char_size) & LIT_ITERATOR_OFFSET_MASK; return (ecma_char_t) code_point; } else { - buf_iter_p->code_point = code_point; + if (iter_p->buf_pos.is_non_bmp_middle) + { + iter_p->buf_pos.offset = (iter_p->buf_pos.offset + utf8_char_size) & LIT_ITERATOR_OFFSET_MASK; + iter_p->buf_pos.is_non_bmp_middle = false; + return convert_code_point_to_low_surrogate (code_point); + } + else + { + iter_p->buf_pos.is_non_bmp_middle = true; + return convert_code_point_to_high_surrogate (code_point); + } + } +} /* lit_utf8_iterator_read_next */ + +/** + * Get previous code unit from the iterated string and decrement iterator to point to previous code unit + * + * @return previous code unit + */ +ecma_char_t +lit_utf8_iterator_read_prev (lit_utf8_iterator_t *iter_p) /**< @in-out: utf-8 string iterator */ +{ + JERRY_ASSERT (!lit_utf8_iterator_is_bos (iter_p)); + + lit_code_point_t code_point; + lit_utf8_size_t offset = iter_p->buf_pos.offset; + + if (iter_p->buf_pos.is_non_bmp_middle) + { + lit_read_code_point_from_utf8 (iter_p->buf_p + iter_p->buf_pos.offset, + iter_p->buf_size - iter_p->buf_pos.offset, + &code_point); + + iter_p->buf_pos.is_non_bmp_middle = false; + return convert_code_point_to_high_surrogate (code_point); } - JERRY_ASSERT (false); - return LIT_CHAR_NULL; -} /* lit_utf8_iterator_read_code_unit_and_increment */ + do + { + JERRY_ASSERT (offset != 0); + offset--; + } + while ((iter_p->buf_p[offset] & LIT_UTF8_EXTRA_BYTE_MASK) == LIT_UTF8_EXTRA_BYTE_MARKER); + + JERRY_ASSERT (iter_p->buf_pos.offset - offset <= LIT_UTF8_MAX_BYTES_IN_CODE_POINT); + + iter_p->buf_pos.offset = (offset) & LIT_ITERATOR_OFFSET_MASK; + lit_read_code_point_from_utf8 (iter_p->buf_p + iter_p->buf_pos.offset, + iter_p->buf_size - iter_p->buf_pos.offset, + &code_point); + + if (code_point <= 
LIT_UTF16_CODE_UNIT_MAX) + { + return (ecma_char_t) code_point; + } + else + { + iter_p->buf_pos.is_non_bmp_middle = true; + + return convert_code_point_to_low_surrogate (code_point); + } +} /* lit_utf8_iterator_read_prev */ /** * Checks iterator reached end of the string * - * @return true - the whole string was iterated + * @return true - iterator is at the end of string * false - otherwise */ bool -lit_utf8_iterator_reached_buffer_end (const lit_utf8_iterator_t *buf_iter_p) /**< utf-8 string iterator */ +lit_utf8_iterator_is_eos (const lit_utf8_iterator_t *iter_p) /**< utf-8 string iterator */ { - JERRY_ASSERT (buf_iter_p->buf_offset <= buf_iter_p->buf_size); + JERRY_ASSERT (iter_p->buf_pos.offset <= iter_p->buf_size); - if (buf_iter_p->code_point == LIT_UNICODE_CODE_POINT_NULL && buf_iter_p->buf_offset == buf_iter_p->buf_size) - { - return true; - } + return (iter_p->buf_pos.offset == iter_p->buf_size); +} /* lit_utf8_iterator_is_eos */ - return false; -} /* lit_utf8_iterator_reached_buffer_end */ +/** + * Checks iterator reached beginning of the string + * + * @return true - iterator is at the beginning of a string + * false - otherwise + */ +bool +lit_utf8_iterator_is_bos (const lit_utf8_iterator_t *iter_p) +{ + return (iter_p->buf_pos.offset == 0 && iter_p->buf_pos.is_non_bmp_middle == false); +} /* lit_utf8_iterator_is_bos */ /** * Calculate size of a zero-terminated utf-8 string @@ -300,12 +480,12 @@ lit_utf8_string_length (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string */ { ecma_length_t length = 0; lit_utf8_iterator_t buf_iter = lit_utf8_iterator_create (utf8_buf_p, utf8_buf_size); - while (!lit_utf8_iterator_reached_buffer_end (&buf_iter)) + while (!lit_utf8_iterator_is_eos (&buf_iter)) { - lit_utf8_iterator_read_code_unit_and_increment (&buf_iter); + lit_utf8_iterator_read_next (&buf_iter); length++; } - JERRY_ASSERT (lit_utf8_iterator_reached_buffer_end (&buf_iter)); + JERRY_ASSERT (lit_utf8_iterator_is_eos (&buf_iter)); return length; } /* 
lit_utf8_string_length */ @@ -375,13 +555,13 @@ lit_utf8_string_calc_hash_last_bytes (const lit_utf8_byte_t *utf8_buf_p, /**< ch { JERRY_ASSERT (utf8_buf_p != NULL); - lit_utf8_size_t byte1 = utf8_buf_size > 0 ? utf8_buf_p[utf8_buf_size - 1] : (lit_utf8_size_t) 0; - lit_utf8_size_t byte2 = utf8_buf_size > 1 ? utf8_buf_p[utf8_buf_size - 2] : (lit_utf8_size_t) 0; + lit_utf8_byte_t byte1 = (utf8_buf_size > 0) ? utf8_buf_p[utf8_buf_size - 1] : 0; + lit_utf8_byte_t byte2 = (utf8_buf_size > 1) ? utf8_buf_p[utf8_buf_size - 2] : 0; - lit_utf8_size_t t1 = byte1 + byte2; - lit_utf8_size_t t2 = t1 * 0x24418b66; - lit_utf8_size_t t3 = (t2 >> 16) ^ (t2 & 0xffffu); - lit_utf8_size_t t4 = (t3 >> 8) ^ (t3 & 0xffu); + uint32_t t1 = (uint32_t) byte1 + (uint32_t) byte2; + uint32_t t2 = t1 * 0x24418b66; + uint32_t t3 = (t2 >> 16) ^ (t2 & 0xffffu); + uint32_t t4 = (t3 >> 8) ^ (t3 & 0xffu); return (lit_string_hash_t) t4; } /* lit_utf8_string_calc_hash_last_bytes */ @@ -404,8 +584,8 @@ lit_utf8_string_code_unit_at (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 stri do { - JERRY_ASSERT (!lit_utf8_iterator_reached_buffer_end (&iter)); - code_unit = lit_utf8_iterator_read_code_unit_and_increment (&iter); + JERRY_ASSERT (!lit_utf8_iterator_is_eos (&iter)); + code_unit = lit_utf8_iterator_read_next (&iter); } while (code_unit_offset--); @@ -560,11 +740,11 @@ bool lit_compare_utf8_strings_relational (const lit_utf8_byte_t *string1_p, /**< lit_utf8_iterator_t iter1 = lit_utf8_iterator_create (string1_p, string1_size); lit_utf8_iterator_t iter2 = lit_utf8_iterator_create (string2_p, string2_size); - while (!lit_utf8_iterator_reached_buffer_end (&iter1) - && !lit_utf8_iterator_reached_buffer_end (&iter2)) + while (!lit_utf8_iterator_is_eos (&iter1) + && !lit_utf8_iterator_is_eos (&iter2)) { - ecma_char_t code_point1 = lit_utf8_iterator_read_code_unit_and_increment (&iter1); - ecma_char_t code_point2 = lit_utf8_iterator_read_code_unit_and_increment (&iter2); + ecma_char_t code_point1 = 
lit_utf8_iterator_read_next (&iter1); + ecma_char_t code_point2 = lit_utf8_iterator_read_next (&iter2); if (code_point1 < code_point2) { return true; @@ -575,5 +755,5 @@ bool lit_compare_utf8_strings_relational (const lit_utf8_byte_t *string1_p, /**< } } - return (lit_utf8_iterator_reached_buffer_end (&iter1) && !lit_utf8_iterator_reached_buffer_end (&iter2)); + return (lit_utf8_iterator_is_eos (&iter1) && !lit_utf8_iterator_is_eos (&iter2)); } /* lit_compare_utf8_strings_relational */ diff --git a/jerry-core/lit/lit-strings.h b/jerry-core/lit/lit-strings.h index 5e2b3e5dba..2c616a135d 100644 --- a/jerry-core/lit/lit-strings.h +++ b/jerry-core/lit/lit-strings.h @@ -25,16 +25,82 @@ */ #define LIT_BYTE_NULL (0) +/** + * For the formal definition of Unicode transformation formats (UTF) see Section 3.9, Unicode Encoding Forms in The + * Unicode Standard (http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G7404, tables 3-6, 3-7). + */ +#define LIT_UNICODE_CODE_POINT_NULL (0x0) +#define LIT_UNICODE_CODE_POINT_MAX (0x10FFFF) + +#define LIT_UTF16_CODE_UNIT_MAX (0xFFFF) +#define LIT_UTF16_FIRST_SURROGATE_CODE_POINT (0x10000) +#define LIT_UTF16_LOW_SURROGATE_MARKER (0xDC00) +#define LIT_UTF16_HIGH_SURROGATE_MARKER (0xD800) +#define LIT_UTF16_HIGH_SURROGATE_MIN (0xD800) +#define LIT_UTF16_HIGH_SURROGATE_MAX (0xDBFF) +#define LIT_UTF16_LOW_SURROGATE_MIN (0xDC00) +#define LIT_UTF16_LOW_SURROGATE_MAX (0xDFFF) +#define LIT_UTF16_BITS_IN_SURROGATE (10) +#define LIT_UTF16_LAST_10_BITS_MASK (0x3FF) + +#define LIT_UTF8_1_BYTE_MARKER (0x00) +#define LIT_UTF8_2_BYTE_MARKER (0xC0) +#define LIT_UTF8_3_BYTE_MARKER (0xE0) +#define LIT_UTF8_4_BYTE_MARKER (0xF0) +#define LIT_UTF8_EXTRA_BYTE_MARKER (0x80) + +#define LIT_UTF8_1_BYTE_MASK (0x80) +#define LIT_UTF8_2_BYTE_MASK (0xE0) +#define LIT_UTF8_3_BYTE_MASK (0xF0) +#define LIT_UTF8_4_BYTE_MASK (0xF8) +#define LIT_UTF8_EXTRA_BYTE_MASK (0xC0) + +#define LIT_UTF8_LAST_7_BITS_MASK (0x7F) +#define LIT_UTF8_LAST_6_BITS_MASK (0x3F) +#define 
LIT_UTF8_LAST_5_BITS_MASK (0x1F) +#define LIT_UTF8_LAST_4_BITS_MASK (0x0F) +#define LIT_UTF8_LAST_3_BITS_MASK (0x07) +#define LIT_UTF8_LAST_2_BITS_MASK (0x03) +#define LIT_UTF8_LAST_1_BIT_MASK (0x01) + +#define LIT_UTF8_BITS_IN_EXTRA_BYTES (6) + +#define LIT_UTF8_1_BYTE_CODE_POINT_MAX (0x7F) +#define LIT_UTF8_2_BYTE_CODE_POINT_MIN (0x80) +#define LIT_UTF8_2_BYTE_CODE_POINT_MAX (0x7FF) +#define LIT_UTF8_3_BYTE_CODE_POINT_MIN (0x800) +#define LIT_UTF8_3_BYTE_CODE_POINT_MAX (LIT_UTF16_CODE_UNIT_MAX) +#define LIT_UTF8_4_BYTE_CODE_POINT_MIN (0x10000) +#define LIT_UTF8_4_BYTE_CODE_POINT_MAX (LIT_UNICODE_CODE_POINT_MAX) + +/** + * Width of the offset field in lit_utf8_iterator_pos_t structure + */ +#define LIT_ITERATOR_OFFSET_WIDTH (31) + +/** + * Iterator's offset field mask + */ +#define LIT_ITERATOR_OFFSET_MASK ((1ull << LIT_ITERATOR_OFFSET_WIDTH) - 1) + +/** + * Represents position of the iterator + */ +typedef struct +{ + lit_utf8_size_t offset : LIT_ITERATOR_OFFSET_WIDTH; /** offset to utf-8 char */ + bool is_non_bmp_middle: 1; /** flag indicating that current position of the iterator is the middle of + * 4-byte char */ +} lit_utf8_iterator_pos_t; + /** * Represents an iterator over utf-8 buffer */ typedef struct { - lit_utf8_size_t buf_offset; /* current offset in the buffer */ - lit_utf8_size_t buf_size; /* buffer length */ const lit_utf8_byte_t *buf_p; /* buffer */ - lit_code_point_t code_point; /* code point is saved here when processed Unicode character is higher than - * 0xFFFF */ + lit_utf8_size_t buf_size; /* buffer length */ + lit_utf8_iterator_pos_t buf_pos; /* position in the buffer */ } lit_utf8_iterator_t; /* validation */ @@ -42,8 +108,27 @@ bool lit_is_utf8_string_valid (const lit_utf8_byte_t *, lit_utf8_size_t); /* iteration */ lit_utf8_iterator_t lit_utf8_iterator_create (const lit_utf8_byte_t *, lit_utf8_size_t); -ecma_char_t lit_utf8_iterator_read_code_unit_and_increment (lit_utf8_iterator_t *); -bool lit_utf8_iterator_reached_buffer_end (const 
lit_utf8_iterator_t *); + +void lit_utf8_iterator_seek_bos (lit_utf8_iterator_t *); +void lit_utf8_iterator_seek_eos (lit_utf8_iterator_t *); + +lit_utf8_iterator_pos_t lit_utf8_iterator_get_pos (const lit_utf8_iterator_t *); +void lit_utf8_iterator_seek (lit_utf8_iterator_t *, lit_utf8_iterator_pos_t); + +ecma_length_t lit_utf8_iterator_get_index (const lit_utf8_iterator_t *); + +ecma_char_t lit_utf8_iterator_peek_next (const lit_utf8_iterator_t *); +ecma_char_t lit_utf8_iterator_peek_prev (const lit_utf8_iterator_t *); + +void lit_utf8_iterator_incr (lit_utf8_iterator_t *); +void lit_utf8_iterator_decr (lit_utf8_iterator_t *); +void lit_utf8_iterator_advance (lit_utf8_iterator_t *, ecma_length_t); + +ecma_char_t lit_utf8_iterator_read_next (lit_utf8_iterator_t *); +ecma_char_t lit_utf8_iterator_read_prev (lit_utf8_iterator_t *); + +bool lit_utf8_iterator_is_eos (const lit_utf8_iterator_t *); +bool lit_utf8_iterator_is_bos (const lit_utf8_iterator_t *); /* size */ lit_utf8_size_t lit_zt_utf8_string_size (const lit_utf8_byte_t *); diff --git a/tests/unit/test-strings.cpp b/tests/unit/test-strings.cpp index 33db3a6cac..2d41726929 100644 --- a/tests/unit/test-strings.cpp +++ b/tests/unit/test-strings.cpp @@ -19,10 +19,91 @@ #include "test-common.h" // Iterations count -#define test_iters 64 +#define test_iters (1024) + +// Sub iterations count +#define test_subiters (128) + +// Max bytes in string +#define max_bytes_in_string (16 * 1024) +#define max_code_units_in_string (max_bytes_in_string) + +typedef enum +{ + UTF8_ANY_SIZE, + UTF8_ONE_BYTE, + UTF8_TWO_BYTES, + UTF8_THREE_BYTES, + UTF8_FOUR_BYTES +} utf8_char_size; + +static lit_utf8_size_t +generate_utf8_char (utf8_char_size char_size, + lit_utf8_byte_t *buf) +{ + JERRY_ASSERT (char_size >= 0 && char_size <= LIT_UTF8_MAX_BYTES_IN_CODE_POINT); + lit_code_point_t code_point = (lit_code_point_t) rand (); + + if (char_size == 1) + { + code_point %= LIT_UTF8_1_BYTE_CODE_POINT_MAX; + } + else if (char_size == 2) + { + 
code_point = LIT_UTF8_2_BYTE_CODE_POINT_MIN + code_point % (LIT_UTF8_2_BYTE_CODE_POINT_MAX - + LIT_UTF8_2_BYTE_CODE_POINT_MIN); + } + else if (char_size == 3) + { + code_point = LIT_UTF8_3_BYTE_CODE_POINT_MIN + code_point % (LIT_UTF8_3_BYTE_CODE_POINT_MAX - + LIT_UTF8_3_BYTE_CODE_POINT_MIN); + } + else if (char_size == 4) + { + code_point = LIT_UTF8_4_BYTE_CODE_POINT_MIN + code_point % (LIT_UTF8_4_BYTE_CODE_POINT_MAX - + LIT_UTF8_4_BYTE_CODE_POINT_MIN); + } + else + { + code_point %= LIT_UTF8_4_BYTE_CODE_POINT_MAX; + } + + if (code_point >= LIT_UTF16_HIGH_SURROGATE_MIN + && code_point <= LIT_UTF16_LOW_SURROGATE_MAX) + { + code_point = LIT_UTF16_HIGH_SURROGATE_MIN - 1; + } + + return lit_code_point_to_utf8 (code_point, buf); +} + +static ecma_length_t +generate_utf8_string (lit_utf8_byte_t *buf_p, + lit_utf8_size_t buf_size) +{ + ecma_length_t length = 0; + + lit_utf8_size_t size = 0; + while (size < buf_size) + { + const utf8_char_size char_size = (((buf_size - size) > LIT_UTF8_MAX_BYTES_IN_CODE_POINT) + ? UTF8_ANY_SIZE + : (utf8_char_size) (buf_size - size)); + + lit_utf8_size_t bytes_generated = generate_utf8_char (char_size, buf_p); + + JERRY_ASSERT (lit_is_utf8_string_valid (buf_p, bytes_generated)); + + size += bytes_generated; + buf_p += bytes_generated; + length += (bytes_generated == LIT_UTF8_MAX_BYTES_IN_CODE_POINT) ? 2 : 1; + } + + JERRY_ASSERT (size == buf_size); + + return length; +} -// Subiterations count -#define test_sub_iters 64 int main (int __attr_unused___ argc, @@ -32,7 +113,74 @@ main (int __attr_unused___ argc, mem_init (); - /* test lit_is_utf8_string_valid */ + lit_utf8_byte_t utf8_string[max_bytes_in_string]; + ecma_char_t code_units[max_code_units_in_string]; + lit_utf8_iterator_pos_t saved_positions[max_code_units_in_string]; + + for (int i = 0; i < test_iters; i++) + { + lit_utf8_size_t utf8_string_size = (i == 0) ? 
0 : (lit_utf8_size_t) (rand () % max_bytes_in_string); + ecma_length_t length = generate_utf8_string (utf8_string, utf8_string_size); + + JERRY_ASSERT (lit_utf8_string_length (utf8_string, utf8_string_size) == length); + + lit_utf8_iterator_t iter = lit_utf8_iterator_create (utf8_string, utf8_string_size); + ecma_length_t calculated_length = 0; + + ecma_length_t code_units_count = 0; + while (!lit_utf8_iterator_is_eos (&iter)) + { + code_units[code_units_count] = lit_utf8_iterator_peek_next (&iter); + saved_positions[code_units_count] = lit_utf8_iterator_get_pos (&iter); + code_units_count++; + calculated_length++; + + lit_utf8_iterator_incr (&iter); + } + + JERRY_ASSERT (length == calculated_length); + + if (code_units_count > 0) + { + for (int j = 0; j < test_subiters; j++) + { + ecma_length_t index = (ecma_length_t) rand () % code_units_count; + lit_utf8_iterator_seek (&iter, saved_positions[index]); + JERRY_ASSERT (lit_utf8_iterator_peek_next (&iter) == code_units[index]); + JERRY_ASSERT (lit_utf8_iterator_get_index (&iter) == index); + } + } + + lit_utf8_iterator_seek_eos (&iter); + while (!lit_utf8_iterator_is_bos (&iter)) + { + JERRY_ASSERT (code_units_count > 0); + calculated_length--; + JERRY_ASSERT (code_units[calculated_length] == lit_utf8_iterator_peek_prev (&iter)); + lit_utf8_iterator_decr (&iter); + } + + JERRY_ASSERT (calculated_length == 0); + + while (!lit_utf8_iterator_is_eos (&iter)) + { + ecma_char_t code_unit = lit_utf8_iterator_read_next (&iter); + JERRY_ASSERT (code_unit == code_units[calculated_length]); + calculated_length++; + } + + JERRY_ASSERT (length == calculated_length); + + while (!lit_utf8_iterator_is_bos (&iter)) + { + JERRY_ASSERT (code_units_count > 0); + calculated_length--; + JERRY_ASSERT (code_units[calculated_length] == lit_utf8_iterator_read_prev (&iter)); + } + + JERRY_ASSERT (calculated_length == 0); + } + /* Overlong-encoded code point */ lit_utf8_byte_t invalid_utf8_string_1[] = {0xC0, 0x82}; JERRY_ASSERT 
(!lit_is_utf8_string_valid (invalid_utf8_string_1, sizeof (invalid_utf8_string_1))); @@ -53,14 +201,12 @@ main (int __attr_unused___ argc, lit_utf8_byte_t valid_utf8_string_2[] = {0xF1, 0x90, 0x9F, 0xB0}; JERRY_ASSERT (lit_is_utf8_string_valid (valid_utf8_string_2, sizeof (valid_utf8_string_2))); - /* test lit_read_code_point_from_utf8 */ lit_utf8_byte_t buf[] = {0xF0, 0x90, 0x8D, 0x88}; lit_code_point_t code_point; lit_utf8_size_t bytes_count = lit_read_code_point_from_utf8 (buf, sizeof (buf), &code_point); JERRY_ASSERT (bytes_count == 4); JERRY_ASSERT (code_point == 0x10348); - /* test lit_code_unit_to_utf8 */ lit_utf8_byte_t res_buf[3]; lit_utf8_size_t res_size; @@ -79,14 +225,13 @@ main (int __attr_unused___ argc, JERRY_ASSERT (res_buf[1] == 0x9F); JERRY_ASSERT (res_buf[2] == 0xBF); - /* test lit_utf8_iterator */ lit_utf8_byte_t bytes[] = {0xF0, 0x90, 0x8D, 0x88}; lit_utf8_iterator_t iter = lit_utf8_iterator_create (bytes, sizeof (bytes)); - ecma_char_t code_unit = lit_utf8_iterator_read_code_unit_and_increment (&iter); - JERRY_ASSERT (!lit_utf8_iterator_reached_buffer_end (&iter)); + ecma_char_t code_unit = lit_utf8_iterator_read_next (&iter); + JERRY_ASSERT (!lit_utf8_iterator_is_eos (&iter)); JERRY_ASSERT (code_unit == 0xD800); - code_unit = lit_utf8_iterator_read_code_unit_and_increment (&iter); - JERRY_ASSERT (lit_utf8_iterator_reached_buffer_end (&iter)); + code_unit = lit_utf8_iterator_read_next (&iter); + JERRY_ASSERT (lit_utf8_iterator_is_eos (&iter)); JERRY_ASSERT (code_unit == 0xDF48); mem_finalize (true);