Skip to content

Commit bcedc90

Browse files
committed
Add \u parse support for the JSON object. Buffer overrun issues were fixed as well.
JerryScript-DCO-1.0-Signed-off-by: Zoltan Herczeg [email protected]
1 parent bbfddea commit bcedc90

File tree

6 files changed

+144
-59
lines changed

6 files changed

+144
-59
lines changed

jerry-core/ecma/builtin-objects/ecma-builtin-global.cpp

Lines changed: 11 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include "ecma-helpers.h"
2525
#include "ecma-try-catch-macro.h"
2626
#include "jrt.h"
27+
#include "lit-char-helpers.h"
2728
#include "lit-magic-strings.h"
2829
#include "lit-strings.h"
2930
#include "vm.h"
@@ -517,53 +518,6 @@ static uint8_t unescaped_uri_component_set[16] =
517518
*/
518519
#define URI_ENCODED_BYTE_SIZE (3)
519520

520-
#define ECMA_BUILTIN_HEX_TO_BYTE_ERROR (0x100)
521-
522-
/**
523-
* Helper function to decode a '%XX' sequence (a percent sign followed by two hexadecimal digits) into a byte.
524-
*
525-
* @return the decoded byte value
526-
* It returns with ECMA_BUILTIN_HEX_TO_BYTE_ERROR if the '%' is missing or a non-hexadecimal character is found.
527-
*/
528-
static uint32_t
529-
ecma_builtin_global_object_hex_to_byte (lit_utf8_byte_t *source_p) /**< source string */
530-
{
531-
uint32_t decoded_byte = 0;
532-
533-
/*
534-
* The caller is expected to pass a zero terminated string, so no length check is done here:
535-
* a terminator byte fails the checks below like any other unexpected character.
*/
536-
if (*source_p != '%')
537-
{
538-
return ECMA_BUILTIN_HEX_TO_BYTE_ERROR;
539-
}
540-
541-
for (lit_utf8_size_t i = 0; i < 2; i++)
542-
{
543-
source_p++;
544-
decoded_byte <<= 4; /* make room for the next 4-bit digit */
545-
546-
if (*source_p >= '0' && *source_p <= '9')
547-
{
548-
decoded_byte |= (uint32_t) (*source_p - '0');
549-
}
550-
else if (*source_p >= 'a' && *source_p <= 'f')
551-
{
552-
decoded_byte |= (uint32_t) (*source_p - ('a' - 10)); /* 'a' maps to 10 */
553-
}
554-
else if (*source_p >= 'A' && *source_p <= 'F')
555-
{
556-
decoded_byte |= (uint32_t) (*source_p - ('A' - 10)); /* 'A' maps to 10 */
557-
}
558-
else
559-
{
560-
return ECMA_BUILTIN_HEX_TO_BYTE_ERROR;
561-
}
562-
}
563-
564-
return decoded_byte;
565-
} /* ecma_builtin_global_object_hex_to_byte */
566-
567521
/**
568522
* Helper function to decode URI.
569523
*
@@ -586,12 +540,13 @@ ecma_builtin_global_object_decode_uri_helper (ecma_value_t uri __attr_unused___,
586540
lit_utf8_size_t input_size = ecma_string_get_size (input_string_p);
587541

588542
MEM_DEFINE_LOCAL_ARRAY (input_start_p,
589-
input_size,
543+
input_size + 1,
590544
lit_utf8_byte_t);
591545

592546
ecma_string_to_utf8_string (input_string_p,
593547
input_start_p,
594548
(ssize_t) (input_size));
549+
input_start_p[input_size] = LIT_BYTE_NULL;
595550

596551
lit_utf8_byte_t *input_char_p = input_start_p;
597552
lit_utf8_byte_t *input_end_p = input_start_p + input_size;
@@ -616,8 +571,9 @@ ecma_builtin_global_object_decode_uri_helper (ecma_value_t uri __attr_unused___,
616571
continue;
617572
}
618573

619-
uint32_t decoded_byte = ecma_builtin_global_object_hex_to_byte (input_char_p);
620-
if (decoded_byte == ECMA_BUILTIN_HEX_TO_BYTE_ERROR)
574+
lit_code_point_t decoded_byte;
575+
576+
if (!lit_read_code_point_from_hex (input_char_p + 1, 2, &decoded_byte))
621577
{
622578
ret_value = ecma_make_throw_obj_completion_value (ecma_new_standard_error (ECMA_ERROR_URI));
623579
break;
@@ -667,7 +623,9 @@ ecma_builtin_global_object_decode_uri_helper (ecma_value_t uri __attr_unused___,
667623
continue;
668624
}
669625

670-
uint32_t decoded_byte = ecma_builtin_global_object_hex_to_byte (input_char_p);
626+
lit_code_point_t decoded_byte;
627+
628+
lit_read_code_point_from_hex (input_char_p + 1, 2, &decoded_byte);
671629
input_char_p += URI_ENCODED_BYTE_SIZE;
672630

673631
if (decoded_byte <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
@@ -704,7 +662,8 @@ ecma_builtin_global_object_decode_uri_helper (ecma_value_t uri __attr_unused___,
704662
ecma_char_t character = lit_utf8_iterator_read_next (&characters);
705663

706664
/* Surrogate fragments are allowed in JS, but not accepted by URI decoding. */
707-
if (character >= LIT_UTF16_HIGH_SURROGATE_MIN && character <= LIT_UTF16_LOW_SURROGATE_MAX)
665+
if (lit_is_code_unit_low_surrogate (character)
666+
|| lit_is_code_unit_high_surrogate (character))
708667
{
709668
valid_utf8 = false;
710669
break;

jerry-core/ecma/builtin-objects/ecma-builtin-json.cpp

Lines changed: 72 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,11 @@ ecma_builtin_json_parse_string (ecma_json_token_t *token_p) /**< token argument
143143
{
144144
break;
145145
}
146+
case 'b':
147+
{
148+
*current_p = '\b';
149+
break;
150+
}
146151
case 'f':
147152
{
148153
*current_p = '\f';
@@ -163,10 +168,19 @@ ecma_builtin_json_parse_string (ecma_json_token_t *token_p) /**< token argument
163168
*current_p = '\t';
164169
break;
165170
}
166-
case 'b':
171+
case 'u':
167172
{
168-
*current_p = '\b';
169-
break;
173+
lit_code_point_t code_point;
174+
175+
if (!(lit_read_code_point_from_hex (current_p + 1, 4, &code_point)))
176+
{
177+
return;
178+
}
179+
180+
current_p += 5;
181+
write_p += lit_code_point_to_utf8 (code_point, write_p);
182+
continue;
183+
/* FALLTHRU */
170184
}
171185
default:
172186
{
@@ -177,6 +191,57 @@ ecma_builtin_json_parse_string (ecma_json_token_t *token_p) /**< token argument
177191
*write_p++ = *current_p++;
178192
}
179193

194+
/*
195+
* Post processing surrogate pairs.
196+
*
197+
* The general issue is, that surrogate fragments can come from
198+
* the original stream and can be constructed by \u sequences
199+
* as well. We need to construct code points from them.
200+
*
201+
* Example: JSON.parse ('"\\ud801\udc00"') === "\ud801\udc00"
202+
* The first \u is parsed by JSON, the second is by the lexer.
203+
*
204+
* The rewrite happens in-place, since the write pointer always
205+
* precedes the read pointer. We also cannot create a UTF8 iterator,
206+
* because the lit_is_utf8_string_valid assertion may fail.
207+
*/
208+
209+
lit_utf8_byte_t *read_p = token_p->u.string.start_p;
210+
lit_utf8_byte_t *read_end_p = write_p;
211+
write_p = read_p;
212+
213+
while (read_p < read_end_p)
214+
{
215+
lit_code_point_t code_point;
216+
read_p += lit_read_code_point_from_utf8 (read_p,
217+
(lit_utf8_size_t) (read_end_p - read_p),
218+
&code_point);
219+
220+
/* The lit_is_code_unit_high_surrogate expects ecma_char_t argument
221+
so code_points above maximum UTF16 code unit must not be tested. */
222+
if (read_p < read_end_p
223+
&& code_point <= LIT_UTF16_CODE_UNIT_MAX
224+
&& lit_is_code_unit_high_surrogate ((ecma_char_t) code_point))
225+
{
226+
lit_code_point_t next_code_point;
227+
lit_utf8_size_t next_code_point_size = lit_read_code_point_from_utf8 (read_p,
228+
(lit_utf8_size_t) (read_end_p - read_p),
229+
&next_code_point);
230+
231+
if (next_code_point <= LIT_UTF16_CODE_UNIT_MAX
232+
&& lit_is_code_unit_low_surrogate ((ecma_char_t) next_code_point))
233+
{
234+
code_point = lit_convert_surrogate_pair_to_code_point ((ecma_char_t) code_point,
235+
(ecma_char_t) next_code_point);
236+
read_p += next_code_point_size;
237+
}
238+
}
239+
write_p += lit_code_point_to_utf8 (code_point, write_p);
240+
}
241+
242+
JERRY_ASSERT (lit_is_utf8_string_valid (token_p->u.string.start_p,
243+
(lit_utf8_size_t) (write_p - token_p->u.string.start_p)));
244+
180245
token_p->u.string.size = (lit_utf8_size_t) (write_p - token_p->u.string.start_p);
181246
token_p->current_p = current_p + 1;
182247
token_p->type = string_token;
@@ -757,17 +822,17 @@ ecma_builtin_json_parse (ecma_value_t this_arg __attr_unused___, /**< 'this' arg
757822
ret_value);
758823

759824
ecma_string_t *string_p = ecma_get_string_from_value (string);
760-
ecma_length_t length = (uint32_t) ecma_string_get_length (string_p);
761-
size_t buffer_size = sizeof (lit_utf8_byte_t) * (length + 1);
825+
ecma_length_t string_size = (uint32_t) ecma_string_get_size (string_p);
826+
size_t buffer_size = sizeof (lit_utf8_byte_t) * (string_size + 1);
762827

763828
MEM_DEFINE_LOCAL_ARRAY (str_start_p, buffer_size, lit_utf8_byte_t);
764829

765830
ecma_string_to_utf8_string (string_p, str_start_p, (ssize_t) buffer_size);
766-
str_start_p[length] = LIT_BYTE_NULL;
831+
str_start_p[string_size] = LIT_BYTE_NULL;
767832

768833
ecma_json_token_t token;
769834
token.current_p = str_start_p;
770-
token.end_p = str_start_p + length;
835+
token.end_p = str_start_p + string_size;
771836

772837
ecma_value_t final_result = ecma_builtin_json_parse_value (&token);
773838

jerry-core/lit/lit-char-helpers.cpp

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -312,6 +312,52 @@ lit_char_hex_to_int (ecma_char_t c) /**< code unit, corresponding to
312312
}
313313
} /* lit_char_hex_to_int */
314314

315+
/**
316+
* Parse the next number_of_characters hexadecimal characters,
317+
* and construct a code point from them (most significant digit first). The buffer must
318+
* be zero terminated; NOTE(review): the terminator is expected to fail the hexadecimal checks, so a short buffer is not overrun.
319+
*
320+
* @return true if decoding was successful (the result is stored in out_code_point_p), false otherwise (out_code_point_p is not written)
321+
*/
322+
bool
323+
lit_read_code_point_from_hex (const lit_utf8_byte_t *buf_p, /**< buffer with characters */
324+
lit_utf8_size_t number_of_characters, /**< number of characters to be read */
325+
lit_code_point_t *out_code_point_p) /**< @out: decoded result */
326+
{
327+
lit_code_point_t code_point = 0;
328+
329+
JERRY_ASSERT (number_of_characters >= 2 && number_of_characters <= 4); /* only 2 to 4 digits are supported */
330+
331+
for (lit_utf8_size_t i = 0; i < number_of_characters; i++)
332+
{
333+
code_point <<= 4; /* make room for the next 4-bit digit */
334+
335+
if (*buf_p >= LIT_CHAR_ASCII_DIGITS_BEGIN
336+
&& *buf_p <= LIT_CHAR_ASCII_DIGITS_END)
337+
{
338+
code_point |= (uint32_t) (*buf_p - LIT_CHAR_ASCII_DIGITS_BEGIN);
339+
}
340+
else if (*buf_p >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_BEGIN
341+
&& *buf_p <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_END)
342+
{
343+
code_point |= (uint32_t) (*buf_p - (LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_BEGIN - 10)); /* first letter maps to 10 */
344+
}
345+
else if (*buf_p >= LIT_CHAR_ASCII_UPPERCASE_LETTERS_HEX_BEGIN
346+
&& *buf_p <= LIT_CHAR_ASCII_UPPERCASE_LETTERS_HEX_END)
347+
{
348+
code_point |= (uint32_t) (*buf_p - (LIT_CHAR_ASCII_UPPERCASE_LETTERS_HEX_BEGIN - 10)); /* first letter maps to 10 */
349+
}
350+
else
351+
{
352+
return false; /* not a hexadecimal character: the output argument is left untouched */
353+
}
354+
355+
buf_p++;
356+
}
357+
*out_code_point_p = code_point;
358+
return true;
359+
} /* lit_read_code_point_from_hex */
360+
315361
/**
316362
* Check if specified character is a word character (part of IsWordChar abstract operation)
317363
*

jerry-core/lit/lit-char-helpers.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,9 @@ extern bool lit_char_is_decimal_digit (ecma_char_t);
210210
extern bool lit_char_is_hex_digit (ecma_char_t);
211211
extern uint32_t lit_char_hex_to_int (ecma_char_t);
212212

213+
/* read a hex encoded code point from a zero terminated buffer */
214+
bool lit_read_code_point_from_hex (const lit_utf8_byte_t *, lit_utf8_size_t, lit_code_point_t *);
215+
213216
/**
214217
* Null character
215218
*/

jerry-core/lit/lit-strings.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ lit_is_utf8_string_valid (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string *
7373
lit_utf8_byte_t c = utf8_buf_p[idx++];
7474
if ((c & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER)
7575
{
76+
is_prev_code_point_high_surrogate = false;
7677
continue;
7778
}
7879

tests/jerry/json-parse.js

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,14 @@ str = '"str"';
4040
assert (JSON.parse (str) == "str");
4141
str = '"\\b\\f\\n\\t\\r"'
4242
assert (JSON.parse (str) === "\b\f\n\t\r");
43+
/* Note: \u is parsed by the lexer, \\u is by the JSON parser. */
44+
str = '"\\u0000\\u001f"';
45+
assert (JSON.parse (str) === "\x00\x1f");
46+
str = '"\\ud801\\udc00\\ud801\udc00\ud801\\udc00\ud801\udc00"';
47+
assert (JSON.parse (str) === "\ud801\udc00\ud801\udc00\ud801\udc00\ud801\udc00");
48+
/* These surrogates do not form valid surrogate pairs. */
49+
str = '"\\ud801,\\udc00,\\ud801,\udc00,\ud801,\\udc00,\ud801,\udc00"';
50+
assert (JSON.parse (str) === "\ud801,\udc00,\ud801,\udc00,\ud801,\udc00,\ud801,\udc00");
4351

4452
check_parse_error ('undefined');
4553
check_parse_error ('falses');
@@ -52,6 +60,9 @@ check_parse_error ('3e+a');
5260
check_parse_error ('55e4,');
5361
check_parse_error ('5 true');
5462
check_parse_error ("'str'");
63+
check_parse_error ('\x00');
64+
check_parse_error ('"\x00"');
65+
check_parse_error ('"\x1f"');
5566

5667
// Checking objects
5768
str = ' { "x": 0, "yy": null, "zzz": { "A": 4.0, "BB": { "1": 63e-1 }, "CCC" : false } } ';

0 commit comments

Comments
 (0)