mirror of
https://github.com/json-c/json-c.git
synced 2026-03-20 21:49:07 +08:00
Issue #616: Change the parsing of surrogate pairs in unicode escapes so it uses a couple of additional states instead of assuming the low surrogate is already present, to ensure that we correctly handle various cases of incremental parsing.
This commit is contained in:
@@ -25,6 +25,8 @@ Other changes
|
||||
Add json_object_array_shrink() and array_list_shrink() functions.
|
||||
* Add json_object_new_array_ext(int) and array_list_new_2(int) to allow
|
||||
arrays to be allocated with the exact size needed, when known.
|
||||
* Parsing of surrogate pairs in unicode escapes now properly handles
|
||||
incremental parsing.
|
||||
|
||||
|
||||
***
|
||||
|
||||
306
json_tokener.c
306
json_tokener.c
@@ -295,7 +295,7 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
|
||||
}
|
||||
#endif
|
||||
|
||||
while (PEEK_CHAR(c, tok))
|
||||
while (PEEK_CHAR(c, tok)) // Note: c might be '\0' !
|
||||
{
|
||||
|
||||
redo_char:
|
||||
@@ -628,9 +628,11 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
|
||||
}
|
||||
break;
|
||||
|
||||
// ===================================================
|
||||
|
||||
case json_tokener_state_escape_unicode:
|
||||
{
|
||||
/* Handle a 4-byte sequence, or two sequences if a surrogate pair */
|
||||
/* Handle a 4-byte \uNNNN sequence, or two sequences if a surrogate pair */
|
||||
while (1)
|
||||
{
|
||||
if (!c || !strchr(json_hex_chars, c))
|
||||
@@ -638,181 +640,153 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
|
||||
tok->err = json_tokener_error_parse_string;
|
||||
goto out;
|
||||
}
|
||||
tok->ucs_char |= ((unsigned int)jt_hexdigit(c)
|
||||
<< ((3 - tok->st_pos) * 4));
|
||||
tok->ucs_char |=
|
||||
((unsigned int)jt_hexdigit(c) << ((3 - tok->st_pos) * 4));
|
||||
tok->st_pos++;
|
||||
if (tok->st_pos < 4)
|
||||
{
|
||||
ADVANCE_CHAR(str, tok);
|
||||
if (!PEEK_CHAR(c, tok))
|
||||
{
|
||||
/*
|
||||
* We're out of characters in the current call to
|
||||
* json_tokener_parse(), but a subsequent call might
|
||||
* provide us with more, so leave our current state
|
||||
* as-is (including tok->high_surrogate) and return.
|
||||
*/
|
||||
goto out;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if (tok->st_pos >= 4)
|
||||
break;
|
||||
|
||||
/* Now, we have a full \uNNNN sequence in tok->ucs_char */
|
||||
|
||||
if (tok->high_surrogate)
|
||||
ADVANCE_CHAR(str, tok);
|
||||
if (!PEEK_CHAR(c, tok))
|
||||
{
|
||||
if (IS_LOW_SURROGATE(tok->ucs_char))
|
||||
{
|
||||
/* remove the utf8_replacement_char */
|
||||
/* which may generate during */
|
||||
/* parsing the high surrogate pair. */
|
||||
if (!strcmp(
|
||||
tok->pb->buf,
|
||||
(char *)
|
||||
utf8_replacement_char))
|
||||
{
|
||||
printbuf_reset(tok->pb);
|
||||
}
|
||||
/* Recalculate the ucs_char, then fall thru to process normally */
|
||||
tok->ucs_char =
|
||||
DECODE_SURROGATE_PAIR(
|
||||
tok->high_surrogate,
|
||||
tok->ucs_char);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* High surrogate was not followed by a low surrogate
|
||||
* Replace the high and process the rest normally
|
||||
*/
|
||||
printbuf_memappend_fast(
|
||||
tok->pb,
|
||||
(char *)utf8_replacement_char,
|
||||
3);
|
||||
}
|
||||
tok->high_surrogate = 0;
|
||||
}
|
||||
|
||||
if (tok->ucs_char < 0x80)
|
||||
{
|
||||
unsigned char unescaped_utf[1];
|
||||
unescaped_utf[0] = tok->ucs_char;
|
||||
printbuf_memappend_fast(
|
||||
tok->pb, (char *)unescaped_utf, 1);
|
||||
}
|
||||
else if (tok->ucs_char < 0x800)
|
||||
{
|
||||
unsigned char unescaped_utf[2];
|
||||
unescaped_utf[0] =
|
||||
0xc0 | (tok->ucs_char >> 6);
|
||||
unescaped_utf[1] =
|
||||
0x80 | (tok->ucs_char & 0x3f);
|
||||
printbuf_memappend_fast(
|
||||
tok->pb, (char *)unescaped_utf, 2);
|
||||
}
|
||||
else if (IS_HIGH_SURROGATE(tok->ucs_char))
|
||||
{
|
||||
/* Got a high surrogate. Remember it and look for
|
||||
* the beginning of another \uNNNN sequence, which
|
||||
* should be the low surrogate.
|
||||
/*
|
||||
* We're out of characters in the current call to
|
||||
* json_tokener_parse(), but a subsequent call might
|
||||
* provide us with more, so leave our current state
|
||||
* as-is (including tok->high_surrogate) and return.
|
||||
*/
|
||||
tok->high_surrogate = tok->ucs_char;
|
||||
/* Not at end, and the next two chars should be "\u" */
|
||||
if ((len == -1 ||
|
||||
len > (tok->char_offset + 2)) &&
|
||||
// str[0] != '0' && // implied by json_hex_chars, above.
|
||||
(str[1] == '\\') && (str[2] == 'u'))
|
||||
{
|
||||
/* Advance through the 16 bit surrogate, and move
|
||||
* on to the next sequence. The next step is to
|
||||
* process the following characters.
|
||||
*/
|
||||
if (!ADVANCE_CHAR(str, tok) ||
|
||||
!ADVANCE_CHAR(str, tok))
|
||||
{
|
||||
printbuf_memappend_fast(
|
||||
tok->pb,
|
||||
(char *)
|
||||
utf8_replacement_char,
|
||||
3);
|
||||
}
|
||||
/* Advance to the first char of the next sequence and
|
||||
* continue processing with the next sequence.
|
||||
*/
|
||||
if (!ADVANCE_CHAR(str, tok) ||
|
||||
!PEEK_CHAR(c, tok))
|
||||
{
|
||||
printbuf_memappend_fast(
|
||||
tok->pb,
|
||||
(char *)
|
||||
utf8_replacement_char,
|
||||
3);
|
||||
tok->ucs_char = 0;
|
||||
tok->st_pos = 0;
|
||||
goto out;
|
||||
}
|
||||
tok->ucs_char = 0;
|
||||
tok->st_pos = 0;
|
||||
/* other json_tokener_state_escape_unicode */
|
||||
continue;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Got a high surrogate without another sequence following
|
||||
* it. Put a replacement char in for the high surrogate
|
||||
* and pretend we finished.
|
||||
*/
|
||||
printbuf_memappend_fast(
|
||||
tok->pb,
|
||||
(char *)utf8_replacement_char,
|
||||
3);
|
||||
}
|
||||
goto out;
|
||||
}
|
||||
else if (IS_LOW_SURROGATE(tok->ucs_char))
|
||||
}
|
||||
tok->st_pos = 0;
|
||||
|
||||
/* Now, we have a full \uNNNN sequence in tok->ucs_char */
|
||||
|
||||
/* If the *previous* sequence was a high surrogate ... */
|
||||
if (tok->high_surrogate)
|
||||
{
|
||||
if (IS_LOW_SURROGATE(tok->ucs_char))
|
||||
{
|
||||
/* Got a low surrogate not preceded by a high */
|
||||
printbuf_memappend_fast(
|
||||
tok->pb, (char *)utf8_replacement_char,
|
||||
3);
|
||||
}
|
||||
else if (tok->ucs_char < 0x10000)
|
||||
{
|
||||
unsigned char unescaped_utf[3];
|
||||
unescaped_utf[0] =
|
||||
0xe0 | (tok->ucs_char >> 12);
|
||||
unescaped_utf[1] =
|
||||
0x80 | ((tok->ucs_char >> 6) & 0x3f);
|
||||
unescaped_utf[2] =
|
||||
0x80 | (tok->ucs_char & 0x3f);
|
||||
printbuf_memappend_fast(
|
||||
tok->pb, (char *)unescaped_utf, 3);
|
||||
}
|
||||
else if (tok->ucs_char < 0x110000)
|
||||
{
|
||||
unsigned char unescaped_utf[4];
|
||||
unescaped_utf[0] =
|
||||
0xf0 | ((tok->ucs_char >> 18) & 0x07);
|
||||
unescaped_utf[1] =
|
||||
0x80 | ((tok->ucs_char >> 12) & 0x3f);
|
||||
unescaped_utf[2] =
|
||||
0x80 | ((tok->ucs_char >> 6) & 0x3f);
|
||||
unescaped_utf[3] =
|
||||
0x80 | (tok->ucs_char & 0x3f);
|
||||
printbuf_memappend_fast(
|
||||
tok->pb, (char *)unescaped_utf, 4);
|
||||
/* Recalculate the ucs_char, then fall thru to process normally */
|
||||
tok->ucs_char = DECODE_SURROGATE_PAIR(tok->high_surrogate,
|
||||
tok->ucs_char);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Don't know what we got--insert the replacement char */
|
||||
printbuf_memappend_fast(
|
||||
tok->pb, (char *)utf8_replacement_char,
|
||||
3);
|
||||
/* High surrogate was not followed by a low surrogate
|
||||
* Replace the high and process the rest normally
|
||||
*/
|
||||
printbuf_memappend_fast(tok->pb,
|
||||
(char *)utf8_replacement_char, 3);
|
||||
}
|
||||
state = saved_state; // i.e. _state_string or _object_field
|
||||
tok->high_surrogate = 0;
|
||||
}
|
||||
|
||||
if (tok->ucs_char < 0x80)
|
||||
{
|
||||
unsigned char unescaped_utf[1];
|
||||
unescaped_utf[0] = tok->ucs_char;
|
||||
printbuf_memappend_fast(tok->pb, (char *)unescaped_utf, 1);
|
||||
}
|
||||
else if (tok->ucs_char < 0x800)
|
||||
{
|
||||
unsigned char unescaped_utf[2];
|
||||
unescaped_utf[0] = 0xc0 | (tok->ucs_char >> 6);
|
||||
unescaped_utf[1] = 0x80 | (tok->ucs_char & 0x3f);
|
||||
printbuf_memappend_fast(tok->pb, (char *)unescaped_utf, 2);
|
||||
}
|
||||
else if (IS_HIGH_SURROGATE(tok->ucs_char))
|
||||
{
|
||||
/*
|
||||
* The next two characters should be \u, HOWEVER,
|
||||
* we can't simply peek ahead here, because the
|
||||
* characters we need might not be passed to us
|
||||
* until a subsequent call to json_tokener_parse.
|
||||
* Instead, transition through a couple of states.
|
||||
* (now):
|
||||
* _escape_unicode => _unicode_need_escape
|
||||
* (see a '\\' char):
|
||||
* _unicode_need_escape => _unicode_need_u
|
||||
* (see a 'u' char):
|
||||
* _unicode_need_u => _escape_unicode
|
||||
* ...and we'll end up back around here.
|
||||
*/
|
||||
tok->high_surrogate = tok->ucs_char;
|
||||
tok->ucs_char = 0;
|
||||
state = json_tokener_state_escape_unicode_need_escape;
|
||||
break;
|
||||
}
|
||||
else if (IS_LOW_SURROGATE(tok->ucs_char))
|
||||
{
|
||||
/* Got a low surrogate not preceded by a high */
|
||||
printbuf_memappend_fast(tok->pb, (char *)utf8_replacement_char, 3);
|
||||
}
|
||||
else if (tok->ucs_char < 0x10000)
|
||||
{
|
||||
unsigned char unescaped_utf[3];
|
||||
unescaped_utf[0] = 0xe0 | (tok->ucs_char >> 12);
|
||||
unescaped_utf[1] = 0x80 | ((tok->ucs_char >> 6) & 0x3f);
|
||||
unescaped_utf[2] = 0x80 | (tok->ucs_char & 0x3f);
|
||||
printbuf_memappend_fast(tok->pb, (char *)unescaped_utf, 3);
|
||||
}
|
||||
else if (tok->ucs_char < 0x110000)
|
||||
{
|
||||
unsigned char unescaped_utf[4];
|
||||
unescaped_utf[0] = 0xf0 | ((tok->ucs_char >> 18) & 0x07);
|
||||
unescaped_utf[1] = 0x80 | ((tok->ucs_char >> 12) & 0x3f);
|
||||
unescaped_utf[2] = 0x80 | ((tok->ucs_char >> 6) & 0x3f);
|
||||
unescaped_utf[3] = 0x80 | (tok->ucs_char & 0x3f);
|
||||
printbuf_memappend_fast(tok->pb, (char *)unescaped_utf, 4);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Don't know what we got--insert the replacement char */
|
||||
printbuf_memappend_fast(tok->pb, (char *)utf8_replacement_char, 3);
|
||||
}
|
||||
state = saved_state; // i.e. _state_string or _state_object_field
|
||||
}
|
||||
break;
|
||||
|
||||
case json_tokener_state_escape_unicode_need_escape:
|
||||
// We get here after processing a high_surrogate
|
||||
// require a '\\' char
|
||||
if (!c || c != '\\')
|
||||
{
|
||||
/* Got a high surrogate without another sequence following
|
||||
* it. Put a replacement char in for the high surrogate
|
||||
* and pop back up to _state_string or _state_object_field.
|
||||
*/
|
||||
printbuf_memappend_fast(tok->pb, (char *)utf8_replacement_char, 3);
|
||||
tok->high_surrogate = 0;
|
||||
tok->ucs_char = 0;
|
||||
tok->st_pos = 0;
|
||||
state = saved_state;
|
||||
goto redo_char;
|
||||
}
|
||||
state = json_tokener_state_escape_unicode_need_u;
|
||||
break;
|
||||
|
||||
case json_tokener_state_escape_unicode_need_u:
|
||||
/* We already had a \ char, check that it's \u */
|
||||
if (!c || c != 'u')
|
||||
{
|
||||
/* Got a high surrogate with some non-unicode escape
|
||||
* sequence following it.
|
||||
* Put a replacement char in for the high surrogate
|
||||
* and handle the escape sequence normally.
|
||||
*/
|
||||
printbuf_memappend_fast(tok->pb, (char *)utf8_replacement_char, 3);
|
||||
tok->high_surrogate = 0;
|
||||
tok->ucs_char = 0;
|
||||
tok->st_pos = 0;
|
||||
state = json_tokener_state_string_escape;
|
||||
goto redo_char;
|
||||
}
|
||||
state = json_tokener_state_escape_unicode;
|
||||
break;
|
||||
|
||||
// ===================================================
|
||||
|
||||
case json_tokener_state_boolean:
|
||||
{
|
||||
int size1, size2;
|
||||
@@ -1146,8 +1120,9 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
|
||||
}
|
||||
break;
|
||||
}
|
||||
if (!ADVANCE_CHAR(str, tok))
|
||||
goto out;
|
||||
(void)ADVANCE_CHAR(str, tok);
|
||||
if (!c) // This is the char *before* advancing
|
||||
break;
|
||||
} /* while(PEEK_CHAR) */
|
||||
|
||||
out:
|
||||
@@ -1156,7 +1131,8 @@ out:
|
||||
tok->err = json_tokener_error_parse_utf8_string;
|
||||
}
|
||||
if (c && (state == json_tokener_state_finish) && (tok->depth == 0) &&
|
||||
(tok->flags & (JSON_TOKENER_STRICT|JSON_TOKENER_ALLOW_TRAILING_CHARS)) == JSON_TOKENER_STRICT)
|
||||
(tok->flags & (JSON_TOKENER_STRICT | JSON_TOKENER_ALLOW_TRAILING_CHARS)) ==
|
||||
JSON_TOKENER_STRICT)
|
||||
{
|
||||
/* unexpected char after JSON data */
|
||||
tok->err = json_tokener_error_parse_unexpected;
|
||||
|
||||
@@ -59,6 +59,8 @@ enum json_tokener_state
|
||||
json_tokener_state_string,
|
||||
json_tokener_state_string_escape,
|
||||
json_tokener_state_escape_unicode,
|
||||
json_tokener_state_escape_unicode_need_escape,
|
||||
json_tokener_state_escape_unicode_need_u,
|
||||
json_tokener_state_boolean,
|
||||
json_tokener_state_number,
|
||||
json_tokener_state_array,
|
||||
|
||||
@@ -68,8 +68,8 @@ static void single_incremental_parse(const char *test_string, int clear_serializ
|
||||
|
||||
if (strcmp(all_at_once_str, new_str) != 0)
|
||||
{
|
||||
printf("ERROR: failed to parse (%s) in %d byte chunks: %s != %s\n",
|
||||
test_string, chunksize, all_at_once_str, new_str);
|
||||
printf("ERROR: failed to parse (%s) in %d byte chunks: %s != %s\n", test_string,
|
||||
chunksize, all_at_once_str, new_str);
|
||||
}
|
||||
json_tokener_free(tok);
|
||||
}
|
||||
@@ -193,8 +193,8 @@ static void test_utf8_parse()
|
||||
// json_tokener_parse doesn't support checking for byte order marks.
|
||||
// It's the responsibility of the caller to detect and skip a BOM.
|
||||
// Both of these checks return null.
|
||||
char* utf8_bom = "\xEF\xBB\xBF";
|
||||
char* utf8_bom_and_chars = "\xEF\xBB\xBF{}";
|
||||
char *utf8_bom = "\xEF\xBB\xBF";
|
||||
char *utf8_bom_and_chars = "\xEF\xBB\xBF{}";
|
||||
single_basic_parse(utf8_bom, 0);
|
||||
single_basic_parse(utf8_bom_and_chars, 0);
|
||||
}
|
||||
@@ -245,7 +245,7 @@ struct incremental_step
|
||||
int char_offset;
|
||||
enum json_tokener_error expected_error;
|
||||
int reset_tokener; /* Set to 1 to call json_tokener_reset() after parsing */
|
||||
int tok_flags; /* JSON_TOKENER_* flags to pass to json_tokener_set_flags() */
|
||||
int tok_flags; /* JSON_TOKENER_* flags to pass to json_tokener_set_flags() */
|
||||
} incremental_steps[] = {
|
||||
|
||||
/* Check that full json messages can be parsed, both w/ and w/o a reset */
|
||||
@@ -268,7 +268,11 @@ struct incremental_step
|
||||
{"\": {\"bar", -1, -1, json_tokener_continue, 0},
|
||||
{"\":13}}", -1, -1, json_tokener_success, 1},
|
||||
|
||||
/* Check the UTF-16 surrogate pair */
|
||||
/* Check the UTF-16 surrogate pair handling in various ways.
|
||||
* Note: \ud834\udd1e is U+1D11E, Musical Symbol G Clef
|
||||
* Your terminal may not display these correctly, in particular
|
||||
* PuTTY doesn't currently show this character.
|
||||
*/
|
||||
/* parse one char at a time */
|
||||
{"\"\\", -1, -1, json_tokener_continue, 0},
|
||||
{"u", -1, -1, json_tokener_continue, 0},
|
||||
@@ -296,6 +300,16 @@ struct incremental_step
|
||||
{"udd1e\"", -1, -1, json_tokener_success, 1},
|
||||
{"\"\\ud834\\u", -1, -1, json_tokener_continue, 0},
|
||||
{"dd1e\"", -1, -1, json_tokener_success, 1},
|
||||
{"\"fff \\ud834\\ud", -1, -1, json_tokener_continue, 0},
|
||||
{"d1e bar\"", -1, -1, json_tokener_success, 1},
|
||||
{"\"fff \\ud834\\udd", -1, -1, json_tokener_continue, 0},
|
||||
{"1e bar\"", -1, -1, json_tokener_success, 1},
|
||||
|
||||
/* \ud83d\ude00 is U+1F600, Grinning Face
|
||||
* Displays fine in PuTTY, though you may need "less -r"
|
||||
*/
|
||||
{"\"fff \\ud83d\\ude", -1, -1, json_tokener_continue, 0},
|
||||
{"00 bar\"", -1, -1, json_tokener_success, 1},
|
||||
|
||||
/* Check that json_tokener_reset actually resets */
|
||||
{"{ \"foo", -1, -1, json_tokener_continue, 1},
|
||||
|
||||
@@ -124,6 +124,12 @@ json_tokener_parse_ex(tok, "\ud834\ , 8) ... OK: got correct error: continu
|
||||
json_tokener_parse_ex(tok, udd1e" , 6) ... OK: got object of type [string]: "𝄞"
|
||||
json_tokener_parse_ex(tok, "\ud834\u , 9) ... OK: got correct error: continue
|
||||
json_tokener_parse_ex(tok, dd1e" , 5) ... OK: got object of type [string]: "𝄞"
|
||||
json_tokener_parse_ex(tok, "fff \ud834\ud, 14) ... OK: got correct error: continue
|
||||
json_tokener_parse_ex(tok, d1e bar" , 8) ... OK: got object of type [string]: "fff 𝄞 bar"
|
||||
json_tokener_parse_ex(tok, "fff \ud834\udd, 15) ... OK: got correct error: continue
|
||||
json_tokener_parse_ex(tok, 1e bar" , 7) ... OK: got object of type [string]: "fff 𝄞 bar"
|
||||
json_tokener_parse_ex(tok, "fff \ud83d\ude, 15) ... OK: got correct error: continue
|
||||
json_tokener_parse_ex(tok, 00 bar" , 7) ... OK: got object of type [string]: "fff 😀 bar"
|
||||
json_tokener_parse_ex(tok, { "foo , 6) ... OK: got correct error: continue
|
||||
json_tokener_parse_ex(tok, : "bar"} , 8) ... OK: got correct error: unexpected character
|
||||
json_tokener_parse_ex(tok, { "foo , 6) ... OK: got correct error: continue
|
||||
@@ -240,5 +246,5 @@ json_tokener_parse_ex(tok, "\ud855
|
||||
json_tokener_parse_ex(tok, "\ud0031<33>" , 10) ... OK: got correct error: invalid utf-8 string
|
||||
json_tokener_parse_ex(tok, 11<31>11 , 5) ... OK: got correct error: invalid utf-8 string
|
||||
json_tokener_parse_ex(tok, {"1<>":1} , 8) ... OK: got correct error: invalid utf-8 string
|
||||
End Incremental Tests OK=154 ERROR=0
|
||||
End Incremental Tests OK=160 ERROR=0
|
||||
==================================
|
||||
|
||||
Reference in New Issue
Block a user