Rearrange the json_tokener_state_escape_unicode case in json_tokener to simplify the code slightly and make it a bit easier to understand.

While here, drop the utf8_replacement_char that is unnecessarily added if we run out of input in the middle of a unicode escape.  No other functional changes (yet).
This commit is contained in:
Eric Haszlakiewicz
2020-06-21 03:10:55 +00:00
parent 50179fb09f
commit 36118b681e
2 changed files with 172 additions and 165 deletions

View File

@@ -223,7 +223,7 @@ struct json_object *json_tokener_parse_verbose(const char *str, enum json_tokene
/* PEEK_CHAR(dest, tok) macro: /* PEEK_CHAR(dest, tok) macro:
* Peeks at the current char and stores it in dest. * Peeks at the current char and stores it in dest.
* Returns 1 on success, sets tok->err and returns 0 if no more chars. * Returns 1 on success, sets tok->err and returns 0 if no more chars.
* Implicit inputs: str, len vars * Implicit inputs: str, len, nBytesp vars
*/ */
#define PEEK_CHAR(dest, tok) \ #define PEEK_CHAR(dest, tok) \
(((tok)->char_offset == len) \ (((tok)->char_offset == len) \
@@ -633,15 +633,33 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
/* Handle a 4-byte sequence, or two sequences if a surrogate pair */ /* Handle a 4-byte sequence, or two sequences if a surrogate pair */
while (1) while (1)
{ {
if (c && strchr(json_hex_chars, c)) if (!c || !strchr(json_hex_chars, c))
{ {
tok->ucs_char += ((unsigned int)jt_hexdigit(c) tok->err = json_tokener_error_parse_string;
<< ((3 - tok->st_pos++) * 4)); goto out;
if (tok->st_pos == 4) }
tok->ucs_char |= ((unsigned int)jt_hexdigit(c)
<< ((3 - tok->st_pos) * 4));
tok->st_pos++;
if (tok->st_pos < 4)
{ {
unsigned char unescaped_utf[4]; ADVANCE_CHAR(str, tok);
if (!PEEK_CHAR(c, tok))
{
/*
* We're out of characters in the current call to
* json_tokener_parse(), but a subsequent call might
* provide us with more, so leave our current state
* as-is (including tok->high_surrogate) and return.
*/
goto out;
}
continue;
}
if (tok->got_hi_surrogate) /* Now, we have a full \uNNNN sequence in tok->ucs_char */
if (tok->high_surrogate)
{ {
if (IS_LOW_SURROGATE(tok->ucs_char)) if (IS_LOW_SURROGATE(tok->ucs_char))
{ {
@@ -658,29 +676,32 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
/* Recalculate the ucs_char, then fall thru to process normally */ /* Recalculate the ucs_char, then fall thru to process normally */
tok->ucs_char = tok->ucs_char =
DECODE_SURROGATE_PAIR( DECODE_SURROGATE_PAIR(
tok->got_hi_surrogate, tok->high_surrogate,
tok->ucs_char); tok->ucs_char);
} }
else else
{ {
/* Hi surrogate was not followed by a low surrogate */ /* High surrogate was not followed by a low surrogate
/* Replace the hi and process the rest normally */ * Replace the high and process the rest normally
*/
printbuf_memappend_fast( printbuf_memappend_fast(
tok->pb, tok->pb,
(char *)utf8_replacement_char, (char *)utf8_replacement_char,
3); 3);
} }
tok->got_hi_surrogate = 0; tok->high_surrogate = 0;
} }
if (tok->ucs_char < 0x80) if (tok->ucs_char < 0x80)
{ {
unsigned char unescaped_utf[1];
unescaped_utf[0] = tok->ucs_char; unescaped_utf[0] = tok->ucs_char;
printbuf_memappend_fast( printbuf_memappend_fast(
tok->pb, (char *)unescaped_utf, 1); tok->pb, (char *)unescaped_utf, 1);
} }
else if (tok->ucs_char < 0x800) else if (tok->ucs_char < 0x800)
{ {
unsigned char unescaped_utf[2];
unescaped_utf[0] = unescaped_utf[0] =
0xc0 | (tok->ucs_char >> 6); 0xc0 | (tok->ucs_char >> 6);
unescaped_utf[1] = unescaped_utf[1] =
@@ -691,10 +712,10 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
else if (IS_HIGH_SURROGATE(tok->ucs_char)) else if (IS_HIGH_SURROGATE(tok->ucs_char))
{ {
/* Got a high surrogate. Remember it and look for /* Got a high surrogate. Remember it and look for
* the beginning of another sequence, which * the beginning of another \uNNNN sequence, which
* should be the low surrogate. * should be the low surrogate.
*/ */
tok->got_hi_surrogate = tok->ucs_char; tok->high_surrogate = tok->ucs_char;
/* Not at end, and the next two chars should be "\u" */ /* Not at end, and the next two chars should be "\u" */
if ((len == -1 || if ((len == -1 ||
len > (tok->char_offset + 2)) && len > (tok->char_offset + 2)) &&
@@ -737,7 +758,7 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
else else
{ {
/* Got a high surrogate without another sequence following /* Got a high surrogate without another sequence following
* it. Put a replacement char in for the hi surrogate * it. Put a replacement char in for the high surrogate
* and pretend we finished. * and pretend we finished.
*/ */
printbuf_memappend_fast( printbuf_memappend_fast(
@@ -755,6 +776,7 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
} }
else if (tok->ucs_char < 0x10000) else if (tok->ucs_char < 0x10000)
{ {
unsigned char unescaped_utf[3];
unescaped_utf[0] = unescaped_utf[0] =
0xe0 | (tok->ucs_char >> 12); 0xe0 | (tok->ucs_char >> 12);
unescaped_utf[1] = unescaped_utf[1] =
@@ -766,6 +788,7 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
} }
else if (tok->ucs_char < 0x110000) else if (tok->ucs_char < 0x110000)
{ {
unsigned char unescaped_utf[4];
unescaped_utf[0] = unescaped_utf[0] =
0xf0 | ((tok->ucs_char >> 18) & 0x07); 0xf0 | ((tok->ucs_char >> 18) & 0x07);
unescaped_utf[1] = unescaped_utf[1] =
@@ -784,26 +807,10 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
tok->pb, (char *)utf8_replacement_char, tok->pb, (char *)utf8_replacement_char,
3); 3);
} }
state = saved_state; state = saved_state; // i.e. _state_string or _object_field
break; break;
} }
} }
else
{
tok->err = json_tokener_error_parse_string;
goto out;
}
if (!ADVANCE_CHAR(str, tok) || !PEEK_CHAR(c, tok))
{
/* Clean up any pending chars */
if (tok->got_hi_surrogate &&
strcmp(tok->pb->buf, (char *)utf8_replacement_char))
printbuf_memappend_fast(
tok->pb, (char *)utf8_replacement_char, 3);
goto out;
}
}
}
break; break;
case json_tokener_state_boolean: case json_tokener_state_boolean:

View File

@@ -111,7 +111,7 @@ struct json_tokener
* @deprecated See json_tokener_get_error() instead. * @deprecated See json_tokener_get_error() instead.
*/ */
enum json_tokener_error err; enum json_tokener_error err;
unsigned int ucs_char, got_hi_surrogate; unsigned int ucs_char, high_surrogate;
char quote_char; char quote_char;
struct json_tokener_srec *stack; struct json_tokener_srec *stack;
int flags; int flags;