Rearrange the json_tokener_state_escape_unicode case in json_tokener to simplify the code slightly and make it a bit easier to understand.

While here, drop the utf8_replacement_char that is unnecessarily added if we run out of input in the middle of a unicode escape.  No other functional changes (yet).
This commit is contained in:
Eric Haszlakiewicz
2020-06-21 03:10:55 +00:00
parent 50179fb09f
commit 36118b681e
2 changed files with 172 additions and 165 deletions

View File

@@ -223,7 +223,7 @@ struct json_object *json_tokener_parse_verbose(const char *str, enum json_tokene
/* PEEK_CHAR(dest, tok) macro:
* Peeks at the current char and stores it in dest.
* Returns 1 on success, sets tok->err and returns 0 if no more chars.
* Implicit inputs: str, len vars
* Implicit inputs: str, len, nBytesp vars
*/
#define PEEK_CHAR(dest, tok) \
(((tok)->char_offset == len) \
@@ -633,15 +633,33 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
/* Handle a 4-byte sequence, or two sequences if a surrogate pair */
while (1)
{
if (c && strchr(json_hex_chars, c))
if (!c || !strchr(json_hex_chars, c))
{
tok->ucs_char += ((unsigned int)jt_hexdigit(c)
<< ((3 - tok->st_pos++) * 4));
if (tok->st_pos == 4)
tok->err = json_tokener_error_parse_string;
goto out;
}
tok->ucs_char |= ((unsigned int)jt_hexdigit(c)
<< ((3 - tok->st_pos) * 4));
tok->st_pos++;
if (tok->st_pos < 4)
{
unsigned char unescaped_utf[4];
ADVANCE_CHAR(str, tok);
if (!PEEK_CHAR(c, tok))
{
/*
* We're out of characters in the current call to
* json_tokener_parse(), but a subsequent call might
* provide us with more, so leave our current state
* as-is (including tok->high_surrogate) and return.
*/
goto out;
}
continue;
}
if (tok->got_hi_surrogate)
/* Now, we have a full \uNNNN sequence in tok->ucs_char */
if (tok->high_surrogate)
{
if (IS_LOW_SURROGATE(tok->ucs_char))
{
@@ -658,29 +676,32 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
/* Recalculate the ucs_char, then fall thru to process normally */
tok->ucs_char =
DECODE_SURROGATE_PAIR(
tok->got_hi_surrogate,
tok->high_surrogate,
tok->ucs_char);
}
else
{
/* Hi surrogate was not followed by a low surrogate */
/* Replace the hi and process the rest normally */
/* High surrogate was not followed by a low surrogate
* Replace the high and process the rest normally
*/
printbuf_memappend_fast(
tok->pb,
(char *)utf8_replacement_char,
3);
}
tok->got_hi_surrogate = 0;
tok->high_surrogate = 0;
}
if (tok->ucs_char < 0x80)
{
unsigned char unescaped_utf[1];
unescaped_utf[0] = tok->ucs_char;
printbuf_memappend_fast(
tok->pb, (char *)unescaped_utf, 1);
}
else if (tok->ucs_char < 0x800)
{
unsigned char unescaped_utf[2];
unescaped_utf[0] =
0xc0 | (tok->ucs_char >> 6);
unescaped_utf[1] =
@@ -691,10 +712,10 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
else if (IS_HIGH_SURROGATE(tok->ucs_char))
{
/* Got a high surrogate. Remember it and look for
* the beginning of another sequence, which
* the beginning of another \uNNNN sequence, which
* should be the low surrogate.
*/
tok->got_hi_surrogate = tok->ucs_char;
tok->high_surrogate = tok->ucs_char;
/* Not at end, and the next two chars should be "\u" */
if ((len == -1 ||
len > (tok->char_offset + 2)) &&
@@ -737,7 +758,7 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
else
{
/* Got a high surrogate without another sequence following
* it. Put a replacement char in for the hi surrogate
* it. Put a replacement char in for the high surrogate
* and pretend we finished.
*/
printbuf_memappend_fast(
@@ -755,6 +776,7 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
}
else if (tok->ucs_char < 0x10000)
{
unsigned char unescaped_utf[3];
unescaped_utf[0] =
0xe0 | (tok->ucs_char >> 12);
unescaped_utf[1] =
@@ -766,6 +788,7 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
}
else if (tok->ucs_char < 0x110000)
{
unsigned char unescaped_utf[4];
unescaped_utf[0] =
0xf0 | ((tok->ucs_char >> 18) & 0x07);
unescaped_utf[1] =
@@ -784,26 +807,10 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
tok->pb, (char *)utf8_replacement_char,
3);
}
state = saved_state;
state = saved_state; // i.e. _state_string or _object_field
break;
}
}
else
{
tok->err = json_tokener_error_parse_string;
goto out;
}
if (!ADVANCE_CHAR(str, tok) || !PEEK_CHAR(c, tok))
{
/* Clean up any pending chars */
if (tok->got_hi_surrogate &&
strcmp(tok->pb->buf, (char *)utf8_replacement_char))
printbuf_memappend_fast(
tok->pb, (char *)utf8_replacement_char, 3);
goto out;
}
}
}
break;
case json_tokener_state_boolean:

View File

@@ -111,7 +111,7 @@ struct json_tokener
* @deprecated See json_tokener_get_error() instead.
*/
enum json_tokener_error err;
unsigned int ucs_char, got_hi_surrogate;
unsigned int ucs_char, high_surrogate;
char quote_char;
struct json_tokener_srec *stack;
int flags;