Rearrange the json_tokener_state_escape_unicode case in json_tokener to simplify the code slightly and make it a bit easier to understand.

While here, drop the utf8_replacement_char that is unnecessarily added if we run out of input in the middle of a unicode escape. No other functional changes (yet).
2026-06-22 02:49:06 +08:00 · 2020-06-21 03:10:55 +00:00
parent 50179fb09f
commit 36118b681e
2 changed files with 172 additions and 165 deletions
--- a/json_tokener.c
+++ b/json_tokener.c
@@ -223,7 +223,7 @@ struct json_object *json_tokener_parse_verbose(const char *str, enum json_tokene
 /* PEEK_CHAR(dest, tok) macro:
 *   Peeks at the current char and stores it in dest.
 *   Returns 1 on success, sets tok->err and returns 0 if no more chars.
- *   Implicit inputs:  str, len vars
+ *   Implicit inputs:  str, len, nBytesp vars
 */
 #define PEEK_CHAR(dest, tok)                                                 \
 	(((tok)->char_offset == len)                                         \
@@ -633,15 +633,33 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
 			/* Handle a 4-byte sequence, or two sequences if a surrogate pair */
 			while (1)
 			{
-				if (c && strchr(json_hex_chars, c))
+				if (!c || !strchr(json_hex_chars, c))
 				{
-					tok->ucs_char += ((unsigned int)jt_hexdigit(c)
-					                  << ((3 - tok->st_pos++) * 4));
-					if (tok->st_pos == 4)
+					tok->err = json_tokener_error_parse_string;
+					goto out;
+				}
+				tok->ucs_char |= ((unsigned int)jt_hexdigit(c)
+								  << ((3 - tok->st_pos) * 4));
+				tok->st_pos++;
+				if (tok->st_pos < 4)
 				{
-						unsigned char unescaped_utf[4];
+					ADVANCE_CHAR(str, tok);
+					if (!PEEK_CHAR(c, tok))
+					{
+						/*
+						 * We're out of characters in the current call to
+						 * json_tokener_parse(), but a subsequent call might
+						 * provide us with more, so leave our current state
+						 * as-is (including tok->high_surrogate) and return.
+						 */
+						goto out;
+					}
+					continue;
+				}

-						if (tok->got_hi_surrogate)
+				/* Now, we have a full \uNNNN sequence in tok->ucs_char */
+
+				if (tok->high_surrogate)
 				{
 					if (IS_LOW_SURROGATE(tok->ucs_char))
 					{
@@ -658,29 +676,32 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
 						/* Recalculate the ucs_char, then fall thru to process normally */
 						tok->ucs_char =
 							DECODE_SURROGATE_PAIR(
-								        tok->got_hi_surrogate,
+								tok->high_surrogate,
 								tok->ucs_char);
 					}
 					else
 					{
-								/* Hi surrogate was not followed by a low surrogate */
-								/* Replace the hi and process the rest normally */
+						/* High surrogate was not followed by a low surrogate
+						 * Replace the high and process the rest normally
+						 */
 						printbuf_memappend_fast(
 							tok->pb,
 							(char *)utf8_replacement_char,
 							3);
 					}
-							tok->got_hi_surrogate = 0;
+					tok->high_surrogate = 0;
 				}

 				if (tok->ucs_char < 0x80)
 				{
+					unsigned char unescaped_utf[1];
 					unescaped_utf[0] = tok->ucs_char;
 					printbuf_memappend_fast(
 						tok->pb, (char *)unescaped_utf, 1);
 				}
 				else if (tok->ucs_char < 0x800)
 				{
+					unsigned char unescaped_utf[2];
 					unescaped_utf[0] =
 						0xc0 | (tok->ucs_char >> 6);
 					unescaped_utf[1] =
@@ -691,10 +712,10 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
 				else if (IS_HIGH_SURROGATE(tok->ucs_char))
 				{
 					/* Got a high surrogate.  Remember it and look for
-							 * the beginning of another sequence, which
+					 * the beginning of another \uNNNN sequence, which
 					 * should be the low surrogate.
 					 */
-							tok->got_hi_surrogate = tok->ucs_char;
+					tok->high_surrogate = tok->ucs_char;
 					/* Not at end, and the next two chars should be "\u" */
 					if ((len == -1 ||
 						 len > (tok->char_offset + 2)) &&
@@ -737,7 +758,7 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
 					else
 					{
 						/* Got a high surrogate without another sequence following
-								 * it.  Put a replacement char in for the hi surrogate
+						 * it.  Put a replacement char in for the high surrogate
 						 * and pretend we finished.
 						 */
 						printbuf_memappend_fast(
@@ -755,6 +776,7 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
 				}
 				else if (tok->ucs_char < 0x10000)
 				{
+					unsigned char unescaped_utf[3];
 					unescaped_utf[0] =
 						0xe0 | (tok->ucs_char >> 12);
 					unescaped_utf[1] =
@@ -766,6 +788,7 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
 				}
 				else if (tok->ucs_char < 0x110000)
 				{
+					unsigned char unescaped_utf[4];
 					unescaped_utf[0] =
 						0xf0 | ((tok->ucs_char >> 18) & 0x07);
 					unescaped_utf[1] =
@@ -784,26 +807,10 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
 						tok->pb, (char *)utf8_replacement_char,
 						3);
 				}
-						state = saved_state;
+				state = saved_state; // i.e. _state_string or _object_field
 				break;
 			}
 		}
-				else
-				{
-					tok->err = json_tokener_error_parse_string;
-					goto out;
-				}
-				if (!ADVANCE_CHAR(str, tok) || !PEEK_CHAR(c, tok))
-				{
-					/* Clean up any pending chars */
-					if (tok->got_hi_surrogate &&
-					    strcmp(tok->pb->buf, (char *)utf8_replacement_char))
-						printbuf_memappend_fast(
-						    tok->pb, (char *)utf8_replacement_char, 3);
-					goto out;
-				}
-			}
-		}
 		break;

 		case json_tokener_state_boolean:
--- a/json_tokener.h
+++ b/json_tokener.h
@@ -111,7 +111,7 @@ struct json_tokener
 	 * @deprecated See json_tokener_get_error() instead.
 	 */
 	enum json_tokener_error err;
-	unsigned int ucs_char, got_hi_surrogate;
+	unsigned int ucs_char, high_surrogate;
 	char quote_char;
 	struct json_tokener_srec *stack;
 	int flags;