Issue #616: Change the parsing of surrogate pairs in unicode escapes so it uses a couple of additional states instead of assuming the low surrogate is already present, to ensure that we correctly handle various cases of incremental parsing.

2026-06-14 15:09:07 +08:00 · 2020-06-21 18:17:40 +00:00
parent 197e372464
commit a68566bf6a
5 changed files with 172 additions and 172 deletions
--- a/tests/test_parse.c
+++ b/tests/test_parse.c
@@ -68,8 +68,8 @@ static void single_incremental_parse(const char *test_string, int clear_serializ

 	if (strcmp(all_at_once_str, new_str) != 0)
 	{
-		printf("ERROR: failed to parse (%s) in %d byte chunks: %s != %s\n",
-		    test_string, chunksize, all_at_once_str, new_str);
+		printf("ERROR: failed to parse (%s) in %d byte chunks: %s != %s\n", test_string,
+		       chunksize, all_at_once_str, new_str);
 	}
 	json_tokener_free(tok);
 }
@@ -193,8 +193,8 @@ static void test_utf8_parse()
 	// json_tokener_parse doesn't support checking for byte order marks.
 	// It's the responsibility of the caller to detect and skip a BOM.
 	// Both of these checks return null.
-	char* utf8_bom = "\xEF\xBB\xBF";
-	char* utf8_bom_and_chars = "\xEF\xBB\xBF{}";
+	char *utf8_bom = "\xEF\xBB\xBF";
+	char *utf8_bom_and_chars = "\xEF\xBB\xBF{}";
 	single_basic_parse(utf8_bom, 0);
 	single_basic_parse(utf8_bom_and_chars, 0);
 }
@@ -245,7 +245,7 @@ struct incremental_step
 	int char_offset;
 	enum json_tokener_error expected_error;
 	int reset_tokener; /* Set to 1 to call json_tokener_reset() after parsing */
-	int tok_flags; /* JSON_TOKENER_* flags to pass to json_tokener_set_flags() */
+	int tok_flags;     /* JSON_TOKENER_* flags to pass to json_tokener_set_flags() */
 } incremental_steps[] = {

    /* Check that full json messages can be parsed, both w/ and w/o a reset */
@@ -268,7 +268,11 @@ struct incremental_step
    {"\": {\"bar", -1, -1, json_tokener_continue, 0},
    {"\":13}}", -1, -1, json_tokener_success, 1},

-    /* Check the UTF-16 surrogate pair */
+    /* Check the UTF-16 surrogate pair handling in various ways.
+	 * Note: \ud834\udd1e is U+1D11E, Musical Symbol G Clef
+	 * Your terminal may not display these correctly, in particular
+	 *  PuTTY doesn't currently show this character.
+	 */
    /* parse one char at every time */
    {"\"\\", -1, -1, json_tokener_continue, 0},
    {"u", -1, -1, json_tokener_continue, 0},
@@ -296,6 +300,16 @@ struct incremental_step
    {"udd1e\"", -1, -1, json_tokener_success, 1},
    {"\"\\ud834\\u", -1, -1, json_tokener_continue, 0},
    {"dd1e\"", -1, -1, json_tokener_success, 1},
+    {"\"fff \\ud834\\ud", -1, -1, json_tokener_continue, 0},
+    {"d1e bar\"", -1, -1, json_tokener_success, 1},
+    {"\"fff \\ud834\\udd", -1, -1, json_tokener_continue, 0},
+    {"1e bar\"", -1, -1, json_tokener_success, 1},
+
+    /* \ud83d\ude00 is U+1F600, Grinning Face
+	 * Displays fine in PuTTY, though you may need "less -r"
+	 */
+    {"\"fff \\ud83d\\ude", -1, -1, json_tokener_continue, 0},
+    {"00 bar\"", -1, -1, json_tokener_success, 1},

    /* Check that json_tokener_reset actually resets */
    {"{ \"foo", -1, -1, json_tokener_continue, 1},