Issue #616: Change the parsing of surrogate pairs in unicode escapes so it uses a couple of additional states instead of assuming the low surrogate is already present, to ensure that we correctly handle various cases of incremental parsing.

2026-03-27 08:59:07 +08:00 · 2020-06-21 18:17:40 +00:00
parent 197e372464
commit a68566bf6a
5 changed files with 172 additions and 172 deletions
--- a/tests/test_parse.c
+++ b/tests/test_parse.c
@@ -68,8 +68,8 @@ static void single_incremental_parse(const char *test_string, int clear_serializ

 	if (strcmp(all_at_once_str, new_str) != 0)
 	{
-		printf("ERROR: failed to parse (%s) in %d byte chunks: %s != %s\n",
-		    test_string, chunksize, all_at_once_str, new_str);
+		printf("ERROR: failed to parse (%s) in %d byte chunks: %s != %s\n", test_string,
+		       chunksize, all_at_once_str, new_str);
 	}
 	json_tokener_free(tok);
 }
@@ -193,8 +193,8 @@ static void test_utf8_parse()
 	// json_tokener_parse doesn't support checking for byte order marks.
 	// It's the responsibility of the caller to detect and skip a BOM.
 	// Both of these checks return null.
-	char* utf8_bom = "\xEF\xBB\xBF";
-	char* utf8_bom_and_chars = "\xEF\xBB\xBF{}";
+	char *utf8_bom = "\xEF\xBB\xBF";
+	char *utf8_bom_and_chars = "\xEF\xBB\xBF{}";
 	single_basic_parse(utf8_bom, 0);
 	single_basic_parse(utf8_bom_and_chars, 0);
 }
@@ -245,7 +245,7 @@ struct incremental_step
 	int char_offset;
 	enum json_tokener_error expected_error;
 	int reset_tokener; /* Set to 1 to call json_tokener_reset() after parsing */
-	int tok_flags; /* JSON_TOKENER_* flags to pass to json_tokener_set_flags() */
+	int tok_flags;     /* JSON_TOKENER_* flags to pass to json_tokener_set_flags() */
 } incremental_steps[] = {

    /* Check that full json messages can be parsed, both w/ and w/o a reset */
@@ -268,7 +268,11 @@ struct incremental_step
    {"\": {\"bar", -1, -1, json_tokener_continue, 0},
    {"\":13}}", -1, -1, json_tokener_success, 1},

-    /* Check the UTF-16 surrogate pair */
+    /* Check the UTF-16 surrogate pair handling in various ways.
+	 * Note: \ud834\udd1e is U+1D11E, Musical Symbol G Clef
+	 * Your terminal may not display these correctly, in particular
+	 *  PuTTY doesn't currently show this character.
+	 */
    /* parse one char at every time */
    {"\"\\", -1, -1, json_tokener_continue, 0},
    {"u", -1, -1, json_tokener_continue, 0},
@@ -296,6 +300,16 @@ struct incremental_step
    {"udd1e\"", -1, -1, json_tokener_success, 1},
    {"\"\\ud834\\u", -1, -1, json_tokener_continue, 0},
    {"dd1e\"", -1, -1, json_tokener_success, 1},
+    {"\"fff \\ud834\\ud", -1, -1, json_tokener_continue, 0},
+    {"d1e bar\"", -1, -1, json_tokener_success, 1},
+    {"\"fff \\ud834\\udd", -1, -1, json_tokener_continue, 0},
+    {"1e bar\"", -1, -1, json_tokener_success, 1},
+
+    /* \ud83d\ude00 is U+1F600, Grinning Face
+	 * Displays fine in PuTTY, though you may need "less -r"
+	 */
+    {"\"fff \\ud83d\\ude", -1, -1, json_tokener_continue, 0},
+    {"00 bar\"", -1, -1, json_tokener_success, 1},

    /* Check that json_tokener_reset actually resets */
    {"{ \"foo", -1, -1, json_tokener_continue, 1},
--- a/tests/test_parse.expected
+++ b/tests/test_parse.expected
@@ -124,6 +124,12 @@ json_tokener_parse_ex(tok, "\ud834\    ,   8) ... OK: got correct error: continu
 json_tokener_parse_ex(tok, udd1e"      ,   6) ... OK: got object of type [string]: "𝄞"
 json_tokener_parse_ex(tok, "\ud834\u   ,   9) ... OK: got correct error: continue
 json_tokener_parse_ex(tok, dd1e"       ,   5) ... OK: got object of type [string]: "𝄞"
+json_tokener_parse_ex(tok, "fff \ud834\ud,  14) ... OK: got correct error: continue
+json_tokener_parse_ex(tok, d1e bar"    ,   8) ... OK: got object of type [string]: "fff 𝄞 bar"
+json_tokener_parse_ex(tok, "fff \ud834\udd,  15) ... OK: got correct error: continue
+json_tokener_parse_ex(tok, 1e bar"     ,   7) ... OK: got object of type [string]: "fff 𝄞 bar"
+json_tokener_parse_ex(tok, "fff \ud83d\ude,  15) ... OK: got correct error: continue
+json_tokener_parse_ex(tok, 00 bar"     ,   7) ... OK: got object of type [string]: "fff 😀 bar"
 json_tokener_parse_ex(tok, { "foo      ,   6) ... OK: got correct error: continue
 json_tokener_parse_ex(tok, : "bar"}    ,   8) ... OK: got correct error: unexpected character
 json_tokener_parse_ex(tok, { "foo      ,   6) ... OK: got correct error: continue
@@ -240,5 +246,5 @@ json_tokener_parse_ex(tok, "\ud855
 json_tokener_parse_ex(tok, "\ud0031<33>"  ,  10) ... OK: got correct error: invalid utf-8 string
 json_tokener_parse_ex(tok, 11<31>11       ,   5) ... OK: got correct error: invalid utf-8 string
 json_tokener_parse_ex(tok, {"1<>":1}    ,   8) ... OK: got correct error: invalid utf-8 string
-End Incremental Tests OK=154 ERROR=0
+End Incremental Tests OK=160 ERROR=0
 ==================================