Merge pull request #893 from sffc/supplemental-code-point-bug

Fix bug involving supplemental code points that look like high surrogates
2026-06-22 10:59:07 +08:00 · 2025-08-07 09:46:47 -04:00
parent bf92456789 7974657c56
commit 2372e9518e
3 changed files with 16 additions and 3 deletions
--- a/json_tokener.c
+++ b/json_tokener.c
@@ -145,8 +145,8 @@ enum json_tokener_error json_tokener_get_error(struct json_tokener *tok)
 }

 /* Stuff for decoding unicode sequences */
-#define IS_HIGH_SURROGATE(uc) (((uc)&0xFC00) == 0xD800)
-#define IS_LOW_SURROGATE(uc) (((uc)&0xFC00) == 0xDC00)
+#define IS_HIGH_SURROGATE(uc) (((uc)&0xFFFFFC00) == 0xD800)
+#define IS_LOW_SURROGATE(uc) (((uc)&0xFFFFFC00) == 0xDC00)
 #define DECODE_SURROGATE_PAIR(hi, lo) ((((hi)&0x3FF) << 10) + ((lo)&0x3FF) + 0x10000)
 static unsigned char utf8_replacement_char[3] = {0xEF, 0xBF, 0xBD};

--- a/tests/test_parse.c
+++ b/tests/test_parse.c
@@ -113,6 +113,9 @@ static void test_basic_parse(void)
 	single_basic_parse("\"\\udd27\"", 0);
 	// Test with a "short" high surrogate
 	single_basic_parse("[9,'\\uDAD", 0);
+	single_basic_parse("\"[9,'\\uDAD\"", 0);
+	// Test with a supplemental character that looks like a high surrogate
+	single_basic_parse("\"\\uD836\\uDE87\"", 0);
 	single_basic_parse("null", 0);
 	single_basic_parse("NaN", 0);
 	single_basic_parse("-NaN", 0); /* non-sensical, returns null */
@@ -332,6 +335,11 @@ struct incremental_step
    {"{ \"foo", -1, -1, json_tokener_continue, 1, 0},
    {": \"bar\"}", -1, 0, json_tokener_error_parse_unexpected, 1, 0},

+    /* Check a supplemental code point that looks like a high surrogate */
+    {"\"\\uD836", -1, -1, json_tokener_continue, 0, 0},
+    {"\\uDE87", -1, -1, json_tokener_continue, 0, 0},
+    {"\"", -1, -1, json_tokener_success, 1, 0},
+
    /* Check incremental parsing with trailing characters */
    {"{ \"foo", -1, -1, json_tokener_continue, 0, 0},
    {"\": {\"bar", -1, -1, json_tokener_continue, 0, 0},
--- a/tests/test_parse.expected
+++ b/tests/test_parse.expected
@@ -13,6 +13,8 @@ new_obj.to_string("\ud840\u4e16")="�世"
 new_obj.to_string("\ud840")="�"
 new_obj.to_string("\udd27")="�"
 new_obj.to_string([9,'\uDAD)=null
+new_obj.to_string("[9,'\uDAD")=null
+new_obj.to_string("\uD836\uDE87")="𝪇"
 new_obj.to_string(null)=null
 new_obj.to_string(NaN)=NaN
 new_obj.to_string(-NaN)=null
@@ -138,6 +140,9 @@ json_tokener_parse_ex(tok, "ä"        ,   4) ... OK: got object of type [string
 json_tokener_parse_ex(tok, "ä"        ,   4) ... OK: got object of type [string]: "ä"
 json_tokener_parse_ex(tok, { "foo      ,   6) ... OK: got correct error: continue
 json_tokener_parse_ex(tok, : "bar"}    ,   8) ... OK: got correct error: unexpected character
+json_tokener_parse_ex(tok, "\uD836     ,   7) ... OK: got correct error: continue
+json_tokener_parse_ex(tok, \uDE87      ,   6) ... OK: got correct error: continue
+json_tokener_parse_ex(tok, "           ,   1) ... OK: got object of type [string]: "𝪇"
 json_tokener_parse_ex(tok, { "foo      ,   6) ... OK: got correct error: continue
 json_tokener_parse_ex(tok, ": {"bar    ,   8) ... OK: got correct error: continue
 json_tokener_parse_ex(tok, ":13}}XXXX  ,  10) ... OK: got object of type [object]: { "foo": { "bar": 13 } }
@@ -363,5 +368,5 @@ json_tokener_parse_ex(tok, {"":1}     ,   7) ... OK: got correct error: invalid
 json_tokener_parse_ex(tok, {"":1}     ,   7) ... OK: got correct error: invalid string sequence
 json_tokener_parse_ex(tok, {"":1}     ,   7) ... OK: got correct error: invalid string sequence
 json_tokener_parse_ex(tok, {"":1}     ,   7) ... OK: got correct error: invalid string sequence
-json_tokener_parse_ex(tok, {"":1}     ,   7) ... OK: got correct error: invalid string sequence
+json_tokener_parse_ex(tok, {"":1}     ,   7) ... OK: got correct error: invalid string sequence
 json_tokener_parse_ex(tok, {"":1}     ,   7) ... OK: got correct error: invalid string sequence