Merge pull request #531 from dota17/utf8test

validate utf-8 string
2026-06-25 12:29:06 +08:00 · 2020-02-05 21:56:06 -05:00
parent 1934eddf29 787a8b3f1c
commit 0ffdbb2395
4 changed files with 125 additions and 5 deletions
--- a/json_tokener.c
+++ b/json_tokener.c
@@ -83,6 +83,7 @@ static const char* json_tokener_errors[] = {
  "object value separator ',' expected",
  "invalid string sequence",
  "expected comment",
  "invalid utf-8 string",
  "buffer size overflow"
 };
@@ -222,8 +223,12 @@ struct json_object* json_tokener_parse_verbose(const char *str,
    :						\
    (((tok)->err = json_tokener_continue), 0)	\
    ) :						\
-   (((dest) = *str), 1)				\
+   (((tok->flags & JSON_TOKENER_VALIDATE_UTF8) &&   \
-   )
+    (!json_tokener_validate_utf8(*str, nBytesp)))?  \
    ((tok->err = json_tokener_error_parse_utf8_string), 0)  \
    :            \
    (((dest) = *str), 1)				\
   ))
 /* ADVANCE_CHAR() macro:
 *   Increments str & tok->char_offset.
@@ -242,6 +247,9 @@ struct json_object* json_tokener_parse_ex(struct json_tokener *tok,
 {
  struct json_object *obj = NULL;
  char c = '\1';
  unsigned int nBytes = 0;
  unsigned int *nBytesp = &nBytes;
 #ifdef HAVE_USELOCALE
  locale_t oldlocale = uselocale(NULL);
  locale_t newloc;
@@ -948,6 +956,10 @@ struct json_object* json_tokener_parse_ex(struct json_tokener *tok,
  } /* while(PEEK_CHAR) */
 out:
  if ((tok->flags & JSON_TOKENER_VALIDATE_UTF8) && (nBytes != 0))
  {
    tok->err = json_tokener_error_parse_utf8_string;
  }
  if (c &&
     (state == json_tokener_state_finish) &&
     (tok->depth == 0) &&
@@ -985,6 +997,32 @@ struct json_object* json_tokener_parse_ex(struct json_tokener *tok,
  return NULL;
 }
 /*
  * Check a single byte of a UTF-8 byte stream.
  *
  * *nBytes holds the number of continuation bytes still owed by the
  * multi-byte sequence currently in progress (0 when no sequence is open).
  * A lead byte sets it; each continuation byte decrements it.
  *
  * Returns 1 when the byte fits at this position, 0 otherwise.  On a
  * rejected byte *nBytes is left unchanged.
  */
 json_bool json_tokener_validate_utf8(const char c, unsigned int *nBytes)
 {
   const unsigned char byte = (unsigned char)c;

   if (*nBytes != 0)
   {
     /* A sequence is open: only a 10xxxxxx continuation byte is allowed. */
     if ((byte & 0xC0) != 0x80)
       return 0;
     --(*nBytes);
     return 1;
   }

   /* No sequence open: a 7-bit byte stands on its own. */
   if (byte < 0x80)
     return 1;

   /* Otherwise it must be a lead byte; record how many bytes follow. */
   if ((byte & 0xe0) == 0xc0)      /* 110xxxxx */
     *nBytes = 1;
   else if ((byte & 0xf0) == 0xe0) /* 1110xxxx */
     *nBytes = 2;
   else if ((byte & 0xf8) == 0xf0) /* 11110xxx */
     *nBytes = 3;
   else
     return 0;                     /* stray continuation or 11111xxx */

   return 1;
 }
 void json_tokener_set_flags(struct json_tokener *tok, int flags)
 {
 	tok->flags = flags;
--- a/json_tokener.h
+++ b/json_tokener.h
@@ -38,6 +38,7 @@ enum json_tokener_error {
  json_tokener_error_parse_object_value_sep,
  json_tokener_error_parse_string,
  json_tokener_error_parse_comment,
  json_tokener_error_parse_utf8_string,
  json_tokener_error_size
 };
@@ -136,6 +137,17 @@ typedef struct json_tokener json_tokener;
 */
 #define JSON_TOKENER_STRICT  0x01
 /**
 * Make json_tokener_parse_ex() validate its input as UTF-8.
 * When this flag is set, every character is passed to
 * json_tokener_validate_utf8() as it is fetched, before it is parsed.
 * A byte that fails the check, or a multi-byte sequence that is still
 * incomplete when parsing stops, is reported as
 * json_tokener_error_parse_utf8_string.
 *
 * This flag is not set by default.
 *
 * @see json_tokener_set_flags()
 */
 #define JSON_TOKENER_VALIDATE_UTF8  0x10
 /**
 * Given an error previously returned by json_tokener_get_error(),
 * return a human readable description of the error.
@@ -162,6 +174,11 @@ JSON_EXPORT void json_tokener_reset(struct json_tokener *tok);
 JSON_EXPORT struct json_object* json_tokener_parse(const char *str);
 JSON_EXPORT struct json_object* json_tokener_parse_verbose(const char *str, enum json_tokener_error *error);
 /**
 * Validate one byte of a UTF-8 string.
 * Intended to be called once per input byte, in order, with the same
 * counter each time.
 *
 * @param c the next byte of the input
 * @param nBytes in/out count of continuation bytes still expected for the
 *        current multi-byte sequence; 0 when no sequence is in progress
 * @return 1 if the byte is acceptable at this position, 0 if the input
 *         is not valid UTF-8
 */
 json_bool json_tokener_validate_utf8(const char c, unsigned int *nBytes);
 /**
 * Set flags that control how parsing will be done.
 */
--- a/tests/test_parse.c
+++ b/tests/test_parse.c
@@ -355,6 +355,39 @@ struct incremental_step {
 	{ "[1,2,3,]",         -1, 7, json_tokener_error_parse_unexpected, 3 },
 	{ "{\"a\":1,}",         -1, 7, json_tokener_error_parse_unexpected, 3 },
  // utf-8 test
  // ASCII encoding
 	{ "\x22\x31\x32\x33\x61\x73\x63\x24\x25\x26\x22",-1, -1, json_tokener_success, 5 },
 	{ "\x22\x31\x32\x33\x61\x73\x63\x24\x25\x26\x22",-1, -1, json_tokener_success, 1 },
  // utf-8 encoding
 	{ "\x22\xe4\xb8\x96\xe7\x95\x8c\x22",-1, -1, json_tokener_success, 5 },
 	{ "\x22\xe4\xb8",-1, 3, json_tokener_error_parse_utf8_string, 4 },
 	{ "\x96\xe7\x95\x8c\x22",-1, 0, json_tokener_error_parse_utf8_string, 5 },
 	{ "\x22\xe4\xb8\x96\xe7\x95\x8c\x22",-1, -1, json_tokener_success, 1 },
 	{ "\x22\xcf\x80\xcf\x86\x22",-1, -1, json_tokener_success, 5 },
 	{ "\x22\xf0\xa5\x91\x95\x22",-1, -1, json_tokener_success, 5 },
  // wrong utf-8 encoding
 	{ "\x22\xe6\x9d\x4e\x22",-1, 3, json_tokener_error_parse_utf8_string, 5 },
 	{ "\x22\xe6\x9d\x4e\x22",-1, 5, json_tokener_success, 1 },
  // GBK encoding
 	{ "\x22\xc0\xee\xc5\xf4\x22",-1, 2, json_tokener_error_parse_utf8_string, 5 },
 	{ "\x22\xc0\xee\xc5\xf4\x22",-1, 6, json_tokener_success, 1 },
  // char after space
 	{ "\x20\x20\x22\xe4\xb8\x96\x22",-1, -1, json_tokener_success, 5 },
 	{ "\x20\x20\x81\x22\xe4\xb8\x96\x22",-1, 2, json_tokener_error_parse_utf8_string, 5 },
 	{ "\x5b\x20\x81\x31\x5d",-1, 2, json_tokener_error_parse_utf8_string, 5 },
  // char in state inf
 	{ "\x49\x6e\x66\x69\x6e\x69\x74\x79",9, 8, json_tokener_success, 1 },
 	{ "\x49\x6e\x66\x81\x6e\x69\x74\x79",-1, 3, json_tokener_error_parse_utf8_string, 5 },
  // char in escape unicode
 	{ "\x22\x5c\x75\x64\x38\x35\x35\x5c\x75\x64\x63\x35\x35\x22",15, 14, json_tokener_success, 5 },
 	{ "\x22\x5c\x75\x64\x38\x35\x35\xc0\x75\x64\x63\x35\x35\x22",-1, 8, json_tokener_error_parse_utf8_string, 5 },
 	{ "\x22\x5c\x75\x64\x30\x30\x33\x31\xc0\x22",-1, 9, json_tokener_error_parse_utf8_string, 5 },
  // char in number
 	{ "\x31\x31\x81\x31\x31",-1, 2, json_tokener_error_parse_utf8_string, 5 },
  // char in object
 	{ "\x7b\x22\x31\x81\x22\x3a\x31\x7d",-1, 3, json_tokener_error_parse_utf8_string, 5 },
 	{ NULL, -1, -1, json_tokener_success, 0 },
 };
@@ -389,9 +422,19 @@ static void test_incremental_parse()
 		size_t expected_char_offset;
 		if (step->reset_tokener & 2)
-			json_tokener_set_flags(tok, JSON_TOKENER_STRICT);
+			{
 				if (step->reset_tokener & 4)
 					json_tokener_set_flags(tok, 3);
 				else
 					json_tokener_set_flags(tok, JSON_TOKENER_STRICT);
 			}
 		else
-			json_tokener_set_flags(tok, 0);
+			{
 				if (step->reset_tokener & 4)
 					json_tokener_set_flags(tok, JSON_TOKENER_VALIDATE_UTF8);
 				else
 					json_tokener_set_flags(tok, 0);
 			}
 		if (length == -1)
 			length = strlen(step->string_to_parse);
--- a/tests/test_parse.expected
+++ b/tests/test_parse.expected
@@ -183,5 +183,27 @@ json_tokener_parse_ex(tok, [1,2,3,]    ,   8) ... OK: got object of type [array]
 json_tokener_parse_ex(tok, [1,2,,3,]   ,   9) ... OK: got correct error: unexpected character
 json_tokener_parse_ex(tok, [1,2,3,]    ,   8) ... OK: got correct error: unexpected character
 json_tokener_parse_ex(tok, {"a":1,}    ,   8) ... OK: got correct error: unexpected character
-End Incremental Tests OK=105 ERROR=0
+json_tokener_parse_ex(tok, "123asc$%&" ,  11) ... OK: got object of type [string]: "123asc$%&"
 json_tokener_parse_ex(tok, "123asc$%&" ,  11) ... OK: got object of type [string]: "123asc$%&"
 json_tokener_parse_ex(tok, "世界"    ,   8) ... OK: got object of type [string]: "世界"
 json_tokener_parse_ex(tok, "<22><>         ,   3) ... OK: got correct error: invalid utf-8 string
 json_tokener_parse_ex(tok, <20>界"       ,   5) ... OK: got correct error: invalid utf-8 string
 json_tokener_parse_ex(tok, "世界"    ,   8) ... OK: got object of type [string]: "世界"
 json_tokener_parse_ex(tok, "πφ"      ,   6) ... OK: got object of type [string]: "πφ"
 json_tokener_parse_ex(tok, "𥑕"      ,   6) ... OK: got object of type [string]: "𥑕"
 json_tokener_parse_ex(tok, "<22><>N"       ,   5) ... OK: got correct error: invalid utf-8 string
 json_tokener_parse_ex(tok, "<22><>N"       ,   5) ... OK: got object of type [string]: "<22><>N"
 json_tokener_parse_ex(tok, "<22><><EFBFBD><EFBFBD>"      ,   6) ... OK: got correct error: invalid utf-8 string
 json_tokener_parse_ex(tok, "<22><><EFBFBD><EFBFBD>"      ,   6) ... OK: got object of type [string]: "<22><><EFBFBD><EFBFBD>"
 json_tokener_parse_ex(tok,   "世"     ,   7) ... OK: got object of type [string]: "世"
 json_tokener_parse_ex(tok,   <20>"世"    ,   8) ... OK: got correct error: invalid utf-8 string
 json_tokener_parse_ex(tok, [ <20>1]       ,   5) ... OK: got correct error: invalid utf-8 string
 json_tokener_parse_ex(tok, Infinity    ,   9) ... OK: got object of type [double]: Infinity
 json_tokener_parse_ex(tok, Inf<6E>nity    ,   8) ... OK: got correct error: invalid utf-8 string
 json_tokener_parse_ex(tok, "\ud855\udc55",  15) ... OK: got object of type [string]: "𥑕"
 json_tokener_parse_ex(tok, "\ud855<35>udc55",  14) ... OK: got correct error: invalid utf-8 string
 json_tokener_parse_ex(tok, "\ud0031<33>"  ,  10) ... OK: got correct error: invalid utf-8 string
 json_tokener_parse_ex(tok, 11<31>11       ,   5) ... OK: got correct error: invalid utf-8 string
 json_tokener_parse_ex(tok, {"1<>":1}    ,   8) ... OK: got correct error: invalid utf-8 string
 End Incremental Tests OK=127 ERROR=0
 ==================================