mirror of
https://github.com/json-c/json-c.git
synced 2026-04-08 23:09:07 +08:00
* Add handling of surrogate pairs
git-svn-id: http://svn.metaparadigm.com/svn/json-c/trunk@53 327403b1-1117-474d-bef2-5cb71233fd97
This commit is contained in:
@@ -1,3 +1,5 @@
|
|||||||
|
* Add handling of surrogate pairs (json_tokener.c, test4.c, Makefile.am)
|
||||||
|
Brent Miller, bdmiller at yahoo dash inc dot com
|
||||||
* Correction to comment describing printbuf_memappend in printbuf.h
|
* Correction to comment describing printbuf_memappend in printbuf.h
|
||||||
Brent Miller, bdmiller at yahoo dash inc dot com
|
Brent Miller, bdmiller at yahoo dash inc dot com
|
||||||
|
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ libjson_la_SOURCES = \
|
|||||||
linkhash.c \
|
linkhash.c \
|
||||||
printbuf.c
|
printbuf.c
|
||||||
|
|
||||||
check_PROGRAMS = test1 test2 test3
|
check_PROGRAMS = test1 test2 test3 test4
|
||||||
|
|
||||||
test1_SOURCES = test1.c
|
test1_SOURCES = test1.c
|
||||||
test1_LDADD = $(lib_LTLIBRARIES)
|
test1_LDADD = $(lib_LTLIBRARIES)
|
||||||
@@ -41,3 +41,6 @@ test2_LDADD = $(lib_LTLIBRARIES)
|
|||||||
|
|
||||||
test3_SOURCES = test3.c
|
test3_SOURCES = test3.c
|
||||||
test3_LDADD = $(lib_LTLIBRARIES)
|
test3_LDADD = $(lib_LTLIBRARIES)
|
||||||
|
|
||||||
|
test4_SOURCES = test4.c
|
||||||
|
test4_LDADD = $(lib_LTLIBRARIES)
|
||||||
|
|||||||
102
json_tokener.c
102
json_tokener.c
@@ -58,6 +58,12 @@ const char* json_tokener_errors[] = {
|
|||||||
"expected comment",
|
"expected comment",
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/* Stuff for decoding unicode sequences */
|
||||||
|
#define IS_HIGH_SURROGATE(uc) (((uc) & 0xFC00) == 0xD800)
|
||||||
|
#define IS_LOW_SURROGATE(uc) (((uc) & 0xFC00) == 0xDC00)
|
||||||
|
#define DECODE_SURROGATE_PAIR(hi,lo) ((((hi) & 0x3FF) << 10) + ((lo) & 0x3FF) + 0x10000)
|
||||||
|
static unsigned char utf8_replacement_char[3] = { 0xEF, 0xBF, 0xBD };
|
||||||
|
|
||||||
|
|
||||||
struct json_tokener* json_tokener_new(void)
|
struct json_tokener* json_tokener_new(void)
|
||||||
{
|
{
|
||||||
@@ -176,6 +182,7 @@ char* strndup(const char* str, size_t n)
|
|||||||
#define ADVANCE_CHAR(str, tok) \
|
#define ADVANCE_CHAR(str, tok) \
|
||||||
( ++(str), ((tok)->char_offset)++, c)
|
( ++(str), ((tok)->char_offset)++, c)
|
||||||
|
|
||||||
|
|
||||||
/* End optimization macro defs */
|
/* End optimization macro defs */
|
||||||
|
|
||||||
|
|
||||||
@@ -398,40 +405,97 @@ struct json_object* json_tokener_parse_ex(struct json_tokener *tok,
|
|||||||
break;
|
break;
|
||||||
|
|
||||||
case json_tokener_state_escape_unicode:
|
case json_tokener_state_escape_unicode:
|
||||||
/* Note that the following code is inefficient for handling large
|
|
||||||
* chunks of extended chars, calling printbuf_memappend() once
|
|
||||||
* for each multi-byte character of input.
|
|
||||||
* This is a good area for future optimization.
|
|
||||||
*/
|
|
||||||
{
|
{
|
||||||
/* Advance until we change state */
|
unsigned int got_hi_surrogate = 0;
|
||||||
|
|
||||||
|
/* Handle a 4-byte sequence, or two sequences if a surrogate pair */
|
||||||
while(1) {
|
while(1) {
|
||||||
if(strchr(json_hex_chars, c)) {
|
if(strchr(json_hex_chars, c)) {
|
||||||
tok->ucs_char += ((unsigned int)hexdigit(c) << ((3-tok->st_pos++)*4));
|
tok->ucs_char += ((unsigned int)hexdigit(c) << ((3-tok->st_pos++)*4));
|
||||||
if(tok->st_pos == 4) {
|
if(tok->st_pos == 4) {
|
||||||
unsigned char utf_out[3];
|
unsigned char unescaped_utf[4];
|
||||||
|
|
||||||
|
if (got_hi_surrogate) {
|
||||||
|
if (IS_LOW_SURROGATE(tok->ucs_char)) {
|
||||||
|
/* Recalculate the ucs_char, then fall thru to process normally */
|
||||||
|
tok->ucs_char = DECODE_SURROGATE_PAIR(got_hi_surrogate, tok->ucs_char);
|
||||||
|
} else {
|
||||||
|
/* Hi surrogate was not followed by a low surrogate */
|
||||||
|
/* Replace the hi and process the rest normally */
|
||||||
|
printbuf_memappend_fast(tok->pb, (char*)utf8_replacement_char, 3);
|
||||||
|
}
|
||||||
|
got_hi_surrogate = 0;
|
||||||
|
}
|
||||||
|
|
||||||
if (tok->ucs_char < 0x80) {
|
if (tok->ucs_char < 0x80) {
|
||||||
utf_out[0] = tok->ucs_char;
|
unescaped_utf[0] = tok->ucs_char;
|
||||||
printbuf_memappend_fast(tok->pb, (char*)utf_out, 1);
|
printbuf_memappend_fast(tok->pb, (char*)unescaped_utf, 1);
|
||||||
} else if (tok->ucs_char < 0x800) {
|
} else if (tok->ucs_char < 0x800) {
|
||||||
utf_out[0] = 0xc0 | (tok->ucs_char >> 6);
|
unescaped_utf[0] = 0xc0 | (tok->ucs_char >> 6);
|
||||||
utf_out[1] = 0x80 | (tok->ucs_char & 0x3f);
|
unescaped_utf[1] = 0x80 | (tok->ucs_char & 0x3f);
|
||||||
printbuf_memappend_fast(tok->pb, (char*)utf_out, 2);
|
printbuf_memappend_fast(tok->pb, (char*)unescaped_utf, 2);
|
||||||
|
} else if (IS_HIGH_SURROGATE(tok->ucs_char)) {
|
||||||
|
/* Got a high surrogate. Remember it and look for the
|
||||||
|
* beginning of another sequence, which should be the
|
||||||
|
* low surrogate.
|
||||||
|
*/
|
||||||
|
got_hi_surrogate = tok->ucs_char;
|
||||||
|
/* Not at end, and the next two chars should be "\u" */
|
||||||
|
if ((tok->char_offset+1 != len) &&
|
||||||
|
(tok->char_offset+2 != len) &&
|
||||||
|
(str[1] == '\\') &&
|
||||||
|
(str[2] == 'u'))
|
||||||
|
{
|
||||||
|
ADVANCE_CHAR(str, tok);
|
||||||
|
ADVANCE_CHAR(str, tok);
|
||||||
|
|
||||||
|
/* Advance to the first char of the next sequence and
|
||||||
|
* continue processing with the next sequence.
|
||||||
|
*/
|
||||||
|
if (!ADVANCE_CHAR(str, tok) || !POP_CHAR(c, tok)) {
|
||||||
|
printbuf_memappend_fast(tok->pb, (char*)utf8_replacement_char, 3);
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
tok->ucs_char = 0;
|
||||||
|
tok->st_pos = 0;
|
||||||
|
continue; /* other json_tokener_state_escape_unicode */
|
||||||
|
} else {
|
||||||
|
/* Got a high surrogate without another sequence following
|
||||||
|
* it. Put a replacement char in for the hi surrogate
|
||||||
|
* and pretend we finished.
|
||||||
|
*/
|
||||||
|
printbuf_memappend_fast(tok->pb, (char*)utf8_replacement_char, 3);
|
||||||
|
}
|
||||||
|
} else if (IS_LOW_SURROGATE(tok->ucs_char)) {
|
||||||
|
/* Got a low surrogate not preceded by a high */
|
||||||
|
printbuf_memappend_fast(tok->pb, (char*)utf8_replacement_char, 3);
|
||||||
|
} else if (tok->ucs_char < 0x10000) {
|
||||||
|
unescaped_utf[0] = 0xe0 | (tok->ucs_char >> 12);
|
||||||
|
unescaped_utf[1] = 0x80 | ((tok->ucs_char >> 6) & 0x3f);
|
||||||
|
unescaped_utf[2] = 0x80 | (tok->ucs_char & 0x3f);
|
||||||
|
printbuf_memappend_fast(tok->pb, (char*)unescaped_utf, 3);
|
||||||
|
} else if (tok->ucs_char < 0x110000) {
|
||||||
|
unescaped_utf[0] = 0xf0 | ((tok->ucs_char >> 18) & 0x07);
|
||||||
|
unescaped_utf[1] = 0x80 | ((tok->ucs_char >> 12) & 0x3f);
|
||||||
|
unescaped_utf[2] = 0x80 | ((tok->ucs_char >> 6) & 0x3f);
|
||||||
|
unescaped_utf[3] = 0x80 | (tok->ucs_char & 0x3f);
|
||||||
|
printbuf_memappend_fast(tok->pb, (char*)unescaped_utf, 4);
|
||||||
} else {
|
} else {
|
||||||
utf_out[0] = 0xe0 | (tok->ucs_char >> 12);
|
/* Don't know what we got--insert the replacement char */
|
||||||
utf_out[1] = 0x80 | ((tok->ucs_char >> 6) & 0x3f);
|
printbuf_memappend_fast(tok->pb, (char*)utf8_replacement_char, 3);
|
||||||
utf_out[2] = 0x80 | (tok->ucs_char & 0x3f);
|
}
|
||||||
printbuf_memappend_fast(tok->pb, (char*)utf_out, 3);
|
|
||||||
}
|
|
||||||
state = saved_state;
|
state = saved_state;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
tok->err = json_tokener_error_parse_string;
|
tok->err = json_tokener_error_parse_string;
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
if (!ADVANCE_CHAR(str, tok) || !POP_CHAR(c, tok))
|
if (!ADVANCE_CHAR(str, tok) || !POP_CHAR(c, tok)) {
|
||||||
|
if (got_hi_surrogate) /* Clean up any pending chars */
|
||||||
|
printbuf_memappend_fast(tok->pb, (char*)utf8_replacement_char, 3);
|
||||||
goto out;
|
goto out;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|||||||
44
test4.c
Normal file
44
test4.c
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
/*
|
||||||
|
* gcc -o test4 test4.c -I/home/y/include -L./.libs -ljson
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <json/json_object.h>
|
||||||
|
#include <json/json_tokener.h>
|
||||||
|
|
||||||
|
void print_hex( const unsigned char* s) {
|
||||||
|
const unsigned char *iter = s;
|
||||||
|
unsigned char ch;
|
||||||
|
while ((ch = *iter++) != 0) {
|
||||||
|
if( ',' != ch)
|
||||||
|
printf("%x ", ch);
|
||||||
|
else
|
||||||
|
printf( ",");
|
||||||
|
}
|
||||||
|
printf("\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
int main() {
|
||||||
|
const char *input = "\"\\ud840\\udd26,\\ud840\\udd27,\\ud800\\udd26,\\ud800\\udd27\"";
|
||||||
|
const char *expected = "\xF0\xA0\x84\xA6,\xF0\xA0\x84\xA7,\xF0\x90\x84\xA6,\xF0\x90\x84\xA7";
|
||||||
|
struct json_object *parse_result = json_tokener_parse((char*)input);
|
||||||
|
const char *unjson = json_object_get_string(parse_result);
|
||||||
|
|
||||||
|
printf("input: %s\n", input);
|
||||||
|
|
||||||
|
int strings_match = !strcmp( expected, unjson);
|
||||||
|
if (strings_match) {
|
||||||
|
printf("JSON parse result is correct: %s\n", unjson);
|
||||||
|
printf("PASS\n");
|
||||||
|
return(0);
|
||||||
|
} else {
|
||||||
|
printf("JSON parse result doesn't match expected string\n");
|
||||||
|
printf("expected string bytes: ");
|
||||||
|
print_hex( expected);
|
||||||
|
printf("parsed string bytes: ");
|
||||||
|
print_hex( unjson);
|
||||||
|
printf("FAIL\n");
|
||||||
|
return(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user