static void utf8_string(void)
{
/*
- * FIXME Current behavior for invalid UTF-8 sequences is
- * incorrect. This test expects current, incorrect results.
- * They're all marked "bug:" below, and are to be replaced by
- * correct ones as the bugs get fixed.
- *
- * The JSON parser rejects some, but not all invalid sequences.
- *
* Problem: we can't easily deal with embedded U+0000. Parsing
* the JSON string "this \\u0000" is fun" yields "this \0 is fun",
* which gets misinterpreted as NUL-terminated "this ". We should
/* Expected unparse output, defaults to @json_in */
const char *json_out;
} test_cases[] = {
- /*
- * Bug markers used here:
- * - bug: not rejected
- * JSON parser fails to reject invalid sequence(s)
- */
-
/* 0 Control characters */
{
/*
{
/* first one beyond Unicode range: U+110000 */
"\xF4\x90\x80\x80",
- "\xF4\x90\x80\x80",
+ NULL,
"\\uFFFD",
},
/* 3 Malformed sequences */
/* 3.1.1 First continuation byte */
{
"\x80",
- "\x80", /* bug: not rejected */
+ NULL,
"\\uFFFD",
},
/* 3.1.2 Last continuation byte */
{
"\xBF",
- "\xBF", /* bug: not rejected */
+ NULL,
"\\uFFFD",
},
/* 3.1.3 2 continuation bytes */
{
"\x80\xBF",
- "\x80\xBF", /* bug: not rejected */
+ NULL,
"\\uFFFD\\uFFFD",
},
/* 3.1.4 3 continuation bytes */
{
"\x80\xBF\x80",
- "\x80\xBF\x80", /* bug: not rejected */
+ NULL,
"\\uFFFD\\uFFFD\\uFFFD",
},
/* 3.1.5 4 continuation bytes */
{
"\x80\xBF\x80\xBF",
- "\x80\xBF\x80\xBF", /* bug: not rejected */
+ NULL,
"\\uFFFD\\uFFFD\\uFFFD\\uFFFD",
},
/* 3.1.6 5 continuation bytes */
{
"\x80\xBF\x80\xBF\x80",
- "\x80\xBF\x80\xBF\x80", /* bug: not rejected */
+ NULL,
"\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD",
},
/* 3.1.7 6 continuation bytes */
{
"\x80\xBF\x80\xBF\x80\xBF",
- "\x80\xBF\x80\xBF\x80\xBF", /* bug: not rejected */
+ NULL,
"\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD",
},
/* 3.1.8 7 continuation bytes */
{
"\x80\xBF\x80\xBF\x80\xBF\x80",
- "\x80\xBF\x80\xBF\x80\xBF\x80", /* bug: not rejected */
+ NULL,
"\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD",
},
/* 3.1.9 Sequence of all 64 possible continuation bytes */
"\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF"
"\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7"
"\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF",
- /* bug: not rejected */
- "\x80\x81\x82\x83\x84\x85\x86\x87"
- "\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F"
- "\x90\x91\x92\x93\x94\x95\x96\x97"
- "\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F"
- "\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7"
- "\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF"
- "\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7"
- "\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF",
- "\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD"
+ NULL,
"\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD"
"\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD"
"\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD"
"\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD"
"\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD"
"\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD"
+ "\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD",
},
/* 3.2 Lonely start characters */
/* 3.2.1 All 32 first bytes of 2-byte sequences, followed by space */
"\xC8 \xC9 \xCA \xCB \xCC \xCD \xCE \xCF "
"\xD0 \xD1 \xD2 \xD3 \xD4 \xD5 \xD6 \xD7 "
"\xD8 \xD9 \xDA \xDB \xDC \xDD \xDE \xDF ",
- NULL, /* bug: accepted partly, see FIXME below */
+ NULL,
"\\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD "
"\\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD "
"\\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD "
{
"\xE0 \xE1 \xE2 \xE3 \xE4 \xE5 \xE6 \xE7 "
"\xE8 \xE9 \xEA \xEB \xEC \xED \xEE \xEF ",
- /* bug: not rejected */
- "\xE0 \xE1 \xE2 \xE3 \xE4 \xE5 \xE6 \xE7 "
- "\xE8 \xE9 \xEA \xEB \xEC \xED \xEE \xEF ",
+ NULL,
"\\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD "
"\\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD ",
},
/* 3.2.3 All 8 first bytes of 4-byte sequences, followed by space */
{
"\xF0 \xF1 \xF2 \xF3 \xF4 \xF5 \xF6 \xF7 ",
- NULL, /* bug: accepted partly, see FIXME below */
+ NULL,
"\\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD ",
},
/* 3.2.4 All 4 first bytes of 5-byte sequences, followed by space */
/* 3.3.2 3-byte sequence with last byte missing (U+0000) */
{
"\xE0\x80",
- "\xE0\x80", /* bug: not rejected */
+ NULL,
"\\uFFFD",
},
/* 3.3.3 4-byte sequence with last byte missing (U+0000) */
{
"\xF0\x80\x80",
- "\xF0\x80\x80", /* bug: not rejected */
+ NULL,
"\\uFFFD",
},
/* 3.3.4 5-byte sequence with last byte missing (U+0000) */
/* 3.3.6 2-byte sequence with last byte missing (U+07FF) */
{
"\xDF",
- "\xDF", /* bug: not rejected */
+ NULL,
"\\uFFFD",
},
/* 3.3.7 3-byte sequence with last byte missing (U+FFFF) */
{
"\xEF\xBF",
- "\xEF\xBF", /* bug: not rejected */
+ NULL,
"\\uFFFD",
},
/* 3.3.8 4-byte sequence with last byte missing (U+1FFFFF) */
{
"\xC0\xE0\x80\xF0\x80\x80\xF8\x80\x80\x80\xFC\x80\x80\x80\x80"
"\xDF\xEF\xBF\xF7\xBF\xBF\xFB\xBF\xBF\xBF\xFD\xBF\xBF\xBF\xBF",
- NULL, /* bug: accepted partly, see FIXME below */
+ NULL,
"\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD"
"\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD",
},
},
{
"\xE0\x80\xAF",
- "\xE0\x80\xAF", /* bug: not rejected */
+ NULL,
"\\uFFFD",
},
{
"\xF0\x80\x80\xAF",
- "\xF0\x80\x80\xAF", /* bug: not rejected */
+ NULL,
"\\uFFFD",
},
{
{
/* \U+07FF */
"\xE0\x9F\xBF",
- "\xE0\x9F\xBF", /* bug: not rejected */
+ NULL,
"\\uFFFD",
},
{
* also 2.2.3
*/
"\xF0\x8F\xBF\xBC",
- "\xF0\x8F\xBF\xBC", /* bug: not rejected */
+ NULL,
"\\uFFFD",
},
{
{
/* \U+0000 */
"\xE0\x80\x80",
- "\xE0\x80\x80", /* bug: not rejected */
+ NULL,
"\\uFFFD",
},
{
/* \U+0000 */
"\xF0\x80\x80\x80",
- "\xF0\x80\x80\x80", /* bug: not rejected */
+ NULL,
"\\uFFFD",
},
{
{
/* \U+D800 */
"\xED\xA0\x80",
- "\xED\xA0\x80", /* bug: not rejected */
+ NULL,
"\\uFFFD",
},
{
/* \U+DB7F */
"\xED\xAD\xBF",
- "\xED\xAD\xBF", /* bug: not rejected */
+ NULL,
"\\uFFFD",
},
{
/* \U+DB80 */
"\xED\xAE\x80",
- "\xED\xAE\x80", /* bug: not rejected */
+ NULL,
"\\uFFFD",
},
{
/* \U+DBFF */
"\xED\xAF\xBF",
- "\xED\xAF\xBF", /* bug: not rejected */
+ NULL,
"\\uFFFD",
},
{
/* \U+DC00 */
"\xED\xB0\x80",
- "\xED\xB0\x80", /* bug: not rejected */
+ NULL,
"\\uFFFD",
},
{
/* \U+DF80 */
"\xED\xBE\x80",
- "\xED\xBE\x80", /* bug: not rejected */
+ NULL,
"\\uFFFD",
},
{
/* \U+DFFF */
"\xED\xBF\xBF",
- "\xED\xBF\xBF", /* bug: not rejected */
+ NULL,
"\\uFFFD",
},
/* 5.2 Paired UTF-16 surrogates */
{
/* \U+D800\U+DC00 */
"\xED\xA0\x80\xED\xB0\x80",
- "\xED\xA0\x80\xED\xB0\x80", /* bug: not rejected */
+ NULL,
"\\uFFFD\\uFFFD",
},
{
/* \U+D800\U+DFFF */
"\xED\xA0\x80\xED\xBF\xBF",
- "\xED\xA0\x80\xED\xBF\xBF", /* bug: not rejected */
+ NULL,
"\\uFFFD\\uFFFD",
},
{
/* \U+DB7F\U+DC00 */
"\xED\xAD\xBF\xED\xB0\x80",
- "\xED\xAD\xBF\xED\xB0\x80", /* bug: not rejected */
+ NULL,
"\\uFFFD\\uFFFD",
},
{
/* \U+DB7F\U+DFFF */
"\xED\xAD\xBF\xED\xBF\xBF",
- "\xED\xAD\xBF\xED\xBF\xBF", /* bug: not rejected */
+ NULL,
"\\uFFFD\\uFFFD",
},
{
/* \U+DB80\U+DC00 */
"\xED\xAE\x80\xED\xB0\x80",
- "\xED\xAE\x80\xED\xB0\x80", /* bug: not rejected */
+ NULL,
"\\uFFFD\\uFFFD",
},
{
/* \U+DB80\U+DFFF */
"\xED\xAE\x80\xED\xBF\xBF",
- "\xED\xAE\x80\xED\xBF\xBF", /* bug: not rejected */
+ NULL,
"\\uFFFD\\uFFFD",
},
{
/* \U+DBFF\U+DC00 */
"\xED\xAF\xBF\xED\xB0\x80",
- "\xED\xAF\xBF\xED\xB0\x80", /* bug: not rejected */
+ NULL,
"\\uFFFD\\uFFFD",
},
{
/* \U+DBFF\U+DFFF */
"\xED\xAF\xBF\xED\xBF\xBF",
- "\xED\xAF\xBF\xED\xBF\xBF", /* bug: not rejected */
+ NULL,
"\\uFFFD\\uFFFD",
},
/* 5.3 Other illegal code positions */
{
/* \U+FFFE */
"\xEF\xBF\xBE",
- "\xEF\xBF\xBE", /* bug: not rejected */
+ NULL,
"\\uFFFD",
},
{
/* \U+FFFF */
"\xEF\xBF\xBF",
- "\xEF\xBF\xBF", /* bug: not rejected */
+ NULL,
"\\uFFFD",
},
{
/* U+FDD0 */
"\xEF\xB7\x90",
- "\xEF\xB7\x90", /* bug: not rejected */
+ NULL,
"\\uFFFD",
},
{
/* U+FDEF */
"\xEF\xB7\xAF",
- "\xEF\xB7\xAF", /* bug: not rejected */
+ NULL,
"\\uFFFD",
},
/* Plane 1 .. 16 noncharacters */
"\xF3\xAF\xBF\xBE\xF3\xAF\xBF\xBF"
"\xF3\xBF\xBF\xBE\xF3\xBF\xBF\xBF"
"\xF4\x8F\xBF\xBE\xF4\x8F\xBF\xBF",
- /* bug: not rejected */
- "\xF0\x9F\xBF\xBE\xF0\x9F\xBF\xBF"
- "\xF0\xAF\xBF\xBE\xF0\xAF\xBF\xBF"
- "\xF0\xBF\xBF\xBE\xF0\xBF\xBF\xBF"
- "\xF1\x8F\xBF\xBE\xF1\x8F\xBF\xBF"
- "\xF1\x9F\xBF\xBE\xF1\x9F\xBF\xBF"
- "\xF1\xAF\xBF\xBE\xF1\xAF\xBF\xBF"
- "\xF1\xBF\xBF\xBE\xF1\xBF\xBF\xBF"
- "\xF2\x8F\xBF\xBE\xF2\x8F\xBF\xBF"
- "\xF2\x9F\xBF\xBE\xF2\x9F\xBF\xBF"
- "\xF2\xAF\xBF\xBE\xF2\xAF\xBF\xBF"
- "\xF2\xBF\xBF\xBE\xF2\xBF\xBF\xBF"
- "\xF3\x8F\xBF\xBE\xF3\x8F\xBF\xBF"
- "\xF3\x9F\xBF\xBE\xF3\x9F\xBF\xBF"
- "\xF3\xAF\xBF\xBE\xF3\xAF\xBF\xBF"
- "\xF3\xBF\xBF\xBE\xF3\xBF\xBF\xBF"
- "\xF4\x8F\xBF\xBE\xF4\x8F\xBF\xBF",
+ NULL,
"\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD"
"\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD"
"\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD"
}
in = strndup(tail, end - tail);
str = from_json_str(in, j, NULL);
- /*
- * FIXME JSON parser accepts invalid sequence
- * starting with \xC2..\xF4
- */
- if (*in >= '\xC2' && *in <= '\xF4') {
- g_free(str);
- str = NULL;
- }
g_assert(!str);
g_free(in);
}
#include "qemu/osdep.h"
#include "qemu/unicode.h"
+/*
+ * Tell whether @codepoint is acceptable as a Unicode character.
+ *
+ * Rejected are: values outside 0..0x10FFFF (negative ones included),
+ * the surrogate code points U+D800..U+DFFF, and the noncharacters
+ * U+FDD0..U+FDEF plus every code point whose low 16 bits are 0xFFFE
+ * or 0xFFFF.
+ */
+static bool is_valid_codepoint(int codepoint)
+{
+    bool out_of_range = codepoint < 0 || codepoint > 0x10FFFF;
+    bool surrogate = codepoint >= 0xD800 && codepoint <= 0xDFFF;
+    bool noncharacter = (codepoint >= 0xFDD0 && codepoint <= 0xFDEF)
+        || (codepoint & 0xFFFE) == 0xFFFE;
+
+    return !(out_of_range || surrogate || noncharacter);
+}
+
/**
* mod_utf8_codepoint:
* @s: string encoded in modified UTF-8
cp <<= 6;
cp |= byte & 0x3F;
}
- if (cp > 0x10FFFF) {
- cp = -1; /* beyond Unicode range */
- } else if ((cp >= 0xFDD0 && cp <= 0xFDEF)
- || (cp & 0xFFFE) == 0xFFFE) {
- cp = -1; /* noncharacter */
- } else if (cp >= 0xD800 && cp <= 0xDFFF) {
- cp = -1; /* surrogate code point */
+ if (!is_valid_codepoint(cp)) {
+ cp = -1;
} else if (cp < min_cp[len - 2] && !(cp == 0 && len == 2)) {
cp = -1; /* overlong, not \xC0\x80 */
}
*end = (char *)p;
return cp;
}
+
+/**
+ * mod_utf8_encode:
+ * @buf: output buffer; receives the encoded bytes and a NUL terminator
+ * @bufsz: capacity of @buf in bytes, must be 5 or more (asserted)
+ * @codepoint: the Unicode code point to convert
+ *
+ * Encode @codepoint as modified UTF-8 into @buf.  U+0000 is written
+ * as the two-byte sequence \xC0\x80 rather than a single zero byte.
+ * A terminating NUL byte is stored right after the sequence.
+ *
+ * Returns: the number of bytes in the encoded sequence (1 to 4, the
+ * terminator not counted), or -1 if @codepoint is not a valid code
+ * point.
+ */
+ssize_t mod_utf8_encode(char buf[], size_t bufsz, int codepoint)
+{
+    int len, lead, i;
+
+    assert(bufsz >= 5);
+
+    if (!is_valid_codepoint(codepoint)) {
+        return -1;
+    }
+
+    /* Plain ASCII except NUL: one byte, stored as is */
+    if (codepoint > 0 && codepoint <= 0x7F) {
+        buf[0] = codepoint;
+        buf[1] = 0;
+        return 1;
+    }
+
+    /* Pick sequence length and the marker bits of the lead byte */
+    if (codepoint <= 0x7FF) {
+        len = 2;
+        lead = 0xC0;
+    } else if (codepoint <= 0xFFFF) {
+        len = 3;
+        lead = 0xE0;
+    } else {
+        len = 4;
+        lead = 0xF0;
+    }
+
+    /* Emit continuation bytes last to first, six payload bits each */
+    buf[len] = 0;
+    for (i = len - 1; i > 0; i--) {
+        buf[i] = 0x80 | (codepoint & 0x3F);
+        codepoint >>= 6;
+    }
+    /* What remains of @codepoint fits the lead byte's payload bits */
+    buf[0] = lead | codepoint;
+    return len;
+}