[qemu.git] / qobject / json-lexer.c

/*
 * JSON lexer
 *
 * Copyright IBM, Corp. 2009
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
 * See the COPYING.LIB file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include "json-parser-int.h"

#define MAX_TOKEN_SIZE (64ULL << 20)

/*
 * From RFC 8259 "The JavaScript Object Notation (JSON) Data
 * Interchange Format", with [comments in brackets]:
 *
 * The set of tokens includes six structural characters, strings,
 * numbers, and three literal names.
 *
 * These are the six structural characters:
 *
 *    begin-array     = ws %x5B ws  ; [ left square bracket
 *    begin-object    = ws %x7B ws  ; { left curly bracket
 *    end-array       = ws %x5D ws  ; ] right square bracket
 *    end-object      = ws %x7D ws  ; } right curly bracket
 *    name-separator  = ws %x3A ws  ; : colon
 *    value-separator = ws %x2C ws  ; , comma
 *
 * Insignificant whitespace is allowed before or after any of the six
 * structural characters.
 * [This lexer accepts it before or after any token, which is actually
 * the same, as the grammar always has structural characters between
 * other tokens.]
 *
 *    ws = *(
 *           %x20 /              ; Space
 *           %x09 /              ; Horizontal tab
 *           %x0A /              ; Line feed or New line
 *           %x0D )              ; Carriage return
 *
 * [...] three literal names:
 *    false null true
 *  [This lexer accepts [a-z]+, and leaves rejecting unknown literal
 *  names to the parser.]
 *
 * [Numbers:]
 *
 *    number = [ minus ] int [ frac ] [ exp ]
 *    decimal-point = %x2E       ; .
 *    digit1-9 = %x31-39         ; 1-9
 *    e = %x65 / %x45            ; e E
 *    exp = e [ minus / plus ] 1*DIGIT
 *    frac = decimal-point 1*DIGIT
 *    int = zero / ( digit1-9 *DIGIT )
 *    minus = %x2D               ; -
 *    plus = %x2B                ; +
 *    zero = %x30                ; 0
 *
 * [Strings:]
 *    string = quotation-mark *char quotation-mark
 *
 *    char = unescaped /
 *        escape (
 *            %x22 /          ; "    quotation mark  U+0022
 *            %x5C /          ; \    reverse solidus U+005C
 *            %x2F /          ; /    solidus         U+002F
 *            %x62 /          ; b    backspace       U+0008
 *            %x66 /          ; f    form feed       U+000C
 *            %x6E /          ; n    line feed       U+000A
 *            %x72 /          ; r    carriage return U+000D
 *            %x74 /          ; t    tab             U+0009
 *            %x75 4HEXDIG )  ; uXXXX                U+XXXX
 *    escape = %x5C              ; \
 *    quotation-mark = %x22      ; "
 *    unescaped = %x20-21 / %x23-5B / %x5D-10FFFF
 *    [This lexer accepts any non-control character after escape, and
 *    leaves rejecting invalid ones to the parser.]
 *
 *
 * Extensions over RFC 8259:
 * - Extra escape sequence in strings:
 *   0x27 (apostrophe) is recognized after escape, too
 * - Single-quoted strings:
 *   Like double-quoted strings, except they're delimited by %x27
 *   (apostrophe) instead of %x22 (quotation mark), and can't contain
 *   unescaped apostrophe, but can contain unescaped quotation mark.
 * - Interpolation, if enabled:
 *   The lexer accepts %[A-Za-z0-9]*, and leaves rejecting invalid
 *   ones to the parser.
 *
 * Note:
 * - Input must be encoded in modified UTF-8.
 * - Decoding and validating is left to the parser.
 */

/*
 * States of the lexer's state machine.  Each value is a row index into
 * the json_lexer[] transition table.  The numbering is constrained by
 * the build-time checks that follow this enum; do not reorder casually.
 */
enum json_lexer_state {
    IN_RECOVERY = 1,            /* skipping input after a lexical error */
    IN_DQ_STRING_ESCAPE,        /* just after '\' in a double-quoted string */
    IN_DQ_STRING,               /* inside a double-quoted string */
    IN_SQ_STRING_ESCAPE,        /* just after '\' in a single-quoted string */
    IN_SQ_STRING,               /* inside a single-quoted string */
    IN_ZERO,                    /* after a leading '0' */
    IN_EXP_DIGITS,              /* in the digits of an exponent */
    IN_EXP_SIGN,                /* after the exponent's sign; digit required */
    IN_EXP_E,                   /* after 'e' / 'E'; sign or digit required */
    IN_MANTISSA,                /* after '.'; digit required */
    IN_MANTISSA_DIGITS,         /* in the digits after '.' */
    IN_DIGITS,                  /* in an integer part starting with 1-9 */
    IN_SIGN,                    /* after '-'; digit required */
    IN_KEYWORD,                 /* in a run of [a-z] */
    IN_INTERP,                  /* after '%', in a run of [A-Za-z0-9] */
    IN_WHITESPACE,              /* in a run of whitespace */
    IN_START,                   /* start state, interpolation disabled */
    IN_START_INTERP,            /* must be IN_START + 1 */
};

/*
 * Compile-time checks on how lexer states and token types share the
 * uint8_t cells of the transition table.
 */
/* Cells the table initializer leaves out are zero and must mean JSON_ERROR */
QEMU_BUILD_BUG_ON(JSON_ERROR != 0);
/* IN_RECOVERY must be numbered right after JSON_ERROR */
QEMU_BUILD_BUG_ON(IN_RECOVERY != JSON_ERROR + 1);
/*
 * JSON_MIN (presumably the lowest non-error token type; confirm in
 * json-parser-int.h) must lie above every lexer state, so a cell value
 * is unambiguously either a state or a token type.
 */
QEMU_BUILD_BUG_ON((int)JSON_MIN <= (int)IN_START_INTERP);
/* Token types must leave bit 0x80 free for the LOOKAHEAD flag */
QEMU_BUILD_BUG_ON(JSON_MAX >= 0x80);
/* The two start states must be adjacent */
QEMU_BUILD_BUG_ON(IN_START_INTERP != IN_START + 1);

/*
 * Flag bit in a transition table cell: the transition is taken without
 * consuming the input byte, which is then examined again from the new
 * state.
 */
#define LOOKAHEAD 0x80
/*
 * Row initializer: make every input byte end the current token as
 * @state without being consumed.  Designators that follow it in the
 * same row override it for the bytes that continue the token.
 */
#define TERMINAL(state) [0 ... 0xFF] = ((state) | LOOKAHEAD)

/*
 * Transition table of the lexer's state machine, indexed by
 * [current state][input byte].  A cell holds either the next lexer
 * state or the type of the token just completed, optionally or-ed with
 * LOOKAHEAD when the input byte must not be consumed.
 *
 * Within a row, later designators override earlier ones, so the order
 * of the initializers matters.
 */
static const uint8_t json_lexer[][256] =  {
    /*
     * Relies on default initialization to JSON_ERROR (zero): any byte
     * not listed in a row is a lexical error in that state.
     */

    /* error recovery */
    [IN_RECOVERY] = {
        /*
         * Skip characters until a structural character, an ASCII
         * control character other than '\t', or impossible UTF-8
         * bytes '\xFE', '\xFF'.  Structural characters and line
         * endings are promising resynchronization points.  Clients
         * may use the others to force the JSON parser into known-good
         * state; see docs/interop/qmp-spec.txt.
         */
        [0 ... 0x1F] = IN_START | LOOKAHEAD,
        [0x20 ... 0xFD] = IN_RECOVERY,
        [0xFE ... 0xFF] = IN_START | LOOKAHEAD,
        ['\t'] = IN_RECOVERY,
        ['['] = IN_START | LOOKAHEAD,
        [']'] = IN_START | LOOKAHEAD,
        ['{'] = IN_START | LOOKAHEAD,
        ['}'] = IN_START | LOOKAHEAD,
        [':'] = IN_START | LOOKAHEAD,
        [','] = IN_START | LOOKAHEAD,
    },

    /* double quote string */
    [IN_DQ_STRING_ESCAPE] = {
        /* any byte in 0x20..0xFD is accepted after the backslash */
        [0x20 ... 0xFD] = IN_DQ_STRING,
    },
    [IN_DQ_STRING] = {
        [0x20 ... 0xFD] = IN_DQ_STRING,
        ['\\'] = IN_DQ_STRING_ESCAPE,
        ['"'] = JSON_STRING,
    },

    /* single quote string */
    [IN_SQ_STRING_ESCAPE] = {
        /* any byte in 0x20..0xFD is accepted after the backslash */
        [0x20 ... 0xFD] = IN_SQ_STRING,
    },
    [IN_SQ_STRING] = {
        [0x20 ... 0xFD] = IN_SQ_STRING,
        ['\\'] = IN_SQ_STRING_ESCAPE,
        ['\''] = JSON_STRING,
    },

    /* Zero */
    [IN_ZERO] = {
        TERMINAL(JSON_INTEGER),
        /* a digit right after a leading zero is an error */
        ['0' ... '9'] = JSON_ERROR,
        ['.'] = IN_MANTISSA,
    },

    /* Float */
    [IN_EXP_DIGITS] = {
        TERMINAL(JSON_FLOAT),
        ['0' ... '9'] = IN_EXP_DIGITS,
    },

    /* no TERMINAL here: a digit must follow the sign */
    [IN_EXP_SIGN] = {
        ['0' ... '9'] = IN_EXP_DIGITS,
    },

    /* no TERMINAL here: a sign or a digit must follow 'e' / 'E' */
    [IN_EXP_E] = {
        ['-'] = IN_EXP_SIGN,
        ['+'] = IN_EXP_SIGN,
        ['0' ... '9'] = IN_EXP_DIGITS,
    },

    [IN_MANTISSA_DIGITS] = {
        TERMINAL(JSON_FLOAT),
        ['0' ... '9'] = IN_MANTISSA_DIGITS,
        ['e'] = IN_EXP_E,
        ['E'] = IN_EXP_E,
    },

    /* no TERMINAL here: a digit must follow '.' */
    [IN_MANTISSA] = {
        ['0' ... '9'] = IN_MANTISSA_DIGITS,
    },

    /* Number */
    [IN_DIGITS] = {
        TERMINAL(JSON_INTEGER),
        ['0' ... '9'] = IN_DIGITS,
        ['e'] = IN_EXP_E,
        ['E'] = IN_EXP_E,
        ['.'] = IN_MANTISSA,
    },

    /* no TERMINAL here: a digit must follow '-' */
    [IN_SIGN] = {
        ['0'] = IN_ZERO,
        ['1' ... '9'] = IN_DIGITS,
    },

    /* keywords */
    [IN_KEYWORD] = {
        TERMINAL(JSON_KEYWORD),
        ['a' ... 'z'] = IN_KEYWORD,
    },

    /* whitespace */
    [IN_WHITESPACE] = {
        TERMINAL(JSON_SKIP),
        [' '] = IN_WHITESPACE,
        ['\t'] = IN_WHITESPACE,
        ['\r'] = IN_WHITESPACE,
        ['\n'] = IN_WHITESPACE,
    },

    /* interpolation */
    [IN_INTERP] = {
        TERMINAL(JSON_INTERP),
        ['A' ... 'Z'] = IN_INTERP,
        ['a' ... 'z'] = IN_INTERP,
        ['0' ... '9'] = IN_INTERP,
    },

    /*
     * Two start states:
     * - IN_START recognizes JSON tokens with our string extensions
     * - IN_START_INTERP additionally recognizes interpolation.
     */
    [IN_START ... IN_START_INTERP] = {
        ['"'] = IN_DQ_STRING,
        ['\''] = IN_SQ_STRING,
        ['0'] = IN_ZERO,
        ['1' ... '9'] = IN_DIGITS,
        ['-'] = IN_SIGN,
        ['{'] = JSON_LCURLY,
        ['}'] = JSON_RCURLY,
        ['['] = JSON_LSQUARE,
        [']'] = JSON_RSQUARE,
        [','] = JSON_COMMA,
        [':'] = JSON_COLON,
        ['a' ... 'z'] = IN_KEYWORD,
        [' '] = IN_WHITESPACE,
        ['\t'] = IN_WHITESPACE,
        ['\r'] = IN_WHITESPACE,
        ['\n'] = IN_WHITESPACE,
    },
    /* the only difference between the two start states */
    [IN_START_INTERP]['%'] = IN_INTERP,
};

/*
 * Look up the transition for input byte @ch from @lexer's current state.
 *
 * Returns the table cell with the LOOKAHEAD flag stripped: either the
 * next lexer state or the type of the token just completed.
 *
 * *@char_consumed is set to true when the transition consumes @ch,
 * i.e. when @flush is false and the cell does not carry LOOKAHEAD.
 * While flushing, no byte is ever consumed.
 */
static inline uint8_t next_state(JSONLexer *lexer, char ch, bool flush,
                                 bool *char_consumed)
{
    uint8_t next;

    /*
     * Valid row indexes are 0 .. ARRAY_SIZE(json_lexer) - 1; an index
     * equal to the array size would already read past the table.
     */
    assert(lexer->state < ARRAY_SIZE(json_lexer));
    /* Cast via uint8_t so bytes >= 0x80 don't index negatively */
    next = json_lexer[lexer->state][(uint8_t)ch];
    *char_consumed = !flush && !(next & LOOKAHEAD);
    return next & ~LOOKAHEAD;
}

/*
 * Prepare @lexer for use.
 *
 * @enable_interpolation selects the start state: IN_START_INTERP when
 * true, IN_START otherwise.  The lexer begins in that state, with an
 * empty token buffer and its position counters at zero.
 */
void json_lexer_init(JSONLexer *lexer, bool enable_interpolation)
{
    int initial_state = IN_START;

    if (enable_interpolation) {
        initial_state = IN_START_INTERP;
    }

    lexer->state = initial_state;
    lexer->start_state = initial_state;

    /* Token text accumulates here; 3 is the initial size request */
    lexer->token = g_string_sized_new(3);

    lexer->y = 0;
    lexer->x = 0;
}

/*
 * Advance @lexer's state machine by one step of input.
 *
 * When @flush is false, @ch is the next input byte.  The loop runs
 * until @ch has been consumed: a LOOKAHEAD transition completes the
 * current token without consuming @ch, so @ch is then examined again
 * from the resulting state.
 *
 * When @flush is true, nothing is consumed (next_state() never reports
 * a consumed byte while flushing); the loop instead runs until the
 * state machine is back at its start state, pushing out whatever token
 * is pending.
 *
 * Completed tokens and errors are passed to
 * json_message_process_token() together with the current position
 * counters.
 */
static void json_lexer_feed_char(JSONLexer *lexer, char ch, bool flush)
{
    int new_state;
    bool char_consumed = false;

    /*
     * Position tracking: x counts calls since the last '\n', y counts
     * '\n' bytes seen (presumably column and line for error reporting).
     */
    lexer->x++;
    if (ch == '\n') {
        lexer->x = 0;
        lexer->y++;
    }

    while (flush ? lexer->state != lexer->start_state : !char_consumed) {
        new_state = next_state(lexer, ch, flush, &char_consumed);
        if (char_consumed) {
            assert(!flush);
            /* consumed bytes become part of the current token's text */
            g_string_append_c(lexer->token, ch);
        }

        switch (new_state) {
        /* A token is complete: emit it, then start over */
        case JSON_LCURLY:
        case JSON_RCURLY:
        case JSON_LSQUARE:
        case JSON_RSQUARE:
        case JSON_COLON:
        case JSON_COMMA:
        case JSON_INTERP:
        case JSON_INTEGER:
        case JSON_FLOAT:
        case JSON_KEYWORD:
        case JSON_STRING:
            json_message_process_token(lexer, lexer->token, new_state,
                                       lexer->x, lexer->y);
            /* fall through */
        case JSON_SKIP:
            /* whitespace (or an emitted token): discard the text */
            g_string_truncate(lexer->token, 0);
            /* fall through */
        case IN_START:
            /*
             * The table only ever names IN_START; map it to the start
             * state this lexer was initialized with.
             */
            new_state = lexer->start_state;
            break;
        case JSON_ERROR:
            /* report the error, then skip input in recovery mode */
            json_message_process_token(lexer, lexer->token, JSON_ERROR,
                                       lexer->x, lexer->y);
            new_state = IN_RECOVERY;
            /* fall through */
        case IN_RECOVERY:
            /* text skipped during recovery is not kept */
            g_string_truncate(lexer->token, 0);
            break;
        default:
            /* intermediate state: keep accumulating the token */
            break;
        }
        lexer->state = new_state;
    }

    /* Do not let a single token grow to an arbitrarily large size,
     * this is a security consideration.
     *
     * NOTE(review): the partial token is handed over with the current
     * lexer state as its type, which is not a token type; presumably
     * the receiver rejects it on size grounds -- confirm.
     */
    if (lexer->token->len > MAX_TOKEN_SIZE) {
        json_message_process_token(lexer, lexer->token, lexer->state,
                                   lexer->x, lexer->y);
        g_string_truncate(lexer->token, 0);
        lexer->state = lexer->start_state;
    }
}

/*
 * Feed @size bytes starting at @buffer to @lexer, one byte at a time
 * and in order.  Nothing is read from @buffer when @size is zero.
 */
void json_lexer_feed(JSONLexer *lexer, const char *buffer, size_t size)
{
    const char *cursor = buffer;
    size_t remaining = size;

    while (remaining > 0) {
        json_lexer_feed_char(lexer, *cursor, false);
        cursor++;
        remaining--;
    }
}

/*
 * Signal the end of input to @lexer.
 *
 * Any pending token is pushed out first, then a JSON_END_OF_INPUT
 * token is passed to json_message_process_token().
 */
void json_lexer_flush(JSONLexer *lexer)
{
    /* The byte passed here is never consumed: flushing consumes nothing */
    json_lexer_feed_char(lexer, '\0', true);

    /* Flushing only returns once the machine is back at its start state */
    assert(lexer->start_state == lexer->state);

    json_message_process_token(lexer, lexer->token, JSON_END_OF_INPUT,
                               lexer->x, lexer->y);
}

/*
 * Release the resources held by @lexer.
 *
 * Frees the token buffer together with its character data.  The
 * pointer is cleared afterwards so that a repeated destroy or a stray
 * later use does not touch freed memory.  @lexer itself is not freed.
 */
void json_lexer_destroy(JSONLexer *lexer)
{
    g_string_free(lexer->token, true);
    lexer->token = NULL;
}
Commit	Line	Data
5ab8558d AL	1	/*
	2	* JSON lexer
	3	*
	4	* Copyright IBM, Corp. 2009
	5	*
	6	* Authors:
	7	* Anthony Liguori <[email protected]>
	8	*
	9	* This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
	10	* See the COPYING.LIB file in the top-level directory.
	11	*
	12	*/
	13
f2ad72b3	14	#include "qemu/osdep.h"
86cdf9ec	15	#include "json-parser-int.h"
5ab8558d	16
325601b4 AL	17	#define MAX_TOKEN_SIZE (64ULL << 20)
325601b4 AL	18
5ab8558d	19	/*
eddc0a7f MA	20	* From RFC 8259 "The JavaScript Object Notation (JSON) Data
eddc0a7f MA	21	* Interchange Format", with [comments in brackets]:
ff5394ad	22	*
eddc0a7f MA	23	* The set of tokens includes six structural characters, strings,
eddc0a7f MA	24	* numbers, and three literal names.
ff5394ad	25	*
eddc0a7f	26	* These are the six structural characters:
ff5394ad	27	*
eddc0a7f MA	28	* begin-array = ws %x5B ws ; [ left square bracket
	29	* begin-object = ws %x7B ws ; { left curly bracket
	30	* end-array = ws %x5D ws ; ] right square bracket
	31	* end-object = ws %x7D ws ; } right curly bracket
	32	* name-separator = ws %x3A ws ; : colon
	33	* value-separator = ws %x2C ws ; , comma
ff5394ad	34	*
eddc0a7f MA	35	* Insignificant whitespace is allowed before or after any of the six
	36	* structural characters.
	37	* [This lexer accepts it before or after any token, which is actually
	38	* the same, as the grammar always has structural characters between
	39	* other tokens.]
ff5394ad	40	*
eddc0a7f MA	41	* ws = *(
	42	* %x20 / ; Space
	43	* %x09 / ; Horizontal tab
	44	* %x0A / ; Line feed or New line
	45	* %x0D ) ; Carriage return
5ab8558d	46	*
eddc0a7f MA	47	* [...] three literal names:
	48	* false null true
	49	* [This lexer accepts [a-z]+, and leaves rejecting unknown literal
	50	* names to the parser.]
	51	*
	52	* [Numbers:]
	53	*
	54	* number = [ minus ] int [ frac ] [ exp ]
	55	* decimal-point = %x2E ; .
	56	* digit1-9 = %x31-39 ; 1-9
	57	* e = %x65 / %x45 ; e E
	58	* exp = e [ minus / plus ] 1*DIGIT
	59	* frac = decimal-point 1*DIGIT
	60	* int = zero / ( digit1-9 *DIGIT )
	61	* minus = %x2D ; -
	62	* plus = %x2B ; +
	63	* zero = %x30 ; 0
	64	*
	65	* [Strings:]
	66	* string = quotation-mark *char quotation-mark
	67	*
	68	* char = unescaped /
	69	* escape (
	70	* %x22 / ; " quotation mark U+0022
	71	* %x5C / ; \ reverse solidus U+005C
	72	* %x2F / ; / solidus U+002F
	73	* %x62 / ; b backspace U+0008
	74	* %x66 / ; f form feed U+000C
	75	* %x6E / ; n line feed U+000A
	76	* %x72 / ; r carriage return U+000D
	77	* %x74 / ; t tab U+0009
	78	* %x75 4HEXDIG ) ; uXXXX U+XXXX
	79	* escape = %x5C ; \
	80	* quotation-mark = %x22 ; "
	81	* unescaped = %x20-21 / %x23-5B / %x5D-10FFFF
b2da4a4d MA	82	* [This lexer accepts any non-control character after escape, and
b2da4a4d MA	83	* leaves rejecting invalid ones to the parser.]
eddc0a7f MA	84	*
	85	*
	86	* Extensions over RFC 8259:
	87	* - Extra escape sequence in strings:
	88	* 0x27 (apostrophe) is recognized after escape, too
	89	* - Single-quoted strings:
	90	* Like double-quoted strings, except they're delimited by %x27
	91	* (apostrophe) instead of %x22 (quotation mark), and can't contain
	92	* unescaped apostrophe, but can contain unescaped quotation mark.
2cbd15aa	93	* - Interpolation, if enabled:
f7617d45 MA	94	* The lexer accepts %[A-Za-z0-9]*, and leaves rejecting invalid
f7617d45 MA	95	* ones to the parser.
eddc0a7f MA	96	*
eddc0a7f MA	97	* Note:
4b1c0cd7	98	* - Input must be encoded in modified UTF-8.
eddc0a7f	99	* - Decoding and validating is left to the parser.
5ab8558d AL	100	*/
	101
	102	enum json_lexer_state {
2ce4ee64	103	IN_RECOVERY = 1,
5ab8558d AL	104	IN_DQ_STRING_ESCAPE,
5ab8558d AL	105	IN_DQ_STRING,
5ab8558d AL	106	IN_SQ_STRING_ESCAPE,
	107	IN_SQ_STRING,
	108	IN_ZERO,
4d400661 MA	109	IN_EXP_DIGITS,
4d400661 MA	110	IN_EXP_SIGN,
5ab8558d AL	111	IN_EXP_E,
	112	IN_MANTISSA,
	113	IN_MANTISSA_DIGITS,
4d400661 MA	114	IN_DIGITS,
4d400661 MA	115	IN_SIGN,
5ab8558d	116	IN_KEYWORD,
61030280	117	IN_INTERP,
5ab8558d	118	IN_WHITESPACE,
5ab8558d	119	IN_START,
2cbd15aa	120	IN_START_INTERP, /* must be IN_START + 1 */
5ab8558d AL	121	};
5ab8558d AL	122
2ce4ee64 MA	123	QEMU_BUILD_BUG_ON(JSON_ERROR != 0);
2ce4ee64 MA	124	QEMU_BUILD_BUG_ON(IN_RECOVERY != JSON_ERROR + 1);
2cbd15aa	125	QEMU_BUILD_BUG_ON((int)JSON_MIN <= (int)IN_START_INTERP);
c0ee3afa	126	QEMU_BUILD_BUG_ON(JSON_MAX >= 0x80);
2cbd15aa	127	QEMU_BUILD_BUG_ON(IN_START_INTERP != IN_START + 1);
b8d3b1da	128
c0ee3afa MA	129	#define LOOKAHEAD 0x80
c0ee3afa MA	130	#define TERMINAL(state) [0 ... 0xFF] = ((state) \| LOOKAHEAD)
f7c05274	131
5ab8558d	132	static const uint8_t json_lexer[][256] = {
b8d3b1da MA	133	/* Relies on default initialization to IN_ERROR! */
b8d3b1da MA	134
0f07a5d5 MA	135	/* error recovery */
	136	[IN_RECOVERY] = {
	137	/*
	138	* Skip characters until a structural character, an ASCII
	139	* control character other than '\t', or impossible UTF-8
	140	* bytes '\xFE', '\xFF'. Structural characters and line
	141	* endings are promising resynchronization points. Clients
	142	* may use the others to force the JSON parser into known-good
	143	* state; see docs/interop/qmp-spec.txt.
	144	*/
	145	[0 ... 0x1F] = IN_START \| LOOKAHEAD,
	146	[0x20 ... 0xFD] = IN_RECOVERY,
	147	[0xFE ... 0xFF] = IN_START \| LOOKAHEAD,
	148	['\t'] = IN_RECOVERY,
	149	['['] = IN_START \| LOOKAHEAD,
	150	[']'] = IN_START \| LOOKAHEAD,
	151	['{'] = IN_START \| LOOKAHEAD,
	152	['}'] = IN_START \| LOOKAHEAD,
	153	[':'] = IN_START \| LOOKAHEAD,
	154	[','] = IN_START \| LOOKAHEAD,
	155	},
	156
5ab8558d	157	/* double quote string */
5ab8558d	158	[IN_DQ_STRING_ESCAPE] = {
b2da4a4d	159	[0x20 ... 0xFD] = IN_DQ_STRING,
5ab8558d AL	160	},
5ab8558d AL	161	[IN_DQ_STRING] = {
de930f45	162	[0x20 ... 0xFD] = IN_DQ_STRING,
5ab8558d	163	['\\'] = IN_DQ_STRING_ESCAPE,
28e91a68	164	['"'] = JSON_STRING,
5ab8558d AL	165	},
	166
	167	/* single quote string */
5ab8558d	168	[IN_SQ_STRING_ESCAPE] = {
b2da4a4d	169	[0x20 ... 0xFD] = IN_SQ_STRING,
5ab8558d AL	170	},
5ab8558d AL	171	[IN_SQ_STRING] = {
de930f45	172	[0x20 ... 0xFD] = IN_SQ_STRING,
5ab8558d	173	['\\'] = IN_SQ_STRING_ESCAPE,
28e91a68	174	['\''] = JSON_STRING,
5ab8558d AL	175	},
	176
	177	/* Zero */
	178	[IN_ZERO] = {
	179	TERMINAL(JSON_INTEGER),
2ce4ee64	180	['0' ... '9'] = JSON_ERROR,
5ab8558d AL	181	['.'] = IN_MANTISSA,
	182	},
	183
	184	/* Float */
4d400661	185	[IN_EXP_DIGITS] = {
5ab8558d	186	TERMINAL(JSON_FLOAT),
4d400661	187	['0' ... '9'] = IN_EXP_DIGITS,
5ab8558d AL	188	},
5ab8558d AL	189
4d400661 MA	190	[IN_EXP_SIGN] = {
4d400661 MA	191	['0' ... '9'] = IN_EXP_DIGITS,
5ab8558d AL	192	},
	193
	194	[IN_EXP_E] = {
4d400661 MA	195	['-'] = IN_EXP_SIGN,
	196	['+'] = IN_EXP_SIGN,
	197	['0' ... '9'] = IN_EXP_DIGITS,
5ab8558d AL	198	},
	199
	200	[IN_MANTISSA_DIGITS] = {
	201	TERMINAL(JSON_FLOAT),
	202	['0' ... '9'] = IN_MANTISSA_DIGITS,
	203	['e'] = IN_EXP_E,
	204	['E'] = IN_EXP_E,
	205	},
	206
	207	[IN_MANTISSA] = {
	208	['0' ... '9'] = IN_MANTISSA_DIGITS,
	209	},
	210
	211	/* Number */
4d400661	212	[IN_DIGITS] = {
5ab8558d	213	TERMINAL(JSON_INTEGER),
4d400661	214	['0' ... '9'] = IN_DIGITS,
5ab8558d AL	215	['e'] = IN_EXP_E,
	216	['E'] = IN_EXP_E,
	217	['.'] = IN_MANTISSA,
	218	},
	219
4d400661	220	[IN_SIGN] = {
5ab8558d	221	['0'] = IN_ZERO,
4d400661	222	['1' ... '9'] = IN_DIGITS,
5ab8558d AL	223	},
	224
	225	/* keywords */
	226	[IN_KEYWORD] = {
	227	TERMINAL(JSON_KEYWORD),
	228	['a' ... 'z'] = IN_KEYWORD,
	229	},
	230
	231	/* whitespace */
	232	[IN_WHITESPACE] = {
	233	TERMINAL(JSON_SKIP),
	234	[' '] = IN_WHITESPACE,
	235	['\t'] = IN_WHITESPACE,
	236	['\r'] = IN_WHITESPACE,
	237	['\n'] = IN_WHITESPACE,
ff5394ad	238	},
5ab8558d	239
61030280	240	/* interpolation */
61030280	241	[IN_INTERP] = {
f7617d45 MA	242	TERMINAL(JSON_INTERP),
	243	['A' ... 'Z'] = IN_INTERP,
	244	['a' ... 'z'] = IN_INTERP,
	245	['0' ... '9'] = IN_INTERP,
5ab8558d AL	246	},
5ab8558d AL	247
2cbd15aa MA	248	/*
	249	* Two start states:
	250	* - IN_START recognizes JSON tokens with our string extensions
	251	* - IN_START_INTERP additionally recognizes interpolation.
	252	*/
	253	[IN_START ... IN_START_INTERP] = {
5ab8558d AL	254	['"'] = IN_DQ_STRING,
	255	['\''] = IN_SQ_STRING,
	256	['0'] = IN_ZERO,
4d400661 MA	257	['1' ... '9'] = IN_DIGITS,
4d400661 MA	258	['-'] = IN_SIGN,
c5461660 MA	259	['{'] = JSON_LCURLY,
	260	['}'] = JSON_RCURLY,
	261	['['] = JSON_LSQUARE,
	262	[']'] = JSON_RSQUARE,
	263	[','] = JSON_COMMA,
	264	[':'] = JSON_COLON,
5ab8558d	265	['a' ... 'z'] = IN_KEYWORD,
5ab8558d AL	266	[' '] = IN_WHITESPACE,
	267	['\t'] = IN_WHITESPACE,
	268	['\r'] = IN_WHITESPACE,
	269	['\n'] = IN_WHITESPACE,
	270	},
2cbd15aa	271	[IN_START_INTERP]['%'] = IN_INTERP,
5ab8558d AL	272	};
5ab8558d AL	273
c0ee3afa MA	274	static inline uint8_t next_state(JSONLexer *lexer, char ch, bool flush,
	275	bool *char_consumed)
	276	{
	277	uint8_t next;
	278
	279	assert(lexer->state <= ARRAY_SIZE(json_lexer));
	280	next = json_lexer[lexer->state][(uint8_t)ch];
	281	*char_consumed = !flush && !(next & LOOKAHEAD);
	282	return next & ~LOOKAHEAD;
	283	}
	284
2cbd15aa	285	void json_lexer_init(JSONLexer *lexer, bool enable_interpolation)
5ab8558d	286	{
2cbd15aa MA	287	lexer->start_state = lexer->state = enable_interpolation
2cbd15aa MA	288	? IN_START_INTERP : IN_START;
d2ca7c0b	289	lexer->token = g_string_sized_new(3);
03308f6c	290	lexer->x = lexer->y = 0;
5ab8558d AL	291	}
5ab8558d AL	292
7c1e1d54	293	static void json_lexer_feed_char(JSONLexer *lexer, char ch, bool flush)
5ab8558d	294	{
852dfa76 MA	295	int new_state;
852dfa76 MA	296	bool char_consumed = false;
f7c05274	297
5ab8558d AL	298	lexer->x++;
	299	if (ch == '\n') {
	300	lexer->x = 0;
	301	lexer->y++;
	302	}
	303
852dfa76	304	while (flush ? lexer->state != lexer->start_state : !char_consumed) {
c0ee3afa	305	new_state = next_state(lexer, ch, flush, &char_consumed);
852dfa76	306	if (char_consumed) {
c0ee3afa	307	assert(!flush);
d2ca7c0b	308	g_string_append_c(lexer->token, ch);
f7c05274	309	}
5ab8558d	310
f7c05274	311	switch (new_state) {
c5461660 MA	312	case JSON_LCURLY:
	313	case JSON_RCURLY:
	314	case JSON_LSQUARE:
	315	case JSON_RSQUARE:
	316	case JSON_COLON:
	317	case JSON_COMMA:
61030280	318	case JSON_INTERP:
f7c05274 PB	319	case JSON_INTEGER:
	320	case JSON_FLOAT:
	321	case JSON_KEYWORD:
	322	case JSON_STRING:
037f2440 MA	323	json_message_process_token(lexer, lexer->token, new_state,
037f2440 MA	324	lexer->x, lexer->y);
0b0404bf	325	/* fall through */
f7c05274	326	case JSON_SKIP:
d2ca7c0b	327	g_string_truncate(lexer->token, 0);
0f07a5d5 MA	328	/* fall through */
0f07a5d5 MA	329	case IN_START:
2cbd15aa	330	new_state = lexer->start_state;
f7c05274	331	break;
2ce4ee64	332	case JSON_ERROR:
037f2440 MA	333	json_message_process_token(lexer, lexer->token, JSON_ERROR,
037f2440 MA	334	lexer->x, lexer->y);
0f07a5d5 MA	335	new_state = IN_RECOVERY;
	336	/* fall through */
	337	case IN_RECOVERY:
d2ca7c0b	338	g_string_truncate(lexer->token, 0);
0f07a5d5	339	break;
f7c05274 PB	340	default:
	341	break;
	342	}
	343	lexer->state = new_state;
852dfa76	344	}
325601b4 AL	345
	346	/* Do not let a single token grow to an arbitrarily large size,
	347	* this is a security consideration.
	348	*/
d2ca7c0b	349	if (lexer->token->len > MAX_TOKEN_SIZE) {
037f2440 MA	350	json_message_process_token(lexer, lexer->token, lexer->state,
037f2440 MA	351	lexer->x, lexer->y);
d2ca7c0b	352	g_string_truncate(lexer->token, 0);
2cbd15aa	353	lexer->state = lexer->start_state;
325601b4	354	}
5ab8558d AL	355	}
5ab8558d AL	356
7c1e1d54	357	void json_lexer_feed(JSONLexer lexer, const char buffer, size_t size)
5ab8558d AL	358	{
	359	size_t i;
	360
	361	for (i = 0; i < size; i++) {
7c1e1d54	362	json_lexer_feed_char(lexer, buffer[i], false);
5ab8558d	363	}
5ab8558d AL	364	}
5ab8558d AL	365
7c1e1d54	366	void json_lexer_flush(JSONLexer *lexer)
5ab8558d	367	{
852dfa76 MA	368	json_lexer_feed_char(lexer, 0, true);
852dfa76 MA	369	assert(lexer->state == lexer->start_state);
f9277915 MA	370	json_message_process_token(lexer, lexer->token, JSON_END_OF_INPUT,
f9277915 MA	371	lexer->x, lexer->y);
5ab8558d AL	372	}
	373
	374	void json_lexer_destroy(JSONLexer *lexer)
	375	{
d2ca7c0b	376	g_string_free(lexer->token, true);
5ab8558d	377	}