* [PATCH 1/5] json-parser: pass around lookahead token, constify
2026-01-07 8:48 [PATCH 0/5] qobject: switch JSON parser to push Paolo Bonzini
@ 2026-01-07 8:48 ` Paolo Bonzini
2026-02-06 10:45 ` Markus Armbruster
2026-01-07 8:48 ` [PATCH 2/5] json-parser: replace with a push parser Paolo Bonzini
` (5 subsequent siblings)
6 siblings, 1 reply; 26+ messages in thread
From: Paolo Bonzini @ 2026-01-07 8:48 UTC (permalink / raw)
To: qemu-devel; +Cc: armbru
Pass the lookahead token down to the various functions implementing the
recursive descent, instead of first peeking and then getting the token
again multiple times.
The main purpose of this patch is to switch the argument passing style
for parse_* functions to something more desirable for a push parser,
which gets one and exactly one token at a time. However, there are
some minor improvement in code size and a bugfix as well.
In particular, because parse_array() and parse_object() can assume
that the opening bracket/brace is not anymore in the token stream, it
is now apparent that the first entry of an array incorrectly used the
'[' character (stored in "token") as its location. parse_pair, for
comparison, handled it correctly:
if (!key) {
parse_error(ctxt, peek, "key is not a string in object");
goto out;
}
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
qobject/json-parser.c | 125 +++++++++++++++++++-----------------------
1 file changed, 55 insertions(+), 70 deletions(-)
diff --git a/qobject/json-parser.c b/qobject/json-parser.c
index 7483e582fea..eabc9f8358c 100644
--- a/qobject/json-parser.c
+++ b/qobject/json-parser.c
@@ -49,13 +49,13 @@ typedef struct JSONParserContext {
* 4) deal with premature EOI
*/
-static QObject *parse_value(JSONParserContext *ctxt);
+static QObject *parse_value(JSONParserContext *ctxt, const JSONToken *token);
/**
* Error handler
*/
static void G_GNUC_PRINTF(3, 4) parse_error(JSONParserContext *ctxt,
- JSONToken *token, const char *msg, ...)
+ const JSONToken *token, const char *msg, ...)
{
va_list ap;
char message[1024];
@@ -126,7 +126,7 @@ static int cvt4hex(const char *s)
* - Invalid Unicode characters are rejected.
* - Control characters \x00..\x1F are rejected by the lexer.
*/
-static QString *parse_string(JSONParserContext *ctxt, JSONToken *token)
+static QString *parse_string(JSONParserContext *ctxt, const JSONToken *token)
{
const char *ptr = token->str;
GString *str;
@@ -235,42 +235,29 @@ out:
return NULL;
}
-/* Note: the token object returned by parser_context_peek_token or
- * parser_context_pop_token is deleted as soon as parser_context_pop_token
- * is called again.
+/* Note: the token object returned by parser_context_pop_token is
+ * deleted as soon as parser_context_pop_token is called again.
*/
-static JSONToken *parser_context_pop_token(JSONParserContext *ctxt)
+static const JSONToken *parser_context_pop_token(JSONParserContext *ctxt)
{
g_free(ctxt->current);
ctxt->current = g_queue_pop_head(ctxt->buf);
return ctxt->current;
}
-static JSONToken *parser_context_peek_token(JSONParserContext *ctxt)
-{
- return g_queue_peek_head(ctxt->buf);
-}
-
/**
* Parsing rules
*/
-static int parse_pair(JSONParserContext *ctxt, QDict *dict)
+static int parse_pair(JSONParserContext *ctxt, const JSONToken *token, QDict *dict)
{
QObject *key_obj = NULL;
QString *key;
QObject *value;
- JSONToken *peek, *token;
- peek = parser_context_peek_token(ctxt);
- if (peek == NULL) {
- parse_error(ctxt, NULL, "premature EOI");
- goto out;
- }
-
- key_obj = parse_value(ctxt);
+ key_obj = parse_value(ctxt, token);
key = qobject_to(QString, key_obj);
if (!key) {
- parse_error(ctxt, peek, "key is not a string in object");
+ parse_error(ctxt, token, "key is not a string in object");
goto out;
}
@@ -285,7 +272,13 @@ static int parse_pair(JSONParserContext *ctxt, QDict *dict)
goto out;
}
- value = parse_value(ctxt);
+ token = parser_context_pop_token(ctxt);
+ if (token == NULL) {
+ parse_error(ctxt, NULL, "premature EOI");
+ goto out;
+ }
+
+ value = parse_value(ctxt, token);
if (value == NULL) {
parse_error(ctxt, token, "Missing value in dict");
goto out;
@@ -309,21 +302,18 @@ out:
static QObject *parse_object(JSONParserContext *ctxt)
{
QDict *dict = NULL;
- JSONToken *token, *peek;
-
- token = parser_context_pop_token(ctxt);
- assert(token && token->type == JSON_LCURLY);
+ const JSONToken *token;
dict = qdict_new();
- peek = parser_context_peek_token(ctxt);
- if (peek == NULL) {
+ token = parser_context_pop_token(ctxt);
+ if (token == NULL) {
parse_error(ctxt, NULL, "premature EOI");
goto out;
}
- if (peek->type != JSON_RCURLY) {
- if (parse_pair(ctxt, dict) == -1) {
+ if (token->type != JSON_RCURLY) {
+ if (parse_pair(ctxt, token, dict) == -1) {
goto out;
}
@@ -339,7 +329,13 @@ static QObject *parse_object(JSONParserContext *ctxt)
goto out;
}
- if (parse_pair(ctxt, dict) == -1) {
+ token = parser_context_pop_token(ctxt);
+ if (token == NULL) {
+ parse_error(ctxt, NULL, "premature EOI");
+ goto out;
+ }
+
+ if (parse_pair(ctxt, token, dict) == -1) {
goto out;
}
@@ -349,8 +345,6 @@ static QObject *parse_object(JSONParserContext *ctxt)
goto out;
}
}
- } else {
- (void)parser_context_pop_token(ctxt);
}
return QOBJECT(dict);
@@ -363,23 +357,20 @@ out:
static QObject *parse_array(JSONParserContext *ctxt)
{
QList *list = NULL;
- JSONToken *token, *peek;
-
- token = parser_context_pop_token(ctxt);
- assert(token && token->type == JSON_LSQUARE);
+ const JSONToken *token;
list = qlist_new();
- peek = parser_context_peek_token(ctxt);
- if (peek == NULL) {
+ token = parser_context_pop_token(ctxt);
+ if (token == NULL) {
parse_error(ctxt, NULL, "premature EOI");
goto out;
}
- if (peek->type != JSON_RSQUARE) {
+ if (token->type != JSON_RSQUARE) {
QObject *obj;
- obj = parse_value(ctxt);
+ obj = parse_value(ctxt, token);
if (obj == NULL) {
parse_error(ctxt, token, "expecting value");
goto out;
@@ -399,7 +390,13 @@ static QObject *parse_array(JSONParserContext *ctxt)
goto out;
}
- obj = parse_value(ctxt);
+ token = parser_context_pop_token(ctxt);
+ if (token == NULL) {
+ parse_error(ctxt, NULL, "premature EOI");
+ goto out;
+ }
+
+ obj = parse_value(ctxt, token);
if (obj == NULL) {
parse_error(ctxt, token, "expecting value");
goto out;
@@ -413,8 +410,6 @@ static QObject *parse_array(JSONParserContext *ctxt)
goto out;
}
}
- } else {
- (void)parser_context_pop_token(ctxt);
}
return QOBJECT(list);
@@ -424,11 +419,8 @@ out:
return NULL;
}
-static QObject *parse_keyword(JSONParserContext *ctxt)
+static QObject *parse_keyword(JSONParserContext *ctxt, const JSONToken *token)
{
- JSONToken *token;
-
- token = parser_context_pop_token(ctxt);
assert(token && token->type == JSON_KEYWORD);
if (!strcmp(token->str, "true")) {
@@ -442,11 +434,8 @@ static QObject *parse_keyword(JSONParserContext *ctxt)
return NULL;
}
-static QObject *parse_interpolation(JSONParserContext *ctxt)
+static QObject *parse_interpolation(JSONParserContext *ctxt, const JSONToken *token)
{
- JSONToken *token;
-
- token = parser_context_pop_token(ctxt);
assert(token && token->type == JSON_INTERP);
if (!strcmp(token->str, "%p")) {
@@ -478,11 +467,8 @@ static QObject *parse_interpolation(JSONParserContext *ctxt)
return NULL;
}
-static QObject *parse_literal(JSONParserContext *ctxt)
+static QObject *parse_literal(JSONParserContext *ctxt, const JSONToken *token)
{
- JSONToken *token;
-
- token = parser_context_pop_token(ctxt);
assert(token);
switch (token->type) {
@@ -530,29 +516,21 @@ static QObject *parse_literal(JSONParserContext *ctxt)
}
}
-static QObject *parse_value(JSONParserContext *ctxt)
+static QObject *parse_value(JSONParserContext *ctxt, const JSONToken *token)
{
- JSONToken *token;
-
- token = parser_context_peek_token(ctxt);
- if (token == NULL) {
- parse_error(ctxt, NULL, "premature EOI");
- return NULL;
- }
-
switch (token->type) {
case JSON_LCURLY:
return parse_object(ctxt);
case JSON_LSQUARE:
return parse_array(ctxt);
case JSON_INTERP:
- return parse_interpolation(ctxt);
+ return parse_interpolation(ctxt, token);
case JSON_INTEGER:
case JSON_FLOAT:
case JSON_STRING:
- return parse_literal(ctxt);
+ return parse_literal(ctxt, token);
case JSON_KEYWORD:
- return parse_keyword(ctxt);
+ return parse_keyword(ctxt, token);
default:
parse_error(ctxt, token, "expecting value");
return NULL;
@@ -575,8 +553,15 @@ QObject *json_parser_parse(GQueue *tokens, va_list *ap, Error **errp)
{
JSONParserContext ctxt = { .buf = tokens, .ap = ap };
QObject *result;
+ const JSONToken *token;
- result = parse_value(&ctxt);
+ token = parser_context_pop_token(&ctxt);
+ if (token == NULL) {
+ parse_error(&ctxt, NULL, "premature EOI");
+ return NULL;
+ }
+
+ result = parse_value(&ctxt, token);
assert(ctxt.err || g_queue_is_empty(ctxt.buf));
error_propagate(errp, ctxt.err);
--
2.52.0
^ permalink raw reply related [flat|nested] 26+ messages in thread* Re: [PATCH 1/5] json-parser: pass around lookahead token, constify
2026-01-07 8:48 ` [PATCH 1/5] json-parser: pass around lookahead token, constify Paolo Bonzini
@ 2026-02-06 10:45 ` Markus Armbruster
2026-02-06 10:54 ` Paolo Bonzini
0 siblings, 1 reply; 26+ messages in thread
From: Markus Armbruster @ 2026-02-06 10:45 UTC (permalink / raw)
To: Paolo Bonzini; +Cc: qemu-devel
Subject smells like a patch splitting opportunity: a trivial patch
adding const, followed by the more interesting patch passing around the
lookahead. Or the other way round.
In fact, I just did this to help me review the interesting patch: the
trivial patch adds const to JSONToken *. I can split in my tree.
Paolo Bonzini <pbonzini@redhat.com> writes:
> Pass the lookahead token down to the various functions implementing the
> recursive descent, instead of first peeking and then getting the token
> again multiple times.
>
> The main purpose of this patch is to switch the argument passing style
> for parse_* functions to something more desirable for a push parser,
> which gets one and exactly one token at a time.
Peeking at the lookahead is just fine in a traditional parser, but
you're preparing the way for a push parser. Got it.
> However, there are
> some minor improvement in code size and a bugfix as well.
>
> In particular, because parse_array() and parse_object() can assume
> that the opening bracket/brace is not anymore in the token stream, it
> is now apparent that the first entry of an array incorrectly used the
> '[' character (stored in "token") as its location.
For a value of "apparent". Not sure I would've seen it without your
heads up.
I believe you're talking about
obj = parse_value(ctxt);
if (obj == NULL) {
parse_error(ctxt, token, "expecting value");
goto out;
}
where @token indeed holds the array's '['.
This isn't the only spot where we pass a bad @token argument to
parse_error(). For instance:
parse_error(ctxt, NULL, "premature EOI");
These bad arguments are all masked by parse_error() completely ignoring
its argument! We tell users what's wrong with the input, but telling
them where it's wrong has been TODO since the initial commit in 2009[*].
I see your PATCH 5 finally takes care of it.
Losely related: how the parser recovers from syntax errors. Consider
parse_pair():
key_obj = parse_value(ctxt);
If @key_obj, parse_value() successfully parsed a value into @key_obj.
Else, it reported an error.
key = qobject_to(QString, key_obj);
If @key, parse_value() recognized a string.
Else, it either recognized something else, or reported an error.
if (!key) {
parse_error(ctxt, peek, "key is not a string in object");
This error makes sense if it recognized something else.
It doesn't when it reported an error. Works, sort of, because
parse_error() keeps only the first error, and silently ignores
subsequent ones.
Robust parsers handle errors explicitly, they don't blindly continue,
throwing away any subsequent errors. Moreover, I find these misleading
errors grating when I read the code.
But I digress.
goto out;
}
> parse_pair, for
> comparison, handled it correctly:
>
> if (!key) {
> parse_error(ctxt, peek, "key is not a string in object");
> goto out;
> }
>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
> qobject/json-parser.c | 125 +++++++++++++++++++-----------------------
> 1 file changed, 55 insertions(+), 70 deletions(-)
>
> diff --git a/qobject/json-parser.c b/qobject/json-parser.c
> index 7483e582fea..eabc9f8358c 100644
> --- a/qobject/json-parser.c
> +++ b/qobject/json-parser.c
> @@ -49,13 +49,13 @@ typedef struct JSONParserContext {
> * 4) deal with premature EOI
> */
>
> -static QObject *parse_value(JSONParserContext *ctxt);
> +static QObject *parse_value(JSONParserContext *ctxt, const JSONToken *token);
>
> /**
> * Error handler
> */
> static void G_GNUC_PRINTF(3, 4) parse_error(JSONParserContext *ctxt,
> - JSONToken *token, const char *msg, ...)
> + const JSONToken *token, const char *msg, ...)
checkpatch points out
0001-json-parser-pass-around-lookahead-token-constify.patch:48: WARNING: line over 80 characters
Easy enough to avoid. Both
static void G_GNUC_PRINTF(3, 4) parse_error(JSONParserContext *ctxt,
const JSONToken *token,
const char *msg, ...)
and
static void G_GNUC_PRINTF(3, 4)
parse_error(JSONParserContext *ctxt, const JSONToken *token,
const char *msg, ...)
would be easier on my eyes.
More of the same below, not flagging again.
> {
> va_list ap;
> char message[1024];
> @@ -126,7 +126,7 @@ static int cvt4hex(const char *s)
> * - Invalid Unicode characters are rejected.
> * - Control characters \x00..\x1F are rejected by the lexer.
> */
> -static QString *parse_string(JSONParserContext *ctxt, JSONToken *token)
> +static QString *parse_string(JSONParserContext *ctxt, const JSONToken *token)
> {
> const char *ptr = token->str;
> GString *str;
> @@ -235,42 +235,29 @@ out:
> return NULL;
> }
>
> -/* Note: the token object returned by parser_context_peek_token or
> - * parser_context_pop_token is deleted as soon as parser_context_pop_token
> - * is called again.
> +/* Note: the token object returned by parser_context_pop_token is
> + * deleted as soon as parser_context_pop_token is called again.
> */
> -static JSONToken *parser_context_pop_token(JSONParserContext *ctxt)
> +static const JSONToken *parser_context_pop_token(JSONParserContext *ctxt)
> {
> g_free(ctxt->current);
> ctxt->current = g_queue_pop_head(ctxt->buf);
> return ctxt->current;
> }
>
> -static JSONToken *parser_context_peek_token(JSONParserContext *ctxt)
> -{
> - return g_queue_peek_head(ctxt->buf);
> -}
> -
> /**
> * Parsing rules
> */
> -static int parse_pair(JSONParserContext *ctxt, QDict *dict)
> +static int parse_pair(JSONParserContext *ctxt, const JSONToken *token, QDict *dict)
> {
> QObject *key_obj = NULL;
> QString *key;
> QObject *value;
> - JSONToken *peek, *token;
>
> - peek = parser_context_peek_token(ctxt);
> - if (peek == NULL) {
> - parse_error(ctxt, NULL, "premature EOI");
> - goto out;
> - }
> -
> - key_obj = parse_value(ctxt);
> + key_obj = parse_value(ctxt, token);
> key = qobject_to(QString, key_obj);
> if (!key) {
> - parse_error(ctxt, peek, "key is not a string in object");
> + parse_error(ctxt, token, "key is not a string in object");
> goto out;
> }
>
@peek isn't exactly a great name, but @token tells me even less.
@first_token?
> @@ -285,7 +272,13 @@ static int parse_pair(JSONParserContext *ctxt, QDict *dict)
> goto out;
> }
>
> - value = parse_value(ctxt);
> + token = parser_context_pop_token(ctxt);
Assignment to parameter. Not a fan.
If we rename the parameter, we can keep the local variable @token for
use here.
> + if (token == NULL) {
> + parse_error(ctxt, NULL, "premature EOI");
> + goto out;
> + }
> +
> + value = parse_value(ctxt, token);
> if (value == NULL) {
> parse_error(ctxt, token, "Missing value in dict");
> goto out;
> @@ -309,21 +302,18 @@ out:
> static QObject *parse_object(JSONParserContext *ctxt)
> {
> QDict *dict = NULL;
> - JSONToken *token, *peek;
> -
> - token = parser_context_pop_token(ctxt);
> - assert(token && token->type == JSON_LCURLY);
> + const JSONToken *token;
Before the patch, parse_object() parses the entire object, as its name
indicates, with the precondition that the lookahead is '{'.
After the patch, it parses the part of the object after the '{'. Hmm.
We could pass it the first token, like we do for all the other parsing
functions except this one and parse_array(), and keep the assertion.
I'd prefer that, but it's a matter of taste.
>
> dict = qdict_new();
>
> - peek = parser_context_peek_token(ctxt);
> - if (peek == NULL) {
> + token = parser_context_pop_token(ctxt);
> + if (token == NULL) {
> parse_error(ctxt, NULL, "premature EOI");
> goto out;
> }
>
> - if (peek->type != JSON_RCURLY) {
> - if (parse_pair(ctxt, dict) == -1) {
> + if (token->type != JSON_RCURLY) {
> + if (parse_pair(ctxt, token, dict) == -1) {
> goto out;
> }
>
> @@ -339,7 +329,13 @@ static QObject *parse_object(JSONParserContext *ctxt)
> goto out;
> }
>
> - if (parse_pair(ctxt, dict) == -1) {
> + token = parser_context_pop_token(ctxt);
> + if (token == NULL) {
> + parse_error(ctxt, NULL, "premature EOI");
> + goto out;
> + }
> +
> + if (parse_pair(ctxt, token, dict) == -1) {
> goto out;
> }
>
> @@ -349,8 +345,6 @@ static QObject *parse_object(JSONParserContext *ctxt)
> goto out;
> }
> }
> - } else {
> - (void)parser_context_pop_token(ctxt);
> }
>
> return QOBJECT(dict);
> @@ -363,23 +357,20 @@ out:
> static QObject *parse_array(JSONParserContext *ctxt)
> {
> QList *list = NULL;
> - JSONToken *token, *peek;
> -
> - token = parser_context_pop_token(ctxt);
> - assert(token && token->type == JSON_LSQUARE);
> + const JSONToken *token;
Likewise.
>
> list = qlist_new();
>
> - peek = parser_context_peek_token(ctxt);
> - if (peek == NULL) {
> + token = parser_context_pop_token(ctxt);
> + if (token == NULL) {
> parse_error(ctxt, NULL, "premature EOI");
> goto out;
> }
>
> - if (peek->type != JSON_RSQUARE) {
> + if (token->type != JSON_RSQUARE) {
> QObject *obj;
>
> - obj = parse_value(ctxt);
> + obj = parse_value(ctxt, token);
> if (obj == NULL) {
> parse_error(ctxt, token, "expecting value");
> goto out;
> @@ -399,7 +390,13 @@ static QObject *parse_array(JSONParserContext *ctxt)
> goto out;
> }
>
> - obj = parse_value(ctxt);
> + token = parser_context_pop_token(ctxt);
> + if (token == NULL) {
> + parse_error(ctxt, NULL, "premature EOI");
> + goto out;
> + }
> +
> + obj = parse_value(ctxt, token);
> if (obj == NULL) {
> parse_error(ctxt, token, "expecting value");
> goto out;
> @@ -413,8 +410,6 @@ static QObject *parse_array(JSONParserContext *ctxt)
> goto out;
> }
> }
> - } else {
> - (void)parser_context_pop_token(ctxt);
> }
>
> return QOBJECT(list);
> @@ -424,11 +419,8 @@ out:
> return NULL;
> }
>
> -static QObject *parse_keyword(JSONParserContext *ctxt)
> +static QObject *parse_keyword(JSONParserContext *ctxt, const JSONToken *token)
> {
> - JSONToken *token;
> -
> - token = parser_context_pop_token(ctxt);
> assert(token && token->type == JSON_KEYWORD);
>
> if (!strcmp(token->str, "true")) {
> @@ -442,11 +434,8 @@ static QObject *parse_keyword(JSONParserContext *ctxt)
> return NULL;
> }
>
> -static QObject *parse_interpolation(JSONParserContext *ctxt)
> +static QObject *parse_interpolation(JSONParserContext *ctxt, const JSONToken *token)
> {
> - JSONToken *token;
> -
> - token = parser_context_pop_token(ctxt);
> assert(token && token->type == JSON_INTERP);
>
> if (!strcmp(token->str, "%p")) {
> @@ -478,11 +467,8 @@ static QObject *parse_interpolation(JSONParserContext *ctxt)
> return NULL;
> }
>
> -static QObject *parse_literal(JSONParserContext *ctxt)
> +static QObject *parse_literal(JSONParserContext *ctxt, const JSONToken *token)
> {
> - JSONToken *token;
> -
> - token = parser_context_pop_token(ctxt);
> assert(token);
>
> switch (token->type) {
> @@ -530,29 +516,21 @@ static QObject *parse_literal(JSONParserContext *ctxt)
> }
> }
>
> -static QObject *parse_value(JSONParserContext *ctxt)
> +static QObject *parse_value(JSONParserContext *ctxt, const JSONToken *token)
If we name parse_pair()'s new argument @first_token, we should name this
one the same.
> {
> - JSONToken *token;
> -
> - token = parser_context_peek_token(ctxt);
> - if (token == NULL) {
> - parse_error(ctxt, NULL, "premature EOI");
> - return NULL;
> - }
> -
> switch (token->type) {
> case JSON_LCURLY:
> return parse_object(ctxt);
> case JSON_LSQUARE:
> return parse_array(ctxt);
> case JSON_INTERP:
> - return parse_interpolation(ctxt);
> + return parse_interpolation(ctxt, token);
> case JSON_INTEGER:
> case JSON_FLOAT:
> case JSON_STRING:
> - return parse_literal(ctxt);
> + return parse_literal(ctxt, token);
> case JSON_KEYWORD:
> - return parse_keyword(ctxt);
> + return parse_keyword(ctxt, token);
> default:
> parse_error(ctxt, token, "expecting value");
> return NULL;
> @@ -575,8 +553,15 @@ QObject *json_parser_parse(GQueue *tokens, va_list *ap, Error **errp)
> {
> JSONParserContext ctxt = { .buf = tokens, .ap = ap };
> QObject *result;
> + const JSONToken *token;
>
> - result = parse_value(&ctxt);
> + token = parser_context_pop_token(&ctxt);
> + if (token == NULL) {
> + parse_error(&ctxt, NULL, "premature EOI");
> + return NULL;
> + }
> +
> + result = parse_value(&ctxt, token);
> assert(ctxt.err || g_queue_is_empty(ctxt.buf));
>
> error_propagate(errp, ctxt.err);
while (!g_queue_is_empty(ctxt.buf)) {
parser_context_pop_token(&ctxt);
This is the only call of parser_context_pop_token() that isn't
immediately followed by "if it returned null, report "premature EOI".
I went "let's factor this out", then realized the next patch replaces
all this. Nevermind!
}
g_free(ctxt.current);
return result;
}
Summary:
* parse_pair() and parse_value() now receive the first token as
argument. Popping the first token moves to their callers.
* parse_keyword(), parse_interpolation(), and parse_literal() now
receive the token as argument. Popping it moves to their callers.
parse_string() works like this even before the patch, and is not
changed.
* parse_object() and parse_array() now parse the part after the object's
/ array's first token.
Works.
[*] Reminds me of "Ed is generous enough to flag errors, yet prudent
enough not to overwhelm the novice with verbosity."
^ permalink raw reply [flat|nested] 26+ messages in thread* Re: [PATCH 1/5] json-parser: pass around lookahead token, constify
2026-02-06 10:45 ` Markus Armbruster
@ 2026-02-06 10:54 ` Paolo Bonzini
0 siblings, 0 replies; 26+ messages in thread
From: Paolo Bonzini @ 2026-02-06 10:54 UTC (permalink / raw)
To: Markus Armbruster; +Cc: qemu-devel
On Fri, Feb 6, 2026 at 11:46 AM Markus Armbruster <armbru@redhat.com> wrote:
> > In particular, because parse_array() and parse_object() can assume
> > that the opening bracket/brace is not anymore in the token stream, it
> > is now apparent that the first entry of an array incorrectly used the
> > '[' character (stored in "token") as its location.
>
> For a value of "apparent". Not sure I would've seen it without your
> heads up.
Me neither - I found it only while reviewing my own changes. :)
> I believe you're talking about
>
> obj = parse_value(ctxt);
> if (obj == NULL) {
> parse_error(ctxt, token, "expecting value");
> goto out;
> }
>
> where @token indeed holds the array's '['.
Correct.
> parse_error(ctxt, NULL, "premature EOI");
>
> These bad arguments are all masked by parse_error() completely ignoring
> its argument! We tell users what's wrong with the input, but telling
> them where it's wrong has been TODO since the initial commit in 2009[*].
> I see your PATCH 5 finally takes care of it.
Yep, the clearer distinction between the parser and streamer helps a
little in that respect.
> > - key_obj = parse_value(ctxt);
> > + key_obj = parse_value(ctxt, token);
> > key = qobject_to(QString, key_obj);
> > if (!key) {
> > - parse_error(ctxt, peek, "key is not a string in object");
> > + parse_error(ctxt, token, "key is not a string in object");
> > goto out;
> > }
> >
>
> @peek isn't exactly a great name, but @token tells me even less.
> @first_token?
Possibly - though it all goes away since the parser becomes a state
machine, and the only separate functions that survive are for
one-token literals; which is why I just went simply with "token"
everywhere (as in, "next token" or "lookahead that led the parser to
call this function").
> > static QObject *parse_object(JSONParserContext *ctxt)
> > {
> > QDict *dict = NULL;
> > - JSONToken *token, *peek;
> > -
> > - token = parser_context_pop_token(ctxt);
> > - assert(token && token->type == JSON_LCURLY);
> > + const JSONToken *token;
>
> Before the patch, parse_object() parses the entire object, as its name
> indicates, with the precondition that the lookahead is '{'.
>
> After the patch, it parses the part of the object after the '{'. Hmm.
>
> We could pass it the first token, like we do for all the other parsing
> functions except this one and parse_array(), and keep the assertion.
> I'd prefer that, but it's a matter of taste.
I see, yes we could. I can do this in v2, since I'm sure you'll
(rightly) find something in the other patches that warrants a resend.
> This is the only call of parser_context_pop_token() that isn't
> immediately followed by "if it returned null, report "premature EOI".
> I went "let's factor this out", then realized the next patch replaces
> all this. Nevermind!
Yeah, not an excuse for being sloppy but perhaps an excuse for not
being too picky. :)
> * parse_pair() and parse_value() now receive the first token as
> argument. Popping the first token moves to their callers.
>
> * parse_keyword(), parse_interpolation(), and parse_literal() now
> receive the token as argument. Popping it moves to their callers.
> parse_string() works like this even before the patch, and is not
> changed.
>
> * parse_object() and parse_array() now parse the part after the object's
> / array's first token.
>
> Works.
Good :)
Paolo
^ permalink raw reply [flat|nested] 26+ messages in thread
* [PATCH 2/5] json-parser: replace with a push parser
2026-01-07 8:48 [PATCH 0/5] qobject: switch JSON parser to push Paolo Bonzini
2026-01-07 8:48 ` [PATCH 1/5] json-parser: pass around lookahead token, constify Paolo Bonzini
@ 2026-01-07 8:48 ` Paolo Bonzini
2026-02-09 9:36 ` Markus Armbruster
2026-01-07 8:48 ` [PATCH 3/5] json-streamer: remove token queue Paolo Bonzini
` (4 subsequent siblings)
6 siblings, 1 reply; 26+ messages in thread
From: Paolo Bonzini @ 2026-01-07 8:48 UTC (permalink / raw)
To: qemu-devel; +Cc: armbru
In order to avoid stashing all the tokens corresponding to a JSON value,
embed the parsing stack and state machine in JSONParser. This is more
efficient and allows for more prompt error recovery; it also does not
make the code substantially larger than the current recursive descent
parser, though the state machine is probably a bit harder to follow.
The stack consists of QLists and QDicts corresponding to open
brackets and braces, plus optionally a QString with the current
key on top of each QDict.
After each value is parsed, it is added to the top array or dictionary
or, if the stack is empty, json_parser_feed returns the complete
QObject.
For now, json-streamer.c keeps tracking the tokens up until braces
and brackets are balanced, and then shoves the whole queue of tokens
into the push parser. The only logic change is that JSON_END_OF_INPUT
always triggers the emptying of the queue; the parser takes notice and
checks that there is nothing on the stack. Not using brace_count
and bracket_count for this is the first step towards improved separation
of concerns between json-parser.c and json-streamer.c.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
include/qobject/json-parser.h | 6 +
qobject/json-parser-int.h | 4 +-
qobject/json-parser.c | 430 +++++++++++++++++-----------------
qobject/json-streamer.c | 21 +-
4 files changed, 245 insertions(+), 216 deletions(-)
diff --git a/include/qobject/json-parser.h b/include/qobject/json-parser.h
index 7345a9bd5cb..05346fa816b 100644
--- a/include/qobject/json-parser.h
+++ b/include/qobject/json-parser.h
@@ -20,6 +20,12 @@ typedef struct JSONLexer {
int x, y;
} JSONLexer;
+typedef struct JSONParserContext {
+ Error *err;
+ GQueue *stack;
+ va_list *ap;
+} JSONParserContext;
+
typedef struct JSONMessageParser {
void (*emit)(void *opaque, QObject *json, Error *err);
void *opaque;
diff --git a/qobject/json-parser-int.h b/qobject/json-parser-int.h
index 8c01f236276..05e2e8e1831 100644
--- a/qobject/json-parser-int.h
+++ b/qobject/json-parser-int.h
@@ -49,6 +49,8 @@ void json_message_process_token(JSONLexer *lexer, GString *input,
/* json-parser.c */
JSONToken *json_token(JSONTokenType type, int x, int y, GString *tokstr);
-QObject *json_parser_parse(GQueue *tokens, va_list *ap, Error **errp);
+void json_parser_init(JSONParserContext *ctxt, va_list *ap);
+QObject *json_parser_feed(JSONParserContext *ctxt, const JSONToken *token, Error **errp);
+void json_parser_destroy(JSONParserContext *ctxt);
#endif
diff --git a/qobject/json-parser.c b/qobject/json-parser.c
index eabc9f8358c..338f0c5aae6 100644
--- a/qobject/json-parser.c
+++ b/qobject/json-parser.c
@@ -31,12 +31,36 @@ struct JSONToken {
char str[];
};
-typedef struct JSONParserContext {
- Error *err;
- JSONToken *current;
- GQueue *buf;
- va_list *ap;
-} JSONParserContext;
+/*
+ * Objects: { } | { members }
+ * - Empty: { -> AFTER_LCURLY -> }
+ * - Non-empty: { -> AFTER_LCURLY -> BEFORE_KEY -> string -> END_OF_KEY -> : ->
+ * BEFORE_VALUE -> value -> END_OF_VALUE -> , -> BEFORE_KEY -> ... -> }
+ *
+ * Arrays: [ ] | [ elements ]
+ * - Empty: [ -> AFTER_LSQUARE -> ]
+ * - Non-empty: [ -> AFTER_LSQUARE -> BEFORE_VALUE -> value -> END_OF_VALUE -> , ->
+ * BEFORE_VALUE -> ... -> ]
+ *
+ * The two cases for END_OF_VALUE are distinguished based on the type of QObject at
+ * top-of-stack.
+ */
+typedef enum JSONParserState {
+ AFTER_LCURLY,
+ AFTER_LSQUARE,
+ BEFORE_KEY,
+ BEFORE_VALUE,
+ END_OF_KEY,
+ END_OF_VALUE,
+} JSONParserState;
+
+typedef struct JSONParserStackEntry {
+ /* A QString with the last parsed key, or a QList/QDict for the current container. */
+ QObject *partial;
+
+ /* Needed to distinguish whether the parser is waiting for a colon or comma. */
+ JSONParserState state;
+} JSONParserStackEntry;
#define BUG_ON(cond) assert(!(cond))
@@ -49,7 +73,29 @@ typedef struct JSONParserContext {
* 4) deal with premature EOI
*/
-static QObject *parse_value(JSONParserContext *ctxt, const JSONToken *token);
+static inline JSONParserStackEntry *current_entry(JSONParserContext *ctxt)
+{
+ return g_queue_peek_tail(ctxt->stack);
+}
+
+static void push_entry(JSONParserContext *ctxt, QObject *partial, JSONParserState state)
+{
+ JSONParserStackEntry *entry = g_new(JSONParserStackEntry, 1);
+ entry->partial = partial;
+ entry->state = state;
+ g_queue_push_tail(ctxt->stack, entry);
+}
+
+/* Note that entry->partial does *not* lose its reference count even if value == NULL. */
+static JSONParserStackEntry *pop_entry(JSONParserContext *ctxt, QObject **value)
+{
+ JSONParserStackEntry *entry = g_queue_pop_tail(ctxt->stack);
+ if (value) {
+ *value = entry->partial;
+ }
+ g_free(entry);
+ return current_entry(ctxt);
+}
/**
* Error handler
@@ -235,189 +281,7 @@ out:
return NULL;
}
-/* Note: the token object returned by parser_context_pop_token is
- * deleted as soon as parser_context_pop_token is called again.
- */
-static const JSONToken *parser_context_pop_token(JSONParserContext *ctxt)
-{
- g_free(ctxt->current);
- ctxt->current = g_queue_pop_head(ctxt->buf);
- return ctxt->current;
-}
-
-/**
- * Parsing rules
- */
-static int parse_pair(JSONParserContext *ctxt, const JSONToken *token, QDict *dict)
-{
- QObject *key_obj = NULL;
- QString *key;
- QObject *value;
-
- key_obj = parse_value(ctxt, token);
- key = qobject_to(QString, key_obj);
- if (!key) {
- parse_error(ctxt, token, "key is not a string in object");
- goto out;
- }
-
- token = parser_context_pop_token(ctxt);
- if (token == NULL) {
- parse_error(ctxt, NULL, "premature EOI");
- goto out;
- }
-
- if (token->type != JSON_COLON) {
- parse_error(ctxt, token, "missing : in object pair");
- goto out;
- }
-
- token = parser_context_pop_token(ctxt);
- if (token == NULL) {
- parse_error(ctxt, NULL, "premature EOI");
- goto out;
- }
-
- value = parse_value(ctxt, token);
- if (value == NULL) {
- parse_error(ctxt, token, "Missing value in dict");
- goto out;
- }
-
- if (qdict_haskey(dict, qstring_get_str(key))) {
- parse_error(ctxt, token, "duplicate key");
- goto out;
- }
-
- qdict_put_obj(dict, qstring_get_str(key), value);
-
- qobject_unref(key_obj);
- return 0;
-
-out:
- qobject_unref(key_obj);
- return -1;
-}
-
-static QObject *parse_object(JSONParserContext *ctxt)
-{
- QDict *dict = NULL;
- const JSONToken *token;
-
- dict = qdict_new();
-
- token = parser_context_pop_token(ctxt);
- if (token == NULL) {
- parse_error(ctxt, NULL, "premature EOI");
- goto out;
- }
-
- if (token->type != JSON_RCURLY) {
- if (parse_pair(ctxt, token, dict) == -1) {
- goto out;
- }
-
- token = parser_context_pop_token(ctxt);
- if (token == NULL) {
- parse_error(ctxt, NULL, "premature EOI");
- goto out;
- }
-
- while (token->type != JSON_RCURLY) {
- if (token->type != JSON_COMMA) {
- parse_error(ctxt, token, "expected separator in dict");
- goto out;
- }
-
- token = parser_context_pop_token(ctxt);
- if (token == NULL) {
- parse_error(ctxt, NULL, "premature EOI");
- goto out;
- }
-
- if (parse_pair(ctxt, token, dict) == -1) {
- goto out;
- }
-
- token = parser_context_pop_token(ctxt);
- if (token == NULL) {
- parse_error(ctxt, NULL, "premature EOI");
- goto out;
- }
- }
- }
-
- return QOBJECT(dict);
-
-out:
- qobject_unref(dict);
- return NULL;
-}
-
-static QObject *parse_array(JSONParserContext *ctxt)
-{
- QList *list = NULL;
- const JSONToken *token;
-
- list = qlist_new();
-
- token = parser_context_pop_token(ctxt);
- if (token == NULL) {
- parse_error(ctxt, NULL, "premature EOI");
- goto out;
- }
-
- if (token->type != JSON_RSQUARE) {
- QObject *obj;
-
- obj = parse_value(ctxt, token);
- if (obj == NULL) {
- parse_error(ctxt, token, "expecting value");
- goto out;
- }
-
- qlist_append_obj(list, obj);
-
- token = parser_context_pop_token(ctxt);
- if (token == NULL) {
- parse_error(ctxt, NULL, "premature EOI");
- goto out;
- }
-
- while (token->type != JSON_RSQUARE) {
- if (token->type != JSON_COMMA) {
- parse_error(ctxt, token, "expected separator in list");
- goto out;
- }
-
- token = parser_context_pop_token(ctxt);
- if (token == NULL) {
- parse_error(ctxt, NULL, "premature EOI");
- goto out;
- }
-
- obj = parse_value(ctxt, token);
- if (obj == NULL) {
- parse_error(ctxt, token, "expecting value");
- goto out;
- }
-
- qlist_append_obj(list, obj);
-
- token = parser_context_pop_token(ctxt);
- if (token == NULL) {
- parse_error(ctxt, NULL, "premature EOI");
- goto out;
- }
- }
- }
-
- return QOBJECT(list);
-
-out:
- qobject_unref(list);
- return NULL;
-}
+/* Terminals */
static QObject *parse_keyword(JSONParserContext *ctxt, const JSONToken *token)
{
@@ -516,13 +380,17 @@ static QObject *parse_literal(JSONParserContext *ctxt, const JSONToken *token)
}
}
-static QObject *parse_value(JSONParserContext *ctxt, const JSONToken *token)
+/* Parsing state machine */
+
+static QObject *parse_begin_value(JSONParserContext *ctxt, const JSONToken *token)
{
switch (token->type) {
case JSON_LCURLY:
- return parse_object(ctxt);
+ push_entry(ctxt, QOBJECT(qdict_new()), AFTER_LCURLY);
+ return NULL;
case JSON_LSQUARE:
- return parse_array(ctxt);
+ push_entry(ctxt, QOBJECT(qlist_new()), AFTER_LSQUARE);
+ return NULL;
case JSON_INTERP:
return parse_interpolation(ctxt, token);
case JSON_INTEGER:
@@ -537,6 +405,130 @@ static QObject *parse_value(JSONParserContext *ctxt, const JSONToken *token)
}
}
+static QObject *json_parser_parse_token(JSONParserContext *ctxt, const JSONToken *token)
+{
+ JSONParserStackEntry *entry;
+ JSONParserState state;
+ QString *key;
+ QObject *value = NULL;
+
+ entry = current_entry(ctxt);
+ state = entry ? entry->state : BEFORE_VALUE;
+ switch (state) {
+ case AFTER_LCURLY:
+ /* Grab '}' for empty object or fall through to BEFORE_KEY */
+ if (token->type == JSON_RCURLY) {
+ entry = pop_entry(ctxt, &value);
+ break;
+ }
+ entry->state = BEFORE_KEY;
+ /* fall through */
+
+ case BEFORE_KEY:
+ /* Expecting object key */
+ if (token->type == JSON_STRING) {
+ key = parse_string(ctxt, token);
+ if (!key) {
+ return NULL;
+ }
+
+ /* Store key in a special entry on the stack */
+ push_entry(ctxt, QOBJECT(key), END_OF_KEY);
+ } else {
+ parse_error(ctxt, token, "expecting key");
+ }
+ return NULL;
+
+ case END_OF_KEY:
+ /* Expecting ':' after key */
+ if (token->type == JSON_COLON) {
+ entry->state = BEFORE_VALUE;
+ } else {
+ parse_error(ctxt, token, "expecting ':'");
+ }
+ return NULL;
+
+ case AFTER_LSQUARE:
+ /* Grab ']' for empty array or fall through to BEFORE_VALUE */
+ if (token->type == JSON_RSQUARE) {
+ entry = pop_entry(ctxt, &value);
+ break;
+ }
+ entry->state = BEFORE_VALUE;
+ /* fall through */
+
+ case BEFORE_VALUE:
+ /* Expecting value */
+ value = parse_begin_value(ctxt, token);
+ if (!value) {
+ /* Error or '['/'{' */
+ return NULL;
+ }
+ /* Return value or insert it into a container */
+ break;
+
+ case END_OF_VALUE:
+ /* Grab ',' or ']' for array; ',' or '}' for object */
+ if (qobject_to(QList, entry->partial)) {
+ /* Array */
+ if (token->type != JSON_RSQUARE) {
+ if (token->type == JSON_COMMA) {
+ entry->state = BEFORE_VALUE;
+ } else {
+ parse_error(ctxt, token, "expected ',' or ']'");
+ }
+ return NULL;
+ }
+ } else if (qobject_to(QDict, entry->partial)) {
+ /* Object */
+ if (token->type != JSON_RCURLY) {
+ if (token->type == JSON_COMMA) {
+ entry->state = BEFORE_KEY;
+ } else {
+ parse_error(ctxt, token, "expected ',' or '}'");
+ }
+ return NULL;
+ }
+ } else {
+ g_assert_not_reached();
+ }
+
+ /* Got ']' or '}', return value or insert into parent container */
+ entry = pop_entry(ctxt, &value);
+ break;
+ }
+
+ assert(value);
+ if (entry == NULL) {
+ /* The toplevel value is complete. */
+ return value;
+ }
+
+ key = qobject_to(QString, entry->partial);
+ if (key) {
+ const char *key_str;
+ QDict *dict;
+
+ entry = pop_entry(ctxt, NULL);
+ dict = qobject_to(QDict, entry->partial);
+ assert(dict);
+ key_str = qstring_get_str(key);
+ if (qdict_haskey(dict, key_str)) {
+ parse_error(ctxt, token, "duplicate key");
+ qobject_unref(value);
+ return NULL;
+ }
+ qdict_put_obj(dict, key_str, value);
+ qobject_unref(key);
+ } else {
+ /* Add to array */
+ qlist_append_obj(qobject_to(QList, entry->partial), value);
+ }
+
+ entry->state = END_OF_VALUE;
+ return NULL;
+}
+
JSONToken *json_token(JSONTokenType type, int x, int y, GString *tokstr)
{
JSONToken *token = g_malloc(sizeof(JSONToken) + tokstr->len + 1);
@@ -549,27 +541,43 @@ JSONToken *json_token(JSONTokenType type, int x, int y, GString *tokstr)
return token;
}
-QObject *json_parser_parse(GQueue *tokens, va_list *ap, Error **errp)
+void json_parser_init(JSONParserContext *ctxt, va_list *ap)
{
- JSONParserContext ctxt = { .buf = tokens, .ap = ap };
- QObject *result;
- const JSONToken *token;
+ ctxt->err = NULL;
+ ctxt->stack = g_queue_new();
+ ctxt->ap = ap;
+}
- token = parser_context_pop_token(&ctxt);
- if (token == NULL) {
- parse_error(&ctxt, NULL, "premature EOI");
- return NULL;
+void json_parser_destroy(JSONParserContext *ctxt)
+{
+ JSONParserStackEntry *entry;
+
+ while ((entry = g_queue_pop_tail(ctxt->stack)) != NULL) {
+ qobject_unref(entry->partial);
+ g_free(entry);
+ }
+ g_queue_free(ctxt->stack);
+ ctxt->stack = NULL;
+}
+
+QObject *json_parser_feed(JSONParserContext *ctxt, const JSONToken *token, Error **errp)
+{
+ QObject *result = NULL;
+
+ assert(!ctxt->err);
+ switch (token->type) {
+ case JSON_END_OF_INPUT:
+ /* Check for premature end of input */
+ if (!g_queue_is_empty(ctxt->stack)) {
+ parse_error(ctxt, token, "premature end of input");
+ }
+ break;
+
+ default:
+ result = json_parser_parse_token(ctxt, token);
+ break;
}
- result = parse_value(&ctxt, token);
- assert(ctxt.err || g_queue_is_empty(ctxt.buf));
-
- error_propagate(errp, ctxt.err);
-
- while (!g_queue_is_empty(ctxt.buf)) {
- parser_context_pop_token(&ctxt);
- }
- g_free(ctxt.current);
-
+ error_propagate(errp, ctxt->err);
return result;
}
diff --git a/qobject/json-streamer.c b/qobject/json-streamer.c
index b93d97b995f..6c93e6fd78d 100644
--- a/qobject/json-streamer.c
+++ b/qobject/json-streamer.c
@@ -32,6 +32,7 @@ void json_message_process_token(JSONLexer *lexer, GString *input,
JSONTokenType type, int x, int y)
{
JSONMessageParser *parser = container_of(lexer, JSONMessageParser, lexer);
+ JSONParserContext ctxt;
QObject *json = NULL;
Error *err = NULL;
JSONToken *token;
@@ -56,8 +57,7 @@ void json_message_process_token(JSONLexer *lexer, GString *input,
if (g_queue_is_empty(&parser->tokens)) {
return;
}
- json = json_parser_parse(&parser->tokens, parser->ap, &err);
- goto out_emit;
+ break;
default:
break;
}
@@ -85,11 +85,24 @@ void json_message_process_token(JSONLexer *lexer, GString *input,
g_queue_push_tail(&parser->tokens, token);
if ((parser->brace_count > 0 || parser->bracket_count > 0)
- && parser->brace_count >= 0 && parser->bracket_count >= 0) {
+ && parser->brace_count >= 0 && parser->bracket_count >= 0
+ && type != JSON_END_OF_INPUT) {
return;
}
- json = json_parser_parse(&parser->tokens, parser->ap, &err);
+ json_parser_init(&ctxt, parser->ap);
+
+ /* Process all tokens in the queue */
+ while (!g_queue_is_empty(&parser->tokens)) {
+ token = g_queue_pop_head(&parser->tokens);
+ json = json_parser_feed(&ctxt, token, &err);
+ g_free(token);
+ if (json || err) {
+ break;
+ }
+ }
+
+ json_parser_destroy(&ctxt);
out_emit:
parser->brace_count = 0;
--
2.52.0
^ permalink raw reply related [flat|nested] 26+ messages in thread* Re: [PATCH 2/5] json-parser: replace with a push parser
2026-01-07 8:48 ` [PATCH 2/5] json-parser: replace with a push parser Paolo Bonzini
@ 2026-02-09 9:36 ` Markus Armbruster
2026-02-09 10:53 ` Paolo Bonzini
0 siblings, 1 reply; 26+ messages in thread
From: Markus Armbruster @ 2026-02-09 9:36 UTC (permalink / raw)
To: Paolo Bonzini; +Cc: qemu-devel
Paolo Bonzini <pbonzini@redhat.com> writes:
> In order to avoid stashing all the tokens corresponding to a JSON value,
> embed the parsing stack and state machine in JSONParser. This is more
> efficient and allows for more prompt error recovery; it also does not
> make the code substantially larger than the current recursive descent
> parser, though the state machine is probably a bit harder to follow.
I figure that's tolerable, because we don't expect to change the parser
much.
> The stack consists of QLists and QDicts corresponding to open
> brackets and braces, plus optionally a QString with the current
> key on top of each QDict.
>
> After each value is parsed, it is added to the top array or dictionary
> or, if the stack is empty, json_parser_feed returns the complete
> QObject.
>
> For now, json-streamer.c keeps tracking the tokens up until braces
> and brackets are balanced, and then shoves the whole queue of tokens
> into the push parser.
Peeking ahead... the next patch will make the streamer stream tokens as
they arrive?
> The only logic change is that JSON_END_OF_INPUT
> always triggers the emptying of the queue; the parser takes notice and
> checks that there is nothing on the stack. Not using brace_count
> and bracket_count for this is the first step towards improved separation
> of concerns between json-parser.c and json-streamer.c.
>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
> include/qobject/json-parser.h | 6 +
> qobject/json-parser-int.h | 4 +-
> qobject/json-parser.c | 430 +++++++++++++++++-----------------
> qobject/json-streamer.c | 21 +-
> 4 files changed, 245 insertions(+), 216 deletions(-)
>
> diff --git a/include/qobject/json-parser.h b/include/qobject/json-parser.h
> index 7345a9bd5cb..05346fa816b 100644
> --- a/include/qobject/json-parser.h
> +++ b/include/qobject/json-parser.h
> @@ -20,6 +20,12 @@ typedef struct JSONLexer {
> int x, y;
> } JSONLexer;
>
> +typedef struct JSONParserContext {
> + Error *err;
> + GQueue *stack;
This tells us we have a stack of something, but not what the something
is. Further down, we'll see what the something is.
The commit message explains what the stack means.
Worth a comment here?
We know the stack is at most 1024 deep (MAX_NESTING). Have you
considered using an array instead of GQueue?
> + va_list *ap;
> +} JSONParserContext;
> +
Having to move struct definitions to headers is always a bit sad. Could
this go into json-parser-int.h instead?
> typedef struct JSONMessageParser {
> void (*emit)(void *opaque, QObject *json, Error *err);
> void *opaque;
> diff --git a/qobject/json-parser-int.h b/qobject/json-parser-int.h
> index 8c01f236276..05e2e8e1831 100644
> --- a/qobject/json-parser-int.h
> +++ b/qobject/json-parser-int.h
> @@ -49,6 +49,8 @@ void json_message_process_token(JSONLexer *lexer, GString *input,
>
> /* json-parser.c */
> JSONToken *json_token(JSONTokenType type, int x, int y, GString *tokstr);
> -QObject *json_parser_parse(GQueue *tokens, va_list *ap, Error **errp);
> +void json_parser_init(JSONParserContext *ctxt, va_list *ap);
> +QObject *json_parser_feed(JSONParserContext *ctxt, const JSONToken *token, Error **errp);
checkpatch points out
0002-json-parser-replace-with-a-push-parser.patch:64: WARNING: line over 80 characters
The line is easy enough to break.
Several more long lines below, not flagging them.
> +void json_parser_destroy(JSONParserContext *ctxt);
>
> #endif
Note: I'm jumping down to qobject/json-streamer.c further down before I
continue here. Suggest you do the same.
> diff --git a/qobject/json-parser.c b/qobject/json-parser.c
> index eabc9f8358c..338f0c5aae6 100644
> --- a/qobject/json-parser.c
> +++ b/qobject/json-parser.c
> @@ -31,12 +31,36 @@ struct JSONToken {
> char str[];
> };
>
> -typedef struct JSONParserContext {
> - Error *err;
> - JSONToken *current;
> - GQueue *buf;
> - va_list *ap;
> -} JSONParserContext;
> +/*
> + * Objects: { } | { members }
> + * - Empty: { -> AFTER_LCURLY -> }
> + * - Non-empty: { -> AFTER_LCURLY -> BEFORE_KEY -> string -> END_OF_KEY -> : ->
> + * BEFORE_VALUE -> value -> END_OF_VALUE -> , -> BEFORE_KEY -> ... -> }
> + *
> + * Arrays: [ ] | [ elements ]
> + * - Empty: [ -> AFTER_LSQUARE -> ]
> + * - Non-empty: [ -> AFTER_LSQUARE -> BEFORE_VALUE -> value -> END_OF_VALUE -> , ->
> + * BEFORE_VALUE -> ... -> ]
> + *
> + * The two cases for END_OF_VALUE are distinguished based on the type of QObject at
> + * top-of-stack.
> + */
I'm afraid I need a comment telling me how to read this comment.
> +typedef enum JSONParserState {
> + AFTER_LCURLY,
> + AFTER_LSQUARE,
> + BEFORE_KEY,
> + BEFORE_VALUE,
> + END_OF_KEY,
> + END_OF_VALUE,
> +} JSONParserState;
> +
> +typedef struct JSONParserStackEntry {
> + /* A QString with the last parsed key, or a QList/QDict for the current container. */
> + QObject *partial;
> +
> + /* Needed to distinguish whether the parser is waiting for a colon or comma. */
> + JSONParserState state;
If the comment was 100% accurate, a bool would do.
> +} JSONParserStackEntry;
This is the "something" in the stack of something. Not locally obvious.
>
> #define BUG_ON(cond) assert(!(cond))
>
> @@ -49,7 +73,29 @@ typedef struct JSONParserContext {
> * 4) deal with premature EOI
> */
>
> -static QObject *parse_value(JSONParserContext *ctxt, const JSONToken *token);
> +static inline JSONParserStackEntry *current_entry(JSONParserContext *ctxt)
> +{
> + return g_queue_peek_tail(ctxt->stack);
> +}
> +
> +static void push_entry(JSONParserContext *ctxt, QObject *partial, JSONParserState state)
> +{
> + JSONParserStackEntry *entry = g_new(JSONParserStackEntry, 1);
I'd like a blank line between declarations and statements, please. More
of the same below, not flagging it again.
> + entry->partial = partial;
> + entry->state = state;
> + g_queue_push_tail(ctxt->stack, entry);
> +}
> +
> +/* Note that entry->partial does *not* lose its reference count even if value == NULL. */
> +static JSONParserStackEntry *pop_entry(JSONParserContext *ctxt, QObject **value)
> +{
> + JSONParserStackEntry *entry = g_queue_pop_tail(ctxt->stack);
> + if (value) {
> + *value = entry->partial;
> + }
> + g_free(entry);
> + return current_entry(ctxt);
> +}
This is a rather peculiar function. If you disagree, try writing a
contract for it :)
>
> /**
> * Error handler
> @@ -235,189 +281,7 @@ out:
> return NULL;
> }
>
> -/* Note: the token object returned by parser_context_pop_token is
> - * deleted as soon as parser_context_pop_token is called again.
> - */
> -static const JSONToken *parser_context_pop_token(JSONParserContext *ctxt)
> -{
> - g_free(ctxt->current);
> - ctxt->current = g_queue_pop_head(ctxt->buf);
> - return ctxt->current;
> -}
> -
> -/**
> - * Parsing rules
> - */
> -static int parse_pair(JSONParserContext *ctxt, const JSONToken *token, QDict *dict)
> -{
> - QObject *key_obj = NULL;
> - QString *key;
> - QObject *value;
> -
> - key_obj = parse_value(ctxt, token);
> - key = qobject_to(QString, key_obj);
> - if (!key) {
> - parse_error(ctxt, token, "key is not a string in object");
> - goto out;
> - }
> -
> - token = parser_context_pop_token(ctxt);
> - if (token == NULL) {
> - parse_error(ctxt, NULL, "premature EOI");
> - goto out;
> - }
> -
> - if (token->type != JSON_COLON) {
> - parse_error(ctxt, token, "missing : in object pair");
> - goto out;
> - }
> -
> - token = parser_context_pop_token(ctxt);
> - if (token == NULL) {
> - parse_error(ctxt, NULL, "premature EOI");
> - goto out;
> - }
> -
> - value = parse_value(ctxt, token);
> - if (value == NULL) {
> - parse_error(ctxt, token, "Missing value in dict");
> - goto out;
> - }
> -
> - if (qdict_haskey(dict, qstring_get_str(key))) {
> - parse_error(ctxt, token, "duplicate key");
> - goto out;
> - }
> -
> - qdict_put_obj(dict, qstring_get_str(key), value);
> -
> - qobject_unref(key_obj);
> - return 0;
> -
> -out:
> - qobject_unref(key_obj);
> - return -1;
> -}
> -
> -static QObject *parse_object(JSONParserContext *ctxt)
> -{
> - QDict *dict = NULL;
> - const JSONToken *token;
> -
> - dict = qdict_new();
> -
> - token = parser_context_pop_token(ctxt);
> - if (token == NULL) {
> - parse_error(ctxt, NULL, "premature EOI");
> - goto out;
> - }
> -
> - if (token->type != JSON_RCURLY) {
> - if (parse_pair(ctxt, token, dict) == -1) {
> - goto out;
> - }
> -
> - token = parser_context_pop_token(ctxt);
> - if (token == NULL) {
> - parse_error(ctxt, NULL, "premature EOI");
> - goto out;
> - }
> -
> - while (token->type != JSON_RCURLY) {
> - if (token->type != JSON_COMMA) {
> - parse_error(ctxt, token, "expected separator in dict");
> - goto out;
> - }
> -
> - token = parser_context_pop_token(ctxt);
> - if (token == NULL) {
> - parse_error(ctxt, NULL, "premature EOI");
> - goto out;
> - }
> -
> - if (parse_pair(ctxt, token, dict) == -1) {
> - goto out;
> - }
> -
> - token = parser_context_pop_token(ctxt);
> - if (token == NULL) {
> - parse_error(ctxt, NULL, "premature EOI");
> - goto out;
> - }
> - }
> - }
> -
> - return QOBJECT(dict);
> -
> -out:
> - qobject_unref(dict);
> - return NULL;
> -}
> -
> -static QObject *parse_array(JSONParserContext *ctxt)
> -{
> - QList *list = NULL;
> - const JSONToken *token;
> -
> - list = qlist_new();
> -
> - token = parser_context_pop_token(ctxt);
> - if (token == NULL) {
> - parse_error(ctxt, NULL, "premature EOI");
> - goto out;
> - }
> -
> - if (token->type != JSON_RSQUARE) {
> - QObject *obj;
> -
> - obj = parse_value(ctxt, token);
> - if (obj == NULL) {
> - parse_error(ctxt, token, "expecting value");
> - goto out;
> - }
> -
> - qlist_append_obj(list, obj);
> -
> - token = parser_context_pop_token(ctxt);
> - if (token == NULL) {
> - parse_error(ctxt, NULL, "premature EOI");
> - goto out;
> - }
> -
> - while (token->type != JSON_RSQUARE) {
> - if (token->type != JSON_COMMA) {
> - parse_error(ctxt, token, "expected separator in list");
> - goto out;
> - }
> -
> - token = parser_context_pop_token(ctxt);
> - if (token == NULL) {
> - parse_error(ctxt, NULL, "premature EOI");
> - goto out;
> - }
> -
> - obj = parse_value(ctxt, token);
> - if (obj == NULL) {
> - parse_error(ctxt, token, "expecting value");
> - goto out;
> - }
> -
> - qlist_append_obj(list, obj);
> -
> - token = parser_context_pop_token(ctxt);
> - if (token == NULL) {
> - parse_error(ctxt, NULL, "premature EOI");
> - goto out;
> - }
> - }
> - }
> -
> - return QOBJECT(list);
> -
> -out:
> - qobject_unref(list);
> - return NULL;
> -}
> +/* Terminals */
>
> static QObject *parse_keyword(JSONParserContext *ctxt, const JSONToken *token)
> {
> @@ -516,13 +380,17 @@ static QObject *parse_literal(JSONParserContext *ctxt, const JSONToken *token)
> }
> }
>
The parser's interesting part follows. The parser is a pushdown
automaton. The pushdown automaton is coded in C. On the one hand,
d'oh! Of course it is. On the other hand, it's hard to see the actual
automaton in C. May I have a comment explaining state and state
transitions? Perhaps we better start with an informal description,
discuss it, then distill the discussion into a comment.
> -static QObject *parse_value(JSONParserContext *ctxt, const JSONToken *token)
> +/* Parsing state machine */
> +
> +static QObject *parse_begin_value(JSONParserContext *ctxt, const JSONToken *token)
> {
> switch (token->type) {
> case JSON_LCURLY:
> - return parse_object(ctxt);
> + push_entry(ctxt, QOBJECT(qdict_new()), AFTER_LCURLY);
> + return NULL;
> case JSON_LSQUARE:
> - return parse_array(ctxt);
> + push_entry(ctxt, QOBJECT(qlist_new()), AFTER_LSQUARE);
> + return NULL;
> case JSON_INTERP:
> return parse_interpolation(ctxt, token);
> case JSON_INTEGER:
> @@ -537,6 +405,130 @@ static QObject *parse_value(JSONParserContext *ctxt, const JSONToken *token)
> }
> }
>
> +static QObject *json_parser_parse_token(JSONParserContext *ctxt, const JSONToken *token)
Rename to parse_token()? Or inline into json_parser_feed()?
> +{
> + JSONParserStackEntry *entry;
> + JSONParserState state;
> + QString *key;
> + QObject *value = NULL;
> +
> + entry = current_entry(ctxt);
> + state = entry ? entry->state : BEFORE_VALUE;
Blank line here, please.
> + switch (state) {
> + case AFTER_LCURLY:
> + /* Grab '}' for empty object or fall through to BEFORE_KEY */
> + if (token->type == JSON_RCURLY) {
> + entry = pop_entry(ctxt, &value);
> + break;
> + }
> + entry->state = BEFORE_KEY;
> + /* fall through */
> +
> + case BEFORE_KEY:
> + /* Expecting object key */
> + if (token->type == JSON_STRING) {
> + key = parse_string(ctxt, token);
> + if (!key) {
> + return NULL;
> + }
Key to understand the switch is the meaning of "break" and "return
NULL".
We do the latter when we're done for this token.
We do the former when this token completed a (sub-)value. @value holds
it. If the stack is empty, @value is the result of the parse, and the
code after the switch returns it. Else, @value is a sub-value, and the
code after the switch stores it in the object being constructed.
> +
> + /* Store key in a special entry on the stack */
> + push_entry(ctxt, QOBJECT(key), END_OF_KEY);
> + } else {
> + parse_error(ctxt, token, "expecting key");
What's the state machine's parse error recovery story?
> + }
> + return NULL;
> +
> + case END_OF_KEY:
> + /* Expecting ':' after key */
> + if (token->type == JSON_COLON) {
> + entry->state = BEFORE_VALUE;
> + } else {
> + parse_error(ctxt, token, "expecting ':'");
> + }
> + return NULL;
> +
> + case AFTER_LSQUARE:
> + /* Grab ']' for empty array or fall through to BEFORE_VALUE */
> + if (token->type == JSON_RSQUARE) {
> + entry = pop_entry(ctxt, &value);
> + break;
> + }
> + entry->state = BEFORE_VALUE;
> + /* fall through */
> +
> + case BEFORE_VALUE:
> + /* Expecting value */
> + value = parse_begin_value(ctxt, token);
> + if (!value) {
> + /* Error or '['/'{' */
> + return NULL;
> + }
> + /* Return value or insert it into a container */
> + break;
> +
> + case END_OF_VALUE:
> + /* Grab ',' or ']' for array; ',' or '}' for object */
> + if (qobject_to(QList, entry->partial)) {
> + /* Array */
> + if (token->type != JSON_RSQUARE) {
> + if (token->type == JSON_COMMA) {
> + entry->state = BEFORE_VALUE;
> + } else {
> + parse_error(ctxt, token, "expected ',' or ']'");
> + }
> + return NULL;
> + }
> + } else if (qobject_to(QDict, entry->partial)) {
> + /* Object */
> + if (token->type != JSON_RCURLY) {
> + if (token->type == JSON_COMMA) {
> + entry->state = BEFORE_KEY;
> + } else {
> + parse_error(ctxt, token, "expected ',' or '}'");
> + }
> + return NULL;
> + }
> + } else {
> + g_assert_not_reached();
> + }
> +
> + /* Got ']' or '}', return value or insert into parent container */
> + entry = pop_entry(ctxt, &value);
> + break;
> + }
> +
> + assert(value);
> + if (entry == NULL) {
> + /* The toplevel value is complete. */
> + return value;
> + }
> +
> + key = qobject_to(QString, entry->partial);
> + if (key) {
> + const char *key_str;
> + QDict *dict;
> +
> + entry = pop_entry(ctxt, NULL);
> + dict = qobject_to(QDict, entry->partial);
> + assert(dict);
> + key_str = qstring_get_str(key);
> + if (qdict_haskey(dict, key_str)) {
> + parse_error(ctxt, token, "duplicate key");
> + qobject_unref(value);
> + return NULL;
> + }
> + qdict_put_obj(dict, key_str, value);
> + qobject_unref(key);
> + } else {
> + /* Add to array */
> + qlist_append_obj(qobject_to(QList, entry->partial), value);
> + }
> +
> + entry->state = END_OF_VALUE;
> + return NULL;
> +}
> +
> JSONToken *json_token(JSONTokenType type, int x, int y, GString *tokstr)
> {
> JSONToken *token = g_malloc(sizeof(JSONToken) + tokstr->len + 1);
> @@ -549,27 +541,43 @@ JSONToken *json_token(JSONTokenType type, int x, int y, GString *tokstr)
> return token;
> }
>
> -QObject *json_parser_parse(GQueue *tokens, va_list *ap, Error **errp)
> +void json_parser_init(JSONParserContext *ctxt, va_list *ap)
> {
> - JSONParserContext ctxt = { .buf = tokens, .ap = ap };
> - QObject *result;
> - const JSONToken *token;
> + ctxt->err = NULL;
> + ctxt->stack = g_queue_new();
> + ctxt->ap = ap;
> +}
>
> - token = parser_context_pop_token(&ctxt);
> - if (token == NULL) {
> - parse_error(&ctxt, NULL, "premature EOI");
> - return NULL;
> +void json_parser_destroy(JSONParserContext *ctxt)
> +{
> + JSONParserStackEntry *entry;
> +
> + while ((entry = g_queue_pop_tail(ctxt->stack)) != NULL) {
> + qobject_unref(entry->partial);
> + g_free(entry);
> + }
> + g_queue_free(ctxt->stack);
> + ctxt->stack = NULL;
> +}
> +
> +QObject *json_parser_feed(JSONParserContext *ctxt, const JSONToken *token, Error **errp)
The return value isn't immediately obvious. A brief function contract
could help the reader.
> +{
> + QObject *result = NULL;
> +
> + assert(!ctxt->err);
> + switch (token->type) {
> + case JSON_END_OF_INPUT:
> + /* Check for premature end of input */
> + if (!g_queue_is_empty(ctxt->stack)) {
> + parse_error(ctxt, token, "premature end of input");
> + }
> + break;
> +
> + default:
> + result = json_parser_parse_token(ctxt, token);
> + break;
> }
>
> - result = parse_value(&ctxt, token);
> - assert(ctxt.err || g_queue_is_empty(ctxt.buf));
> -
> - error_propagate(errp, ctxt.err);
> -
> - while (!g_queue_is_empty(ctxt.buf)) {
> - parser_context_pop_token(&ctxt);
> - }
> - g_free(ctxt.current);
> -
> + error_propagate(errp, ctxt->err);
> return result;
> }
> diff --git a/qobject/json-streamer.c b/qobject/json-streamer.c
> index b93d97b995f..6c93e6fd78d 100644
> --- a/qobject/json-streamer.c
> +++ b/qobject/json-streamer.c
> @@ -32,6 +32,7 @@ void json_message_process_token(JSONLexer *lexer, GString *input,
> JSONTokenType type, int x, int y)
> {
> JSONMessageParser *parser = container_of(lexer, JSONMessageParser, lexer);
> + JSONParserContext ctxt;
> QObject *json = NULL;
> Error *err = NULL;
> JSONToken *token;
> @@ -56,8 +57,7 @@ void json_message_process_token(JSONLexer *lexer, GString *input,
> if (g_queue_is_empty(&parser->tokens)) {
> return;
> }
> - json = json_parser_parse(&parser->tokens, parser->ap, &err);
> - goto out_emit;
> + break;
Before the patch: flush the queue on end of input.
Afterwards: push a JSON_END_OF_INPUT token into the queue.
> default:
> break;
> }
> @@ -85,11 +85,24 @@ void json_message_process_token(JSONLexer *lexer, GString *input,
> g_queue_push_tail(&parser->tokens, token);
>
> if ((parser->brace_count > 0 || parser->bracket_count > 0)
> - && parser->brace_count >= 0 && parser->bracket_count >= 0) {
> + && parser->brace_count >= 0 && parser->bracket_count >= 0
> + && type != JSON_END_OF_INPUT) {
> return;
Before the patch: flush the queue when both counts are zero or at least
one is negative.
After the patch: additionally flush it on end of input.
I figure the two hunks together make the parser see a JSON_END_OF_INPUT
token at the end of input. Correct?
> }
>
> - json = json_parser_parse(&parser->tokens, parser->ap, &err);
> + json_parser_init(&ctxt, parser->ap);
> +
> + /* Process all tokens in the queue */
> + while (!g_queue_is_empty(&parser->tokens)) {
> + token = g_queue_pop_head(&parser->tokens);
> + json = json_parser_feed(&ctxt, token, &err);
> + g_free(token);
> + if (json || err) {
> + break;
> + }
> + }
> +
> + json_parser_destroy(&ctxt);
This is where we switch from feeding the parser balanced expressions to
feeding it tokens one by one.
When we break the loop early, parser->tokens is not empty here. Fine,
because...
>
> out_emit:
> parser->brace_count = 0;
parser->bracket_count = 0;
json_message_free_tokens(parser);
... we drain it here.
parser->token_size = 0;
parser->emit(parser->opaque, json, err);
}
Back to qobject/json-parser.c now.
^ permalink raw reply [flat|nested] 26+ messages in thread* Re: [PATCH 2/5] json-parser: replace with a push parser
2026-02-09 9:36 ` Markus Armbruster
@ 2026-02-09 10:53 ` Paolo Bonzini
2026-02-12 13:12 ` Markus Armbruster
0 siblings, 1 reply; 26+ messages in thread
From: Paolo Bonzini @ 2026-02-09 10:53 UTC (permalink / raw)
To: Markus Armbruster; +Cc: qemu-devel
On 2/9/26 10:36, Markus Armbruster wrote:
>> +typedef struct JSONParserContext {
>> + Error *err;
>> + GQueue *stack;
>
> This tells us we have a stack of something, but not what the something
> is. Further down, we'll see what the something is.
>
> The commit message explains what the stack means.
>
> Worth a comment here?
Yes, makes sense.
> We know the stack is at most 1024 deep (MAX_NESTING). Have you
> considered using an array instead of GQueue?
It'd be only 8K, but I preferred to keep MAX_NESTING hidden within
json-streamer.c. Not having bounds checking makes me a bit nervous
about having fixed-size arrays.
See more below about responsibility split between parser and streamer.
>> + va_list *ap;
>> +} JSONParserContext;
>> +
>
> Having to move struct definitions to headers is always a bit sad. Could
> this go into json-parser-int.h instead?
No because it's embedded in JSONMessageParser, just like JSONLexer. It
was previously hidden only because the parser was created and destroyed
after parentheses were balanced, but not doing that is kind of the whole
story here. :)
>> +/*
>> + * Objects: { } | { members }
>> + * - Empty: { -> AFTER_LCURLY -> }
>> + * - Non-empty: { -> AFTER_LCURLY -> BEFORE_KEY -> string -> END_OF_KEY -> : ->
>> + * BEFORE_VALUE -> value -> END_OF_VALUE -> , -> BEFORE_KEY -> ... -> }
>> + *
>> + * Arrays: [ ] | [ elements ]
>> + * - Empty: [ -> AFTER_LSQUARE -> ]
>> + * - Non-empty: [ -> AFTER_LSQUARE -> BEFORE_VALUE -> value -> END_OF_VALUE -> , ->
>> + * BEFORE_VALUE -> ... -> ]
>> + *
>> + * The two cases for END_OF_VALUE are distinguished based on the type of QObject at
>> + * top-of-stack.
>> + */
>
> I'm afraid I need a comment telling me how to read this comment.
That's basically the automaton that you request below (with epsilon
transitions), I'll expand on it.
>> +typedef enum JSONParserState {
>> + AFTER_LCURLY,
>> + AFTER_LSQUARE,
>> + BEFORE_KEY,
>> + BEFORE_VALUE,
>> + END_OF_KEY,
>> + END_OF_VALUE,
>> +} JSONParserState;
>> +
>> +typedef struct JSONParserStackEntry {
>> + /* A QString with the last parsed key, or a QList/QDict for the current container. */
>> + QObject *partial;
>> +
>> + /* Needed to distinguish whether the parser is waiting for a colon or comma. */
>> + JSONParserState state;
>
> If the comment was 100% accurate, a bool would do.
Hmm when I wrote I did think that a bool + the type of the top QObject
would do. But really there are three cases that the top QObject does
not distinguish:
- "after opening bracket, closed parenthesis or next element accepted",
- "after comma, next element required"
- "after element, closed parenthesis or comma accepted"
So, a bool would not be enough. Will fix.
>> +/* Note that entry->partial does *not* lose its reference count even if value == NULL. */
>> +static JSONParserStackEntry *pop_entry(JSONParserContext *ctxt, QObject **value)
>> +{
>> + JSONParserStackEntry *entry = g_queue_pop_tail(ctxt->stack);
>> + if (value) {
>> + *value = entry->partial;
>> + }
>> + g_free(entry);
>> + return current_entry(ctxt);
>> +}
>
> This is a rather peculiar function. If you disagree, try writing a
> contract for it :)
Hmm... yes, better pull the "value = entry->partial" to the callers even
if there are several of them. The code is overall simpler.
> The parser's interesting part follows. The parser is a pushdown
> automaton. The pushdown automaton is coded in C. On the one hand,
> d'oh! Of course it is. On the other hand, it's hard to see the actual
> automaton in C. May I have a comment explaining state and state
> transitions? Perhaps we better start with an informal description,
> discuss it, then distill the discussion into a comment.
That was the purpose of the mysterious comment above. If a mix between
BNF and automaton is okay for you, it could be something like
input := value -> END_OF_VALUE
END_OF_INPUT -> check stack is empty, return parsed value
// entered on BEFORE_VALUE; after any of these rules are processed, the
// parser has completed a QObject and is in the END_OF_VALUE state.
//
// When the parser reaches the END_OF_VALUE state, it examines the
// top of the stack to see if it's coming from "input" (stack empty),
// "array_items" (TOS is a QList) or "dict_pairs" (TOS is a QString; the
// item below will be a QDict). It then proceeds with the corresponding
// actions, which will be one of:
// - return parsed value
// - add value to QList
// - pop QString with the key, add key/value to the QDict
value := literal -> END_OF_VALUE
| '[' -> push empty QList -> AFTER_LSQUARE
after_lsquare -> END_OF_VALUE
| '{' -> push empty QDict -> AFTER_LCURLY
after_lcurly -> END_OF_VALUE
// non-recursive values, entered on BEFORE_VALUE
literal := INTEGER -> END_OF_VALUE
| FLOAT -> END_OF_VALUE
| KEYWORD -> END_OF_VALUE
| STRING -> END_OF_VALUE
| INTERP -> END_OF_VALUE
// entered on AFTER_LSQUARE
after_lsquare := ']' -> pop completed QList -> END_OF_VALUE
| ϵ -> BEFORE_VALUE
array_items -> END_OF_VALUE
// entered on BEFORE_VALUE, with TOS being a QList
array_items := value -> add value to QList -> END_OF_VALUE
(']' -> pop completed QList -> END_OF_VALUE
| ',' -> BEFORE_VALUE
array_items) -> END_OF_VALUE
// entered on AFTER_LCURLY
after_lcurly := '}' -> pop completed QDict -> END_OF_VALUE
| ϵ -> BEFORE_KEY
dict_pairs -> END_OF_VALUE
// entered on BEFORE_KEY, with TOS being a QDict
dict_pairs := STRING -> push QString -> END_OF_KEY
':' -> BEFORE_VALUE
value -> pop QString + add pair to QDict -> END_OF_VALUE
('}' -> pop completed QDict -> END_OF_VALUE
| ',' -> BEFORE_KEY
dict_pairs) -> END_OF_VALUE
>> +static QObject *json_parser_parse_token(JSONParserContext *ctxt, const JSONToken *token)
>
> Rename to parse_token()? Or inline into json_parser_feed()?
Renaming is good.
>> + if (!key) {
>> + return NULL;
>> + }
>
> Key to understand the switch is the meaning of "break" and "return
> NULL".
>
> We do the latter when we're done for this token.
>
> We do the former when this token completed a (sub-)value. @value holds
> it. If the stack is empty, @value is the result of the parse, and the
> code after the switch returns it. Else, @value is a sub-value, and the
> code after the switch stores it in the object being constructed.
Correct. Want me to include it somehow or is the grammar above fine?
>> +
>> + /* Store key in a special entry on the stack */
>> + push_entry(ctxt, QOBJECT(key), END_OF_KEY);
>> + } else {
>> + parse_error(ctxt, token, "expecting key");
>
> What's the state machine's parse error recovery story?
As simple as it can be:
assert(!ctxt->err);
because a push parser can actually refuse to deal with tokens after an
error, and make recovery someone else's problem.
In other words a push parser can actually be the pure theoretical thing
from your compilers and automata book, and all the engineering sits on
top. IMO this (at least for a JSON parser that you won't touch that
often) balances the complexity of the state machine.
So, error recovery is handled entirely in json-streamer.c. As of this
patch, json-streamer.c has gathered all tokens up to the next balanced
parentheses, and that's implicitly the place where the error recovery is
complete.
>> +QObject *json_parser_feed(JSONParserContext *ctxt, const JSONToken *token, Error **errp)
>
> The return value isn't immediately obvious. A brief function contract
> could help the reader.
Ok, will add.
> Before the patch: flush the queue when both counts are zero or at least
> one is negative.
>
> After the patch: additionally flush it on end of input.
>
> I figure the two hunks together make the parser see a JSON_END_OF_INPUT
> token at the end of input. Correct?
Yes, see the commit message. This is the only change I have made here
to the parser/streamer split, because it makes sense to have the
unbalanced parentheses error happen in the parser. Otherwise you're not
really implementing the whole grammar.
>> }
>>
>> - json = json_parser_parse(&parser->tokens, parser->ap, &err);
>> + json_parser_init(&ctxt, parser->ap);
>> +
>> + /* Process all tokens in the queue */
>> + while (!g_queue_is_empty(&parser->tokens)) {
>> + token = g_queue_pop_head(&parser->tokens);
>> + json = json_parser_feed(&ctxt, token, &err);
>> + g_free(token);
>> + if (json || err) {
>> + break;
>> + }
>> + }
>> +
>> + json_parser_destroy(&ctxt);
>
> This is where we switch from feeding the parser balanced expressions to
> feeding it tokens one by one.
>
> When we break the loop early, parser->tokens is not empty here. Fine,
> because...
>
>>
>> out_emit:
>> parser->brace_count = 0;
> parser->bracket_count = 0;
> json_message_free_tokens(parser);
>
> ... we drain it here.
>
> parser->token_size = 0;
> parser->emit(parser->opaque, json, err);
> }
Correct, and this reset code will survive in patch 3, sort of like this:
// feed tokens as they're passed to the streamer
if (!ctxt->err) {
json = json_parser_feed(...)
if (json || ctxt->err) {
parser->emit(parser->opaque, json, ctxt->err);
}
}
if (value || error recovery complete) {
// reset the streamer state
// reset the parser
}
Paolo
^ permalink raw reply [flat|nested] 26+ messages in thread* Re: [PATCH 2/5] json-parser: replace with a push parser
2026-02-09 10:53 ` Paolo Bonzini
@ 2026-02-12 13:12 ` Markus Armbruster
2026-02-16 16:41 ` Paolo Bonzini
0 siblings, 1 reply; 26+ messages in thread
From: Markus Armbruster @ 2026-02-12 13:12 UTC (permalink / raw)
To: Paolo Bonzini; +Cc: qemu-devel
Paolo Bonzini <pbonzini@redhat.com> writes:
> On 2/9/26 10:36, Markus Armbruster wrote:
>>> +typedef struct JSONParserContext {
>>> + Error *err;
>>> + GQueue *stack;
>>
>> This tells us we have a stack of something, but not what the something
>> is. Further down, we'll see what the something is.
>>
>> The commit message explains what the stack means.
>>
>> Worth a comment here?
>
> Yes, makes sense.
>
>> We know the stack is at most 1024 deep (MAX_NESTING). Have you
>> considered using an array instead of GQueue?
>
> It'd be only 8K, but I preferred to keep MAX_NESTING hidden within
> json-streamer.c. Not having bounds checking makes me a bit nervous
> about having fixed-size arrays.
Looks mostly harmless to me: there's exactly one spot where we push
something onto the stack, one where we pop, and one where we pop
everything. We only ever look at the top of the stack.
> See more below about responsibility split between parser and streamer.
>
>>> + va_list *ap;
>>> +} JSONParserContext;
>>> +
>>
>> Having to move struct definitions to headers is always a bit sad. Could
>> this go into json-parser-int.h instead?
>
> No because it's embedded in JSONMessageParser, just like JSONLexer. It
> was previously hidden only because the parser was created and destroyed
> after parentheses were balanced, but not doing that is kind of the whole
> story here. :)
I see.
To avoid putting private parser state in json-parser.h, we'd need to let
json_message_parser_init() allocate it dynamically. Reasonable option,
but let's stay on target here. We can always do it later.
[...]
>>> +typedef enum JSONParserState {
>>> + AFTER_LCURLY,
>>> + AFTER_LSQUARE,
>>> + BEFORE_KEY,
>>> + BEFORE_VALUE,
>>> + END_OF_KEY,
>>> + END_OF_VALUE,
>>> +} JSONParserState;
>>> +
>>> +typedef struct JSONParserStackEntry {
>>> + /* A QString with the last parsed key, or a QList/QDict for the current container. */
>>> + QObject *partial;
>>> +
>>> + /* Needed to distinguish whether the parser is waiting for a colon or comma. */
>>> + JSONParserState state;
>> If the comment was 100% accurate, a bool would do.
>
> Hmm when I wrote I did think that a bool + the type of the top QObject would do. But really there are three cases that the top QObject does not distinguish:
>
> - "after opening bracket, closed parenthesis or next element accepted",
>
> - "after comma, next element required"
>
> - "after element, closed parenthesis or comma accepted"
>
> So, a bool would not be enough. Will fix.
>
>>> +/* Note that entry->partial does *not* lose its reference count even if value == NULL. */
>>> +static JSONParserStackEntry *pop_entry(JSONParserContext *ctxt, QObject **value)
>>> +{
>>> + JSONParserStackEntry *entry = g_queue_pop_tail(ctxt->stack);
>>> + if (value) {
>>> + *value = entry->partial;
>>> + }
>>> + g_free(entry);
>>> + return current_entry(ctxt);
>>> +}
>>
>> This is a rather peculiar function. If you disagree, try writing a
>> contract for it :)
>
> Hmm... yes, better pull the "value = entry->partial" to the callers even
> if there are several of them. The code is overall simpler.
That's what I figured.
>> The parser's interesting part follows. The parser is a pushdown
>> automaton. The pushdown automaton is coded in C. On the one hand,
>> d'oh! Of course it is. On the other hand, it's hard to see the actual
>> automaton in C. May I have a comment explaining state and state
>> transitions? Perhaps we better start with an informal description,
>> discuss it, then distill the discussion into a comment.
>
> That was the purpose of the mysterious comment above. If a mix between
> BNF and automaton is okay for you, it could be something like
>
> input := value -> END_OF_VALUE
> END_OF_INPUT -> check stack is empty, return parsed value
>
> // entered on BEFORE_VALUE; after any of these rules are processed, the
> // parser has completed a QObject and is in the END_OF_VALUE state.
What do you mean by "after any of these rules are processed, ..."?
> //
> // When the parser reaches the END_OF_VALUE state, it examines the
> // top of the stack to see if it's coming from "input" (stack empty),
> // "array_items" (TOS is a QList) or "dict_pairs" (TOS is a QString; the
> // item below will be a QDict). It then proceeds with the corresponding
> // actions, which will be one of:
> // - return parsed value
> // - add value to QList
> // - pop QString with the key, add key/value to the QDict
Forward references to array_items and dict_pairs. Suggest to add a
suitable "below", or indicate the forward reference some other way.
>
> value := literal -> END_OF_VALUE
> | '[' -> push empty QList -> AFTER_LSQUARE
> after_lsquare -> END_OF_VALUE
> | '{' -> push empty QDict -> AFTER_LCURLY
> after_lcurly -> END_OF_VALUE
>
> // non-recursive values, entered on BEFORE_VALUE
> literal := INTEGER -> END_OF_VALUE
> | FLOAT -> END_OF_VALUE
> | KEYWORD -> END_OF_VALUE
> | STRING -> END_OF_VALUE
> | INTERP -> END_OF_VALUE
>
> // entered on AFTER_LSQUARE
> after_lsquare := ']' -> pop completed QList -> END_OF_VALUE
> | ϵ -> BEFORE_VALUE
> array_items -> END_OF_VALUE
>
> // entered on BEFORE_VALUE, with TOS being a QList
> array_items := value -> add value to QList -> END_OF_VALUE
> (']' -> pop completed QList -> END_OF_VALUE
> | ',' -> BEFORE_VALUE
> array_items) -> END_OF_VALUE
>
> // entered on AFTER_LCURLY
> after_lcurly := '}' -> pop completed QDict -> END_OF_VALUE
> | ϵ -> BEFORE_KEY
> dict_pairs -> END_OF_VALUE
>
> // entered on BEFORE_KEY, with TOS being a QDict
> dict_pairs := STRING -> push QString -> END_OF_KEY
> ':' -> BEFORE_VALUE
> value -> pop QString + add pair to QDict -> END_OF_VALUE
> ('}' -> pop completed QDict -> END_OF_VALUE
> | ',' -> BEFORE_KEY
> dict_pairs) -> END_OF_VALUE
The BNF part is everything but the -> ... Easy enough. However, I'm
not quite certain how to interpret the --> ...
The "// entered on STATE ..." I can read as assertion.
The text after "->" seems to be one ore more state / stack transitions.
Let me describe them in my own words to make sure I understand you.
Note that I had to also look at the source code to produce my
description.
-> STATE
Set top stack entry's state to STATE.
-> push OBJ -> STATE
Push (STATE, OBJ) onto the stack.
-> pop completed OBJ -> END_OF_VALUE
Pop (_, OBJ) from the stack, set new top stack entry's state to
END_OF_VALUE. OBJ is the result of parsing non-terminal @value.
-> add value to QList -> END_OF_VALUE
Append the result of parsing non-terminal @value to the top stack
entry's QList. Set the top stack entry's state to END_OF_VALUE.
-> pop QString + add pair to QDict -> END_OF_VALUE
Pop (_, QString) from the stack, it's the key. Its value is the
result of parsing non-terminal value. Store this key: value pair in
the top stack entry's QDict. Set the top stack entry's state to
END_OF_VALUE.
Unsaid, but reasonably obvious: the result of parsing non-terminal
@value when it's not an array or object.
The structure of the stack isn't explicitly explained, but can be
extracted from the above.
Stack entry members @partial from bottom to top:
* a QList element for an unclosed '[',
* a QDict and a QString element for an unclosed '{', except the
innermost one need not have a QString. This is because the QString is
on the stack only while we're parsing ':' value.
So, stack contents invariant:
( QList | QDict QString )* QDict?
I found the QString business somewhat confusing, probably because I
expected one stack entry per nesting level, and tossing that invalid
assumption took mental effort.
I figure we could have one stack entry per: add a const char *key member
to JSONParserStackEntry, non-null exactly when we have a QString sitting
on a QDict now. We'd trade representable invalid state "QString
directly above non-QDict in the stack" for "QList stack entry has member
@key set". Would be slightly more local. Matter of taste, I guess.
More representable invalid state, as far as I can tell (observation, not
objection):
* a QList stack entry can only have state AFTER_LSQUARE, BEFORE_VALUE,
END_OF_VALUE, never AFTER_LCURLY, BEFORE_KEY, END_OF_KEY
* a QDict stack entry can only have state AFTER_LCURLY, BEFORE_KEY,
END_OF_VALUE, never AFTER_LSQUARE, BEFORE_VALUE, END_OF_KEY
* a QString stack entry can only have state BEFORE_VALUE and END_OF_KEY,
never AFTER_LCURLY, AFTER_LSQUARE, BEFORE_VALUE, END_OF_VALUE.
Stack entries by @state instead of by @partial:
* AFTER_LCURLY: @partial is QDict
* AFTER_LSQUARE: @partial is QList
* BEFORE_KEY: @partial is QDict
* BEFORE_VALUE: @partial is QList or QString
* END_OF_KEY: @partial is QString
* END_OF_VALUE: @partial is QList or QString
Invariants worth documenting, I think.
Here's my attempt at explaining the automaton and how states and stack
fit together: annotated JSON syntax diagrams. The diagrams are from
www.json.org with whitespace nonterminal omitted for brevity.
Semantic actions are connected with dashed lines. State names are
abbreviated to ALC, ALS, BK, BV, EOK, EOV. (STATE, QTYPE) describes the
top stack entry. "assert (STATE, QTYPE)" asserts the top stack entry's
@state is STATE, and its @partial is of QTYPE. "set STATE" means the
top stack entry's state is set to STATE.
set EOV
result is QObject
value ┆
>────────╮─── string ──╭───>
┆ │ │
assert (BV, _) ╰─── number ──╯
│ │
╰─── object ──╯
│ │
╰─── array ───╯
│ │
╰─── true ────╯
│ │
╰─── false ───╯
│ │
╰─── null ────╯
push (ALS, QList) pop (ALS, QList)
┆ assert (BV, _)
┆ set EOV
┆ result is popped QList
array ┆ ┆
>─────── [ ─╮────────────╭─ ] ──>
┆ │ │
assert (BV, _) │ ┄ set BV │
│ │
set BV │ │
┆ │ │
╭─ , ────╰── value ──╮╯
│ │ ┄ assert (EOV, QList)
╰────────────────────╯ put value into QList
push (ALC, QDict) pop (ALC, QDict)
┆ assert (BV, _)
┆ set EOV
┆ result is popped QDict
object ┆ ┆
>─────── { ─╮───────────────────────────╭─ } ──>
┆ │ │
assert (BV, _) │ ┄ set BK │
│ │
set BK │ push (EOK, QString) │
┆ │ ┆ │
╭─ , ────╰─ string ─── : ─── value ─╮╯
│ ┆ │ pop (BV, QString)
│ set BV │ ┄ assert (EOV, QDict)
│ │ put string: value
╰───────────────────────────────────╯ into QDict
Might be your turn to be confused ;)
>>> +static QObject *json_parser_parse_token(JSONParserContext *ctxt, const JSONToken *token)
>>
>> Rename to parse_token()? Or inline into json_parser_feed()?
>
> Renaming is good.
>
>>> + if (!key) {
>>> + return NULL;
>>> + }
>>
>> Key to understand the switch is the meaning of "break" and "return
>> NULL".
>>
>> We do the latter when we're done for this token.
>>
>> We do the former when this token completed a (sub-)value. @value holds
>> it. If the stack is empty, @value is the result of the parse, and the
>> code after the switch returns it. Else, @value is a sub-value, and the
>> code after the switch stores it in the object being constructed.
>
> Correct. Want me to include it somehow or is the grammar above fine?
Whether the grammar makes the meaning of "break" and "return NULL"
obvious enough is hard to tell now: I know too much. Should fix itself
within a few days ;)
>>> +
>>> + /* Store key in a special entry on the stack */
>>> + push_entry(ctxt, QOBJECT(key), END_OF_KEY);
>>> + } else {
>>> + parse_error(ctxt, token, "expecting key");
>>
>> What's the state machine's parse error recovery story?
>
> As simple as it can be:
>
> assert(!ctxt->err);
>
> because a push parser can actually refuse to deal with tokens after an
> error, and make recovery someone else's problem.
>
> In other words a push parser can actually be the pure theoretical thing
> from your compilers and automata book, and all the engineering sits on
> top. IMO this (at least for a JSON parser that you won't touch that
> often) balances the complexity of the state machine.
>
> So, error recovery is handled entirely in json-streamer.c. As of this
> patch, json-streamer.c has gathered all tokens up to the next balanced
> parentheses, and that's implicitly the place where the error recovery is
> complete.
This became clear to me once I had processed the remainder of the
series.
Peeling error recovery off the pushdown automaton that way indeed
simplifies the automaton. Neat! Possible because JSON's simple syntax
lets us get away with simple parenthesis counting error recovery.
>>> +QObject *json_parser_feed(JSONParserContext *ctxt, const JSONToken *token, Error **errp)
>>
>> The return value isn't immediately obvious. A brief function contract
>> could help the reader.
>
> Ok, will add.
>
>> Before the patch: flush the queue when both counts are zero or at least
>> one is negative.
>>
>> After the patch: additionally flush it on end of input.
>>
>> I figure the two hunks together make the parser see a JSON_END_OF_INPUT
>> token at the end of input. Correct?
>
> Yes, see the commit message. This is the only change I have made here
> to the parser/streamer split, because it makes sense to have the
> unbalanced parentheses error happen in the parser. Otherwise you're not
> really implementing the whole grammar.
Yes.
At the end of the series, the only error checks in json-streamer.c are
about limits. These aren't part of the syntax. Everything that is part
of the syntax is checked in json-parser.c, except for lexical errors,
which are diagnosed in json-lexer.c, and reported in json-parser.c.
Perfectly sensible.
[...]
^ permalink raw reply [flat|nested] 26+ messages in thread* Re: [PATCH 2/5] json-parser: replace with a push parser
2026-02-12 13:12 ` Markus Armbruster
@ 2026-02-16 16:41 ` Paolo Bonzini
2026-02-17 7:39 ` Markus Armbruster
0 siblings, 1 reply; 26+ messages in thread
From: Paolo Bonzini @ 2026-02-16 16:41 UTC (permalink / raw)
To: Markus Armbruster; +Cc: qemu-devel
On 2/12/26 14:12, Markus Armbruster wrote:
>>> We know the stack is at most 1024 deep (MAX_NESTING). Have you
>>> considered using an array instead of GQueue?
>>
>> It'd be only 8K, but I preferred to keep MAX_NESTING hidden within
>> json-streamer.c. Not having bounds checking makes me a bit nervous
>> about having fixed-size arrays.
>
> Looks mostly harmless to me: there's exactly one spot where we push
> something onto the stack, one where we pop, and one where we pop
> everything. We only ever look at the top of the stack.
Yes, code-wise it's not hard. But it's ugly/brittle to have two pieces
of code that have to sync on MAX_NESTING.
>>> The parser's interesting part follows. The parser is a pushdown
>>> automaton. The pushdown automaton is coded in C. On the one hand,
>>> d'oh! Of course it is. On the other hand, it's hard to see the actual
>>> automaton in C. May I have a comment explaining state and state
>>> transitions? Perhaps we better start with an informal description,
>>> discuss it, then distill the discussion into a comment.
>>
>> That was the purpose of the mysterious comment above. If a mix between
>> BNF and automaton is okay for you, it could be something like
>>
>> input := value -> END_OF_VALUE
>> END_OF_INPUT -> check stack is empty, return parsed value
>>
>> // entered on BEFORE_VALUE; after any of these rules are processed, the
>> // parser has completed a QObject and is in the END_OF_VALUE state.
>
> What do you mean by "after any of these rules are processed, ..."?
That "literal", "'[' after_lsquare" and "'{' after_lcurly" both end up
with the state machine in the END_OF_VALUE state.
>> //
>> // When the parser reaches the END_OF_VALUE state, it examines the
>> // top of the stack to see if it's coming from "input" (stack empty),
>> // "array_items" (TOS is a QList) or "dict_pairs" (TOS is a QString; the
>> // item below will be a QDict). It then proceeds with the corresponding
>> // actions, which will be one of:
>> // - return parsed value
>> // - add value to QList
>> // - pop QString with the key, add key/value to the QDict
>
> Forward references to array_items and dict_pairs. Suggest to add a
> suitable "below", or indicate the forward reference some other way.
I can put instead after_lsquare and after_lcurly.
>> value := literal -> END_OF_VALUE
>> | '[' -> push empty QList -> AFTER_LSQUARE
>> after_lsquare -> END_OF_VALUE
>> | '{' -> push empty QDict -> AFTER_LCURLY
>> after_lcurly -> END_OF_VALUE
>>
>> // non-recursive values, entered on BEFORE_VALUE
>> literal := INTEGER -> END_OF_VALUE
>> | FLOAT -> END_OF_VALUE
>> | KEYWORD -> END_OF_VALUE
>> | STRING -> END_OF_VALUE
>> | INTERP -> END_OF_VALUE
>>
>> // entered on AFTER_LSQUARE
>> after_lsquare := ']' -> pop completed QList -> END_OF_VALUE
>> | ϵ -> BEFORE_VALUE
>> array_items -> END_OF_VALUE
>>
>> // entered on BEFORE_VALUE, with TOS being a QList
>> array_items := value -> add value to QList -> END_OF_VALUE
>> (']' -> pop completed QList -> END_OF_VALUE
>> | ',' -> BEFORE_VALUE
>> array_items) -> END_OF_VALUE
>>
>> // entered on AFTER_LCURLY
>> after_lcurly := '}' -> pop completed QDict -> END_OF_VALUE
>> | ϵ -> BEFORE_KEY
>> dict_pairs -> END_OF_VALUE
>>
>> // entered on BEFORE_KEY, with TOS being a QDict
>> dict_pairs := STRING -> push QString -> END_OF_KEY
>> ':' -> BEFORE_VALUE
>> value -> pop QString + add pair to QDict -> END_OF_VALUE
>> ('}' -> pop completed QDict -> END_OF_VALUE
>> | ',' -> BEFORE_KEY
>> dict_pairs) -> END_OF_VALUE
>
> The BNF part is everything but the -> ... Easy enough. However, I'm
> not quite certain how to interpret the --> ...
>
> The "// entered on STATE ..." I can read as assertion.
Yes, that's a comment (with C++ syntax to have something familiar).
> The text after "->" seems to be one ore more state / stack transitions.
> Let me describe them in my own words to make sure I understand you.
> Note that I had to also look at the source code to produce my
> description.
>
> -> STATE
>
> Set top stack entry's state to STATE.
>
> -> push OBJ -> STATE
>
> Push (STATE, OBJ) onto the stack.
Intended more like push (OBJ, undefined), set top stack entry state to
STATE, but yes.
> -> pop completed OBJ -> END_OF_VALUE
>
> Pop (_, OBJ) from the stack, set new top stack entry's state to
> END_OF_VALUE. OBJ is the result of parsing non-terminal @value.
>
> -> add value to QList -> END_OF_VALUE
>
> Append the result of parsing non-terminal @value to the top stack
> entry's QList. Set the top stack entry's state to END_OF_VALUE.
>
> -> pop QString + add pair to QDict -> END_OF_VALUE
>
> Pop (_, QString) from the stack, it's the key. Its value is the
> result of parsing non-terminal value. Store this key: value pair in
> the top stack entry's QDict. Set the top stack entry's state to
> END_OF_VALUE.
Correct. "-> STATE" generally means "go to state STATE", where teh state
is stored in the top stack entry.
> Stack entry members @partial from bottom to top:
>
> * a QList element for an unclosed '[',
>
> * a QDict and a QString element for an unclosed '{', except the
> innermost one need not have a QString. This is because the QString is
> on the stack only while we're parsing ':' value.
>
> So, stack contents invariant:
>
> ( QList | QDict QString )* QDict?
Makes sense.
> I found the QString business somewhat confusing, probably because I
> expected one stack entry per nesting level, and tossing that invalid
> assumption took mental effort.
... while I wrote the parser as one per pending nonterminal. Coming
from the recursive descent probably suggested that approach, somewhat
subconsciously.
> I figure we could have one stack entry per: add a const char *key member
> to JSONParserStackEntry, non-null exactly when we have a QString sitting
> on a QDict now. We'd trade representable invalid state "QString
> directly above non-QDict in the stack" for "QList stack entry has member
> @key set". Would be slightly more local. Matter of taste, I guess.
I can check what the code looks like, sure.
> More representable invalid state, as far as I can tell (observation, not
> objection):
>
> * a QList stack entry can only have state AFTER_LSQUARE, BEFORE_VALUE,
> END_OF_VALUE, never AFTER_LCURLY, BEFORE_KEY, END_OF_KEY
>
> * a QDict stack entry can only have state AFTER_LCURLY, BEFORE_KEY,
> END_OF_VALUE, never AFTER_LSQUARE, BEFORE_VALUE, END_OF_KEY
>
> * a QString stack entry can only have state BEFORE_VALUE and END_OF_KEY,
> never AFTER_LCURLY, AFTER_LSQUARE, BEFORE_VALUE, END_OF_VALUE.
>
> Stack entries by @state instead of by @partial:
> * AFTER_LCURLY: @partial is QDict
> * AFTER_LSQUARE: @partial is QList
> * BEFORE_KEY: @partial is QDict
> * BEFORE_VALUE: @partial is QList or QString
... or empty:
state = entry ? entry->state : BEFORE_VALUE;
> * END_OF_KEY: @partial is QString
> * END_OF_VALUE: @partial is QList or QString
>
> Invariants worth documenting, I think.
Ok---as assertions in the code?
> Here's my attempt at explaining the automaton and how states and stack
> fit together: annotated JSON syntax diagrams. The diagrams are from
> www.json.org with whitespace nonterminal omitted for brevity.
>
> Semantic actions are connected with dashed lines. State names are
> abbreviated to ALC, ALS, BK, BV, EOK, EOV. (STATE, QTYPE) describes the
> top stack entry. "assert (STATE, QTYPE)" asserts the top stack entry's
> @state is STATE, and its @partial is of QTYPE. "set STATE" means the
> top stack entry's state is set to STATE.
>
> set EOV
> result is QObject
> value ┆
> >────────╮─── string ──╭───>
> ┆ │ │
> assert (BV, _) ╰─── number ──╯
> │ │
> ╰─── object ──╯
> │ │
> ╰─── array ───╯
> │ │
> ╰─── true ────╯
> │ │
> ╰─── false ───╯
> │ │
> ╰─── null ────╯
>
>
> push (ALS, QList) pop (ALS, QList)
> ┆ assert (BV, _)
> ┆ set EOV
> ┆ result is popped QList
> array ┆ ┆
> >─────── [ ─╮────────────╭─ ] ──>
> ┆ │ │
> assert (BV, _) │ ┄ set BV │
> │ │
> set BV │ │
> ┆ │ │
> ╭─ , ────╰── value ──╮╯
> │ │ ┄ assert (EOV, QList)
> ╰────────────────────╯ put value into QList
>
>
>
> push (ALC, QDict) pop (ALC, QDict)
> ┆ assert (BV, _)
> ┆ set EOV
> ┆ result is popped QDict
> object ┆ ┆
> >─────── { ─╮───────────────────────────╭─ } ──>
> ┆ │ │
> assert (BV, _) │ ┄ set BK │
> │ │
> set BK │ push (EOK, QString) │
> ┆ │ ┆ │
> ╭─ , ────╰─ string ─── : ─── value ─╮╯
> │ ┆ │ pop (BV, QString)
> │ set BV │ ┄ assert (EOV, QDict)
> │ │ put string: value
> ╰───────────────────────────────────╯ into QDict
>
> Might be your turn to be confused ;)
I'm not confused but maybe it is a bit too detailed.
>>>> +static QObject *json_parser_parse_token(JSONParserContext *ctxt, const JSONToken *token)
>>>
>>> Rename to parse_token()? Or inline into json_parser_feed()?
>>
>> Renaming is good.
>>
>>>> + if (!key) {
>>>> + return NULL;
>>>> + }
>>>
>>> Key to understand the switch is the meaning of "break" and "return
>>> NULL".
>>>
>>> We do the latter when we're done for this token.
>>>
>>> We do the former when this token completed a (sub-)value. @value holds
>>> it. If the stack is empty, @value is the result of the parse, and the
>>> code after the switch returns it. Else, @value is a sub-value, and the
>>> code after the switch stores it in the object being constructed.
>>
>> Correct. Want me to include it somehow or is the grammar above fine?
>
> Whether the grammar makes the meaning of "break" and "return NULL"
> obvious enough is hard to tell now: I know too much. Should fix itself
> within a few days ;)
Probably not. :) I'll see if I can move the post-pr
Paolo
^ permalink raw reply [flat|nested] 26+ messages in thread* Re: [PATCH 2/5] json-parser: replace with a push parser
2026-02-16 16:41 ` Paolo Bonzini
@ 2026-02-17 7:39 ` Markus Armbruster
0 siblings, 0 replies; 26+ messages in thread
From: Markus Armbruster @ 2026-02-17 7:39 UTC (permalink / raw)
To: Paolo Bonzini; +Cc: qemu-devel
Paolo Bonzini <pbonzini@redhat.com> writes:
> On 2/12/26 14:12, Markus Armbruster wrote:
>>>> We know the stack is at most 1024 deep (MAX_NESTING). Have you
>>>> considered using an array instead of GQueue?
>>>
>>> It'd be only 8K, but I preferred to keep MAX_NESTING hidden within
>>> json-streamer.c. Not having bounds checking makes me a bit nervous
>>> about having fixed-size arrays.
>>
>> Looks mostly harmless to me: there's exactly one spot where we push
>> something onto the stack, one where we pop, and one where we pop
>> everything. We only ever look at the top of the stack.
>
> Yes, code-wise it's not hard. But it's ugly/brittle to have two pieces
> of code that have to sync on MAX_NESTING.
>
>>>> The parser's interesting part follows. The parser is a pushdown
>>>> automaton. The pushdown automaton is coded in C. On the one hand,
>>>> d'oh! Of course it is. On the other hand, it's hard to see the actual
>>>> automaton in C. May I have a comment explaining state and state
>>>> transitions? Perhaps we better start with an informal description,
>>>> discuss it, then distill the discussion into a comment.
>>>
>>> That was the purpose of the mysterious comment above. If a mix between
>>> BNF and automaton is okay for you, it could be something like
>>>
>>> input := value -> END_OF_VALUE
>>> END_OF_INPUT -> check stack is empty, return parsed value
>>>
>>> // entered on BEFORE_VALUE; after any of these rules are processed, the
>>> // parser has completed a QObject and is in the END_OF_VALUE state.
>>
>> What do you mean by "after any of these rules are processed, ..."?
>
> That "literal", "'[' after_lsquare" and "'{' after_lcurly" both end up
> with the state machine in the END_OF_VALUE state.
So this is a "how to read the annotated BNF" comment?
>>> //
>>> // When the parser reaches the END_OF_VALUE state, it examines the
>>> // top of the stack to see if it's coming from "input" (stack empty),
>>> // "array_items" (TOS is a QList) or "dict_pairs" (TOS is a QString; the
>>> // item below will be a QDict). It then proceeds with the corresponding
>>> // actions, which will be one of:
>>> // - return parsed value
>>> // - add value to QList
>>> // - pop QString with the key, add key/value to the QDict
>>
>> Forward references to array_items and dict_pairs. Suggest to add a
>> suitable "below", or indicate the forward reference some other way.
>
> I can put instead after_lsquare and after_lcurly.
BNF usually reads easiest when written top down, i.e. start symbol
production first, other non-terminals' productions after their use.
I don't mind the forward references in the comment here, it just took me
a second to recognize them. No real problem, but perhaps adding "above"
/ "below" could help the reader.
>>> value := literal -> END_OF_VALUE
>>> | '[' -> push empty QList -> AFTER_LSQUARE
>>> after_lsquare -> END_OF_VALUE
>>> | '{' -> push empty QDict -> AFTER_LCURLY
>>> after_lcurly -> END_OF_VALUE
>>>
>>> // non-recursive values, entered on BEFORE_VALUE
>>> literal := INTEGER -> END_OF_VALUE
>>> | FLOAT -> END_OF_VALUE
>>> | KEYWORD -> END_OF_VALUE
>>> | STRING -> END_OF_VALUE
>>> | INTERP -> END_OF_VALUE
>>>
>>> // entered on AFTER_LSQUARE
>>> after_lsquare := ']' -> pop completed QList -> END_OF_VALUE
>>> | ϵ -> BEFORE_VALUE
>>> array_items -> END_OF_VALUE
>>>
>>> // entered on BEFORE_VALUE, with TOS being a QList
>>> array_items := value -> add value to QList -> END_OF_VALUE
>>> (']' -> pop completed QList -> END_OF_VALUE
>>> | ',' -> BEFORE_VALUE
>>> array_items) -> END_OF_VALUE
>>>
>>> // entered on AFTER_LCURLY
>>> after_lcurly := '}' -> pop completed QDict -> END_OF_VALUE
>>> | ϵ -> BEFORE_KEY
>>> dict_pairs -> END_OF_VALUE
>>>
>>> // entered on BEFORE_KEY, with TOS being a QDict
>>> dict_pairs := STRING -> push QString -> END_OF_KEY
>>> ':' -> BEFORE_VALUE
>>> value -> pop QString + add pair to QDict -> END_OF_VALUE
>>> ('}' -> pop completed QDict -> END_OF_VALUE
>>> | ',' -> BEFORE_KEY
>>> dict_pairs) -> END_OF_VALUE
>>
>> The BNF part is everything but the -> ... Easy enough. However, I'm
>> not quite certain how to interpret the --> ...
>>
>> The "// entered on STATE ..." I can read as assertion.
>
> Yes, that's a comment (with C++ syntax to have something familiar).
>
>> The text after "->" seems to be one ore more state / stack transitions.
>> Let me describe them in my own words to make sure I understand you.
>> Note that I had to also look at the source code to produce my
>> description.
>>
>> -> STATE
>>
>> Set top stack entry's state to STATE.
>>
>> -> push OBJ -> STATE
>>
>> Push (STATE, OBJ) onto the stack.
>
> Intended more like push (OBJ, undefined), set top stack entry state to
> STATE, but yes.
>
>> -> pop completed OBJ -> END_OF_VALUE
>>
>> Pop (_, OBJ) from the stack, set new top stack entry's state to
>> END_OF_VALUE. OBJ is the result of parsing non-terminal @value.
>>
>> -> add value to QList -> END_OF_VALUE
>>
>> Append the result of parsing non-terminal @value to the top stack
>> entry's QList. Set the top stack entry's state to END_OF_VALUE.
>>
>> -> pop QString + add pair to QDict -> END_OF_VALUE
>>
>> Pop (_, QString) from the stack, it's the key. Its value is the
>> result of parsing non-terminal value. Store this key: value pair in
>> the top stack entry's QDict. Set the top stack entry's state to
>> END_OF_VALUE.
>
> Correct. "-> STATE" generally means "go to state STATE", where teh state
> is stored in the top stack entry.
So I was able to work out how to read the annotations. How can we make
that easier for the next guy? Could well be me in six months...
>> Stack entry members @partial from bottom to top:
>>
>> * a QList element for an unclosed '[',
>>
>> * a QDict and a QString element for an unclosed '{', except the
>> innermost one need not have a QString. This is because the QString is
>> on the stack only while we're parsing ':' value.
>>
>> So, stack contents invariant:
>>
>> ( QList | QDict QString )* QDict?
>
> Makes sense.
Let's write it down where we describe the structure of the stack.
>> I found the QString business somewhat confusing, probably because I
>> expected one stack entry per nesting level, and tossing that invalid
>> assumption took mental effort.
>
> ... while I wrote the parser as one per pending nonterminal. Coming
> from the recursive descent probably suggested that approach, somewhat
> subconsciously.
>
>> I figure we could have one stack entry per: add a const char *key member
>> to JSONParserStackEntry, non-null exactly when we have a QString sitting
>> on a QDict now. We'd trade representable invalid state "QString
>> directly above non-QDict in the stack" for "QList stack entry has member
>> @key set". Would be slightly more local. Matter of taste, I guess.
>
> I can check what the code looks like, sure.
>
>> More representable invalid state, as far as I can tell (observation, not
>> objection):
>>
>> * a QList stack entry can only have state AFTER_LSQUARE, BEFORE_VALUE,
>> END_OF_VALUE, never AFTER_LCURLY, BEFORE_KEY, END_OF_KEY
>>
>> * a QDict stack entry can only have state AFTER_LCURLY, BEFORE_KEY,
>> END_OF_VALUE, never AFTER_LSQUARE, BEFORE_VALUE, END_OF_KEY
>>
>> * a QString stack entry can only have state BEFORE_VALUE and END_OF_KEY,
>> never AFTER_LCURLY, AFTER_LSQUARE, BEFORE_VALUE, END_OF_VALUE.
>>
>> Stack entries by @state instead of by @partial:
>>
>> * AFTER_LCURLY: @partial is QDict
>>
>> * AFTER_LSQUARE: @partial is QList
>>
>> * BEFORE_KEY: @partial is QDict
>>
>> * BEFORE_VALUE: @partial is QList or QString
>
> ... or empty:
>
> state = entry ? entry->state : BEFORE_VALUE;
Yes, forgot that one.
>> * END_OF_KEY: @partial is QString
>>
>> * END_OF_VALUE: @partial is QList or QString
>>
>> Invariants worth documenting, I think.
>
> Ok---as assertions in the code?
Making invalid states unrepresentable is best, but that's often not
practical in C.
I generally prefer assertions to comments as long as the assertions are
easy enough to read.
>> Here's my attempt at explaining the automaton and how states and stack
>> fit together: annotated JSON syntax diagrams. The diagrams are from
>> www.json.org with whitespace nonterminal omitted for brevity.
>>
>> Semantic actions are connected with dashed lines. State names are
>> abbreviated to ALC, ALS, BK, BV, EOK, EOV. (STATE, QTYPE) describes the
>> top stack entry. "assert (STATE, QTYPE)" asserts the top stack entry's
>> @state is STATE, and its @partial is of QTYPE. "set STATE" means the
>> top stack entry's state is set to STATE.
>>
>> set EOV
>> result is QObject
>> value ┆
>> >────────╮─── string ──╭───>
>> ┆ │ │
>> assert (BV, _) ╰─── number ──╯
>> │ │
>> ╰─── object ──╯
>> │ │
>> ╰─── array ───╯
>> │ │
>> ╰─── true ────╯
>> │ │
>> ╰─── false ───╯
>> │ │
>> ╰─── null ────╯
>>
>>
>> push (ALS, QList) pop (ALS, QList)
>> ┆ assert (BV, _)
>> ┆ set EOV
>> ┆ result is popped QList
>> array ┆ ┆
>> >─────── [ ─╮────────────╭─ ] ──>
>> ┆ │ │
>> assert (BV, _) │ ┄ set BV │
>> │ │
>> set BV │ │
>> ┆ │ │
>> ╭─ , ────╰── value ──╮╯
>> │ │ ┄ assert (EOV, QList)
>> ╰────────────────────╯ put value into QList
>>
>>
>>
>> push (ALC, QDict) pop (ALC, QDict)
>> ┆ assert (BV, _)
>> ┆ set EOV
>> ┆ result is popped QDict
>> object ┆ ┆
>> >─────── { ─╮───────────────────────────╭─ } ──>
>> ┆ │ │
>> assert (BV, _) │ ┄ set BK │
>> │ │
>> set BK │ push (EOK, QString) │
>> ┆ │ ┆ │
>> ╭─ , ────╰─ string ─── : ─── value ─╮╯
>> │ ┆ │ pop (BV, QString)
>> │ set BV │ ┄ assert (EOV, QDict)
>> │ │ put string: value
>> ╰───────────────────────────────────╯ into QDict
>>
>> Might be your turn to be confused ;)
>
> I'm not confused but maybe it is a bit too detailed.
We can elide detail. For instance, if we document the stack structure
elsewhere, assertions on stack contents are low value here.
>>>>> +static QObject *json_parser_parse_token(JSONParserContext *ctxt, const JSONToken *token)
>>>>
>>>> Rename to parse_token()? Or inline into json_parser_feed()?
>>>
>>> Renaming is good.
>>>
>>>>> + if (!key) {
>>>>> + return NULL;
>>>>> + }
>>>>
>>>> Key to understand the switch is the meaning of "break" and "return
>>>> NULL".
>>>>
>>>> We do the latter when we're done for this token.
>>>>
>>>> We do the former when this token completed a (sub-)value. @value holds
>>>> it. If the stack is empty, @value is the result of the parse, and the
>>>> code after the switch returns it. Else, @value is a sub-value, and the
>>>> code after the switch stores it in the object being constructed.
>>>
>>> Correct. Want me to include it somehow or is the grammar above fine?
>>
>> Whether the grammar makes the meaning of "break" and "return NULL"
>> obvious enough is hard to tell now: I know too much. Should fix itself
>> within a few days ;)
>
> Probably not. :) I'll see if I can move the post-pr
I can offer a pair of fresher eyes on v2.
^ permalink raw reply [flat|nested] 26+ messages in thread
* [PATCH 3/5] json-streamer: remove token queue
2026-01-07 8:48 [PATCH 0/5] qobject: switch JSON parser to push Paolo Bonzini
2026-01-07 8:48 ` [PATCH 1/5] json-parser: pass around lookahead token, constify Paolo Bonzini
2026-01-07 8:48 ` [PATCH 2/5] json-parser: replace with a push parser Paolo Bonzini
@ 2026-01-07 8:48 ` Paolo Bonzini
2026-02-10 7:58 ` Markus Armbruster
2026-01-07 8:48 ` [PATCH 4/5] json-streamer: do not heap-allocate JSONToken Paolo Bonzini
` (3 subsequent siblings)
6 siblings, 1 reply; 26+ messages in thread
From: Paolo Bonzini @ 2026-01-07 8:48 UTC (permalink / raw)
To: qemu-devel; +Cc: armbru
Now fully exploit the push parser, feeding it one token at a time
without having to wait until braces and brackets are balanced.
While the nesting counts are retained for error recovery purposes,
the system can now report the first parsing error without waiting
for delimiters to be balanced. This also means that JSON_ERROR
can be handled in json-parser.c, not json-streamer.c.
After reporting the error, json-streamer.c then enters an error recovery
mode where subsequent errors are suppressed. This mimics the previous
error reporting behavior, but it provides prompt feedback on parsing
errors. As an example, here is an example interaction with qemu-ga.
BEFORE (error reported only once braces are balanced):
>> {'execute':foo
>> }
<< {"error": {"class": "GenericError", "desc": "JSON parse error, invalid keyword 'foo'"}}
>> {'execute':'somecommand'}
<< {"error": {"class": "CommandNotFound", "desc": "The command somecommand has not been found"}}
AFTER (error reported immediately, but similar error recovery as before):
>> {'execute':foo
<< {"error": {"class": "GenericError", "desc": "JSON parse error, invalid keyword 'foo'"}}
>> }
>> {'execute':'qmp_capabilities'}
<< {"error": {"class": "CommandNotFound", "desc": "The command qmp_capabilities has not been found"}}
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
include/qobject/json-parser.h | 5 +-
qobject/json-parser-int.h | 1 +
qobject/json-parser.c | 26 +++++---
qobject/json-streamer.c | 116 ++++++++++++++--------------------
4 files changed, 68 insertions(+), 80 deletions(-)
diff --git a/include/qobject/json-parser.h b/include/qobject/json-parser.h
index 05346fa816b..923eb74ca00 100644
--- a/include/qobject/json-parser.h
+++ b/include/qobject/json-parser.h
@@ -29,11 +29,12 @@ typedef struct JSONParserContext {
typedef struct JSONMessageParser {
void (*emit)(void *opaque, QObject *json, Error *err);
void *opaque;
- va_list *ap;
JSONLexer lexer;
+ JSONParserContext parser;
int brace_count;
int bracket_count;
- GQueue tokens;
+ int token_count;
+ bool error;
uint64_t token_size;
} JSONMessageParser;
diff --git a/qobject/json-parser-int.h b/qobject/json-parser-int.h
index 05e2e8e1831..1f435cb8eb2 100644
--- a/qobject/json-parser-int.h
+++ b/qobject/json-parser-int.h
@@ -50,6 +50,7 @@ void json_message_process_token(JSONLexer *lexer, GString *input,
/* json-parser.c */
JSONToken *json_token(JSONTokenType type, int x, int y, GString *tokstr);
void json_parser_init(JSONParserContext *ctxt, va_list *ap);
+void json_parser_reset(JSONParserContext *ctxt);
QObject *json_parser_feed(JSONParserContext *ctxt, const JSONToken *token, Error **errp);
void json_parser_destroy(JSONParserContext *ctxt);
diff --git a/qobject/json-parser.c b/qobject/json-parser.c
index 338f0c5aae6..7abdea4dacb 100644
--- a/qobject/json-parser.c
+++ b/qobject/json-parser.c
@@ -541,21 +541,27 @@ JSONToken *json_token(JSONTokenType type, int x, int y, GString *tokstr)
return token;
}
-void json_parser_init(JSONParserContext *ctxt, va_list *ap)
-{
- ctxt->err = NULL;
- ctxt->stack = g_queue_new();
- ctxt->ap = ap;
-}
-
-void json_parser_destroy(JSONParserContext *ctxt)
+void json_parser_reset(JSONParserContext *ctxt)
{
JSONParserStackEntry *entry;
+ ctxt->err = NULL;
while ((entry = g_queue_pop_tail(ctxt->stack)) != NULL) {
qobject_unref(entry->partial);
g_free(entry);
}
+}
+
+void json_parser_init(JSONParserContext *ctxt, va_list *ap)
+{
+ ctxt->stack = g_queue_new();
+ ctxt->ap = ap;
+ json_parser_reset(ctxt);
+}
+
+void json_parser_destroy(JSONParserContext *ctxt)
+{
+ json_parser_reset(ctxt);
g_queue_free(ctxt->stack);
ctxt->stack = NULL;
}
@@ -566,6 +572,10 @@ QObject *json_parser_feed(JSONParserContext *ctxt, const JSONToken *token, Error
assert(!ctxt->err);
switch (token->type) {
+ case JSON_ERROR:
+ parse_error(ctxt, token, "JSON parse error, stray '%s'", token->str);
+ break;
+
case JSON_END_OF_INPUT:
/* Check for premature end of input */
if (!g_queue_is_empty(ctxt->stack)) {
diff --git a/qobject/json-streamer.c b/qobject/json-streamer.c
index 6c93e6fd78d..a1210128ac1 100644
--- a/qobject/json-streamer.c
+++ b/qobject/json-streamer.c
@@ -1,5 +1,5 @@
/*
- * JSON streaming support
+ * JSON parser - callback interface and error recovery
*
* Copyright IBM, Corp. 2009
*
@@ -19,97 +19,73 @@
#define MAX_TOKEN_COUNT (2ULL << 20)
#define MAX_NESTING (1 << 10)
-static void json_message_free_tokens(JSONMessageParser *parser)
-{
- JSONToken *token;
-
- while ((token = g_queue_pop_head(&parser->tokens))) {
- g_free(token);
- }
-}
-
void json_message_process_token(JSONLexer *lexer, GString *input,
JSONTokenType type, int x, int y)
{
JSONMessageParser *parser = container_of(lexer, JSONMessageParser, lexer);
- JSONParserContext ctxt;
- QObject *json = NULL;
+ g_autofree JSONToken *token = json_token(type, x, y, input);
Error *err = NULL;
- JSONToken *token;
+ parser->token_size += input->len;
+ parser->token_count++;
+
+ /* Detect message boundaries for error recovery purposes. */
switch (type) {
case JSON_LCURLY:
parser->brace_count++;
break;
case JSON_RCURLY:
- parser->brace_count--;
+ if (parser->brace_count > 0) {
+ parser->brace_count--;
+ }
break;
case JSON_LSQUARE:
parser->bracket_count++;
break;
case JSON_RSQUARE:
- parser->bracket_count--;
- break;
- case JSON_ERROR:
- error_setg(&err, "JSON parse error, stray '%s'", input->str);
- goto out_emit;
- case JSON_END_OF_INPUT:
- if (g_queue_is_empty(&parser->tokens)) {
- return;
+ if (parser->bracket_count > 0) {
+ parser->bracket_count--;
}
break;
default:
break;
}
- /*
- * Security consideration, we limit total memory allocated per object
- * and the maximum recursion depth that a message can force.
- */
- if (parser->token_size + input->len + 1 > MAX_TOKEN_SIZE) {
- error_setg(&err, "JSON token size limit exceeded");
- goto out_emit;
- }
- if (g_queue_get_length(&parser->tokens) + 1 > MAX_TOKEN_COUNT) {
- error_setg(&err, "JSON token count limit exceeded");
- goto out_emit;
- }
- if (parser->bracket_count + parser->brace_count > MAX_NESTING) {
- error_setg(&err, "JSON nesting depth limit exceeded");
- goto out_emit;
- }
+ /* during error recovery eat tokens until parentheses balance */
+ if (!parser->error) {
+ /*
+ * Security consideration, we limit total memory allocated per object
+ * and the maximum recursion depth that a message can force.
+ */
+ if (parser->token_size > MAX_TOKEN_SIZE) {
+ error_setg(&err, "JSON token size limit exceeded");
+ } else if (parser->token_count > MAX_TOKEN_COUNT) {
+ error_setg(&err, "JSON token count limit exceeded");
+ } else if (parser->bracket_count + parser->brace_count > MAX_NESTING) {
+ error_setg(&err, "JSON nesting depth limit exceeded");
+ } else {
+ QObject *json = json_parser_feed(&parser->parser, token, &err);
+ if (json) {
+ parser->emit(parser->opaque, json, NULL);
+ }
+ }
- token = json_token(type, x, y, input);
- parser->token_size += input->len;
-
- g_queue_push_tail(&parser->tokens, token);
-
- if ((parser->brace_count > 0 || parser->bracket_count > 0)
- && parser->brace_count >= 0 && parser->bracket_count >= 0
- && type != JSON_END_OF_INPUT) {
- return;
- }
-
- json_parser_init(&ctxt, parser->ap);
-
- /* Process all tokens in the queue */
- while (!g_queue_is_empty(&parser->tokens)) {
- token = g_queue_pop_head(&parser->tokens);
- json = json_parser_feed(&ctxt, token, &err);
- g_free(token);
- if (json || err) {
- break;
+ if (err) {
+ parser->emit(parser->opaque, NULL, err);
+ /* start recovery */
+ parser->error = true;
}
}
- json_parser_destroy(&ctxt);
-
-out_emit:
- parser->brace_count = 0;
- parser->bracket_count = 0;
- json_message_free_tokens(parser);
- parser->token_size = 0;
- parser->emit(parser->opaque, json, err);
+ if ((parser->brace_count == 0 && parser->bracket_count == 0)
+ || type == JSON_END_OF_INPUT) {
+ parser->error = false;
+ parser->brace_count = 0;
+ parser->bracket_count = 0;
+ parser->token_count = 0;
+ parser->token_size = 0;
+ json_parser_reset(&parser->parser);
+ }
}
void json_message_parser_init(JSONMessageParser *parser,
@@ -119,12 +95,13 @@ void json_message_parser_init(JSONMessageParser *parser,
{
parser->emit = emit;
parser->opaque = opaque;
- parser->ap = ap;
+ parser->error = false;
parser->brace_count = 0;
parser->bracket_count = 0;
- g_queue_init(&parser->tokens);
+ parser->token_count = 0;
parser->token_size = 0;
+ json_parser_init(&parser->parser, ap);
json_lexer_init(&parser->lexer, !!ap);
}
@@ -137,11 +114,10 @@ void json_message_parser_feed(JSONMessageParser *parser,
void json_message_parser_flush(JSONMessageParser *parser)
{
json_lexer_flush(&parser->lexer);
- assert(g_queue_is_empty(&parser->tokens));
}
void json_message_parser_destroy(JSONMessageParser *parser)
{
json_lexer_destroy(&parser->lexer);
- json_message_free_tokens(parser);
+ json_parser_destroy(&parser->parser);
}
--
2.52.0
^ permalink raw reply related [flat|nested] 26+ messages in thread* Re: [PATCH 3/5] json-streamer: remove token queue
2026-01-07 8:48 ` [PATCH 3/5] json-streamer: remove token queue Paolo Bonzini
@ 2026-02-10 7:58 ` Markus Armbruster
2026-02-10 8:22 ` Paolo Bonzini
0 siblings, 1 reply; 26+ messages in thread
From: Markus Armbruster @ 2026-02-10 7:58 UTC (permalink / raw)
To: Paolo Bonzini; +Cc: qemu-devel
Paolo Bonzini <pbonzini@redhat.com> writes:
> Now fully exploit the push parser, feeding it one token at a time
> without having to wait until braces and brackets are balanced.
>
> While the nesting counts are retained for error recovery purposes,
> the system can now report the first parsing error without waiting
> for delimiters to be balanced. This also means that JSON_ERROR
> can be handled in json-parser.c, not json-streamer.c.
>
> After reporting the error, json-streamer.c then enters an error recovery
> mode where subsequent errors are suppressed. This mimics the previous
> error reporting behavior, but it provides prompt feedback on parsing
> errors. As an example, here is an example interaction with qemu-ga.
>
> BEFORE (error reported only once braces are balanced):
>
> >> {'execute':foo
> >> }
> << {"error": {"class": "GenericError", "desc": "JSON parse error, invalid keyword 'foo'"}}
> >> {'execute':'somecommand'}
> << {"error": {"class": "CommandNotFound", "desc": "The command somecommand has not been found"}}
>
> AFTER (error reported immediately, but similar error recovery as before):
>
> >> {'execute':foo
> << {"error": {"class": "GenericError", "desc": "JSON parse error, invalid keyword 'foo'"}}
> >> }
> >> {'execute':'qmp_capabilities'}
> << {"error": {"class": "CommandNotFound", "desc": "The command qmp_capabilities has not been found"}}
Welcome quality of life improvement for human use of QMP, such as manual
testing. The experience is still sub-par when you miss closing
parenthesis: you get the silent treatment.
Nitpick #1: Single quotes are an syntax extension made to enable more
readable QMP input in C strings. Letting it leak into the external
interface was a mistake. The less we show their use, the better. Let's
stick to doublequotes here.
Nitpick #2: example input differs needlessly between BEFORE and AFTER.
>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
> include/qobject/json-parser.h | 5 +-
> qobject/json-parser-int.h | 1 +
> qobject/json-parser.c | 26 +++++---
> qobject/json-streamer.c | 116 ++++++++++++++--------------------
> 4 files changed, 68 insertions(+), 80 deletions(-)
>
> diff --git a/include/qobject/json-parser.h b/include/qobject/json-parser.h
> index 05346fa816b..923eb74ca00 100644
> --- a/include/qobject/json-parser.h
> +++ b/include/qobject/json-parser.h
> @@ -29,11 +29,12 @@ typedef struct JSONParserContext {
> typedef struct JSONMessageParser {
> void (*emit)(void *opaque, QObject *json, Error *err);
> void *opaque;
> - va_list *ap;
> JSONLexer lexer;
> + JSONParserContext parser;
> int brace_count;
> int bracket_count;
> - GQueue tokens;
> + int token_count;
> + bool error;
> uint64_t token_size;
> } JSONMessageParser;
>
> diff --git a/qobject/json-parser-int.h b/qobject/json-parser-int.h
> index 05e2e8e1831..1f435cb8eb2 100644
> --- a/qobject/json-parser-int.h
> +++ b/qobject/json-parser-int.h
> @@ -50,6 +50,7 @@ void json_message_process_token(JSONLexer *lexer, GString *input,
> /* json-parser.c */
> JSONToken *json_token(JSONTokenType type, int x, int y, GString *tokstr);
> void json_parser_init(JSONParserContext *ctxt, va_list *ap);
> +void json_parser_reset(JSONParserContext *ctxt);
> QObject *json_parser_feed(JSONParserContext *ctxt, const JSONToken *token, Error **errp);
> void json_parser_destroy(JSONParserContext *ctxt);
>
> diff --git a/qobject/json-parser.c b/qobject/json-parser.c
> index 338f0c5aae6..7abdea4dacb 100644
> --- a/qobject/json-parser.c
> +++ b/qobject/json-parser.c
> @@ -541,21 +541,27 @@ JSONToken *json_token(JSONTokenType type, int x, int y, GString *tokstr)
> return token;
> }
>
> -void json_parser_init(JSONParserContext *ctxt, va_list *ap)
> -{
> - ctxt->err = NULL;
> - ctxt->stack = g_queue_new();
> - ctxt->ap = ap;
> -}
> -
> -void json_parser_destroy(JSONParserContext *ctxt)
> +void json_parser_reset(JSONParserContext *ctxt)
> {
> JSONParserStackEntry *entry;
>
> + ctxt->err = NULL;
> while ((entry = g_queue_pop_tail(ctxt->stack)) != NULL) {
> qobject_unref(entry->partial);
> g_free(entry);
> }
> +}
> +
> +void json_parser_init(JSONParserContext *ctxt, va_list *ap)
> +{
> + ctxt->stack = g_queue_new();
> + ctxt->ap = ap;
> + json_parser_reset(ctxt);
> +}
> +
> +void json_parser_destroy(JSONParserContext *ctxt)
> +{
> + json_parser_reset(ctxt);
> g_queue_free(ctxt->stack);
> ctxt->stack = NULL;
> }
> @@ -566,6 +572,10 @@ QObject *json_parser_feed(JSONParserContext *ctxt, const JSONToken *token, Error
>
> assert(!ctxt->err);
> switch (token->type) {
> + case JSON_ERROR:
> + parse_error(ctxt, token, "JSON parse error, stray '%s'", token->str);
> + break;
> +
> case JSON_END_OF_INPUT:
> /* Check for premature end of input */
> if (!g_queue_is_empty(ctxt->stack)) {
> diff --git a/qobject/json-streamer.c b/qobject/json-streamer.c
> index 6c93e6fd78d..a1210128ac1 100644
> --- a/qobject/json-streamer.c
> +++ b/qobject/json-streamer.c
> @@ -1,5 +1,5 @@
> /*
> - * JSON streaming support
> + * JSON parser - callback interface and error recovery
"Streaming" has always been misleading. This isn't a streaming parser,
and never was. It's a conventional parser wed to a push lexer with a
hack before this series, and a push parser afterwards.
> *
> * Copyright IBM, Corp. 2009
> *
> @@ -19,97 +19,73 @@
> #define MAX_TOKEN_COUNT (2ULL << 20)
> #define MAX_NESTING (1 << 10)
>
> -static void json_message_free_tokens(JSONMessageParser *parser)
> -{
> - JSONToken *token;
> -
> - while ((token = g_queue_pop_head(&parser->tokens))) {
> - g_free(token);
> - }
> -}
> -
> void json_message_process_token(JSONLexer *lexer, GString *input,
> JSONTokenType type, int x, int y)
> {
> JSONMessageParser *parser = container_of(lexer, JSONMessageParser, lexer);
> - JSONParserContext ctxt;
> - QObject *json = NULL;
> + g_autofree JSONToken *token = json_token(type, x, y, input);
> Error *err = NULL;
> - JSONToken *token;
>
> + parser->token_size += input->len;
> + parser->token_count++;
> +
> + /* Detect message boundaries for error recovery purposes. */
I like the comment.
> switch (type) {
> case JSON_LCURLY:
> parser->brace_count++;
> break;
> case JSON_RCURLY:
> - parser->brace_count--;
> + if (parser->brace_count > 0) {
> + parser->brace_count--;
> + }
Looks like you're keeping @brace_count (here) and @bracket_count (below)
non-negative. Any particular reason?
Make them unsigned?
> break;
> case JSON_LSQUARE:
> parser->bracket_count++;
> break;
> case JSON_RSQUARE:
> - parser->bracket_count--;
> - break;
> - case JSON_ERROR:
> - error_setg(&err, "JSON parse error, stray '%s'", input->str);
> - goto out_emit;
> - case JSON_END_OF_INPUT:
> - if (g_queue_is_empty(&parser->tokens)) {
> - return;
> + if (parser->bracket_count > 0) {
> + parser->bracket_count--;
> }
> break;
> default:
> break;
> }
>
> - /*
> - * Security consideration, we limit total memory allocated per object
> - * and the maximum recursion depth that a message can force.
> - */
> - if (parser->token_size + input->len + 1 > MAX_TOKEN_SIZE) {
> - error_setg(&err, "JSON token size limit exceeded");
> - goto out_emit;
> - }
> - if (g_queue_get_length(&parser->tokens) + 1 > MAX_TOKEN_COUNT) {
> - error_setg(&err, "JSON token count limit exceeded");
> - goto out_emit;
> - }
> - if (parser->bracket_count + parser->brace_count > MAX_NESTING) {
> - error_setg(&err, "JSON nesting depth limit exceeded");
> - goto out_emit;
> - }
> + /* during error recovery eat tokens until parentheses balance */
> + if (!parser->error) {
I think I'd write this as
if (parser->error) {
/* error recovery: eat tokens until parentheses balance */
} else {
> + /*
> + * Security consideration, we limit total memory allocated per object
> + * and the maximum recursion depth that a message can force.
> + */
This comment now needs rephrasing: there's no recursion anymore. In the
previous commit if we're feeling persnickety.
Suggest s/security/safety/ while there.
> + if (parser->token_size > MAX_TOKEN_SIZE) {
> + error_setg(&err, "JSON token size limit exceeded");
> + } else if (parser->token_count > MAX_TOKEN_COUNT) {
> + error_setg(&err, "JSON token count limit exceeded");
> + } else if (parser->bracket_count + parser->brace_count > MAX_NESTING) {
> + error_setg(&err, "JSON nesting depth limit exceeded");
> + } else {
> + QObject *json = json_parser_feed(&parser->parser, token, &err);
> + if (json) {
> + parser->emit(parser->opaque, json, NULL);
> + }
> + }
>
> - token = json_token(type, x, y, input);
> - parser->token_size += input->len;
> -
> - g_queue_push_tail(&parser->tokens, token);
> -
> - if ((parser->brace_count > 0 || parser->bracket_count > 0)
> - && parser->brace_count >= 0 && parser->bracket_count >= 0
> - && type != JSON_END_OF_INPUT) {
> - return;
> - }
> -
> - json_parser_init(&ctxt, parser->ap);
> -
> - /* Process all tokens in the queue */
> - while (!g_queue_is_empty(&parser->tokens)) {
> - token = g_queue_pop_head(&parser->tokens);
> - json = json_parser_feed(&ctxt, token, &err);
> - g_free(token);
> - if (json || err) {
> - break;
> + if (err) {
> + parser->emit(parser->opaque, NULL, err);
> + /* start recovery */
> + parser->error = true;
> }
> }
>
> - json_parser_destroy(&ctxt);
> -
> -out_emit:
> - parser->brace_count = 0;
> - parser->bracket_count = 0;
> - json_message_free_tokens(parser);
> - parser->token_size = 0;
> - parser->emit(parser->opaque, json, err);
> + if ((parser->brace_count == 0 && parser->bracket_count == 0)
> + || type == JSON_END_OF_INPUT) {
> + parser->error = false;
> + parser->brace_count = 0;
> + parser->bracket_count = 0;
> + parser->token_count = 0;
> + parser->token_size = 0;
> + json_parser_reset(&parser->parser);
> + }
> }
>
> void json_message_parser_init(JSONMessageParser *parser,
> @@ -119,12 +95,13 @@ void json_message_parser_init(JSONMessageParser *parser,
> {
> parser->emit = emit;
> parser->opaque = opaque;
> - parser->ap = ap;
> + parser->error = false;
> parser->brace_count = 0;
> parser->bracket_count = 0;
> - g_queue_init(&parser->tokens);
> + parser->token_count = 0;
> parser->token_size = 0;
>
> + json_parser_init(&parser->parser, ap);
> json_lexer_init(&parser->lexer, !!ap);
> }
>
> @@ -137,11 +114,10 @@ void json_message_parser_feed(JSONMessageParser *parser,
> void json_message_parser_flush(JSONMessageParser *parser)
> {
> json_lexer_flush(&parser->lexer);
> - assert(g_queue_is_empty(&parser->tokens));
> }
>
> void json_message_parser_destroy(JSONMessageParser *parser)
> {
> json_lexer_destroy(&parser->lexer);
> - json_message_free_tokens(parser);
> + json_parser_destroy(&parser->parser);
> }
Not mentioned in the commit message: before the patch, the "streamer"
calls json_parser_init() and json_parser_destroy() for each balanced
expression. Afterwards, it calls them once, and instea calls
json_parser_reset() between balanced expressions.
I wonder whether it could be a separate patch.
^ permalink raw reply [flat|nested] 26+ messages in thread* Re: [PATCH 3/5] json-streamer: remove token queue
2026-02-10 7:58 ` Markus Armbruster
@ 2026-02-10 8:22 ` Paolo Bonzini
2026-02-11 7:13 ` Markus Armbruster
0 siblings, 1 reply; 26+ messages in thread
From: Paolo Bonzini @ 2026-02-10 8:22 UTC (permalink / raw)
To: Markus Armbruster; +Cc: qemu-devel
On 2/10/26 08:58, Markus Armbruster wrote:
> Paolo Bonzini <pbonzini@redhat.com> writes:
>
>> Now fully exploit the push parser, feeding it one token at a time
>> without having to wait until braces and brackets are balanced.
>>
>> While the nesting counts are retained for error recovery purposes,
>> the system can now report the first parsing error without waiting
>> for delimiters to be balanced. This also means that JSON_ERROR
>> can be handled in json-parser.c, not json-streamer.c.
>>
>> After reporting the error, json-streamer.c then enters an error recovery
>> mode where subsequent errors are suppressed. This mimics the previous
>> error reporting behavior, but it provides prompt feedback on parsing
>> errors. As an example, here is an example interaction with qemu-ga.
>>
>> BEFORE (error reported only once braces are balanced):
>>
>> >> {'execute':foo
>> >> }
>> << {"error": {"class": "GenericError", "desc": "JSON parse error, invalid keyword 'foo'"}}
>> >> {'execute':'somecommand'}
>> << {"error": {"class": "CommandNotFound", "desc": "The command somecommand has not been found"}}
>>
>> AFTER (error reported immediately, but similar error recovery as before):
>>
>> >> {'execute':foo
>> << {"error": {"class": "GenericError", "desc": "JSON parse error, invalid keyword 'foo'"}}
>> >> }
>> >> {'execute':'qmp_capabilities'}
>> << {"error": {"class": "CommandNotFound", "desc": "The command qmp_capabilities has not been found"}}
>
> Welcome quality of life improvement for human use of QMP, such as manual
> testing. The experience is still sub-par when you miss closing
> parenthesis: you get the silent treatment.
>
> Nitpick #1: Single quotes are an syntax extension made to enable more
> readable QMP input in C strings. Letting it leak into the external
> interface was a mistake. The less we show their use, the better. Let's
> stick to doublequotes here.
>
> Nitpick #2: example input differs needlessly between BEFORE and AFTER.
>
>>
>> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
>> ---
>> include/qobject/json-parser.h | 5 +-
>> qobject/json-parser-int.h | 1 +
>> qobject/json-parser.c | 26 +++++---
>> qobject/json-streamer.c | 116 ++++++++++++++--------------------
>> 4 files changed, 68 insertions(+), 80 deletions(-)
>>
>> diff --git a/include/qobject/json-parser.h b/include/qobject/json-parser.h
>> index 05346fa816b..923eb74ca00 100644
>> --- a/include/qobject/json-parser.h
>> +++ b/include/qobject/json-parser.h
>> @@ -29,11 +29,12 @@ typedef struct JSONParserContext {
>> typedef struct JSONMessageParser {
>> void (*emit)(void *opaque, QObject *json, Error *err);
>> void *opaque;
>> - va_list *ap;
>> JSONLexer lexer;
>> + JSONParserContext parser;
>> int brace_count;
>> int bracket_count;
>> - GQueue tokens;
>> + int token_count;
>> + bool error;
>> uint64_t token_size;
>> } JSONMessageParser;
>>
>> diff --git a/qobject/json-parser-int.h b/qobject/json-parser-int.h
>> index 05e2e8e1831..1f435cb8eb2 100644
>> --- a/qobject/json-parser-int.h
>> +++ b/qobject/json-parser-int.h
>> @@ -50,6 +50,7 @@ void json_message_process_token(JSONLexer *lexer, GString *input,
>> /* json-parser.c */
>> JSONToken *json_token(JSONTokenType type, int x, int y, GString *tokstr);
>> void json_parser_init(JSONParserContext *ctxt, va_list *ap);
>> +void json_parser_reset(JSONParserContext *ctxt);
>> QObject *json_parser_feed(JSONParserContext *ctxt, const JSONToken *token, Error **errp);
>> void json_parser_destroy(JSONParserContext *ctxt);
>>
>> diff --git a/qobject/json-parser.c b/qobject/json-parser.c
>> index 338f0c5aae6..7abdea4dacb 100644
>> --- a/qobject/json-parser.c
>> +++ b/qobject/json-parser.c
>> @@ -541,21 +541,27 @@ JSONToken *json_token(JSONTokenType type, int x, int y, GString *tokstr)
>> return token;
>> }
>>
>> -void json_parser_init(JSONParserContext *ctxt, va_list *ap)
>> -{
>> - ctxt->err = NULL;
>> - ctxt->stack = g_queue_new();
>> - ctxt->ap = ap;
>> -}
>> -
>> -void json_parser_destroy(JSONParserContext *ctxt)
>> +void json_parser_reset(JSONParserContext *ctxt)
>> {
>> JSONParserStackEntry *entry;
>>
>> + ctxt->err = NULL;
>> while ((entry = g_queue_pop_tail(ctxt->stack)) != NULL) {
>> qobject_unref(entry->partial);
>> g_free(entry);
>> }
>> +}
>> +
>> +void json_parser_init(JSONParserContext *ctxt, va_list *ap)
>> +{
>> + ctxt->stack = g_queue_new();
>> + ctxt->ap = ap;
>> + json_parser_reset(ctxt);
>> +}
>> +
>> +void json_parser_destroy(JSONParserContext *ctxt)
>> +{
>> + json_parser_reset(ctxt);
>> g_queue_free(ctxt->stack);
>> ctxt->stack = NULL;
>> }
>> @@ -566,6 +572,10 @@ QObject *json_parser_feed(JSONParserContext *ctxt, const JSONToken *token, Error
>>
>> assert(!ctxt->err);
>> switch (token->type) {
>> + case JSON_ERROR:
>> + parse_error(ctxt, token, "JSON parse error, stray '%s'", token->str);
>> + break;
>> +
>> case JSON_END_OF_INPUT:
>> /* Check for premature end of input */
>> if (!g_queue_is_empty(ctxt->stack)) {
>> diff --git a/qobject/json-streamer.c b/qobject/json-streamer.c
>> index 6c93e6fd78d..a1210128ac1 100644
>> --- a/qobject/json-streamer.c
>> +++ b/qobject/json-streamer.c
>> @@ -1,5 +1,5 @@
>> /*
>> - * JSON streaming support
>> + * JSON parser - callback interface and error recovery
>
> "Streaming" has always been misleading. This isn't a streaming parser,
> and never was. It's a conventional parser wed to a push lexer with a
> hack before this series, and a push parser afterwards.
>
>> *
>> * Copyright IBM, Corp. 2009
>> *
>> @@ -19,97 +19,73 @@
>> #define MAX_TOKEN_COUNT (2ULL << 20)
>> #define MAX_NESTING (1 << 10)
>>
>> -static void json_message_free_tokens(JSONMessageParser *parser)
>> -{
>> - JSONToken *token;
>> -
>> - while ((token = g_queue_pop_head(&parser->tokens))) {
>> - g_free(token);
>> - }
>> -}
>> -
>> void json_message_process_token(JSONLexer *lexer, GString *input,
>> JSONTokenType type, int x, int y)
>> {
>> JSONMessageParser *parser = container_of(lexer, JSONMessageParser, lexer);
>> - JSONParserContext ctxt;
>> - QObject *json = NULL;
>> + g_autofree JSONToken *token = json_token(type, x, y, input);
>> Error *err = NULL;
>> - JSONToken *token;
>>
>> + parser->token_size += input->len;
>> + parser->token_count++;
>> +
>> + /* Detect message boundaries for error recovery purposes. */
>
> I like the comment.
>
>> switch (type) {
>> case JSON_LCURLY:
>> parser->brace_count++;
>> break;
>> case JSON_RCURLY:
>> - parser->brace_count--;
>> + if (parser->brace_count > 0) {
>> + parser->brace_count--;
>> + }
>
> Looks like you're keeping @brace_count (here) and @bracket_count (below)
> non-negative. Any particular reason?
Because negative brace_count/bracket_count is an insta-recovered error.
I could make it
if ((parser->brace_count <= 0 && parser->bracket_count <= 0)
below, or even better make this a separate patch.
>> + /*
>> + * Security consideration, we limit total memory allocated per object
>> + * and the maximum recursion depth that a message can force.
>> + */
>
> This comment now needs rephrasing: there's no recursion anymore. In the
> previous commit if we're feeling persnickety.
s/recursion/nesting/?
> Not mentioned in the commit message: before the patch, the "streamer"
> calls json_parser_init() and json_parser_destroy() for each balanced
> expression. Afterwards, it calls them once, and instea calls
> json_parser_reset() between balanced expressions.
>
> I wonder whether it could be a separate patch.
Sure. I'll also try to make "brace_count"/"bracket_count" unsigned in a
separate patch.
Paolo
^ permalink raw reply [flat|nested] 26+ messages in thread* Re: [PATCH 3/5] json-streamer: remove token queue
2026-02-10 8:22 ` Paolo Bonzini
@ 2026-02-11 7:13 ` Markus Armbruster
0 siblings, 0 replies; 26+ messages in thread
From: Markus Armbruster @ 2026-02-11 7:13 UTC (permalink / raw)
To: Paolo Bonzini; +Cc: qemu-devel
Paolo Bonzini <pbonzini@redhat.com> writes:
> On 2/10/26 08:58, Markus Armbruster wrote:
>> Paolo Bonzini <pbonzini@redhat.com> writes:
>>
>>> Now fully exploit the push parser, feeding it one token at a time
>>> without having to wait until braces and brackets are balanced.
>>>
>>> While the nesting counts are retained for error recovery purposes,
>>> the system can now report the first parsing error without waiting
>>> for delimiters to be balanced. This also means that JSON_ERROR
>>> can be handled in json-parser.c, not json-streamer.c.
>>>
>>> After reporting the error, json-streamer.c then enters an error recovery
>>> mode where subsequent errors are suppressed. This mimics the previous
>>> error reporting behavior, but it provides prompt feedback on parsing
>>> errors. As an example, here is an example interaction with qemu-ga.
[...]
>>> diff --git a/qobject/json-streamer.c b/qobject/json-streamer.c
>>> index 6c93e6fd78d..a1210128ac1 100644
>>> --- a/qobject/json-streamer.c
>>> +++ b/qobject/json-streamer.c
[...]
>>> + /*
>>> + * Security consideration, we limit total memory allocated per object
>>> + * and the maximum recursion depth that a message can force.
>>> + */
>>
>> This comment now needs rephrasing: there's no recursion anymore. In the
>> previous commit if we're feeling persnickety.
>
> s/recursion/nesting/?
Yup.
[...]
^ permalink raw reply [flat|nested] 26+ messages in thread
* [PATCH 4/5] json-streamer: do not heap-allocate JSONToken
2026-01-07 8:48 [PATCH 0/5] qobject: switch JSON parser to push Paolo Bonzini
` (2 preceding siblings ...)
2026-01-07 8:48 ` [PATCH 3/5] json-streamer: remove token queue Paolo Bonzini
@ 2026-01-07 8:48 ` Paolo Bonzini
2026-02-10 8:30 ` Markus Armbruster
2026-01-07 8:48 ` [PATCH 5/5] json-parser: add location to JSON parsing errors Paolo Bonzini
` (2 subsequent siblings)
6 siblings, 1 reply; 26+ messages in thread
From: Paolo Bonzini @ 2026-01-07 8:48 UTC (permalink / raw)
To: qemu-devel; +Cc: armbru
This is not needed with a push parser. Since it processes tokens
immediately, the JSONToken can be created directly on the stack
and does not need to copy the lexer's string data.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
qobject/json-parser-int.h | 8 ++++++--
qobject/json-parser.c | 18 ------------------
qobject/json-streamer.c | 4 ++--
3 files changed, 8 insertions(+), 22 deletions(-)
diff --git a/qobject/json-parser-int.h b/qobject/json-parser-int.h
index 1f435cb8eb2..5a6b5c9af90 100644
--- a/qobject/json-parser-int.h
+++ b/qobject/json-parser-int.h
@@ -35,7 +35,12 @@ typedef enum json_token_type {
JSON_MAX = JSON_END_OF_INPUT
} JSONTokenType;
-typedef struct JSONToken JSONToken;
+typedef struct JSONToken {
+ JSONTokenType type;
+ int x;
+ int y;
+ char *str;
+} JSONToken;
/* json-lexer.c */
void json_lexer_init(JSONLexer *lexer, bool enable_interpolation);
@@ -48,7 +53,6 @@ void json_message_process_token(JSONLexer *lexer, GString *input,
JSONTokenType type, int x, int y);
/* json-parser.c */
-JSONToken *json_token(JSONTokenType type, int x, int y, GString *tokstr);
void json_parser_init(JSONParserContext *ctxt, va_list *ap);
void json_parser_reset(JSONParserContext *ctxt);
QObject *json_parser_feed(JSONParserContext *ctxt, const JSONToken *token, Error **errp);
diff --git a/qobject/json-parser.c b/qobject/json-parser.c
index 7abdea4dacb..2fede59842f 100644
--- a/qobject/json-parser.c
+++ b/qobject/json-parser.c
@@ -24,13 +24,6 @@
#include "qobject/qstring.h"
#include "json-parser-int.h"
-struct JSONToken {
- JSONTokenType type;
- int x;
- int y;
- char str[];
-};
-
/*
* Objects: { } | { members }
* - Empty: { -> AFTER_LCURLY -> }
@@ -529,17 +522,6 @@ static QObject *json_parser_parse_token(JSONParserContext *ctxt, const JSONToken
return NULL;
}
-JSONToken *json_token(JSONTokenType type, int x, int y, GString *tokstr)
-{
- JSONToken *token = g_malloc(sizeof(JSONToken) + tokstr->len + 1);
-
- token->type = type;
- memcpy(token->str, tokstr->str, tokstr->len);
- token->str[tokstr->len] = 0;
- token->x = x;
- token->y = y;
- return token;
-}
void json_parser_reset(JSONParserContext *ctxt)
{
diff --git a/qobject/json-streamer.c b/qobject/json-streamer.c
index a1210128ac1..07e0ef51ce3 100644
--- a/qobject/json-streamer.c
+++ b/qobject/json-streamer.c
@@ -23,7 +23,7 @@ void json_message_process_token(JSONLexer *lexer, GString *input,
JSONTokenType type, int x, int y)
{
JSONMessageParser *parser = container_of(lexer, JSONMessageParser, lexer);
- g_autofree JSONToken *token = json_token(type, x, y, input);
+ JSONToken token = (JSONToken) { .type = type, .x = x, .y = y, .str = input->str };
Error *err = NULL;
parser->token_size += input->len;
@@ -64,7 +64,7 @@ void json_message_process_token(JSONLexer *lexer, GString *input,
} else if (parser->bracket_count + parser->brace_count > MAX_NESTING) {
error_setg(&err, "JSON nesting depth limit exceeded");
} else {
- QObject *json = json_parser_feed(&parser->parser, token, &err);
+ QObject *json = json_parser_feed(&parser->parser, &token, &err);
if (json) {
parser->emit(parser->opaque, json, NULL);
}
--
2.52.0
^ permalink raw reply related [flat|nested] 26+ messages in thread* Re: [PATCH 4/5] json-streamer: do not heap-allocate JSONToken
2026-01-07 8:48 ` [PATCH 4/5] json-streamer: do not heap-allocate JSONToken Paolo Bonzini
@ 2026-02-10 8:30 ` Markus Armbruster
0 siblings, 0 replies; 26+ messages in thread
From: Markus Armbruster @ 2026-02-10 8:30 UTC (permalink / raw)
To: Paolo Bonzini; +Cc: qemu-devel
Paolo Bonzini <pbonzini@redhat.com> writes:
> This is not needed with a push parser. Since it processes tokens
> immediately, the JSONToken can be created directly on the stack
> and does not need to copy the lexer's string data.
>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
> qobject/json-parser-int.h | 8 ++++++--
> qobject/json-parser.c | 18 ------------------
> qobject/json-streamer.c | 4 ++--
> 3 files changed, 8 insertions(+), 22 deletions(-)
>
> diff --git a/qobject/json-parser-int.h b/qobject/json-parser-int.h
> index 1f435cb8eb2..5a6b5c9af90 100644
> --- a/qobject/json-parser-int.h
> +++ b/qobject/json-parser-int.h
> @@ -35,7 +35,12 @@ typedef enum json_token_type {
> JSON_MAX = JSON_END_OF_INPUT
> } JSONTokenType;
>
> -typedef struct JSONToken JSONToken;
> +typedef struct JSONToken {
> + JSONTokenType type;
> + int x;
> + int y;
> + char *str;
> +} JSONToken;
>
> /* json-lexer.c */
> void json_lexer_init(JSONLexer *lexer, bool enable_interpolation);
> @@ -48,7 +53,6 @@ void json_message_process_token(JSONLexer *lexer, GString *input,
> JSONTokenType type, int x, int y);
>
> /* json-parser.c */
> -JSONToken *json_token(JSONTokenType type, int x, int y, GString *tokstr);
> void json_parser_init(JSONParserContext *ctxt, va_list *ap);
> void json_parser_reset(JSONParserContext *ctxt);
> QObject *json_parser_feed(JSONParserContext *ctxt, const JSONToken *token, Error **errp);
> diff --git a/qobject/json-parser.c b/qobject/json-parser.c
> index 7abdea4dacb..2fede59842f 100644
> --- a/qobject/json-parser.c
> +++ b/qobject/json-parser.c
> @@ -24,13 +24,6 @@
> #include "qobject/qstring.h"
> #include "json-parser-int.h"
>
> -struct JSONToken {
> - JSONTokenType type;
> - int x;
> - int y;
> - char str[];
> -};
> -
> /*
> * Objects: { } | { members }
> * - Empty: { -> AFTER_LCURLY -> }
> @@ -529,17 +522,6 @@ static QObject *json_parser_parse_token(JSONParserContext *ctxt, const JSONToken
> return NULL;
> }
>
> -JSONToken *json_token(JSONTokenType type, int x, int y, GString *tokstr)
> -{
> - JSONToken *token = g_malloc(sizeof(JSONToken) + tokstr->len + 1);
> -
> - token->type = type;
> - memcpy(token->str, tokstr->str, tokstr->len);
> - token->str[tokstr->len] = 0;
> - token->x = x;
> - token->y = y;
> - return token;
> -}
>
> void json_parser_reset(JSONParserContext *ctxt)
> {
> diff --git a/qobject/json-streamer.c b/qobject/json-streamer.c
> index a1210128ac1..07e0ef51ce3 100644
> --- a/qobject/json-streamer.c
> +++ b/qobject/json-streamer.c
> @@ -23,7 +23,7 @@ void json_message_process_token(JSONLexer *lexer, GString *input,
> JSONTokenType type, int x, int y)
> {
> JSONMessageParser *parser = container_of(lexer, JSONMessageParser, lexer);
> - g_autofree JSONToken *token = json_token(type, x, y, input);
> + JSONToken token = (JSONToken) { .type = type, .x = x, .y = y, .str = input->str };
checkpatch points out:
0004-json-streamer-do-not-heap-allocate-JSONToken.patch:89: WARNING: line over 80 characters
Easy enough to avoid.
> Error *err = NULL;
>
> parser->token_size += input->len;
> @@ -64,7 +64,7 @@ void json_message_process_token(JSONLexer *lexer, GString *input,
> } else if (parser->bracket_count + parser->brace_count > MAX_NESTING) {
> error_setg(&err, "JSON nesting depth limit exceeded");
> } else {
> - QObject *json = json_parser_feed(&parser->parser, token, &err);
> + QObject *json = json_parser_feed(&parser->parser, &token, &err);
> if (json) {
> parser->emit(parser->opaque, json, NULL);
> }
With the long line tidied up
Reviewed-by: Markus Armbruster <armbru@redhat.com>
^ permalink raw reply [flat|nested] 26+ messages in thread
* [PATCH 5/5] json-parser: add location to JSON parsing errors
2026-01-07 8:48 [PATCH 0/5] qobject: switch JSON parser to push Paolo Bonzini
` (3 preceding siblings ...)
2026-01-07 8:48 ` [PATCH 4/5] json-streamer: do not heap-allocate JSONToken Paolo Bonzini
@ 2026-01-07 8:48 ` Paolo Bonzini
2026-02-10 9:21 ` Markus Armbruster
2026-01-21 5:57 ` [PATCH 0/5] qobject: switch JSON parser to push Paolo Bonzini
2026-01-30 13:00 ` Markus Armbruster
6 siblings, 1 reply; 26+ messages in thread
From: Paolo Bonzini @ 2026-01-07 8:48 UTC (permalink / raw)
To: qemu-devel; +Cc: armbru
Now that all calls to parse_error have a token, add the line and column
to the message. As far as I can see the two important TODOs (better
errors and better EOI handling) are done, and the others (token range
information and "parsed size"?) do not really matter or are handled
better by json-streamer.c. So remove the list, which had sat unchanged
since 2009.
Note however that right now the x and y are those of the *last* character
in the token. Modify json-lexer.c to freeze tok->x and tok->y at the
first character added to the GString.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
include/qobject/json-parser.h | 1 +
qobject/json-lexer.c | 11 +++++++----
qobject/json-parser.c | 12 ++----------
3 files changed, 10 insertions(+), 14 deletions(-)
diff --git a/include/qobject/json-parser.h b/include/qobject/json-parser.h
index 923eb74ca00..98fe1371a85 100644
--- a/include/qobject/json-parser.h
+++ b/include/qobject/json-parser.h
@@ -17,6 +17,7 @@
typedef struct JSONLexer {
int start_state, state;
GString *token;
+ int cur_x, cur_y;
int x, y;
} JSONLexer;
diff --git a/qobject/json-lexer.c b/qobject/json-lexer.c
index 51341d96e49..7753ba6c092 100644
--- a/qobject/json-lexer.c
+++ b/qobject/json-lexer.c
@@ -277,7 +277,8 @@ void json_lexer_init(JSONLexer *lexer, bool enable_interpolation)
lexer->start_state = lexer->state = enable_interpolation
? IN_START_INTERP : IN_START;
lexer->token = g_string_sized_new(3);
- lexer->x = lexer->y = 0;
+ lexer->cur_x = lexer->cur_y = 1;
+ lexer->x = lexer->y = 1;
}
static void json_lexer_feed_char(JSONLexer *lexer, char ch, bool flush)
@@ -285,10 +286,10 @@ static void json_lexer_feed_char(JSONLexer *lexer, char ch, bool flush)
int new_state;
bool char_consumed = false;
- lexer->x++;
+ lexer->cur_x++;
if (ch == '\n') {
- lexer->x = 0;
- lexer->y++;
+ lexer->cur_x = 1;
+ lexer->cur_y++;
}
while (flush ? lexer->state != lexer->start_state : !char_consumed) {
@@ -316,6 +317,8 @@ static void json_lexer_feed_char(JSONLexer *lexer, char ch, bool flush)
case IN_START:
g_string_truncate(lexer->token, 0);
new_state = lexer->start_state;
+ lexer->x = lexer->cur_x;
+ lexer->y = lexer->cur_y;
break;
case JSON_ERROR:
json_message_process_token(lexer, lexer->token, JSON_ERROR,
diff --git a/qobject/json-parser.c b/qobject/json-parser.c
index 2fede59842f..93b737511d1 100644
--- a/qobject/json-parser.c
+++ b/qobject/json-parser.c
@@ -57,15 +57,6 @@ typedef struct JSONParserStackEntry {
#define BUG_ON(cond) assert(!(cond))
-/**
- * TODO
- *
- * 0) make errors meaningful again
- * 1) add geometry information to tokens
- * 3) should we return a parsed size?
- * 4) deal with premature EOI
- */
-
static inline JSONParserStackEntry *current_entry(JSONParserContext *ctxt)
{
return g_queue_peek_tail(ctxt->stack);
@@ -105,7 +96,8 @@ static void G_GNUC_PRINTF(3, 4) parse_error(JSONParserContext *ctxt,
va_start(ap, msg);
vsnprintf(message, sizeof(message), msg, ap);
va_end(ap);
- error_setg(&ctxt->err, "JSON parse error, %s", message);
+ error_setg(&ctxt->err, "JSON parse error at line %d, column %d, %s",
+ token->y, token->x, message);
}
static int cvt4hex(const char *s)
--
2.52.0
^ permalink raw reply related [flat|nested] 26+ messages in thread* Re: [PATCH 5/5] json-parser: add location to JSON parsing errors
2026-01-07 8:48 ` [PATCH 5/5] json-parser: add location to JSON parsing errors Paolo Bonzini
@ 2026-02-10 9:21 ` Markus Armbruster
2026-02-10 9:44 ` Paolo Bonzini
0 siblings, 1 reply; 26+ messages in thread
From: Markus Armbruster @ 2026-02-10 9:21 UTC (permalink / raw)
To: Paolo Bonzini; +Cc: qemu-devel
Paolo Bonzini <pbonzini@redhat.com> writes:
> Now that all calls to parse_error have a token, add the line and column
Enabled by PATCH 2 pushing the JSON_END_OF_INPUT to the parser. Before,
the parser detected end of input by running out of tokens. It passed a
null token to parse_error() then.
> to the message. As far as I can see the two important TODOs (better
> errors and better EOI handling) are done, and the others (token range
> information and "parsed size"?) do not really matter or are handled
> better by json-streamer.c. So remove the list, which had sat unchanged
> since 2009.
Adding my remarks to the patch hunk below. May lead to commit message
adjustments.
An example showing error before and after would be nice.
> Note however that right now the x and y are those of the *last* character
> in the token. Modify json-lexer.c to freeze tok->x and tok->y at the
> first character added to the GString.
>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
> include/qobject/json-parser.h | 1 +
> qobject/json-lexer.c | 11 +++++++----
> qobject/json-parser.c | 12 ++----------
> 3 files changed, 10 insertions(+), 14 deletions(-)
>
> diff --git a/include/qobject/json-parser.h b/include/qobject/json-parser.h
> index 923eb74ca00..98fe1371a85 100644
> --- a/include/qobject/json-parser.h
> +++ b/include/qobject/json-parser.h
> @@ -17,6 +17,7 @@
> typedef struct JSONLexer {
> int start_state, state;
> GString *token;
> + int cur_x, cur_y;
> int x, y;
> } JSONLexer;
>
> diff --git a/qobject/json-lexer.c b/qobject/json-lexer.c
> index 51341d96e49..7753ba6c092 100644
> --- a/qobject/json-lexer.c
> +++ b/qobject/json-lexer.c
> @@ -277,7 +277,8 @@ void json_lexer_init(JSONLexer *lexer, bool enable_interpolation)
> lexer->start_state = lexer->state = enable_interpolation
> ? IN_START_INTERP : IN_START;
> lexer->token = g_string_sized_new(3);
> - lexer->x = lexer->y = 0;
> + lexer->cur_x = lexer->cur_y = 1;
> + lexer->x = lexer->y = 1;
Not mentioned in the commit message: we switch @ and @y from zero-based
to one-based. Pefectly safe, because they're both unused before this
patch.
Suggest to rename to line and col.
> }
>
> static void json_lexer_feed_char(JSONLexer *lexer, char ch, bool flush)
> @@ -285,10 +286,10 @@ static void json_lexer_feed_char(JSONLexer *lexer, char ch, bool flush)
> int new_state;
> bool char_consumed = false;
>
> - lexer->x++;
> + lexer->cur_x++;
> if (ch == '\n') {
> - lexer->x = 0;
> - lexer->y++;
> + lexer->cur_x = 1;
> + lexer->cur_y++;
> }
>
> while (flush ? lexer->state != lexer->start_state : !char_consumed) {
> @@ -316,6 +317,8 @@ static void json_lexer_feed_char(JSONLexer *lexer, char ch, bool flush)
> case IN_START:
> g_string_truncate(lexer->token, 0);
> new_state = lexer->start_state;
> + lexer->x = lexer->cur_x;
> + lexer->y = lexer->cur_y;
> break;
> case JSON_ERROR:
> json_message_process_token(lexer, lexer->token, JSON_ERROR,
> diff --git a/qobject/json-parser.c b/qobject/json-parser.c
> index 2fede59842f..93b737511d1 100644
> --- a/qobject/json-parser.c
> +++ b/qobject/json-parser.c
> @@ -57,15 +57,6 @@ typedef struct JSONParserStackEntry {
>
> #define BUG_ON(cond) assert(!(cond))
>
> -/**
> - * TODO
These go back all the way to the initial commit.
> - *
> - * 0) make errors meaningful again
The errors weren't meaningless, just bad. The "again" is kind of funny,
considering it's the initial commit.
I agree with removing the line, but i wonder what exactly improved
beyond line and column ("geometry information").
> - * 1) add geometry information to tokens
Tokens had "geometry information" x, y since day one. Delete.
> - * 3) should we return a parsed size?
Why we'd be interesed in the parsed size we no longer know. Delete.
> - * 4) deal with premature EOI
Fixed in commit 11e8a46cc35 (json-parser: detect premature EOI) almost
fourteen years ago. Delete.
> - */
> -
> static inline JSONParserStackEntry *current_entry(JSONParserContext *ctxt)
> {
> return g_queue_peek_tail(ctxt->stack);
> @@ -105,7 +96,8 @@ static void G_GNUC_PRINTF(3, 4) parse_error(JSONParserContext *ctxt,
> va_start(ap, msg);
> vsnprintf(message, sizeof(message), msg, ap);
> va_end(ap);
> - error_setg(&ctxt->err, "JSON parse error, %s", message);
> + error_setg(&ctxt->err, "JSON parse error at line %d, column %d, %s",
> + token->y, token->x, message);
The usual format used by code parsing tools such as compilers is like
"%d:%d: JSON parse error: %s, token->x, token->y, message
Should we use it?
> }
>
> static int cvt4hex(const char *s)
^ permalink raw reply [flat|nested] 26+ messages in thread* Re: [PATCH 5/5] json-parser: add location to JSON parsing errors
2026-02-10 9:21 ` Markus Armbruster
@ 2026-02-10 9:44 ` Paolo Bonzini
2026-02-11 7:18 ` Markus Armbruster
0 siblings, 1 reply; 26+ messages in thread
From: Paolo Bonzini @ 2026-02-10 9:44 UTC (permalink / raw)
To: Markus Armbruster; +Cc: qemu-devel
On 2/10/26 10:21, Markus Armbruster wrote:
>> - *
>> - * 0) make errors meaningful again
>
> The errors weren't meaningless, just bad. The "again" is kind of funny,
> considering it's the initial commit.
>
> I agree with removing the line, but i wonder what exactly improved
> beyond line and column ("geometry information").
I think it's more of a reference to the error recovery. There's only so
many ways to report errors for a grammar as simple as JSON.
But just for kicks I looked at the very first implementation and the
current state, and diff-ed the errors. These are the differences:
renamed:
"expected separator in dict" -> "expected ',' or '}'"
"expected separator in list" -> "expected ',' or ']'"
"missing : in object pair" -> "expected ':'"
removed (merged with "expecting value" or "premature end of input"):
"Missing value in dict"
split (out of just "expecting value"):
"invalid interpolation '%s'"
new errors:
"can't interpolate into string"
"%.*s is not a valid Unicode character"
"duplicate key"
"invalid UTF-8 sequence in string"
"premature end of input"
So I don't know if they're "meaningful again", but there were some
changes over the years and the current set seems good enough to me!
Paolo
^ permalink raw reply [flat|nested] 26+ messages in thread* Re: [PATCH 5/5] json-parser: add location to JSON parsing errors
2026-02-10 9:44 ` Paolo Bonzini
@ 2026-02-11 7:18 ` Markus Armbruster
0 siblings, 0 replies; 26+ messages in thread
From: Markus Armbruster @ 2026-02-11 7:18 UTC (permalink / raw)
To: Paolo Bonzini; +Cc: qemu-devel
Paolo Bonzini <pbonzini@redhat.com> writes:
> On 2/10/26 10:21, Markus Armbruster wrote:
>>> - *
>>> - * 0) make errors meaningful again
>>
>> The errors weren't meaningless, just bad. The "again" is kind of funny,
>> considering it's the initial commit.
>>
>> I agree with removing the line, but i wonder what exactly improved
>> beyond line and column ("geometry information").
>
> I think it's more of a reference to the error recovery. There's only so
> many ways to report errors for a grammar as simple as JSON.
Point!
> But just for kicks I looked at the very first implementation and the
> current state, and diff-ed the errors. These are the differences:
>
> renamed:
> "expected separator in dict" -> "expected ',' or '}'"
> "expected separator in list" -> "expected ',' or ']'"
> "missing : in object pair" -> "expected ':'"
>
> removed (merged with "expecting value" or "premature end of input"):
> "Missing value in dict"
>
> split (out of just "expecting value"):
> "invalid interpolation '%s'"
Improvements.
> new errors:
> "can't interpolate into string"
> "%.*s is not a valid Unicode character"
> "duplicate key"
> "invalid UTF-8 sequence in string"
> "premature end of input"
Bug fixes :)
Thanks!
> So I don't know if they're "meaningful again", but there were some
> changes over the years and the current set seems good enough to me!
Yes. And further improvement, if desired, wouldn't be this series' job
anyway.
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH 0/5] qobject: switch JSON parser to push
2026-01-07 8:48 [PATCH 0/5] qobject: switch JSON parser to push Paolo Bonzini
` (4 preceding siblings ...)
2026-01-07 8:48 ` [PATCH 5/5] json-parser: add location to JSON parsing errors Paolo Bonzini
@ 2026-01-21 5:57 ` Paolo Bonzini
2026-01-30 13:00 ` Markus Armbruster
6 siblings, 0 replies; 26+ messages in thread
From: Paolo Bonzini @ 2026-01-21 5:57 UTC (permalink / raw)
To: qemu-devel; +Cc: Armbruster, Markus
[-- Attachment #1: Type: text/plain, Size: 1727 bytes --]
Ping.
Paolo
Il mer 7 gen 2026, 09:48 Paolo Bonzini <pbonzini@redhat.com> ha scritto:
> This rewrites the json-parser to use a push parser aka state machine.
> While push parsers are inherently more complex than recursive descent,
> the grammar for JSON is simple enough that the parser remains readable.
> There is therefore no need to use e.g. QEMU coroutines.
>
> Unlike the suggestion in commit 62815d85aed ("json: Redesign the callback
> to consume JSON values", 2018-08-24), I kept the json-streamer concept.
> It helps in handling input limits, it performs error recovery, and it
> converts the token-at-a-time push interface to callbacks---all things
> that are more easily done in a separate layer to keep the parser clean.
> However, there is no need anymore for it to store partial JSON objects
> in tokenized form.
>
> Another benefit is that QEMU can report the first parsing error
> immediately, without waiting for delimiters to be balanced.
>
> On top of the benefits intrinsic in the push architecture, it so happens
> that it's really easy to add a location to JSON parsing errors now, so
> do that as well.
>
> Paolo
>
>
> Paolo Bonzini (5):
> json-parser: pass around lookahead token, constify
> json-parser: replace with a push parser
> json-streamer: remove token queue
> json-streamer: do not heap-allocate JSONToken
> json-parser: add location to JSON parsing errors
>
> include/qobject/json-parser.h | 12 +-
> qobject/json-parser-int.h | 13 +-
> qobject/json-lexer.c | 11 +-
> qobject/json-parser.c | 493 ++++++++++++++++------------------
> qobject/json-streamer.c | 107 ++++----
> 5 files changed, 310 insertions(+), 326 deletions(-)
>
> --
> 2.52.0
>
[-- Attachment #2: Type: text/html, Size: 2246 bytes --]
^ permalink raw reply [flat|nested] 26+ messages in thread* Re: [PATCH 0/5] qobject: switch JSON parser to push
2026-01-07 8:48 [PATCH 0/5] qobject: switch JSON parser to push Paolo Bonzini
` (5 preceding siblings ...)
2026-01-21 5:57 ` [PATCH 0/5] qobject: switch JSON parser to push Paolo Bonzini
@ 2026-01-30 13:00 ` Markus Armbruster
2026-01-30 13:36 ` Paolo Bonzini
6 siblings, 1 reply; 26+ messages in thread
From: Markus Armbruster @ 2026-01-30 13:00 UTC (permalink / raw)
To: Paolo Bonzini; +Cc: qemu-devel, armbru
Paolo Bonzini <pbonzini@redhat.com> writes:
> This rewrites the json-parser to use a push parser aka state machine.
> While push parsers are inherently more complex than recursive descent,
> the grammar for JSON is simple enough that the parser remains readable.
> There is therefore no need to use e.g. QEMU coroutines.
>
> Unlike the suggestion in commit 62815d85aed ("json: Redesign the callback
> to consume JSON values", 2018-08-24), I kept the json-streamer concept.
> It helps in handling input limits, it performs error recovery, and it
> converts the token-at-a-time push interface to callbacks---all things
> that are more easily done in a separate layer to keep the parser clean.
> However, there is no need anymore for it to store partial JSON objects
> in tokenized form.
>
> Another benefit is that QEMU can report the first parsing error
> immediately, without waiting for delimiters to be balanced.
Sounds promising! Let's see...
Before the series:
$ socat "READLINE,prompt=QMP> " UNIX-CONNECT:$HOME/work/images/test-qmp
{"QMP": {"version": {"qemu": {"micro": 50, "minor": 2, "major": 10}, "package": "v10.2.0-567-gfb6b66de43-dirty"}, "capabilities": ["oob"]}}
QMP> [{"a"]
Parse error not diagnosed right away, but ...
QMP> }
{"error": {"class": "GenericError", "desc": "JSON parse error, missing : in object pair"}}
.... only when the streamer decides the expression is complete.
After the series:
QMP> [{"a"]
{"error": {"class": "GenericError", "desc": "JSON parse error at line 1, column 6, expecting ':'"}}
Cool! However, if I do it again, things fall apart:
QMP> [{"a"
QMP> }
QMP> }
QMP> }
QMP> ]
QMP> ]
{"error": {"class": "GenericError", "desc": "JSON parse error at line 7, column 1, expecting value"}}
Parse error recovery not quite right?
> On top of the benefits intrinsic in the push architecture, it so happens
> that it's really easy to add a location to JSON parsing errors now, so
> do that as well.
>
> Paolo
^ permalink raw reply [flat|nested] 26+ messages in thread* Re: [PATCH 0/5] qobject: switch JSON parser to push
2026-01-30 13:00 ` Markus Armbruster
@ 2026-01-30 13:36 ` Paolo Bonzini
2026-02-10 13:06 ` Markus Armbruster
0 siblings, 1 reply; 26+ messages in thread
From: Paolo Bonzini @ 2026-01-30 13:36 UTC (permalink / raw)
To: Markus Armbruster; +Cc: qemu-devel
[-- Attachment #1: Type: text/plain, Size: 2092 bytes --]
Il ven 30 gen 2026, 14:00 Markus Armbruster <armbru@redhat.com> ha scritto:
> > Another benefit is that QEMU can report the first parsing error
> > immediately, without waiting for delimiters to be balanced.
>
> Sounds promising! Let's see...
>
> Before the series:
>
> $ socat "READLINE,prompt=QMP> "
> UNIX-CONNECT:$HOME/work/images/test-qmp
> {"QMP": {"version": {"qemu": {"micro": 50, "minor": 2, "major": 10},
> "package": "v10.2.0-567-gfb6b66de43-dirty"}, "capabilities": ["oob"]}}
> QMP> [{"a"]
>
> Parse error not diagnosed right away, but ...
>
> QMP> }
> {"error": {"class": "GenericError", "desc": "JSON parse error, missing
> : in object pair"}}
>
> .... only when the streamer decides the expression is complete.
>
> After the series:
>
> QMP> [{"a"]
> {"error": {"class": "GenericError", "desc": "JSON parse error at line
> 1, column 6, expecting ':'"}}
>
> Cool! However, if I do it again, things fall apart:
>
> QMP> [{"a"
> QMP> }
> QMP> }
> QMP> }
> QMP> ]
> QMP> ]
> {"error": {"class": "GenericError", "desc": "JSON parse error at line
> 7, column 1, expecting value"}}
>
> Parse error recovery not quite right?
>
Well, if you read the above very carefully :) the error is *reported*
immediately, but recovery still waits for delimiters to be balanced.
In testing, when I got an error I just typed a long enough variant on
"]}]}]}" and that is enough to recover—just like in the old parser. Still
the difference in error reporting matters, because it gives feedback that
is immediately useful, rather than possibly delayed forever.
The policy is easy to change, either in v2 or in subsequent work, because
recovery is layered on top of json-parser and its code is nothing more than
"if you believe it's a good time to recover, reset the parser".
Paolo
> On top of the benefits intrinsic in the push architecture, it so happens
> > that it's really easy to add a location to JSON parsing errors now, so
> > do that as well.
> >
> > Paolo
>
>
[-- Attachment #2: Type: text/html, Size: 3318 bytes --]
^ permalink raw reply [flat|nested] 26+ messages in thread* Re: [PATCH 0/5] qobject: switch JSON parser to push
2026-01-30 13:36 ` Paolo Bonzini
@ 2026-02-10 13:06 ` Markus Armbruster
2026-02-10 13:12 ` Paolo Bonzini
0 siblings, 1 reply; 26+ messages in thread
From: Markus Armbruster @ 2026-02-10 13:06 UTC (permalink / raw)
To: Paolo Bonzini; +Cc: Markus Armbruster, qemu-devel
Paolo Bonzini <pbonzini@redhat.com> writes:
> Il ven 30 gen 2026, 14:00 Markus Armbruster <armbru@redhat.com> ha scritto:
>
>> > Another benefit is that QEMU can report the first parsing error
>> > immediately, without waiting for delimiters to be balanced.
>>
>> Sounds promising! Let's see...
>>
>> Before the series:
>>
>> $ socat "READLINE,prompt=QMP> " UNIX-CONNECT:$HOME/work/images/test-qmp
>> {"QMP": {"version": {"qemu": {"micro": 50, "minor": 2, "major": 10}, "package": "v10.2.0-567-gfb6b66de43-dirty"}, "capabilities": ["oob"]}}
>> QMP> [{"a"]
>>
>> Parse error not diagnosed right away, but ...
>>
>> QMP> }
>> {"error": {"class": "GenericError", "desc": "JSON parse error, missing : in object pair"}}
>>
>> .... only when the streamer decides the expression is complete.
>>
>> After the series:
>>
>> QMP> [{"a"]
>> {"error": {"class": "GenericError", "desc": "JSON parse error at line 1, column 6, expecting ':'"}}
>>
>> Cool! However, if I do it again, things fall apart:
>>
>> QMP> [{"a"
>> QMP> }
>> QMP> }
>> QMP> }
>> QMP> ]
>> QMP> ]
>> {"error": {"class": "GenericError", "desc": "JSON parse error at line 7, column 1, expecting value"}}
>>
>> Parse error recovery not quite right?
>
> Well, if you read the above very carefully :) the error is *reported*
> immediately, but recovery still waits for delimiters to be balanced.
Yes, but the recovery regressed a bit.
Having read the entire series now, I wonder whether it's related to
capping @brace_count and @bracket_count at zero [PATCH 3].
My example input [{"a"] puts the parser in error recovery state with
brace_count == 1, bracket_count == 0. Your parser will silently throw
away further input until brace_count drops to zero.
The old parser additionally snaps out of this unresponsive state when
bracked_count goes negative.
> In testing, when I got an error I just typed a long enough variant on
> "]}]}]}" and that is enough to recover—just like in the old parser.
With the old parser, I can park my finger on ']' or '}' for a moment,
hit return, and be good. With the new one, I have to park on both.
More seriously, the new parser seems to break a recovery technique
documented in docs/interop/qmp-spec.rst:
Forcing the JSON parser into known-good state
---------------------------------------------
Incomplete or invalid input can leave the server's JSON parser in a
state where it can't parse additional commands. To get it back into
known-good state, the client should provoke a lexical error.
The cleanest way to do that is sending an ASCII control character
other than ``\t`` (horizontal tab), ``\r`` (carriage return), or
``\n`` (new line).
Works with old parser:
QMP> [{"a"]
QMP> ^L
{"error": {"class": "GenericError", "desc": "JSON parse error, stray '\f'"}}
QMP> {}
{"error": {"class": "GenericError", "desc": "QMP input lacks member 'execute'"}}
Note: the ^L is a formfeed character.
New parser:
QMP> [{"a"]
{"error": {"class": "GenericError", "desc": "JSON parse error at line 1, column 6, expecting ':'"}}
QMP> ^L
QMP> {}
Good: the parse error is reported immediately.
Bad: the formfeed no longer forces the parser into known-good state.
This needs fixing.
> Still
> the difference in error reporting matters, because it gives feedback that
> is immediately useful, rather than possibly delayed forever.
Reporting parse errors immediately is such a lovely quality of life
improvement. May I have that without regressing error recovery?
> The policy is easy to change, either in v2 or in subsequent work, because
> recovery is layered on top of json-parser and its code is nothing more than
> "if you believe it's a good time to recover, reset the parser".
>
> Paolo
>> > On top of the benefits intrinsic in the push architecture, it so happens
>> > that it's really easy to add a location to JSON parsing errors now, so
>> > do that as well.
>> >
>> > Paolo
>>
>>
^ permalink raw reply [flat|nested] 26+ messages in thread* Re: [PATCH 0/5] qobject: switch JSON parser to push
2026-02-10 13:06 ` Markus Armbruster
@ 2026-02-10 13:12 ` Paolo Bonzini
2026-02-10 15:52 ` Markus Armbruster
0 siblings, 1 reply; 26+ messages in thread
From: Paolo Bonzini @ 2026-02-10 13:12 UTC (permalink / raw)
To: Markus Armbruster; +Cc: qemu-devel
On Tue, Feb 10, 2026 at 2:06 PM Markus Armbruster <armbru@redhat.com> wrote:
>
> Paolo Bonzini <pbonzini@redhat.com> writes:
> > Still
> > the difference in error reporting matters, because it gives feedback that
> > is immediately useful, rather than possibly delayed forever.
>
> Reporting parse errors immediately is such a lovely quality of life
> improvement. May I have that without regressing error recovery?
If it is a regression, of course.
Especially the one for ^L, which needs a testcase. Something like this
should do:
diff --git a/qobject/json-streamer.c b/qobject/json-streamer.c
index f6380684897..287a345b49a 100644
--- a/qobject/json-streamer.c
+++ b/qobject/json-streamer.c
@@ -35,17 +35,25 @@
parser->brace_count++;
break;
case JSON_RCURLY:
- if (parser->brace_count > 0) {
- parser->brace_count--;
+ if (parser->brace_count <= 0) {
+ goto end_error_recovery;
}
+ parser->brace_count--;
break;
case JSON_LSQUARE:
parser->bracket_count++;
break;
case JSON_RSQUARE:
- if (parser->bracket_count > 0) {
- parser->bracket_count--;
+ if (parser->bracket_count <= 0) {
+ goto end_error_recovery;
}
+ parser->bracket_count--;
+ break;
+ case JSON_ERROR:
+ end_error_recovery:
+ parser->brace_count = 0;
+ parser->bracket_count = 0;
break;
default:
break;
Thanks,
Paolo
^ permalink raw reply related [flat|nested] 26+ messages in thread* Re: [PATCH 0/5] qobject: switch JSON parser to push
2026-02-10 13:12 ` Paolo Bonzini
@ 2026-02-10 15:52 ` Markus Armbruster
0 siblings, 0 replies; 26+ messages in thread
From: Markus Armbruster @ 2026-02-10 15:52 UTC (permalink / raw)
To: Paolo Bonzini; +Cc: Markus Armbruster, qemu-devel
Paolo Bonzini <pbonzini@redhat.com> writes:
> On Tue, Feb 10, 2026 at 2:06 PM Markus Armbruster <armbru@redhat.com> wrote:
>>
>> Paolo Bonzini <pbonzini@redhat.com> writes:
>> > Still
>> > the difference in error reporting matters, because it gives feedback that
>> > is immediately useful, rather than possibly delayed forever.
>>
>> Reporting parse errors immediately is such a lovely quality of life
>> improvement. May I have that without regressing error recovery?
>
> If it is a regression, of course.
>
> Especially the one for ^L, which needs a testcase.
I totally should've added one when I documented the technique back in
2018.
> Something like this
> should do:
>
> diff --git a/qobject/json-streamer.c b/qobject/json-streamer.c
> index f6380684897..287a345b49a 100644
> --- a/qobject/json-streamer.c
> +++ b/qobject/json-streamer.c
> @@ -35,17 +35,25 @@
> parser->brace_count++;
> break;
> case JSON_RCURLY:
> - if (parser->brace_count > 0) {
> - parser->brace_count--;
> + if (parser->brace_count <= 0) {
> + goto end_error_recovery;
> }
> + parser->brace_count--;
> break;
> case JSON_LSQUARE:
> parser->bracket_count++;
> break;
> case JSON_RSQUARE:
> - if (parser->bracket_count > 0) {
> - parser->bracket_count--;
> + if (parser->bracket_count <= 0) {
> + goto end_error_recovery;
> }
> + parser->bracket_count--;
> + break;
> + case JSON_ERROR:
> + end_error_recovery:
> + parser->brace_count = 0;
> + parser->bracket_count = 0;
> break;
> default:
> break;
Passes a quick smoke test!
Fine print: different errors can get reported, but that's probably a
slight improvement.
^ permalink raw reply [flat|nested] 26+ messages in thread