All of lore.kernel.org
 help / color / mirror / Atom feed
From: John Cai <johncai86@gmail.com>
To: git@vger.kernel.org
Cc: avarab@gmail.com, me@ttaylorr.com, phillip.wood123@gmail.com,
	John Cai <johncai86@gmail.com>
Subject: [RFC v2] cat-file: add a --stdin-cmd mode
Date: Tue, 25 Jan 2022 17:50:08 -0500	[thread overview]
Message-ID: <20220125225008.45944-1-johncai86@gmail.com> (raw)

This RFC patch proposes a new flag --stdin-cmd that works with
git-cat-file --batch. Similar to git-update-ref --stdin, it will accept
commands and arguments from stdin.

The start of this idea was discussed in [1], where the original
motivation was to be able to control when the buffer was flushed to
stdout in --buffer mode.

However, this can actually be much more useful in situations when
git-cat-file --batch is being used as a long lived backend query
process. At GitLab, we use a pair of cat-file processes. One for
iterating over object metadata with --batch-check, and the other to grab
object contents with --batch. However, if we had --stdin-cmd, we could
get rid of the second --batch-check process, and just have one progress
where we can flip between getting object info, and getting object contents.
This can lead to huge savings.

git cat-file --batch --stdin-cmd

$ <command> [arg1] [arg2] NL

We can also add a -z mode to allow for NUL-terminated lines

$ <command> [arg1] [arg2] NUL

This patch adds three commands: object, info, fflush

$ object <sha1> NL
$ info <sha1> NL
$ fflush NL

These three would be immediately useful in GitLab's context, but one can
imagine this mode to be further extended for other things.

For instance, a non-trivial part of "cat-file --batch" time is spent
on parsing its argument and seeing if it's a revision, ref etc. So we
could add a command that only accepts a full-length 40
character SHA-1.

This would be the first step in adding such an interface to
git-cat-file.

[1] https://lore.kernel.org/git/pull.1124.git.git.1636149400.gitgitgadget@gmail.com/

Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: John Cai <johncai86@gmail.com>
---
Changes from v1:

- changed option name to batch-command.
- changed command function interface to receive the whole line after the command
  name to put the onus of parsing arguments to each individual command function.
- pass in whole line to batch_one_object in both parse_cmd_object and
  parse_cmd_info to support spaces in the object reference.
- removed addition of -z to include in a separate patch series
- added documentation.
---
 Documentation/git-cat-file.txt |  15 +++++
 builtin/cat-file.c             | 114 ++++++++++++++++++++++++++++++++-
 strvec.c                       |  23 +++++++
 strvec.h                       |   8 +++
 t/t1006-cat-file.sh            |  32 +++++++++
 5 files changed, 191 insertions(+), 1 deletion(-)

diff --git a/Documentation/git-cat-file.txt b/Documentation/git-cat-file.txt
index bef76f4dd0..8aefa45e4c 100644
--- a/Documentation/git-cat-file.txt
+++ b/Documentation/git-cat-file.txt
@@ -96,6 +96,21 @@ OPTIONS
 	need to specify the path, separated by whitespace.  See the
 	section `BATCH OUTPUT` below for details.
 
+-batch-command::
+	Enter a command mode that reads from stdin. May not be combined with any
+	other options or arguments except `--textconv` or `--filters`, in which
+	case the input lines also need to specify the path, separated by
+	whitespace.  See the section `BATCH OUTPUT` below for details.
+
+object <object>::
+	Print object contents for object reference <object>
+
+info <object>::
+	Print object info for object reference <object>
+
+flush::
+	Flush to stdout immediately when used with --buffer
+
 --batch-all-objects::
 	Instead of reading a list of objects on stdin, perform the
 	requested batch operation on all objects in the repository and
diff --git a/builtin/cat-file.c b/builtin/cat-file.c
index 7b3f42950e..30794284d5 100644
--- a/builtin/cat-file.c
+++ b/builtin/cat-file.c
@@ -16,6 +16,7 @@
 #include "packfile.h"
 #include "object-store.h"
 #include "promisor-remote.h"
+#include "strvec.h"
 
 struct batch_options {
 	int enabled;
@@ -26,7 +27,10 @@ struct batch_options {
 	int unordered;
 	int cmdmode; /* may be 'w' or 'c' for --filters or --textconv */
 	const char *format;
+	int stdin_cmd;
+	int end_null;
 };
+static char line_termination = '\n';
 
 static const char *force_path;
 
@@ -508,6 +512,102 @@ static int batch_unordered_packed(const struct object_id *oid,
 				      data);
 }
 
+enum batch_state {
+	/* Non-transactional state open for commands. */
+	BATCH_STATE_OPEN,
+};
+
+static void parse_cmd_object(struct batch_options *opt,
+			     const char *line,
+			     struct strbuf *output,
+			     struct expand_data *data)
+{
+	opt->print_contents = 1;
+	batch_one_object(line, output, opt, data);
+}
+
+static void parse_cmd_info(struct batch_options *opt,
+			   const char *line,
+			   struct strbuf *output,
+			   struct expand_data *data)
+{
+	opt->print_contents = 0;
+	batch_one_object(line, output, opt, data);
+}
+
+static void parse_cmd_fflush(struct batch_options *opt,
+			     const char *line,
+			     struct strbuf *output,
+			     struct expand_data *data)
+{
+	fflush(stdout);
+}
+
+typedef void (*parse_cmd_fn_t)(struct batch_options *, const char *,
+			       struct strbuf *, struct expand_data *);
+
+static const struct parse_cmd {
+	const char *prefix;
+	parse_cmd_fn_t fn;
+	unsigned args;
+	enum batch_state state;
+} commands[] = {
+	{ "object", parse_cmd_object, 1, BATCH_STATE_OPEN },
+	{ "info", parse_cmd_info, 1, BATCH_STATE_OPEN },
+	{ "fflush", parse_cmd_fflush, 0, BATCH_STATE_OPEN },
+};
+
+static void batch_objects_stdin_cmd(struct batch_options *opt,
+				    struct strbuf *output,
+				    struct expand_data *data)
+{
+	struct strbuf input = STRBUF_INIT;
+	enum batch_state state = BATCH_STATE_OPEN;
+
+	/* Read each line dispatch its command */
+	while (!strbuf_getwholeline(&input, stdin, line_termination)) {
+		int i;
+		const struct parse_cmd *cmd = NULL;
+		const char *p;
+
+		if (*input.buf == line_termination)
+			die("empty command in input");
+		else if (isspace(*input.buf))
+			die("whitespace before command: %s", input.buf);
+
+		for (i = 0; i < ARRAY_SIZE(commands); i++) {
+			const char *prefix = commands[i].prefix;
+			char c;
+			const char *cmd_name;
+			if (!skip_prefix(input.buf, prefix, &cmd_name))
+				continue;
+			/*
+			 * If the command has arguments, verify that it's
+			 * followed by a space. Otherwise, it shall be followed
+			 * by a line terminator.
+			 */
+			c = commands[i].args ? ' ' : line_termination;
+			if (input.buf[strlen(prefix)] != c)
+				continue;
+
+			cmd = &commands[i];
+			break;
+		}
+		if (!cmd)
+			die("unknown command: %s", input.buf);
+
+		p = input.buf + strlen(cmd->prefix) + 1;
+		const char *pos = strstr(p, &line_termination);
+
+		switch (state) {
+		case BATCH_STATE_OPEN:
+			break;
+		}
+		cmd->fn(opt, xstrndup(p, pos-p), output, data);
+	}
+	strbuf_release(&input);
+}
+
 static int batch_objects(struct batch_options *opt)
 {
 	struct strbuf input = STRBUF_INIT;
@@ -515,6 +615,7 @@ static int batch_objects(struct batch_options *opt)
 	struct expand_data data;
 	int save_warning;
 	int retval = 0;
+	const int stdin_cmd = opt->stdin_cmd;
 
 	if (!opt->format)
 		opt->format = "%(objectname) %(objecttype) %(objectsize)";
@@ -590,7 +691,8 @@ static int batch_objects(struct batch_options *opt)
 	save_warning = warn_on_object_refname_ambiguity;
 	warn_on_object_refname_ambiguity = 0;
 
-	while (strbuf_getline(&input, stdin) != EOF) {
+	while (!stdin_cmd &&
+	       strbuf_getline(&input, stdin) != EOF) {
 		if (data.split_on_whitespace) {
 			/*
 			 * Split at first whitespace, tying off the beginning
@@ -608,6 +710,9 @@ static int batch_objects(struct batch_options *opt)
 		batch_one_object(input.buf, &output, opt, &data);
 	}
 
+	if (stdin_cmd)
+		batch_objects_stdin_cmd(opt, &output, &data);
+
 	strbuf_release(&input);
 	strbuf_release(&output);
 	warn_on_object_refname_ambiguity = save_warning;
@@ -636,6 +741,7 @@ static int batch_option_callback(const struct option *opt,
 
 	bo->enabled = 1;
 	bo->print_contents = !strcmp(opt->long_name, "batch");
+	bo->stdin_cmd = !strcmp(opt->long_name, "batch-command");
 	bo->format = arg;
 
 	return 0;
@@ -683,6 +789,10 @@ int cmd_cat_file(int argc, const char **argv, const char *prefix)
 			N_("like --batch, but don't emit <contents>"),
 			PARSE_OPT_OPTARG | PARSE_OPT_NONEG,
 			batch_option_callback),
+		OPT_CALLBACK_F(0, "batch-command", &batch, N_(""),
+			 N_("enters batch mode that accepts commands"),
+			 PARSE_OPT_NOARG | PARSE_OPT_NONEG,
+			 batch_option_callback),
 		OPT_CMDMODE(0, "batch-all-objects", &opt,
 			    N_("with --batch[-check]: ignores stdin, batches all known objects"), 'b'),
 		/* Batch-specific options */
@@ -738,6 +848,8 @@ int cmd_cat_file(int argc, const char **argv, const char *prefix)
 	/* Batch defaults */
 	if (batch.buffer_output < 0)
 		batch.buffer_output = batch.all_objects;
+	if (batch.end_null)
+		line_termination = '\0';
 
 	/* Return early if we're in batch mode? */
 	if (batch.enabled) {
diff --git a/strvec.c b/strvec.c
index 61a76ce6cb..7dca04bf7a 100644
--- a/strvec.c
+++ b/strvec.c
@@ -85,6 +85,29 @@ void strvec_split(struct strvec *array, const char *to_split)
 	}
 }
 
+size_t strvec_split_delim(struct strvec *array, const char *string,
+			  int delim, int maxsplit)
+{
+	size_t count = 0;
+	const char *p = string, *end;
+
+	for (;;) {
+		count++;
+		if (maxsplit >= 0 && count > maxsplit) {
+			strvec_push(array, p);
+			return count;
+		}
+		end = strchr(p, delim);
+		if (end) {
+			strvec_push_nodup(array, xmemdupz(p, end - p));
+			p = end + 1;
+		} else {
+			strvec_push(array, p);
+			return count;
+		}
+	}
+}
+
 void strvec_clear(struct strvec *array)
 {
 	if (array->v != empty_strvec) {
diff --git a/strvec.h b/strvec.h
index 9f55c8766b..c474918b91 100644
--- a/strvec.h
+++ b/strvec.h
@@ -73,6 +73,14 @@ void strvec_pop(struct strvec *);
 /* Splits by whitespace; does not handle quoted arguments! */
 void strvec_split(struct strvec *, const char *);
 
+/**
+ * strvec_split_delim() is a split function that behaves more like its
+ * string_list_split() cousin than the whitespace-splitting
+ * strvec_split().
+ */
+size_t strvec_split_delim(struct strvec *array, const char *string,
+			  int delim, int maxsplit);
+
 /**
  * Free all memory associated with the array and return it to the
  * initial, empty state.
diff --git a/t/t1006-cat-file.sh b/t/t1006-cat-file.sh
index 145eee11df..935ab1cd34 100755
--- a/t/t1006-cat-file.sh
+++ b/t/t1006-cat-file.sh
@@ -964,4 +964,36 @@ test_expect_success 'cat-file --batch-all-objects --batch-check ignores replace'
 	test_cmp expect actual
 '
 
+F='%s\0'
+
+test_expect_success 'batch-command unknown command' '
+	echo unknown_command >cmd &&
+	test_expect_code 128 git cat-file --batch-command < cmd 2>err &&
+	grep -E "^fatal:.*unknown command.*" err
+'
+
+test_expect_success 'setup object data' '
+	content="Object Data" &&
+	size=$(strlen "$content") &&
+	sha1=$(echo_without_newline "$content" | git hash-object -w --stdin)
+'
+
+test_expect_success 'batch-command calling object works' '
+	echo "object $sha1" | git cat-file --batch-command >actual &&
+	echo "$sha1 blob $size" >expect &&
+	echo `git cat-file -p "$sha1"` >>expect &&
+	test_cmp expect actual
+'
+
+test_expect_success 'batch-command calling info works' '
+	echo "info $sha1" | git cat-file --batch-command >actual &&
+	echo "$sha1 blob $size" >expect &&
+	test_cmp expect actual
+'
+
+test_expect_success 'batch-command fflush works' '
+	printf "fflush\n" > cmd &&
+	test_expect_code 0 git cat-file --batch-command < cmd 2>err
+'
+
 test_done
-- 
2.34.1


             reply	other threads:[~2022-01-25 22:51 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-01-25 22:50 John Cai [this message]
2022-01-25 23:00 ` [RFC v2] cat-file: add a --stdin-cmd mode John Cai
2022-01-27 11:25 ` Phillip Wood
2022-01-27 21:04   ` John Cai
2022-01-28  4:16     ` John Cai
2022-01-28  6:07 ` Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220125225008.45944-1-johncai86@gmail.com \
    --to=johncai86@gmail.com \
    --cc=avarab@gmail.com \
    --cc=git@vger.kernel.org \
    --cc=me@ttaylorr.com \
    --cc=phillip.wood123@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.