git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Patrick Steinhardt <ps@pks.im>
To: Derrick Stolee via GitGitGadget <gitgitgadget@gmail.com>
Cc: git@vger.kernel.org, gitster@pobox.com,
	johannes.schindelin@gmx.de, peff@peff.net, me@ttaylorr.com,
	johncai86@gmail.com, newren@gmail.com,
	christian.couder@gmail.com, kristofferhaugsbakk@fastmail.com,
	jonathantanmy@google.com, karthik nayak <karthik.188@gmail.com>,
	Derrick Stolee <stolee@gmail.com>
Subject: Re: [PATCH v3 1/7] path-walk: introduce an object walk by path
Date: Fri, 13 Dec 2024 12:58:41 +0100	[thread overview]
Message-ID: <Z1whcUJE-MHAhULO@pks.im> (raw)
In-Reply-To: <b7e9b81e8b32313f00d38257ba731e73d17224cb.1733514358.git.gitgitgadget@gmail.com>

On Fri, Dec 06, 2024 at 07:45:52PM +0000, Derrick Stolee via GitGitGadget wrote:
> --- /dev/null
> +++ b/path-walk.c
> @@ -0,0 +1,263 @@
> +/*
> + * path-walk.c: implementation for path-based walks of the object graph.
> + */
> +#include "git-compat-util.h"
> +#include "path-walk.h"
> +#include "blob.h"
> +#include "commit.h"
> +#include "dir.h"
> +#include "hashmap.h"
> +#include "hex.h"
> +#include "object.h"
> +#include "oid-array.h"
> +#include "revision.h"
> +#include "string-list.h"
> +#include "strmap.h"
> +#include "trace2.h"
> +#include "tree.h"
> +#include "tree-walk.h"
> +
> +struct type_and_oid_list
> +{
> +	enum object_type type;
> +	struct oid_array oids;
> +};

Nit: formatting of this struct is off.

> +static void push_to_stack(struct path_walk_context *ctx,
> +			  const char *path)
> +{
> +	if (strset_contains(&ctx->path_stack_pushed, path))
> +		return;
> +
> +	strset_add(&ctx->path_stack_pushed, path);
> +	string_list_append(&ctx->path_stack, path);
> +}
> +
> +static int add_children(struct path_walk_context *ctx,
> +			const char *base_path,
> +			struct object_id *oid)
> +{

So this function assumes that `oid` always refers to a tree? I think it
would make sense to clarify this by calling the function accordingly,
like e.g. `add_tree_entries()`.

> +	struct tree_desc desc;
> +	struct name_entry entry;
> +	struct strbuf path = STRBUF_INIT;
> +	size_t base_len;
> +	struct tree *tree = lookup_tree(ctx->repo, oid);
> +
> +	if (!tree) {
> +		error(_("failed to walk children of tree %s: not found"),
> +		      oid_to_hex(oid));
> +		return -1;
> +	} else if (parse_tree_gently(tree, 1)) {
> +		die("bad tree object %s", oid_to_hex(oid));

I wonder whether we maybe shouldn't die but instead return an error in
the spirit of libification.

> +	}
> +	strbuf_addstr(&path, base_path);
> +	base_len = path.len;
> +
> +	parse_tree(tree);
> +	init_tree_desc(&desc, &tree->object.oid, tree->buffer, tree->size);
> +	while (tree_entry(&desc, &entry)) {
> +		struct type_and_oid_list *list;
> +		struct object *o;
> +		/* Not actually true, but we will ignore submodules later. */
> +		enum object_type type = S_ISDIR(entry.mode) ? OBJ_TREE : OBJ_BLOB;
> +
> +		/* Skip submodules. */
> +		if (S_ISGITLINK(entry.mode))
> +			continue;
> +
> +		if (type == OBJ_TREE) {
> +			struct tree *child = lookup_tree(ctx->repo, &entry.oid);
> +			o = child ? &child->object : NULL;
> +		} else if (type == OBJ_BLOB) {
> +			struct blob *child = lookup_blob(ctx->repo, &entry.oid);
> +			o = child ? &child->object : NULL;
> +		} else {
> +			/* Wrong type? */
> +			continue;

This code is unreachable, so we could make this a `BUG()`. Might also
use a switch instead, but that's more of a stylistic question.

> +		}
> +
> +		if (!o) /* report error?*/
> +			continue;

So this can only happen in case `lookup_tree()` or `lookup_blob()`
run into an error. I think this error should definitely be bubbled up so
that we don't silently skip tree entries in case of repo corruption.

> +		strbuf_setlen(&path, base_len);
> +		strbuf_add(&path, entry.path, entry.pathlen);
> +
> +		/*
> +		 * Trees will end with "/" for concatenation and distinction
> +		 * from blobs at the same path.
> +		 */
> +		if (type == OBJ_TREE)
> +			strbuf_addch(&path, '/');
> +
> +		if (!(list = strmap_get(&ctx->paths_to_lists, path.buf))) {
> +			CALLOC_ARRAY(list, 1);
> +			list->type = type;
> +			strmap_put(&ctx->paths_to_lists, path.buf, list);
> +		}
> +		push_to_stack(ctx, path.buf);
> +
> +		/* Skip this object if already seen. */
> +		if (o->flags & SEEN)
> +			continue;
> +		o->flags |= SEEN;

This made me wonder: why do we only skip the object this late? Couldn't
we already have done so immediately after we have looked up the object
to avoid some work? If not, it might be useful to add a comment why it
has to come this late.

> +		oid_array_append(&list->oids, &entry.oid);
> +	}
> +
> +	free_tree_buffer(tree);
> +	strbuf_release(&path);
> +	return 0;
> +}
> +
> +/*
> + * For each path in paths_to_explore, walk the trees another level
> + * and add any found blobs to the batch (but only if they exist and
> + * haven't been added yet).
> + */
> +static int walk_path(struct path_walk_context *ctx,
> +		     const char *path)
> +{
> +	struct type_and_oid_list *list;
> +	int ret = 0;

Nit: needless initialization.

> +
> +	list = strmap_get(&ctx->paths_to_lists, path);
> +
> +	if (!list->oids.nr)
> +		return 0;
> +
> +	/* Evaluate function pointer on this data. */
> +	ret = ctx->info->path_fn(path, &list->oids, list->type,
> +				 ctx->info->path_fn_data);
> +
> +	/* Expand data for children. */
> +	if (list->type == OBJ_TREE) {
> +		for (size_t i = 0; i < list->oids.nr; i++) {
> +			ret |= add_children(ctx,
> +					    path,
> +					    &list->oids.oid[i]);
> +		}
> +	}
> +
> +	oid_array_clear(&list->oids);
> +	strmap_remove(&ctx->paths_to_lists, path, 1);
> +	return ret;
> +}
> +
> +static void clear_strmap(struct strmap *map)

Nit: this isn't clearing a generic strmap, but rather `paths_to_lists`.
Should we maybe rename it to `clear_paths_to_lists()`?

> +{
> +	struct hashmap_iter iter;
> +	struct strmap_entry *e;
> +
> +	hashmap_for_each_entry(&map->map, &iter, e, ent) {
> +		struct type_and_oid_list *list = e->value;
> +		oid_array_clear(&list->oids);
> +	}
> +	strmap_clear(map, 1);
> +	strmap_init(map);
> +}
> +
> +/**
> + * Given the configuration of 'info', walk the commits based on 'info->revs' and
> + * call 'info->path_fn' on each discovered path.
> + *
> + * Returns nonzero on an error.
> + */
> +int walk_objects_by_path(struct path_walk_info *info)
> +{
> +	const char *root_path = "";
> +	int ret = 0;
> +	size_t commits_nr = 0, paths_nr = 0;
> +	struct commit *c;
> +	struct type_and_oid_list *root_tree_list;
> +	struct path_walk_context ctx = {
> +		.repo = info->revs->repo,
> +		.revs = info->revs,
> +		.info = info,
> +		.path_stack = STRING_LIST_INIT_DUP,
> +		.path_stack_pushed = STRSET_INIT,
> +		.paths_to_lists = STRMAP_INIT
> +	};
> +
> +	trace2_region_enter("path-walk", "commit-walk", info->revs->repo);
> +
> +	/* Insert a single list for the root tree into the paths. */
> +	CALLOC_ARRAY(root_tree_list, 1);
> +	root_tree_list->type = OBJ_TREE;
> +	strmap_put(&ctx.paths_to_lists, root_path, root_tree_list);
> +	push_to_stack(&ctx, root_path);
> +
> +	if (prepare_revision_walk(info->revs))
> +		die(_("failed to setup revision walk"));
> +
> +	while ((c = get_revision(info->revs))) {
> +		struct object_id *oid = get_commit_tree_oid(c);
> +		struct tree *t;
> +		commits_nr++;
> +
> +		oid = get_commit_tree_oid(c);
> +		t = lookup_tree(info->revs->repo, oid);
> +
> +		if (!t) {
> +			warning("could not find tree %s", oid_to_hex(oid));
> +			continue;
> +		}

Is this error something we should bubble up to the caller? As mentioned
above, I'm being cautious about silently accepting potentially-corrupt
data. Silent in the spirit of the caller not noticing it, not in the
sense of the user not noticing it.

Patrick

  reply	other threads:[~2024-12-13 11:59 UTC|newest]

Thread overview: 67+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-10-31  6:26 [PATCH 0/6] PATH WALK I: The path-walk API Derrick Stolee via GitGitGadget
2024-10-31  6:26 ` [PATCH 1/6] path-walk: introduce an object walk by path Derrick Stolee via GitGitGadget
2024-11-01 13:12   ` karthik nayak
2024-11-01 13:44     ` Derrick Stolee
     [not found]   ` <draft-87r07v14kl.fsf@archlinux.mail-host-address-is-not-set>
2024-11-01 13:42     ` karthik nayak
2024-10-31  6:26 ` [PATCH 2/6] test-lib-functions: add test_cmp_sorted Derrick Stolee via GitGitGadget
2024-10-31  6:27 ` [PATCH 3/6] t6601: add helper for testing path-walk API Derrick Stolee via GitGitGadget
2024-11-01 13:46   ` karthik nayak
2024-11-01 22:23   ` Jonathan Tan
2024-11-04 15:56     ` Derrick Stolee
2024-11-04 23:39       ` Jonathan Tan
2024-11-08 14:53         ` Derrick Stolee
2024-11-06 14:04   ` Patrick Steinhardt
2024-11-08 14:58     ` Derrick Stolee
2024-10-31  6:27 ` [PATCH 4/6] path-walk: allow consumer to specify object types Derrick Stolee via GitGitGadget
2024-10-31  6:27 ` [PATCH 5/6] path-walk: visit tags and cached objects Derrick Stolee via GitGitGadget
2024-11-01 14:25   ` karthik nayak
2024-11-04 15:56     ` Derrick Stolee
2024-10-31  6:27 ` [PATCH 6/6] path-walk: mark trees and blobs as UNINTERESTING Derrick Stolee via GitGitGadget
2024-10-31 12:36 ` [PATCH 0/6] PATH WALK I: The path-walk API Derrick Stolee
2024-11-01 19:23 ` Taylor Blau
2024-11-04 15:48   ` Derrick Stolee
2024-11-04 17:25     ` Jeff King
2024-11-05  0:11       ` Junio C Hamano
2024-11-08 15:17         ` Derrick Stolee
2024-11-11  2:56           ` Junio C Hamano
2024-11-11 13:20             ` Derrick Stolee
2024-11-11 21:55             ` Jeff King
2024-11-11 22:29               ` Junio C Hamano
2024-11-11 22:04           ` Jeff King
2024-11-09 19:41 ` [PATCH v2 " Derrick Stolee via GitGitGadget
2024-11-09 19:41   ` [PATCH v2 1/6] path-walk: introduce an object walk by path Derrick Stolee via GitGitGadget
2024-11-09 19:41   ` [PATCH v2 2/6] test-lib-functions: add test_cmp_sorted Derrick Stolee via GitGitGadget
2024-11-09 19:41   ` [PATCH v2 3/6] t6601: add helper for testing path-walk API Derrick Stolee via GitGitGadget
2024-11-21 22:39     ` Taylor Blau
2024-11-09 19:41   ` [PATCH v2 4/6] path-walk: allow consumer to specify object types Derrick Stolee via GitGitGadget
2024-11-21 22:44     ` Taylor Blau
2024-11-09 19:41   ` [PATCH v2 5/6] path-walk: visit tags and cached objects Derrick Stolee via GitGitGadget
2024-11-09 19:41   ` [PATCH v2 6/6] path-walk: mark trees and blobs as UNINTERESTING Derrick Stolee via GitGitGadget
2024-11-21 22:57   ` [PATCH v2 0/6] PATH WALK I: The path-walk API Taylor Blau
2024-11-25  8:56     ` Patrick Steinhardt
2024-11-26  7:39       ` Junio C Hamano
2024-11-26  7:43         ` Patrick Steinhardt
2024-11-26  8:16           ` Junio C Hamano
2024-12-06 19:45   ` [PATCH v3 0/7] " Derrick Stolee via GitGitGadget
2024-12-06 19:45     ` [PATCH v3 1/7] path-walk: introduce an object walk by path Derrick Stolee via GitGitGadget
2024-12-13 11:58       ` Patrick Steinhardt [this message]
2024-12-18 14:21         ` Derrick Stolee
2024-12-27 14:18           ` Patrick Steinhardt
2024-12-06 19:45     ` [PATCH v3 2/7] test-lib-functions: add test_cmp_sorted Derrick Stolee via GitGitGadget
2024-12-06 19:45     ` [PATCH v3 3/7] t6601: add helper for testing path-walk API Derrick Stolee via GitGitGadget
2024-12-06 19:45     ` [PATCH v3 4/7] path-walk: allow consumer to specify object types Derrick Stolee via GitGitGadget
2024-12-06 19:45     ` [PATCH v3 5/7] path-walk: visit tags and cached objects Derrick Stolee via GitGitGadget
2024-12-13 11:58       ` Patrick Steinhardt
2024-12-18 14:23         ` Derrick Stolee
2024-12-06 19:45     ` [PATCH v3 6/7] path-walk: mark trees and blobs as UNINTERESTING Derrick Stolee via GitGitGadget
2024-12-06 19:45     ` [PATCH v3 7/7] path-walk: reorder object visits Derrick Stolee via GitGitGadget
2024-12-13 11:58     ` [PATCH v3 0/7] PATH WALK I: The path-walk API Patrick Steinhardt
2024-12-20 16:21     ` [PATCH v4 " Derrick Stolee via GitGitGadget
2024-12-20 16:21       ` [PATCH v4 1/7] path-walk: introduce an object walk by path Derrick Stolee via GitGitGadget
2024-12-27 14:18         ` Patrick Steinhardt
2024-12-20 16:21       ` [PATCH v4 2/7] test-lib-functions: add test_cmp_sorted Derrick Stolee via GitGitGadget
2024-12-20 16:21       ` [PATCH v4 3/7] t6601: add helper for testing path-walk API Derrick Stolee via GitGitGadget
2024-12-20 16:21       ` [PATCH v4 4/7] path-walk: allow consumer to specify object types Derrick Stolee via GitGitGadget
2024-12-20 16:21       ` [PATCH v4 5/7] path-walk: visit tags and cached objects Derrick Stolee via GitGitGadget
2024-12-20 16:21       ` [PATCH v4 6/7] path-walk: mark trees and blobs as UNINTERESTING Derrick Stolee via GitGitGadget
2024-12-20 16:21       ` [PATCH v4 7/7] path-walk: reorder object visits Derrick Stolee via GitGitGadget

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=Z1whcUJE-MHAhULO@pks.im \
    --to=ps@pks.im \
    --cc=christian.couder@gmail.com \
    --cc=git@vger.kernel.org \
    --cc=gitgitgadget@gmail.com \
    --cc=gitster@pobox.com \
    --cc=johannes.schindelin@gmx.de \
    --cc=johncai86@gmail.com \
    --cc=jonathantanmy@google.com \
    --cc=karthik.188@gmail.com \
    --cc=kristofferhaugsbakk@fastmail.com \
    --cc=me@ttaylorr.com \
    --cc=newren@gmail.com \
    --cc=peff@peff.net \
    --cc=stolee@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).