From: "Derrick Stolee via GitGitGadget" <gitgitgadget@gmail.com>
To: git@vger.kernel.org
Cc: gitster@pobox.com, johannes.schindelin@gmx.de, peff@peff.net,
ps@pks.im, me@ttaylorr.com, johncai86@gmail.com,
newren@gmail.com, Derrick Stolee <stolee@gmail.com>,
Derrick Stolee <stolee@gmail.com>
Subject: [PATCH 01/30] path-walk: introduce an object walk by path
Date: Tue, 10 Sep 2024 02:28:26 +0000 [thread overview]
Message-ID: <a53bd0d37606c890d2fd2b715ce0bc92ff787b66.1725935335.git.gitgitgadget@gmail.com> (raw)
In-Reply-To: <pull.1786.git.1725935335.gitgitgadget@gmail.com>
From: Derrick Stolee <stolee@gmail.com>
In anticipation of a few planned applications, introduce the most basic form
of a path-walk API. It currently assumes that there are no UNINTERESTING
objects, and does not include any complicated filters. It calls a function
pointer on groups of tree and blob objects as grouped by path. This only
includes objects the first time they are discovered, so an object that
appears at multiple paths will not be included in two batches.
There are many future adaptations that could be made, but they are left for
future updates when consumers are ready to take advantage of those features.
RFC TODO: It would be helpful to create a test-tool that allows printing of
each batch for strong testing.
Signed-off-by: Derrick Stolee <stolee@gmail.com>
---
Makefile | 1 +
path-walk.c | 235 ++++++++++++++++++++++++++++++++++++++++++++++++++++
path-walk.h | 43 ++++++++++
3 files changed, 279 insertions(+)
create mode 100644 path-walk.c
create mode 100644 path-walk.h
diff --git a/Makefile b/Makefile
index deb175a0408..e83f6de9a2c 100644
--- a/Makefile
+++ b/Makefile
@@ -1090,6 +1090,7 @@ LIB_OBJS += parse-options.o
LIB_OBJS += patch-delta.o
LIB_OBJS += patch-ids.o
LIB_OBJS += path.o
+LIB_OBJS += path-walk.o
LIB_OBJS += pathspec.o
LIB_OBJS += pkt-line.o
LIB_OBJS += preload-index.o
diff --git a/path-walk.c b/path-walk.c
new file mode 100644
index 00000000000..2edfa0572e4
--- /dev/null
+++ b/path-walk.c
@@ -0,0 +1,235 @@
+/*
+ * path-walk.c: implementation for path-based walks of the object graph.
+ */
+#include "git-compat-util.h"
+#include "path-walk.h"
+#include "blob.h"
+#include "commit.h"
+#include "dir.h"
+#include "hashmap.h"
+#include "hex.h"
+#include "object.h"
+#include "oid-array.h"
+#include "revision.h"
+#include "string-list.h"
+#include "strmap.h"
+#include "trace2.h"
+#include "tree.h"
+#include "tree-walk.h"
+
+struct type_and_oid_list
+{
+ enum object_type type;
+ struct oid_array oids;
+};
+
+#define TYPE_AND_OID_LIST_INIT { \
+ .type = OBJ_NONE, \
+ .oids = OID_ARRAY_INIT \
+}
+
+struct path_walk_context {
+ /**
+ * Repeats of data in 'struct path_walk_info' for
+ * access with fewer characters.
+ */
+ struct repository *repo;
+ struct rev_info *revs;
+ struct path_walk_info *info;
+
+ /**
+ * Map a path to a 'struct type_and_oid_list'
+ * containing the objects discovered at that
+ * path.
+ */
+ struct strmap paths_to_lists;
+
+ /**
+ * Store the current list of paths in a stack, to
+ * facilitate depth-first-search without recursion.
+ */
+ struct string_list path_stack;
+};
+
+static int add_children(struct path_walk_context *ctx,
+ const char *base_path,
+ struct object_id *oid)
+{
+ struct tree_desc desc;
+ struct name_entry entry;
+ struct strbuf path = STRBUF_INIT;
+ size_t base_len;
+ struct tree *tree = lookup_tree(ctx->repo, oid);
+
+ if (!tree) {
+ error(_("failed to walk children of tree %s: not found"),
+ oid_to_hex(oid));
+ return -1;
+ }
+
+ strbuf_addstr(&path, base_path);
+ base_len = path.len;
+
+ parse_tree(tree);
+ init_tree_desc(&desc, &tree->object.oid, tree->buffer, tree->size);
+ while (tree_entry(&desc, &entry)) {
+ struct type_and_oid_list *list;
+ struct object *o;
+ /* Not actually true, but we will ignore submodules later. */
+ enum object_type type = S_ISDIR(entry.mode) ? OBJ_TREE : OBJ_BLOB;
+
+ /* Skip submodules. */
+ if (S_ISGITLINK(entry.mode))
+ continue;
+
+ if (type == OBJ_TREE) {
+ struct tree *child = lookup_tree(ctx->repo, &entry.oid);
+ o = child ? &child->object : NULL;
+ } else if (type == OBJ_BLOB) {
+ struct blob *child = lookup_blob(ctx->repo, &entry.oid);
+ o = child ? &child->object : NULL;
+ } else {
+ /* Wrong type? */
+ continue;
+ }
+
+ if (!o) /* report error?*/
+ continue;
+
+ /* Skip this object if already seen. */
+ if (o->flags & SEEN)
+ continue;
+ o->flags |= SEEN;
+
+ strbuf_setlen(&path, base_len);
+ strbuf_add(&path, entry.path, entry.pathlen);
+
+ /*
+ * Trees will end with "/" for concatenation and distinction
+ * from blobs at the same path.
+ */
+ if (type == OBJ_TREE)
+ strbuf_addch(&path, '/');
+
+ if (!(list = strmap_get(&ctx->paths_to_lists, path.buf))) {
+ CALLOC_ARRAY(list, 1);
+ list->type = type;
+ strmap_put(&ctx->paths_to_lists, path.buf, list);
+ string_list_append(&ctx->path_stack, path.buf);
+ }
+ oid_array_append(&list->oids, &entry.oid);
+ }
+
+ free_tree_buffer(tree);
+ strbuf_release(&path);
+ return 0;
+}
+
+/*
+ * For each path in paths_to_explore, walk the trees another level
+ * and add any found blobs to the batch (but only if they don't
+ * exist and haven't been added yet).
+ */
+static int walk_path(struct path_walk_context *ctx,
+ const char *path)
+{
+ struct type_and_oid_list *list;
+ int ret = 0;
+
+ list = strmap_get(&ctx->paths_to_lists, path);
+
+ /* Evaluate function pointer on this data. */
+ ret = ctx->info->path_fn(path, &list->oids, list->type,
+ ctx->info->path_fn_data);
+
+ /* Expand data for children. */
+ if (list->type == OBJ_TREE) {
+ for (size_t i = 0; i < list->oids.nr; i++) {
+ ret |= add_children(ctx,
+ path,
+ &list->oids.oid[i]);
+ }
+ }
+
+ oid_array_clear(&list->oids);
+ strmap_remove(&ctx->paths_to_lists, path, 1);
+ return ret;
+}
+
+static void clear_strmap(struct strmap *map)
+{
+ struct hashmap_iter iter;
+ struct strmap_entry *e;
+
+ hashmap_for_each_entry(&map->map, &iter, e, ent) {
+ struct type_and_oid_list *list = e->value;
+ oid_array_clear(&list->oids);
+ }
+ strmap_clear(map, 1);
+ strmap_init(map);
+}
+
+/**
+ * Given the configuration of 'info', walk the commits based on 'info->revs' and
+ * call 'info->path_fn' on each discovered path.
+ *
+ * Returns nonzero on an error.
+ */
+int walk_objects_by_path(struct path_walk_info *info)
+{
+ const char *root_path = "";
+ int ret = 0;
+ size_t commits_nr = 0, paths_nr = 0;
+ struct commit *c;
+ struct type_and_oid_list *root_tree_list;
+ struct path_walk_context ctx = {
+ .repo = info->revs->repo,
+ .revs = info->revs,
+ .info = info,
+ .path_stack = STRING_LIST_INIT_DUP,
+ .paths_to_lists = STRMAP_INIT
+ };
+
+ trace2_region_enter("path-walk", "commit-walk", info->revs->repo);
+
+ /* Insert a single list for the root tree into the paths. */
+ CALLOC_ARRAY(root_tree_list, 1);
+ root_tree_list->type = OBJ_TREE;
+ strmap_put(&ctx.paths_to_lists, root_path, root_tree_list);
+
+ if (prepare_revision_walk(info->revs))
+ die(_("failed to setup revision walk"));
+
+ while ((c = get_revision(info->revs))) {
+ struct object_id *oid = get_commit_tree_oid(c);
+ struct tree *t = lookup_tree(info->revs->repo, oid);
+ commits_nr++;
+
+ if (t)
+ oid_array_append(&root_tree_list->oids, oid);
+ else
+ warning("could not find tree %s", oid_to_hex(oid));
+ }
+
+ trace2_data_intmax("path-walk", ctx.repo, "commits", commits_nr);
+ trace2_region_leave("path-walk", "commit-walk", info->revs->repo);
+
+ string_list_append(&ctx.path_stack, root_path);
+
+ trace2_region_enter("path-walk", "path-walk", info->revs->repo);
+ while (!ret && ctx.path_stack.nr) {
+ char *path = ctx.path_stack.items[ctx.path_stack.nr - 1].string;
+ ctx.path_stack.nr--;
+ paths_nr++;
+
+ ret = walk_path(&ctx, path);
+
+ free(path);
+ }
+ trace2_data_intmax("path-walk", ctx.repo, "paths", paths_nr);
+ trace2_region_leave("path-walk", "path-walk", info->revs->repo);
+
+ clear_strmap(&ctx.paths_to_lists);
+ string_list_clear(&ctx.path_stack, 0);
+ return ret;
+}
diff --git a/path-walk.h b/path-walk.h
new file mode 100644
index 00000000000..c9e94a98bc8
--- /dev/null
+++ b/path-walk.h
@@ -0,0 +1,43 @@
+/*
+ * path-walk.h : Methods and structures for walking the object graph in batches
+ * by the paths that can reach those objects.
+ */
+#include "object.h" /* Required for 'enum object_type'. */
+
+struct rev_info;
+struct oid_array;
+
+/**
+ * The type of a function pointer for the method that is called on a list of
+ * objects reachable at a given path.
+ */
+typedef int (*path_fn)(const char *path,
+ struct oid_array *oids,
+ enum object_type type,
+ void *data);
+
+struct path_walk_info {
+ /**
+ * revs provides the definitions for the commit walk, including
+ * which commits are UNINTERESTING or not.
+ */
+ struct rev_info *revs;
+
+ /**
+ * The caller wishes to execute custom logic on objects reachable at a
+ * given path. Every reachable object will be visited exactly once, and
+ * the first path to see an object wins. This may not be a stable choice.
+ */
+ path_fn path_fn;
+ void *path_fn_data;
+};
+
+#define PATH_WALK_INFO_INIT { 0 }
+
+/**
+ * Given the configuration of 'info', walk the commits based on 'info->revs' and
+ * call 'info->path_fn' on each discovered path.
+ *
+ * Returns nonzero on an error.
+ */
+int walk_objects_by_path(struct path_walk_info *info);
--
gitgitgadget
next prev parent reply other threads:[~2024-09-10 2:29 UTC|newest]
Thread overview: 38+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-09-10 2:28 [PATCH 00/30] [RFC] Path-walk API and applications Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` Derrick Stolee via GitGitGadget [this message]
2024-09-10 2:28 ` [PATCH 02/30] backfill: add builtin boilerplate Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 03/30] backfill: basic functionality and tests Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 04/30] backfill: add --batch-size=<n> option Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 05/30] backfill: add --sparse option Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 06/30] backfill: assume --sparse when sparse-checkout is enabled Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 07/30] path-walk: allow consumer to specify object types Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 08/30] path-walk: allow visiting tags Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 09/30] survey: stub in new experimental `git-survey` command Jeff Hostetler via GitGitGadget
2024-09-10 2:28 ` [PATCH 10/30] survey: add command line opts to select references Jeff Hostetler via GitGitGadget
2024-09-10 2:28 ` [PATCH 11/30] survey: collect the set of requested refs Jeff Hostetler via GitGitGadget
2024-09-10 2:28 ` [PATCH 12/30] survey: start pretty printing data in table form Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 13/30] survey: add object count summary Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 14/30] survey: summarize total sizes by object type Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 15/30] survey: show progress during object walk Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 16/30] survey: add ability to track prioritized lists Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 17/30] survey: add report of "largest" paths Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 18/30] revision: create mark_trees_uninteresting_dense() Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 19/30] path-walk: add prune_all_uninteresting option Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 20/30] pack-objects: add --path-walk option Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 21/30] pack-objects: extract should_attempt_deltas() Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 22/30] pack-objects: introduce GIT_TEST_PACK_PATH_WALK Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 23/30] p5313: add size comparison test Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 24/30] repack: add --path-walk option Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 25/30] pack-objects: enable --path-walk via config Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 26/30] scalar: enable path-walk during push " Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 27/30] pack-objects: add --full-name-hash option Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 28/30] test-name-hash: add helper to compute name-hash functions Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 29/30] p5314: add a size test for name-hash collisions Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 30/30] pack-objects: output debug info about deltas Derrick Stolee via GitGitGadget
2024-09-11 21:32 ` [PATCH 00/30] [RFC] Path-walk API and applications Junio C Hamano
2024-09-17 10:41 ` Christian Couder
2024-09-18 23:18 ` Derrick Stolee
2024-09-22 18:37 ` Junio C Hamano
2024-09-23 1:22 ` Derrick Stolee
2024-09-23 16:56 ` Junio C Hamano
2024-09-22 21:08 ` Kristoffer Haugsbakk
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=a53bd0d37606c890d2fd2b715ce0bc92ff787b66.1725935335.git.gitgitgadget@gmail.com \
--to=gitgitgadget@gmail.com \
--cc=git@vger.kernel.org \
--cc=gitster@pobox.com \
--cc=johannes.schindelin@gmx.de \
--cc=johncai86@gmail.com \
--cc=me@ttaylorr.com \
--cc=newren@gmail.com \
--cc=peff@peff.net \
--cc=ps@pks.im \
--cc=stolee@gmail.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).