All of lore.kernel.org
 help / color / mirror / Atom feed
From: Jeff Hostetler <git@jeffhostetler.com>
To: git@vger.kernel.org
Cc: gitster@pobox.com, peff@peff.net, jonathantanmy@google.com,
	Jeff Hostetler <jeffhost@microsoft.com>
Subject: [PATCH 06/13] list-objects-filter-sparse: add sparse filter
Date: Tue, 24 Oct 2017 18:53:25 +0000	[thread overview]
Message-ID: <20171024185332.57261-7-git@jeffhostetler.com> (raw)
In-Reply-To: <20171024185332.57261-1-git@jeffhostetler.com>

From: Jeff Hostetler <jeffhost@microsoft.com>

Create a filter for traverse_commit_list_worker() to only include
the blobs that would be referenced by a sparse-checkout using the
given specification.

Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
---
 Makefile                     |   1 +
 list-objects-filter-sparse.c | 241 +++++++++++++++++++++++++++++++++++++++++++
 list-objects-filter-sparse.h |  30 ++++++
 3 files changed, 272 insertions(+)
 create mode 100644 list-objects-filter-sparse.c
 create mode 100644 list-objects-filter-sparse.h

diff --git a/Makefile b/Makefile
index 0fdeabb..fc82664 100644
--- a/Makefile
+++ b/Makefile
@@ -810,6 +810,7 @@ LIB_OBJS += list-objects.o
 LIB_OBJS += list-objects-filter-blobs-limit.o
 LIB_OBJS += list-objects-filter-blobs-none.o
 LIB_OBJS += list-objects-filter-map.o
+LIB_OBJS += list-objects-filter-sparse.o
 LIB_OBJS += ll-merge.o
 LIB_OBJS += lockfile.o
 LIB_OBJS += log-tree.o
diff --git a/list-objects-filter-sparse.c b/list-objects-filter-sparse.c
new file mode 100644
index 0000000..386b667
--- /dev/null
+++ b/list-objects-filter-sparse.c
@@ -0,0 +1,241 @@
+#include "cache.h"
+#include "dir.h"
+#include "tag.h"
+#include "commit.h"
+#include "tree.h"
+#include "blob.h"
+#include "diff.h"
+#include "tree-walk.h"
+#include "revision.h"
+#include "list-objects.h"
+#include "list-objects-filter-sparse.h"
+
+#define DEFAULT_MAP_SIZE (16*1024)
+
+/*
+ * A filter driven by a sparse-checkout specification to only
+ * include blobs that a sparse checkout would populate.
+ *
+ * The sparse-checkout spec can be loaded from a blob with the
+ * given OID or from a local pathname.  We allow an OID because
+ * the repo may be bare or we may be doing the filtering on the
+ * server.
+ */
+/*
+ * One entry of the directory stack kept by the sparse filter while
+ * it walks a tree.
+ */
+struct frame {
+	/*
+	 * Value applied to entries of this directory when
+	 * is_excluded_from_list() returns a negative result for them.
+	 */
+	int defval;
+
+	/*
+	 * Set when a blob in this directory, or in a subdirectory whose
+	 * frame has already been popped, was provisionally omitted.
+	 *
+	 * Declared unsigned: a signed one-bit field cannot portably hold
+	 * the value 1 that the filter stores into it.
+	 */
+	unsigned child_prov_omit : 1;
+};
+
+/*
+ * State handed to filter_use_sparse() through its filter_data_ pointer.
+ */
+struct filter_use_sparse_data {
+	/* Provisionally omitted blobs keyed by OID; NULL when not collected. */
+	struct oidmap *omits;
+
+	/* Patterns loaded from the sparse-checkout specification. */
+	struct exclude_list el;
+
+	/*
+	 * Directory stack, grown with ALLOC_GROW().  "nr" is the index
+	 * of the frame for the directory currently being walked (not an
+	 * element count) and "alloc" is the allocated capacity.
+	 */
+	size_t nr;
+	size_t alloc;
+	struct frame *array_frame;
+};
+
+/*
+ * Filter callback handed to traverse_commit_list_worker().
+ *
+ * filter_type says which event is being reported: a tree is being
+ * entered (LOFT_BEGIN_TREE), a tree is being left (LOFT_END_TREE) or
+ * a blob was found (LOFT_BLOB).  Any other value is fatal.
+ *
+ * obj is the object the event is about.  pathname and filename are
+ * forwarded unchanged to is_excluded_from_list().  filter_data_ is
+ * the struct filter_use_sparse_data set up by the caller.
+ *
+ * Returns a combination of LOFR_SHOW and LOFR_MARK_SEEN, or LOFR_ZERO
+ * when neither applies.
+ */
+static list_objects_filter_result filter_use_sparse(
+	list_objects_filter_type filter_type,
+	struct object *obj,
+	const char *pathname,
+	const char *filename,
+	void *filter_data_)
+{
+	struct filter_use_sparse_data *filter_data = filter_data_;
+	struct list_objects_filter_map_entry *entry_prev = NULL;
+	int val, dtype;
+	struct frame *frame;
+
+	switch (filter_type) {
+	default:
+		die("unknown filter_type");
+		return LOFR_ZERO;
+
+	case LOFT_BEGIN_TREE:
+		assert(obj->type == OBJ_TREE);
+		dtype = DT_DIR;
+		val = is_excluded_from_list(pathname, strlen(pathname),
+					    filename, &dtype, &filter_data->el,
+					    &the_index);
+		if (val < 0)
+			val = filter_data->array_frame[filter_data->nr].defval;
+
+		/*
+		 * Push a frame for this directory.  "nr" is the index of
+		 * the top frame, so once it has been incremented the array
+		 * must hold nr + 1 elements.  Growing before the increment
+		 * would leave the new slot one element past the allocation.
+		 */
+		filter_data->nr++;
+		ALLOC_GROW(filter_data->array_frame, filter_data->nr + 1,
+			   filter_data->alloc);
+		filter_data->array_frame[filter_data->nr].defval = val;
+		filter_data->array_frame[filter_data->nr].child_prov_omit = 0;
+
+		/*
+		 * A directory with this tree OID may appear in multiple
+		 * places in the tree. (Think of a directory move, with
+		 * no other changes.)  And with a different pathname, the
+		 * is_excluded...() results for this directory and items
+		 * contained within it may be different.  So we cannot
+		 * mark it SEEN (yet), since that will prevent process_tree()
+		 * from revisiting this tree object with other pathnames.
+		 *
+		 * Only SHOW the tree object the first time we visit this
+		 * tree object.
+		 *
+		 * We always show all tree objects.  A future optimization
+		 * may want to attempt to narrow this.
+		 */
+		if (obj->flags & FILTER_REVISIT)
+			return LOFR_ZERO;
+		obj->flags |= FILTER_REVISIT;
+		return LOFR_SHOW;
+
+	case LOFT_END_TREE:
+		assert(obj->type == OBJ_TREE);
+		assert(filter_data->nr > 0);
+
+		/* Pop this directory's frame; "frame" still points at it. */
+		frame = &filter_data->array_frame[filter_data->nr];
+		filter_data->nr--;
+
+		/*
+		 * Tell our parent directory if any of our children were
+		 * provisionally omitted.
+		 */
+		filter_data->array_frame[filter_data->nr].child_prov_omit |=
+			frame->child_prov_omit;
+
+		/*
+		 * If there are NO provisionally omitted child objects (ALL child
+		 * objects in this folder were INCLUDED), then we can mark the
+		 * folder as SEEN (so we will not have to revisit it again).
+		 */
+		if (!frame->child_prov_omit)
+			return LOFR_MARK_SEEN;
+		return LOFR_ZERO;
+
+	case LOFT_BLOB:
+		assert(obj->type == OBJ_BLOB);
+		assert((obj->flags & SEEN) == 0);
+
+		frame = &filter_data->array_frame[filter_data->nr];
+
+		/*
+		 * If we are keeping a list of the omitted objects
+		 * for the caller *AND* we previously provisionally
+		 * omitted this object (because the THEN pathname
+		 * is excluded) *AND* it has the same pathname, we
+		 * can avoid duplicating the is_excluded lookup
+		 * costs and continue provisionally omitting it.
+		 */
+		if (filter_data->omits) {
+			entry_prev = oidmap_get(
+				filter_data->omits, &obj->oid);
+			if (entry_prev &&
+			    !strcmp(pathname, entry_prev->pathname)) {
+				frame->child_prov_omit = 1;
+				return LOFR_ZERO;
+			}
+		}
+
+		dtype = DT_REG;
+		val = is_excluded_from_list(pathname, strlen(pathname),
+					    filename, &dtype, &filter_data->el,
+					    &the_index);
+		if (val < 0)
+			val = frame->defval;
+		if (val > 0) {
+			/*
+			 * The blob is wanted under this pathname, so it is
+			 * no longer omitted: drop any earlier omit record.
+			 */
+			if (entry_prev) {
+				entry_prev = oidmap_remove(filter_data->omits,
+							   &obj->oid);
+				free(entry_prev);
+			}
+			return LOFR_MARK_SEEN | LOFR_SHOW;
+		}
+
+		/*
+		 * Provisionally omit it.  We've already established that
+		 * this pathname is not in the sparse-checkout specification
+		 * with the CURRENT pathname, so we *WANT* to omit this blob.
+		 *
+		 * However, a pathname elsewhere in the tree may also
+		 * reference this same blob, so we cannot reject it yet.
+		 * Leave the LOFR_ bits unset so that if the blob appears
+		 * again in the traversal, we will be asked again.
+		 *
+		 * The pathname that we associate with this omit is just
+		 * the first one we saw for this blob.  Other instances of
+		 * this blob may have other pathnames and that is fine.
+		 * We just use it for perf to do the entry_prev lookup
+		 * above (because most of the time, the blob will be in
+		 * the same place as we walk the commits).
+		 */
+		if (filter_data->omits)
+			list_objects_filter_map_insert(filter_data->omits,
+						       &obj->oid, pathname,
+						       obj->type);
+
+		frame->child_prov_omit = 1;
+		return LOFR_ZERO;
+	}
+}
+
+/*
+ * Run the filtered traversal for an already loaded specification.
+ *
+ * d must have its exclude list loaded and, when omitted objects are
+ * to be reported, d->omits pointing at a heap-allocated, initialized
+ * oidmap.  revs, show_commit, show_object and ctx_data are passed on
+ * to traverse_commit_list_worker().  When print_omitted_object is
+ * non-NULL it is handed to list_objects_filter_map_foreach() together
+ * with d->omits and ctx_data once the traversal has finished.
+ *
+ * Everything d owns (omit map, exclude list, frame stack) is released
+ * before returning, so d must not be reused without being set up
+ * again.
+ */
+static void do_sparse(
+	struct filter_use_sparse_data *d,
+	struct rev_info *revs,
+	show_commit_fn show_commit,
+	show_object_fn show_object,
+	list_objects_filter_map_foreach_cb print_omitted_object,
+	void *ctx_data)
+{
+	/*
+	 * Seed the directory stack with the frame for the top level.
+	 * Its default is 0; filter_use_sparse() only shows a blob when
+	 * the value is > 0.
+	 */
+	ALLOC_GROW(d->array_frame, d->nr + 1, d->alloc);
+	d->array_frame[d->nr].defval = 0;
+	d->array_frame[d->nr].child_prov_omit = 0;
+
+	traverse_commit_list_worker(revs, show_commit, show_object, ctx_data,
+				    filter_use_sparse, d);
+
+	if (print_omitted_object) {
+		list_objects_filter_map_foreach(d->omits, print_omitted_object, ctx_data);
+		oidmap_free(d->omits, 1);
+		/* oidmap_free() leaves the container itself to its owner. */
+		free(d->omits);
+		d->omits = NULL;
+	}
+
+	/* Release the loaded patterns and the frame stack. */
+	clear_exclude_list(&d->el);
+	free(d->array_frame);
+	d->array_frame = NULL;
+	d->nr = 0;
+	d->alloc = 0;
+}
+
+/*
+ * Traverse revs with the sparse filter, reading the sparse-checkout
+ * specification from the blob named by oid.  Dies if the
+ * specification cannot be loaded.  An omit map is only allocated when
+ * print_omitted_object is non-NULL.
+ */
+void traverse_commit_list__sparse_oid(
+	struct rev_info *revs,
+	show_commit_fn show_commit,
+	show_object_fn show_object,
+	list_objects_filter_map_foreach_cb print_omitted_object,
+	void *ctx_data,
+	struct object_id *oid)
+{
+	struct filter_use_sparse_data filter;
+	int load_status;
+
+	memset(&filter, 0, sizeof(filter));
+
+	/* Only track omitted objects when the caller wants them reported. */
+	if (print_omitted_object != NULL) {
+		filter.omits = xcalloc(1, sizeof(struct oidmap));
+		oidmap_init(filter.omits, DEFAULT_MAP_SIZE);
+	}
+
+	load_status = add_excludes_from_blob_to_list(oid, NULL, 0, &filter.el);
+	if (load_status < 0)
+		die("could not load filter specification");
+
+	do_sparse(&filter, revs, show_commit, show_object,
+		  print_omitted_object, ctx_data);
+}
+
+/*
+ * Traverse revs with the sparse filter, reading the sparse-checkout
+ * specification from the file at path.  Dies if the specification
+ * cannot be loaded.  An omit map is only allocated when
+ * print_omitted_object is non-NULL.
+ */
+void traverse_commit_list__sparse_path(
+	struct rev_info *revs,
+	show_commit_fn show_commit,
+	show_object_fn show_object,
+	list_objects_filter_map_foreach_cb print_omitted_object,
+	void *ctx_data,
+	const char *path)
+{
+	struct filter_use_sparse_data filter;
+	int load_status;
+
+	memset(&filter, 0, sizeof(filter));
+
+	/* Only track omitted objects when the caller wants them reported. */
+	if (print_omitted_object != NULL) {
+		filter.omits = xcalloc(1, sizeof(struct oidmap));
+		oidmap_init(filter.omits, DEFAULT_MAP_SIZE);
+	}
+
+	load_status = add_excludes_from_file_to_list(path, NULL, 0,
+						     &filter.el, NULL);
+	if (load_status < 0)
+		die("could not load filter specification");
+
+	do_sparse(&filter, revs, show_commit, show_object,
+		  print_omitted_object, ctx_data);
+}
diff --git a/list-objects-filter-sparse.h b/list-objects-filter-sparse.h
new file mode 100644
index 0000000..6c715bf
--- /dev/null
+++ b/list-objects-filter-sparse.h
@@ -0,0 +1,30 @@
+#ifndef LIST_OBJECTS_FILTERS_SPARSE_H
+#define LIST_OBJECTS_FILTERS_SPARSE_H
+
+#include "list-objects-filter-map.h"
+
+/*
+ * A filter driven by a sparse-checkout specification to only
+ * include blobs that a sparse checkout would populate.
+ *
+ * The sparse-checkout spec can be loaded from a blob with the
+ * given OID, a blob with a blob-ish path, or from a local pathname.
+ * We allow an OID because the repo may be bare or we may be doing
+ * the filtering on the server.
+ *
+ * traverse_commit_list__sparse_oid() reads the specification from
+ * the blob named by "oid"; traverse_commit_list__sparse_path() reads
+ * it from the file at "path".  Both die() if the specification
+ * cannot be loaded.
+ *
+ * "revs", "show_commit", "show_object" and "ctx_data" are passed on
+ * to traverse_commit_list_worker().  When "print_omitted_object" is
+ * non-NULL, the blobs that were provisionally omitted are collected
+ * in a map, and once the traversal has finished that map is handed
+ * to list_objects_filter_map_foreach() together with
+ * "print_omitted_object" and "ctx_data".
+ */
+void traverse_commit_list__sparse_oid(
+	struct rev_info *revs,
+	show_commit_fn show_commit,
+	show_object_fn show_object,
+	list_objects_filter_map_foreach_cb print_omitted_object,
+	void *ctx_data,
+	struct object_id *oid);
+void traverse_commit_list__sparse_path(
+	struct rev_info *revs,
+	show_commit_fn show_commit,
+	show_object_fn show_object,
+	list_objects_filter_map_foreach_cb print_omitted_object,
+	void *ctx_data,
+	const char *path);
+
+#endif /* LIST_OBJECTS_FILTERS_SPARSE_H */
-- 
2.9.3


  parent reply	other threads:[~2017-10-24 18:54 UTC|newest]

Thread overview: 37+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-10-24 18:53 [PATCH 00/13] WIP Partial clone part 1: object filtering Jeff Hostetler
2017-10-24 18:53 ` [PATCH 01/13] dir: allow exclusions from blob in addition to file Jeff Hostetler
2017-10-25  4:05   ` Eric Sunshine
2017-10-25  6:43   ` Junio C Hamano
2017-10-25 14:54     ` Jeff Hostetler
2017-10-26  3:47       ` Junio C Hamano
2017-10-26 18:11         ` Jeff Hostetler
2017-10-24 18:53 ` [PATCH 02/13] list-objects-filter-map: extend oidmap to collect omitted objects Jeff Hostetler
2017-10-25  7:10   ` Junio C Hamano
2017-10-25 19:22     ` Jeff Hostetler
2017-10-26  4:12       ` Junio C Hamano
2017-10-24 18:53 ` [PATCH 03/13] list-objects: filter objects in traverse_commit_list Jeff Hostetler
2017-10-25  4:05   ` Jonathan Tan
2017-10-25 19:25     ` Jeff Hostetler
2017-10-24 18:53 ` [PATCH 04/13] list-objects-filter-blobs-none: add filter to omit all blobs Jeff Hostetler
2017-10-24 18:53 ` [PATCH 05/13] list-objects-filter-blobs-limit: add large blob filtering Jeff Hostetler
2017-10-24 18:53 ` Jeff Hostetler [this message]
2017-10-24 18:53 ` [PATCH 07/13] list-objects-filter-options: common argument parsing Jeff Hostetler
2017-10-25  4:14   ` Jonathan Tan
2017-10-25 19:28     ` Jeff Hostetler
2017-10-24 18:53 ` [PATCH 08/13] list-objects: add traverse_commit_list_filtered method Jeff Hostetler
2017-10-25  4:24   ` Jonathan Tan
2017-10-25 19:29     ` Jeff Hostetler
2017-10-24 18:53 ` [PATCH 09/13] extension.partialclone: introduce partial clone extension Jeff Hostetler
2017-10-24 18:53 ` [PATCH 10/13] rev-list: add list-objects filtering support Jeff Hostetler
2017-10-25  4:41   ` Jonathan Tan
2017-10-25 19:37     ` Jeff Hostetler
2017-10-24 18:53 ` [PATCH 11/13] t6112: rev-list object filtering test Jeff Hostetler
2017-10-24 18:53 ` [PATCH 12/13] pack-objects: add list-objects filtering Jeff Hostetler
2017-10-24 18:53 ` [PATCH 13/13] t5317: pack-objects object filtering test Jeff Hostetler
2017-10-25  4:57 ` [PATCH 00/13] WIP Partial clone part 1: object filtering Jonathan Tan
2017-10-25  5:00 ` Junio C Hamano
2017-10-25  6:46   ` Jonathan Tan
2017-10-25 15:39     ` Jeff Hostetler
2017-10-26  2:09       ` Junio C Hamano
2017-10-26  2:01     ` Junio C Hamano
2017-10-30 22:27     ` Jonathan Nieder

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20171024185332.57261-7-git@jeffhostetler.com \
    --to=git@jeffhostetler.com \
    --cc=git@vger.kernel.org \
    --cc=gitster@pobox.com \
    --cc=jeffhost@microsoft.com \
    --cc=jonathantanmy@google.com \
    --cc=peff@peff.net \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.