Git development

Git development
 help / color / mirror / Atom feed

* [PATCH v3 07/10] bulk-checkin: generify `stream_blob_to_pack()` for arbitrary types
From: Taylor Blau @ 2023-10-18 17:08 UTC (permalink / raw)
  To: git
  Cc: Elijah Newren, Eric W. Biederman, Jeff King, Junio C Hamano,
	Patrick Steinhardt
In-Reply-To: <cover.1697648864.git.me@ttaylorr.com>

The existing `stream_blob_to_pack()` function is named based on the fact
that it knows only how to stream blobs into a bulk-checkin pack.

But there is no longer anything in this function which prevents us from
writing objects of arbitrary types to the bulk-checkin pack. Prepare to
write OBJ_TREEs by removing this assumption, adding an `enum
object_type` parameter to this function's argument list, and renaming it
to `stream_obj_to_pack()` accordingly.

Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 bulk-checkin.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/bulk-checkin.c b/bulk-checkin.c
index 133e02ce36..f0115efb2e 100644
--- a/bulk-checkin.c
+++ b/bulk-checkin.c
@@ -204,10 +204,10 @@ static ssize_t bulk_checkin_source_read(struct bulk_checkin_source *source,
  * status before calling us just in case we ask it to call us again
  * with a new pack.
  */
-static int stream_blob_to_pack(struct bulk_checkin_packfile *state,
-			       git_hash_ctx *ctx, off_t *already_hashed_to,
-			       struct bulk_checkin_source *source,
-			       unsigned flags)
+static int stream_obj_to_pack(struct bulk_checkin_packfile *state,
+			      git_hash_ctx *ctx, off_t *already_hashed_to,
+			      struct bulk_checkin_source *source,
+			      enum object_type type, unsigned flags)
 {
 	git_zstream s;
 	unsigned char ibuf[16384];
@@ -220,8 +220,7 @@ static int stream_blob_to_pack(struct bulk_checkin_packfile *state,
 
 	git_deflate_init(&s, pack_compression_level);
 
-	hdrlen = encode_in_pack_object_header(obuf, sizeof(obuf), OBJ_BLOB,
-					      size);
+	hdrlen = encode_in_pack_object_header(obuf, sizeof(obuf), type, size);
 	s.next_out = obuf + hdrlen;
 	s.avail_out = sizeof(obuf) - hdrlen;
 
@@ -402,8 +401,8 @@ static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
 
 	while (1) {
 		prepare_checkpoint(state, &checkpoint, idx, flags);
-		if (!stream_blob_to_pack(state, &ctx, &already_hashed_to,
-					 &source, flags))
+		if (!stream_obj_to_pack(state, &ctx, &already_hashed_to,
+					&source, OBJ_BLOB, flags))
 			break;
 		truncate_checkpoint(state, &checkpoint, idx);
 		if (bulk_checkin_source_seek_to(&source, seekback) == (off_t)-1)
-- 
2.42.0.408.g97fac66ae4


^ permalink raw reply related

* [PATCH v3 08/10] bulk-checkin: introduce `index_blob_bulk_checkin_incore()`
From: Taylor Blau @ 2023-10-18 17:08 UTC (permalink / raw)
  To: git
  Cc: Elijah Newren, Eric W. Biederman, Jeff King, Junio C Hamano,
	Patrick Steinhardt
In-Reply-To: <cover.1697648864.git.me@ttaylorr.com>

Now that we have factored out many of the common routines necessary to
index a new object into a pack created by the bulk-checkin machinery, we
can introduce a variant of `index_blob_bulk_checkin()` that acts on
blobs whose contents we can fit in memory.

This will be useful in a couple of more commits in order to provide the
`merge-tree` builtin with a mechanism to create a new pack containing
any objects it created during the merge, instead of storing those
objects individually as loose.

Similar to the existing `index_blob_bulk_checkin()` function, the
entrypoint delegates to `deflate_blob_to_pack_incore()`, which is
responsible for formatting the pack header and then deflating the
contents into the pack. The latter is accomplished by calling
deflate_obj_contents_to_pack_incore(), which takes advantage of the
earlier refactorings and is responsible for writing the object to the
pack and handling any overage from pack.packSizeLimit.

The bulk of the new functionality is implemented in the function
`stream_obj_to_pack()`, which can handle streaming objects from memory
to the bulk-checkin pack as a result of the earlier refactoring.

Consistent with the rest of the bulk-checkin mechanism, there are no
direct tests here. In future commits when we expose this new
functionality via the `merge-tree` builtin, we will test it indirectly
there.

Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 bulk-checkin.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++
 bulk-checkin.h |  4 ++++
 2 files changed, 68 insertions(+)

diff --git a/bulk-checkin.c b/bulk-checkin.c
index f0115efb2e..9ae43648ba 100644
--- a/bulk-checkin.c
+++ b/bulk-checkin.c
@@ -370,6 +370,59 @@ static void finalize_checkpoint(struct bulk_checkin_packfile *state,
 	}
 }
 
+static int deflate_obj_contents_to_pack_incore(struct bulk_checkin_packfile *state,
+					       git_hash_ctx *ctx,
+					       struct hashfile_checkpoint *checkpoint,
+					       struct object_id *result_oid,
+					       const void *buf, size_t size,
+					       enum object_type type,
+					       const char *path, unsigned flags)
+{
+	struct pack_idx_entry *idx = NULL;
+	off_t already_hashed_to = 0;
+	struct bulk_checkin_source source = {
+		.type = SOURCE_INCORE,
+		.buf = buf,
+		.size = size,
+		.read = 0,
+		.path = path,
+	};
+
+	/* Note: idx is non-NULL when we are writing */
+	if (flags & HASH_WRITE_OBJECT)
+		CALLOC_ARRAY(idx, 1);
+
+	while (1) {
+		prepare_checkpoint(state, checkpoint, idx, flags);
+
+		if (!stream_obj_to_pack(state, ctx, &already_hashed_to, &source,
+					type, flags))
+			break;
+		truncate_checkpoint(state, checkpoint, idx);
+		bulk_checkin_source_seek_to(&source, 0);
+	}
+
+	finalize_checkpoint(state, ctx, checkpoint, idx, result_oid);
+
+	return 0;
+}
+
+static int deflate_blob_to_pack_incore(struct bulk_checkin_packfile *state,
+				       struct object_id *result_oid,
+				       const void *buf, size_t size,
+				       const char *path, unsigned flags)
+{
+	git_hash_ctx ctx;
+	struct hashfile_checkpoint checkpoint = {0};
+
+	format_object_header_hash(the_hash_algo, &ctx, &checkpoint, OBJ_BLOB,
+				  size);
+
+	return deflate_obj_contents_to_pack_incore(state, &ctx, &checkpoint,
+						   result_oid, buf, size,
+						   OBJ_BLOB, path, flags);
+}
+
 static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
 				struct object_id *result_oid,
 				int fd, size_t size,
@@ -456,6 +509,17 @@ int index_blob_bulk_checkin(struct object_id *oid,
 	return status;
 }
 
+int index_blob_bulk_checkin_incore(struct object_id *oid,
+				   const void *buf, size_t size,
+				   const char *path, unsigned flags)
+{
+	int status = deflate_blob_to_pack_incore(&bulk_checkin_packfile, oid,
+						 buf, size, path, flags);
+	if (!odb_transaction_nesting)
+		flush_bulk_checkin_packfile(&bulk_checkin_packfile);
+	return status;
+}
+
 void begin_odb_transaction(void)
 {
 	odb_transaction_nesting += 1;
diff --git a/bulk-checkin.h b/bulk-checkin.h
index aa7286a7b3..1b91daeaee 100644
--- a/bulk-checkin.h
+++ b/bulk-checkin.h
@@ -13,6 +13,10 @@ int index_blob_bulk_checkin(struct object_id *oid,
 			    int fd, size_t size,
 			    const char *path, unsigned flags);
 
+int index_blob_bulk_checkin_incore(struct object_id *oid,
+				   const void *buf, size_t size,
+				   const char *path, unsigned flags);
+
 /*
  * Tell the object database to optimize for adding
  * multiple objects. end_odb_transaction must be called
-- 
2.42.0.408.g97fac66ae4


^ permalink raw reply related

* Re: [PATCH 01/11] t: add helpers to test for reference existence
From: Eric Sunshine @ 2023-10-18 17:08 UTC (permalink / raw)
  To: Patrick Steinhardt; +Cc: git, Han-Wen Nienhuys
In-Reply-To: <e947feb1c77f7e9f3c7f983bbe47137fbce42367.1697607222.git.ps@pks.im>

On Wed, Oct 18, 2023 at 1:35 AM Patrick Steinhardt <ps@pks.im> wrote:
> Introduce a new subcommand for our ref-store test helper that explicitly
> checks only for the presence or absence of a reference. This addresses
> these limitations:
> [...]
> Signed-off-by: Patrick Steinhardt <ps@pks.im>
> ---
> diff --git a/t/helper/test-ref-store.c b/t/helper/test-ref-store.c
> @@ -221,6 +221,30 @@ static int cmd_verify_ref(struct ref_store *refs, const char **argv)
> +static int cmd_ref_exists(struct ref_store *refs, const char **argv)
> +{
> +       const char *refname = notnull(*argv++, "refname");
> +       struct strbuf unused_referent = STRBUF_INIT;
> +       struct object_id unused_oid;
> +       unsigned int unused_type;
> +       int failure_errno;
> +
> +       if (refs_read_raw_ref(refs, refname, &unused_oid, &unused_referent,
> +                             &unused_type, &failure_errno)) {
> +               /*
> +                * We handle ENOENT separately here such that it is possible to
> +                * distinguish actually-missing references from any kind of
> +                * generic error.
> +                */
> +               if (failure_errno == ENOENT)
> +                       return 17;
> +               return -1;
> +       }
> +
> +       strbuf_release(&unused_referent);
> +       return 0;
> +}

Unless refs_read_raw_ref() guarantees that `unused_referent` remains
unallocated upon failure[*], then the early returns inside the
conditional leak the strbuf. True, the program is exiting immediately
anyhow, so this (potential) leak isn't significant, but it seems odd
to clean up in one case (return 0) but not in the others (return -1 &
17).

[*] In my (admittedly brief) scan of the code and documentation, I
didn't see any such promise.

^ permalink raw reply

* [PATCH v3 00/10] merge-ort: implement support for packing objects together
From: Taylor Blau @ 2023-10-18 17:07 UTC (permalink / raw)
  To: git
  Cc: Elijah Newren, Eric W. Biederman, Jeff King, Junio C Hamano,
	Patrick Steinhardt
In-Reply-To: <cover.1696629697.git.me@ttaylorr.com>

This series implements support for a new merge-tree option,
`--write-pack`, which causes any newly-written objects to be packed
together instead of being stored individually as loose.

The notable change from last time is in response to a suggestion[1] from
Junio to factor out an abstract bulk-checkin "source", which ended up
reducing the duplication between a couple of functions in the earlier
round by a significant degree.

Beyond that, the changes since last time can be viewed in the range-diff
below. Thanks in advance for any review!

[1]: https://lore.kernel.org/git/xmqq5y34wu5f.fsf@gitster.g/

Taylor Blau (10):
  bulk-checkin: factor out `format_object_header_hash()`
  bulk-checkin: factor out `prepare_checkpoint()`
  bulk-checkin: factor out `truncate_checkpoint()`
  bulk-checkin: factor out `finalize_checkpoint()`
  bulk-checkin: extract abstract `bulk_checkin_source`
  bulk-checkin: implement `SOURCE_INCORE` mode for `bulk_checkin_source`
  bulk-checkin: generify `stream_blob_to_pack()` for arbitrary types
  bulk-checkin: introduce `index_blob_bulk_checkin_incore()`
  bulk-checkin: introduce `index_tree_bulk_checkin_incore()`
  builtin/merge-tree.c: implement support for `--write-pack`

 Documentation/git-merge-tree.txt |   4 +
 builtin/merge-tree.c             |   5 +
 bulk-checkin.c                   | 288 +++++++++++++++++++++++++------
 bulk-checkin.h                   |   8 +
 merge-ort.c                      |  42 ++++-
 merge-recursive.h                |   1 +
 t/t4301-merge-tree-write-tree.sh |  93 ++++++++++
 7 files changed, 381 insertions(+), 60 deletions(-)

Range-diff against v2:
 1:  edf1cbafc1 =  1:  2dffa45183 bulk-checkin: factor out `format_object_header_hash()`
 2:  b3f89d5853 =  2:  7a10dc794a bulk-checkin: factor out `prepare_checkpoint()`
 3:  abe4fb0a59 =  3:  20c32d2178 bulk-checkin: factor out `truncate_checkpoint()`
 4:  0b855a6eb7 !  4:  893051d0b7 bulk-checkin: factor our `finalize_checkpoint()`
    @@ Metadata
     Author: Taylor Blau <me@ttaylorr.com>
     
      ## Commit message ##
    -    bulk-checkin: factor our `finalize_checkpoint()`
    +    bulk-checkin: factor out `finalize_checkpoint()`
     
         In a similar spirit as previous commits, factor out the routine to
         finalize the just-written object from the bulk-checkin mechanism.
 -:  ---------- >  5:  da52ec8380 bulk-checkin: extract abstract `bulk_checkin_source`
 -:  ---------- >  6:  4e9bac5bc1 bulk-checkin: implement `SOURCE_INCORE` mode for `bulk_checkin_source`
 -:  ---------- >  7:  04ec74e357 bulk-checkin: generify `stream_blob_to_pack()` for arbitrary types
 5:  239bf39bfb !  8:  8667b76365 bulk-checkin: introduce `index_blob_bulk_checkin_incore()`
    @@ Commit message
         entrypoint delegates to `deflate_blob_to_pack_incore()`, which is
         responsible for formatting the pack header and then deflating the
         contents into the pack. The latter is accomplished by calling
    -    deflate_blob_contents_to_pack_incore(), which takes advantage of the
    -    earlier refactoring and is responsible for writing the object to the
    +    deflate_obj_contents_to_pack_incore(), which takes advantage of the
    +    earlier refactorings and is responsible for writing the object to the
         pack and handling any overage from pack.packSizeLimit.
     
         The bulk of the new functionality is implemented in the function
    -    `stream_obj_to_pack_incore()`, which is a generic implementation for
    -    writing objects of arbitrary type (whose contents we can fit in-core)
    -    into a bulk-checkin pack.
    -
    -    The new function shares an unfortunate degree of similarity to the
    -    existing `stream_blob_to_pack()` function. But DRY-ing up these two
    -    would likely be more trouble than it's worth, since the latter has to
    -    deal with reading and writing the contents of the object.
    +    `stream_obj_to_pack()`, which can handle streaming objects from memory
    +    to the bulk-checkin pack as a result of the earlier refactoring.
     
         Consistent with the rest of the bulk-checkin mechanism, there are no
         direct tests here. In future commits when we expose this new
    @@ Commit message
         Signed-off-by: Taylor Blau <me@ttaylorr.com>
     
      ## bulk-checkin.c ##
    -@@ bulk-checkin.c: static int already_written(struct bulk_checkin_packfile *state, struct object_id
    - 	return 0;
    - }
    - 
    -+static int stream_obj_to_pack_incore(struct bulk_checkin_packfile *state,
    -+				     git_hash_ctx *ctx,
    -+				     off_t *already_hashed_to,
    -+				     const void *buf, size_t size,
    -+				     enum object_type type,
    -+				     const char *path, unsigned flags)
    -+{
    -+	git_zstream s;
    -+	unsigned char obuf[16384];
    -+	unsigned hdrlen;
    -+	int status = Z_OK;
    -+	int write_object = (flags & HASH_WRITE_OBJECT);
    -+
    -+	git_deflate_init(&s, pack_compression_level);
    -+
    -+	hdrlen = encode_in_pack_object_header(obuf, sizeof(obuf), type, size);
    -+	s.next_out = obuf + hdrlen;
    -+	s.avail_out = sizeof(obuf) - hdrlen;
    -+
    -+	if (*already_hashed_to < size) {
    -+		size_t hsize = size - *already_hashed_to;
    -+		if (hsize) {
    -+			the_hash_algo->update_fn(ctx, buf, hsize);
    -+		}
    -+		*already_hashed_to = size;
    -+	}
    -+	s.next_in = (void *)buf;
    -+	s.avail_in = size;
    -+
    -+	while (status != Z_STREAM_END) {
    -+		status = git_deflate(&s, Z_FINISH);
    -+		if (!s.avail_out || status == Z_STREAM_END) {
    -+			if (write_object) {
    -+				size_t written = s.next_out - obuf;
    -+
    -+				/* would we bust the size limit? */
    -+				if (state->nr_written &&
    -+				    pack_size_limit_cfg &&
    -+				    pack_size_limit_cfg < state->offset + written) {
    -+					git_deflate_abort(&s);
    -+					return -1;
    -+				}
    -+
    -+				hashwrite(state->f, obuf, written);
    -+				state->offset += written;
    -+			}
    -+			s.next_out = obuf;
    -+			s.avail_out = sizeof(obuf);
    -+		}
    -+
    -+		switch (status) {
    -+		case Z_OK:
    -+		case Z_BUF_ERROR:
    -+		case Z_STREAM_END:
    -+			continue;
    -+		default:
    -+			die("unexpected deflate failure: %d", status);
    -+		}
    -+	}
    -+	git_deflate_end(&s);
    -+	return 0;
    -+}
    -+
    - /*
    -  * Read the contents from fd for size bytes, streaming it to the
    -  * packfile in state while updating the hash in ctx. Signal a failure
     @@ bulk-checkin.c: static void finalize_checkpoint(struct bulk_checkin_packfile *state,
      	}
      }
    @@ bulk-checkin.c: static void finalize_checkpoint(struct bulk_checkin_packfile *st
     +{
     +	struct pack_idx_entry *idx = NULL;
     +	off_t already_hashed_to = 0;
    ++	struct bulk_checkin_source source = {
    ++		.type = SOURCE_INCORE,
    ++		.buf = buf,
    ++		.size = size,
    ++		.read = 0,
    ++		.path = path,
    ++	};
     +
     +	/* Note: idx is non-NULL when we are writing */
     +	if (flags & HASH_WRITE_OBJECT)
    @@ bulk-checkin.c: static void finalize_checkpoint(struct bulk_checkin_packfile *st
     +
     +	while (1) {
     +		prepare_checkpoint(state, checkpoint, idx, flags);
    -+		if (!stream_obj_to_pack_incore(state, ctx, &already_hashed_to,
    -+					       buf, size, type, path, flags))
    ++
    ++		if (!stream_obj_to_pack(state, ctx, &already_hashed_to, &source,
    ++					type, flags))
     +			break;
     +		truncate_checkpoint(state, checkpoint, idx);
    ++		bulk_checkin_source_seek_to(&source, 0);
     +	}
     +
     +	finalize_checkpoint(state, ctx, checkpoint, idx, result_oid);
 6:  57613807d8 =  9:  cba043ef14 bulk-checkin: introduce `index_tree_bulk_checkin_incore()`
 7:  f21400f56c = 10:  ae70508037 builtin/merge-tree.c: implement support for `--write-pack`
-- 
2.42.0.408.g97fac66ae4

^ permalink raw reply

* [PATCH v3 09/10] bulk-checkin: introduce `index_tree_bulk_checkin_incore()`
From: Taylor Blau @ 2023-10-18 17:08 UTC (permalink / raw)
  To: git
  Cc: Elijah Newren, Eric W. Biederman, Jeff King, Junio C Hamano,
	Patrick Steinhardt
In-Reply-To: <cover.1697648864.git.me@ttaylorr.com>

The remaining missing piece in order to teach the `merge-tree` builtin
how to write the contents of a merge into a pack is a function to index
tree objects into a bulk-checkin pack.

This patch implements that missing piece, which is a thin wrapper around
all of the functionality introduced in previous commits.

If and when Git gains support for a "compatibility" hash algorithm, the
changes to support that here will be minimal. The bulk-checkin machinery
will need to convert the incoming tree to compute its length under the
compatibility hash, necessary to reconstruct its header. With that
information (and the converted contents of the tree), the bulk-checkin
machinery will have enough to keep track of the converted object's hash
in order to update the compatibility mapping.

Within `deflate_tree_to_pack_incore()`, the changes should be limited
to something like:

    struct strbuf converted = STRBUF_INIT;
    if (the_repository->compat_hash_algo) {
      if (convert_object_file(&compat_obj,
                              the_repository->hash_algo,
                              the_repository->compat_hash_algo, ...) < 0)
        die(...);

      format_object_header_hash(the_repository->compat_hash_algo,
                                OBJ_TREE, size);
    }
    /* compute the converted tree's hash using the compat algorithm */
    strbuf_release(&converted);

, assuming related changes throughout the rest of the bulk-checkin
machinery necessary to update the hash of the converted object, which
are likewise minimal in size.

Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 bulk-checkin.c | 27 +++++++++++++++++++++++++++
 bulk-checkin.h |  4 ++++
 2 files changed, 31 insertions(+)

diff --git a/bulk-checkin.c b/bulk-checkin.c
index 9ae43648ba..d088a9c10b 100644
--- a/bulk-checkin.c
+++ b/bulk-checkin.c
@@ -423,6 +423,22 @@ static int deflate_blob_to_pack_incore(struct bulk_checkin_packfile *state,
 						   OBJ_BLOB, path, flags);
 }
 
+static int deflate_tree_to_pack_incore(struct bulk_checkin_packfile *state,
+				       struct object_id *result_oid,
+				       const void *buf, size_t size,
+				       const char *path, unsigned flags)
+{
+	git_hash_ctx ctx;
+	struct hashfile_checkpoint checkpoint = {0};
+
+	format_object_header_hash(the_hash_algo, &ctx, &checkpoint, OBJ_TREE,
+				  size);
+
+	return deflate_obj_contents_to_pack_incore(state, &ctx, &checkpoint,
+						   result_oid, buf, size,
+						   OBJ_TREE, path, flags);
+}
+
 static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
 				struct object_id *result_oid,
 				int fd, size_t size,
@@ -520,6 +536,17 @@ int index_blob_bulk_checkin_incore(struct object_id *oid,
 	return status;
 }
 
+int index_tree_bulk_checkin_incore(struct object_id *oid,
+				   const void *buf, size_t size,
+				   const char *path, unsigned flags)
+{
+	int status = deflate_tree_to_pack_incore(&bulk_checkin_packfile, oid,
+						 buf, size, path, flags);
+	if (!odb_transaction_nesting)
+		flush_bulk_checkin_packfile(&bulk_checkin_packfile);
+	return status;
+}
+
 void begin_odb_transaction(void)
 {
 	odb_transaction_nesting += 1;
diff --git a/bulk-checkin.h b/bulk-checkin.h
index 1b91daeaee..89786b3954 100644
--- a/bulk-checkin.h
+++ b/bulk-checkin.h
@@ -17,6 +17,10 @@ int index_blob_bulk_checkin_incore(struct object_id *oid,
 				   const void *buf, size_t size,
 				   const char *path, unsigned flags);
 
+int index_tree_bulk_checkin_incore(struct object_id *oid,
+				   const void *buf, size_t size,
+				   const char *path, unsigned flags);
+
 /*
  * Tell the object database to optimize for adding
  * multiple objects. end_odb_transaction must be called
-- 
2.42.0.408.g97fac66ae4


^ permalink raw reply related

* [PATCH v3 01/10] bulk-checkin: factor out `format_object_header_hash()`
From: Taylor Blau @ 2023-10-18 17:07 UTC (permalink / raw)
  To: git
  Cc: Elijah Newren, Eric W. Biederman, Jeff King, Junio C Hamano,
	Patrick Steinhardt
In-Reply-To: <cover.1697648864.git.me@ttaylorr.com>

Before deflating a blob into a pack, the bulk-checkin mechanism prepares
the pack object header by calling `format_object_header()`, and writing
into a scratch buffer, the contents of which eventually makes its way
into the pack.

Future commits will add support for deflating multiple kinds of objects
into a pack, and will likewise need to perform a similar operation as
below.

This is a mostly straightforward extraction, with one notable exception.
Instead of hard-coding `the_hash_algo`, pass it in to the new function
as an argument. This isn't strictly necessary for our immediate purposes
here, but will prove useful in the future if/when the bulk-checkin
mechanism grows support for the hash transition plan.

Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 bulk-checkin.c | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/bulk-checkin.c b/bulk-checkin.c
index 6ce62999e5..fd3c110d1c 100644
--- a/bulk-checkin.c
+++ b/bulk-checkin.c
@@ -247,6 +247,22 @@ static void prepare_to_stream(struct bulk_checkin_packfile *state,
 		die_errno("unable to write pack header");
 }
 
+static void format_object_header_hash(const struct git_hash_algo *algop,
+				      git_hash_ctx *ctx,
+				      struct hashfile_checkpoint *checkpoint,
+				      enum object_type type,
+				      size_t size)
+{
+	unsigned char header[16384];
+	unsigned header_len = format_object_header((char *)header,
+						   sizeof(header),
+						   type, size);
+
+	algop->init_fn(ctx);
+	algop->update_fn(ctx, header, header_len);
+	algop->init_fn(&checkpoint->ctx);
+}
+
 static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
 				struct object_id *result_oid,
 				int fd, size_t size,
@@ -254,8 +270,6 @@ static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
 {
 	off_t seekback, already_hashed_to;
 	git_hash_ctx ctx;
-	unsigned char obuf[16384];
-	unsigned header_len;
 	struct hashfile_checkpoint checkpoint = {0};
 	struct pack_idx_entry *idx = NULL;
 
@@ -263,11 +277,8 @@ static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
 	if (seekback == (off_t) -1)
 		return error("cannot find the current offset");
 
-	header_len = format_object_header((char *)obuf, sizeof(obuf),
-					  OBJ_BLOB, size);
-	the_hash_algo->init_fn(&ctx);
-	the_hash_algo->update_fn(&ctx, obuf, header_len);
-	the_hash_algo->init_fn(&checkpoint.ctx);
+	format_object_header_hash(the_hash_algo, &ctx, &checkpoint, OBJ_BLOB,
+				  size);
 
 	/* Note: idx is non-NULL when we are writing */
 	if ((flags & HASH_WRITE_OBJECT) != 0)
-- 
2.42.0.408.g97fac66ae4


^ permalink raw reply related

* [PATCH v3 02/10] bulk-checkin: factor out `prepare_checkpoint()`
From: Taylor Blau @ 2023-10-18 17:07 UTC (permalink / raw)
  To: git
  Cc: Elijah Newren, Eric W. Biederman, Jeff King, Junio C Hamano,
	Patrick Steinhardt
In-Reply-To: <cover.1697648864.git.me@ttaylorr.com>

In a similar spirit as the previous commit, factor out the routine to
prepare streaming into a bulk-checkin pack into its own function. Unlike
the previous patch, this is a verbatim copy and paste.

Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 bulk-checkin.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/bulk-checkin.c b/bulk-checkin.c
index fd3c110d1c..c1f5450583 100644
--- a/bulk-checkin.c
+++ b/bulk-checkin.c
@@ -263,6 +263,19 @@ static void format_object_header_hash(const struct git_hash_algo *algop,
 	algop->init_fn(&checkpoint->ctx);
 }
 
+static void prepare_checkpoint(struct bulk_checkin_packfile *state,
+			       struct hashfile_checkpoint *checkpoint,
+			       struct pack_idx_entry *idx,
+			       unsigned flags)
+{
+	prepare_to_stream(state, flags);
+	if (idx) {
+		hashfile_checkpoint(state->f, checkpoint);
+		idx->offset = state->offset;
+		crc32_begin(state->f);
+	}
+}
+
 static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
 				struct object_id *result_oid,
 				int fd, size_t size,
@@ -287,12 +300,7 @@ static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
 	already_hashed_to = 0;
 
 	while (1) {
-		prepare_to_stream(state, flags);
-		if (idx) {
-			hashfile_checkpoint(state->f, &checkpoint);
-			idx->offset = state->offset;
-			crc32_begin(state->f);
-		}
+		prepare_checkpoint(state, &checkpoint, idx, flags);
 		if (!stream_blob_to_pack(state, &ctx, &already_hashed_to,
 					 fd, size, path, flags))
 			break;
-- 
2.42.0.408.g97fac66ae4


^ permalink raw reply related

* [PATCH v3 04/10] bulk-checkin: factor out `finalize_checkpoint()`
From: Taylor Blau @ 2023-10-18 17:07 UTC (permalink / raw)
  To: git
  Cc: Elijah Newren, Eric W. Biederman, Jeff King, Junio C Hamano,
	Patrick Steinhardt
In-Reply-To: <cover.1697648864.git.me@ttaylorr.com>

In a similar spirit as previous commits, factor out the routine to
finalize the just-written object from the bulk-checkin mechanism.

Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 bulk-checkin.c | 41 +++++++++++++++++++++++++----------------
 1 file changed, 25 insertions(+), 16 deletions(-)

diff --git a/bulk-checkin.c b/bulk-checkin.c
index b92d7a6f5a..f4914fb6d1 100644
--- a/bulk-checkin.c
+++ b/bulk-checkin.c
@@ -292,6 +292,30 @@ static void truncate_checkpoint(struct bulk_checkin_packfile *state,
 	flush_bulk_checkin_packfile(state);
 }
 
+static void finalize_checkpoint(struct bulk_checkin_packfile *state,
+				git_hash_ctx *ctx,
+				struct hashfile_checkpoint *checkpoint,
+				struct pack_idx_entry *idx,
+				struct object_id *result_oid)
+{
+	the_hash_algo->final_oid_fn(result_oid, ctx);
+	if (!idx)
+		return;
+
+	idx->crc32 = crc32_end(state->f);
+	if (already_written(state, result_oid)) {
+		hashfile_truncate(state->f, checkpoint);
+		state->offset = checkpoint->offset;
+		free(idx);
+	} else {
+		oidcpy(&idx->oid, result_oid);
+		ALLOC_GROW(state->written,
+			   state->nr_written + 1,
+			   state->alloc_written);
+		state->written[state->nr_written++] = idx;
+	}
+}
+
 static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
 				struct object_id *result_oid,
 				int fd, size_t size,
@@ -324,22 +348,7 @@ static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
 		if (lseek(fd, seekback, SEEK_SET) == (off_t) -1)
 			return error("cannot seek back");
 	}
-	the_hash_algo->final_oid_fn(result_oid, &ctx);
-	if (!idx)
-		return 0;
-
-	idx->crc32 = crc32_end(state->f);
-	if (already_written(state, result_oid)) {
-		hashfile_truncate(state->f, &checkpoint);
-		state->offset = checkpoint.offset;
-		free(idx);
-	} else {
-		oidcpy(&idx->oid, result_oid);
-		ALLOC_GROW(state->written,
-			   state->nr_written + 1,
-			   state->alloc_written);
-		state->written[state->nr_written++] = idx;
-	}
+	finalize_checkpoint(state, &ctx, &checkpoint, idx, result_oid);
 	return 0;
 }
 
-- 
2.42.0.408.g97fac66ae4


^ permalink raw reply related

* [PATCH v3 05/10] bulk-checkin: extract abstract `bulk_checkin_source`
From: Taylor Blau @ 2023-10-18 17:08 UTC (permalink / raw)
  To: git
  Cc: Elijah Newren, Eric W. Biederman, Jeff King, Junio C Hamano,
	Patrick Steinhardt
In-Reply-To: <cover.1697648864.git.me@ttaylorr.com>

A future commit will want to implement a very similar routine as in
`stream_blob_to_pack()` with two notable changes:

  - Instead of streaming just OBJ_BLOBs, this new function may want to
    stream objects of arbitrary type.

  - Instead of streaming the object's contents from an open
    file-descriptor, this new function may want to "stream" its contents
    from memory.

To avoid duplicating a significant chunk of code between the existing
`stream_blob_to_pack()`, extract an abstract `bulk_checkin_source`. This
concept currently is a thin layer of `lseek()` and `read_in_full()`, but
will grow to understand how to perform analogous operations when writing
out an object's contents from memory.

Suggested-by: Junio C Hamano <gitster@pobox.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 bulk-checkin.c | 61 +++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 53 insertions(+), 8 deletions(-)

diff --git a/bulk-checkin.c b/bulk-checkin.c
index f4914fb6d1..fc1d902018 100644
--- a/bulk-checkin.c
+++ b/bulk-checkin.c
@@ -140,8 +140,41 @@ static int already_written(struct bulk_checkin_packfile *state, struct object_id
 	return 0;
 }
 
+struct bulk_checkin_source {
+	enum { SOURCE_FILE } type;
+
+	/* SOURCE_FILE fields */
+	int fd;
+
+	/* common fields */
+	size_t size;
+	const char *path;
+};
+
+static off_t bulk_checkin_source_seek_to(struct bulk_checkin_source *source,
+					 off_t offset)
+{
+	switch (source->type) {
+	case SOURCE_FILE:
+		return lseek(source->fd, offset, SEEK_SET);
+	default:
+		BUG("unknown bulk-checkin source: %d", source->type);
+	}
+}
+
+static ssize_t bulk_checkin_source_read(struct bulk_checkin_source *source,
+					void *buf, size_t nr)
+{
+	switch (source->type) {
+	case SOURCE_FILE:
+		return read_in_full(source->fd, buf, nr);
+	default:
+		BUG("unknown bulk-checkin source: %d", source->type);
+	}
+}
+
 /*
- * Read the contents from fd for size bytes, streaming it to the
+ * Read the contents from 'source' for 'size' bytes, streaming it to the
  * packfile in state while updating the hash in ctx. Signal a failure
  * by returning a negative value when the resulting pack would exceed
  * the pack size limit and this is not the first object in the pack,
@@ -157,7 +190,7 @@ static int already_written(struct bulk_checkin_packfile *state, struct object_id
  */
 static int stream_blob_to_pack(struct bulk_checkin_packfile *state,
 			       git_hash_ctx *ctx, off_t *already_hashed_to,
-			       int fd, size_t size, const char *path,
+			       struct bulk_checkin_source *source,
 			       unsigned flags)
 {
 	git_zstream s;
@@ -167,22 +200,28 @@ static int stream_blob_to_pack(struct bulk_checkin_packfile *state,
 	int status = Z_OK;
 	int write_object = (flags & HASH_WRITE_OBJECT);
 	off_t offset = 0;
+	size_t size = source->size;
 
 	git_deflate_init(&s, pack_compression_level);
 
-	hdrlen = encode_in_pack_object_header(obuf, sizeof(obuf), OBJ_BLOB, size);
+	hdrlen = encode_in_pack_object_header(obuf, sizeof(obuf), OBJ_BLOB,
+					      size);
 	s.next_out = obuf + hdrlen;
 	s.avail_out = sizeof(obuf) - hdrlen;
 
 	while (status != Z_STREAM_END) {
 		if (size && !s.avail_in) {
 			ssize_t rsize = size < sizeof(ibuf) ? size : sizeof(ibuf);
-			ssize_t read_result = read_in_full(fd, ibuf, rsize);
+			ssize_t read_result;
+
+			read_result = bulk_checkin_source_read(source, ibuf,
+							       rsize);
 			if (read_result < 0)
-				die_errno("failed to read from '%s'", path);
+				die_errno("failed to read from '%s'",
+					  source->path);
 			if (read_result != rsize)
 				die("failed to read %d bytes from '%s'",
-				    (int)rsize, path);
+				    (int)rsize, source->path);
 			offset += rsize;
 			if (*already_hashed_to < offset) {
 				size_t hsize = offset - *already_hashed_to;
@@ -325,6 +364,12 @@ static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
 	git_hash_ctx ctx;
 	struct hashfile_checkpoint checkpoint = {0};
 	struct pack_idx_entry *idx = NULL;
+	struct bulk_checkin_source source = {
+		.type = SOURCE_FILE,
+		.fd = fd,
+		.size = size,
+		.path = path,
+	};
 
 	seekback = lseek(fd, 0, SEEK_CUR);
 	if (seekback == (off_t) -1)
@@ -342,10 +387,10 @@ static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
 	while (1) {
 		prepare_checkpoint(state, &checkpoint, idx, flags);
 		if (!stream_blob_to_pack(state, &ctx, &already_hashed_to,
-					 fd, size, path, flags))
+					 &source, flags))
 			break;
 		truncate_checkpoint(state, &checkpoint, idx);
-		if (lseek(fd, seekback, SEEK_SET) == (off_t) -1)
+		if (bulk_checkin_source_seek_to(&source, seekback) == (off_t)-1)
 			return error("cannot seek back");
 	}
 	finalize_checkpoint(state, &ctx, &checkpoint, idx, result_oid);
-- 
2.42.0.408.g97fac66ae4


^ permalink raw reply related

* [PATCH v3 06/10] bulk-checkin: implement `SOURCE_INCORE` mode for `bulk_checkin_source`
From: Taylor Blau @ 2023-10-18 17:08 UTC (permalink / raw)
  To: git
  Cc: Elijah Newren, Eric W. Biederman, Jeff King, Junio C Hamano,
	Patrick Steinhardt
In-Reply-To: <cover.1697648864.git.me@ttaylorr.com>

Continue to prepare for streaming an object's contents directly from
memory by teaching `bulk_checkin_source` how to perform reads and seeks
based on an address in memory.

Unlike file descriptors, which manage their own offset internally, we
have to keep track of how many bytes we've read out of the buffer, and
make sure we don't read past the end of the buffer.

Suggested-by: Junio C Hamano <gitster@pobox.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 bulk-checkin.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/bulk-checkin.c b/bulk-checkin.c
index fc1d902018..133e02ce36 100644
--- a/bulk-checkin.c
+++ b/bulk-checkin.c
@@ -141,11 +141,15 @@ static int already_written(struct bulk_checkin_packfile *state, struct object_id
 }
 
 struct bulk_checkin_source {
-	enum { SOURCE_FILE } type;
+	enum { SOURCE_FILE, SOURCE_INCORE } type;
 
 	/* SOURCE_FILE fields */
 	int fd;
 
+	/* SOURCE_INCORE fields */
+	const void *buf;
+	size_t read;
+
 	/* common fields */
 	size_t size;
 	const char *path;
@@ -157,6 +161,11 @@ static off_t bulk_checkin_source_seek_to(struct bulk_checkin_source *source,
 	switch (source->type) {
 	case SOURCE_FILE:
 		return lseek(source->fd, offset, SEEK_SET);
+	case SOURCE_INCORE:
+		if (!(0 <= offset && offset < source->size))
+			return (off_t)-1;
+		source->read = offset;
+		return source->read;
 	default:
 		BUG("unknown bulk-checkin source: %d", source->type);
 	}
@@ -168,6 +177,13 @@ static ssize_t bulk_checkin_source_read(struct bulk_checkin_source *source,
 	switch (source->type) {
 	case SOURCE_FILE:
 		return read_in_full(source->fd, buf, nr);
+	case SOURCE_INCORE:
+		assert(source->read <= source->size);
+		if (nr > source->size - source->read)
+			nr = source->size - source->read;
+		memcpy(buf, (unsigned char *)source->buf + source->read, nr);
+		source->read += nr;
+		return nr;
 	default:
 		BUG("unknown bulk-checkin source: %d", source->type);
 	}
-- 
2.42.0.408.g97fac66ae4


^ permalink raw reply related

* Re: [PATCH v3 05/17] t/helper/test-read-graph.c: extract `dump_graph_info()`
From: Taylor Blau @ 2023-10-18 17:37 UTC (permalink / raw)
  To: Patrick Steinhardt
  Cc: git, Jonathan Tan, Junio C Hamano, Jeff King, SZEDER Gábor
In-Reply-To: <ZS5JnkybxvetTUzu@tanuki>

On Tue, Oct 17, 2023 at 10:45:18AM +0200, Patrick Steinhardt wrote:
> On Tue, Oct 10, 2023 at 04:33:33PM -0400, Taylor Blau wrote:
> > Prepare for the 'read-graph' test helper to perform other tasks besides
> > dumping high-level information about the commit-graph by extracting its
> > main routine into a separate function.
> >
> > Signed-off-by: Taylor Blau <me@ttaylorr.com>
> > Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
> > Signed-off-by: Junio C Hamano <gitster@pobox.com>
> > Signed-off-by: Taylor Blau <me@ttaylorr.com>
>
> Nit: your signoff is duplicated here. This is also still the case for
> some of the other commits.

Yeah, this is an artifact of having tossed these patches back and forth
(originally Jonathan sent some of these, then I sent another round, then
Jonathan, now me again). It's a little verbose, but accurately tracks
the DCO across multiple rounds.

Thanks,
Taylor

^ permalink raw reply

* Re: [PATCH v3 08/17] t4216: test changed path filters with high bit paths
From: Taylor Blau @ 2023-10-18 17:41 UTC (permalink / raw)
  To: Patrick Steinhardt
  Cc: git, Jonathan Tan, Junio C Hamano, Jeff King, SZEDER Gábor
In-Reply-To: <ZS5JmQib3onjirC6@tanuki>

On Tue, Oct 17, 2023 at 10:45:13AM +0200, Patrick Steinhardt wrote:
> > +test_expect_success 'setup check value of version 1 changed-path' '
> > +	(
> > +		cd highbit1 &&
> > +		echo "52a9" >expect &&
> > +		get_first_changed_path_filter >actual &&
> > +		test_cmp expect actual
> > +	)
> > +'
> > +
> > +# expect will not match actual if char is unsigned by default. Write the test
> > +# in this way, so that a user running this test script can still see if the two
> > +# files match. (It will appear as an ordinary success if they match, and a skip
> > +# if not.)
> > +if test_cmp highbit1/expect highbit1/actual
> > +then
> > +	test_set_prereq SIGNED_CHAR_BY_DEFAULT
> > +fi
> > +test_expect_success SIGNED_CHAR_BY_DEFAULT 'check value of version 1 changed-path' '
> > +	# Only the prereq matters for this test.
> > +	true
> > +'
>
> Doesn't this mean that the preceding test where we `test_cmp expect
> actual` can fail on some platforms depending on the signedness of
> `char`?

Great catch, I am surprised this slipped by in earlier rounds. This
should do the trick, since we don't actually care about conditioning
that test's passing on test_cmp coming up clean. We check that in the if
statement you pointed out here, so:

--- 8< ---
diff --git a/t/t4216-log-bloom.sh b/t/t4216-log-bloom.sh
index 114672e904..400dce2193 100755
--- a/t/t4216-log-bloom.sh
+++ b/t/t4216-log-bloom.sh
@@ -502,8 +502,7 @@ test_expect_success 'setup check value of version 1 changed-path' '
 	(
 		cd highbit1 &&
 		echo "52a9" >expect &&
-		get_first_changed_path_filter >actual &&
-		test_cmp expect actual
+		get_first_changed_path_filter >actual
 	)
 '
--- >8 ---

Thanks,
Taylor

^ permalink raw reply related

* Re: [PATCH v3 10/17] commit-graph: new filter ver. that fixes murmur3
From: Taylor Blau @ 2023-10-18 17:46 UTC (permalink / raw)
  To: Patrick Steinhardt
  Cc: git, Jonathan Tan, Junio C Hamano, Jeff King, SZEDER Gábor
In-Reply-To: <ZS5JpJCw-MY256xo@tanuki>

On Tue, Oct 17, 2023 at 10:45:24AM +0200, Patrick Steinhardt wrote:
> > @@ -314,17 +314,26 @@ static int graph_read_oid_lookup(const unsigned char *chunk_start,
> >  	return 0;
> >  }
> >
> > +struct graph_read_bloom_data_context {
> > +	struct commit_graph *g;
> > +	int *commit_graph_changed_paths_version;
> > +};
> > +
> >  static int graph_read_bloom_data(const unsigned char *chunk_start,
> >  				  size_t chunk_size, void *data)
> >  {
> > -	struct commit_graph *g = data;
> > +	struct graph_read_bloom_data_context *c = data;
> > +	struct commit_graph *g = c->g;
> >  	uint32_t hash_version;
> > -	g->chunk_bloom_data = chunk_start;
> >  	hash_version = get_be32(chunk_start);
> >
> > -	if (hash_version != 1)
> > +	if (*c->commit_graph_changed_paths_version == -1) {
> > +		*c->commit_graph_changed_paths_version = hash_version;
> > +	} else if (hash_version != *c->commit_graph_changed_paths_version) {
> >  		return 0;
> > +	}
>
> In case we have `c->commit_graph_changed_paths_version == -1` we lose
> the check that the hash version is something that we know and support,
> don't we? And while we do start to handle `-1` in the writing path, I
> think we don't in the reading path unless I missed something.

We don't have to deal with c->commit_graph_changed_paths_version being
-1 here, since we normalize it when reading the BDAT chunk. See
commit-graph.c::graph_read_bloom_data(), particularly:

    if (*c->commit_graph_changed_paths_version == -1)
        *c->commit_graph_changed_paths_version = hash_version;
    else if (hash_version != *c->commit_graph_changed_paths_version)
        return 0;

> > +test_expect_success 'when writing commit graph, do not reuse changed-path of another version' '
> > +	git init doublewrite &&
> > +	test_commit -C doublewrite c "$CENT" &&
> > +	git -C doublewrite config --add commitgraph.changedPathsVersion 1 &&
> > +	git -C doublewrite commit-graph write --reachable --changed-paths &&
> > +	git -C doublewrite config --add commitgraph.changedPathsVersion 2 &&
> > +	git -C doublewrite commit-graph write --reachable --changed-paths &&
> > +	(
> > +		cd doublewrite &&
> > +		echo "c01f" >expect &&
> > +		get_first_changed_path_filter >actual &&
> > +		test_cmp expect actual
> > +	)
> > +'
> > +
>
> With the supposedly missing check in mind, should we also add tests for
> currently unknown versions like 3 or -2?

Good idea, I'll update the test to reflect.

Thanks,
Taylor

^ permalink raw reply

* Re: [PATCH v3 00/17] bloom: changed-path Bloom filters v2 (& sundries)
From: Taylor Blau @ 2023-10-18 17:47 UTC (permalink / raw)
  To: Patrick Steinhardt
  Cc: git, Jonathan Tan, Junio C Hamano, Jeff King, SZEDER Gábor
In-Reply-To: <ZS5JsDKk8RioQfOA@tanuki>

On Tue, Oct 17, 2023 at 10:45:36AM +0200, Patrick Steinhardt wrote:
> > Thanks to Jonathan, Peff, and SZEDER who have helped a great deal in
> > assembling these patches. As usual, a range-diff is included below.
> > Thanks in advance for your
> > review!
>
> As this patch series has been sitting around without reviews for a week
> I've tried my best to give it a go. Note though that this area is mostly
> outside of my own comfort zone, so some of the questions and suggestions
> might ultimately not apply.

Thanks for giving it a look! I generated a few small tweaks on top of
what I already had here based on your review, so I'll send a reroll
shortly.

Thanks,
Taylor

^ permalink raw reply

* Re: [Outreachy][PATCH] branch.c: adjust error messages to coding guidelines
From: Rubén Justo @ 2023-10-18 18:19 UTC (permalink / raw)
  To: Isoken June Ibizugbe, git; +Cc: christian.couder, Junio C Hamano
In-Reply-To: <20231018051223.13955-1-isokenjune@gmail.com>

On 18-oct-2023 06:12:22, Isoken June Ibizugbe wrote:

> Signed-off-by: Isoken June Ibizugbe <isokenjune@gmail.com>
> ---
>  builtin/branch.c | 66 ++++++++++++++++++++++++------------------------
>  1 file changed, 33 insertions(+), 33 deletions(-)

Only builtin/branch.c is touched.

The changes in this patch break some tests, therefore this patch must
also include the fixes for those tests.

> @@ -965,11 +965,11 @@ int cmd_branch(int argc, const char **argv, const char *prefix)
>  		const char *start_name = argc == 2 ? argv[1] : head;
>  
>  		if (filter.kind != FILTER_REFS_BRANCHES)
> -			die(_("The -a, and -r, options to 'git branch' do not take a branch name.\n"
> +			die(_("the -a, and -r, options to 'git branch' do not take a branch name\n"
>  				  "Did you mean to use: -a|-r --list <pattern>?"));

OK.  The initial 'T' is fixed, but as Junio explained [1], the full stop
must stay.


Thanks.

 [1] https://lore.kernel.org/git/xmqqttqxkmaq.fsf@gitster.g/

^ permalink raw reply

* [PATCH v4 00/17] bloom: changed-path Bloom filters v2 (& sundries)
From: Taylor Blau @ 2023-10-18 18:32 UTC (permalink / raw)
  To: git
  Cc: Elijah Newren, Eric W. Biederman, Jeff King, Junio C Hamano,
	Patrick Steinhardt
In-Reply-To: <cover.1696629697.git.me@ttaylorr.com>

(Rebased onto the tip of 'master', which is 3a06386e31 (The fifteenth
batch, 2023-10-04), at the time of writing).

This series is a reroll of the combined efforts of [1] and [2] to
introduce the v2 changed-path Bloom filters, which fixes a bug in our
existing implementation of murmur3 paths with non-ASCII characters (when
the "char" type is signed).

In large part, this is the same as the previous round. But this round
includes some extra bits that address issues pointed out by SZEDER
Gábor, which are:

  - not reading Bloom filters for root commits
  - corrupting Bloom filter reads by tweaking the filter settings
    between layers.

These issues were discussed in (among other places) [3], and [4],
respectively.

Thanks to Jonathan, Peff, and SZEDER who have helped a great deal in
assembling these patches. As usual, a range-diff is included below.
Thanks in advance for your
review!

[1]: https://lore.kernel.org/git/cover.1684790529.git.jonathantanmy@google.com/
[2]: https://lore.kernel.org/git/cover.1691426160.git.me@ttaylorr.com/
[3]: https://public-inbox.org/git/20201015132147.GB24954@szeder.dev/
[4]: https://lore.kernel.org/git/20230830200218.GA5147@szeder.dev/

Jonathan Tan (4):
  gitformat-commit-graph: describe version 2 of BDAT
  t4216: test changed path filters with high bit paths
  repo-settings: introduce commitgraph.changedPathsVersion
  commit-graph: new filter ver. that fixes murmur3

Taylor Blau (13):
  t/t4216-log-bloom.sh: harden `test_bloom_filters_not_used()`
  revision.c: consult Bloom filters for root commits
  commit-graph: ensure Bloom filters are read with consistent settings
  t/helper/test-read-graph.c: extract `dump_graph_info()`
  bloom.h: make `load_bloom_filter_from_graph()` public
  t/helper/test-read-graph: implement `bloom-filters` mode
  bloom: annotate filters with hash version
  bloom: prepare to discard incompatible Bloom filters
  commit-graph.c: unconditionally load Bloom filters
  commit-graph: drop unnecessary `graph_read_bloom_data_context`
  object.h: fix mis-aligned flag bits table
  commit-graph: reuse existing Bloom filters where possible
  bloom: introduce `deinit_bloom_filters()`

 Documentation/config/commitgraph.txt     |  26 ++-
 Documentation/gitformat-commit-graph.txt |   9 +-
 bloom.c                                  | 208 ++++++++++++++++-
 bloom.h                                  |  38 ++-
 commit-graph.c                           |  61 ++++-
 object.h                                 |   3 +-
 oss-fuzz/fuzz-commit-graph.c             |   2 +-
 repo-settings.c                          |   6 +-
 repository.h                             |   2 +-
 revision.c                               |  26 ++-
 t/helper/test-bloom.c                    |   9 +-
 t/helper/test-read-graph.c               |  67 ++++--
 t/t0095-bloom.sh                         |   8 +
 t/t4216-log-bloom.sh                     | 282 ++++++++++++++++++++++-
 14 files changed, 692 insertions(+), 55 deletions(-)

Range-diff against v3:
 1:  fe671d616c =  1:  e0fc51c3fb t/t4216-log-bloom.sh: harden `test_bloom_filters_not_used()`
 2:  7d0fa93543 =  2:  87b09e6266 revision.c: consult Bloom filters for root commits
 3:  2ecc0a2d58 !  3:  46d8a41005 commit-graph: ensure Bloom filters are read with consistent settings
    @@ t/t4216-log-bloom.sh: test_expect_success 'Bloom generation backfills empty comm
     +	done
     +'
     +
    -+test_expect_success 'split' '
    ++test_expect_success 'ensure incompatible Bloom filters are ignored' '
     +	# Compute Bloom filters with "unusual" settings.
     +	git -C $repo rev-parse one >in &&
     +	GIT_TEST_BLOOM_SETTINGS_NUM_HASHES=3 git -C $repo commit-graph write \
    @@ t/t4216-log-bloom.sh: test_expect_success 'Bloom generation backfills empty comm
     +
     +test_expect_success 'merge graph layers with incompatible Bloom settings' '
     +	# Ensure that incompatible Bloom filters are ignored when
    -+	# generating new layers.
    ++	# merging existing layers.
     +	git -C $repo commit-graph write --reachable --changed-paths 2>err &&
     +	grep "disabling Bloom filters for commit-graph layer .$layer." err &&
     +
     +	test_path_is_file $repo/$graph &&
     +	test_dir_is_empty $repo/$graphdir &&
     +
    -+	# ...and merging existing ones.
    -+	git -C $repo -c core.commitGraph=false log --oneline --no-decorate -- file \
    -+		>expect 2>err &&
    -+	GIT_TRACE2_PERF="$(pwd)/trace.perf" \
    ++	git -C $repo -c core.commitGraph=false log --oneline --no-decorate -- \
    ++		file >expect &&
    ++	trace_out="$(pwd)/trace.perf" &&
    ++	GIT_TRACE2_PERF="$trace_out" \
     +		git -C $repo log --oneline --no-decorate -- file >actual 2>err &&
     +
    -+	test_cmp expect actual && cat err &&
    -+	grep "statistics:{\"filter_not_present\":0" trace.perf &&
    -+	! grep "disabling Bloom filters" err
    ++	test_cmp expect actual &&
    ++	grep "statistics:{\"filter_not_present\":0," trace.perf &&
    ++	test_must_be_empty err
     +'
     +
      test_done
 4:  17703ed89a =  4:  4d0190a992 gitformat-commit-graph: describe version 2 of BDAT
 5:  94552abf45 =  5:  3c2057c11c t/helper/test-read-graph.c: extract `dump_graph_info()`
 6:  3d81efa27b =  6:  e002e35004 bloom.h: make `load_bloom_filter_from_graph()` public
 7:  d23cd89037 =  7:  c7016f51cd t/helper/test-read-graph: implement `bloom-filters` mode
 8:  cba766f224 !  8:  cef2aac8ba t4216: test changed path filters with high bit paths
    @@ Commit message
     
      ## t/t4216-log-bloom.sh ##
     @@ t/t4216-log-bloom.sh: test_expect_success 'merge graph layers with incompatible Bloom settings' '
    - 	! grep "disabling Bloom filters" err
    + 	test_must_be_empty err
      '
      
     +get_first_changed_path_filter () {
    @@ t/t4216-log-bloom.sh: test_expect_success 'merge graph layers with incompatible
     +	(
     +		cd highbit1 &&
     +		echo "52a9" >expect &&
    -+		get_first_changed_path_filter >actual &&
    -+		test_cmp expect actual
    ++		get_first_changed_path_filter >actual
     +	)
     +'
     +
 9:  a08a961f41 =  9:  36d4e2202e repo-settings: introduce commitgraph.changedPathsVersion
10:  61d44519a5 ! 10:  f6ab427ead commit-graph: new filter ver. that fixes murmur3
    @@ t/t4216-log-bloom.sh: test_expect_success 'version 1 changed-path used when vers
     +	test_commit -C doublewrite c "$CENT" &&
     +	git -C doublewrite config --add commitgraph.changedPathsVersion 1 &&
     +	git -C doublewrite commit-graph write --reachable --changed-paths &&
    ++	for v in -2 3
    ++	do
    ++		git -C doublewrite config --add commitgraph.changedPathsVersion $v &&
    ++		git -C doublewrite commit-graph write --reachable --changed-paths 2>err &&
    ++		cat >expect <<-EOF &&
    ++		warning: attempting to write a commit-graph, but ${SQ}commitgraph.changedPathsVersion${SQ} ($v) is not supported
    ++		EOF
    ++		test_cmp expect err || return 1
    ++	done &&
     +	git -C doublewrite config --add commitgraph.changedPathsVersion 2 &&
     +	git -C doublewrite commit-graph write --reachable --changed-paths &&
     +	(
11:  a8c10f8de8 = 11:  dc69b28329 bloom: annotate filters with hash version
12:  2ba10a4b4b = 12:  85dbdc4ed2 bloom: prepare to discard incompatible Bloom filters
13:  09d8669c3a = 13:  3ff669a622 commit-graph.c: unconditionally load Bloom filters
14:  0d4f9dc4ee = 14:  1c78e3d178 commit-graph: drop unnecessary `graph_read_bloom_data_context`
15:  1f7f27bc47 = 15:  a289514faa object.h: fix mis-aligned flag bits table
16:  abbef95ae8 ! 16:  6a12e39e7f commit-graph: reuse existing Bloom filters where possible
    @@ t/t4216-log-bloom.sh: test_expect_success 'when writing another commit graph, pr
      	test_commit -C doublewrite c "$CENT" &&
     +
      	git -C doublewrite config --add commitgraph.changedPathsVersion 1 &&
    --	git -C doublewrite commit-graph write --reachable --changed-paths &&
     +	GIT_TRACE2_EVENT="$(pwd)/trace2.txt" \
     +		git -C doublewrite commit-graph write --reachable --changed-paths &&
     +	test_filter_computed 1 trace2.txt &&
     +	test_filter_upgraded 0 trace2.txt &&
    ++
    + 	git -C doublewrite commit-graph write --reachable --changed-paths &&
    + 	for v in -2 3
    + 	do
    +@@ t/t4216-log-bloom.sh: test_expect_success 'when writing commit graph, do not reuse changed-path of ano
    + 		EOF
    + 		test_cmp expect err || return 1
    + 	done &&
     +
      	git -C doublewrite config --add commitgraph.changedPathsVersion 2 &&
     -	git -C doublewrite commit-graph write --reachable --changed-paths &&
17:  ca362408d5 ! 17:  8942f205c8 bloom: introduce `deinit_bloom_filters()`
    @@ bloom.h: void add_key_to_filter(const struct bloom_key *key,
      	BLOOM_NOT_COMPUTED = (1 << 0),
     
      ## commit-graph.c ##
    -@@ commit-graph.c: static void close_commit_graph_one(struct commit_graph *g)
    +@@ commit-graph.c: struct bloom_filter_settings *get_bloom_filter_settings(struct repository *r)
      void close_commit_graph(struct raw_object_store *o)
      {
    - 	close_commit_graph_one(o->commit_graph);
    + 	clear_commit_graph_data_slab(&commit_graph_data_slab);
     +	deinit_bloom_filters();
    + 	free_commit_graph(o->commit_graph);
      	o->commit_graph = NULL;
      }
    - 
     @@ commit-graph.c: int write_commit_graph(struct object_directory *odb,
      
      	res = write_commit_graph_file(ctx);
-- 
2.42.0.415.g8942f205c8

^ permalink raw reply

* [PATCH v4 01/17] t/t4216-log-bloom.sh: harden `test_bloom_filters_not_used()`
From: Taylor Blau @ 2023-10-18 18:32 UTC (permalink / raw)
  To: git
  Cc: Elijah Newren, Eric W. Biederman, Jeff King, Junio C Hamano,
	Patrick Steinhardt
In-Reply-To: <cover.1697653929.git.me@ttaylorr.com>

The existing implementation of test_bloom_filters_not_used() asserts
that the Bloom filter sub-system has not been initialized at all, by
checking for the absence of any data from it from trace2.

In the following commit, it will become possible to load Bloom filters
without using them (e.g., because `commitGraph.changedPathVersion` is
incompatible with the hash version with which the commit-graph's Bloom
filters were written).

When this is the case, it's possible to initialize the Bloom filter
sub-system, while still not using any Bloom filters. When this is the
case, check that the data dump from the Bloom sub-system is all zeros,
indicating that no filters were used.

Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 t/t4216-log-bloom.sh | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/t/t4216-log-bloom.sh b/t/t4216-log-bloom.sh
index fa9d32facf..487fc3d6b9 100755
--- a/t/t4216-log-bloom.sh
+++ b/t/t4216-log-bloom.sh
@@ -81,7 +81,19 @@ test_bloom_filters_used () {
 test_bloom_filters_not_used () {
 	log_args=$1
 	setup "$log_args" &&
-	! grep -q "statistics:{\"filter_not_present\":" "$TRASH_DIRECTORY/trace.perf" &&
+
+	if grep -q "statistics:{\"filter_not_present\":" "$TRASH_DIRECTORY/trace.perf"
+	then
+		# if the Bloom filter system is initialized, ensure that no
+		# filters were used
+		data="statistics:{"
+		data="$data\"filter_not_present\":0,"
+		data="$data\"maybe\":0,"
+		data="$data\"definitely_not\":0,"
+		data="$data\"false_positive\":0}"
+
+		grep -q "$data" "$TRASH_DIRECTORY/trace.perf"
+	fi &&
 	test_cmp log_wo_bloom log_w_bloom
 }

-- 
2.42.0.415.g8942f205c8

^ permalink raw reply related

* [PATCH v4 02/17] revision.c: consult Bloom filters for root commits
From: Taylor Blau @ 2023-10-18 18:32 UTC (permalink / raw)
  To: git
  Cc: Elijah Newren, Eric W. Biederman, Jeff King, Junio C Hamano,
	Patrick Steinhardt
In-Reply-To: <cover.1697653929.git.me@ttaylorr.com>

The commit-graph stores changed-path Bloom filters which represent the
set of paths included in a tree-level diff between a commit's root tree
and that of its parent.

When a commit has no parents, the tree-diff is computed against that
commit's root tree and the empty tree. In other words, every path in
that commit's tree is stored in the Bloom filter (since they all appear
in the diff).

Consult these filters during pathspec-limited traversals in the function
`rev_same_tree_as_empty()`. Doing so yields a performance improvement
where we can avoid enumerating the full set of paths in a parentless
commit's root tree when we know that the path(s) of interest were not
listed in that commit's changed-path Bloom filter.

Suggested-by: SZEDER Gábor <szeder.dev@gmail.com>
Original-patch-by: Jonathan Tan <jonathantanmy@google.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 revision.c           | 26 ++++++++++++++++++++++----
 t/t4216-log-bloom.sh |  8 ++++++--
 2 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/revision.c b/revision.c
index 219dc76716..6e9da518d9 100644
--- a/revision.c
+++ b/revision.c
@@ -834,17 +834,28 @@ static int rev_compare_tree(struct rev_info *revs,
 	return tree_difference;
 }
 
-static int rev_same_tree_as_empty(struct rev_info *revs, struct commit *commit)
+static int rev_same_tree_as_empty(struct rev_info *revs, struct commit *commit,
+				  int nth_parent)
 {
 	struct tree *t1 = repo_get_commit_tree(the_repository, commit);
+	int bloom_ret = 1;
 
 	if (!t1)
 		return 0;
 
+	if (nth_parent == 1 && revs->bloom_keys_nr) {
+		bloom_ret = check_maybe_different_in_bloom_filter(revs, commit);
+		if (!bloom_ret)
+			return 1;
+	}
+
 	tree_difference = REV_TREE_SAME;
 	revs->pruning.flags.has_changes = 0;
 	diff_tree_oid(NULL, &t1->object.oid, "", &revs->pruning);
 
+	if (bloom_ret == 1 && tree_difference == REV_TREE_SAME)
+		count_bloom_filter_false_positive++;
+
 	return tree_difference == REV_TREE_SAME;
 }
 
@@ -882,7 +893,7 @@ static int compact_treesame(struct rev_info *revs, struct commit *commit, unsign
 		if (nth_parent != 0)
 			die("compact_treesame %u", nth_parent);
 		old_same = !!(commit->object.flags & TREESAME);
-		if (rev_same_tree_as_empty(revs, commit))
+		if (rev_same_tree_as_empty(revs, commit, nth_parent))
 			commit->object.flags |= TREESAME;
 		else
 			commit->object.flags &= ~TREESAME;
@@ -978,7 +989,14 @@ static void try_to_simplify_commit(struct rev_info *revs, struct commit *commit)
 		return;
 
 	if (!commit->parents) {
-		if (rev_same_tree_as_empty(revs, commit))
+		/*
+		 * Pretend as if we are comparing ourselves to the
+		 * (non-existent) first parent of this commit object. Even
+		 * though no such parent exists, its changed-path Bloom filter
+		 * (if one exists) is relative to the empty tree, using Bloom
+		 * filters is allowed here.
+		 */
+		if (rev_same_tree_as_empty(revs, commit, 1))
 			commit->object.flags |= TREESAME;
 		return;
 	}
@@ -1059,7 +1077,7 @@ static void try_to_simplify_commit(struct rev_info *revs, struct commit *commit)
 
 		case REV_TREE_NEW:
 			if (revs->remove_empty_trees &&
-			    rev_same_tree_as_empty(revs, p)) {
+			    rev_same_tree_as_empty(revs, p, nth_parent)) {
 				/* We are adding all the specified
 				 * paths from this parent, so the
 				 * history beyond this parent is not
diff --git a/t/t4216-log-bloom.sh b/t/t4216-log-bloom.sh
index 487fc3d6b9..322640feeb 100755
--- a/t/t4216-log-bloom.sh
+++ b/t/t4216-log-bloom.sh
@@ -87,7 +87,11 @@ test_bloom_filters_not_used () {
 		# if the Bloom filter system is initialized, ensure that no
 		# filters were used
 		data="statistics:{"
-		data="$data\"filter_not_present\":0,"
+		# unusable filters (e.g., those computed with a
+		# different value of commitGraph.changedPathsVersion)
+		# are counted in the filter_not_present bucket, so any
+		# value is OK there.
+		data="$data\"filter_not_present\":[0-9][0-9]*,"
 		data="$data\"maybe\":0,"
 		data="$data\"definitely_not\":0,"
 		data="$data\"false_positive\":0}"
@@ -174,7 +178,7 @@ test_expect_success 'setup - add commit-graph to the chain with Bloom filters' '
 
 test_bloom_filters_used_when_some_filters_are_missing () {
 	log_args=$1
-	bloom_trace_prefix="statistics:{\"filter_not_present\":3,\"maybe\":6,\"definitely_not\":9"
+	bloom_trace_prefix="statistics:{\"filter_not_present\":3,\"maybe\":6,\"definitely_not\":10"
 	setup "$log_args" &&
 	grep -q "$bloom_trace_prefix" "$TRASH_DIRECTORY/trace.perf" &&
 	test_cmp log_wo_bloom log_w_bloom
-- 
2.42.0.415.g8942f205c8


^ permalink raw reply related

* [PATCH v4 03/17] commit-graph: ensure Bloom filters are read with consistent settings
From: Taylor Blau @ 2023-10-18 18:32 UTC (permalink / raw)
  To: git
  Cc: Elijah Newren, Eric W. Biederman, Jeff King, Junio C Hamano,
	Patrick Steinhardt
In-Reply-To: <cover.1697653929.git.me@ttaylorr.com>

The changed-path Bloom filter mechanism is parameterized by a couple of
variables, notably the number of bits per hash (typically "m" in Bloom
filter literature) and the number of hashes themselves (typically "k").

It is critically important that filters are read with the Bloom filter
settings that they were written with. Failing to do so would mean that
each query is liable to compute different fingerprints, meaning that the
filter itself could return a false negative. This goes against a basic
assumption of using Bloom filters (that they may return false positives,
but never false negatives) and can lead to incorrect results.

We have some existing logic to carry forward existing Bloom filter
settings from one layer to the next. In `write_commit_graph()`, we have
something like:

    if (!(flags & COMMIT_GRAPH_NO_WRITE_BLOOM_FILTERS)) {
        struct commit_graph *g = ctx->r->objects->commit_graph;

        /* We have changed-paths already. Keep them in the next graph */
        if (g && g->chunk_bloom_data) {
            ctx->changed_paths = 1;
            ctx->bloom_settings = g->bloom_filter_settings;
        }
    }

, which drags forward Bloom filter settings across adjacent layers.

This doesn't quite address all cases, however, since it is possible for
intermediate layers to contain no Bloom filters at all. For example,
suppose we have two layers in a commit-graph chain, say, {G1, G2}. If G1
contains Bloom filters, but G2 doesn't, a new G3 (whose base graph is
G2) may be written with arbitrary Bloom filter settings, because we only
check the immediately adjacent layer's settings for compatibility.

This behavior has existed since the introduction of changed-path Bloom
filters. But in practice, this is not such a big deal, since the only
way up until this point to modify the Bloom filter settings at write
time is with the undocumented environment variables:

  - GIT_TEST_BLOOM_SETTINGS_BITS_PER_ENTRY
  - GIT_TEST_BLOOM_SETTINGS_NUM_HASHES
  - GIT_TEST_BLOOM_SETTINGS_MAX_CHANGED_PATHS

(it is still possible to tweak MAX_CHANGED_PATHS between layers, but
this does not affect reads, so is allowed to differ across multiple
graph layers).

But in future commits, we will introduce another parameter to change the
hash algorithm used to compute Bloom fingerprints itself. This will be
exposed via a configuration setting, making this foot-gun easier to use.

To prevent this potential issue, validate that all layers of a split
commit-graph have compatible settings with the newest layer which
contains Bloom filters.

Reported-by: SZEDER Gábor <szeder.dev@gmail.com>
Original-test-by: SZEDER Gábor <szeder.dev@gmail.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 commit-graph.c       | 25 +++++++++++++++++
 t/t4216-log-bloom.sh | 64 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 89 insertions(+)

diff --git a/commit-graph.c b/commit-graph.c
index fd2f700b2e..0ac79aff5a 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -498,6 +498,30 @@ static int validate_mixed_generation_chain(struct commit_graph *g)
 	return 0;
 }
 
+static void validate_mixed_bloom_settings(struct commit_graph *g)
+{
+	struct bloom_filter_settings *settings = NULL;
+	for (; g; g = g->base_graph) {
+		if (!g->bloom_filter_settings)
+			continue;
+		if (!settings) {
+			settings = g->bloom_filter_settings;
+			continue;
+		}
+
+		if (g->bloom_filter_settings->bits_per_entry != settings->bits_per_entry ||
+		    g->bloom_filter_settings->num_hashes != settings->num_hashes) {
+			g->chunk_bloom_indexes = NULL;
+			g->chunk_bloom_data = NULL;
+			FREE_AND_NULL(g->bloom_filter_settings);
+
+			warning(_("disabling Bloom filters for commit-graph "
+				  "layer '%s' due to incompatible settings"),
+				oid_to_hex(&g->oid));
+		}
+	}
+}
+
 static int add_graph_to_chain(struct commit_graph *g,
 			      struct commit_graph *chain,
 			      struct object_id *oids,
@@ -616,6 +640,7 @@ struct commit_graph *load_commit_graph_chain_fd_st(struct repository *r,
 	}
 
 	validate_mixed_generation_chain(graph_chain);
+	validate_mixed_bloom_settings(graph_chain);
 
 	free(oids);
 	fclose(fp);
diff --git a/t/t4216-log-bloom.sh b/t/t4216-log-bloom.sh
index 322640feeb..2ea5e90955 100755
--- a/t/t4216-log-bloom.sh
+++ b/t/t4216-log-bloom.sh
@@ -420,4 +420,68 @@ test_expect_success 'Bloom generation backfills empty commits' '
 	)
 '
 
+graph=.git/objects/info/commit-graph
+graphdir=.git/objects/info/commit-graphs
+chain=$graphdir/commit-graph-chain
+
+test_expect_success 'setup for mixed Bloom setting tests' '
+	repo=mixed-bloom-settings &&
+
+	git init $repo &&
+	for i in one two three
+	do
+		test_commit -C $repo $i file || return 1
+	done
+'
+
+test_expect_success 'ensure incompatible Bloom filters are ignored' '
+	# Compute Bloom filters with "unusual" settings.
+	git -C $repo rev-parse one >in &&
+	GIT_TEST_BLOOM_SETTINGS_NUM_HASHES=3 git -C $repo commit-graph write \
+		--stdin-commits --changed-paths --split <in &&
+	layer=$(head -n 1 $repo/$chain) &&
+
+	# A commit-graph layer without Bloom filters "hides" the layers
+	# below ...
+	git -C $repo rev-parse two >in &&
+	git -C $repo commit-graph write --stdin-commits --no-changed-paths \
+		--split=no-merge <in &&
+
+	# Another commit-graph layer that has Bloom filters, but with
+	# standard settings, and is thus incompatible with the base
+	# layer written above.
+	git -C $repo rev-parse HEAD >in &&
+	git -C $repo commit-graph write --stdin-commits --changed-paths \
+		--split=no-merge <in &&
+
+	test_line_count = 3 $repo/$chain &&
+
+	# Ensure that incompatible Bloom filters are ignored.
+	git -C $repo -c core.commitGraph=false log --oneline --no-decorate -- file \
+		>expect 2>err &&
+	git -C $repo log --oneline --no-decorate -- file >actual 2>err &&
+	test_cmp expect actual &&
+	grep "disabling Bloom filters for commit-graph layer .$layer." err
+'
+
+test_expect_success 'merge graph layers with incompatible Bloom settings' '
+	# Ensure that incompatible Bloom filters are ignored when
+	# merging existing layers.
+	git -C $repo commit-graph write --reachable --changed-paths 2>err &&
+	grep "disabling Bloom filters for commit-graph layer .$layer." err &&
+
+	test_path_is_file $repo/$graph &&
+	test_dir_is_empty $repo/$graphdir &&
+
+	git -C $repo -c core.commitGraph=false log --oneline --no-decorate -- \
+		file >expect &&
+	trace_out="$(pwd)/trace.perf" &&
+	GIT_TRACE2_PERF="$trace_out" \
+		git -C $repo log --oneline --no-decorate -- file >actual 2>err &&
+
+	test_cmp expect actual &&
+	grep "statistics:{\"filter_not_present\":0," trace.perf &&
+	test_must_be_empty err
+'
+
 test_done
-- 
2.42.0.415.g8942f205c8


^ permalink raw reply related

* [PATCH v4 04/17] gitformat-commit-graph: describe version 2 of BDAT
From: Taylor Blau @ 2023-10-18 18:32 UTC (permalink / raw)
  To: git
  Cc: Elijah Newren, Eric W. Biederman, Jeff King, Junio C Hamano,
	Patrick Steinhardt
In-Reply-To: <cover.1697653929.git.me@ttaylorr.com>

From: Jonathan Tan <jonathantanmy@google.com>

The code change to Git to support version 2 will be done in subsequent
commits.

Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 Documentation/gitformat-commit-graph.txt | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/Documentation/gitformat-commit-graph.txt b/Documentation/gitformat-commit-graph.txt
index 31cad585e2..3e906e8030 100644
--- a/Documentation/gitformat-commit-graph.txt
+++ b/Documentation/gitformat-commit-graph.txt
@@ -142,13 +142,16 @@ All multi-byte numbers are in network byte order.
 
 ==== Bloom Filter Data (ID: {'B', 'D', 'A', 'T'}) [Optional]
     * It starts with header consisting of three unsigned 32-bit integers:
-      - Version of the hash algorithm being used. We currently only support
-	value 1 which corresponds to the 32-bit version of the murmur3 hash
+      - Version of the hash algorithm being used. We currently support
+	value 2 which corresponds to the 32-bit version of the murmur3 hash
 	implemented exactly as described in
 	https://en.wikipedia.org/wiki/MurmurHash#Algorithm and the double
 	hashing technique using seed values 0x293ae76f and 0x7e646e2 as
 	described in https://doi.org/10.1007/978-3-540-30494-4_26 "Bloom Filters
-	in Probabilistic Verification"
+	in Probabilistic Verification". Version 1 Bloom filters have a bug that appears
+	when char is signed and the repository has path names that have characters >=
+	0x80; Git supports reading and writing them, but this ability will be removed
+	in a future version of Git.
       - The number of times a path is hashed and hence the number of bit positions
 	      that cumulatively determine whether a file is present in the commit.
       - The minimum number of bits 'b' per entry in the Bloom filter. If the filter
-- 
2.42.0.415.g8942f205c8


^ permalink raw reply related

* [PATCH v4 05/17] t/helper/test-read-graph.c: extract `dump_graph_info()`
From: Taylor Blau @ 2023-10-18 18:32 UTC (permalink / raw)
  To: git
  Cc: Elijah Newren, Eric W. Biederman, Jeff King, Junio C Hamano,
	Patrick Steinhardt
In-Reply-To: <cover.1697653929.git.me@ttaylorr.com>

Prepare for the 'read-graph' test helper to perform other tasks besides
dumping high-level information about the commit-graph by extracting its
main routine into a separate function.

Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 t/helper/test-read-graph.c | 31 ++++++++++++++++++-------------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/t/helper/test-read-graph.c b/t/helper/test-read-graph.c
index 8c7a83f578..3375392f6c 100644
--- a/t/helper/test-read-graph.c
+++ b/t/helper/test-read-graph.c
@@ -5,20 +5,8 @@
 #include "bloom.h"
 #include "setup.h"
 
-int cmd__read_graph(int argc UNUSED, const char **argv UNUSED)
+static void dump_graph_info(struct commit_graph *graph)
 {
-	struct commit_graph *graph = NULL;
-	struct object_directory *odb;
-
-	setup_git_directory();
-	odb = the_repository->objects->odb;
-
-	prepare_repo_settings(the_repository);
-
-	graph = read_commit_graph_one(the_repository, odb);
-	if (!graph)
-		return 1;
-
 	printf("header: %08x %d %d %d %d\n",
 		ntohl(*(uint32_t*)graph->data),
 		*(unsigned char*)(graph->data + 4),
@@ -57,6 +45,23 @@ int cmd__read_graph(int argc UNUSED, const char **argv UNUSED)
 	if (graph->topo_levels)
 		printf(" topo_levels");
 	printf("\n");
+}
+
+int cmd__read_graph(int argc UNUSED, const char **argv UNUSED)
+{
+	struct commit_graph *graph = NULL;
+	struct object_directory *odb;
+
+	setup_git_directory();
+	odb = the_repository->objects->odb;
+
+	prepare_repo_settings(the_repository);
+
+	graph = read_commit_graph_one(the_repository, odb);
+	if (!graph)
+		return 1;
+
+	dump_graph_info(graph);
 
 	UNLEAK(graph);
 
-- 
2.42.0.415.g8942f205c8


^ permalink raw reply related

* [PATCH v4 06/17] bloom.h: make `load_bloom_filter_from_graph()` public
From: Taylor Blau @ 2023-10-18 18:32 UTC (permalink / raw)
  To: git
  Cc: Elijah Newren, Eric W. Biederman, Jeff King, Junio C Hamano,
	Patrick Steinhardt
In-Reply-To: <cover.1697653929.git.me@ttaylorr.com>

Prepare for a future commit to use the load_bloom_filter_from_graph()
function directly to load specific Bloom filters out of the commit-graph
for manual inspection (to be used during tests).

Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 bloom.c | 6 +++---
 bloom.h | 5 +++++
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/bloom.c b/bloom.c
index aef6b5fea2..3e78cfe79d 100644
--- a/bloom.c
+++ b/bloom.c
@@ -29,9 +29,9 @@ static inline unsigned char get_bitmask(uint32_t pos)
 	return ((unsigned char)1) << (pos & (BITS_PER_WORD - 1));
 }
 
-static int load_bloom_filter_from_graph(struct commit_graph *g,
-					struct bloom_filter *filter,
-					uint32_t graph_pos)
+int load_bloom_filter_from_graph(struct commit_graph *g,
+				 struct bloom_filter *filter,
+				 uint32_t graph_pos)
 {
 	uint32_t lex_pos, start_index, end_index;
 
diff --git a/bloom.h b/bloom.h
index adde6dfe21..1e4f612d2c 100644
--- a/bloom.h
+++ b/bloom.h
@@ -3,6 +3,7 @@
 
 struct commit;
 struct repository;
+struct commit_graph;
 
 struct bloom_filter_settings {
 	/*
@@ -68,6 +69,10 @@ struct bloom_key {
 	uint32_t *hashes;
 };
 
+int load_bloom_filter_from_graph(struct commit_graph *g,
+				 struct bloom_filter *filter,
+				 uint32_t graph_pos);
+
 /*
  * Calculate the murmur3 32-bit hash value for the given data
  * using the given seed.
-- 
2.42.0.415.g8942f205c8


^ permalink raw reply related

* [PATCH v4 07/17] t/helper/test-read-graph: implement `bloom-filters` mode
From: Taylor Blau @ 2023-10-18 18:32 UTC (permalink / raw)
  To: git
  Cc: Elijah Newren, Eric W. Biederman, Jeff King, Junio C Hamano,
	Patrick Steinhardt
In-Reply-To: <cover.1697653929.git.me@ttaylorr.com>

Implement a mode of the "read-graph" test helper to dump out the
hexadecimal contents of the Bloom filter(s) contained in a commit-graph.

Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 t/helper/test-read-graph.c | 44 +++++++++++++++++++++++++++++++++-----
 1 file changed, 39 insertions(+), 5 deletions(-)

diff --git a/t/helper/test-read-graph.c b/t/helper/test-read-graph.c
index 3375392f6c..da9ac8584d 100644
--- a/t/helper/test-read-graph.c
+++ b/t/helper/test-read-graph.c
@@ -47,10 +47,32 @@ static void dump_graph_info(struct commit_graph *graph)
 	printf("\n");
 }
 
-int cmd__read_graph(int argc UNUSED, const char **argv UNUSED)
+static void dump_graph_bloom_filters(struct commit_graph *graph)
+{
+	uint32_t i;
+
+	for (i = 0; i < graph->num_commits + graph->num_commits_in_base; i++) {
+		struct bloom_filter filter = { 0 };
+		size_t j;
+
+		if (load_bloom_filter_from_graph(graph, &filter, i) < 0) {
+			fprintf(stderr, "missing Bloom filter for graph "
+				"position %"PRIu32"\n", i);
+			continue;
+		}
+
+		for (j = 0; j < filter.len; j++)
+			printf("%02x", filter.data[j]);
+		if (filter.len)
+			printf("\n");
+	}
+}
+
+int cmd__read_graph(int argc, const char **argv)
 {
 	struct commit_graph *graph = NULL;
 	struct object_directory *odb;
+	int ret = 0;
 
 	setup_git_directory();
 	odb = the_repository->objects->odb;
@@ -58,12 +80,24 @@ int cmd__read_graph(int argc UNUSED, const char **argv UNUSED)
 	prepare_repo_settings(the_repository);
 
 	graph = read_commit_graph_one(the_repository, odb);
-	if (!graph)
-		return 1;
+	if (!graph) {
+		ret = 1;
+		goto done;
+	}
 
-	dump_graph_info(graph);
+	if (argc <= 1)
+		dump_graph_info(graph);
+	else if (!strcmp(argv[1], "bloom-filters"))
+		dump_graph_bloom_filters(graph);
+	else {
+		fprintf(stderr, "unknown sub-command: '%s'\n", argv[1]);
+		ret = 1;
+	}
 
+done:
 	UNLEAK(graph);
 
-	return 0;
+	return ret;
 }
+
+
-- 
2.42.0.415.g8942f205c8


^ permalink raw reply related

* [PATCH v4 08/17] t4216: test changed path filters with high bit paths
From: Taylor Blau @ 2023-10-18 18:32 UTC (permalink / raw)
  To: git
  Cc: Elijah Newren, Eric W. Biederman, Jeff King, Junio C Hamano,
	Patrick Steinhardt
In-Reply-To: <cover.1697653929.git.me@ttaylorr.com>

From: Jonathan Tan <jonathantanmy@google.com>

Subsequent commits will teach Git another version of changed path
filter that has different behavior with paths that contain at least
one character with its high bit set, so test the existing behavior as
a baseline.

Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 t/t4216-log-bloom.sh | 51 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/t/t4216-log-bloom.sh b/t/t4216-log-bloom.sh
index 2ea5e90955..400dce2193 100755
--- a/t/t4216-log-bloom.sh
+++ b/t/t4216-log-bloom.sh
@@ -484,4 +484,55 @@ test_expect_success 'merge graph layers with incompatible Bloom settings' '
 	test_must_be_empty err
 '
 
+get_first_changed_path_filter () {
+	test-tool read-graph bloom-filters >filters.dat &&
+	head -n 1 filters.dat
+}
+
+# chosen to be the same under all Unicode normalization forms
+CENT=$(printf "\302\242")
+
+test_expect_success 'set up repo with high bit path, version 1 changed-path' '
+	git init highbit1 &&
+	test_commit -C highbit1 c1 "$CENT" &&
+	git -C highbit1 commit-graph write --reachable --changed-paths
+'
+
+test_expect_success 'setup check value of version 1 changed-path' '
+	(
+		cd highbit1 &&
+		echo "52a9" >expect &&
+		get_first_changed_path_filter >actual
+	)
+'
+
+# expect will not match actual if char is unsigned by default. Write the test
+# in this way, so that a user running this test script can still see if the two
+# files match. (It will appear as an ordinary success if they match, and a skip
+# if not.)
+if test_cmp highbit1/expect highbit1/actual
+then
+	test_set_prereq SIGNED_CHAR_BY_DEFAULT
+fi
+test_expect_success SIGNED_CHAR_BY_DEFAULT 'check value of version 1 changed-path' '
+	# Only the prereq matters for this test.
+	true
+'
+
+test_expect_success 'setup make another commit' '
+	# "git log" does not use Bloom filters for root commits - see how, in
+	# revision.c, rev_compare_tree() (the only code path that eventually calls
+	# get_bloom_filter()) is only called by try_to_simplify_commit() when the commit
+	# has one parent. Therefore, make another commit so that we perform the tests on
+	# a non-root commit.
+	test_commit -C highbit1 anotherc1 "another$CENT"
+'
+
+test_expect_success 'version 1 changed-path used when version 1 requested' '
+	(
+		cd highbit1 &&
+		test_bloom_filters_used "-- another$CENT"
+	)
+'
+
 test_done
-- 
2.42.0.415.g8942f205c8


^ permalink raw reply related

* [PATCH v4 09/17] repo-settings: introduce commitgraph.changedPathsVersion
From: Taylor Blau @ 2023-10-18 18:32 UTC (permalink / raw)
  To: git
  Cc: Elijah Newren, Eric W. Biederman, Jeff King, Junio C Hamano,
	Patrick Steinhardt
In-Reply-To: <cover.1697653929.git.me@ttaylorr.com>

From: Jonathan Tan <jonathantanmy@google.com>

A subsequent commit will introduce another version of the changed-path
filter in the commit graph file. In order to control which version to
write (and read), a config variable is needed.

Therefore, introduce this config variable. For forwards compatibility,
teach Git to not read commit graphs when the config variable
is set to an unsupported version. Because we teach Git this,
commitgraph.readChangedPaths is now redundant, so deprecate it and
define its behavior in terms of the config variable we introduce.

This commit does not change the behavior of writing (Git writes changed
path filters when explicitly instructed regardless of any config
variable), but a subsequent commit will restrict Git such that it will
only write when commitgraph.changedPathsVersion is a recognized value.

Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 Documentation/config/commitgraph.txt | 23 ++++++++++++++++++++---
 commit-graph.c                       |  2 +-
 oss-fuzz/fuzz-commit-graph.c         |  2 +-
 repo-settings.c                      |  6 +++++-
 repository.h                         |  2 +-
 5 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/Documentation/config/commitgraph.txt b/Documentation/config/commitgraph.txt
index 30604e4a4c..2dc9170622 100644
--- a/Documentation/config/commitgraph.txt
+++ b/Documentation/config/commitgraph.txt
@@ -9,6 +9,23 @@ commitGraph.maxNewFilters::
 	commit-graph write` (c.f., linkgit:git-commit-graph[1]).
 
 commitGraph.readChangedPaths::
-	If true, then git will use the changed-path Bloom filters in the
-	commit-graph file (if it exists, and they are present). Defaults to
-	true. See linkgit:git-commit-graph[1] for more information.
+	Deprecated. Equivalent to commitGraph.changedPathsVersion=-1 if true, and
+	commitGraph.changedPathsVersion=0 if false. (If commitGraph.changedPathVersion
+	is also set, commitGraph.changedPathsVersion takes precedence.)
+
+commitGraph.changedPathsVersion::
+	Specifies the version of the changed-path Bloom filters that Git will read and
+	write. May be -1, 0 or 1.
++
+Defaults to -1.
++
+If -1, Git will use the version of the changed-path Bloom filters in the
+repository, defaulting to 1 if there are none.
++
+If 0, Git will not read any Bloom filters, and will write version 1 Bloom
+filters when instructed to write.
++
+If 1, Git will only read version 1 Bloom filters, and will write version 1
+Bloom filters.
++
+See linkgit:git-commit-graph[1] for more information.
diff --git a/commit-graph.c b/commit-graph.c
index 0ac79aff5a..bcc9a15cfa 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -411,7 +411,7 @@ struct commit_graph *parse_commit_graph(struct repo_settings *s,
 			graph->read_generation_data = 1;
 	}
 
-	if (s->commit_graph_read_changed_paths) {
+	if (s->commit_graph_changed_paths_version) {
 		pair_chunk(cf, GRAPH_CHUNKID_BLOOMINDEXES,
 			   &graph->chunk_bloom_indexes);
 		read_chunk(cf, GRAPH_CHUNKID_BLOOMDATA,
diff --git a/oss-fuzz/fuzz-commit-graph.c b/oss-fuzz/fuzz-commit-graph.c
index 2992079dd9..325c0b991a 100644
--- a/oss-fuzz/fuzz-commit-graph.c
+++ b/oss-fuzz/fuzz-commit-graph.c
@@ -19,7 +19,7 @@ int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size)
 	 * possible.
 	 */
 	the_repository->settings.commit_graph_generation_version = 2;
-	the_repository->settings.commit_graph_read_changed_paths = 1;
+	the_repository->settings.commit_graph_changed_paths_version = 1;
 	g = parse_commit_graph(&the_repository->settings, (void *)data, size);
 	repo_clear(the_repository);
 	free_commit_graph(g);
diff --git a/repo-settings.c b/repo-settings.c
index 525f69c0c7..db8fe817f3 100644
--- a/repo-settings.c
+++ b/repo-settings.c
@@ -24,6 +24,7 @@ void prepare_repo_settings(struct repository *r)
 	int value;
 	const char *strval;
 	int manyfiles;
+	int read_changed_paths;
 
 	if (!r->gitdir)
 		BUG("Cannot add settings for uninitialized repository");
@@ -54,7 +55,10 @@ void prepare_repo_settings(struct repository *r)
 	/* Commit graph config or default, does not cascade (simple) */
 	repo_cfg_bool(r, "core.commitgraph", &r->settings.core_commit_graph, 1);
 	repo_cfg_int(r, "commitgraph.generationversion", &r->settings.commit_graph_generation_version, 2);
-	repo_cfg_bool(r, "commitgraph.readchangedpaths", &r->settings.commit_graph_read_changed_paths, 1);
+	repo_cfg_bool(r, "commitgraph.readchangedpaths", &read_changed_paths, 1);
+	repo_cfg_int(r, "commitgraph.changedpathsversion",
+		     &r->settings.commit_graph_changed_paths_version,
+		     read_changed_paths ? -1 : 0);
 	repo_cfg_bool(r, "gc.writecommitgraph", &r->settings.gc_write_commit_graph, 1);
 	repo_cfg_bool(r, "fetch.writecommitgraph", &r->settings.fetch_write_commit_graph, 0);
 
diff --git a/repository.h b/repository.h
index 5f18486f64..f71154e12c 100644
--- a/repository.h
+++ b/repository.h
@@ -29,7 +29,7 @@ struct repo_settings {
 
 	int core_commit_graph;
 	int commit_graph_generation_version;
-	int commit_graph_read_changed_paths;
+	int commit_graph_changed_paths_version;
 	int gc_write_commit_graph;
 	int fetch_write_commit_graph;
 	int command_requires_full_index;
-- 
2.42.0.415.g8942f205c8


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox