Git development
 help / color / mirror / Atom feed
* [PATCH v2 4/4] files-backend.c: avoid stat in 'loose_fill_ref_dir'
From: Victoria Dye via GitGitGadget @ 2023-10-09 21:58 UTC (permalink / raw)
  To: git; +Cc: Patrick Steinhardt, Victoria Dye, Victoria Dye
In-Reply-To: <pull.1594.v2.git.1696888736.gitgitgadget@gmail.com>

From: Victoria Dye <vdye@github.com>

Modify the 'readdir' loop in 'loose_fill_ref_dir' to, rather than 'stat' a
file to determine whether it is a directory or not, use 'get_dtype'.

Currently, the loop uses 'stat' to determine whether each dirent is a
directory itself or not in order to construct the appropriate ref cache
entry. If 'stat' fails (returning a negative value), the dirent is silently
skipped; otherwise, 'S_ISDIR(st.st_mode)' is used to check whether the entry
is a directory.

On platforms that include an entry's d_type in in the 'dirent' struct, this
extra 'stat' check is redundant. We can use the 'get_dtype' method to
extract this information on platforms that support it (i.e. where
NO_D_TYPE_IN_DIRENT is unset), and derive it with 'stat' on platforms that
don't. Because 'stat' is an expensive call, this confers a
modest-but-noticeable performance improvement when iterating over large
numbers of refs (approximately 20% speedup in 'git for-each-ref' in a 30k
ref repo).

Unlike other existing usage of 'get_dtype', the 'follow_symlinks' arg is set
to 1 to replicate the existing handling of symlink dirents. This
unfortunately requires calling 'stat' on the associated entry regardless of
platform, but symlinks in the loose ref store are highly unlikely since
they'd need to be created manually by a user.

Note that this patch also changes the condition for skipping creation of a
ref entry from "when 'stat' fails" to "when the d_type is anything other
than DT_REG or DT_DIR". If a dirent's d_type is DT_UNKNOWN (either because
the platform doesn't support d_type in dirents or some other reason) or
DT_LNK, 'get_dtype' will try to derive the underlying type with 'stat'. If
the 'stat' fails, the d_type will remain 'DT_UNKNOWN' and dirent will be
skipped. However, it will also be skipped if it is any other valid d_type
(e.g. DT_FIFO for named pipes, DT_LNK for a nested symlink). Git does not
handle these properly anyway, so we can safely constrain accepted types to
directories and regular files.

Signed-off-by: Victoria Dye <vdye@github.com>
---
 refs/files-backend.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/refs/files-backend.c b/refs/files-backend.c
index 341354182bb..db5c0c7a724 100644
--- a/refs/files-backend.c
+++ b/refs/files-backend.c
@@ -246,10 +246,8 @@ static void loose_fill_ref_dir(struct ref_store *ref_store,
 	int dirnamelen = strlen(dirname);
 	struct strbuf refname;
 	struct strbuf path = STRBUF_INIT;
-	size_t path_baselen;
 
 	files_ref_path(refs, &path, dirname);
-	path_baselen = path.len;
 
 	d = opendir(path.buf);
 	if (!d) {
@@ -262,23 +260,22 @@ static void loose_fill_ref_dir(struct ref_store *ref_store,
 
 	while ((de = readdir(d)) != NULL) {
 		struct object_id oid;
-		struct stat st;
 		int flag;
+		unsigned char dtype;
 
 		if (de->d_name[0] == '.')
 			continue;
 		if (ends_with(de->d_name, ".lock"))
 			continue;
 		strbuf_addstr(&refname, de->d_name);
-		strbuf_addstr(&path, de->d_name);
-		if (stat(path.buf, &st) < 0) {
-			; /* silently ignore */
-		} else if (S_ISDIR(st.st_mode)) {
+
+		dtype = get_dtype(de, &path, 1);
+		if (dtype == DT_DIR) {
 			strbuf_addch(&refname, '/');
 			add_entry_to_dir(dir,
 					 create_dir_entry(dir->cache, refname.buf,
 							  refname.len));
-		} else {
+		} else if (dtype == DT_REG) {
 			if (!refs_resolve_ref_unsafe(&refs->base,
 						     refname.buf,
 						     RESOLVE_REF_READING,
@@ -308,7 +305,6 @@ static void loose_fill_ref_dir(struct ref_store *ref_store,
 					 create_ref_entry(refname.buf, &oid, flag));
 		}
 		strbuf_setlen(&refname, dirnamelen);
-		strbuf_setlen(&path, path_baselen);
 	}
 	strbuf_release(&refname);
 	strbuf_release(&path);
-- 
gitgitgadget

^ permalink raw reply related

* [PATCH v2 3/4] dir.[ch]: add 'follow_symlink' arg to 'get_dtype'
From: Victoria Dye via GitGitGadget @ 2023-10-09 21:58 UTC (permalink / raw)
  To: git; +Cc: Patrick Steinhardt, Victoria Dye, Victoria Dye
In-Reply-To: <pull.1594.v2.git.1696888736.gitgitgadget@gmail.com>

From: Victoria Dye <vdye@github.com>

Add a 'follow_symlink' boolean option to 'get_type()'. If 'follow_symlink'
is enabled, DT_LNK (in addition to DT_UNKNOWN) d_types triggers the
stat-based d_type resolution, using 'stat' instead of 'lstat' to get the
type of the followed symlink. Note that symlinks are not followed
recursively, so a symlink pointing to another symlink will still resolve to
DT_LNK.

Update callers in 'diagnose.c' to specify 'follow_symlink = 0' to preserve
current behavior.

Signed-off-by: Victoria Dye <vdye@github.com>
---
 diagnose.c |  6 +++---
 dir.c      | 13 +++++++++----
 dir.h      |  7 ++++++-
 3 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/diagnose.c b/diagnose.c
index fc4d344bd63..4d096c857f1 100644
--- a/diagnose.c
+++ b/diagnose.c
@@ -81,7 +81,7 @@ static int count_files(struct strbuf *path)
 		return 0;
 
 	while ((e = readdir_skip_dot_and_dotdot(dir)) != NULL)
-		if (get_dtype(e, path) == DT_REG)
+		if (get_dtype(e, path, 0) == DT_REG)
 			count++;
 
 	closedir(dir);
@@ -110,7 +110,7 @@ static void loose_objs_stats(struct strbuf *buf, const char *path)
 	base_path_len = count_path.len;
 
 	while ((e = readdir_skip_dot_and_dotdot(dir)) != NULL)
-		if (get_dtype(e, &count_path) == DT_DIR &&
+		if (get_dtype(e, &count_path, 0) == DT_DIR &&
 		    strlen(e->d_name) == 2 &&
 		    !hex_to_bytes(&c, e->d_name, 1)) {
 			strbuf_setlen(&count_path, base_path_len);
@@ -155,7 +155,7 @@ static int add_directory_to_archiver(struct strvec *archiver_args,
 
 		strbuf_add_absolute_path(&abspath, at_root ? "." : path);
 		strbuf_addch(&abspath, '/');
-		dtype = get_dtype(e, &abspath);
+		dtype = get_dtype(e, &abspath, 0);
 
 		strbuf_setlen(&buf, len);
 		strbuf_addstr(&buf, e->d_name);
diff --git a/dir.c b/dir.c
index 5e01af3a25e..16fdb03f2a5 100644
--- a/dir.c
+++ b/dir.c
@@ -2235,19 +2235,24 @@ static int get_index_dtype(struct index_state *istate,
 	return DT_UNKNOWN;
 }
 
-unsigned char get_dtype(struct dirent *e, struct strbuf *path)
+unsigned char get_dtype(struct dirent *e, struct strbuf *path,
+			int follow_symlink)
 {
 	struct stat st;
 	unsigned char dtype = DTYPE(e);
 	size_t base_path_len;
 
-	if (dtype != DT_UNKNOWN)
+	if (dtype != DT_UNKNOWN && !(follow_symlink && dtype == DT_LNK))
 		return dtype;
 
-	/* d_type unknown in dirent, try to fall back on lstat results */
+	/*
+	 * d_type unknown or unfollowed symlink, try to fall back on [l]stat
+	 * results. If [l]stat fails, explicitly set DT_UNKNOWN.
+	 */
 	base_path_len = path->len;
 	strbuf_addstr(path, e->d_name);
-	if (lstat(path->buf, &st))
+	if ((follow_symlink && stat(path->buf, &st)) ||
+	    (!follow_symlink && lstat(path->buf, &st)))
 		goto cleanup;
 
 	/* determine d_type from st_mode */
diff --git a/dir.h b/dir.h
index 28c630ce806..98aa85fcc0e 100644
--- a/dir.h
+++ b/dir.h
@@ -368,11 +368,16 @@ struct dirent *readdir_skip_dot_and_dotdot(DIR *dirp);
  * stat.st_mode using the path to the dirent's containing directory (path) and
  * the name of the dirent itself.
  *
+ * If 'follow_symlink' is 1, this function will attempt to follow DT_LNK types
+ * using 'stat'. Links are *not* followed recursively, so a symlink pointing
+ * to another symlink will still resolve to 'DT_LNK'.
+ *
  * Note that 'path' is assumed to have a trailing slash. It is also modified
  * in-place during the execution of the function, but is then reverted to its
  * original value before returning.
  */
-unsigned char get_dtype(struct dirent *e, struct strbuf *path);
+unsigned char get_dtype(struct dirent *e, struct strbuf *path,
+			int follow_symlink);
 
 /*Count the number of slashes for string s*/
 int count_slashes(const char *s);
-- 
gitgitgadget


^ permalink raw reply related

* [PATCH v2 2/4] dir.[ch]: expose 'get_dtype'
From: Victoria Dye via GitGitGadget @ 2023-10-09 21:58 UTC (permalink / raw)
  To: git; +Cc: Patrick Steinhardt, Victoria Dye, Victoria Dye
In-Reply-To: <pull.1594.v2.git.1696888736.gitgitgadget@gmail.com>

From: Victoria Dye <vdye@github.com>

Move 'get_dtype()' from 'diagnose.c' to 'dir.c' and add its declaration to
'dir.h' so that it is accessible to callers in other files. The function and
its documentation are moved verbatim except for a small addition to the
description clarifying what the 'path' arg represents.

Signed-off-by: Victoria Dye <vdye@github.com>
---
 diagnose.c | 36 ------------------------------------
 dir.c      | 28 ++++++++++++++++++++++++++++
 dir.h      | 11 +++++++++++
 3 files changed, 39 insertions(+), 36 deletions(-)

diff --git a/diagnose.c b/diagnose.c
index 8430064000b..fc4d344bd63 100644
--- a/diagnose.c
+++ b/diagnose.c
@@ -71,42 +71,6 @@ static int dir_file_stats(struct object_directory *object_dir, void *data)
 	return 0;
 }
 
-/*
- * Get the d_type of a dirent. If the d_type is unknown, derive it from
- * stat.st_mode.
- *
- * Note that 'path' is assumed to have a trailing slash. It is also modified
- * in-place during the execution of the function, but is then reverted to its
- * original value before returning.
- */
-static unsigned char get_dtype(struct dirent *e, struct strbuf *path)
-{
-	struct stat st;
-	unsigned char dtype = DTYPE(e);
-	size_t base_path_len;
-
-	if (dtype != DT_UNKNOWN)
-		return dtype;
-
-	/* d_type unknown in dirent, try to fall back on lstat results */
-	base_path_len = path->len;
-	strbuf_addstr(path, e->d_name);
-	if (lstat(path->buf, &st))
-		goto cleanup;
-
-	/* determine d_type from st_mode */
-	if (S_ISREG(st.st_mode))
-		dtype = DT_REG;
-	else if (S_ISDIR(st.st_mode))
-		dtype = DT_DIR;
-	else if (S_ISLNK(st.st_mode))
-		dtype = DT_LNK;
-
-cleanup:
-	strbuf_setlen(path, base_path_len);
-	return dtype;
-}
-
 static int count_files(struct strbuf *path)
 {
 	DIR *dir = opendir(path->buf);
diff --git a/dir.c b/dir.c
index 8486e4d56ff..5e01af3a25e 100644
--- a/dir.c
+++ b/dir.c
@@ -2235,6 +2235,34 @@ static int get_index_dtype(struct index_state *istate,
 	return DT_UNKNOWN;
 }
 
+unsigned char get_dtype(struct dirent *e, struct strbuf *path)
+{
+	struct stat st;
+	unsigned char dtype = DTYPE(e);
+	size_t base_path_len;
+
+	if (dtype != DT_UNKNOWN)
+		return dtype;
+
+	/* d_type unknown in dirent, try to fall back on lstat results */
+	base_path_len = path->len;
+	strbuf_addstr(path, e->d_name);
+	if (lstat(path->buf, &st))
+		goto cleanup;
+
+	/* determine d_type from st_mode */
+	if (S_ISREG(st.st_mode))
+		dtype = DT_REG;
+	else if (S_ISDIR(st.st_mode))
+		dtype = DT_DIR;
+	else if (S_ISLNK(st.st_mode))
+		dtype = DT_LNK;
+
+cleanup:
+	strbuf_setlen(path, base_path_len);
+	return dtype;
+}
+
 static int resolve_dtype(int dtype, struct index_state *istate,
 			 const char *path, int len)
 {
diff --git a/dir.h b/dir.h
index ad06682fd54..28c630ce806 100644
--- a/dir.h
+++ b/dir.h
@@ -363,6 +363,17 @@ struct dir_struct {
 
 struct dirent *readdir_skip_dot_and_dotdot(DIR *dirp);
 
+/*
+ * Get the d_type of a dirent. If the d_type is unknown, derive it from
+ * stat.st_mode using the path to the dirent's containing directory (path) and
+ * the name of the dirent itself.
+ *
+ * Note that 'path' is assumed to have a trailing slash. It is also modified
+ * in-place during the execution of the function, but is then reverted to its
+ * original value before returning.
+ */
+unsigned char get_dtype(struct dirent *e, struct strbuf *path);
+
 /*Count the number of slashes for string s*/
 int count_slashes(const char *s);
 
-- 
gitgitgadget


^ permalink raw reply related

* [PATCH v2 1/4] ref-cache.c: fix prefix matching in ref iteration
From: Victoria Dye via GitGitGadget @ 2023-10-09 21:58 UTC (permalink / raw)
  To: git; +Cc: Patrick Steinhardt, Victoria Dye, Victoria Dye
In-Reply-To: <pull.1594.v2.git.1696888736.gitgitgadget@gmail.com>

From: Victoria Dye <vdye@github.com>

Update 'cache_ref_iterator_advance' to skip over refs that are not matched
by the given prefix.

Currently, a ref entry is considered "matched" if the entry name is fully
contained within the prefix:

* prefix: "refs/heads/v1"
* entry: "refs/heads/v1.0"

OR if the prefix is fully contained in the entry name:

* prefix: "refs/heads/v1.0"
* entry: "refs/heads/v1"

The first case is always correct, but the second is only correct if the ref
cache entry is a directory, for example:

* prefix: "refs/heads/example"
* entry: "refs/heads/"

Modify the logic in 'cache_ref_iterator_advance' to reflect these
expectations:

1. If 'overlaps_prefix' returns 'PREFIX_EXCLUDES_DIR', then the prefix and
   ref cache entry do not overlap at all. Skip this entry.
2. If 'overlaps_prefix' returns 'PREFIX_WITHIN_DIR', then the prefix matches
   inside this entry if it is a directory. Skip if the entry is not a
   directory, otherwise iterate over it.
3. Otherwise, 'overlaps_prefix' returned 'PREFIX_CONTAINS_DIR', indicating
   that the cache entry (directory or not) is fully contained by or equal to
   the prefix. Iterate over this entry.

Note that condition 2 relies on the names of directory entries having the
appropriate trailing slash. The existing function documentation of
'create_dir_entry' explicitly calls out the trailing slash requirement, so
this is a safe assumption to make.

This bug generally doesn't have any user-facing impact, since it requires:

1. using a non-empty prefix without a trailing slash in an iteration like
   'for_each_fullref_in',
2. the callback to said iteration not reapplying the original filter (as
   for-each-ref does) to ensure unmatched refs are skipped, and
3. the repository having one or more refs that match part of, but not all
   of, the prefix.

However, there are some niche scenarios that meet those criteria
(specifically, 'rev-parse --bisect' and '(log|show|shortlog) --bisect'). Add
tests covering those cases to demonstrate the fix in this patch.

Signed-off-by: Victoria Dye <vdye@github.com>
---
 refs/ref-cache.c              |  3 ++-
 t/t1500-rev-parse.sh          | 23 +++++++++++++++++++++++
 t/t4205-log-pretty-formats.sh | 30 ++++++++++++++++++++++++++++++
 3 files changed, 55 insertions(+), 1 deletion(-)

diff --git a/refs/ref-cache.c b/refs/ref-cache.c
index 2294c4564fb..6e3b725245c 100644
--- a/refs/ref-cache.c
+++ b/refs/ref-cache.c
@@ -412,7 +412,8 @@ static int cache_ref_iterator_advance(struct ref_iterator *ref_iterator)
 
 		if (level->prefix_state == PREFIX_WITHIN_DIR) {
 			entry_prefix_state = overlaps_prefix(entry->name, iter->prefix);
-			if (entry_prefix_state == PREFIX_EXCLUDES_DIR)
+			if (entry_prefix_state == PREFIX_EXCLUDES_DIR ||
+			    (entry_prefix_state == PREFIX_WITHIN_DIR && !(entry->flag & REF_DIR)))
 				continue;
 		} else {
 			entry_prefix_state = level->prefix_state;
diff --git a/t/t1500-rev-parse.sh b/t/t1500-rev-parse.sh
index 37ee5091b5c..3f9e7f62e45 100755
--- a/t/t1500-rev-parse.sh
+++ b/t/t1500-rev-parse.sh
@@ -264,4 +264,27 @@ test_expect_success 'rev-parse --since= unsqueezed ordering' '
 	test_cmp expect actual
 '
 
+test_expect_success 'rev-parse --bisect includes bad, excludes good' '
+	test_commit_bulk 6 &&
+
+	git update-ref refs/bisect/bad-1 HEAD~1 &&
+	git update-ref refs/bisect/b HEAD~2 &&
+	git update-ref refs/bisect/bad-3 HEAD~3 &&
+	git update-ref refs/bisect/good-3 HEAD~3 &&
+	git update-ref refs/bisect/bad-4 HEAD~4 &&
+	git update-ref refs/bisect/go HEAD~4 &&
+
+	# Note: refs/bisect/b and refs/bisect/go should be ignored because they
+	# do not match the refs/bisect/bad or refs/bisect/good prefixes.
+	cat >expect <<-EOF &&
+	refs/bisect/bad-1
+	refs/bisect/bad-3
+	refs/bisect/bad-4
+	^refs/bisect/good-3
+	EOF
+
+	git rev-parse --symbolic-full-name --bisect >actual &&
+	test_cmp expect actual
+'
+
 test_done
diff --git a/t/t4205-log-pretty-formats.sh b/t/t4205-log-pretty-formats.sh
index 16626e4fe96..62c7bfed5d7 100755
--- a/t/t4205-log-pretty-formats.sh
+++ b/t/t4205-log-pretty-formats.sh
@@ -956,6 +956,36 @@ test_expect_success '%S in git log --format works with other placeholders (part
 	test_cmp expect actual
 '
 
+test_expect_success 'setup more commits for %S with --bisect' '
+	test_commit four &&
+	test_commit five &&
+
+	head1=$(git rev-parse --verify HEAD~0) &&
+	head2=$(git rev-parse --verify HEAD~1) &&
+	head3=$(git rev-parse --verify HEAD~2) &&
+	head4=$(git rev-parse --verify HEAD~3)
+'
+
+test_expect_success '%S with --bisect labels commits with refs/bisect/bad ref' '
+	git update-ref refs/bisect/bad-$head1 $head1 &&
+	git update-ref refs/bisect/go $head1 &&
+	git update-ref refs/bisect/bad-$head2 $head2 &&
+	git update-ref refs/bisect/b $head3 &&
+	git update-ref refs/bisect/bad-$head4 $head4 &&
+	git update-ref refs/bisect/good-$head4 $head4 &&
+
+	# We expect to see the range of commits betwee refs/bisect/good-$head4
+	# and refs/bisect/bad-$head1. The "source" ref is the nearest bisect ref
+	# from which the commit is reachable.
+	cat >expect <<-EOF &&
+	$head1 refs/bisect/bad-$head1
+	$head2 refs/bisect/bad-$head2
+	$head3 refs/bisect/bad-$head2
+	EOF
+	git log --bisect --format="%H %S" >actual &&
+	test_cmp expect actual
+'
+
 test_expect_success 'log --pretty=reference' '
 	git log --pretty="tformat:%h (%s, %as)" >expect &&
 	git log --pretty=reference >actual &&
-- 
gitgitgadget


^ permalink raw reply related

* [PATCH v2 0/4] Performance improvement & cleanup in loose ref iteration
From: Victoria Dye via GitGitGadget @ 2023-10-09 21:58 UTC (permalink / raw)
  To: git; +Cc: Patrick Steinhardt, Victoria Dye
In-Reply-To: <pull.1594.git.1696615769.gitgitgadget@gmail.com>

While investigating ref iteration performance in builtins like
'for-each-ref' and 'show-ref', I found two small improvement opportunities.

The first patch tweaks the logic around prefix matching in
'cache_ref_iterator_advance' so that we correctly skip refs that do not
actually match a given prefix. The unnecessary iteration doesn't seem to be
causing any bugs in the ref iteration commands that I've tested, but it
doesn't hurt to be more precise (and it helps with some other patches I'm
working on ;) ).

The next three patches update how 'loose_fill_ref_dir' determines the type
of ref cache entry to create (directory or regular). On platforms that
include d_type information in 'struct dirent' (as far as I can tell, all
except NonStop & certain versions of Cygwin), this allows us to skip calling
'stat'. Benchmarking against repos with various quantities of loose refs
indicates a 5-8% speedup from these changes [1].


Changes since V1
================

 * Added tests in patch 1 to demonstrate the bugfix

Thanks!

 * Victoria

[1]
https://lore.kernel.org/git/28ae03f5-7091-d3f3-8a70-56aba6639640@github.com/

Victoria Dye (4):
  ref-cache.c: fix prefix matching in ref iteration
  dir.[ch]: expose 'get_dtype'
  dir.[ch]: add 'follow_symlink' arg to 'get_dtype'
  files-backend.c: avoid stat in 'loose_fill_ref_dir'

 diagnose.c                    | 42 +++--------------------------------
 dir.c                         | 33 +++++++++++++++++++++++++++
 dir.h                         | 16 +++++++++++++
 refs/files-backend.c          | 14 +++++-------
 refs/ref-cache.c              |  3 ++-
 t/t1500-rev-parse.sh          | 23 +++++++++++++++++++
 t/t4205-log-pretty-formats.sh | 30 +++++++++++++++++++++++++
 7 files changed, 112 insertions(+), 49 deletions(-)


base-commit: 3a06386e314565108ad56a9bdb8f7b80ac52fb69
Published-As: https://github.com/gitgitgadget/git/releases/tag/pr-1594%2Fvdye%2Fvdye%2Fref-iteration-cleanup-v2
Fetch-It-Via: git fetch https://github.com/gitgitgadget/git pr-1594/vdye/vdye/ref-iteration-cleanup-v2
Pull-Request: https://github.com/gitgitgadget/git/pull/1594

Range-diff vs v1:

 1:  59276a5b3fd ! 1:  402176246ea ref-cache.c: fix prefix matching in ref iteration
     @@ Commit message
          'create_dir_entry' explicitly calls out the trailing slash requirement, so
          this is a safe assumption to make.
      
     +    This bug generally doesn't have any user-facing impact, since it requires:
     +
     +    1. using a non-empty prefix without a trailing slash in an iteration like
     +       'for_each_fullref_in',
     +    2. the callback to said iteration not reapplying the original filter (as
     +       for-each-ref does) to ensure unmatched refs are skipped, and
     +    3. the repository having one or more refs that match part of, but not all
     +       of, the prefix.
     +
     +    However, there are some niche scenarios that meet those criteria
     +    (specifically, 'rev-parse --bisect' and '(log|show|shortlog) --bisect'). Add
     +    tests covering those cases to demonstrate the fix in this patch.
     +
          Signed-off-by: Victoria Dye <vdye@github.com>
      
       ## refs/ref-cache.c ##
     @@ refs/ref-cache.c: static int cache_ref_iterator_advance(struct ref_iterator *ref
       				continue;
       		} else {
       			entry_prefix_state = level->prefix_state;
     +
     + ## t/t1500-rev-parse.sh ##
     +@@ t/t1500-rev-parse.sh: test_expect_success 'rev-parse --since= unsqueezed ordering' '
     + 	test_cmp expect actual
     + '
     + 
     ++test_expect_success 'rev-parse --bisect includes bad, excludes good' '
     ++	test_commit_bulk 6 &&
     ++
     ++	git update-ref refs/bisect/bad-1 HEAD~1 &&
     ++	git update-ref refs/bisect/b HEAD~2 &&
     ++	git update-ref refs/bisect/bad-3 HEAD~3 &&
     ++	git update-ref refs/bisect/good-3 HEAD~3 &&
     ++	git update-ref refs/bisect/bad-4 HEAD~4 &&
     ++	git update-ref refs/bisect/go HEAD~4 &&
     ++
     ++	# Note: refs/bisect/b and refs/bisect/go should be ignored because they
     ++	# do not match the refs/bisect/bad or refs/bisect/good prefixes.
     ++	cat >expect <<-EOF &&
     ++	refs/bisect/bad-1
     ++	refs/bisect/bad-3
     ++	refs/bisect/bad-4
     ++	^refs/bisect/good-3
     ++	EOF
     ++
     ++	git rev-parse --symbolic-full-name --bisect >actual &&
     ++	test_cmp expect actual
     ++'
     ++
     + test_done
     +
     + ## t/t4205-log-pretty-formats.sh ##
     +@@ t/t4205-log-pretty-formats.sh: test_expect_success '%S in git log --format works with other placeholders (part
     + 	test_cmp expect actual
     + '
     + 
     ++test_expect_success 'setup more commits for %S with --bisect' '
     ++	test_commit four &&
     ++	test_commit five &&
     ++
     ++	head1=$(git rev-parse --verify HEAD~0) &&
     ++	head2=$(git rev-parse --verify HEAD~1) &&
     ++	head3=$(git rev-parse --verify HEAD~2) &&
     ++	head4=$(git rev-parse --verify HEAD~3)
     ++'
     ++
     ++test_expect_success '%S with --bisect labels commits with refs/bisect/bad ref' '
     ++	git update-ref refs/bisect/bad-$head1 $head1 &&
     ++	git update-ref refs/bisect/go $head1 &&
     ++	git update-ref refs/bisect/bad-$head2 $head2 &&
     ++	git update-ref refs/bisect/b $head3 &&
     ++	git update-ref refs/bisect/bad-$head4 $head4 &&
     ++	git update-ref refs/bisect/good-$head4 $head4 &&
     ++
     ++	# We expect to see the range of commits betwee refs/bisect/good-$head4
     ++	# and refs/bisect/bad-$head1. The "source" ref is the nearest bisect ref
     ++	# from which the commit is reachable.
     ++	cat >expect <<-EOF &&
     ++	$head1 refs/bisect/bad-$head1
     ++	$head2 refs/bisect/bad-$head2
     ++	$head3 refs/bisect/bad-$head2
     ++	EOF
     ++	git log --bisect --format="%H %S" >actual &&
     ++	test_cmp expect actual
     ++'
     ++
     + test_expect_success 'log --pretty=reference' '
     + 	git log --pretty="tformat:%h (%s, %as)" >expect &&
     + 	git log --pretty=reference >actual &&
 2:  24014010ea3 = 2:  172538b5e30 dir.[ch]: expose 'get_dtype'
 3:  a382d2ba652 = 3:  295ca94003b dir.[ch]: add 'follow_symlink' arg to 'get_dtype'
 4:  e193a453182 = 4:  e89501cb51f files-backend.c: avoid stat in 'loose_fill_ref_dir'

-- 
gitgitgadget

^ permalink raw reply

* Re: [PATCH 0/4] Performance improvement & cleanup in loose ref iteration
From: Victoria Dye @ 2023-10-09 21:49 UTC (permalink / raw)
  To: Patrick Steinhardt, Victoria Dye via GitGitGadget; +Cc: git
In-Reply-To: <ZSPQI2gkLOSdNWLu@tanuki>

Patrick Steinhardt wrote:
> On Fri, Oct 06, 2023 at 06:09:25PM +0000, Victoria Dye via GitGitGadget wrote:
>> While investigating ref iteration performance in builtins like
>> 'for-each-ref' and 'show-ref', I found two small improvement opportunities.
>>
>> The first patch tweaks the logic around prefix matching in
>> 'cache_ref_iterator_advance' so that we correctly skip refs that do not
>> actually match a given prefix. The unnecessary iteration doesn't seem to be
>> causing any bugs in the ref iteration commands that I've tested, but it
>> doesn't hurt to be more precise (and it helps with some other patches I'm
>> working on ;) ).
>>
>> The next three patches update how 'loose_fill_ref_dir' determines the type
>> of ref cache entry to create (directory or regular). On platforms that
>> include d_type information in 'struct dirent' (as far as I can tell, all
>> except NonStop & certain versions of Cygwin), this allows us to skip calling
>> 'stat'. In ad-hoc testing, this improved performance of 'git for-each-ref'
>> by about 20%.
> 
> I've done a small set of benchmarks with my usual test repositories,
> which is linux.git with a bunch of references added. The repository
> comes in four sizes:
> 
> - small: 50k references
> - medium: 500k references
> - high:  1.1m references
> - huge: 12m references
> 
> Unfortunately, I couldn't really reproduce the performance improvements.
> In fact, the new version runs consistently a tiny bit slower than the
> old version:
> 
>     # Old version, which is 3a06386e31 (The fifteenth batch, 2023-10-04).
> 
>     Benchmark 1: git for-each-ref (revision=old,refcount=small)
>       Time (mean ± σ):     135.5 ms ±   1.2 ms    [User: 76.4 ms, System: 59.0 ms]
>       Range (min … max):   134.8 ms … 136.9 ms    3 runs
> 
>     Benchmark 2: git for-each-ref (revision=old,refcount=medium)
>       Time (mean ± σ):     822.7 ms ±   2.2 ms    [User: 697.4 ms, System: 125.1 ms]
>       Range (min … max):   821.1 ms … 825.2 ms    3 runs
> 
>     Benchmark 3: git for-each-ref (revision=old,refcount=high)
>       Time (mean ± σ):      1.960 s ±  0.015 s    [User: 1.702 s, System: 0.257 s]
>       Range (min … max):    1.944 s …  1.973 s    3 runs
> 
>     # New version, which is your tip.
> 
>     Benchmark 4: git for-each-ref (revision=old,refcount=huge)
>       Time (mean ± σ):     16.815 s ±  0.054 s    [User: 15.091 s, System: 1.722 s]
>       Range (min … max):   16.760 s … 16.869 s    3 runs
> 
>     Benchmark 5: git for-each-ref (revision=new,refcount=small)
>       Time (mean ± σ):     136.0 ms ±   0.2 ms    [User: 78.8 ms, System: 57.1 ms]
>       Range (min … max):   135.8 ms … 136.2 ms    3 runs
> 
>     Benchmark 6: git for-each-ref (revision=new,refcount=medium)
>       Time (mean ± σ):     830.4 ms ±  21.2 ms    [User: 691.3 ms, System: 138.7 ms]
>       Range (min … max):   814.2 ms … 854.5 ms    3 runs
> 
>     Benchmark 7: git for-each-ref (revision=new,refcount=high)
>       Time (mean ± σ):      1.966 s ±  0.013 s    [User: 1.717 s, System: 0.249 s]
>       Range (min … max):    1.952 s …  1.978 s    3 runs
> 
>     Benchmark 8: git for-each-ref (revision=new,refcount=huge)
>       Time (mean ± σ):     16.945 s ±  0.037 s    [User: 15.182 s, System: 1.760 s]
>       Range (min … max):   16.910 s … 16.983 s    3 runs
> 
>     Summary
>       git for-each-ref (revision=old,refcount=small) ran
>         1.00 ± 0.01 times faster than git for-each-ref (revision=new,refcount=small)
>         6.07 ± 0.06 times faster than git for-each-ref (revision=old,refcount=medium)
>         6.13 ± 0.17 times faster than git for-each-ref (revision=new,refcount=medium)
>        14.46 ± 0.17 times faster than git for-each-ref (revision=old,refcount=high)
>        14.51 ± 0.16 times faster than git for-each-ref (revision=new,refcount=high)
>       124.09 ± 1.15 times faster than git for-each-ref (revision=old,refcount=huge)
>       125.05 ± 1.12 times faster than git for-each-ref (revision=new,refcount=huge)
> 
> The performance regression isn't all that concerning, but it makes me
> wonder why I see things becoming slower rather than faster. My guess is
> that this is because all my test repositories are well-packed and don't
> have a lot of loose references. But I just wanted to confirm how you
> benchmarked your change and what the underlying shape of your test repo
> was.

I ran my benchmark on my (Intel) Mac with a test repository (single commit,
one file) containing:

- 10k refs/heads/ references
- 10k refs/tags/ references
- 10k refs/special/ references 

All refs in the repository are loose. My Mac has historically been somewhat
slow and inconsistent when it comes to perf testing, though, so I re-ran the
benchmark a bit more formally on an Ubuntu VM (3 warmup iterations followed
by at least 10 iterations per test):

---

Benchmark 1: git for-each-ref (revision=old,refcount=3k)
  Time (mean ± σ):      40.6 ms ±   3.9 ms    [User: 13.2 ms, System: 27.1 ms]
  Range (min … max):    37.2 ms …  59.1 ms    76 runs
 
  Warning: Statistical outliers were detected. Consider re-running this benchmark on a quiet system without any interferences from other programs. It might help to use the '--warmup' or '--prepare' options.
 
Benchmark 2: git for-each-ref (revision=new,refcount=3k)
  Time (mean ± σ):      38.7 ms ±   4.4 ms    [User: 13.8 ms, System: 24.5 ms]
  Range (min … max):    35.1 ms …  57.2 ms    71 runs
 
  Warning: Statistical outliers were detected. Consider re-running this benchmark on a quiet system without any interferences from other programs. It might help to use the '--warmup' or '--prepare' options.
 
Benchmark 3: git for-each-ref (revision=old,refcount=30k)
  Time (mean ± σ):     419.4 ms ±  43.9 ms    [User: 136.4 ms, System: 274.1 ms]
  Range (min … max):   385.1 ms … 528.7 ms    10 runs
 
Benchmark 4: git for-each-ref (revision=new,refcount=30k)
  Time (mean ± σ):     390.4 ms ±  27.2 ms    [User: 133.1 ms, System: 251.6 ms]
  Range (min … max):   360.3 ms … 447.6 ms    10 runs
 
Benchmark 5: git for-each-ref (revision=old,refcount=300k)
  Time (mean ± σ):      4.171 s ±  0.052 s    [User: 1.400 s, System: 2.715 s]
  Range (min … max):    4.118 s …  4.283 s    10 runs
 
Benchmark 6: git for-each-ref (revision=new,refcount=300k)
  Time (mean ± σ):      3.939 s ±  0.054 s    [User: 1.403 s, System: 2.466 s]
  Range (min … max):    3.858 s …  4.026 s    10 runs
 
Summary
  'git for-each-ref (revision=new,refcount=3k)' ran
    1.05 ± 0.16 times faster than 'git for-each-ref (revision=old,refcount=3k)'
   10.08 ± 1.34 times faster than 'git for-each-ref (revision=new,refcount=30k)'
   10.83 ± 1.67 times faster than 'git for-each-ref (revision=old,refcount=30k)'
  101.68 ± 11.63 times faster than 'git for-each-ref (revision=new,refcount=300k)'
  107.67 ± 12.30 times faster than 'git for-each-ref (revision=old,refcount=300k)'

---

So it's not the 20% speedup I saw on my local test repo (it's more like
5-8%), but there does appear to be a consistent improvement. As for your
results, the changes in this series shouldn't affect packed ref operations,
and the difference between old & new doesn't seem to indicate a regression. 


^ permalink raw reply

* Re: [PATCH v3] merge-ort: initialize repo in index state
From: Junio C Hamano @ 2023-10-09 21:41 UTC (permalink / raw)
  To: John Cai via GitGitGadget; +Cc: git, Elijah Newren, John Cai
In-Reply-To: <pull.1583.v3.git.git.1696857660374.gitgitgadget@gmail.com>

"John Cai via GitGitGadget" <gitgitgadget@gmail.com> writes:

>     Fix this by initializing the repository in the index state.
>     
>     Changes since V2:
>     
>      * fixed test by using printf instead of echo

Much better than using unportable \n with echo.

>      -+		echo "foo\nbar\nbaz" >expect &&
>      ++		printf "foo\nbar\nbaz\n" >expect &&

But if we are using printf, it would be easier to read lines
separately, which would look more like

	printf "%s\n" foo bar baz >expect

And we have

	test_write_lines foo bar baz >expect

to make it even more discoverable.

>       +		git cat-file -p "$tree:file1" >actual &&
>       +		test_cmp expect actual
>       +	)
>
>
>  merge-ort.c           |  1 +
>  t/t4300-merge-tree.sh | 27 +++++++++++++++++++++++++++
>  2 files changed, 28 insertions(+)
>
> diff --git a/merge-ort.c b/merge-ort.c
> index 7857ce9fbd1..36537256613 100644
> --- a/merge-ort.c
> +++ b/merge-ort.c
> @@ -1902,6 +1902,7 @@ static void initialize_attr_index(struct merge_options *opt)
>  	struct index_state *attr_index = &opt->priv->attr_index;
>  	struct cache_entry *ce;
>  
> +	attr_index->repo = opt->repo;
>  	attr_index->initialized = 1;
>  
>  	if (!opt->renormalize)
> diff --git a/t/t4300-merge-tree.sh b/t/t4300-merge-tree.sh
> index 57c4f26e461..c3a03e54187 100755
> --- a/t/t4300-merge-tree.sh
> +++ b/t/t4300-merge-tree.sh
> @@ -86,6 +86,33 @@ EXPECTED
>  	test_cmp expected actual
>  '
>  
> +test_expect_success '3-way merge with --attr-source' '
> +	test_when_finished rm -rf 3-way &&
> +	git init 3-way &&
> +	(
> +		cd 3-way &&
> +		test_commit initial file1 foo &&
> +		base=$(git rev-parse HEAD) &&
> +		git checkout -b brancha &&
> +		echo bar >>file1 &&
> +		git commit -am "adding bar" &&
> +		source=$(git rev-parse HEAD) &&
> +		git checkout @{-1} &&
> +		git checkout -b branchb &&
> +		echo baz >>file1 &&
> +		git commit -am "adding baz" &&
> +		merge=$(git rev-parse HEAD) &&
> +		git checkout -b gitattributes &&
> +		test_commit "gitattributes" .gitattributes "file1 merge=union" &&

OK, the branch "gitattributes" will be used to drive merge of file1
using the union merge to avoid conflicting.

> +		git checkout @{-1} &&

But such attribute will only be available in that branch, not in the
checked out working tree.  And then

> +		tree=$(git --attr-source=gitattributes merge-tree --write-tree \
> +		--merge-base "$base" --end-of-options "$source" "$merge") &&

we use the gitattributes branch as the tree-ish to take the
attribute information from.  Makes sense.

> +		printf "foo\nbar\nbaz\n" >expect &&

I'll squash in the "test_write_lines" change while queuing.

> +		git cat-file -p "$tree:file1" >actual &&
> +		test_cmp expect actual
> +	)
> +'
> +
>  test_expect_success 'file change A, B (same)' '
>  	git reset --hard initial &&
>  	test_commit "change-a-b-same-A" "initial-file" "AAA" &&
>
> base-commit: 493f4622739e9b64f24b465b21aa85870dd9dc09

Thanks.  Looking good.

^ permalink raw reply

* Re: What's cooking in git.git (Oct 2023, #03; Fri, 6)
From: Junio C Hamano @ 2023-10-09 21:07 UTC (permalink / raw)
  To: Taylor Blau; +Cc: git
In-Reply-To: <ZSRC5IaKoXPpTFnq@nand.local>

Taylor Blau <me@ttaylorr.com> writes:

>> > Thanks. On a semi-related note, did you want to pick up my patch in
>> >
>> >   https://lore.kernel.org/git/035393935108d02aaf8927189b05102f4f74f340.1696370003.git.me@ttaylorr.com/
>> >
>> > ? That should address a performance bug that occurs when a repository
>> > (incorrectly) chooses a cruft pack as its preferred pack source when
>> > writing a MIDX bitmap, significantly impeding the pack-reuse mechanism.
>>
>> Isn't that in the above list already as b3ca6df3b9^2?
>
> Oops, duh. I hadn't expected you to group that patch in with the rest of
> tb/repack-max-cruft-size.

Heh, as you said, isn't it at least semi-related ;-)?

^ permalink raw reply

* [PATCH 20/20] chunk-format: drop pair_chunk_unsafe()
From: Jeff King @ 2023-10-09 21:06 UTC (permalink / raw)
  To: git; +Cc: Taylor Blau
In-Reply-To: <20231009205544.GA3281950@coredump.intra.peff.net>

There are no callers left, and we don't want anybody to add new ones (they
should use the not-unsafe version instead). So let's drop the function.

Signed-off-by: Jeff King <peff@peff.net>
---
 chunk-format.c |  8 --------
 chunk-format.h | 13 -------------
 2 files changed, 21 deletions(-)

diff --git a/chunk-format.c b/chunk-format.c
index 09ef86afa6..cdc7f39b70 100644
--- a/chunk-format.c
+++ b/chunk-format.c
@@ -184,14 +184,6 @@ int pair_chunk(struct chunkfile *cf,
 	return read_chunk(cf, chunk_id, pair_chunk_fn, &pcd);
 }
 
-int pair_chunk_unsafe(struct chunkfile *cf,
-		      uint32_t chunk_id,
-		      const unsigned char **p)
-{
-	size_t dummy;
-	return pair_chunk(cf, chunk_id, p, &dummy);
-}
-
 int read_chunk(struct chunkfile *cf,
 	       uint32_t chunk_id,
 	       chunk_read_fn fn,
diff --git a/chunk-format.h b/chunk-format.h
index d608b8135c..14b76180ef 100644
--- a/chunk-format.h
+++ b/chunk-format.h
@@ -54,19 +54,6 @@ int pair_chunk(struct chunkfile *cf,
 	       const unsigned char **p,
 	       size_t *size);
 
-/*
- * Unsafe version of pair_chunk; it does not return the size,
- * meaning that the caller cannot possibly be careful about
- * reading out of bounds from the mapped memory.
- *
- * No new callers should use this function, and old callers should
- * be audited and migrated over to using the regular pair_chunk()
- * function.
- */
-int pair_chunk_unsafe(struct chunkfile *cf,
-		      uint32_t chunk_id,
-		      const unsigned char **p);
-
 typedef int (*chunk_read_fn)(const unsigned char *chunk_start,
 			     size_t chunk_size, void *data);
 /*
-- 
2.42.0.884.g35e1fe1a6a

^ permalink raw reply related

* [PATCH 19/20] commit-graph: detect out-of-order BIDX offsets
From: Jeff King @ 2023-10-09 21:05 UTC (permalink / raw)
  To: git; +Cc: Taylor Blau
In-Reply-To: <20231009205544.GA3281950@coredump.intra.peff.net>

The BIDX chunk tells us the offsets at which each commit's Bloom filters
can be found in the BDAT chunk. We compute the length of each filter by
checking the offsets of neighbors and subtracting them.

If the offsets are out of order, then we'll get a negative length, which
we then store as a very large unsigned value. This can cause us to read
out-of-bounds memory, as we access the hash data modulo "filter->len *
BITS_PER_WORD".

We can easily detect this case when loading the individual filters.

Signed-off-by: Jeff King <peff@peff.net>
---
 bloom.c              | 10 ++++++++++
 t/t4216-log-bloom.sh | 13 +++++++++++++
 2 files changed, 23 insertions(+)

diff --git a/bloom.c b/bloom.c
index 61abad7f8c..1474aa19fa 100644
--- a/bloom.c
+++ b/bloom.c
@@ -75,6 +75,16 @@ static int load_bloom_filter_from_graph(struct commit_graph *g,
 	    check_bloom_offset(g, lex_pos - 1, start_index) < 0)
 		return 0;
 
+	if (end_index < start_index) {
+		warning("ignoring decreasing changed-path index offsets"
+			" (%"PRIuMAX" > %"PRIuMAX") for positions"
+			" %"PRIuMAX" and %"PRIuMAX" of %s",
+			(uintmax_t)start_index, (uintmax_t)end_index,
+			(uintmax_t)(lex_pos-1), (uintmax_t)lex_pos,
+			g->filename);
+		return 0;
+	}
+
 	filter->len = end_index - start_index;
 	filter->data = (unsigned char *)(g->chunk_bloom_data +
 					sizeof(unsigned char) * start_index +
diff --git a/t/t4216-log-bloom.sh b/t/t4216-log-bloom.sh
index f6054cbb27..2ba0324a69 100755
--- a/t/t4216-log-bloom.sh
+++ b/t/t4216-log-bloom.sh
@@ -441,4 +441,17 @@ test_expect_success 'Bloom reader notices too-small index chunk' '
 	test_cmp expect.err err
 '
 
+test_expect_success 'Bloom reader notices out-of-order index offsets' '
+	# we do not know any real offsets, but we can pick
+	# something plausible; we should not get to the point of
+	# actually reading from the bogus offsets anyway.
+	corrupt_graph BIDX 4 0000000c00000005 &&
+	echo "warning: ignoring decreasing changed-path index offsets" \
+		"(12 > 5) for positions 1 and 2 of .git/objects/info/commit-graph" >expect.err &&
+	git -c core.commitGraph=false log -- A/B/file2 >expect.out &&
+	git -c core.commitGraph=true log -- A/B/file2 >out 2>err &&
+	test_cmp expect.out out &&
+	test_cmp expect.err err
+'
+
 test_done
-- 
2.42.0.884.g35e1fe1a6a


^ permalink raw reply related

* [PATCH 18/20] commit-graph: check bounds when accessing BIDX chunk
From: Jeff King @ 2023-10-09 21:05 UTC (permalink / raw)
  To: git; +Cc: Taylor Blau
In-Reply-To: <20231009205544.GA3281950@coredump.intra.peff.net>

We load the bloom_filter_indexes chunk using pair_chunk(), so we have no
idea how big it is. This can lead to out-of-bounds reads if it is
smaller than expected, since we index it based on the number of commits
found elsewhere in the graph file.

We can check the chunk size up front, like we do for CDAT and other
chunks with one fixed-size record per commit.

The test case demonstrates the problem. It actually won't segfault,
because we end up reading random data from the follow-on chunk (BDAT in
this case), and the bounds checks added in the previous patch complain.
But this is by no means assured, and you can craft a commit-graph file
with BIDX at the end (or a smaller BDAT) that does segfault.

Signed-off-by: Jeff King <peff@peff.net>
---
 commit-graph.c       | 16 ++++++++++++++--
 t/t4216-log-bloom.sh |  9 +++++++++
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/commit-graph.c b/commit-graph.c
index f7a42be6d0..1f334987b5 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -360,6 +360,18 @@ static int graph_read_generation_data(const unsigned char *chunk_start,
 	return 0;
 }
 
+static int graph_read_bloom_index(const unsigned char *chunk_start,
+				  size_t chunk_size, void *data)
+{
+	struct commit_graph *g = data;
+	if (chunk_size != g->num_commits * 4) {
+		warning("commit-graph changed-path index chunk is too small");
+		return -1;
+	}
+	g->chunk_bloom_indexes = chunk_start;
+	return 0;
+}
+
 static int graph_read_bloom_data(const unsigned char *chunk_start,
 				  size_t chunk_size, void *data)
 {
@@ -470,8 +482,8 @@ struct commit_graph *parse_commit_graph(struct repo_settings *s,
 	}
 
 	if (s->commit_graph_read_changed_paths) {
-		pair_chunk_unsafe(cf, GRAPH_CHUNKID_BLOOMINDEXES,
-			   &graph->chunk_bloom_indexes);
+		read_chunk(cf, GRAPH_CHUNKID_BLOOMINDEXES,
+			   graph_read_bloom_index, graph);
 		read_chunk(cf, GRAPH_CHUNKID_BLOOMDATA,
 			   graph_read_bloom_data, graph);
 	}
diff --git a/t/t4216-log-bloom.sh b/t/t4216-log-bloom.sh
index 7a727bcddd..f6054cbb27 100755
--- a/t/t4216-log-bloom.sh
+++ b/t/t4216-log-bloom.sh
@@ -432,4 +432,13 @@ test_expect_success 'Bloom reader notices out-of-bounds filter offsets' '
 	grep "warning: ignoring out-of-range offset (4294967295) for changed-path filter at pos 3 of .git/objects/info/commit-graph" err
 '
 
+test_expect_success 'Bloom reader notices too-small index chunk' '
+	# replace the index with a single entry, making most
+	# lookups out-of-bounds
+	check_corrupt_graph BIDX clear 00000000 &&
+	echo "warning: commit-graph changed-path index chunk" \
+		"is too small" >expect.err &&
+	test_cmp expect.err err
+'
+
 test_done
-- 
2.42.0.884.g35e1fe1a6a


^ permalink raw reply related

* [PATCH 17/20] commit-graph: check bounds when accessing BDAT chunk
From: Jeff King @ 2023-10-09 21:05 UTC (permalink / raw)
  To: git; +Cc: Taylor Blau
In-Reply-To: <20231009205544.GA3281950@coredump.intra.peff.net>

When loading Bloom filters from a commit-graph file, we use the offset
values in the BIDX chunk to index into the memory mapped for the BDAT
chunk. But since we don't record how big the BDAT chunk is, we just
trust that the BIDX offsets won't cause us to read outside of the chunk
memory. A corrupted or malicious commit-graph file will cause us to
segfault (in practice this isn't a very interesting attack, since
commit-graph files are local-only, and the worst case is an
out-of-bounds read).

We can't fix this by checking the chunk size during parsing, since the
data in the BDAT chunk doesn't have a fixed size (that's why we need the
BIDX in the first place). So we'll fix it in two parts:

  1. Record the BDAT chunk size during parsing, and then later check
     that the BIDX offsets we look up are within bounds.

  2. Because the offsets are relative to the end of the BDAT header, we
     must also make sure that the BDAT chunk is at least as large as the
     expected header size. Otherwise, we overflow when trying to move
     past the header, even for an offset of "0". We can check this
     early, during the parsing stage.

The error messages are rather verbose, but since this is not something
you'd expect to see outside of severe bugs or corruption, it makes sense
to err on the side of too many details. Sadly we can't mention the
filename during the chunk-parsing stage, as we haven't set g->filename
at this point, nor passed it down through the stack.

Signed-off-by: Jeff King <peff@peff.net>
---
 bloom.c              | 24 ++++++++++++++++++++++++
 commit-graph.c       | 10 ++++++++++
 commit-graph.h       |  1 +
 t/t4216-log-bloom.sh | 28 ++++++++++++++++++++++++++++
 4 files changed, 63 insertions(+)

diff --git a/bloom.c b/bloom.c
index aef6b5fea2..61abad7f8c 100644
--- a/bloom.c
+++ b/bloom.c
@@ -29,6 +29,26 @@ static inline unsigned char get_bitmask(uint32_t pos)
 	return ((unsigned char)1) << (pos & (BITS_PER_WORD - 1));
 }
 
+static int check_bloom_offset(struct commit_graph *g, uint32_t pos,
+			      uint32_t offset)
+{
+	/*
+	 * Note that we allow offsets equal to the data size, which would set
+	 * our pointers at one past the end of the chunk memory. This is
+	 * necessary because the on-disk index points to the end of the
+	 * entries (so we can compute size by comparing adjacent ones). And
+	 * naturally the final entry's end is one-past-the-end of the chunk.
+	 */
+	if (offset <= g->chunk_bloom_data_size - BLOOMDATA_CHUNK_HEADER_SIZE)
+		return 0;
+
+	warning("ignoring out-of-range offset (%"PRIuMAX") for changed-path"
+		" filter at pos %"PRIuMAX" of %s (chunk size: %"PRIuMAX")",
+		(uintmax_t)offset, (uintmax_t)pos,
+		g->filename, (uintmax_t)g->chunk_bloom_data_size);
+	return -1;
+}
+
 static int load_bloom_filter_from_graph(struct commit_graph *g,
 					struct bloom_filter *filter,
 					uint32_t graph_pos)
@@ -51,6 +71,10 @@ static int load_bloom_filter_from_graph(struct commit_graph *g,
 	else
 		start_index = 0;
 
+	if (check_bloom_offset(g, lex_pos, end_index) < 0 ||
+	    check_bloom_offset(g, lex_pos - 1, start_index) < 0)
+		return 0;
+
 	filter->len = end_index - start_index;
 	filter->data = (unsigned char *)(g->chunk_bloom_data +
 					sizeof(unsigned char) * start_index +
diff --git a/commit-graph.c b/commit-graph.c
index f446e76c28..f7a42be6d0 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -365,7 +365,17 @@ static int graph_read_bloom_data(const unsigned char *chunk_start,
 {
 	struct commit_graph *g = data;
 	uint32_t hash_version;
+
+	if (chunk_size < BLOOMDATA_CHUNK_HEADER_SIZE) {
+		warning("ignoring too-small changed-path chunk"
+			" (%"PRIuMAX" < %"PRIuMAX") in commit-graph file",
+			(uintmax_t)chunk_size,
+			(uintmax_t)BLOOMDATA_CHUNK_HEADER_SIZE);
+		return -1;
+	}
+
 	g->chunk_bloom_data = chunk_start;
+	g->chunk_bloom_data_size = chunk_size;
 	hash_version = get_be32(chunk_start);
 
 	if (hash_version != 1)
diff --git a/commit-graph.h b/commit-graph.h
index b373f15802..c6870274c5 100644
--- a/commit-graph.h
+++ b/commit-graph.h
@@ -101,6 +101,7 @@ struct commit_graph {
 	size_t chunk_base_graphs_size;
 	const unsigned char *chunk_bloom_indexes;
 	const unsigned char *chunk_bloom_data;
+	size_t chunk_bloom_data_size;
 
 	struct topo_level_slab *topo_levels;
 	struct bloom_filter_settings *bloom_filter_settings;
diff --git a/t/t4216-log-bloom.sh b/t/t4216-log-bloom.sh
index fa9d32facf..7a727bcddd 100755
--- a/t/t4216-log-bloom.sh
+++ b/t/t4216-log-bloom.sh
@@ -5,6 +5,7 @@ GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
 export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
 
 . ./test-lib.sh
+. "$TEST_DIRECTORY"/lib-chunk.sh
 
 GIT_TEST_COMMIT_GRAPH=0
 GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS=0
@@ -404,4 +405,31 @@ test_expect_success 'Bloom generation backfills empty commits' '
 	)
 '
 
+corrupt_graph () {
+	graph=.git/objects/info/commit-graph &&
+	test_when_finished "rm -rf $graph" &&
+	git commit-graph write --reachable --changed-paths &&
+	corrupt_chunk_file $graph "$@"
+}
+
+check_corrupt_graph () {
+	corrupt_graph "$@" &&
+	git -c core.commitGraph=false log -- A/B/file2 >expect.out &&
+	git -c core.commitGraph=true log -- A/B/file2 >out 2>err &&
+	test_cmp expect.out out
+}
+
+test_expect_success 'Bloom reader notices too-small data chunk' '
+	check_corrupt_graph BDAT clear 00000000 &&
+	echo "warning: ignoring too-small changed-path chunk" \
+		"(4 < 12) in commit-graph file" >expect.err &&
+	test_cmp expect.err err
+'
+
+test_expect_success 'Bloom reader notices out-of-bounds filter offsets' '
+	check_corrupt_graph BIDX 12 FFFFFFFF &&
+	# use grep to avoid depending on exact chunk size
+	grep "warning: ignoring out-of-range offset (4294967295) for changed-path filter at pos 3 of .git/objects/info/commit-graph" err
+'
+
 test_done
-- 
2.42.0.884.g35e1fe1a6a


^ permalink raw reply related

* [PATCH 16/20] commit-graph: bounds-check generation overflow chunk
From: Jeff King @ 2023-10-09 21:05 UTC (permalink / raw)
  To: git; +Cc: Taylor Blau
In-Reply-To: <20231009205544.GA3281950@coredump.intra.peff.net>

If the generation entry in a commit-graph doesn't fit, we instead insert
an offset into a generation overflow chunk. But since we don't record
the size of the chunk, we may read outside the chunk if the offset we
find on disk is malicious or corrupted.

We can't check the size of the chunk up-front; it will vary based on how
many entries need overflow. So instead, we'll do a bounds-check before
accessing the chunk memory. Unfortunately there is no error-return from
this function, so we'll just have to die(), which is what it does for
other forms of corruption.

As with other cases, we can drop the st_mult() call, since we know our
bounds-checked value will fit within a size_t.

Before this patch, the test here actually "works" because we read
garbage data from the next chunk. And since that garbage data happens
not to provide a generation number which changes the output, it appears
to work. We could construct a case that actually segfaults or produces
wrong output, but it would be a bit tricky. For our purposes its
sufficient to check that we've detected the bounds error.

Signed-off-by: Jeff King <peff@peff.net>
---
 commit-graph.c                     | 10 +++++++---
 commit-graph.h                     |  1 +
 t/t5328-commit-graph-64bit-time.sh | 10 ++++++++++
 3 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/commit-graph.c b/commit-graph.c
index ca26870d1b..f446e76c28 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -451,8 +451,9 @@ struct commit_graph *parse_commit_graph(struct repo_settings *s,
 	if (s->commit_graph_generation_version >= 2) {
 		read_chunk(cf, GRAPH_CHUNKID_GENERATION_DATA,
 			   graph_read_generation_data, graph);
-		pair_chunk_unsafe(cf, GRAPH_CHUNKID_GENERATION_DATA_OVERFLOW,
-			&graph->chunk_generation_data_overflow);
+		pair_chunk(cf, GRAPH_CHUNKID_GENERATION_DATA_OVERFLOW,
+			   &graph->chunk_generation_data_overflow,
+			   &graph->chunk_generation_data_overflow_size);
 
 		if (graph->chunk_generation_data)
 			graph->read_generation_data = 1;
@@ -896,7 +897,10 @@ static void fill_commit_graph_info(struct commit *item, struct commit_graph *g,
 				die(_("commit-graph requires overflow generation data but has none"));
 
 			offset_pos = offset ^ CORRECTED_COMMIT_DATE_OFFSET_OVERFLOW;
-			graph_data->generation = item->date + get_be64(g->chunk_generation_data_overflow + st_mult(8, offset_pos));
+			if (g->chunk_generation_data_overflow_size / sizeof(uint64_t) <= offset_pos)
+				die(_("commit-graph overflow generation data is too small"));
+			graph_data->generation = item->date +
+				get_be64(g->chunk_generation_data_overflow + sizeof(uint64_t) * offset_pos);
 		} else
 			graph_data->generation = item->date + offset;
 	} else
diff --git a/commit-graph.h b/commit-graph.h
index e4248ea05d..b373f15802 100644
--- a/commit-graph.h
+++ b/commit-graph.h
@@ -94,6 +94,7 @@ struct commit_graph {
 	const unsigned char *chunk_commit_data;
 	const unsigned char *chunk_generation_data;
 	const unsigned char *chunk_generation_data_overflow;
+	size_t chunk_generation_data_overflow_size;
 	const unsigned char *chunk_extra_edges;
 	size_t chunk_extra_edges_size;
 	const unsigned char *chunk_base_graphs;
diff --git a/t/t5328-commit-graph-64bit-time.sh b/t/t5328-commit-graph-64bit-time.sh
index e9c521c061..e5ff3e07ad 100755
--- a/t/t5328-commit-graph-64bit-time.sh
+++ b/t/t5328-commit-graph-64bit-time.sh
@@ -10,6 +10,7 @@ then
 fi
 
 . "$TEST_DIRECTORY"/lib-commit-graph.sh
+. "$TEST_DIRECTORY/lib-chunk.sh"
 
 UNIX_EPOCH_ZERO="@0 +0000"
 FUTURE_DATE="@4147483646 +0000"
@@ -72,4 +73,13 @@ test_expect_success 'single commit with generation data exceeding UINT32_MAX' '
 	git -C repo-uint32-max commit-graph verify
 '
 
+test_expect_success 'reader notices out-of-bounds generation overflow' '
+	graph=.git/objects/info/commit-graph &&
+	test_when_finished "rm -rf $graph" &&
+	git commit-graph write --reachable &&
+	corrupt_chunk_file $graph GDO2 clear &&
+	test_must_fail git log 2>err &&
+	grep "commit-graph overflow generation data is too small" err
+'
+
 test_done
-- 
2.42.0.884.g35e1fe1a6a


^ permalink raw reply related

* [PATCH 15/20] commit-graph: check size of generations chunk
From: Jeff King @ 2023-10-09 21:05 UTC (permalink / raw)
  To: git; +Cc: Taylor Blau
In-Reply-To: <20231009205544.GA3281950@coredump.intra.peff.net>

We neither check nor record the size of the generations chunk we parse
from a commit-graph file. This should have one uint32_t for each commit
in the file; if it is smaller (due to corruption, etc), we may read
outside the mapped memory.

The included test segfaults without this patch, as it shrinks the size
considerably (and the chunk is near the end of the file, so we read off
the end of the array rather than accidentally reading another chunk).

We can fix this by checking the size up front (like we do for other
fixed-size chunks, like CDAT).

Signed-off-by: Jeff King <peff@peff.net>
---
 commit-graph.c          | 14 ++++++++++++--
 t/t5318-commit-graph.sh |  8 ++++++++
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/commit-graph.c b/commit-graph.c
index 4377b547c8..ca26870d1b 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -350,6 +350,16 @@ static int graph_read_commit_data(const unsigned char *chunk_start,
 	return 0;
 }
 
+static int graph_read_generation_data(const unsigned char *chunk_start,
+				      size_t chunk_size, void *data)
+{
+	struct commit_graph *g = data;
+	if (chunk_size != g->num_commits * sizeof(uint32_t))
+		return error("commit-graph generations chunk is wrong size");
+	g->chunk_generation_data = chunk_start;
+	return 0;
+}
+
 static int graph_read_bloom_data(const unsigned char *chunk_start,
 				  size_t chunk_size, void *data)
 {
@@ -439,8 +449,8 @@ struct commit_graph *parse_commit_graph(struct repo_settings *s,
 		   &graph->chunk_base_graphs_size);
 
 	if (s->commit_graph_generation_version >= 2) {
-		pair_chunk_unsafe(cf, GRAPH_CHUNKID_GENERATION_DATA,
-			&graph->chunk_generation_data);
+		read_chunk(cf, GRAPH_CHUNKID_GENERATION_DATA,
+			   graph_read_generation_data, graph);
 		pair_chunk_unsafe(cf, GRAPH_CHUNKID_GENERATION_DATA_OVERFLOW,
 			&graph->chunk_generation_data_overflow);
 
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index 05bafcfe5f..6505ff595a 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -887,4 +887,12 @@ test_expect_success 'reader notices out-of-bounds extra edge' '
 	test_cmp expect.err err
 '
 
+test_expect_success 'reader notices too-small generations chunk' '
+	check_corrupt_chunk GDA2 clear 00000000 &&
+	cat >expect.err <<-\EOF &&
+	error: commit-graph generations chunk is wrong size
+	EOF
+	test_cmp expect.err err
+'
+
 test_done
-- 
2.42.0.884.g35e1fe1a6a


^ permalink raw reply related

* [PATCH 14/20] commit-graph: bounds-check base graphs chunk
From: Jeff King @ 2023-10-09 21:05 UTC (permalink / raw)
  To: git; +Cc: Taylor Blau
In-Reply-To: <20231009205544.GA3281950@coredump.intra.peff.net>

When we are loading a commit-graph chain, we check that each slice of the
chain points to the appropriate set of base graphs via its BASE chunk.
But since we don't record the size of the chunk, we may access
out-of-bounds memory if the file is corrupted.

Since we know the number of entries we expect to find (based on the
position within the commit-graph-chain file), we can just check the size
up front.

In theory this would also let us drop the st_mult() call a few lines
later when we actually access the memory, since we know that the
computed offset will fit in a size_t. But because the operands
"g->hash_len" and "n" have types "unsigned char" and "int", we'd have to
cast to size_t first. Leaving the st_mult() does that cast, and makes it
more obvious that we don't have an overflow problem.

Note that the test does not actually segfault before this patch, since
it just reads garbage from the chunk after BASE (and indeed, it even
rejects the file because that garbage does not have the expected hash
value). You could construct a file with BASE at the end that did
segfault, but corrupting the existing one is easy, and we can check
stderr for the expected message.

Signed-off-by: Jeff King <peff@peff.net>
---
 commit-graph.c                |  8 +++++++-
 commit-graph.h                |  1 +
 t/t5324-split-commit-graph.sh | 14 ++++++++++++++
 3 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/commit-graph.c b/commit-graph.c
index e4860841fc..4377b547c8 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -435,7 +435,8 @@ struct commit_graph *parse_commit_graph(struct repo_settings *s,
 	read_chunk(cf, GRAPH_CHUNKID_DATA, graph_read_commit_data, graph);
 	pair_chunk(cf, GRAPH_CHUNKID_EXTRAEDGES, &graph->chunk_extra_edges,
 		   &graph->chunk_extra_edges_size);
-	pair_chunk_unsafe(cf, GRAPH_CHUNKID_BASE, &graph->chunk_base_graphs);
+	pair_chunk(cf, GRAPH_CHUNKID_BASE, &graph->chunk_base_graphs,
+		   &graph->chunk_base_graphs_size);
 
 	if (s->commit_graph_generation_version >= 2) {
 		pair_chunk_unsafe(cf, GRAPH_CHUNKID_GENERATION_DATA,
@@ -546,6 +547,11 @@ static int add_graph_to_chain(struct commit_graph *g,
 		return 0;
 	}
 
+	if (g->chunk_base_graphs_size / g->hash_len < n) {
+		warning(_("commit-graph base graphs chunk is too small"));
+		return 0;
+	}
+
 	while (n) {
 		n--;
 
diff --git a/commit-graph.h b/commit-graph.h
index 1f8a9de4fb..e4248ea05d 100644
--- a/commit-graph.h
+++ b/commit-graph.h
@@ -97,6 +97,7 @@ struct commit_graph {
 	const unsigned char *chunk_extra_edges;
 	size_t chunk_extra_edges_size;
 	const unsigned char *chunk_base_graphs;
+	size_t chunk_base_graphs_size;
 	const unsigned char *chunk_bloom_indexes;
 	const unsigned char *chunk_bloom_data;
 
diff --git a/t/t5324-split-commit-graph.sh b/t/t5324-split-commit-graph.sh
index 55b5765e2d..3c8482d073 100755
--- a/t/t5324-split-commit-graph.sh
+++ b/t/t5324-split-commit-graph.sh
@@ -2,6 +2,7 @@
 
 test_description='split commit graph'
 . ./test-lib.sh
+. "$TEST_DIRECTORY"/lib-chunk.sh
 
 GIT_TEST_COMMIT_GRAPH=0
 GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS=0
@@ -398,6 +399,19 @@ test_expect_success 'verify across alternates' '
 	)
 '
 
+test_expect_success 'reader bounds-checks base-graph chunk' '
+	git clone --no-hardlinks . corrupt-base-chunk &&
+	(
+		cd corrupt-base-chunk &&
+		tip_file=$graphdir/graph-$(tail -n 1 $graphdir/commit-graph-chain).graph &&
+		corrupt_chunk_file "$tip_file" BASE clear 01020304 &&
+		git -c core.commitGraph=false log >expect.out &&
+		git -c core.commitGraph=true log >out 2>err &&
+		test_cmp expect.out out &&
+		grep "commit-graph base graphs chunk is too small" err
+	)
+'
+
 test_expect_success 'add octopus merge' '
 	git reset --hard commits/10 &&
 	git merge commits/3 commits/4 &&
-- 
2.42.0.884.g35e1fe1a6a


^ permalink raw reply related

* [PATCH 13/20] commit-graph: detect out-of-bounds extra-edges pointers
From: Jeff King @ 2023-10-09 21:05 UTC (permalink / raw)
  To: git; +Cc: Taylor Blau
In-Reply-To: <20231009205544.GA3281950@coredump.intra.peff.net>

If an entry in a commit-graph file has more than 2 parents, the
fixed-size parent fields instead point to an offset within an "extra
edges" chunk. We blindly follow these, assuming that the chunk is
present and sufficiently large; this can lead to an out-of-bounds read
for a corrupt or malicious file.

We can fix this by recording the size of the chunk and adding a
bounds-check in fill_commit_in_graph(). There are a few tricky bits:

  1. We'll switch from working with a pointer to an offset. This makes
     some corner cases just fall out naturally:

      a. If we did not find an EDGE chunk at all, our size will
         correctly be zero (so everything is "out of bounds").

      b. Comparing "size / 4" lets us make sure we have at least 4 bytes
         to read, and we never compute a pointer more than one element
         past the end of the array (computing a larger pointer is
         probably OK in practice, but is technically undefined
         behavior).

      c. The current code casts to "uint32_t *". Replacing it with an
         offset avoids any comparison between different types of pointer
         (since the chunk is stored as "unsigned char *").

  2. This is the first case in which fill_commit_in_graph() may return
     anything but success. We need to make sure to roll back the
     "parsed" flag (and any parents we might have added before running
     out of buffer) so that the caller can cleanly fall back to
     loading the commit object itself.

     It's a little non-trivial to do this, and we might benefit from
     factoring it out. But we can wait on that until we actually see a
     second case where we return an error.

As a bonus, this lets us drop the st_mult() call. Since we've already
done a bounds check, we know there won't be any integer overflow (it
would imply our buffer is larger than a size_t can hold).

The included test does not actually segfault before this patch (though
you could construct a case where it does). Instead, it reads garbage
from the next chunk which results in it complaining about a bogus parent
id. This is sufficient for our needs, though (we care that the fallback
succeeds, and that stderr mentions the out-of-bounds read).

Signed-off-by: Jeff King <peff@peff.net>
---
 commit-graph.c          | 20 ++++++++++++++------
 commit-graph.h          |  1 +
 t/t5318-commit-graph.sh |  8 ++++++++
 3 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/commit-graph.c b/commit-graph.c
index 9b80bbd75b..e4860841fc 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -433,7 +433,8 @@ struct commit_graph *parse_commit_graph(struct repo_settings *s,
 	read_chunk(cf, GRAPH_CHUNKID_OIDFANOUT, graph_read_oid_fanout, graph);
 	read_chunk(cf, GRAPH_CHUNKID_OIDLOOKUP, graph_read_oid_lookup, graph);
 	read_chunk(cf, GRAPH_CHUNKID_DATA, graph_read_commit_data, graph);
-	pair_chunk_unsafe(cf, GRAPH_CHUNKID_EXTRAEDGES, &graph->chunk_extra_edges);
+	pair_chunk(cf, GRAPH_CHUNKID_EXTRAEDGES, &graph->chunk_extra_edges,
+		   &graph->chunk_extra_edges_size);
 	pair_chunk_unsafe(cf, GRAPH_CHUNKID_BASE, &graph->chunk_base_graphs);
 
 	if (s->commit_graph_generation_version >= 2) {
@@ -899,7 +900,7 @@ static int fill_commit_in_graph(struct repository *r,
 				struct commit_graph *g, uint32_t pos)
 {
 	uint32_t edge_value;
-	uint32_t *parent_data_ptr;
+	uint32_t parent_data_pos;
 	struct commit_list **pptr;
 	const unsigned char *commit_data;
 	uint32_t lex_index;
@@ -931,14 +932,21 @@ static int fill_commit_in_graph(struct repository *r,
 		return 1;
 	}
 
-	parent_data_ptr = (uint32_t*)(g->chunk_extra_edges +
-			  st_mult(4, edge_value & GRAPH_EDGE_LAST_MASK));
+	parent_data_pos = edge_value & GRAPH_EDGE_LAST_MASK;
 	do {
-		edge_value = get_be32(parent_data_ptr);
+		if (g->chunk_extra_edges_size / sizeof(uint32_t) <= parent_data_pos) {
+			error("commit-graph extra-edges pointer out of bounds");
+			free_commit_list(item->parents);
+			item->parents = NULL;
+			item->object.parsed = 0;
+			return 0;
+		}
+		edge_value = get_be32(g->chunk_extra_edges +
+				      sizeof(uint32_t) * parent_data_pos);
 		pptr = insert_parent_or_die(r, g,
 					    edge_value & GRAPH_EDGE_LAST_MASK,
 					    pptr);
-		parent_data_ptr++;
+		parent_data_pos++;
 	} while (!(edge_value & GRAPH_LAST_EDGE));
 
 	return 1;
diff --git a/commit-graph.h b/commit-graph.h
index 20ada7e891..1f8a9de4fb 100644
--- a/commit-graph.h
+++ b/commit-graph.h
@@ -95,6 +95,7 @@ struct commit_graph {
 	const unsigned char *chunk_generation_data;
 	const unsigned char *chunk_generation_data_overflow;
 	const unsigned char *chunk_extra_edges;
+	size_t chunk_extra_edges_size;
 	const unsigned char *chunk_base_graphs;
 	const unsigned char *chunk_bloom_indexes;
 	const unsigned char *chunk_bloom_data;
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index 492460157d..05bafcfe5f 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -879,4 +879,12 @@ test_expect_success 'reader notices too-small commit data chunk' '
 	test_cmp expect.err err
 '
 
+test_expect_success 'reader notices out-of-bounds extra edge' '
+	check_corrupt_chunk EDGE clear &&
+	cat >expect.err <<-\EOF &&
+	error: commit-graph extra-edges pointer out of bounds
+	EOF
+	test_cmp expect.err err
+'
+
 test_done
-- 
2.42.0.884.g35e1fe1a6a


^ permalink raw reply related

* [PATCH 12/20] commit-graph: check size of commit data chunk
From: Jeff King @ 2023-10-09 21:05 UTC (permalink / raw)
  To: git; +Cc: Taylor Blau
In-Reply-To: <20231009205544.GA3281950@coredump.intra.peff.net>

We expect a commit-graph file to have a fixed-size data record for each
commit in the file (and we know the number of commits to expct from the
size of the lookup table). If we encounter a file where this is too
small, we'll look past the end of the chunk (and possibly even off the
mapped memory).

We can fix this by checking the size up front when we record the
pointer.

The included test doesn't segfault, since it ends up reading bytes
from another chunk. But it produces nonsense results, since the values
it reads are garbage. Our test notices this by comparing the output to a
non-corrupted run of the same command (and of course we also check that
the expected error is printed to stderr).

Signed-off-by: Jeff King <peff@peff.net>
---
 commit-graph.c          | 12 +++++++++++-
 t/t5318-commit-graph.sh |  9 +++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/commit-graph.c b/commit-graph.c
index 472332f603..9b80bbd75b 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -340,6 +340,16 @@ static int graph_read_oid_lookup(const unsigned char *chunk_start,
 	return 0;
 }
 
+static int graph_read_commit_data(const unsigned char *chunk_start,
+				  size_t chunk_size, void *data)
+{
+	struct commit_graph *g = data;
+	if (chunk_size != g->num_commits * GRAPH_DATA_WIDTH)
+		return error("commit-graph commit data chunk is wrong size");
+	g->chunk_commit_data = chunk_start;
+	return 0;
+}
+
 static int graph_read_bloom_data(const unsigned char *chunk_start,
 				  size_t chunk_size, void *data)
 {
@@ -422,7 +432,7 @@ struct commit_graph *parse_commit_graph(struct repo_settings *s,
 
 	read_chunk(cf, GRAPH_CHUNKID_OIDFANOUT, graph_read_oid_fanout, graph);
 	read_chunk(cf, GRAPH_CHUNKID_OIDLOOKUP, graph_read_oid_lookup, graph);
-	pair_chunk_unsafe(cf, GRAPH_CHUNKID_DATA, &graph->chunk_commit_data);
+	read_chunk(cf, GRAPH_CHUNKID_DATA, graph_read_commit_data, graph);
 	pair_chunk_unsafe(cf, GRAPH_CHUNKID_EXTRAEDGES, &graph->chunk_extra_edges);
 	pair_chunk_unsafe(cf, GRAPH_CHUNKID_BASE, &graph->chunk_base_graphs);
 
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index d10658de9e..492460157d 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -870,4 +870,13 @@ test_expect_success 'reader notices out-of-bounds fanout' '
 	test_cmp expect.err err
 '
 
+test_expect_success 'reader notices too-small commit data chunk' '
+	check_corrupt_chunk CDAT clear 00000000 &&
+	cat >expect.err <<-\EOF &&
+	error: commit-graph commit data chunk is wrong size
+	error: commit-graph is missing the Commit Data chunk
+	EOF
+	test_cmp expect.err err
+'
+
 test_done
-- 
2.42.0.884.g35e1fe1a6a


^ permalink raw reply related

* [PATCH 11/20] midx: check size of revindex chunk
From: Jeff King @ 2023-10-09 21:05 UTC (permalink / raw)
  To: git; +Cc: Taylor Blau
In-Reply-To: <20231009205544.GA3281950@coredump.intra.peff.net>

When we load a revindex from disk, we check the size of the file
compared to the number of objects we expect it to have. But when we use
a RIDX chunk stored directly in the midx, we just access the memory
directly. This can lead to out-of-bounds memory access for a corrupted
or malicious multi-pack-index file.

We can catch this by recording the RIDX chunk size, and then checking it
against the expected size when we "load" the revindex. Note that this
check is much simpler than the one that load_revindex_from_disk() does,
because we just have the data array with no header (so we do not need
to account for the header size, and nor do we need to bother validating
the header values).

The test confirms both that we catch this case, and that we continue the
process (the revindex is required to use the midx bitmaps, but we
fallback to a non-bitmap traversal).

Signed-off-by: Jeff King <peff@peff.net>
---
 midx.c                      |  3 ++-
 midx.h                      |  1 +
 pack-revindex.c             | 13 ++++++++++++-
 t/t5319-multi-pack-index.sh | 17 +++++++++++++++++
 4 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/midx.c b/midx.c
index 3e768d0df0..2f3863c936 100644
--- a/midx.c
+++ b/midx.c
@@ -184,7 +184,8 @@ struct multi_pack_index *load_multi_pack_index(const char *object_dir, int local
 		   &m->chunk_large_offsets_len);
 
 	if (git_env_bool("GIT_TEST_MIDX_READ_RIDX", 1))
-		pair_chunk_unsafe(cf, MIDX_CHUNKID_REVINDEX, &m->chunk_revindex);
+		pair_chunk(cf, MIDX_CHUNKID_REVINDEX, &m->chunk_revindex,
+			   &m->chunk_revindex_len);
 
 	CALLOC_ARRAY(m->pack_names, m->num_packs);
 	CALLOC_ARRAY(m->packs, m->num_packs);
diff --git a/midx.h b/midx.h
index e8e8884d16..a5d98919c8 100644
--- a/midx.h
+++ b/midx.h
@@ -39,6 +39,7 @@ struct multi_pack_index {
 	const unsigned char *chunk_large_offsets;
 	size_t chunk_large_offsets_len;
 	const unsigned char *chunk_revindex;
+	size_t chunk_revindex_len;
 
 	const char **pack_names;
 	struct packed_git **packs;
diff --git a/pack-revindex.c b/pack-revindex.c
index 7fffcad912..6d8fd3645a 100644
--- a/pack-revindex.c
+++ b/pack-revindex.c
@@ -343,6 +343,17 @@ int verify_pack_revindex(struct packed_git *p)
 	return res;
 }
 
+static int can_use_midx_ridx_chunk(struct multi_pack_index *m)
+{
+	if (!m->chunk_revindex)
+		return 0;
+	if (m->chunk_revindex_len != st_mult(sizeof(uint32_t), m->num_objects)) {
+		error(_("multi-pack-index reverse-index chunk is the wrong size"));
+		return 0;
+	}
+	return 1;
+}
+
 int load_midx_revindex(struct multi_pack_index *m)
 {
 	struct strbuf revindex_name = STRBUF_INIT;
@@ -351,7 +362,7 @@ int load_midx_revindex(struct multi_pack_index *m)
 	if (m->revindex_data)
 		return 0;
 
-	if (m->chunk_revindex) {
+	if (can_use_midx_ridx_chunk(m)) {
 		/*
 		 * If the MIDX `m` has a `RIDX` chunk, then use its contents for
 		 * the reverse index instead of trying to load a separate `.rev`
diff --git a/t/t5319-multi-pack-index.sh b/t/t5319-multi-pack-index.sh
index 16050f39d9..2a11dd1af6 100755
--- a/t/t5319-multi-pack-index.sh
+++ b/t/t5319-multi-pack-index.sh
@@ -1138,4 +1138,21 @@ test_expect_success 'reader bounds-checks large offset table' '
 	)
 '
 
+test_expect_success 'reader notices too-small revindex chunk' '
+	# We only get a revindex with bitmaps (and likewise only
+	# load it when they are asked for).
+	test_config repack.writeBitmaps true &&
+	corrupt_chunk RIDX clear 00000000 &&
+	git -c core.multipackIndex=false rev-list \
+		--all --use-bitmap-index >expect.out &&
+	git -c core.multipackIndex=true rev-list \
+		--all --use-bitmap-index >out 2>err &&
+	test_cmp expect.out out &&
+	cat >expect.err <<-\EOF &&
+	error: multi-pack-index reverse-index chunk is the wrong size
+	warning: multi-pack bitmap is missing required reverse index
+	EOF
+	test_cmp expect.err err
+'
+
 test_done
-- 
2.42.0.884.g35e1fe1a6a


^ permalink raw reply related

* [PATCH 10/20] midx: bounds-check large offset chunk
From: Jeff King @ 2023-10-09 21:05 UTC (permalink / raw)
  To: git; +Cc: Taylor Blau
In-Reply-To: <20231009205544.GA3281950@coredump.intra.peff.net>

When we see a large offset bit in the regular midx offset table, we
use the entry as an index into a separate large offset table (just like
a pack idx does). But we don't bounds-check the access to that large
offset table (nor even record its size when we parse the chunk!).

The equivalent code for a regular pack idx is in check_pack_index_ptr().
But things are a bit simpler here because of the chunked format: we can
just check our array index directly.

As a bonus, we can get rid of the st_mult() here. If our array
bounds-check is successful, then we know that the result will fit in a
size_t (and the bounds check uses a division to avoid overflow
entirely).

Signed-off-by: Jeff King <peff@peff.net>
---
 midx.c                      |  8 +++++---
 midx.h                      |  1 +
 t/t5319-multi-pack-index.sh | 20 ++++++++++++++++++++
 3 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/midx.c b/midx.c
index 7b1b45f381..3e768d0df0 100644
--- a/midx.c
+++ b/midx.c
@@ -180,7 +180,8 @@ struct multi_pack_index *load_multi_pack_index(const char *object_dir, int local
 	if (read_chunk(cf, MIDX_CHUNKID_OBJECTOFFSETS, midx_read_object_offsets, m))
 		die(_("multi-pack-index required object offsets chunk missing or corrupted"));
 
-	pair_chunk_unsafe(cf, MIDX_CHUNKID_LARGEOFFSETS, &m->chunk_large_offsets);
+	pair_chunk(cf, MIDX_CHUNKID_LARGEOFFSETS, &m->chunk_large_offsets,
+		   &m->chunk_large_offsets_len);
 
 	if (git_env_bool("GIT_TEST_MIDX_READ_RIDX", 1))
 		pair_chunk_unsafe(cf, MIDX_CHUNKID_REVINDEX, &m->chunk_revindex);
@@ -303,8 +304,9 @@ off_t nth_midxed_offset(struct multi_pack_index *m, uint32_t pos)
 			die(_("multi-pack-index stores a 64-bit offset, but off_t is too small"));
 
 		offset32 ^= MIDX_LARGE_OFFSET_NEEDED;
-		return get_be64(m->chunk_large_offsets +
-				st_mult(sizeof(uint64_t), offset32));
+		if (offset32 >= m->chunk_large_offsets_len / sizeof(uint64_t))
+			die(_("multi-pack-index large offset out of bounds"));
+		return get_be64(m->chunk_large_offsets + sizeof(uint64_t) * offset32);
 	}
 
 	return offset32;
diff --git a/midx.h b/midx.h
index 5b2a7da043..e8e8884d16 100644
--- a/midx.h
+++ b/midx.h
@@ -37,6 +37,7 @@ struct multi_pack_index {
 	const unsigned char *chunk_oid_lookup;
 	const unsigned char *chunk_object_offsets;
 	const unsigned char *chunk_large_offsets;
+	size_t chunk_large_offsets_len;
 	const unsigned char *chunk_revindex;
 
 	const char **pack_names;
diff --git a/t/t5319-multi-pack-index.sh b/t/t5319-multi-pack-index.sh
index 30687d5452..16050f39d9 100755
--- a/t/t5319-multi-pack-index.sh
+++ b/t/t5319-multi-pack-index.sh
@@ -1118,4 +1118,24 @@ test_expect_success 'reader notices too-small object offset chunk' '
 	test_cmp expect err
 '
 
+test_expect_success 'reader bounds-checks large offset table' '
+	# re-use the objects64 dir here to cheaply get access to a midx
+	# with large offsets.
+	git init repo &&
+	test_when_finished "rm -rf repo" &&
+	(
+		cd repo &&
+		(cd ../objects64 && pwd) >.git/objects/info/alternates &&
+		git multi-pack-index --object-dir=../objects64 write &&
+		midx=../objects64/pack/multi-pack-index &&
+		corrupt_chunk_file $midx LOFF clear &&
+		test_must_fail git cat-file \
+			--batch-check --batch-all-objects 2>err &&
+		cat >expect <<-\EOF &&
+		fatal: multi-pack-index large offset out of bounds
+		EOF
+		test_cmp expect err
+	)
+'
+
 test_done
-- 
2.42.0.884.g35e1fe1a6a


^ permalink raw reply related

* [PATCH 09/20] midx: check size of object offset chunk
From: Jeff King @ 2023-10-09 21:05 UTC (permalink / raw)
  To: git; +Cc: Taylor Blau
In-Reply-To: <20231009205544.GA3281950@coredump.intra.peff.net>

The object offset chunk has one fixed-size entry for each object in the
midx. But since we don't check its size, we may access out-of-bounds
memory if we see a corrupt or malicious midx file.

Sine the entries are fixed-size, the total length can be known up-front,
and we can just check it while parsing the chunk (this is similar to
what we do when opening pack idx files, which contain a similar offset
table).

Signed-off-by: Jeff King <peff@peff.net>
---
 midx.c                      | 15 ++++++++++++++-
 t/t5319-multi-pack-index.sh | 10 ++++++++++
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/midx.c b/midx.c
index 98f84fbe61..7b1b45f381 100644
--- a/midx.c
+++ b/midx.c
@@ -88,6 +88,19 @@ static int midx_read_oid_lookup(const unsigned char *chunk_start,
 	return 0;
 }
 
+static int midx_read_object_offsets(const unsigned char *chunk_start,
+				    size_t chunk_size, void *data)
+{
+	struct multi_pack_index *m = data;
+	m->chunk_object_offsets = chunk_start;
+
+	if (chunk_size != st_mult(m->num_objects, MIDX_CHUNK_OFFSET_WIDTH)) {
+		error(_("multi-pack-index object offset chunk is the wrong size"));
+		return 1;
+	}
+	return 0;
+}
+
 struct multi_pack_index *load_multi_pack_index(const char *object_dir, int local)
 {
 	struct multi_pack_index *m = NULL;
@@ -164,7 +177,7 @@ struct multi_pack_index *load_multi_pack_index(const char *object_dir, int local
 		die(_("multi-pack-index required OID fanout chunk missing or corrupted"));
 	if (read_chunk(cf, MIDX_CHUNKID_OIDLOOKUP, midx_read_oid_lookup, m))
 		die(_("multi-pack-index required OID lookup chunk missing or corrupted"));
-	if (pair_chunk_unsafe(cf, MIDX_CHUNKID_OBJECTOFFSETS, &m->chunk_object_offsets))
+	if (read_chunk(cf, MIDX_CHUNKID_OBJECTOFFSETS, midx_read_object_offsets, m))
 		die(_("multi-pack-index required object offsets chunk missing or corrupted"));
 
 	pair_chunk_unsafe(cf, MIDX_CHUNKID_LARGEOFFSETS, &m->chunk_large_offsets);
diff --git a/t/t5319-multi-pack-index.sh b/t/t5319-multi-pack-index.sh
index 34f5944142..30687d5452 100755
--- a/t/t5319-multi-pack-index.sh
+++ b/t/t5319-multi-pack-index.sh
@@ -1108,4 +1108,14 @@ test_expect_success 'reader handles unaligned chunks' '
 	test_cmp expect.err err
 '
 
+test_expect_success 'reader notices too-small object offset chunk' '
+	corrupt_chunk OOFF clear 00000000 &&
+	test_must_fail git log 2>err &&
+	cat >expect <<-\EOF &&
+	error: multi-pack-index object offset chunk is the wrong size
+	fatal: multi-pack-index required object offsets chunk missing or corrupted
+	EOF
+	test_cmp expect err
+'
+
 test_done
-- 
2.42.0.884.g35e1fe1a6a


^ permalink raw reply related

* [PATCH 08/20] midx: enforce chunk alignment on reading
From: Jeff King @ 2023-10-09 21:05 UTC (permalink / raw)
  To: git; +Cc: Taylor Blau
In-Reply-To: <20231009205544.GA3281950@coredump.intra.peff.net>

The midx reader assumes chunks are aligned to a 4-byte boundary: we
treat the fanout chunk as an array of uint32_t, indexing it to feed the
results to ntohl(). Without aligning the chunks, we may violate the
CPU's alignment constraints. Though many platforms allow this, some do
not. And certanily UBSan will complain, since it is undefined behavior.

Even though most chunks are naturally 4-byte-aligned (because they are
storing uint32_t or larger types), PNAM is not. It stores NUL-terminated
pack names, so you can have a valid chunk with any length. The writing
side handles this by 4-byte-aligning the chunk, introducing a few extra
NULs as necessary. But since we don't check this on the reading side, we
may end up with a misaligned fanout and trigger the undefined behavior.

We have two options here:

  1. Swap out ntohl(fanout[i]) for get_be32(fanout+i) everywhere. The
     latter handles alignment itself. It's possible that it's slightly
     slower (though in practice I'm not sure how true that is,
     especially for these code paths which then go on to do a binary
     search).

  2. Enforce the alignment when reading the chunks. This is easy to do,
     since the table-of-contents reader can check it in one spot.

I went with the second option here, just because it places less burden
on maintenance going forward (it is OK to continue using ntohl), and we
know it can't have any performance impact on the actual reads.

The commit-graph code uses the same chunk API. It's usually also 4-byte
aligned, but some chunks are not (like Bloom filter BDAT chunks). So
we'll pass "1" here to allow any alignment. It doesn't suffer from the
same problem as midx with its fanout because the fanout chunk is always
the first (and the rest of the format dictates that the first chunk will
start aligned).

The new test shows the effect on a midx with a misaligned PNAM chunk.
Note that the midx-reading code treats chunk-toc errors as soft, falling
back to the non-midx path rather than calling die(), as we do for other
parsing errors. Arguably we should make all of these behave the same,
but that's out of scope for this patch. For now the test just expects
the fallback behavior.

Signed-off-by: Jeff King <peff@peff.net>
---
 chunk-format.c              |  8 +++++++-
 chunk-format.h              |  3 ++-
 commit-graph.c              |  2 +-
 midx.c                      |  3 ++-
 t/t5319-multi-pack-index.sh | 14 ++++++++++++++
 5 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/chunk-format.c b/chunk-format.c
index 8d910e23f6..09ef86afa6 100644
--- a/chunk-format.c
+++ b/chunk-format.c
@@ -102,7 +102,8 @@ int read_table_of_contents(struct chunkfile *cf,
 			   const unsigned char *mfile,
 			   size_t mfile_size,
 			   uint64_t toc_offset,
-			   int toc_length)
+			   int toc_length,
+			   unsigned expected_alignment)
 {
 	int i;
 	uint32_t chunk_id;
@@ -120,6 +121,11 @@ int read_table_of_contents(struct chunkfile *cf,
 			error(_("terminating chunk id appears earlier than expected"));
 			return 1;
 		}
+		if (chunk_offset % expected_alignment != 0) {
+			error(_("chunk id %"PRIx32" not %d-byte aligned"),
+			      chunk_id, expected_alignment);
+			return 1;
+		}
 
 		table_of_contents += CHUNK_TOC_ENTRY_SIZE;
 		next_chunk_offset = get_be64(table_of_contents + 4);
diff --git a/chunk-format.h b/chunk-format.h
index 8dce7967f4..d608b8135c 100644
--- a/chunk-format.h
+++ b/chunk-format.h
@@ -36,7 +36,8 @@ int read_table_of_contents(struct chunkfile *cf,
 			   const unsigned char *mfile,
 			   size_t mfile_size,
 			   uint64_t toc_offset,
-			   int toc_length);
+			   int toc_length,
+			   unsigned expected_alignment);
 
 #define CHUNK_NOT_FOUND (-2)
 
diff --git a/commit-graph.c b/commit-graph.c
index b217e19194..472332f603 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -417,7 +417,7 @@ struct commit_graph *parse_commit_graph(struct repo_settings *s,
 	cf = init_chunkfile(NULL);
 
 	if (read_table_of_contents(cf, graph->data, graph_size,
-				   GRAPH_HEADER_SIZE, graph->num_chunks))
+				   GRAPH_HEADER_SIZE, graph->num_chunks, 1))
 		goto free_and_return;
 
 	read_chunk(cf, GRAPH_CHUNKID_OIDFANOUT, graph_read_oid_fanout, graph);
diff --git a/midx.c b/midx.c
index ec585cae1b..98f84fbe61 100644
--- a/midx.c
+++ b/midx.c
@@ -154,7 +154,8 @@ struct multi_pack_index *load_multi_pack_index(const char *object_dir, int local
 	cf = init_chunkfile(NULL);
 
 	if (read_table_of_contents(cf, m->data, midx_size,
-				   MIDX_HEADER_SIZE, m->num_chunks))
+				   MIDX_HEADER_SIZE, m->num_chunks,
+				   MIDX_CHUNK_ALIGNMENT))
 		goto cleanup_fail;
 
 	if (pair_chunk(cf, MIDX_CHUNKID_PACKNAMES, &m->chunk_pack_names, &m->chunk_pack_names_len))
diff --git a/t/t5319-multi-pack-index.sh b/t/t5319-multi-pack-index.sh
index 0a0ccec8a4..34f5944142 100755
--- a/t/t5319-multi-pack-index.sh
+++ b/t/t5319-multi-pack-index.sh
@@ -1094,4 +1094,18 @@ test_expect_success 'reader notices too-small pack names chunk' '
 	test_cmp expect err
 '
 
+test_expect_success 'reader handles unaligned chunks' '
+	# A 9-byte PNAM means all of the subsequent chunks
+	# will no longer be 4-byte aligned, but it is still
+	# a valid one-pack chunk on its own (it is "foo.pack\0").
+	corrupt_chunk PNAM clear 666f6f2e7061636b00 &&
+	git -c core.multipackindex=false log >expect.out &&
+	git -c core.multipackindex=true log >out 2>err &&
+	test_cmp expect.out out &&
+	cat >expect.err <<-\EOF &&
+	error: chunk id 4f494446 not 4-byte aligned
+	EOF
+	test_cmp expect.err err
+'
+
 test_done
-- 
2.42.0.884.g35e1fe1a6a


^ permalink raw reply related

* [PATCH 07/20] midx: check size of pack names chunk
From: Jeff King @ 2023-10-09 21:05 UTC (permalink / raw)
  To: git; +Cc: Taylor Blau
In-Reply-To: <20231009205544.GA3281950@coredump.intra.peff.net>

We parse the pack-name chunk as a series of NUL-terminated strings. But
since we don't look at the chunk size, there's nothing to guarantee that
we don't parse off the end of the chunk (or even off the end of the
mapped file).

We can record the length, and then as we parse make sure that we never
walk past it.

The new test exercises the case, though note that it does not actually
segfault before this patch. It hits a NUL byte somewhere in one of the
other chunks, and comes up with a garbage pack name. You could construct
one that reads out-of-bounds (e.g., a PNAM chunk at the end of file),
but this case is simple and sufficient to check that we detect the
problem.

Signed-off-by: Jeff King <peff@peff.net>
---
 midx.c                      | 11 +++++++++--
 midx.h                      |  1 +
 t/t5319-multi-pack-index.sh | 11 +++++++++++
 3 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/midx.c b/midx.c
index 62e4c03e79..ec585cae1b 100644
--- a/midx.c
+++ b/midx.c
@@ -157,7 +157,7 @@ struct multi_pack_index *load_multi_pack_index(const char *object_dir, int local
 				   MIDX_HEADER_SIZE, m->num_chunks))
 		goto cleanup_fail;
 
-	if (pair_chunk_unsafe(cf, MIDX_CHUNKID_PACKNAMES, &m->chunk_pack_names))
+	if (pair_chunk(cf, MIDX_CHUNKID_PACKNAMES, &m->chunk_pack_names, &m->chunk_pack_names_len))
 		die(_("multi-pack-index required pack-name chunk missing or corrupted"));
 	if (read_chunk(cf, MIDX_CHUNKID_OIDFANOUT, midx_read_oid_fanout, m))
 		die(_("multi-pack-index required OID fanout chunk missing or corrupted"));
@@ -176,9 +176,16 @@ struct multi_pack_index *load_multi_pack_index(const char *object_dir, int local
 
 	cur_pack_name = (const char *)m->chunk_pack_names;
 	for (i = 0; i < m->num_packs; i++) {
+		const char *end;
+		size_t avail = m->chunk_pack_names_len -
+				(cur_pack_name - (const char *)m->chunk_pack_names);
+
 		m->pack_names[i] = cur_pack_name;
 
-		cur_pack_name += strlen(cur_pack_name) + 1;
+		end = memchr(cur_pack_name, '\0', avail);
+		if (!end)
+			die(_("multi-pack-index pack-name chunk is too short"));
+		cur_pack_name = end + 1;
 
 		if (i && strcmp(m->pack_names[i], m->pack_names[i - 1]) <= 0)
 			die(_("multi-pack-index pack names out of order: '%s' before '%s'"),
diff --git a/midx.h b/midx.h
index 5578cd7b83..5b2a7da043 100644
--- a/midx.h
+++ b/midx.h
@@ -32,6 +32,7 @@ struct multi_pack_index {
 	int local;
 
 	const unsigned char *chunk_pack_names;
+	size_t chunk_pack_names_len;
 	const uint32_t *chunk_oid_fanout;
 	const unsigned char *chunk_oid_lookup;
 	const unsigned char *chunk_object_offsets;
diff --git a/t/t5319-multi-pack-index.sh b/t/t5319-multi-pack-index.sh
index 2722e495b2..0a0ccec8a4 100755
--- a/t/t5319-multi-pack-index.sh
+++ b/t/t5319-multi-pack-index.sh
@@ -1083,4 +1083,15 @@ test_expect_success 'reader notices too-small oid lookup chunk' '
 	test_cmp expect err
 '
 
+test_expect_success 'reader notices too-small pack names chunk' '
+	# There is no NUL to terminate the name here, so the
+	# chunk is too short.
+	corrupt_chunk PNAM clear 70656666 &&
+	test_must_fail git log 2>err &&
+	cat >expect <<-\EOF &&
+	fatal: multi-pack-index pack-name chunk is too short
+	EOF
+	test_cmp expect err
+'
+
 test_done
-- 
2.42.0.884.g35e1fe1a6a


^ permalink raw reply related

* [PATCH 06/20] commit-graph: check consistency of fanout table
From: Jeff King @ 2023-10-09 21:04 UTC (permalink / raw)
  To: git; +Cc: Taylor Blau
In-Reply-To: <20231009205544.GA3281950@coredump.intra.peff.net>

We use bsearch_hash() to look up items in the oid index of a
commit-graph. It also has a fanout table to reduce the initial range in
which we'll search. But since the fanout comes from the on-disk file, a
corrupted or malicious file can cause us to look outside of the
allocated index memory.

One solution here would be to pass the total table size to
bsearch_hash(), which could then bounds check the values it reads from
the fanout. But there's an inexpensive up-front check we can do, and
it's the same one used by the midx and pack idx code (both of which
likewise have fanout tables and use bsearch_hash(), but are not affected
by this bug):

  1. We can check the value of the final fanout entry against the size
     of the table we got from the index chunk. These must always match,
     since the fanout is just slicing up the index.

       As a side note, the midx and pack idx code compute it the other
       way around: they use the final fanout value as the object count, and
       check the index size against it. Either is valid; if they
       disagree we cannot know which is wrong (a corrupted fanout value,
       or a too-small table of oids).

  2. We can quickly scan the fanout table to make sure it is
     monotonically increasing. If it is, then we know that every value
     is less than or equal to the final value, and therefore less than
     or equal to the table size.

     It would also be sufficient to just check that each fanout value is
     smaller than the final one, but the midx and pack idx code both do
     a full monotonicity check. It's the same cost, and it catches some
     other corruptions (though not all; the checks done by "commit-graph
     verify" are more complete but more expensive, and our goal here is
     to be fast and memory-safe).

There are two new tests. One just checks the final fanout value (this is
the mirror image of the "too small oid lookup" case added for the midx
in the previous commit; it's flipped here because commit-graph considers
the oid lookup chunk to be the source of truth).

The other actually creates a fanout with many out-of-bounds entries, and
prior to this patch, it does cause the segfault you'd expect. But note
that the error is not "your fanout entry is out-of-bounds", but rather
"fanout value out of order". That's because we leave the final fanout
value in place (to get past the table size check), making the index
non-monotonic (the second-to-last entry is big, but the last one must
remain small to match the actual table).

We need adjustments to a few existing tests, as well:

  - an earlier test in t5318 corrupts the fanout and runs "commit-graph
    verify". Its message is now changed, since we catch the problem
    earlier (during the load step, rather than the careful validation
    step).

  - in t5324, we test that "commit-graph verify --shallow" does not do
    expensive verification on the base file of the chain. But the
    corruption it uses (munging a byte at offset 1000) happens to be in
    the middle of the fanout table. And now we detect that problem in
    the cheaper checks that are performed for every part of the graph.
    We'll push this back to offset 1500, which is only caught by the
    more expensive checksum validation.

    Likewise, there's a later test in t5324 which munges an offset 100
    bytes into a file (also in the fanout table) that is referenced by
    an alternates file. So we now find that corruption during the load
    step, rather than the verification step. At the very least we need
    to change the error message (like the case above in t5318). But it
    is probably good to make sure we handle all parts of the
    verification even for alternate graph files. So let's likewise
    corrupt byte 1500 and make sure we found the invalid checksum.

Signed-off-by: Jeff King <peff@peff.net>
---
So I actually implemented the bsearch_hash() bounds checks and wrote
tests for midx and idx files before realizing how they handle this. ;)
Which makes sense, because the usual outcome for a corrupted idx file is
for it to say "non-monotonic index", which I have seen lead to user
confusion. Arguably we should have it say something about "hey, your idx
file seems to be corrupted, because...". But that can be its own topic.

 commit-graph.c                | 16 ++++++++++++++++
 t/t5318-commit-graph.sh       | 25 ++++++++++++++++++++++++-
 t/t5324-split-commit-graph.sh |  6 +++---
 3 files changed, 43 insertions(+), 4 deletions(-)

diff --git a/commit-graph.c b/commit-graph.c
index 9b3b01da61..b217e19194 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -277,6 +277,8 @@ struct commit_graph *load_commit_graph_one_fd_st(struct repository *r,
 
 static int verify_commit_graph_lite(struct commit_graph *g)
 {
+	int i;
+
 	/*
 	 * Basic validation shared between parse_commit_graph()
 	 * which'll be called every time the graph is used, and the
@@ -302,6 +304,20 @@ static int verify_commit_graph_lite(struct commit_graph *g)
 		return 1;
 	}
 
+	for (i = 0; i < 255; i++) {
+		uint32_t oid_fanout1 = ntohl(g->chunk_oid_fanout[i]);
+		uint32_t oid_fanout2 = ntohl(g->chunk_oid_fanout[i + 1]);
+
+		if (oid_fanout1 > oid_fanout2) {
+			error("commit-graph fanout values out of order");
+			return 1;
+		}
+	}
+	if (ntohl(g->chunk_oid_fanout[255]) != g->num_commits) {
+		error("commit-graph oid table and fanout disagree on size");
+		return 1;
+	}
+
 	return 0;
 }
 
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index d25bea3ec5..d10658de9e 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -560,7 +560,7 @@ test_expect_success 'detect incorrect fanout' '
 
 test_expect_success 'detect incorrect fanout final value' '
 	corrupt_graph_and_verify $GRAPH_BYTE_FANOUT2 "\01" \
-		"fanout value"
+		"oid table and fanout disagree on size"
 '
 
 test_expect_success 'detect incorrect OID order' '
@@ -847,4 +847,27 @@ test_expect_success 'reader notices too-small oid fanout chunk' '
 	test_cmp expect.err err
 '
 
+test_expect_success 'reader notices fanout/lookup table mismatch' '
+	check_corrupt_chunk OIDF 1020 "FFFFFFFF" &&
+	cat >expect.err <<-\EOF &&
+	error: commit-graph oid table and fanout disagree on size
+	EOF
+	test_cmp expect.err err
+'
+
+test_expect_success 'reader notices out-of-bounds fanout' '
+	# Rather than try to corrupt a specific hash, we will just
+	# wreck them all. But we cannot just set them all to 0xFFFFFFFF or
+	# similar, as they are used for hi/lo starts in a binary search (so if
+	# they are identical, that indicates that the search should abort
+	# immediately). Instead, we will give them high values that differ by
+	# 2^24, ensuring that any that are used would cause an out-of-bounds
+	# read.
+	check_corrupt_chunk OIDF 0 $(printf "%02x000000" $(test_seq 0 254)) &&
+	cat >expect.err <<-\EOF &&
+	error: commit-graph fanout values out of order
+	EOF
+	test_cmp expect.err err
+'
+
 test_done
diff --git a/t/t5324-split-commit-graph.sh b/t/t5324-split-commit-graph.sh
index 06bb897f02..55b5765e2d 100755
--- a/t/t5324-split-commit-graph.sh
+++ b/t/t5324-split-commit-graph.sh
@@ -317,7 +317,7 @@ test_expect_success 'verify --shallow does not check base contents' '
 		cd verify-shallow &&
 		git commit-graph verify &&
 		base_file=$graphdir/graph-$(head -n 1 $graphdir/commit-graph-chain).graph &&
-		corrupt_file "$base_file" 1000 "\01" &&
+		corrupt_file "$base_file" 1500 "\01" &&
 		git commit-graph verify --shallow &&
 		test_must_fail git commit-graph verify 2>test_err &&
 		grep -v "^+" test_err >err &&
@@ -391,10 +391,10 @@ test_expect_success 'verify across alternates' '
 		test_commit extra &&
 		git commit-graph write --reachable --split &&
 		tip_file=$graphdir/graph-$(tail -n 1 $graphdir/commit-graph-chain).graph &&
-		corrupt_file "$tip_file" 100 "\01" &&
+		corrupt_file "$tip_file" 1500 "\01" &&
 		test_must_fail git commit-graph verify --shallow 2>test_err &&
 		grep -v "^+" test_err >err &&
-		test_i18ngrep "commit-graph has incorrect fanout value" err
+		test_i18ngrep "incorrect checksum" err
 	)
 '
 
-- 
2.42.0.884.g35e1fe1a6a


^ permalink raw reply related

* [PATCH 05/20] midx: check size of oid lookup chunk
From: Jeff King @ 2023-10-09 21:02 UTC (permalink / raw)
  To: git; +Cc: Taylor Blau
In-Reply-To: <20231009205544.GA3281950@coredump.intra.peff.net>

When reading an on-disk multi-pack-index, we take the number of objects
in the midx from the final value of the fanout table. But we just
blindly assume that the chunk containing the actual oid entries is the
correct size. This can lead to us reading out-of-bounds memory if the
lookup chunk is too small (or if the fanout is corrupted; when they
don't agree we cannot tell which one is wrong).

Note that we bump the assignment of m->num_objects into the fanout
parser callback, so that it's set when we parse the lookup table
(otherwise we'd have to manually record the lookup table size and check
it later).

Signed-off-by: Jeff King <peff@peff.net>
---
As an aside, the first hunk of the diff annoyingly slides the "return 0"
into the wrong spot. I thought this was our heuristics gone wrong, but I
suspect it's actually the shortest diff because of the one-line change
in midx_read_oid_fanout(), which would otherwise require extra context
to split.

 midx.c                      | 18 +++++++++++++++---
 t/t5319-multi-pack-index.sh | 10 ++++++++++
 2 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/midx.c b/midx.c
index 21d7dd15ef..62e4c03e79 100644
--- a/midx.c
+++ b/midx.c
@@ -71,6 +71,20 @@ static int midx_read_oid_fanout(const unsigned char *chunk_start,
 		error(_("multi-pack-index OID fanout is of the wrong size"));
 		return 1;
 	}
+	m->num_objects = ntohl(m->chunk_oid_fanout[255]);
+	return 0;
+}
+
+static int midx_read_oid_lookup(const unsigned char *chunk_start,
+				size_t chunk_size, void *data)
+{
+	struct multi_pack_index *m = data;
+	m->chunk_oid_lookup = chunk_start;
+
+	if (chunk_size != st_mult(m->hash_len, m->num_objects)) {
+		error(_("multi-pack-index OID lookup chunk is the wrong size"));
+		return 1;
+	}
 	return 0;
 }
 
@@ -147,7 +161,7 @@ struct multi_pack_index *load_multi_pack_index(const char *object_dir, int local
 		die(_("multi-pack-index required pack-name chunk missing or corrupted"));
 	if (read_chunk(cf, MIDX_CHUNKID_OIDFANOUT, midx_read_oid_fanout, m))
 		die(_("multi-pack-index required OID fanout chunk missing or corrupted"));
-	if (pair_chunk_unsafe(cf, MIDX_CHUNKID_OIDLOOKUP, &m->chunk_oid_lookup))
+	if (read_chunk(cf, MIDX_CHUNKID_OIDLOOKUP, midx_read_oid_lookup, m))
 		die(_("multi-pack-index required OID lookup chunk missing or corrupted"));
 	if (pair_chunk_unsafe(cf, MIDX_CHUNKID_OBJECTOFFSETS, &m->chunk_object_offsets))
 		die(_("multi-pack-index required object offsets chunk missing or corrupted"));
@@ -157,8 +171,6 @@ struct multi_pack_index *load_multi_pack_index(const char *object_dir, int local
 	if (git_env_bool("GIT_TEST_MIDX_READ_RIDX", 1))
 		pair_chunk_unsafe(cf, MIDX_CHUNKID_REVINDEX, &m->chunk_revindex);
 
-	m->num_objects = ntohl(m->chunk_oid_fanout[255]);
-
 	CALLOC_ARRAY(m->pack_names, m->num_packs);
 	CALLOC_ARRAY(m->packs, m->num_packs);
 
diff --git a/t/t5319-multi-pack-index.sh b/t/t5319-multi-pack-index.sh
index b8fe85aeba..2722e495b2 100755
--- a/t/t5319-multi-pack-index.sh
+++ b/t/t5319-multi-pack-index.sh
@@ -1073,4 +1073,14 @@ test_expect_success 'reader notices too-small oid fanout chunk' '
 	test_cmp expect err
 '
 
+test_expect_success 'reader notices too-small oid lookup chunk' '
+	corrupt_chunk OIDL clear 00000000 &&
+	test_must_fail git log 2>err &&
+	cat >expect <<-\EOF &&
+	error: multi-pack-index OID lookup chunk is the wrong size
+	fatal: multi-pack-index required OID lookup chunk missing or corrupted
+	EOF
+	test_cmp expect err
+'
+
 test_done
-- 
2.42.0.884.g35e1fe1a6a


^ permalink raw reply related

* [PATCH 04/20] commit-graph: check size of oid fanout chunk
From: Jeff King @ 2023-10-09 20:59 UTC (permalink / raw)
  To: git; +Cc: Taylor Blau
In-Reply-To: <20231009205544.GA3281950@coredump.intra.peff.net>

We load the oid fanout chunk with pair_chunk(), which means we never see
the size of the chunk. We just assume the on-disk file uses the
appropriate size, and if it's too small we'll access random memory.

It's easy to check this up-front; the fanout always consists of 256
uint32's, since it is a fanout of the first byte of the hash pointing
into the oid index. These parameters can't be changed without
introducing a new chunk type.

This matches the similar check in the midx OIDF chunk (but note that
rather than checking for the error immediately, the graph code just
leaves parts of the struct NULL and checks for required fields later).

Signed-off-by: Jeff King <peff@peff.net>
---
 commit-graph.c          | 13 +++++++++++--
 t/t5318-commit-graph.sh | 26 ++++++++++++++++++++++++++
 2 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/commit-graph.c b/commit-graph.c
index a689a55b79..9b3b01da61 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -305,6 +305,16 @@ static int verify_commit_graph_lite(struct commit_graph *g)
 	return 0;
 }
 
+static int graph_read_oid_fanout(const unsigned char *chunk_start,
+				 size_t chunk_size, void *data)
+{
+	struct commit_graph *g = data;
+	if (chunk_size != 256 * sizeof(uint32_t))
+		return error("commit-graph oid fanout chunk is wrong size");
+	g->chunk_oid_fanout = (const uint32_t *)chunk_start;
+	return 0;
+}
+
 static int graph_read_oid_lookup(const unsigned char *chunk_start,
 				 size_t chunk_size, void *data)
 {
@@ -394,8 +404,7 @@ struct commit_graph *parse_commit_graph(struct repo_settings *s,
 				   GRAPH_HEADER_SIZE, graph->num_chunks))
 		goto free_and_return;
 
-	pair_chunk_unsafe(cf, GRAPH_CHUNKID_OIDFANOUT,
-		   (const unsigned char **)&graph->chunk_oid_fanout);
+	read_chunk(cf, GRAPH_CHUNKID_OIDFANOUT, graph_read_oid_fanout, graph);
 	read_chunk(cf, GRAPH_CHUNKID_OIDLOOKUP, graph_read_oid_lookup, graph);
 	pair_chunk_unsafe(cf, GRAPH_CHUNKID_DATA, &graph->chunk_commit_data);
 	pair_chunk_unsafe(cf, GRAPH_CHUNKID_EXTRAEDGES, &graph->chunk_extra_edges);
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index ba65f17dd9..d25bea3ec5 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -2,6 +2,7 @@
 
 test_description='commit graph'
 . ./test-lib.sh
+. "$TEST_DIRECTORY"/lib-chunk.sh
 
 GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS=0
 
@@ -821,4 +822,29 @@ test_expect_success 'overflow during generation version upgrade' '
 	)
 '
 
+corrupt_chunk () {
+	graph=full/.git/objects/info/commit-graph &&
+	test_when_finished "rm -rf $graph" &&
+	git -C full commit-graph write --reachable &&
+	corrupt_chunk_file $graph "$@"
+}
+
+check_corrupt_chunk () {
+	corrupt_chunk "$@" &&
+	git -C full -c core.commitGraph=false log >expect.out &&
+	git -C full -c core.commitGraph=true log >out 2>err &&
+	test_cmp expect.out out
+}
+
+test_expect_success 'reader notices too-small oid fanout chunk' '
+	# make it big enough that the graph file is plausible,
+	# otherwise we hit an earlier check
+	check_corrupt_chunk OIDF clear $(printf "000000%02x" $(test_seq 250)) &&
+	cat >expect.err <<-\EOF &&
+	error: commit-graph oid fanout chunk is wrong size
+	error: commit-graph is missing the OID Fanout chunk
+	EOF
+	test_cmp expect.err err
+'
+
 test_done
-- 
2.42.0.884.g35e1fe1a6a


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox