Git development
 help / color / mirror / Atom feed
From: "Johannes Schindelin via GitGitGadget" <gitgitgadget@gmail.com>
To: git@vger.kernel.org
Cc: "Derrick Stolee" <stolee@gmail.com>,
	"Torsten Bögershausen" <tboegi@web.de>,
	"Jeff King" <peff@peff.net>,
	"Johannes Schindelin" <johannes.schindelin@gmx.de>,
	"Johannes Schindelin" <johannes.schindelin@gmx.de>
Subject: [PATCH v2 05/11] test-tool: add a helper to synthesize large packfiles
Date: Mon, 04 May 2026 17:08:22 +0000	[thread overview]
Message-ID: <afa74a3a2b9caf9989055a9311309f590729d6c1.1777914508.git.gitgitgadget@gmail.com> (raw)
In-Reply-To: <pull.2102.v2.git.1777914508.gitgitgadget@gmail.com>

From: Johannes Schindelin <johannes.schindelin@gmx.de>

To test Git's behavior with very large pack files, we need a way to
generate such files quickly.

A naive approach using only readily-available Git commands would take
over 10 hours for a 4GB pack file, which is prohibitive.

Side-stepping Git's machinery and actual zlib compression by writing
uncompressed content with the appropriate zlib header makes things
much faster. The fastest method using this approach generates many
small, unreachable blob objects and takes about 1.5 minutes for 4GB.
However, this cannot be used because we need to test git clone, which
requires a reachable commit history.

Generating many reachable commits with small, uncompressed blobs takes
about 4 minutes for 4GB. But this approach 1) does not reproduce the
issues we want to fix (which require individual objects larger than
4GB) and 2) is comparatively slow because of the many SHA-1
calculations.

The approach taken here generates a single large blob (filled with NUL
bytes), along with the trees and commits needed to make it reachable.
This takes about 2.5 minutes for 4.5GB, which is the fastest option
that produces a valid, clonable repository with an object large enough
to trigger the bugs we want to test.

Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de>
---
 Makefile                   |   1 +
 compat/zlib-compat.h       |   2 +
 t/helper/meson.build       |   1 +
 t/helper/test-synthesize.c | 250 +++++++++++++++++++++++++++++++++++++
 t/helper/test-tool.c       |   1 +
 t/helper/test-tool.h       |   1 +
 6 files changed, 256 insertions(+)
 create mode 100644 t/helper/test-synthesize.c

diff --git a/Makefile b/Makefile
index cedc234173..85405cb5b8 100644
--- a/Makefile
+++ b/Makefile
@@ -872,6 +872,7 @@ TEST_BUILTINS_OBJS += test-submodule-config.o
 TEST_BUILTINS_OBJS += test-submodule-nested-repo-config.o
 TEST_BUILTINS_OBJS += test-submodule.o
 TEST_BUILTINS_OBJS += test-subprocess.o
+TEST_BUILTINS_OBJS += test-synthesize.o
 TEST_BUILTINS_OBJS += test-trace2.o
 TEST_BUILTINS_OBJS += test-truncate.o
 TEST_BUILTINS_OBJS += test-userdiff.o
diff --git a/compat/zlib-compat.h b/compat/zlib-compat.h
index ac08276622..5078c5ef6c 100644
--- a/compat/zlib-compat.h
+++ b/compat/zlib-compat.h
@@ -7,6 +7,8 @@
 # define z_stream_s zng_stream_s
 # define gz_header_s zng_gz_header_s
 
+# define adler32(adler, buf, len) zng_adler32(adler, buf, len)
+
 # define crc32(crc, buf, len) zng_crc32(crc, buf, len)
 
 # define inflate(strm, bits) zng_inflate(strm, bits)
diff --git a/t/helper/meson.build b/t/helper/meson.build
index 675e64c010..3235f10ab8 100644
--- a/t/helper/meson.build
+++ b/t/helper/meson.build
@@ -69,6 +69,7 @@ test_tool_sources = [
   'test-submodule-nested-repo-config.c',
   'test-submodule.c',
   'test-subprocess.c',
+  'test-synthesize.c',
   'test-tool.c',
   'test-trace2.c',
   'test-truncate.c',
diff --git a/t/helper/test-synthesize.c b/t/helper/test-synthesize.c
new file mode 100644
index 0000000000..3ce7078078
--- /dev/null
+++ b/t/helper/test-synthesize.c
@@ -0,0 +1,250 @@
+#define USE_THE_REPOSITORY_VARIABLE
+
+#include "test-tool.h"
+#include "git-compat-util.h"
+#include "git-zlib.h"
+#include "hash.h"
+#include "hex.h"
+#include "object-file.h"
+#include "object.h"
+#include "pack.h"
+#include "parse-options.h"
+#include "parse.h"
+#include "repository.h"
+#include "setup.h"
+#include "strbuf.h"
+#include "write-or-die.h"
+
+#define BLOCK_SIZE 0xffff /* largest length the two length bytes of a block header can hold */
+static const unsigned char zeros[BLOCK_SIZE]; /* all-NUL source buffer used when data == NULL */
+
+/*
+ * Write `len` bytes to `f` as a zlib stream of uncompressed blocks holding
+ * at most BLOCK_SIZE (65535) bytes each, followed by the adler32 checksum.
+ * If `data` is NULL, `len` NUL bytes are written instead.
+ * Every byte written to `f` is also fed into `pack_ctx` via `algo`.
+ */
+static void write_uncompressed_zlib(FILE *f, struct git_hash_ctx *pack_ctx,
+				    const void *data, size_t len,
+				    const struct git_hash_algo *algo)
+{
+	unsigned char zlib_header[2] = { 0x78, 0x01 }; /* CMF, FLG */
+	unsigned char block_header[5];
+	const unsigned char *p = data;
+	size_t remaining = len;
+	uint32_t adler = 1L; /* adler32 initial value */
+	unsigned char adler_buf[4];
+
+	/* Write zlib header */
+	fwrite_or_die(f, zlib_header, sizeof(zlib_header));
+	algo->update_fn(pack_ctx, zlib_header, 2);
+
+	/* Write uncompressed blocks (at most BLOCK_SIZE bytes each) */
+	do { /* runs at least once, so len == 0 still emits one empty final block */
+		size_t block_len = remaining > BLOCK_SIZE ? BLOCK_SIZE : remaining;
+		int is_final = (block_len == remaining);
+		const unsigned char *block_data = data ? p : zeros;
+
+		block_header[0] = is_final ? 0x01 : 0x00; /* 0x01 flags the last block */
+		block_header[1] = block_len & 0xff; /* length, low byte first */
+		block_header[2] = (block_len >> 8) & 0xff;
+		block_header[3] = block_header[1] ^ 0xff; /* bitwise complement of the length bytes */
+		block_header[4] = block_header[2] ^ 0xff;
+
+		fwrite_or_die(f, block_header, sizeof(block_header));
+		algo->update_fn(pack_ctx, block_header, 5);
+
+		if (block_len) {
+			fwrite_or_die(f, block_data, block_len);
+			algo->update_fn(pack_ctx, block_data, block_len);
+			adler = adler32(adler, block_data, block_len);
+		}
+
+		if (data) /* zeros is reused for every block; only a real buffer advances */
+			p += block_len;
+		remaining -= block_len;
+	} while (remaining > 0);
+
+	/* Write adler32 checksum */
+	put_be32(adler_buf, adler);
+	fwrite_or_die(f, adler_buf, sizeof(adler_buf));
+	algo->update_fn(pack_ctx, adler_buf, 4);
+}
+
+/*
+ * Write one object (pack entry header plus uncompressed zlib stream) to `f`
+ * and store its object ID in `oid`. If `data == NULL`, the content is `len`
+ * NUL bytes. Everything written is also fed into `pack_ctx`.
+ */
+static void write_pack_object(FILE *f, struct git_hash_ctx *pack_ctx,
+			      enum object_type type,
+			      const void *data, size_t len,
+			      struct object_id *oid,
+			      const struct git_hash_algo *algo)
+{
+	unsigned char pack_header[MAX_PACK_OBJECT_HEADER];
+	char object_header[32];
+	int pack_header_len, object_header_len;
+	struct git_hash_ctx ctx;
+
+	/* Write pack object header */
+	pack_header_len = encode_in_pack_object_header(pack_header,
+						       sizeof(pack_header),
+						       type, len);
+	fwrite_or_die(f, pack_header, pack_header_len);
+	algo->update_fn(pack_ctx, pack_header, pack_header_len);
+
+	/* Write the data as uncompressed zlib */
+	write_uncompressed_zlib(f, pack_ctx, data, len, algo);
+
+	algo->init_fn(&ctx); /* separate context: hashes object header + content to get the ID */
+	object_header_len = format_object_header(object_header,
+						 sizeof(object_header),
+						 type, len);
+	algo->update_fn(&ctx, object_header, object_header_len);
+	if (data)
+		algo->update_fn(&ctx, data, len);
+	else { /* hash len NUL bytes: whole BLOCK_SIZE chunks first, then the remainder */
+		for (size_t i = len / BLOCK_SIZE; i; i--)
+			algo->update_fn(&ctx, zeros, BLOCK_SIZE);
+		algo->update_fn(&ctx, zeros, len % BLOCK_SIZE);
+	}
+	algo->final_oid_fn(oid, &ctx);
+}
+
+/*
+ * Generate a pack file at `path` whose only large object is a reachable
+ * blob of `blob_size` NUL bytes. The objects written, in order, are:
+ *   1. A large blob (all NUL bytes)
+ *   2. A tree containing that blob as "file"
+ *   3. A commit using that tree
+ *   4. The empty tree
+ *   5. A child commit using the empty tree
+ *
+ * Prints the child commit's object ID on stdout; always returns 0.
+ * This is useful for testing that Git can handle objects larger than 4GB.
+ */
+static int generate_pack_with_large_object(const char *path, size_t blob_size,
+					   const struct git_hash_algo *algo)
+{
+	FILE *f = xfopen(path, "wb");
+	struct git_hash_ctx pack_ctx;
+	unsigned char pack_hash[GIT_MAX_RAWSZ];
+	struct object_id blob_oid, tree_oid, commit_oid, empty_tree_oid, final_commit_oid;
+	struct strbuf buf = STRBUF_INIT;
+	const uint32_t object_count = 5; /* blob, tree, commit, empty tree, child commit */
+	struct pack_header pack_header = {
+		.hdr_signature = htonl(PACK_SIGNATURE),
+		.hdr_version = htonl(PACK_VERSION),
+		.hdr_entries = htonl(object_count),
+	};
+
+	algo->init_fn(&pack_ctx);
+
+	/* Write pack header */
+	fwrite_or_die(f, &pack_header, sizeof(pack_header));
+	algo->update_fn(&pack_ctx, &pack_header, sizeof(pack_header));
+
+	/* 1. Write the large blob */
+	write_pack_object(f, &pack_ctx, OBJ_BLOB, NULL, blob_size, &blob_oid, algo);
+
+	/* 2. Write tree containing the blob as "file" */
+	strbuf_addf(&buf, "100644 file%c", '\0'); /* tree entry: mode, name, NUL, then the raw hash */
+	strbuf_add(&buf, blob_oid.hash, algo->rawsz);
+	write_pack_object(f, &pack_ctx, OBJ_TREE, buf.buf, buf.len, &tree_oid, algo);
+
+	/* 3. Write commit using that tree */
+	strbuf_reset(&buf);
+	strbuf_addf(&buf,
+		    "tree %s\n"
+		    "author A U Thor <author@example.com> 1234567890 +0000\n"
+		    "committer C O Mitter <committer@example.com> 1234567890 +0000\n"
+		    "\n"
+		    "Large blob commit\n",
+		    oid_to_hex(&tree_oid));
+	write_pack_object(f, &pack_ctx, OBJ_COMMIT, buf.buf, buf.len, &commit_oid, algo);
+
+	/* 4. Write the empty tree */
+	write_pack_object(f, &pack_ctx, OBJ_TREE, "", 0, &empty_tree_oid, algo);
+
+	/* 5. Write final commit using empty tree, with previous commit as parent */
+	strbuf_reset(&buf);
+	strbuf_addf(&buf,
+		    "tree %s\n"
+		    "parent %s\n"
+		    "author A U Thor <author@example.com> 1234567890 +0000\n"
+		    "committer C O Mitter <committer@example.com> 1234567890 +0000\n"
+		    "\n"
+		    "Empty tree commit\n",
+		    oid_to_hex(&empty_tree_oid),
+		    oid_to_hex(&commit_oid));
+	write_pack_object(f, &pack_ctx, OBJ_COMMIT, buf.buf, buf.len, &final_commit_oid, algo);
+
+	/* Write pack trailer (checksum over everything fed into pack_ctx so far) */
+	algo->final_fn(pack_hash, &pack_ctx);
+	fwrite_or_die(f, pack_hash, algo->rawsz);
+	if (fclose(f))
+		die_errno(_("could not close '%s'"), path);
+
+	strbuf_release(&buf);
+
+	/* Print the final commit OID so caller can set up refs */
+	printf("%s\n", oid_to_hex(&final_commit_oid));
+
+	return 0;
+}
+
+static int cmd__synthesize__pack(int argc, const char **argv, /* handler for the "pack" subcommand */
+				 const char *prefix UNUSED,
+				 struct repository *repo)
+{
+	int non_git; /* filled in by setup_git_directory_gently(); not inspected afterwards */
+	int reachable_large = 0;
+	const struct git_hash_algo *algo;
+	size_t blob_size;
+	uintmax_t blob_size_u;
+	const char *path;
+	const char * const usage[] = {
+		"test-tool synthesize pack "
+		"--reachable-large <blob-size> <filename>",
+		NULL
+	};
+	struct option options[] = {
+		OPT_BOOL(0, "reachable-large", &reachable_large,
+			 N_("write a pack with a single reachable large blob")),
+		OPT_END()
+	};
+
+	setup_git_directory_gently(&non_git);
+	repo = the_repository; /* the incoming argument is overwritten; cmd__synthesize() passes NULL */
+	algo = repo->hash_algo; /* NOTE(review): confirm this is set when run outside a repository */
+
+	argc = parse_options(argc, argv, NULL, options, usage,
+			     PARSE_OPT_KEEP_ARGV0);
+	if (argc != 3 || !reachable_large) /* expects argv[1] = <blob-size>, argv[2] = <filename> */
+		usage_with_options(usage, options);
+
+	if (!git_parse_unsigned(argv[1], &blob_size_u,
+				maximum_unsigned_value_of_type(size_t)))
+		die(_("'%s' is not a valid blob size"), argv[1]);
+	blob_size = blob_size_u; /* presumably lossless: the parse was given size_t's maximum as its limit */
+	path = argv[2];
+
+	return !!generate_pack_with_large_object(path, blob_size, algo);
+}
+
+int cmd__synthesize(int argc, const char **argv) /* registered as "synthesize" in test-tool's command table */
+{
+	const char *prefix = NULL;
+	char const * const synthesize_usage[] = {
+		"test-tool synthesize pack <options>",
+		NULL,
+	};
+	parse_opt_subcommand_fn *fn = NULL; /* OPT_SUBCOMMAND() below is handed &fn to store the handler */
+	struct option options[] = {
+		OPT_SUBCOMMAND("pack", &fn, cmd__synthesize__pack),
+		OPT_END()
+	};
+	argc = parse_options(argc, argv, prefix, options, synthesize_usage, 0);
+	return !!fn(argc, argv, prefix, NULL); /* NOTE(review): assumes parse_options() rejects a missing subcommand; otherwise fn is NULL here */
+}
diff --git a/t/helper/test-tool.c b/t/helper/test-tool.c
index a7abc618b3..b71a22b43b 100644
--- a/t/helper/test-tool.c
+++ b/t/helper/test-tool.c
@@ -82,6 +82,7 @@ static struct test_cmd cmds[] = {
 	{ "submodule-config", cmd__submodule_config },
 	{ "submodule-nested-repo-config", cmd__submodule_nested_repo_config },
 	{ "subprocess", cmd__subprocess },
+	{ "synthesize", cmd__synthesize },
 	{ "trace2", cmd__trace2 },
 	{ "truncate", cmd__truncate },
 	{ "userdiff", cmd__userdiff },
diff --git a/t/helper/test-tool.h b/t/helper/test-tool.h
index 7f150fa1eb..f2885b33d5 100644
--- a/t/helper/test-tool.h
+++ b/t/helper/test-tool.h
@@ -75,6 +75,7 @@ int cmd__submodule(int argc, const char **argv);
 int cmd__submodule_config(int argc, const char **argv);
 int cmd__submodule_nested_repo_config(int argc, const char **argv);
 int cmd__subprocess(int argc, const char **argv);
+int cmd__synthesize(int argc, const char **argv);
 int cmd__trace2(int argc, const char **argv);
 int cmd__truncate(int argc, const char **argv);
 int cmd__userdiff(int argc, const char **argv);
-- 
gitgitgadget


  parent reply	other threads:[~2026-05-04 17:08 UTC|newest]

Thread overview: 60+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-28 16:26 [PATCH 0/6] Handle cloning of objects larger than 4GB on Windows Johannes Schindelin via GitGitGadget
2026-04-28 16:26 ` [PATCH 1/6] index-pack, unpack-objects: use size_t for object size Johannes Schindelin via GitGitGadget
2026-04-30 14:13   ` Torsten Bögershausen
2026-05-03 14:46     ` Johannes Schindelin
2026-04-28 16:26 ` [PATCH 2/6] git-zlib: handle data streams larger than 4GB Johannes Schindelin via GitGitGadget
2026-04-28 16:26 ` [PATCH 3/6] odb, packfile: use size_t for streaming object sizes Johannes Schindelin via GitGitGadget
2026-04-28 16:26 ` [PATCH 4/6] delta, packfile: use size_t for delta header sizes Johannes Schindelin via GitGitGadget
2026-04-29 13:28   ` Derrick Stolee
2026-05-03 14:49     ` Johannes Schindelin
2026-04-28 16:26 ` [PATCH 5/6] test-tool: add a helper to synthesize large packfiles Johannes Schindelin via GitGitGadget
2026-04-28 16:26 ` [PATCH 6/6] t5608: add regression test for >4GB object clone Johannes Schindelin via GitGitGadget
2026-04-29 13:34   ` Derrick Stolee
2026-05-01  6:38     ` Jeff King
2026-05-01 13:19       ` Derrick Stolee
2026-05-04 17:07         ` Johannes Schindelin
2026-04-29 13:35 ` [PATCH 0/6] Handle cloning of objects larger than 4GB on Windows Derrick Stolee
2026-05-04 17:08 ` [PATCH v2 00/11] " Johannes Schindelin via GitGitGadget
2026-05-04 17:08   ` [PATCH v2 01/11] index-pack, unpack-objects: use size_t for object size Johannes Schindelin via GitGitGadget
2026-05-05 19:11     ` Torsten Bögershausen
2026-05-08  7:36       ` Johannes Schindelin
2026-05-08 19:09         ` Torsten Bögershausen
2026-05-10  2:41           ` Junio C Hamano
2026-05-10  9:14             ` Torsten Bögershausen
2026-05-04 17:08   ` [PATCH v2 02/11] git-zlib: handle data streams larger than 4GB Johannes Schindelin via GitGitGadget
2026-05-04 17:08   ` [PATCH v2 03/11] odb, packfile: use size_t for streaming object sizes Johannes Schindelin via GitGitGadget
2026-05-05 19:27     ` Torsten Bögershausen
2026-05-08  7:38       ` Johannes Schindelin
2026-05-04 17:08   ` [PATCH v2 04/11] delta, packfile: use size_t for delta header sizes Johannes Schindelin via GitGitGadget
2026-05-04 17:08   ` Johannes Schindelin via GitGitGadget [this message]
2026-05-04 17:08   ` [PATCH v2 06/11] t5608: add regression test for >4GB object clone Johannes Schindelin via GitGitGadget
2026-05-04 17:08   ` [PATCH v2 07/11] test-tool synthesize: use the unsafe hash for speed Johannes Schindelin via GitGitGadget
2026-05-04 17:08   ` [PATCH v2 08/11] test-tool synthesize: precompute pack for 4 GiB + 1 Johannes Schindelin via GitGitGadget
2026-05-04 18:27     ` Derrick Stolee
2026-05-05 20:54       ` Johannes Schindelin
2026-05-04 17:08   ` [PATCH v2 09/11] test-tool synthesize: add precomputed SHA-256 " Johannes Schindelin via GitGitGadget
2026-05-04 17:08   ` [PATCH v2 10/11] t5608: mark >4GB tests as EXPENSIVE Johannes Schindelin via GitGitGadget
2026-05-04 17:08   ` [PATCH v2 11/11] ci: run expensive tests on push builds to integration branches Johannes Schindelin via GitGitGadget
2026-05-04 18:35     ` Derrick Stolee
2026-05-05 12:56       ` Junio C Hamano
2026-05-05 23:07         ` Junio C Hamano
2026-05-06  8:33           ` Johannes Schindelin
2026-05-07  9:18             ` Junio C Hamano
2026-05-07 10:24               ` Patrick Steinhardt
2026-05-08  2:50         ` Junio C Hamano
2026-05-08  8:16   ` [PATCH v3 00/11] Handle cloning of objects larger than 4GB on Windows Johannes Schindelin via GitGitGadget
2026-05-08  8:16     ` [PATCH v3 01/11] index-pack, unpack-objects: use size_t for object size Johannes Schindelin via GitGitGadget
2026-05-08  8:16     ` [PATCH v3 02/11] git-zlib: handle data streams larger than 4GB Johannes Schindelin via GitGitGadget
2026-05-08  8:16     ` [PATCH v3 03/11] odb, packfile: use size_t for streaming object sizes Johannes Schindelin via GitGitGadget
2026-05-08  8:16     ` [PATCH v3 04/11] delta, packfile: use size_t for delta header sizes Johannes Schindelin via GitGitGadget
2026-05-08  8:16     ` [PATCH v3 05/11] test-tool: add a helper to synthesize large packfiles Johannes Schindelin via GitGitGadget
2026-05-08  8:16     ` [PATCH v3 06/11] t5608: add regression test for >4GB object clone Johannes Schindelin via GitGitGadget
2026-05-08  8:16     ` [PATCH v3 07/11] test-tool synthesize: use the unsafe hash for speed Johannes Schindelin via GitGitGadget
2026-05-08  8:16     ` [PATCH v3 08/11] test-tool synthesize: precompute pack for 4 GiB + 1 Johannes Schindelin via GitGitGadget
2026-05-08  8:16     ` [PATCH v3 09/11] test-tool synthesize: add precomputed SHA-256 " Johannes Schindelin via GitGitGadget
2026-05-08  8:16     ` [PATCH v3 10/11] t5608: mark >4GB tests as EXPENSIVE Johannes Schindelin via GitGitGadget
2026-05-08  8:16     ` [PATCH v3 11/11] ci: run expensive tests on push builds to integration branches Johannes Schindelin via GitGitGadget
2026-05-10 23:51       ` [PATCH] ci: enable EXPENSIVE for contributor builds Junio C Hamano
2026-05-11  7:05         ` Patrick Steinhardt
2026-05-11  8:29           ` Junio C Hamano
2026-05-11 10:02             ` Patrick Steinhardt

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=afa74a3a2b9caf9989055a9311309f590729d6c1.1777914508.git.gitgitgadget@gmail.com \
    --to=gitgitgadget@gmail.com \
    --cc=git@vger.kernel.org \
    --cc=johannes.schindelin@gmx.de \
    --cc=peff@peff.net \
    --cc=stolee@gmail.com \
    --cc=tboegi@web.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox