git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Patrick Steinhardt <ps@pks.im>
To: git@vger.kernel.org
Cc: Eric Sunshine <sunshine@sunshineco.com>,
	Junio C Hamano <gitster@pobox.com>
Subject: [PATCH v2 8/9] refs: implement logic to migrate between ref storage formats
Date: Fri, 24 May 2024 12:15:06 +0200	[thread overview]
Message-ID: <4d3eb5ea896bffffbf28ab4865b69901cc9edee7.1716545235.git.ps@pks.im> (raw)
In-Reply-To: <cover.1716545235.git.ps@pks.im>

[-- Attachment #1: Type: text/plain, Size: 11187 bytes --]

With the introduction of the new "reftable" backend, users may want to
migrate repositories between the backends without having to recreate the
whole repository. Add the logic to do so.

The implementation is generic and works with arbitrary ref storage
formats so that a backend does not need to implement any migration
logic. It does have a few limitations though:

  - We do not migrate repositories with worktrees, because worktrees
    have separate ref storages. It makes the overall affair more complex
    if we have to migrate multiple storages at once.

  - We do not migrate reflogs, because we have no interfaces to write
    many reflog entries.

  - We do not lock the repository for concurrent access, and thus
    concurrent writes may make use end up with weird in-between states.
    There is no way to fully lock the "files" backend for writes due to
    its format, and thus we punt on this topic altogether and defer to
    the user to avoid those from happening.

In other words, this version is a minimum viable product for migrating a
repository's ref storage format. It works alright for bare repos, which
often have neither worktrees nor reflogs. But it will not work for many
other repositories without some preparations. These limitations are not
set into stone though, and ideally we will eventually address them over
time.

The logic is not yet used by anything, and thus there are no tests for
it. Those will be added in the next commit.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
---
 refs.c | 284 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 refs.h |  18 ++++
 2 files changed, 302 insertions(+)

diff --git a/refs.c b/refs.c
index 9b112b0527..4ef2dae5e1 100644
--- a/refs.c
+++ b/refs.c
@@ -2570,3 +2570,287 @@ int ref_update_check_old_target(const char *referent, struct ref_update *update,
 			    referent, update->old_target);
 	return -1;
 }
+
+struct migration_data {
+	struct ref_store *old_refs;
+	struct ref_store *new_refs;
+	struct ref_transaction *transaction;
+	struct strbuf *errbuf;
+	const char *refname;
+};
+
+static int migrate_one_ref(const char *refname, const struct object_id *oid,
+			   int flags, void *cb_data)
+{
+	struct migration_data *data = cb_data;
+	struct strbuf symref_target = STRBUF_INIT;
+	int ret;
+
+	if (flags & REF_ISSYMREF) {
+		ret = refs_read_symbolic_ref(data->old_refs, refname, &symref_target);
+		if (ret < 0)
+			goto done;
+
+		ret = ref_transaction_update(data->transaction, refname, NULL, null_oid(),
+					     symref_target.buf, NULL,
+					     REF_SKIP_CREATE_REFLOG | REF_NO_DEREF, NULL, data->errbuf);
+		if (ret < 0)
+			goto done;
+	} else {
+		ret = ref_transaction_create(data->transaction, refname, oid,
+					     REF_SKIP_CREATE_REFLOG | REF_SKIP_OID_VERIFICATION,
+					     NULL, data->errbuf);
+		if (ret < 0)
+			goto done;
+	}
+
+done:
+	strbuf_release(&symref_target);
+	return ret;
+}
+
+static int move_files(const char *from_path, const char *to_path, struct strbuf *errbuf)
+{
+	struct strbuf from_buf = STRBUF_INIT, to_buf = STRBUF_INIT;
+	size_t from_len, to_len;
+	DIR *from_dir;
+	int ret;
+
+	from_dir = opendir(from_path);
+	if (!from_dir) {
+		strbuf_addf(errbuf, "could not open source directory: '%s'", from_path);
+		ret = -1;
+		goto done;
+	}
+
+	strbuf_addstr(&from_buf, from_path);
+	strbuf_complete(&from_buf, '/');
+	from_len = from_buf.len;
+
+	strbuf_addstr(&to_buf, to_path);
+	strbuf_complete(&to_buf, '/');
+	to_len = to_buf.len;
+
+	while (1) {
+		struct dirent *ent;
+
+		errno = 0;
+		ent = readdir(from_dir);
+		if (!ent)
+			break;
+
+		if (!strcmp(ent->d_name, ".") ||
+		    !strcmp(ent->d_name, ".."))
+			continue;
+
+		strbuf_setlen(&from_buf, from_len);
+		strbuf_addstr(&from_buf, ent->d_name);
+
+		strbuf_setlen(&to_buf, to_len);
+		strbuf_addstr(&to_buf, ent->d_name);
+
+		ret = rename(from_buf.buf, to_buf.buf);
+		if (ret < 0) {
+			strbuf_addf(errbuf, "could not link file '%s' to '%s': %s",
+				    from_buf.buf, to_buf.buf, strerror(errno));
+			goto done;
+		}
+	}
+
+	if (errno) {
+		strbuf_addf(errbuf, "could not read entry from directory '%s': %s",
+			    from_path, strerror(errno));
+		ret = -1;
+		goto done;
+	}
+
+	ret = 0;
+
+done:
+	strbuf_release(&from_buf);
+	strbuf_release(&to_buf);
+	closedir(from_dir);
+	return ret;
+}
+
+static int count_reflogs(const char *reflog, void *payload)
+{
+	size_t *reflog_count = payload;
+	(*reflog_count)++;
+	return 0;
+}
+
+static int has_worktrees(void)
+{
+	struct worktree **worktrees = get_worktrees();
+	int ret = 0;
+	size_t i;
+
+	for (i = 0; worktrees[i]; i++) {
+		if (is_main_worktree(worktrees[i]))
+			continue;
+		ret = 1;
+	}
+
+	free_worktrees(worktrees);
+	return ret;
+}
+
+int repo_migrate_ref_storage_format(struct repository *repo,
+				    enum ref_storage_format format,
+				    unsigned int flags,
+				    struct strbuf *errbuf)
+{
+	struct ref_store *old_refs = NULL, *new_refs = NULL;
+	struct ref_transaction *transaction = NULL;
+	struct strbuf buf = STRBUF_INIT;
+	struct migration_data data;
+	size_t reflog_count = 0;
+	char *new_gitdir;
+	int ret;
+
+	old_refs = get_main_ref_store(repo);
+
+	/*
+	 * The overall logic looks like this:
+	 *
+	 *   1. Set up a new temporary directory and initialize it with the new
+	 *      format. This is where all refs will be migrated into.
+	 *
+	 *   2. Enumerate all refs and write them into the new ref storage.
+	 *      This operation is safe as we do not yet modify the main
+	 *      repository.
+	 *
+	 *   3. If we're in dry-run mode then we are done and can hand over the
+	 *      directory to the caller for inspection. If not, we now start
+	 *      with the destructive part.
+	 *
+	 *   4. Delete the old ref storage from disk. As we have a copy of refs
+	 *      in the new ref storage it's okay(ish) if we now get interrupted
+	 *      as there is an equivalent copy of all refs available.
+	 *
+	 *   5. Move the new ref storage files into place.
+	 *
+	 *   6. Change the repository format to the new ref format.
+	 */
+	strbuf_addf(&buf, "%s/%s", old_refs->gitdir, "ref_migration.XXXXXX");
+	new_gitdir = mkdtemp(buf.buf);
+	if (!new_gitdir) {
+		strbuf_addf(errbuf, "cannot create migration directory: %s",
+			    strerror(errno));
+		ret = -1;
+		goto done;
+	}
+
+	if (refs_for_each_reflog(old_refs, count_reflogs, &reflog_count) < 0) {
+		strbuf_addstr(errbuf, "cannot count reflogs");
+		ret = -1;
+		goto done;
+	}
+	if (reflog_count) {
+		strbuf_addstr(errbuf, "migrating reflogs is not supported yet");
+		ret = -1;
+		goto done;
+	}
+
+	/*
+	 * TODO: we should really be passing the caller-provided repository to
+	 * `has_worktrees()`, but our worktree subsystem doesn't yet support
+	 * that.
+	 */
+	if (has_worktrees()) {
+		strbuf_addstr(errbuf, "migrating repositories with worktrees is not supported yet");
+		ret = -1;
+		goto done;
+	}
+
+	new_refs = ref_store_init(repo, format, new_gitdir,
+				  REF_STORE_ALL_CAPS);
+	ret = ref_store_create_on_disk(new_refs, 0, errbuf);
+	if (ret < 0)
+		goto done;
+
+	transaction = ref_store_transaction_begin(new_refs, errbuf);
+	if (!transaction)
+		goto done;
+
+	data.old_refs = old_refs;
+	data.new_refs = new_refs;
+	data.transaction = transaction;
+	data.errbuf = errbuf;
+
+	/*
+	 * We need to use the internal `do_for_each_ref()` here so that we can
+	 * also include broken refs and symrefs. These would otherwise be
+	 * skipped silently.
+	 *
+	 * Ideally, we would do this call while locking the old ref storage
+	 * such that there cannot be any concurrent modifications. We do not
+	 * have the infra for that though, and the "files" backend does not
+	 * allow for a central lock due to its design. It's thus on the user to
+	 * ensure that there are no concurrent writes.
+	 */
+	ret = do_for_each_ref(old_refs, "", NULL, migrate_one_ref, 0,
+			      DO_FOR_EACH_INCLUDE_ROOT_REFS | DO_FOR_EACH_INCLUDE_BROKEN,
+			      &data);
+	if (ret < 0)
+		goto done;
+
+	/*
+	 * TODO: we might want to migrate to `initial_ref_transaction_commit()`
+	 * here, which is more efficient for the files backend because it would
+	 * write new refs into the packed-refs file directly. At this point,
+	 * the files backend doesn't handle pseudo-refs and symrefs correctly
+	 * though, so this requires some more work.
+	 */
+	ret = ref_transaction_commit(transaction, errbuf);
+	if (ret < 0)
+		goto done;
+
+	if (flags & REPO_MIGRATE_REF_STORAGE_FORMAT_DRYRUN) {
+		printf(_("Finished dry-run migration of refs, "
+			 "the result can be found at '%s'\n"), new_gitdir);
+		ret = 0;
+		goto done;
+	}
+
+	/*
+	 * Until now we were in the non-destructive phase, where we only
+	 * populated the new ref store. From hereon though we are about
+	 * to get hands by deleting the old ref store and then moving
+	 * the new one into place.
+	 *
+	 * Assuming that there were no concurrent writes, the new ref
+	 * store should have all information. So if we fail from hereon
+	 * we may be in an in-between state, but it would still be able
+	 * to recover by manually moving remaining files from the
+	 * temporary migration directory into place.
+	 */
+	ret = ref_store_remove_on_disk(old_refs, errbuf);
+	if (ret < 0)
+		goto done;
+
+	ret = move_files(new_gitdir, old_refs->gitdir, errbuf);
+	if (ret < 0)
+		goto done;
+	rmdir(new_gitdir);
+
+	/*
+	 * We have migrated the repository, so we now need to adjust the
+	 * repository format so that clients will use the new ref store.
+	 * We also need to swap out the repository's main ref store.
+	 */
+	initialize_repository_version(hash_algo_by_ptr(repo->hash_algo), format, 1);
+
+	repo->refs_private = new_refs;
+	ref_store_release(old_refs);
+
+	ret = 0;
+
+done:
+	if (ret && new_refs)
+		ref_store_release(new_refs);
+	ref_transaction_free(transaction);
+	strbuf_release(&buf);
+	return ret;
+}
diff --git a/refs.h b/refs.h
index 61ee7b7a15..76d25df4de 100644
--- a/refs.h
+++ b/refs.h
@@ -1070,6 +1070,24 @@ int is_root_ref(const char *refname);
  */
 int is_pseudo_ref(const char *refname);
 
+/*
+ * The following flags can be passed to `repo_migrate_ref_storage_format()`:
+ *
+ *   - REPO_MIGRATE_REF_STORAGE_FORMAT_DRYRUN: perform a dry-run migration
+ *     without touching the main repository. The result will be written into a
+ *     temporary ref storage directory.
+ */
+#define REPO_MIGRATE_REF_STORAGE_FORMAT_DRYRUN (1 << 0)
+
+/*
+ * Migrate the ref storage format used by the repository to the
+ * specified one.
+ */
+int repo_migrate_ref_storage_format(struct repository *repo,
+				    enum ref_storage_format format,
+				    unsigned int flags,
+				    struct strbuf *err);
+
 /*
  * The following functions have been removed in Git v2.45 in favor of functions
  * that receive a `ref_store` as parameter. The intent of this section is
-- 
2.45.1.216.g4365c6fcf9.dirty


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

  parent reply	other threads:[~2024-05-24 10:15 UTC|newest]

Thread overview: 103+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-05-23  8:25 [PATCH 0/9] refs: ref storage format migrations Patrick Steinhardt
2024-05-23  8:25 ` [PATCH 1/9] setup: unset ref storage when reinitializing repository version Patrick Steinhardt
2024-05-23  8:25 ` [PATCH 2/9] refs: convert ref storage format to an enum Patrick Steinhardt
2024-05-23  8:25 ` [PATCH 3/9] refs: pass storage format to `ref_store_init()` explicitly Patrick Steinhardt
2024-05-23  8:25 ` [PATCH 4/9] refs: allow to skip creation of reflog entries Patrick Steinhardt
2024-05-23  8:25 ` [PATCH 5/9] refs/files: refactor `add_pseudoref_and_head_entries()` Patrick Steinhardt
2024-05-23  8:25 ` [PATCH 6/9] refs/files: extract function to iterate through root refs Patrick Steinhardt
2024-05-23  8:25 ` [PATCH 7/9] refs: implement removal of ref storages Patrick Steinhardt
2024-05-23  8:25 ` [PATCH 8/9] refs: implement logic to migrate between ref storage formats Patrick Steinhardt
2024-05-23 17:31   ` Eric Sunshine
2024-05-24  7:35     ` Patrick Steinhardt
2024-05-24  9:01       ` Eric Sunshine
2024-05-23  8:25 ` [PATCH 9/9] builtin/refs: new command to migrate " Patrick Steinhardt
2024-05-23 17:40   ` Eric Sunshine
2024-05-24  7:35     ` Patrick Steinhardt
2024-05-23 16:09 ` [PATCH 0/9] refs: ref storage format migrations Junio C Hamano
2024-05-24  7:33   ` Patrick Steinhardt
2024-05-24 16:28     ` Junio C Hamano
2024-05-28  5:13       ` Patrick Steinhardt
2024-05-28 16:16         ` Junio C Hamano
2024-05-24 10:14 ` [PATCH v2 " Patrick Steinhardt
2024-05-24 10:14   ` [PATCH v2 1/9] setup: unset ref storage when reinitializing repository version Patrick Steinhardt
2024-05-24 21:33     ` Justin Tobler
2024-05-28  5:13       ` Patrick Steinhardt
2024-05-24 10:14   ` [PATCH v2 2/9] refs: convert ref storage format to an enum Patrick Steinhardt
2024-05-24 10:14   ` [PATCH v2 3/9] refs: pass storage format to `ref_store_init()` explicitly Patrick Steinhardt
2024-05-24 10:14   ` [PATCH v2 4/9] refs: allow to skip creation of reflog entries Patrick Steinhardt
2024-05-24 10:14   ` [PATCH v2 5/9] refs/files: refactor `add_pseudoref_and_head_entries()` Patrick Steinhardt
2024-05-24 10:14   ` [PATCH v2 6/9] refs/files: extract function to iterate through root refs Patrick Steinhardt
2024-05-24 10:15   ` [PATCH v2 7/9] refs: implement removal of ref storages Patrick Steinhardt
2024-05-24 10:15   ` Patrick Steinhardt [this message]
2024-05-24 22:32     ` [PATCH v2 8/9] refs: implement logic to migrate between ref storage formats Justin Tobler
2024-05-28  5:14       ` Patrick Steinhardt
2024-05-24 10:15   ` [PATCH v2 9/9] builtin/refs: new command to migrate " Patrick Steinhardt
2024-05-24 18:24     ` Ramsay Jones
2024-05-24 19:29       ` Eric Sunshine
2024-05-28  5:14         ` Patrick Steinhardt
2024-05-28  6:31 ` [PATCH v3 00/12] refs: ref storage format migrations Patrick Steinhardt
2024-05-28  6:31   ` [PATCH v3 01/12] setup: unset ref storage when reinitializing repository version Patrick Steinhardt
2024-05-28  6:31   ` [PATCH v3 02/12] refs: convert ref storage format to an enum Patrick Steinhardt
2024-05-28  6:31   ` [PATCH v3 03/12] refs: pass storage format to `ref_store_init()` explicitly Patrick Steinhardt
2024-05-28  6:31   ` [PATCH v3 04/12] refs: allow to skip creation of reflog entries Patrick Steinhardt
2024-05-28  6:31   ` [PATCH v3 05/12] refs/files: refactor `add_pseudoref_and_head_entries()` Patrick Steinhardt
2024-05-28  6:31   ` [PATCH v3 06/12] refs/files: extract function to iterate through root refs Patrick Steinhardt
2024-05-28  6:31   ` [PATCH v3 07/12] refs/files: fix NULL pointer deref when releasing ref store Patrick Steinhardt
2024-05-28  6:31   ` [PATCH v3 08/12] reftable: inline `merged_table_release()` Patrick Steinhardt
2024-05-28  6:31   ` [PATCH v3 09/12] worktree: don't store main worktree twice Patrick Steinhardt
2024-05-28  6:31   ` [PATCH v3 10/12] refs: implement removal of ref storages Patrick Steinhardt
2024-05-28  6:32   ` [PATCH v3 11/12] refs: implement logic to migrate between ref storage formats Patrick Steinhardt
2024-05-28  6:32   ` [PATCH v3 12/12] builtin/refs: new command to migrate " Patrick Steinhardt
2024-05-31 23:46     ` Junio C Hamano
2024-06-02  1:03       ` Junio C Hamano
2024-06-03  7:37         ` Patrick Steinhardt
2024-05-28 18:16   ` [PATCH v3 00/12] refs: ref storage format migrations Junio C Hamano
2024-05-28 18:26     ` Junio C Hamano
2024-06-03  9:30 ` [PATCH v4 00/12] refs: ref storage migrations Patrick Steinhardt
2024-06-03  9:30   ` [PATCH v4 01/12] setup: unset ref storage when reinitializing repository version Patrick Steinhardt
2024-06-03  9:30   ` [PATCH v4 02/12] refs: convert ref storage format to an enum Patrick Steinhardt
2024-06-03  9:30   ` [PATCH v4 03/12] refs: pass storage format to `ref_store_init()` explicitly Patrick Steinhardt
2024-06-04  8:23     ` Karthik Nayak
2024-06-03  9:30   ` [PATCH v4 04/12] refs: allow to skip creation of reflog entries Patrick Steinhardt
2024-06-04 11:04     ` Karthik Nayak
2024-06-03  9:30   ` [PATCH v4 05/12] refs/files: refactor `add_pseudoref_and_head_entries()` Patrick Steinhardt
2024-06-03  9:30   ` [PATCH v4 06/12] refs/files: extract function to iterate through root refs Patrick Steinhardt
2024-06-05 10:07     ` Jeff King
2024-06-06  4:50       ` Patrick Steinhardt
2024-06-06  5:15         ` Patrick Steinhardt
2024-06-06  6:32           ` Patrick Steinhardt
2024-06-03  9:30   ` [PATCH v4 07/12] refs/files: fix NULL pointer deref when releasing ref store Patrick Steinhardt
2024-06-03  9:30   ` [PATCH v4 08/12] reftable: inline `merged_table_release()` Patrick Steinhardt
2024-06-03  9:30   ` [PATCH v4 09/12] worktree: don't store main worktree twice Patrick Steinhardt
2024-06-03  9:30   ` [PATCH v4 10/12] refs: implement removal of ref storages Patrick Steinhardt
2024-06-04 11:17     ` Karthik Nayak
2024-06-05 10:12     ` Jeff King
2024-06-05 16:54       ` Junio C Hamano
2024-06-06  4:51       ` Patrick Steinhardt
2024-06-03  9:31   ` [PATCH v4 11/12] refs: implement logic to migrate between ref storage formats Patrick Steinhardt
2024-06-04 15:28     ` Karthik Nayak
2024-06-05  5:52       ` Patrick Steinhardt
2024-06-05 10:03     ` Jeff King
2024-06-05 16:59       ` Junio C Hamano
2024-06-06  4:51         ` Patrick Steinhardt
2024-06-06  7:01           ` Jeff King
2024-06-06 15:41             ` Junio C Hamano
2024-06-08 11:36               ` Jeff King
2024-06-08 19:06                 ` Junio C Hamano
2024-06-06  4:51       ` Patrick Steinhardt
2024-06-03  9:31   ` [PATCH v4 12/12] builtin/refs: new command to migrate " Patrick Steinhardt
2024-06-06  5:28 ` [PATCH v5 00/12] refs: ref storage migrations Patrick Steinhardt
2024-06-06  5:28   ` [PATCH v5 01/12] setup: unset ref storage when reinitializing repository version Patrick Steinhardt
2024-06-06  5:29   ` [PATCH v5 02/12] refs: convert ref storage format to an enum Patrick Steinhardt
2024-06-06  5:29   ` [PATCH v5 03/12] refs: pass storage format to `ref_store_init()` explicitly Patrick Steinhardt
2024-06-06  5:29   ` [PATCH v5 04/12] refs: allow to skip creation of reflog entries Patrick Steinhardt
2024-06-06  5:29   ` [PATCH v5 05/12] refs/files: refactor `add_pseudoref_and_head_entries()` Patrick Steinhardt
2024-06-06  5:29   ` [PATCH v5 06/12] refs/files: extract function to iterate through root refs Patrick Steinhardt
2024-06-06  5:29   ` [PATCH v5 07/12] refs/files: fix NULL pointer deref when releasing ref store Patrick Steinhardt
2024-06-06  5:29   ` [PATCH v5 08/12] reftable: inline `merged_table_release()` Patrick Steinhardt
2024-06-06  5:29   ` [PATCH v5 09/12] worktree: don't store main worktree twice Patrick Steinhardt
2024-06-06  5:29   ` [PATCH v5 10/12] refs: implement removal of ref storages Patrick Steinhardt
2024-06-06  5:29   ` [PATCH v5 11/12] refs: implement logic to migrate between ref storage formats Patrick Steinhardt
2024-06-06  5:29   ` [PATCH v5 12/12] builtin/refs: new command to migrate " Patrick Steinhardt
2024-06-06  7:06   ` [PATCH v5 00/12] refs: ref storage migrations Jeff King
2024-06-06 16:18   ` Junio C Hamano

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4d3eb5ea896bffffbf28ab4865b69901cc9edee7.1716545235.git.ps@pks.im \
    --to=ps@pks.im \
    --cc=git@vger.kernel.org \
    --cc=gitster@pobox.com \
    --cc=sunshine@sunshineco.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).