All of lore.kernel.org
 help / color / mirror / Atom feed
From: Patrick Steinhardt <ps@pks.im>
To: Taylor Blau <me@ttaylorr.com>
Cc: git@vger.kernel.org, Junio C Hamano <gitster@pobox.com>,
	Jeff King <peff@peff.net>,
	Jonathan Tan <jonathantanmy@google.com>
Subject: Re: [PATCH 2/2] builtin/repack.c: implement support for `--cruft-max-size`
Date: Fri, 8 Sep 2023 13:21:44 +0200	[thread overview]
Message-ID: <ZPsDyKOa76Mxb9u-@tanuki> (raw)
In-Reply-To: <7e4e42e1aacf2111f04a933c0725a8c81769f8d6.1694123506.git.me@ttaylorr.com>

[-- Attachment #1: Type: text/plain, Size: 6905 bytes --]

On Thu, Sep 07, 2023 at 05:52:04PM -0400, Taylor Blau wrote:
[snip]
> @@ -125,17 +133,39 @@ static void mark_packs_for_deletion_1(struct string_list *names,
>  		if (len < hexsz)
>  			continue;
>  		sha1 = item->string + len - hexsz;
> -		/*
> -		 * Mark this pack for deletion, which ensures that this
> -		 * pack won't be included in a MIDX (if `--write-midx`
> -		 * was given) and that we will actually delete this pack
> -		 * (if `-d` was given).
> -		 */
> -		if (!string_list_has_string(names, sha1))
> -			item->util = (void*)1;
> +
> +		if (pack_is_retained(item)) {
> +			item->util = NULL;
> +		} else if (!string_list_has_string(names, sha1)) {
> +			/*
> +			 * Mark this pack for deletion, which ensures
> +			 * that this pack won't be included in a MIDX
> +			 * (if `--write-midx` was given) and that we
> +			 * will actually delete this pack (if `-d` was
> +			 * given).
> +			 */
> +			item->util = DELETE_PACK;
> +		}

I find the behaviour of this function a tad surprising as it doesn't
only mark a pack for deletion, but it also marks a pack as not being
retained anymore. Shouldn't we rather:

    if (pack_is_retained(item)) {
        // Theoretically speaking we shouldn't even do this bit here as
        // we _un_mark the pack for deletion. But at least we shouldn't
        // be removing the `RETAIN_PACK` bit, I'd think.
        item->util &= ~DELETE_PACK; 
    } else if (!string_list_has_string(names, sha1)) {
        // And here we shouldn't discard the `RETAIN_PACK` bit either.
        item->util |= DELETE_PACK;
    }

>  	}
>  }
>  
> +static void retain_cruft_pack(struct existing_packs *existing,
> +			      struct packed_git *cruft)
> +{
> +	struct strbuf buf = STRBUF_INIT;
> +	struct string_list_item *item;
> +
> +	strbuf_addstr(&buf, pack_basename(cruft));
> +	strbuf_strip_suffix(&buf, ".pack");
> +
> +	item = string_list_lookup(&existing->cruft_packs, buf.buf);
> +	if (!item)
> +		BUG("could not find cruft pack '%s'", pack_basename(cruft));
> +
> +	item->util = (void*)RETAIN_PACK;
> +	strbuf_release(&buf);
> +}
> +

Similarly, should we handle potentially pre-existing bits gracefully and
`item->util |= RETAIN_PACK`?

>  static void mark_packs_for_deletion(struct existing_packs *existing,
>  				    struct string_list *names)
>  
> @@ -217,6 +247,8 @@ static void collect_pack_filenames(struct existing_packs *existing,
>  	}
>  
>  	string_list_sort(&existing->kept_packs);
> +	string_list_sort(&existing->non_kept_packs);
> +	string_list_sort(&existing->cruft_packs);
>  	strbuf_release(&buf);
>  }
>  
> @@ -799,6 +831,72 @@ static void remove_redundant_bitmaps(struct string_list *include,
>  	strbuf_release(&path);
>  }
>  
> +static int existing_cruft_pack_cmp(const void *va, const void *vb)
> +{
> +	struct packed_git *a = *(struct packed_git **)va;
> +	struct packed_git *b = *(struct packed_git **)vb;
> +
> +	if (a->pack_size < b->pack_size)
> +		return -1;
> +	if (a->pack_size > b->pack_size)
> +		return 1;
> +	return 0;
> +}
> +
> +static void collapse_small_cruft_packs(FILE *in, unsigned long max_size,

We might want to use `size_t` to denote file sizes instead of `unsigned
long`.

> +				       struct existing_packs *existing)
> +{
> +	struct packed_git **existing_cruft, *p;
> +	struct strbuf buf = STRBUF_INIT;
> +	unsigned long total_size = 0;

Here, as well.

> +	size_t existing_cruft_nr = 0;
> +	size_t i;
> +
> +	ALLOC_ARRAY(existing_cruft, existing->cruft_packs.nr);
> +
> +	for (p = get_all_packs(the_repository); p; p = p->next) {
> +		if (!(p->is_cruft && p->pack_local))
> +			continue;
> +
> +		strbuf_reset(&buf);
> +		strbuf_addstr(&buf, pack_basename(p));
> +		strbuf_strip_suffix(&buf, ".pack");
> +
> +		if (!string_list_has_string(&existing->cruft_packs, buf.buf))
> +			continue;
> +
> +		if (existing_cruft_nr >= existing->cruft_packs.nr)
> +			BUG("too many cruft packs (found %"PRIuMAX", but knew "
> +			    "of %"PRIuMAX")",
> +			    (uintmax_t)existing_cruft_nr + 1,
> +			    (uintmax_t)existing->cruft_packs.nr);
> +		existing_cruft[existing_cruft_nr++] = p;
> +	}
> +
> +	QSORT(existing_cruft, existing_cruft_nr, existing_cruft_pack_cmp);
> +
> +	for (i = 0; i < existing_cruft_nr; i++) {
> +		off_t proposed;

This should also be `size_t` given that `st_add` returns `size_t` and
not `off_t`.

> +		p = existing_cruft[i];
> +		proposed = st_add(total_size, p->pack_size);
> +
> +		if (proposed <= max_size) {
> +			total_size = proposed;
> +			fprintf(in, "-%s\n", pack_basename(p));
> +		} else {
> +			retain_cruft_pack(existing, p);
> +			fprintf(in, "%s\n", pack_basename(p));
> +		}

It's a bit funny that we re-check whether we have exceeded the maximum
size in subsequente iterations once we hit the limit, but it arguably
makes the logic a bit simpler.

> +	}
> +
> +	for (i = 0; i < existing->non_kept_packs.nr; i++)
> +		fprintf(in, "-%s.pack\n",
> +			existing->non_kept_packs.items[i].string);
> +
> +	strbuf_release(&buf);
> +}
> +
>  static int write_cruft_pack(const struct pack_objects_args *args,
>  			    const char *destination,
>  			    const char *pack_prefix,
> @@ -846,10 +944,18 @@ static int write_cruft_pack(const struct pack_objects_args *args,
>  	in = xfdopen(cmd.in, "w");
>  	for_each_string_list_item(item, names)
>  		fprintf(in, "%s-%s.pack\n", pack_prefix, item->string);
> -	for_each_string_list_item(item, &existing->non_kept_packs)
> -		fprintf(in, "-%s.pack\n", item->string);
> -	for_each_string_list_item(item, &existing->cruft_packs)
> -		fprintf(in, "-%s.pack\n", item->string);
> +	if (args->max_pack_size && !cruft_expiration) {
> +		unsigned long max_pack_size;
> +		if (!git_parse_ulong(args->max_pack_size, &max_pack_size))
> +			return error(_("could not parse --cruft-max-size: '%s'"),
> +				     args->max_pack_size);
> +		collapse_small_cruft_packs(in, max_pack_size, existing);
> +	} else {
> +		for_each_string_list_item(item, &existing->non_kept_packs)
> +			fprintf(in, "-%s.pack\n", item->string);
> +		for_each_string_list_item(item, &existing->cruft_packs)
> +			fprintf(in, "-%s.pack\n", item->string);
> +	}

If I understand correctly, we only collapse small cruft packs in case
we're not expiring any objects at the same time. Is there an inherent
reason why? I would imagine that it can indeed be useful to expire
objects contained in cruft packs and then have git-repack(1) recombine
whatever is left into larger packs.

If the reason is basically "it's complicated" then that is fine with me,
we can still implement the functionality at a later point in time. But
until then I think that we should let callers know that the two options
are incompatible with each other by producing an error when both are
passed.

Patrick

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

  parent reply	other threads:[~2023-09-08 11:21 UTC|newest]

Thread overview: 21+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-09-07 21:51 [PATCH 0/2] repack: implement `--cruft-max-size` Taylor Blau
2023-09-07 21:52 ` [PATCH 1/2] t7700: split cruft-related tests to t7704 Taylor Blau
2023-09-08  0:01   ` Eric Sunshine
2023-09-07 21:52 ` [PATCH 2/2] builtin/repack.c: implement support for `--cruft-max-size` Taylor Blau
2023-09-07 23:42   ` Junio C Hamano
2023-09-25 18:01     ` Taylor Blau
2023-09-08 11:21   ` Patrick Steinhardt [this message]
2023-10-02 20:30     ` Taylor Blau
2023-10-03  0:44 ` [PATCH v2 0/3] repack: implement `--cruft-max-size` Taylor Blau
2023-10-03  0:44   ` [PATCH v2 1/3] t7700: split cruft-related tests to t7704 Taylor Blau
2023-10-03  0:44   ` [PATCH v2 2/3] builtin/repack.c: parse `--max-pack-size` with OPT_MAGNITUDE Taylor Blau
2023-10-05 11:31     ` Patrick Steinhardt
2023-10-05 17:28       ` Taylor Blau
2023-10-05 20:22         ` Junio C Hamano
2023-10-03  0:44   ` [PATCH v2 3/3] builtin/repack.c: implement support for `--max-cruft-size` Taylor Blau
2023-10-05 12:08     ` Patrick Steinhardt
2023-10-05 17:35       ` Taylor Blau
2023-10-05 20:25       ` Junio C Hamano
2023-10-07 17:20     ` [PATCH] repack: free existing_cruft array after use Jeff King
2023-10-09  1:24       ` Taylor Blau
2023-10-09 17:28         ` Junio C Hamano

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=ZPsDyKOa76Mxb9u-@tanuki \
    --to=ps@pks.im \
    --cc=git@vger.kernel.org \
    --cc=gitster@pobox.com \
    --cc=jonathantanmy@google.com \
    --cc=me@ttaylorr.com \
    --cc=peff@peff.net \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.