git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Patrick Steinhardt <ps@pks.im>
To: Eric Wong <e@80x24.org>
Cc: git@vger.kernel.org, Jeff King <peff@peff.net>
Subject: Re: [PATCH v1 05/10] cat-file: use delta_base_cache entries directly
Date: Wed, 24 Jul 2024 10:35:59 +0200	[thread overview]
Message-ID: <ZqC872ExETzRH60Z@tanuki> (raw)
In-Reply-To: <20240715003519.2671385-6-e@80x24.org>

[-- Attachment #1: Type: text/plain, Size: 6381 bytes --]

On Mon, Jul 15, 2024 at 12:35:14AM +0000, Eric Wong wrote:
> For objects already in the delta_base_cache, we can safely use
> them directly to avoid the malloc+memcpy+free overhead.

Same here, I feel like you need to explain a bit more in depth what the
actual idea behind your patch is to help reviewers.

> diff --git a/builtin/cat-file.c b/builtin/cat-file.c
> index bc4bb89610..769c8b48d2 100644
> --- a/builtin/cat-file.c
> +++ b/builtin/cat-file.c
> @@ -24,6 +24,7 @@
>  #include "promisor-remote.h"
>  #include "mailmap.h"
>  #include "write-or-die.h"
> +#define USE_DIRECT_CACHE 1

I'm confused by this. Why do we introduce a macro that is always defined
to a trueish value? Why don't we just remove the code guarded by this?

>  enum batch_mode {
>  	BATCH_MODE_CONTENTS,
> @@ -386,7 +387,18 @@ static void print_object_or_die(struct batch_options *opt, struct expand_data *d
>  
>  	if (data->content) {
>  		batch_write(opt, data->content, data->size);
> -		FREE_AND_NULL(data->content);
> +		switch (data->info.whence) {
> +		case OI_CACHED: BUG("FIXME OI_CACHED support not done");

Is this something that will get addressed in a subsequent patch? If so,
the commit message and the message here should likely mention this. If
not, we should have a comment here saying why this is fine to be kept.

> +		case OI_LOOSE:
> +		case OI_PACKED:
> +			FREE_AND_NULL(data->content);
> +			break;
> +		case OI_DBCACHED:
> +			if (USE_DIRECT_CACHE)
> +				unlock_delta_base_cache();
> +			else
> +				FREE_AND_NULL(data->content);
> +		}
>  	} else if (data->type == OBJ_BLOB) {
>  		if (opt->buffer_output)
>  			fflush(stdout);
> @@ -815,6 +827,7 @@ static int batch_objects(struct batch_options *opt)
>  			data.info.sizep = &data.size;
>  			data.info.contentp = &data.content;
>  			data.info.content_limit = big_file_threshold;
> +			data.info.direct_cache = USE_DIRECT_CACHE;
>  		}
>  	}
>  
> diff --git a/object-file.c b/object-file.c
> index 1cc29c3c58..19100e823d 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1586,6 +1586,11 @@ static int do_oid_object_info_extended(struct repository *r,
>  			oidclr(oi->delta_base_oid, the_repository->hash_algo);
>  		if (oi->type_name)
>  			strbuf_addstr(oi->type_name, type_name(co->type));
> +		/*
> +		 * Currently `blame' is the only command which creates
> +		 * OI_CACHED, and direct_cache is only used by `cat-file'.
> +		 */
> +		assert(!oi->direct_cache);

We shouldn't use asserts, but rather use `BUG()` statements in our
codebase. `assert()`s don't help users that run production builds.

>  		if (oi->contentp)
>  			*oi->contentp = xmemdupz(co->buf, co->size);
>  		oi->whence = OI_CACHED;
> diff --git a/object-store-ll.h b/object-store-ll.h
> index b71a15f590..50c5219308 100644
> --- a/object-store-ll.h
> +++ b/object-store-ll.h
> @@ -298,6 +298,13 @@ struct object_info {
>  		OI_PACKED,
>  		OI_DBCACHED
>  	} whence;
> +
> +	/*
> +	 * set if caller is able to use OI_DBCACHED entries without copying
> +	 * TODO OI_CACHED if its use goes beyond blame
> +	 */
> +	unsigned direct_cache:1;
> +

This comment looks unfinished to me.

>  	union {
>  		/*
>  		 * struct {
> diff --git a/packfile.c b/packfile.c
> index 1a409ec142..b2660e14f9 100644
> --- a/packfile.c
> +++ b/packfile.c
> @@ -1362,6 +1362,9 @@ static enum object_type packed_to_object_type(struct repository *r,
>  static struct hashmap delta_base_cache;
>  static size_t delta_base_cached;
>  
> +/* ensures oi->direct_cache is used properly */
> +static int delta_base_cache_lock;
> +

How exactly does it ensure it? What is the intent of this variable and
how would it be used correctly?

>  static LIST_HEAD(delta_base_cache_lru);
>  
>  struct delta_base_cache_key {
> @@ -1444,6 +1447,18 @@ static void detach_delta_base_cache_entry(struct delta_base_cache_entry *ent)
>  	free(ent);
>  }
>  
> +static void lock_delta_base_cache(void)
> +{
> +	delta_base_cache_lock++;
> +	assert(delta_base_cache_lock == 1);
> +}
> +
> +void unlock_delta_base_cache(void)
> +{
> +	delta_base_cache_lock--;
> +	assert(delta_base_cache_lock == 0);
> +}

Hum. So this looks like a pseudo-mutex to me? Are there any code paths
where this may be used in a threaded context? I assume not in the
current state of affairs as we only use it in git-cat-file(1).

>  static inline void release_delta_base_cache(struct delta_base_cache_entry *ent)
>  {
>  	free(ent->data);
> @@ -1453,6 +1468,7 @@ static inline void release_delta_base_cache(struct delta_base_cache_entry *ent)
>  void clear_delta_base_cache(void)
>  {
>  	struct list_head *lru, *tmp;
> +	assert(!delta_base_cache_lock);
>  	list_for_each_safe(lru, tmp, &delta_base_cache_lru) {
>  		struct delta_base_cache_entry *entry =
>  			list_entry(lru, struct delta_base_cache_entry, lru);
> @@ -1466,6 +1482,7 @@ static void add_delta_base_cache(struct packed_git *p, off_t base_offset,
>  	struct delta_base_cache_entry *ent;
>  	struct list_head *lru, *tmp;
>  
> +	assert(!delta_base_cache_lock);
>  	/*
>  	 * Check required to avoid redundant entries when more than one thread
>  	 * is unpacking the same object, in unpack_entry() (since its phases I
> @@ -1521,11 +1538,16 @@ int packed_object_info(struct repository *r, struct packed_git *p,
>  		if (oi->sizep)
>  			*oi->sizep = ent->size;
>  		if (oi->contentp) {
> -			if (!oi->content_limit ||
> -					ent->size <= oi->content_limit)
> +			/* ignore content_limit if avoiding copy from cache */
> +			if (oi->direct_cache) {
> +				lock_delta_base_cache();
> +				*oi->contentp = ent->data;
> +			} else if (!oi->content_limit ||
> +					ent->size <= oi->content_limit) {
>  				*oi->contentp = xmemdupz(ent->data, ent->size);
> -			else
> +			} else {
>  				*oi->contentp = NULL; /* caller must stream */
> +			}
>  		}
>  	} else if (oi->contentp && !oi->content_limit) {
>  		*oi->contentp = unpack_entry(r, p, obj_offset, &type,

Okay, this hunk is the gist of this patch. Instead of copying over the
delta base, we simply take its data pointer as the content pointer. All
the other infra that you're adding is mostly only added as a safeguard
to make sure that we don't discard the delta base while the object is
getting accessed.

Patrick

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

  reply	other threads:[~2024-07-24  8:36 UTC|newest]

Thread overview: 51+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-07-15  0:35 [PATCH v1 00/10] cat-file speedups Eric Wong
2024-07-15  0:35 ` [PATCH v1 01/10] packfile: move sizep computation Eric Wong
2024-07-24  8:35   ` Patrick Steinhardt
2024-07-15  0:35 ` [PATCH v1 02/10] packfile: allow content-limit for cat-file Eric Wong
2024-07-24  8:35   ` Patrick Steinhardt
2024-07-26  7:30     ` Eric Wong
2024-07-15  0:35 ` [PATCH v1 03/10] packfile: fix off-by-one in content_limit comparison Eric Wong
2024-07-24  8:35   ` Patrick Steinhardt
2024-07-26  7:43     ` Eric Wong
2024-07-15  0:35 ` [PATCH v1 04/10] packfile: inline cache_or_unpack_entry Eric Wong
2024-07-15  0:35 ` [PATCH v1 05/10] cat-file: use delta_base_cache entries directly Eric Wong
2024-07-24  8:35   ` Patrick Steinhardt [this message]
2024-07-26  7:42     ` Eric Wong
2024-08-18 17:36       ` assert vs BUG [was: [PATCH v1 05/10] cat-file: use delta_base_cache entries directly] Eric Wong
2024-08-19 15:50         ` Junio C Hamano
2024-07-15  0:35 ` [PATCH v1 06/10] packfile: packed_object_info avoids packed_to_object_type Eric Wong
2024-07-24  8:36   ` Patrick Steinhardt
2024-07-26  8:01     ` Eric Wong
2024-07-15  0:35 ` [PATCH v1 07/10] object_info: content_limit only applies to blobs Eric Wong
2024-07-15  0:35 ` [PATCH v1 08/10] cat-file: batch-command uses content_limit Eric Wong
2024-07-15  0:35 ` [PATCH v1 09/10] cat-file: batch_write: use size_t for length Eric Wong
2024-07-15  0:35 ` [PATCH v1 10/10] cat-file: use writev(2) if available Eric Wong
2024-07-24  8:35 ` [PATCH v1 00/10] cat-file speedups Patrick Steinhardt
2024-08-23 22:46 ` [PATCH v2 " Eric Wong
2024-08-23 22:46   ` [PATCH v2 01/10] packfile: move sizep computation Eric Wong
2024-09-17 10:06     ` Taylor Blau
2024-08-23 22:46   ` [PATCH v2 02/10] packfile: allow content-limit for cat-file Eric Wong
2024-08-26 17:10     ` Junio C Hamano
2024-08-27 20:23       ` Eric Wong
2024-09-17 10:10         ` Taylor Blau
2024-09-17 21:15           ` Junio C Hamano
2024-08-23 22:46   ` [PATCH v2 03/10] packfile: fix off-by-one in content_limit comparison Eric Wong
2024-08-26 16:55     ` Junio C Hamano
2024-09-17 10:11       ` Taylor Blau
2024-08-23 22:46   ` [PATCH v2 04/10] packfile: inline cache_or_unpack_entry Eric Wong
2024-08-26 17:09     ` Junio C Hamano
2024-10-06 17:40       ` Eric Wong
2024-08-23 22:46   ` [PATCH v2 05/10] cat-file: use delta_base_cache entries directly Eric Wong
2024-08-26 21:31     ` Junio C Hamano
2024-08-26 23:05       ` Junio C Hamano
2024-08-23 22:46   ` [PATCH v2 06/10] packfile: packed_object_info avoids packed_to_object_type Eric Wong
2024-08-26 21:50     ` Junio C Hamano
2024-08-23 22:46   ` [PATCH v2 07/10] object_info: content_limit only applies to blobs Eric Wong
2024-08-26 22:02     ` Junio C Hamano
2024-08-23 22:46   ` [PATCH v2 08/10] cat-file: batch-command uses content_limit Eric Wong
2024-08-26 22:13     ` Junio C Hamano
2024-08-23 22:46   ` [PATCH v2 09/10] cat-file: batch_write: use size_t for length Eric Wong
2024-08-27  5:06     ` Junio C Hamano
2024-08-23 22:46   ` [PATCH v2 10/10] cat-file: use writev(2) if available Eric Wong
2024-08-27  5:41     ` Junio C Hamano
2024-08-27 15:43       ` Junio C Hamano

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=ZqC872ExETzRH60Z@tanuki \
    --to=ps@pks.im \
    --cc=e@80x24.org \
    --cc=git@vger.kernel.org \
    --cc=peff@peff.net \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for NNTP newsgroup(s).