All of lore.kernel.org
 help / color / mirror / Atom feed
From: Stefan Beller <stefanbeller@gmail.com>
To: Patrick Reynolds <patrick.reynolds@github.com>,
	Git <git@vger.kernel.org>, Jeff King <peff@peff.net>
Subject: Re: [PATCH v2] use a hashmap to make remotes faster
Date: Fri, 05 Sep 2014 11:55:06 +0200	[thread overview]
Message-ID: <5409887A.1000602@gmail.com> (raw)
In-Reply-To: <25222.0938886865-sendEmail@debian>

On 29.07.2014 16:43, Patrick Reynolds wrote:
> Remotes are stored as an array, so looking one up or adding one without
> duplication is an O(n) operation.  Reading an entire config file full of
> remotes is O(n^2) in the number of remotes.  For a repository with tens of
> thousands of remotes, the running time can hit multiple minutes.
> 
> Hash tables are way faster.  So we add a hashmap from remote name to
> struct remote and use it for all lookups.  The time to add a new remote to
> a repo that already has 50,000 remotes drops from ~2 minutes to < 1
> second.
> 
> We retain the old array of remotes so iterators proceed in config-file
> order.
> 
> Signed-off-by: Patrick Reynolds <patrick.reynolds@github.com>
> ---
> mark function with no arguments as taking void
> 
>  remote.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++-----------------
>  remote.h |  3 +++
>  2 files changed, 49 insertions(+), 17 deletions(-)
> 
> diff --git a/remote.c b/remote.c
> index a0701f6..52533e4 100644
> --- a/remote.c
> +++ b/remote.c
> @@ -42,6 +42,7 @@ struct rewrites {
>  static struct remote **remotes;
>  static int remotes_alloc;
>  static int remotes_nr;
> +static struct hashmap remotes_hash;
>  
>  static struct branch **branches;
>  static int branches_alloc;
> @@ -136,26 +137,51 @@ static void add_url_alias(struct remote *remote, const char *url)
>  	add_pushurl_alias(remote, url);
>  }
>  
> +struct remotes_hash_key {
> +	const char *str;
> +	int len;
> +};
> +
> +static int remotes_hash_cmp(const struct remote *a, const struct remote *b, const struct remotes_hash_key *key)
> +{
> +	if (key)
> +		return strncmp(a->name, key->str, key->len) || a->name[key->len];
> +	else
> +		return strcmp(a->name, b->name);
> +}
> +
> +static inline void init_remotes_hash(void)
> +{
> +	if (!remotes_hash.cmpfn)
> +		hashmap_init(&remotes_hash, (hashmap_cmp_fn)remotes_hash_cmp, 0);
> +}
> +
>  static struct remote *make_remote(const char *name, int len)
>  {
> -	struct remote *ret;
> -	int i;
> +	struct remote *ret, *replaced;
> +	struct remotes_hash_key lookup;
> +	struct hashmap_entry lookup_entry;
>  
> -	for (i = 0; i < remotes_nr; i++) {
> -		if (len ? (!strncmp(name, remotes[i]->name, len) &&
> -			   !remotes[i]->name[len]) :
> -		    !strcmp(name, remotes[i]->name))
> -			return remotes[i];
> -	}
> +	if (!len)
> +		len = strlen(name);
> +
> +	init_remotes_hash();
> +	lookup.str = name;
> +	lookup.len = len;
> +	hashmap_entry_init(&lookup_entry, memhash(name, len));
> +
> +	if ((ret = hashmap_get(&remotes_hash, &lookup_entry, &lookup)) != NULL)
> +		return ret;
>  
>  	ret = xcalloc(1, sizeof(struct remote));
>  	ret->prune = -1;  /* unspecified */
>  	ALLOC_GROW(remotes, remotes_nr + 1, remotes_alloc);
>  	remotes[remotes_nr++] = ret;
> -	if (len)
> -		ret->name = xstrndup(name, len);
> -	else
> -		ret->name = xstrdup(name);
> +	ret->name = xstrndup(name, len);
> +
> +	hashmap_entry_init(ret, lookup_entry.hash);
> +	replaced = hashmap_put(&remotes_hash, ret);
> +	assert(replaced == NULL);  /* no previous entry overwritten */
>  	return ret;
>  }
>  
> @@ -722,13 +748,16 @@ struct remote *pushremote_get(const char *name)
>  
>  int remote_is_configured(const char *name)
>  {
> -	int i;
> +	struct remotes_hash_key lookup;
> +	struct hashmap_entry lookup_entry;
>  	read_config();
>  
> -	for (i = 0; i < remotes_nr; i++)
> -		if (!strcmp(name, remotes[i]->name))
> -			return 1;
> -	return 0;
> +	init_remotes_hash();
> +	lookup.str = name;
> +	lookup.len = strlen(name);
> +	hashmap_entry_init(&lookup_entry, memhash(name, lookup.len));
> +
> +	return hashmap_get(&remotes_hash, &lookup_entry, &lookup) != NULL;
>  }
>  
>  int for_each_remote(each_remote_fn fn, void *priv)
> diff --git a/remote.h b/remote.h
> index 917d383..81cb5ff 100644
> --- a/remote.h
> +++ b/remote.h
> @@ -1,6 +1,7 @@
>  #ifndef REMOTE_H
>  #define REMOTE_H
>  
> +#include "hashmap.h"
>  #include "parse-options.h"
>  
>  enum {
> @@ -10,6 +11,8 @@ enum {
>  };
>  
>  struct remote {
> +	struct hashmap_entry ent;  /* must be first */
> +

I stumbled about this comment  "/* must be first */"
when reading the changelog.

Why does it need to be first?
Is it a common reason I'm just not aware of,
or would it make sense to put the reason into the comment as well?

Thanks,
Stefan

>  	const char *name;
>  	int origin;
>  
> 

  parent reply	other threads:[~2014-09-05  9:55 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-07-29 14:43 [PATCH v2] use a hashmap to make remotes faster Patrick Reynolds
2014-07-29 17:46 ` Jeff King
2014-07-30 18:26 ` Junio C Hamano
2014-09-05  9:55 ` Stefan Beller [this message]
2014-09-05 15:16   ` Jeff King

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=5409887A.1000602@gmail.com \
    --to=stefanbeller@gmail.com \
    --cc=git@vger.kernel.org \
    --cc=patrick.reynolds@github.com \
    --cc=peff@peff.net \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.