[PATCH 1/3] cherry: cache patch-ids to avoid repeating work

git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH 1/3] cherry: cache patch-ids to avoid repeating work
@ 2008-07-09  3:53 Geoffrey Irving
  2008-07-09  5:14 ` Junio C Hamano
  0 siblings, 1 reply; 21+ messages in thread
From: Geoffrey Irving @ 2008-07-09  3:53 UTC (permalink / raw)
  To: Johannes Schindelin, git@vger.kernel.org, Junio C Hamano

>From a3afd1455d215a541e1481e0f064df743d9219cc Mon Sep 17 00:00:00 2001
From: Geoffrey Irving <irving@naml.us>
Date: Sat, 7 Jun 2008 16:03:49 -0700
Subject: [PATCH 1/3] cherry: cache patch-ids to avoid repeating work

Added cached-sha-map.[hc] implementing a persistent hash map from sha1 to
sha1.  The map is read with mmap, and completely rewritten if any entries
change.  It would be good to add incremental update to handle the usual case
where only a few entries change.

This structure is used by patch-ids.c to cache the mapping from commit to
patch-id into $GIT_DIR/patch-id-cache.  In the one case I've tested so far,
this speeds up the second invocation of git-cherry by two orders of
magnitude.

Original code cannibalized from Johannes Schindelin's notes-index structure.
---

Here's another (hopefully final) version of the patch-id-cache code,
since I finally got around to updating it with Dscho's suggestions.

 Makefile          |    2 +
 cached-sha1-map.c |  182 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 cached-sha1-map.h |   45 +++++++++++++
 patch-ids.c       |   18 +++++-
 4 files changed, 246 insertions(+), 1 deletions(-)
 create mode 100644 cached-sha1-map.c
 create mode 100644 cached-sha1-map.h

diff --git a/Makefile b/Makefile
index 4796565..f7360e1 100644
--- a/Makefile
+++ b/Makefile
@@ -356,6 +356,7 @@ LIB_H += pack-refs.h
 LIB_H += pack-revindex.h
 LIB_H += parse-options.h
 LIB_H += patch-ids.h
+LIB_H += cached-sha1-map.h
 LIB_H += path-list.h
 LIB_H += pkt-line.h
 LIB_H += progress.h
@@ -436,6 +437,7 @@ LIB_OBJS += pager.o
 LIB_OBJS += parse-options.o
 LIB_OBJS += patch-delta.o
 LIB_OBJS += patch-ids.o
+LIB_OBJS += cached-sha1-map.o
 LIB_OBJS += path-list.o
 LIB_OBJS += path.o
 LIB_OBJS += pkt-line.o
diff --git a/cached-sha1-map.c b/cached-sha1-map.c
new file mode 100644
index 0000000..e363745
--- /dev/null
+++ b/cached-sha1-map.c
@@ -0,0 +1,182 @@
+#include "cached-sha1-map.h"
+
+union cached_sha1_map_header {
+	struct {
+		char signature[4]; /* HASH */
+		off_t count, size;
+	};
+	struct cached_sha1_entry padding; /* pad header out to 40 bytes */
+};
+
+static const char *signature = "HASH";
+
+static void init_empty_map(struct cached_sha1_map *cache, size_t size)
+{
+	cache->count = 0;
+	cache->size = size;
+	cache->initialized = 1;
+	cache->dirty = 1;
+	cache->mmapped = 0;
+	cache->entries = xcalloc(size, sizeof(struct cached_sha1_entry));
+}
+
+static void grow_map(struct cached_sha1_map *cache)
+{
+	struct cached_sha1_map new_cache;
+	size_t i;
+
+	/* allocate cache with twice the size */
+	new_cache.filename = cache->filename;
+	init_empty_map(&new_cache, cache->size * 2);
+
+	/* reinsert all entries */
+ 	for (i = 0; i < cache->size; i++)
+		if (!is_null_sha1(cache->entries[i].key))
+			set_cached_sha1_entry(&new_cache,
+				cache->entries[i].key, cache->entries[i].value);
+	/* finish */
+	free_cached_sha1_map(cache);
+	*cache = new_cache;
+}
+
+static void init_cached_sha1_map(struct cached_sha1_map *cache)
+{
+	int fd;
+	union cached_sha1_map_header header;
+
+	if (cache->initialized)
+		return;
+
+	fd = open(git_path(cache->filename), O_RDONLY);
+	if (fd < 0) {
+		init_empty_map(cache, 64);
+		return;
+	}
+
+	if (read_in_full(fd, &header, sizeof(header)) != sizeof(header))
+		die("cannot read %s header", cache->filename);
+
+	if (memcmp(header.signature, signature, 4))
+		die("%s has invalid header", cache->filename);
+
+	if (header.size & (header.size-1))
+		die("%s size %lld is not a power of two", cache->filename,
+			(long long)header.size);
+
+	cache->count = header.count;
+	cache->size = header.size;
+	cache->dirty = 0;
+	cache->initialized = 1;
+	cache->mmapped = 1;
+
+	/* check off_t to size_t conversion */
+	if (cache->count != header.count || cache->size != header.size)
+		die("%s is too large to hold in memory", cache->filename);
+
+	/* mmap entire file so that file / memory blocks are aligned */
+	cache->entries = xmmap(NULL,
+		sizeof(struct cached_sha1_entry) * (header.size + 1),
+		PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
+	cache->entries += 1; /* skip header */
+	close(fd);
+}
+
+int write_cached_sha1_map(struct cached_sha1_map *cache)
+{
+	union cached_sha1_map_header header;
+	struct lock_file update_lock;
+	int fd;
+	size_t entry_size;
+
+	if (!cache->initialized || !cache->dirty)
+		return 0;
+
+	fd = hold_lock_file_for_update(&update_lock,
+			git_path(cache->filename), 0);
+
+	if (fd < 0)
+		return error("could not construct %s", cache->filename);
+
+	memcpy(header.signature, signature, 4);
+	header.count = cache->count;
+	header.size = cache->size;
+	entry_size = sizeof(struct cached_sha1_entry) * cache->size;
+	if (write_in_full(fd, &header, sizeof(header)) != sizeof(header)
+		|| write_in_full(fd, cache->entries, entry_size) != entry_size)
+		return error("could not write %s", cache->filename);
+
+	if (commit_lock_file(&update_lock) < 0)
+		return error("could not write %s", cache->filename);
+
+	cache->dirty = 0;
+	return 0;
+}
+
+void free_cached_sha1_map(struct cached_sha1_map *cache)
+{
+	if (!cache->initialized)
+		return;
+
+	if (cache->mmapped)
+		munmap(cache->entries - 1,
+			sizeof(struct cached_sha1_entry) * (cache->size + 1));
+	else
+		free(cache->entries);
+}
+
+static size_t get_hash_index(const unsigned char *sha1)
+{
+	return ntohl(*(size_t*)sha1);
+}
+
+int get_cached_sha1_entry(struct cached_sha1_map *cache,
+	const unsigned char *key, unsigned char *value)
+{
+	size_t i, mask;
+
+	if (!cache->initialized)
+		init_cached_sha1_map(cache);
+
+	mask = cache->size - 1;
+
+	for (i = get_hash_index(key) & mask; ; i = (i+1) & mask) {
+		if (!hashcmp(key, cache->entries[i].key)) {
+			hashcpy(value, cache->entries[i].value);
+			return 0;
+		} else if (is_null_sha1(cache->entries[i].key))
+			return -1;
+	}
+}
+
+void set_cached_sha1_entry(struct cached_sha1_map *cache,
+	const unsigned char *key, const unsigned char *value)
+{
+	size_t i, mask;
+	struct cached_sha1_entry *entry;
+
+	if (!cache->initialized)
+		init_cached_sha1_map(cache);
+
+	if (4*cache->count >= 3*cache->size)
+		grow_map(cache);
+
+	mask = cache->size - 1;
+
+	for (i = get_hash_index(key) & mask; ; i = (i+1) & mask) {
+		entry = cache->entries+i;
+
+		if (is_null_sha1(entry->key)) {
+			hashcpy(entry->key, key);
+			hashcpy(entry->value, value);
+			cache->count++;
+			cache->dirty = 1;
+			return;
+		} else if(!hashcmp(key, entry->key)) {
+			if (hashcmp(value, entry->value)) {
+				hashcpy(entry->value, value);
+				cache->dirty = 1;
+			}
+			return;
+		}
+	}
+}
diff --git a/cached-sha1-map.h b/cached-sha1-map.h
new file mode 100644
index 0000000..f592d07
--- /dev/null
+++ b/cached-sha1-map.h
@@ -0,0 +1,45 @@
+#ifndef CACHED_SHA1_MAP_H
+#define CACHED_SHA1_MAP_H
+
+#include "cache.h"
+
+/*
+ * A cached-sha1-map is a file storing a hash map from sha1 to sha1.
+ *
+ * The file is mmap'ed, updated in memory during operation, and flushed
+ * back to disk when freed.  Currently the entire file is rewritten for
+ * any change.  This could be a significant bottleneck for common uses,
+ * so it would be good to fix this later if possible.
+ *
+ * The performance of a hash map depends highly on a good hashing
+ * algorithm, to avoid collisions.  Lucky us!  SHA-1 is a pretty good
+ * hashing algorithm.
+ */
+
+struct cached_sha1_entry {
+	unsigned char key[20];
+	unsigned char value[20];
+};
+
+struct cached_sha1_map {
+	const char *filename; /* relative to GIT_DIR */
+
+	/* rest is for internal use */
+	size_t count, size;
+	unsigned int initialized : 1;
+	unsigned int dirty : 1;
+	unsigned int mmapped : 1;
+	struct cached_sha1_entry *entries; /* pointer to mmap'ed memory + 1 */
+};
+
+extern int get_cached_sha1_entry(struct cached_sha1_map *cache,
+	const unsigned char *key,unsigned char *value);
+
+extern void set_cached_sha1_entry(struct cached_sha1_map *cache,
+	const unsigned char *key, const unsigned char *value);
+
+extern int write_cached_sha1_map(struct cached_sha1_map *cache);
+
+extern void free_cached_sha1_map(struct cached_sha1_map *cache);
+
+#endif
diff --git a/patch-ids.c b/patch-ids.c
index 3be5d31..36332f3 100644
--- a/patch-ids.c
+++ b/patch-ids.c
@@ -2,17 +2,31 @@
 #include "diff.h"
 #include "commit.h"
 #include "patch-ids.h"
+#include "cached-sha1-map.h"
+
+struct cached_sha1_map patch_id_cache;

 static int commit_patch_id(struct commit *commit, struct diff_options *options,
 		    unsigned char *sha1)
 {
+	/* pull patch-id out of the cache if possible */
+	patch_id_cache.filename = "patch-id-cache";
+	if (!get_cached_sha1_entry(&patch_id_cache, commit->object.sha1, sha1))
+		return 0;
+
 	if (commit->parents)
 		diff_tree_sha1(commit->parents->item->object.sha1,
 		               commit->object.sha1, "", options);
 	else
 		diff_root_tree_sha1(commit->object.sha1, "", options);
 	diffcore_std(options);
-	return diff_flush_patch_id(options, sha1);
+	int ret = diff_flush_patch_id(options, sha1);
+	if (ret)
+		return ret;
+
+	/* record commit, patch-id pair in cache */
+	set_cached_sha1_entry(&patch_id_cache, commit->object.sha1, sha1);
+	return 0;
 }

 static uint32_t take2(const unsigned char *id)
@@ -136,6 +150,8 @@ int free_patch_ids(struct patch_ids *ids)
 		next = patches->next;
 		free(patches);
 	}
+
+	write_cached_sha1_map(&patch_id_cache);
 	return 0;
 }

-- 
1.5.6.2.258.g7a51a

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/3] cherry: cache patch-ids to avoid repeating work
  2008-07-09  3:53 [PATCH 1/3] cherry: cache patch-ids to avoid repeating work Geoffrey Irving
@ 2008-07-09  5:14 ` Junio C Hamano
  2008-07-09  5:26   ` Geoffrey Irving
  0 siblings, 1 reply; 21+ messages in thread
From: Junio C Hamano @ 2008-07-09  5:14 UTC (permalink / raw)
  To: Geoffrey Irving; +Cc: Johannes Schindelin, git@vger.kernel.org, Junio C Hamano

"Geoffrey Irving" <irving@naml.us> writes:

> From a3afd1455d215a541e1481e0f064df743d9219cc Mon Sep 17 00:00:00 2001

Please drop this line.

> From: Geoffrey Irving <irving@naml.us>
> Date: Sat, 7 Jun 2008 16:03:49 -0700
> Subject: [PATCH 1/3] cherry: cache patch-ids to avoid repeating work

These are Ok _if_ the difference between them and what you have in your
e-mail header really matter (e.g. you are forwarding somebody else's
patch).  I don't think it is in this case, though.

> Added cached-sha-map.[hc] implementing a persistent hash map from sha1 to
> sha1.

"Add cached-sha1-map.[ch]" (imperative mood), not "(here is what I) _did_".

> diff --git a/cached-sha1-map.c b/cached-sha1-map.c
> new file mode 100644
> index 0000000..e363745
> --- /dev/null
> +++ b/cached-sha1-map.c
> @@ -0,0 +1,182 @@
> +#include "cached-sha1-map.h"
> +
> +union cached_sha1_map_header {
> +	struct {
> +		char signature[4]; /* HASH */
> +		off_t count, size;
> +	};
> +	struct cached_sha1_entry padding; /* pad header out to 40 bytes */
> +};
> +
> +static const char *signature = "HASH";

That sounds a bit too generic, doesn't it, to protect ourselves from
getting confused by some other filetype?

> +static void init_cached_sha1_map(struct cached_sha1_map *cache)
> +{
> +	int fd;
> +	union cached_sha1_map_header header;
> +
> +	if (cache->initialized)
> +		return;
> +
> +	fd = open(git_path(cache->filename), O_RDONLY);
> +	if (fd < 0) {
> +		init_empty_map(cache, 64);
> +		return;

Check errno and do this only when ENOENT.  Other errors should be caught
and reported.

> +	}
> +
> +	if (read_in_full(fd, &header, sizeof(header)) != sizeof(header))
> +		die("cannot read %s header", cache->filename);
> +
> +	if (memcmp(header.signature, signature, 4))
> +		die("%s has invalid header", cache->filename);
> +
> +	if (header.size & (header.size-1))
> +		die("%s size %lld is not a power of two", cache->filename,
> +			(long long)header.size);

Two issues and a half:

 - Isn't it gcc extension to be able to say header.signature, bypassing
   the anonymous structure inside the union that the "header" itself is?

 - The signature header (count and size) is defined to be off_t, which
   makes the cached file unusable across architectures.  The map header
   structure should be specified with explicit size:

	union {
		struct {
	                char sig[4];
			uint32_t version;
                        uint32_t count;
                        unit32_t size;
		} u;
                struct cached_sha1_entry pad;
        };

   the uint32_t fields should be treated as network byte order integers,
   e.g.

	cache->count = ntohl(header.u.count);
        header.u.size = htonl(cache->size);

 - If this file is truly intended as "cache", shouldn't corruption of it
   be detected, reported but otherwise ignored, so that the lookup would
   continue in degraded uncached mode?

> +	/* check off_t to size_t conversion */
> +	if (cache->count != header.count || cache->size != header.size)
> +		die("%s is too large to hold in memory", cache->filename);

This does not make sense to me.  What you are checking does not match the
error message.

If you are making the file format architecture dependent (which I would
suggest strongly against), you can use the same type and be done with it.
Otherwise, if you are making the format portable across architectures,
then you would know how large the on-disk integer will be, so as long as
you use appropriate type that is large enough for cache->count you should
be Ok.

What you may want to check is that (header.u.size + 1) * sizeof(entry)
does not wrap around, but you don't.

> +	/* mmap entire file so that file / memory blocks are aligned */
> +	cache->entries = xmmap(NULL,
> +		sizeof(struct cached_sha1_entry) * (header.size + 1),
> +		PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);

I think this will die() if the file is too large to map.  Ideally you
would want to allow this mmap to fail if the cache is too large, in which
case you can gracefully degrade to cacheless mode of operation, but that
can probably be left to 47th round.

> +int write_cached_sha1_map(struct cached_sha1_map *cache)
> +{
> +	union cached_sha1_map_header header;
> +	struct lock_file update_lock;
> +	int fd;
> +	size_t entry_size;
> +
> +	if (!cache->initialized || !cache->dirty)
> +		return 0;
> +
> +	fd = hold_lock_file_for_update(&update_lock,
> +			git_path(cache->filename), 0);
> +
> +	if (fd < 0)
> +		return error("could not construct %s", cache->filename);

Use a "const char *" to hold git_path(cache->filename) upfront in the
function, use it to obtain lock _and_ for reporting.

> +	memcpy(header.signature, signature, 4);
> +	header.count = cache->count;
> +	header.size = cache->size;

And here will be htonl().

> +	entry_size = sizeof(struct cached_sha1_entry) * cache->size;

Typically "entry_size" means the size of individual entry; this is the
size of the whole thing.

> +	if (write_in_full(fd, &header, sizeof(header)) != sizeof(header)
> +		|| write_in_full(fd, cache->entries, entry_size) != entry_size)
> +		return error("could not write %s", cache->filename);
> +
> +	if (commit_lock_file(&update_lock) < 0)
> +		return error("could not write %s", cache->filename);
> +
> +	cache->dirty = 0;
> +	return 0;
> +}

But it is good that you used this intermediate variable; the above
write_in_full() is much easier to read than the xmmap() above at the end
of init_cached_sha1_map() function.

> +static size_t get_hash_index(const unsigned char *sha1)
> +{
> +	return ntohl(*(size_t*)sha1);
> +}

Two issues:

 - I do not see any guarantee that sha1 is suitably aligned for reading
   size_t bytes off of;

 - size_t is architecture dependent, so you would get different hash value
   depending on the architecture, which again makes this file format
   unportable.

> diff --git a/patch-ids.c b/patch-ids.c
> index 3be5d31..36332f3 100644
> --- a/patch-ids.c
> +++ b/patch-ids.c
> @@ -2,17 +2,31 @@
>  #include "diff.h"
>  #include "commit.h"
>  #include "patch-ids.h"
> +#include "cached-sha1-map.h"
> +
> +struct cached_sha1_map patch_id_cache;

Does this have to be extern?

>  static int commit_patch_id(struct commit *commit, struct diff_options *options,
>  		    unsigned char *sha1)
>  {
> +	/* pull patch-id out of the cache if possible */
> +	patch_id_cache.filename = "patch-id-cache";
> +	if (!get_cached_sha1_entry(&patch_id_cache, commit->object.sha1, sha1))
> +		return 0;
> +
>  	if (commit->parents)
>  		diff_tree_sha1(commit->parents->item->object.sha1,
>  		               commit->object.sha1, "", options);
>  	else
>  		diff_root_tree_sha1(commit->object.sha1, "", options);
>  	diffcore_std(options);
> -	return diff_flush_patch_id(options, sha1);
> +	int ret = diff_flush_patch_id(options, sha1);

Decl-after-statement.

> +	if (ret)
> +		return ret;
> +
> +	/* record commit, patch-id pair in cache */
> +	set_cached_sha1_entry(&patch_id_cache, commit->object.sha1, sha1);
> +	return 0;
>  }

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/3] cherry: cache patch-ids to avoid repeating work
  2008-07-09  5:14 ` Junio C Hamano
@ 2008-07-09  5:26   ` Geoffrey Irving
  2008-07-09  6:24     ` Junio C Hamano
  0 siblings, 1 reply; 21+ messages in thread
From: Geoffrey Irving @ 2008-07-09  5:26 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Johannes Schindelin, git@vger.kernel.org

On Tue, Jul 8, 2008 at 10:14 PM, Junio C Hamano <gitster@pobox.com> wrote:
> "Geoffrey Irving" <irving@naml.us> writes:
>
>> From a3afd1455d215a541e1481e0f064df743d9219cc Mon Sep 17 00:00:00 2001
>
> Please drop this line.
>
>> From: Geoffrey Irving <irving@naml.us>
>> Date: Sat, 7 Jun 2008 16:03:49 -0700
>> Subject: [PATCH 1/3] cherry: cache patch-ids to avoid repeating work
>
> These are Ok _if_ the difference between them and what you have in your
> e-mail header really matter (e.g. you are forwarding somebody else's
> patch).  I don't think it is in this case, though.
>
>> Added cached-sha-map.[hc] implementing a persistent hash map from sha1 to
>> sha1.
>
> "Add cached-sha1-map.[ch]" (imperative mood), not "(here is what I) _did_".
>
>> diff --git a/cached-sha1-map.c b/cached-sha1-map.c
>> new file mode 100644
>> index 0000000..e363745
>> --- /dev/null
>> +++ b/cached-sha1-map.c
>> @@ -0,0 +1,182 @@
>> +#include "cached-sha1-map.h"
>> +
>> +union cached_sha1_map_header {
>> +     struct {
>> +             char signature[4]; /* HASH */
>> +             off_t count, size;
>> +     };
>> +     struct cached_sha1_entry padding; /* pad header out to 40 bytes */
>> +};
>> +
>> +static const char *signature = "HASH";
>
> That sounds a bit too generic, doesn't it, to protect ourselves from
> getting confused by some other filetype?
>
>> +static void init_cached_sha1_map(struct cached_sha1_map *cache)
>> +{
>> +     int fd;
>> +     union cached_sha1_map_header header;
>> +
>> +     if (cache->initialized)
>> +             return;
>> +
>> +     fd = open(git_path(cache->filename), O_RDONLY);
>> +     if (fd < 0) {
>> +             init_empty_map(cache, 64);
>> +             return;
>
> Check errno and do this only when ENOENT.  Other errors should be caught
> and reported.
>
>> +     }
>> +
>> +     if (read_in_full(fd, &header, sizeof(header)) != sizeof(header))
>> +             die("cannot read %s header", cache->filename);
>> +
>> +     if (memcmp(header.signature, signature, 4))
>> +             die("%s has invalid header", cache->filename);
>> +
>> +     if (header.size & (header.size-1))
>> +             die("%s size %lld is not a power of two", cache->filename,
>> +                     (long long)header.size);
>
> Two issues and a half:
>
>  - Isn't it gcc extension to be able to say header.signature, bypassing
>   the anonymous structure inside the union that the "header" itself is?
>
>  - The signature header (count and size) is defined to be off_t, which
>   makes the cached file unusable across architectures.  The map header
>   structure should be specified with explicit size:
>
>        union {
>                struct {
>                        char sig[4];
>                        uint32_t version;
>                        uint32_t count;
>                        unit32_t size;
>                } u;
>                struct cached_sha1_entry pad;
>        };
>
>   the uint32_t fields should be treated as network byte order integers,
>   e.g.
>
>        cache->count = ntohl(header.u.count);
>        header.u.size = htonl(cache->size);
>
>  - If this file is truly intended as "cache", shouldn't corruption of it
>   be detected, reported but otherwise ignored, so that the lookup would
>   continue in degraded uncached mode?
>
>> +     /* check off_t to size_t conversion */
>> +     if (cache->count != header.count || cache->size != header.size)
>> +             die("%s is too large to hold in memory", cache->filename);
>
> This does not make sense to me.  What you are checking does not match the
> error message.
>
> If you are making the file format architecture dependent (which I would
> suggest strongly against), you can use the same type and be done with it.
> Otherwise, if you are making the format portable across architectures,
> then you would know how large the on-disk integer will be, so as long as
> you use appropriate type that is large enough for cache->count you should
> be Ok.
>
> What you may want to check is that (header.u.size + 1) * sizeof(entry)
> does not wrap around, but you don't.
>
>> +     /* mmap entire file so that file / memory blocks are aligned */
>> +     cache->entries = xmmap(NULL,
>> +             sizeof(struct cached_sha1_entry) * (header.size + 1),
>> +             PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
>
> I think this will die() if the file is too large to map.  Ideally you
> would want to allow this mmap to fail if the cache is too large, in which
> case you can gracefully degrade to cacheless mode of operation, but that
> can probably be left to 47th round.
>
>> +int write_cached_sha1_map(struct cached_sha1_map *cache)
>> +{
>> +     union cached_sha1_map_header header;
>> +     struct lock_file update_lock;
>> +     int fd;
>> +     size_t entry_size;
>> +
>> +     if (!cache->initialized || !cache->dirty)
>> +             return 0;
>> +
>> +     fd = hold_lock_file_for_update(&update_lock,
>> +                     git_path(cache->filename), 0);
>> +
>> +     if (fd < 0)
>> +             return error("could not construct %s", cache->filename);
>
> Use a "const char *" to hold git_path(cache->filename) upfront in the
> function, use it to obtain lock _and_ for reporting.
>
>> +     memcpy(header.signature, signature, 4);
>> +     header.count = cache->count;
>> +     header.size = cache->size;
>
> And here will be htonl().
>
>> +     entry_size = sizeof(struct cached_sha1_entry) * cache->size;
>
> Typically "entry_size" means the size of individual entry; this is the
> size of the whole thing.
>
>> +     if (write_in_full(fd, &header, sizeof(header)) != sizeof(header)
>> +             || write_in_full(fd, cache->entries, entry_size) != entry_size)
>> +             return error("could not write %s", cache->filename);
>> +
>> +     if (commit_lock_file(&update_lock) < 0)
>> +             return error("could not write %s", cache->filename);
>> +
>> +     cache->dirty = 0;
>> +     return 0;
>> +}
>
> But it is good that you used this intermediate variable; the above
> write_in_full() is much easier to read than the xmmap() above at the end
> of init_cached_sha1_map() function.
>
>> +static size_t get_hash_index(const unsigned char *sha1)
>> +{
>> +     return ntohl(*(size_t*)sha1);
>> +}
>
> Two issues:
>
>  - I do not see any guarantee that sha1 is suitably aligned for reading
>   size_t bytes off of;
>
>  - size_t is architecture dependent, so you would get different hash value
>   depending on the architecture, which again makes this file format
>   unportable.
>
>> diff --git a/patch-ids.c b/patch-ids.c
>> index 3be5d31..36332f3 100644
>> --- a/patch-ids.c
>> +++ b/patch-ids.c
>> @@ -2,17 +2,31 @@
>>  #include "diff.h"
>>  #include "commit.h"
>>  #include "patch-ids.h"
>> +#include "cached-sha1-map.h"
>> +
>> +struct cached_sha1_map patch_id_cache;
>
> Does this have to be extern?
>
>>  static int commit_patch_id(struct commit *commit, struct diff_options *options,
>>                   unsigned char *sha1)
>>  {
>> +     /* pull patch-id out of the cache if possible */
>> +     patch_id_cache.filename = "patch-id-cache";
>> +     if (!get_cached_sha1_entry(&patch_id_cache, commit->object.sha1, sha1))
>> +             return 0;
>> +
>>       if (commit->parents)
>>               diff_tree_sha1(commit->parents->item->object.sha1,
>>                              commit->object.sha1, "", options);
>>       else
>>               diff_root_tree_sha1(commit->object.sha1, "", options);
>>       diffcore_std(options);
>> -     return diff_flush_patch_id(options, sha1);
>> +     int ret = diff_flush_patch_id(options, sha1);
>
> Decl-after-statement.
>
>> +     if (ret)
>> +             return ret;
>> +
>> +     /* record commit, patch-id pair in cache */
>> +     set_cached_sha1_entry(&patch_id_cache, commit->object.sha1, sha1);
>> +     return 0;
>>  }

Thanks.  I'll fix these in the next few days.

Should I rewrite the patch sequence to incorporate these changes into
the first commit, or add them as a forth commit off the end?

Geoffrey

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/3] cherry: cache patch-ids to avoid repeating work
  2008-07-09  5:26   ` Geoffrey Irving
@ 2008-07-09  6:24     ` Junio C Hamano
  2008-07-09 12:18       ` Johannes Schindelin
  2008-07-10  3:34       ` [PATCH] " Geoffrey Irving
  0 siblings, 2 replies; 21+ messages in thread
From: Junio C Hamano @ 2008-07-09  6:24 UTC (permalink / raw)
  To: Geoffrey Irving; +Cc: Johannes Schindelin, git@vger.kernel.org

"Geoffrey Irving" <irving@naml.us> writes:

> On Tue, Jul 8, 2008 at 10:14 PM, Junio C Hamano <gitster@pobox.com> wrote:
> ...
>>>  }

Please don't quote the whole thing without trimming if you do not have any
interspersed comments/responses to quoted part.

> Should I rewrite the patch sequence to incorporate these changes into
> the first commit, or add them as a forth commit off the end?

I strongly encourage the latter.  We try not to keep early mistakes in the
history (see my comments on your [2/3]).

It is not unusal for any sizeable new code to go through a few round of
review cycle without even queued to 'pu', and the general rule is until
the series hits 'next', it is either "rejected (dropped on the floor),
please resend an improved version" or "ok now it is good, will queue".
After queued in 'next', improvements will continue incrementally.

Think of this procedure as giving a chance for you to hide early
embarrassment under the rug ;-)

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/3] cherry: cache patch-ids to avoid repeating work
  2008-07-09  6:24     ` Junio C Hamano
@ 2008-07-09 12:18       ` Johannes Schindelin
  2008-07-10  3:34       ` [PATCH] " Geoffrey Irving
  1 sibling, 0 replies; 21+ messages in thread
From: Johannes Schindelin @ 2008-07-09 12:18 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Geoffrey Irving, git@vger.kernel.org

Hi,

On Tue, 8 Jul 2008, Junio C Hamano wrote:

> Think of this procedure as giving a chance for you to hide early 
> embarrassment under the rug ;-)

Further, as Shawn pointed out to me (and you will all be able to hear it 
for yourselves soon), these patch iterations give you the chance to apply 
all the wisdom of the combined developers on this list to your patch, and 
in the end put your name on it :-)

Ciao,
Dscho

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH] cherry: cache patch-ids to avoid repeating work
  2008-07-09  6:24     ` Junio C Hamano
  2008-07-09 12:18       ` Johannes Schindelin
@ 2008-07-10  3:34       ` Geoffrey Irving
  2008-07-10 14:09         ` Geoffrey Irving
  1 sibling, 1 reply; 21+ messages in thread
From: Geoffrey Irving @ 2008-07-10  3:34 UTC (permalink / raw)
  To: Junio C Hamano, Johannes Schindelin, git@vger.kernel.org

Add cached-sha-map.[ch] implementing a persistent hash map from sha1 to
sha1.  The map is read with mmap, and completely rewritten if any entries
change.  It would be good to add incremental update to handle the usual case
where only a few entries change.

This structure is used by patch-ids.c to cache the mapping from commit to
patch-id into $GIT_DIR/patch-id-cache.  In the one case I've tested so far,
this speeds up the second invocation of git-cherry by two orders of
magnitude.  The caching can be disabled by setting cherry.cachepatchids to
false.

Original code cannibalized from Johannes Schindelin's notes-index structure.

Signed-off-by: Geoffrey Irving <irving@naml.us>
---

Note: there are at least two "holes" in this code.  First, it is impossible
to verify the validity of the entries (this is impossible to fix).  Second,
it is possible to write a malicious patch-id-cache file that causes git-cherry
to go into an infinite loop.  Fixing the loop requires either traversing every
entry on load (bad) or adding a second loop termination condition to
find_helper.  Since looping forever is better than returning incorrect
results, I figured fixing the weaker hole would just result in a false sense
of security.

I'll await the next round of comments. :)

 Documentation/config.txt |    5 +
 Makefile                 |    2 +
 builtin-log.c            |   12 ++
 cached-sha1-map.c        |  269 ++++++++++++++++++++++++++++++++++++++++++++++
 cached-sha1-map.h        |   45 ++++++++
 patch-ids.c              |   26 +++++-
 patch-ids.h              |    2 +
 7 files changed, 360 insertions(+), 1 deletions(-)
 create mode 100644 cached-sha1-map.c
 create mode 100644 cached-sha1-map.h

diff --git a/Documentation/config.txt b/Documentation/config.txt
index 838794d..02b8113 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -468,6 +468,11 @@ browser.<tool>.path::
 	browse HTML help (see '-w' option in linkgit:git-help[1]) or a
 	working repository in gitweb (see linkgit:git-instaweb[1]).

+cherry.cachepatchids::
+	If true, linkgit:git-cherry will store a cache of computed patch-ids
+	in $GIT_DIR/patch-id-cache in order to make repeated invocations faster.
+	Defaults to true.
+
 clean.requireForce::
 	A boolean to make git-clean do nothing unless given -f
 	or -n.   Defaults to true.
diff --git a/Makefile b/Makefile
index 4796565..f7360e1 100644
--- a/Makefile
+++ b/Makefile
@@ -356,6 +356,7 @@ LIB_H += pack-refs.h
 LIB_H += pack-revindex.h
 LIB_H += parse-options.h
 LIB_H += patch-ids.h
+LIB_H += cached-sha1-map.h
 LIB_H += path-list.h
 LIB_H += pkt-line.h
 LIB_H += progress.h
@@ -436,6 +437,7 @@ LIB_OBJS += pager.o
 LIB_OBJS += parse-options.o
 LIB_OBJS += patch-delta.o
 LIB_OBJS += patch-ids.o
+LIB_OBJS += cached-sha1-map.o
 LIB_OBJS += path-list.o
 LIB_OBJS += path.o
 LIB_OBJS += pkt-line.o
diff --git a/builtin-log.c b/builtin-log.c
index 430d876..fbfefbd 100644
--- a/builtin-log.c
+++ b/builtin-log.c
@@ -1081,6 +1081,16 @@ static int add_pending_commit(const char *arg,
struct rev_info *revs, int flags)
 	return -1;
 }

+static int git_cherry_config(const char *var, const char *value, void *cb)
+{
+	if (!strcmp(var, "cherry.cachepatchids")) {
+		cache_patch_ids = git_config_bool(var, value);
+		return 0;
+	}
+
+	return 0;
+}
+
 static const char cherry_usage[] =
 "git-cherry [-v] <upstream> [<head>] [<limit>]";
 int cmd_cherry(int argc, const char **argv, const char *prefix)
@@ -1094,6 +1104,8 @@ int cmd_cherry(int argc, const char **argv,
const char *prefix)
 	const char *limit = NULL;
 	int verbose = 0;

+	git_config(git_cherry_config, NULL);
+
 	if (argc > 1 && !strcmp(argv[1], "-v")) {
 		verbose = 1;
 		argc--;
diff --git a/cached-sha1-map.c b/cached-sha1-map.c
new file mode 100644
index 0000000..3ac5474
--- /dev/null
+++ b/cached-sha1-map.c
@@ -0,0 +1,269 @@
+#include "cached-sha1-map.h"
+
+union cached_sha1_map_header {
+	struct {
+		char signature[4]; /* CS1M */
+		uint32_t version;
+		uint32_t count;
+		uint32_t size;
+	} u;
+	struct cached_sha1_entry pad; /* pad header out to 40 bytes */
+};
+
+static const char *signature = "CS1M";
+static const uint32_t version = 1;
+
+static int init_empty_map(struct cached_sha1_map *cache, uint32_t size)
+{
+	cache->count = 0;
+	cache->size = size;
+	cache->initialized = 1;
+	cache->mmapped = 0;
+	cache->dirty = 1;
+
+	cache->entries = calloc(size, sizeof(struct cached_sha1_entry));
+	if (!cache->entries) {
+		warning("failed to allocate empty map of size %"PRIu32" for %s",
+			size, git_path(cache->filename));
+		cache->size = 0;
+		cache->dirty = 0;
+		return -1;
+	}
+	return 0;
+}
+
+static int grow_map(struct cached_sha1_map *cache)
+{
+	struct cached_sha1_map new_cache;
+	uint32_t i;
+
+	if (cache->size * 2 == 0) {
+		warning("%s overflowed, so resetting to empty",
+			git_path(cache->filename));
+		return init_empty_map(cache, 64);
+	}
+
+	/* allocate cache with twice the size */
+	new_cache.filename = cache->filename;
+	if (init_empty_map(&new_cache, cache->size * 2)) {
+		warning("failed to grow %s to size %"PRIu32,
+			git_path(cache->filename), cache->size * 2);
+		return init_empty_map(cache, 64);
+	}
+
+	/* reinsert all entries */
+ 	for (i = 0; i < cache->size; i++)
+		if (!is_null_sha1(cache->entries[i].key))
+			set_cached_sha1_entry(&new_cache,
+				cache->entries[i].key, cache->entries[i].value);
+	/* finish */
+	free_cached_sha1_map(cache);
+	*cache = new_cache;
+	return 0;
+}
+
+/* Any errors that occur result in the cache being initialized to empty */
+static int init_cached_sha1_map(struct cached_sha1_map *cache)
+{
+	int fd;
+	union cached_sha1_map_header header;
+	const char *filename;
+	size_t map_size;
+
+	if (cache->initialized)
+		return cache->size ? 0 : -1;
+
+	filename = git_path(cache->filename);
+	fd = open(filename, O_RDONLY);
+	if (fd < 0) {
+		if (errno != ENOENT)
+			warning("failed to read '%s': %s", filename,
+				strerror(errno));
+		goto empty;
+	}
+
+	if (read_in_full(fd, &header, sizeof(header)) != sizeof(header))
+	{
+		warning("cannot read %s header", filename);
+		goto empty;
+	}
+
+	if (memcmp(header.u.signature, signature, 4))
+	{
+		warning("%s has invalid header", filename);
+		goto empty;
+	}
+
+	if (ntohl(header.u.version) != version)
+	{
+		warning("%s has unrecognized version %"PRIu32, filename,
+			ntohl(header.u.version));
+		goto empty;
+	}
+
+	cache->count = ntohl(header.u.count);
+	cache->size = ntohl(header.u.size);
+
+	if (cache->size & (cache->size-1))
+	{
+		warning("%s is corrupt: size %"PRIu32" is not a power of two",
+			filename, cache->size);
+		goto empty;
+	}
+
+	if (cache->count >= cache->size)
+	{
+		warning("%s is corrupt: count %"PRIu32" >= size %"PRIu32,
+			filename, cache->count, cache->size);
+		goto empty;
+	}
+
+	cache->dirty = 0;
+	cache->initialized = 1;
+	cache->mmapped = 1;
+
+	/* mmap entire file so that file / memory blocks are aligned */
+	map_size = sizeof(struct cached_sha1_entry) * (cache->size + 1);
+	cache->entries = mmap(NULL, map_size,
+		PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
+	if (cache->entries == MAP_FAILED) {
+		/* this is just a cache, so don't free pack memory and retry */
+		warning("%s mmap failed: %s", filename, strerror(errno));
+		goto empty;
+	}
+	cache->entries += 1; /* skip header */
+	return 0;
+
+empty:
+	if (fd >= 0)
+		close(fd);
+	return init_empty_map(cache, 64);
+}
+
+int write_cached_sha1_map(struct cached_sha1_map *cache)
+{
+	union cached_sha1_map_header header;
+	struct lock_file update_lock;
+	int fd;
+	size_t map_size;
+	const char *filename;
+
+	if (!cache->initialized || !cache->dirty)
+		return 0;
+
+	filename = git_path(cache->filename);
+	fd = hold_lock_file_for_update(&update_lock, filename, 0);
+
+	if (fd < 0)
+	{
+		warning("could not construct %s", filename);
+		return -1;
+	}
+
+	memcpy(header.u.signature, signature, 4);
+	header.u.version = htonl(version);
+	header.u.count = htonl(cache->count);
+	header.u.size = htonl(cache->size);
+	map_size = sizeof(struct cached_sha1_entry) * cache->size;
+	if (write_in_full(fd, &header, sizeof(header)) != sizeof(header)
+		|| write_in_full(fd, cache->entries, map_size) != map_size)
+	{
+		warning("could not write %s", filename);
+		return -1;
+	}
+
+	if (commit_lock_file(&update_lock) < 0)
+	{
+		warning("could not write %s", filename);
+		return -1;
+	}
+
+	cache->dirty = 0;
+	return 0;
+}
+
+void free_cached_sha1_map(struct cached_sha1_map *cache)
+{
+	if (!cache->initialized)
+		return;
+
+	if (cache->mmapped)
+		munmap(cache->entries - 1,
+			sizeof(struct cached_sha1_entry) * (cache->size + 1));
+	else
+		free(cache->entries);
+}
+
+/* The fact that size is a power of two means count-1 <= INT32_MAX, so it
+ * is safe to return signed integers here. */
+static int32_t get_hash_index(const unsigned char *sha1)
+{
+	/* this is alignment safe since 40 is a multiple of 4 */
+	return ntohl(*(uint32_t*)sha1);
+}
+
+/*
+ * Returns the index if the entry exists, and the complemented index of
+ * the next free entry otherwise.
+ */
+static int32_t find_helper(struct cached_sha1_map *cache,
+	const unsigned char *key)
+{
+	int32_t i, mask;
+
+	mask = cache->size - 1;
+
+	for (i = get_hash_index(key) & mask; ; i = (i+1) & mask) {
+		if (!hashcmp(key, cache->entries[i].key))
+			return i;
+		else if (is_null_sha1(cache->entries[i].key))
+			return ~i;
+	}
+}
+
+int get_cached_sha1_entry(struct cached_sha1_map *cache,
+	const unsigned char *key, unsigned char *value)
+{
+	int32_t i;
+
+	if (init_cached_sha1_map(cache))
+		return -1;
+
+	i = find_helper(cache, key);
+	if(i < 0)
+		return -1;
+
+	/* entry found, return value */
+	hashcpy(value, cache->entries[i].value);
+	return 0;
+}
+
+int set_cached_sha1_entry(struct cached_sha1_map *cache,
+	const unsigned char *key, const unsigned char *value)
+{
+	int32_t i;
+	struct cached_sha1_entry *entry;
+
+	if (init_cached_sha1_map(cache))
+		return -1;
+
+	i = find_helper(cache, key);
+
+	if (i < 0) { /* write new entry */
+		entry = cache->entries + ~i;
+		hashcpy(entry->key, key);
+		hashcpy(entry->value, value);
+		cache->count++;
+		cache->dirty = 1;
+	} else { /* overwrite existing entry */
+		entry = cache->entries + i;
+		if (hashcmp(value, entry->value)) {
+			hashcpy(entry->value, value);
+			cache->dirty = 1;
+		}
+	}
+
+	if (4*cache->count >= 3*cache->size)
+		return grow_map(cache);
+	return 0;
+}
diff --git a/cached-sha1-map.h b/cached-sha1-map.h
new file mode 100644
index 0000000..296c17c
--- /dev/null
+++ b/cached-sha1-map.h
@@ -0,0 +1,45 @@
+#ifndef CACHED_SHA1_MAP_H
+#define CACHED_SHA1_MAP_H
+
+#include "cache.h"
+
+/*
+ * A cached-sha1-map is a file storing a hash map from sha1 to sha1.
+ *
+ * The file is mmap'ed, updated in memory during operation, and flushed
+ * back to disk when freed.  Currently the entire file is rewritten for
+ * any change.  This could be a significant bottleneck for common uses,
+ * so it would be good to fix this later if possible.
+ *
+ * The performance of a hash map depends highly on a good hashing
+ * algorithm, to avoid collisions.  Lucky us!  SHA-1 is a pretty good
+ * hashing algorithm.
+ */
+
+struct cached_sha1_entry {
+	unsigned char key[20];
+	unsigned char value[20];
+};
+
+struct cached_sha1_map {
+	const char *filename; /* relative to GIT_DIR */
+
+	/* rest is for internal use */
+	uint32_t count, size;
+	unsigned int initialized : 1;
+	unsigned int dirty : 1;
+	unsigned int mmapped : 1;
+	struct cached_sha1_entry *entries; /* pointer to mmap'ed memory + 1 */
+};
+
+extern int get_cached_sha1_entry(struct cached_sha1_map *cache,
+	const unsigned char *key,unsigned char *value);
+
+extern int set_cached_sha1_entry(struct cached_sha1_map *cache,
+	const unsigned char *key, const unsigned char *value);
+
+extern int write_cached_sha1_map(struct cached_sha1_map *cache);
+
+extern void free_cached_sha1_map(struct cached_sha1_map *cache);
+
+#endif
diff --git a/patch-ids.c b/patch-ids.c
index 3be5d31..663ffee 100644
--- a/patch-ids.c
+++ b/patch-ids.c
@@ -2,17 +2,36 @@
 #include "diff.h"
 #include "commit.h"
 #include "patch-ids.h"
+#include "cached-sha1-map.h"
+
+int cache_patch_ids = 1;
+static struct cached_sha1_map patch_id_cache;

 static int commit_patch_id(struct commit *commit, struct diff_options *options,
 		    unsigned char *sha1)
 {
+	int ret;
+
+	/* pull patch-id out of the cache if possible */
+	patch_id_cache.filename = "patch-id-cache";
+	if (cache_patch_ids && !get_cached_sha1_entry(&patch_id_cache,
+			commit->object.sha1, sha1))
+		return 0;
+
 	if (commit->parents)
 		diff_tree_sha1(commit->parents->item->object.sha1,
 		               commit->object.sha1, "", options);
 	else
 		diff_root_tree_sha1(commit->object.sha1, "", options);
 	diffcore_std(options);
-	return diff_flush_patch_id(options, sha1);
+	ret = diff_flush_patch_id(options, sha1);
+	if (ret)
+		return ret;
+
+	/* record commit, patch-id pair in cache */
+	if (cache_patch_ids)
+		set_cached_sha1_entry(&patch_id_cache, commit->object.sha1, sha1);
+	return 0;
 }

 static uint32_t take2(const unsigned char *id)
@@ -136,6 +155,11 @@ int free_patch_ids(struct patch_ids *ids)
 		next = patches->next;
 		free(patches);
 	}
+
+	/* write cached patch-ids and ignore any errors that arise
+	 * (e.g. if the repository is write protected) */
+	if (cache_patch_ids)
+		write_cached_sha1_map(&patch_id_cache);
 	return 0;
 }

diff --git a/patch-ids.h b/patch-ids.h
index c8c7ca1..c0ebdc1 100644
--- a/patch-ids.h
+++ b/patch-ids.h
@@ -18,4 +18,6 @@ int free_patch_ids(struct patch_ids *);
 struct patch_id *add_commit_patch_id(struct commit *, struct patch_ids *);
 struct patch_id *has_commit_patch_id(struct commit *, struct patch_ids *);

+extern int cache_patch_ids;
+
 #endif /* PATCH_IDS_H */
-- 
1.5.6.2.256.g47cb9.dirty

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH] cherry: cache patch-ids to avoid repeating work
  2008-07-10  3:34       ` [PATCH] " Geoffrey Irving
@ 2008-07-10 14:09         ` Geoffrey Irving
  2008-07-10 14:28           ` Johannes Schindelin
  0 siblings, 1 reply; 21+ messages in thread
From: Geoffrey Irving @ 2008-07-10 14:09 UTC (permalink / raw)
  To: Junio C Hamano, Johannes Schindelin, git@vger.kernel.org

On Wed, Jul 9, 2008 at 8:34 PM, Geoffrey Irving <irving@naml.us> wrote:
> Add cached-sha-map.[ch] implementing a persistent hash map from sha1 to
> sha1.  The map is read with mmap, and completely rewritten if any entries
> change.  It would be good to add incremental update to handle the usual case
> where only a few entries change.
>
> This structure is used by patch-ids.c to cache the mapping from commit to
> patch-id into $GIT_DIR/patch-id-cache.  In the one case I've tested so far,
> this speeds up the second invocation of git-cherry by two orders of
> magnitude.  The caching can be disabled by setting cherry.cachepatchids to
> false.
>
> Original code cannibalized from Johannes Schindelin's notes-index structure.
>
> Signed-off-by: Geoffrey Irving <irving@naml.us>
> ---
>
> Note: there are at least two "holes" in this code.  First, it is impossible
> to verify the validity of the entries (this is impossible to fix).  Second,
> it is possible to write a malicious patch-id-cache file that causes git-cherry
> to go into an infinite loop.  Fixing the loop requires either traversing every
> entry on load (bad) or adding a second loop termination condition to
> find_helper.  Since looping forever is better than returning incorrect
> results, I figured fixing the weaker hole would just result in a false sense
> of security.

Oops: avoiding the infinite loop only requires reading expected O(1)
entries on load, so I can fix that if you like.  It would only be all
of them if it actually did detect the infinite loop.

Geoffrey

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] cherry: cache patch-ids to avoid repeating work
  2008-07-10 14:09         ` Geoffrey Irving
@ 2008-07-10 14:28           ` Johannes Schindelin
  2008-07-10 14:33             ` Geoffrey Irving
  0 siblings, 1 reply; 21+ messages in thread
From: Johannes Schindelin @ 2008-07-10 14:28 UTC (permalink / raw)
  To: Geoffrey Irving; +Cc: Junio C Hamano, git@vger.kernel.org

Hi,

On Thu, 10 Jul 2008, Geoffrey Irving wrote:

> On Wed, Jul 9, 2008 at 8:34 PM, Geoffrey Irving <irving@naml.us> wrote:
>
> > Note: there are at least two "holes" in this code.  First, it is 
> > impossible to verify the validity of the entries (this is impossible 
> > to fix).  Second, it is possible to write a malicious patch-id-cache 
> > file that causes git-cherry to go into an infinite loop.  Fixing the 
> > loop requires either traversing every entry on load (bad) or adding a 
> > second loop termination condition to find_helper.  Since looping 
> > forever is better than returning incorrect results, I figured fixing 
> > the weaker hole would just result in a false sense of security.
> 
> Oops: avoiding the infinite loop only requires reading expected O(1) 
> entries on load, so I can fix that if you like.  It would only be all of 
> them if it actually did detect the infinite loop.

I have to admit that you lost me there.  AFAIR the patch-id cache is a 
simple commit->patch_id store, right?  Then there should be no way to get 
an infinite loop.

Besides, this is a purely local cache, no?  Never to be transmitted...  So 
not much chance of a malicious attack, except if you allow write access to 
your local repository, in which case you are endangered no matter what.

Ciao,
Dscho

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] cherry: cache patch-ids to avoid repeating work
  2008-07-10 14:28           ` Johannes Schindelin
@ 2008-07-10 14:33             ` Geoffrey Irving
  2008-07-10 15:56               ` Johannes Schindelin
  2008-07-11  6:54               ` Junio C Hamano
  0 siblings, 2 replies; 21+ messages in thread
From: Geoffrey Irving @ 2008-07-10 14:33 UTC (permalink / raw)
  To: Johannes Schindelin; +Cc: Junio C Hamano, git@vger.kernel.org

On Thu, Jul 10, 2008 at 7:28 AM, Johannes Schindelin
<Johannes.Schindelin@gmx.de> wrote:
> Hi,
>
> On Thu, 10 Jul 2008, Geoffrey Irving wrote:
>
>> On Wed, Jul 9, 2008 at 8:34 PM, Geoffrey Irving <irving@naml.us> wrote:
>>
>> > Note: there are at least two "holes" in this code.  First, it is
>> > impossible to verify the validity of the entries (this is impossible
>> > to fix).  Second, it is possible to write a malicious patch-id-cache
>> > file that causes git-cherry to go into an infinite loop.  Fixing the
>> > loop requires either traversing every entry on load (bad) or adding a
>> > second loop termination condition to find_helper.  Since looping
>> > forever is better than returning incorrect results, I figured fixing
>> > the weaker hole would just result in a false sense of security.
>>
>> Oops: avoiding the infinite loop only requires reading expected O(1)
>> entries on load, so I can fix that if you like.  It would only be all of
>> them if it actually did detect the infinite loop.
>
> I have to admit that you lost me there.  AFAIR the patch-id cache is a
> simple commit->patch_id store, right?  Then there should be no way to get
> an infinite loop.

If every entry is nonnull, find_helper loops forever.

> Besides, this is a purely local cache, no?  Never to be transmitted...  So
> not much chance of a malicious attack, except if you allow write access to
> your local repository, in which case you are endangered no matter what.

Yep, that's why it's only a hole in quotes, and why I didn't fix it.

Geoffrey

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] cherry: cache patch-ids to avoid repeating work
  2008-07-10 14:33             ` Geoffrey Irving
@ 2008-07-10 15:56               ` Johannes Schindelin
  2008-07-11  6:54               ` Junio C Hamano
  1 sibling, 0 replies; 21+ messages in thread
From: Johannes Schindelin @ 2008-07-10 15:56 UTC (permalink / raw)
  To: Geoffrey Irving; +Cc: Junio C Hamano, git@vger.kernel.org

Hi,

On Thu, 10 Jul 2008, Geoffrey Irving wrote:

> On Thu, Jul 10, 2008 at 7:28 AM, Johannes Schindelin
> <Johannes.Schindelin@gmx.de> wrote:
>
> > On Thu, 10 Jul 2008, Geoffrey Irving wrote:
> >
> >> On Wed, Jul 9, 2008 at 8:34 PM, Geoffrey Irving <irving@naml.us> wrote:
> >>
> >> > Note: there are at least two "holes" in this code.  First, it is 
> >> > impossible to verify the validity of the entries (this is 
> >> > impossible to fix).  Second, it is possible to write a malicious 
> >> > patch-id-cache file that causes git-cherry to go into an infinite 
> >> > loop.  Fixing the loop requires either traversing every entry on 
> >> > load (bad) or adding a second loop termination condition to 
> >> > find_helper.  Since looping forever is better than returning 
> >> > incorrect results, I figured fixing the weaker hole would just 
> >> > result in a false sense of security.
> >>
> >> Oops: avoiding the infinite loop only requires reading expected O(1) 
> >> entries on load, so I can fix that if you like.  It would only be all 
> >> of them if it actually did detect the infinite loop.
> >
> > I have to admit that you lost me there.  AFAIR the patch-id cache is a 
> > simple commit->patch_id store, right?  Then there should be no way to 
> > get an infinite loop.
> 
> If every entry is nonnull, find_helper loops forever.

Ah, that is because you did not use that part of my implementation.  My 
hash did not wrap.

> > Besides, this is a purely local cache, no?  Never to be transmitted...  
> > So not much chance of a malicious attack, except if you allow write 
> > access to your local repository, in which case you are endangered no 
> > matter what.
> 
> Yep, that's why it's only a hole in quotes, and why I didn't fix it.

Then it is not a hole.

Ciao,
Dscho

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] cherry: cache patch-ids to avoid repeating work
  2008-07-10 14:33             ` Geoffrey Irving
  2008-07-10 15:56               ` Johannes Schindelin
@ 2008-07-11  6:54               ` Junio C Hamano
  2008-07-11 14:58                 ` Geoffrey Irving
  1 sibling, 1 reply; 21+ messages in thread
From: Junio C Hamano @ 2008-07-11  6:54 UTC (permalink / raw)
  To: Geoffrey Irving; +Cc: Johannes Schindelin, Junio C Hamano, git@vger.kernel.org

"Geoffrey Irving" <irving@naml.us> writes:

>>> Oops: avoiding the infinite loop only requires reading expected O(1)
>>> entries on load, so I can fix that if you like.  It would only be all of
>>> them if it actually did detect the infinite loop.
>>
>> I have to admit that you lost me there.  AFAIR the patch-id cache is a
>> simple commit->patch_id store, right?  Then there should be no way to get
>> an infinite loop.
>
> If every entry is nonnull, find_helper loops forever.

Isn't it sufficient to make this part check the condition as well?

+	if (cache->count >= cache->size)
+	{
+		warning("%s is corrupt: count %"PRIu32" >= size %"PRIu32,
+			filename, cache->count, cache->size);
+		goto empty;
+	}

At runtime you keep the invariants that hashtable always has at most 3/4
full and whoever wrote the file you are reading must have honored that as
well, or there is something fishy going on.

>> Besides, this is a purely local cache, no?  Never to be transmitted...  So
>> not much chance of a malicious attack, except if you allow write access to
>> your local repository, in which case you are endangered no matter what.
>
> Yep, that's why it's only a hole in quotes, and why I didn't fix it.

You might want to protect yourself against file corruption, though.
Checksumming the whole file and checking it at opening defeats the point
of mmap'ed access, but at least the header may want to be checksummed?

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] cherry: cache patch-ids to avoid repeating work
  2008-07-11  6:54               ` Junio C Hamano
@ 2008-07-11 14:58                 ` Geoffrey Irving
  2008-07-11 15:36                   ` Johannes Schindelin
  0 siblings, 1 reply; 21+ messages in thread
From: Geoffrey Irving @ 2008-07-11 14:58 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Johannes Schindelin, git@vger.kernel.org

On Thu, Jul 10, 2008 at 11:54 PM, Junio C Hamano <gitster@pobox.com> wrote:
> "Geoffrey Irving" <irving@naml.us> writes:
>
>>>> Oops: avoiding the infinite loop only requires reading expected O(1)
>>>> entries on load, so I can fix that if you like.  It would only be all of
>>>> them if it actually did detect the infinite loop.
>>>
>>> I have to admit that you lost me there.  AFAIR the patch-id cache is a
>>> simple commit->patch_id store, right?  Then there should be no way to get
>>> an infinite loop.
>>
>> If every entry is nonnull, find_helper loops forever.
>
> Isn't it sufficient to make this part check the condition as well?
>
> +       if (cache->count >= cache->size)
> +       {
> +               warning("%s is corrupt: count %"PRIu32" >= size %"PRIu32,
> +                       filename, cache->count, cache->size);
> +               goto empty;
> +       }
>
> At runtime you keep the invariants that hashtable always has at most 3/4
> full and whoever wrote the file you are reading must have honored that as
> well, or there is something fishy going on.

Good point.  There's no reason not to check the 3/4 condition.  It
isn't sufficient to avoid the infinite loop, though, since we don't
verify that count is accurate.

Another route would to eliminate the count field entirely, and replace
the count >= size/4*3 check with a statistical one based on the
entries seen so far.  The main advantage of that would be to make
incremental writes simpler by avoiding the need to update the header.
The disadvantage is that there would be a small chance that the map
would grow in size despite being almost empty.  Thoughts on whether we
should do that?

>>> Besides, this is a purely local cache, no?  Never to be transmitted...  So
>>> not much chance of a malicious attack, except if you allow write access to
>>> your local repository, in which case you are endangered no matter what.
>>
>> Yep, that's why it's only a hole in quotes, and why I didn't fix it.
>
> You might want to protect yourself against file corruption, though.
> Checksumming the whole file and checking it at opening defeats the point
> of mmap'ed access, but at least the header may want to be checksummed?

Okay.  I imagine I should use sha1 as the sum?

Geoffrey

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] cherry: cache patch-ids to avoid repeating work
  2008-07-11 14:58                 ` Geoffrey Irving
@ 2008-07-11 15:36                   ` Johannes Schindelin
  2008-07-11 15:41                     ` Geoffrey Irving
  0 siblings, 1 reply; 21+ messages in thread
From: Johannes Schindelin @ 2008-07-11 15:36 UTC (permalink / raw)
  To: Geoffrey Irving; +Cc: Junio C Hamano, git@vger.kernel.org

Hi,

On Fri, 11 Jul 2008, Geoffrey Irving wrote:

> On Thu, Jul 10, 2008 at 11:54 PM, Junio C Hamano <gitster@pobox.com> wrote:
> > "Geoffrey Irving" <irving@naml.us> writes:
> >
> >>>> Oops: avoiding the infinite loop only requires reading expected O(1)
> >>>> entries on load, so I can fix that if you like.  It would only be all of
> >>>> them if it actually did detect the infinite loop.
> >>>
> >>> I have to admit that you lost me there.  AFAIR the patch-id cache is a
> >>> simple commit->patch_id store, right?  Then there should be no way to get
> >>> an infinite loop.
> >>
> >> If every entry is nonnull, find_helper loops forever.
> >
> > Isn't it sufficient to make this part check the condition as well?
> >
> > +       if (cache->count >= cache->size)
> > +       {
> > +               warning("%s is corrupt: count %"PRIu32" >= size %"PRIu32,
> > +                       filename, cache->count, cache->size);
> > +               goto empty;
> > +       }
> >
> > At runtime you keep the invariants that hashtable always has at most 3/4
> > full and whoever wrote the file you are reading must have honored that as
> > well, or there is something fishy going on.
> 
> Good point.  There's no reason not to check the 3/4 condition.  It isn't 
> sufficient to avoid the infinite loop, though, since we don't verify 
> that count is accurate.

Why so complicated?  I mean, you can count in that "infinite" loop 
yourself, no?

Ciao,
Dscho

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] cherry: cache patch-ids to avoid repeating work
  2008-07-11 15:36                   ` Johannes Schindelin
@ 2008-07-11 15:41                     ` Geoffrey Irving
  2008-07-11 15:48                       ` Johannes Schindelin
  0 siblings, 1 reply; 21+ messages in thread
From: Geoffrey Irving @ 2008-07-11 15:41 UTC (permalink / raw)
  To: Johannes Schindelin; +Cc: Junio C Hamano, git@vger.kernel.org

On Fri, Jul 11, 2008 at 8:36 AM, Johannes Schindelin
<Johannes.Schindelin@gmx.de> wrote:
> Hi,
>
> On Fri, 11 Jul 2008, Geoffrey Irving wrote:
>
>> On Thu, Jul 10, 2008 at 11:54 PM, Junio C Hamano <gitster@pobox.com> wrote:
>> > "Geoffrey Irving" <irving@naml.us> writes:
>> >
>> >>>> Oops: avoiding the infinite loop only requires reading expected O(1)
>> >>>> entries on load, so I can fix that if you like.  It would only be all of
>> >>>> them if it actually did detect the infinite loop.
>> >>>
>> >>> I have to admit that you lost me there.  AFAIR the patch-id cache is a
>> >>> simple commit->patch_id store, right?  Then there should be no way to get
>> >>> an infinite loop.
>> >>
>> >> If every entry is nonnull, find_helper loops forever.
>> >
>> > Isn't it sufficient to make this part check the condition as well?
>> >
>> > +       if (cache->count >= cache->size)
>> > +       {
>> > +               warning("%s is corrupt: count %"PRIu32" >= size %"PRIu32,
>> > +                       filename, cache->count, cache->size);
>> > +               goto empty;
>> > +       }
>> >
>> > At runtime you keep the invariants that hashtable always has at most 3/4
>> > full and whoever wrote the file you are reading must have honored that as
>> > well, or there is something fishy going on.
>>
>> Good point.  There's no reason not to check the 3/4 condition.  It isn't
>> sufficient to avoid the infinite loop, though, since we don't verify
>> that count is accurate.
>
> Why so complicated?  I mean, you can count in that "infinite" loop
> yourself, no?

Yeah, I was just trying to avoid the extra termination condition,
because I don't think it adds any real safety.

Geoffrey

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] cherry: cache patch-ids to avoid repeating work
  2008-07-11 15:41                     ` Geoffrey Irving
@ 2008-07-11 15:48                       ` Johannes Schindelin
       [not found]                         ` <7vej60jln6.fsf@gitster.siamese.dyndns.org>
  0 siblings, 1 reply; 21+ messages in thread
From: Johannes Schindelin @ 2008-07-11 15:48 UTC (permalink / raw)
  To: Geoffrey Irving; +Cc: Junio C Hamano, git@vger.kernel.org

Hi,

On Fri, 11 Jul 2008, Geoffrey Irving wrote:

> On Fri, Jul 11, 2008 at 8:36 AM, Johannes Schindelin
> <Johannes.Schindelin@gmx.de> wrote:
>
> > Why so complicated?  I mean, you can count in that "infinite" loop 
> > yourself, no?
> 
> Yeah, I was just trying to avoid the extra termination condition, 
> because I don't think it adds any real safety.

Sorry.  You absolutely lost me.  While doing the loop over the entries, 
trying to find an entry, adding a counter, and exiting when the counter 
reaches the total number of slots does not add any real safety?

Puzzled,
Dscho

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] cherry: cache patch-ids to avoid repeating work
       [not found]                         ` <7vej60jln6.fsf@gitster.siamese.dyndns.org>
@ 2008-07-13  3:14                           ` Geoffrey Irving
  2008-07-15 16:57                             ` Geoffrey Irving
  0 siblings, 1 reply; 21+ messages in thread
From: Geoffrey Irving @ 2008-07-13  3:14 UTC (permalink / raw)
  To: Junio C Hamano, git@vger.kernel.org; +Cc: Johannes Schindelin

Add cached-sha-map.[ch] implementing a persistent hash map from sha1 to
sha1.  The map is read with mmap, and completely rewritten if any entries
change.  It would be good to add incremental update to handle the usual case
where only a few entries change.

This structure is used by patch-ids.c to cache the mapping from commit to
patch-id into $GIT_DIR/patch-id-cache.  In the one case I've tested so far,
this speeds up the second invocation of git-cherry by two orders of
magnitude.  The caching can be disabled by setting cherry.cachepatchids to
false.

Original code cannibalized from Johannes Schindelin's notes-index structure.

Signed-off-by: Geoffrey Irving <irving@naml.us>
---

Here's an updated version that avoids infinite loops and adds a sha1
checksum of the header.  It is still vastly more likely that this code
will return incorrect results due to disk corruption than that the old
version would infinite loop.  If we want to be even more paranoid, we
could add a checksum for every 511 entries, but I'm hoping that isn't
required. :)

Your version of the infinite loop avoidance didn't quite work, since
I'm already using every 32 bit return value in find_helper.

I also fixed the 4/3 check to not overflow.

 Documentation/config.txt |    5 +
 Makefile                 |    2 +
 builtin-log.c            |   12 ++
 cached-sha1-map.c        |  293 ++++++++++++++++++++++++++++++++++++++++++++++
 cached-sha1-map.h        |   45 +++++++
 patch-ids.c              |   26 ++++-
 patch-ids.h              |    2 +
 7 files changed, 384 insertions(+), 1 deletions(-)
 create mode 100644 cached-sha1-map.c
 create mode 100644 cached-sha1-map.h

diff --git a/Documentation/config.txt b/Documentation/config.txt
index 838794d..02b8113 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -468,6 +468,11 @@ browser.<tool>.path::
 	browse HTML help (see '-w' option in linkgit:git-help[1]) or a
 	working repository in gitweb (see linkgit:git-instaweb[1]).

+cherry.cachepatchids::
+	If true, linkgit:git-cherry will store a cache of computed patch-ids
+	in $GIT_DIR/patch-id-cache in order to make repeated invocations faster.
+	Defaults to true.
+
 clean.requireForce::
 	A boolean to make git-clean do nothing unless given -f
 	or -n.   Defaults to true.
diff --git a/Makefile b/Makefile
index 4796565..f7360e1 100644
--- a/Makefile
+++ b/Makefile
@@ -356,6 +356,7 @@ LIB_H += pack-refs.h
 LIB_H += pack-revindex.h
 LIB_H += parse-options.h
 LIB_H += patch-ids.h
+LIB_H += cached-sha1-map.h
 LIB_H += path-list.h
 LIB_H += pkt-line.h
 LIB_H += progress.h
@@ -436,6 +437,7 @@ LIB_OBJS += pager.o
 LIB_OBJS += parse-options.o
 LIB_OBJS += patch-delta.o
 LIB_OBJS += patch-ids.o
+LIB_OBJS += cached-sha1-map.o
 LIB_OBJS += path-list.o
 LIB_OBJS += path.o
 LIB_OBJS += pkt-line.o
diff --git a/builtin-log.c b/builtin-log.c
index 430d876..fbfefbd 100644
--- a/builtin-log.c
+++ b/builtin-log.c
@@ -1081,6 +1081,16 @@ static int add_pending_commit(const char *arg,
struct rev_info *revs, int flags)
 	return -1;
 }

+static int git_cherry_config(const char *var, const char *value, void *cb)
+{
+	if (!strcmp(var, "cherry.cachepatchids")) {
+		cache_patch_ids = git_config_bool(var, value);
+		return 0;
+	}
+
+	return 0;
+}
+
 static const char cherry_usage[] =
 "git-cherry [-v] <upstream> [<head>] [<limit>]";
 int cmd_cherry(int argc, const char **argv, const char *prefix)
@@ -1094,6 +1104,8 @@ int cmd_cherry(int argc, const char **argv,
const char *prefix)
 	const char *limit = NULL;
 	int verbose = 0;

+	git_config(git_cherry_config, NULL);
+
 	if (argc > 1 && !strcmp(argv[1], "-v")) {
 		verbose = 1;
 		argc--;
diff --git a/cached-sha1-map.c b/cached-sha1-map.c
new file mode 100644
index 0000000..9cf7252
--- /dev/null
+++ b/cached-sha1-map.c
@@ -0,0 +1,293 @@
+#include "cached-sha1-map.h"
+
+union cached_sha1_map_header {
+	struct {
+		char signature[4]; /* CS1M */
+		uint32_t version;
+		uint32_t count;
+		uint32_t size;
+		uint32_t pad; /* pad to 20 bytes */
+	} u;
+	/* pad header out to 40 bytes.  As a consistency
+	 * check, pad.value stores the sha1 of pad.key. */
+	struct cached_sha1_entry pad;
+};
+
+static const char *signature = "CS1M";
+static const uint32_t version = 1;
+
+static int init_empty_map(struct cached_sha1_map *cache, uint32_t size)
+{
+	cache->count = 0;
+	cache->size = size;
+	cache->initialized = 1;
+	cache->mmapped = 0;
+	cache->dirty = 1;
+
+	cache->entries = calloc(size, sizeof(struct cached_sha1_entry));
+	if (!cache->entries) {
+		warning("failed to allocate empty map of size %"PRIu32" for %s",
+			size, git_path(cache->filename));
+		cache->size = 0;
+		cache->dirty = 0;
+		return -1;
+	}
+	return 0;
+}
+
+static int grow_map(struct cached_sha1_map *cache)
+{
+	struct cached_sha1_map new_cache;
+	uint32_t i;
+
+	if (cache->size * 2 == 0) {
+		warning("%s overflowed, so resetting to empty",
+			git_path(cache->filename));
+		return init_empty_map(cache, 64);
+	}
+
+	/* allocate cache with twice the size */
+	new_cache.filename = cache->filename;
+	if (init_empty_map(&new_cache, cache->size * 2)) {
+		warning("failed to grow %s to size %"PRIu32,
+			git_path(cache->filename), cache->size * 2);
+		return init_empty_map(cache, 64);
+	}
+
+	/* reinsert all entries */
+ 	for (i = 0; i < cache->size; i++)
+		if (!is_null_sha1(cache->entries[i].key))
+			set_cached_sha1_entry(&new_cache,
+				cache->entries[i].key, cache->entries[i].value);
+	/* finish */
+	free_cached_sha1_map(cache);
+	*cache = new_cache;
+	return 0;
+}
+
+/* Any errors that occur result in the cache being initialized to empty */
+static int init_cached_sha1_map(struct cached_sha1_map *cache)
+{
+	int fd;
+	union cached_sha1_map_header header;
+	const char *filename;
+	size_t map_size;
+	SHA_CTX ctx;
+
+	if (cache->initialized)
+		return cache->size ? 0 : -1;
+
+	filename = git_path(cache->filename);
+	fd = open(filename, O_RDONLY);
+	if (fd < 0) {
+		if (errno != ENOENT)
+			warning("failed to read '%s': %s", filename,
+				strerror(errno));
+		goto empty;
+	}
+
+	if (read_in_full(fd, &header, sizeof(header)) != sizeof(header)) {
+		warning("cannot read %s header", filename);
+		goto empty;
+	}
+
+	if (memcmp(header.u.signature, signature, 4)) {
+		warning("%s has invalid header", filename);
+		goto empty;
+	}
+
+	if (ntohl(header.u.version) != version) {
+		warning("%s has unrecognized version %"PRIu32, filename,
+			ntohl(header.u.version));
+		goto empty;
+	}
+
+	cache->count = ntohl(header.u.count);
+	cache->size = ntohl(header.u.size);
+
+	if (cache->size & (cache->size-1)) {
+		warning("%s is corrupt: size %"PRIu32" is not a power of two",
+			filename, cache->size);
+		goto empty;
+	}
+
+	if (cache->count >= cache->size) {
+		warning("%s is corrupt: count %"PRIu32" >= size %"PRIu32,
+			filename, cache->count, cache->size);
+		goto empty;
+	}
+
+	SHA1_Init(&ctx);
+	SHA1_Update(&ctx, header.pad.key, 20);
+	SHA1_Final(header.pad.key, &ctx); /* reuse pad.key to store its sha1 */
+	if (hashcmp(header.pad.key, header.pad.value)) {
+		warning("%s header has invalid sha1", filename);
+		goto empty;
+	}
+
+	cache->dirty = 0;
+	cache->initialized = 1;
+	cache->mmapped = 1;
+
+	/* mmap entire file so that file / memory blocks are aligned */
+	map_size = sizeof(struct cached_sha1_entry) * (cache->size + 1);
+	cache->entries = mmap(NULL, map_size,
+		PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
+	if (cache->entries == MAP_FAILED) {
+		/* this is just a cache, so don't free pack memory and retry */
+		warning("%s mmap failed: %s", filename, strerror(errno));
+		goto empty;
+	}
+	cache->entries += 1; /* skip header */
+	return 0;
+
+empty:
+	if (fd >= 0)
+		close(fd);
+	return init_empty_map(cache, 64);
+}
+
+int write_cached_sha1_map(struct cached_sha1_map *cache)
+{
+	union cached_sha1_map_header header;
+	struct lock_file update_lock;
+	int fd;
+	size_t map_size;
+	const char *filename;
+	SHA_CTX ctx;
+
+	if (!cache->initialized || !cache->dirty)
+		return 0;
+
+	filename = git_path(cache->filename);
+	fd = hold_lock_file_for_update(&update_lock, filename, 0);
+
+	if (fd < 0)
+	{
+		warning("could not construct %s", filename);
+		return -1;
+	}
+
+	/* initialize header */
+	memcpy(header.u.signature, signature, 4);
+	header.u.version = htonl(version);
+	header.u.count = htonl(cache->count);
+	header.u.size = htonl(cache->size);
+	header.u.pad = 0; /* make header deterministic */
+
+	/* compute header sha1 */
+	SHA1_Init(&ctx);
+	SHA1_Update(&ctx, header.pad.key, 20);
+	SHA1_Final(header.pad.value, &ctx);
+
+	map_size = sizeof(struct cached_sha1_entry) * cache->size;
+	if (write_in_full(fd, &header, sizeof(header)) != sizeof(header)
+		|| write_in_full(fd, cache->entries, map_size) != map_size)
+	{
+		warning("could not write %s", filename);
+		return -1;
+	}
+
+	if (commit_lock_file(&update_lock) < 0)
+	{
+		warning("could not write %s", filename);
+		return -1;
+	}
+
+	cache->dirty = 0;
+	return 0;
+}
+
+void free_cached_sha1_map(struct cached_sha1_map *cache)
+{
+	if (!cache->initialized)
+		return;
+
+	if (cache->mmapped)
+		munmap(cache->entries - 1,
+			sizeof(struct cached_sha1_entry) * (cache->size + 1));
+	else
+		free(cache->entries);
+}
+
+/* The fact that size is a power of two means count-1 <= INT32_MAX, so it
+ * is safe to return signed integers here. */
+static int32_t get_hash_index(const unsigned char *sha1)
+{
+	/* this is alignment safe since 40 is a multiple of 4 */
+	return ntohl(*(uint32_t*)sha1);
+}
+
+/*
+ * Returns the index if the entry exists, and the complemented index of
+ * the next free entry otherwise.  If the hash is full, returns the
+ * complement of a nonfree entry and sets count = size (this happens
+ * only if the file is corrupt).
+ */
+static int32_t find_helper(struct cached_sha1_map *cache,
+	const unsigned char *key)
+{
+	int32_t i, mask, full;
+
+	mask = cache->size - 1;
+	i = get_hash_index(key) & mask;
+	full = (i-1) & mask;
+
+	for (; ; i = (i+1) & mask) {
+		if (!hashcmp(key, cache->entries[i].key))
+			return i;
+		else if (is_null_sha1(cache->entries[i].key) || i == full)
+			return ~i;
+		if (i == full) {
+			cache->count = cache->size; /* fix count */
+			return ~1;
+		}
+	}
+}
+
+int get_cached_sha1_entry(struct cached_sha1_map *cache,
+	const unsigned char *key, unsigned char *value)
+{
+	int32_t i;
+
+	if (init_cached_sha1_map(cache))
+		return -1;
+
+	i = find_helper(cache, key);
+	if(i < 0)
+		return -1;
+
+	/* entry found, return value */
+	hashcpy(value, cache->entries[i].value);
+	return 0;
+}
+
+int set_cached_sha1_entry(struct cached_sha1_map *cache,
+	const unsigned char *key, const unsigned char *value)
+{
+	int32_t i;
+	struct cached_sha1_entry *entry;
+
+	if (init_cached_sha1_map(cache))
+		return -1;
+
+	i = find_helper(cache, key);
+
+	if (i < 0) { /* write new entry */
+		entry = cache->entries + ~i;
+		hashcpy(entry->key, key);
+		hashcpy(entry->value, value);
+		cache->count++;
+		cache->dirty = 1;
+	} else { /* overwrite existing entry */
+		entry = cache->entries + i;
+		if (hashcmp(value, entry->value)) {
+			hashcpy(entry->value, value);
+			cache->dirty = 1;
+		}
+	}
+
+	if (cache->count >= cache->size/4*3)
+		return grow_map(cache);
+	return 0;
+}
diff --git a/cached-sha1-map.h b/cached-sha1-map.h
new file mode 100644
index 0000000..296c17c
--- /dev/null
+++ b/cached-sha1-map.h
@@ -0,0 +1,45 @@
+#ifndef CACHED_SHA1_MAP_H
+#define CACHED_SHA1_MAP_H
+
+#include "cache.h"
+
+/*
+ * A cached-sha1-map is a file storing a hash map from sha1 to sha1.
+ *
+ * The file is mmap'ed, updated in memory during operation, and flushed
+ * back to disk when freed.  Currently the entire file is rewritten for
+ * any change.  This could be a significant bottleneck for common uses,
+ * so it would be good to fix this later if possible.
+ *
+ * The performance of a hash map depends highly on a good hashing
+ * algorithm, to avoid collisions.  Lucky us!  SHA-1 is a pretty good
+ * hashing algorithm.
+ */
+
+struct cached_sha1_entry {
+	unsigned char key[20];
+	unsigned char value[20];
+};
+
+struct cached_sha1_map {
+	const char *filename; /* relative to GIT_DIR */
+
+	/* rest is for internal use */
+	uint32_t count, size;
+	unsigned int initialized : 1;
+	unsigned int dirty : 1;
+	unsigned int mmapped : 1;
+	struct cached_sha1_entry *entries; /* pointer to mmap'ed memory + 1 */
+};
+
+extern int get_cached_sha1_entry(struct cached_sha1_map *cache,
+	const unsigned char *key,unsigned char *value);
+
+extern int set_cached_sha1_entry(struct cached_sha1_map *cache,
+	const unsigned char *key, const unsigned char *value);
+
+extern int write_cached_sha1_map(struct cached_sha1_map *cache);
+
+extern void free_cached_sha1_map(struct cached_sha1_map *cache);
+
+#endif
diff --git a/patch-ids.c b/patch-ids.c
index 3be5d31..663ffee 100644
--- a/patch-ids.c
+++ b/patch-ids.c
@@ -2,17 +2,36 @@
 #include "diff.h"
 #include "commit.h"
 #include "patch-ids.h"
+#include "cached-sha1-map.h"
+
+int cache_patch_ids = 1;
+static struct cached_sha1_map patch_id_cache;

 static int commit_patch_id(struct commit *commit, struct diff_options *options,
 		    unsigned char *sha1)
 {
+	int ret;
+
+	/* pull patch-id out of the cache if possible */
+	patch_id_cache.filename = "patch-id-cache";
+	if (cache_patch_ids && !get_cached_sha1_entry(&patch_id_cache,
+			commit->object.sha1, sha1))
+		return 0;
+
 	if (commit->parents)
 		diff_tree_sha1(commit->parents->item->object.sha1,
 		               commit->object.sha1, "", options);
 	else
 		diff_root_tree_sha1(commit->object.sha1, "", options);
 	diffcore_std(options);
-	return diff_flush_patch_id(options, sha1);
+	ret = diff_flush_patch_id(options, sha1);
+	if (ret)
+		return ret;
+
+	/* record commit, patch-id pair in cache */
+	if (cache_patch_ids)
+		set_cached_sha1_entry(&patch_id_cache, commit->object.sha1, sha1);
+	return 0;
 }

 static uint32_t take2(const unsigned char *id)
@@ -136,6 +155,11 @@ int free_patch_ids(struct patch_ids *ids)
 		next = patches->next;
 		free(patches);
 	}
+
+	/* write cached patch-ids and ignore any errors that arise
+	 * (e.g. if the repository is write protected) */
+	if (cache_patch_ids)
+		write_cached_sha1_map(&patch_id_cache);
 	return 0;
 }

diff --git a/patch-ids.h b/patch-ids.h
index c8c7ca1..c0ebdc1 100644
--- a/patch-ids.h
+++ b/patch-ids.h
@@ -18,4 +18,6 @@ int free_patch_ids(struct patch_ids *);
 struct patch_id *add_commit_patch_id(struct commit *, struct patch_ids *);
 struct patch_id *has_commit_patch_id(struct commit *, struct patch_ids *);

+extern int cache_patch_ids;
+
 #endif /* PATCH_IDS_H */
-- 
1.5.6.2.256.g33ad.dirty

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH] cherry: cache patch-ids to avoid repeating work
  2008-07-13  3:14                           ` Geoffrey Irving
@ 2008-07-15 16:57                             ` Geoffrey Irving
  2008-07-15 21:52                               ` Johannes Schindelin
  0 siblings, 1 reply; 21+ messages in thread
From: Geoffrey Irving @ 2008-07-15 16:57 UTC (permalink / raw)
  To: Junio C Hamano, git@vger.kernel.org; +Cc: Johannes Schindelin

Add cached-sha-map.[ch] implementing a persistent hash map from sha1 to
sha1.  The map is read with mmap, and completely rewritten if any entries
change.  It would be good to add incremental update to handle the usual case
where only a few entries change.

This structure is used by patch-ids.c to cache the mapping from commit to
patch-id into $GIT_DIR/patch-id-cache.  In the one case I've tested so far,
this speeds up the second invocation of git-cherry by two orders of
magnitude.  The caching can be disabled by setting cherry.cachepatchids to
false.  Only patch-ids with default diff options are cached.

Original code cannibalized from Johannes Schindelin's notes-index structure.

Signed-off-by: Geoffrey Irving <irving@naml.us>
---

Here's a version of the patch fixing the test breakage.  Only
patch-ids.c differs from the previous version.

 Documentation/config.txt |    5 +
 Makefile                 |    2 +
 builtin-log.c            |   12 ++
 cached-sha1-map.c        |  293 ++++++++++++++++++++++++++++++++++++++++++++++
 cached-sha1-map.h        |   45 +++++++
 patch-ids.c              |   40 ++++++-
 patch-ids.h              |    2 +
 7 files changed, 398 insertions(+), 1 deletions(-)
 create mode 100644 cached-sha1-map.c
 create mode 100644 cached-sha1-map.h

diff --git a/Documentation/config.txt b/Documentation/config.txt
index 838794d..02b8113 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -468,6 +468,11 @@ browser.<tool>.path::
 	browse HTML help (see '-w' option in linkgit:git-help[1]) or a
 	working repository in gitweb (see linkgit:git-instaweb[1]).

+cherry.cachepatchids::
+	If true, linkgit:git-cherry will store a cache of computed patch-ids
+	in $GIT_DIR/patch-id-cache in order to make repeated invocations faster.
+	Defaults to true.
+
 clean.requireForce::
 	A boolean to make git-clean do nothing unless given -f
 	or -n.   Defaults to true.
diff --git a/Makefile b/Makefile
index 4796565..f7360e1 100644
--- a/Makefile
+++ b/Makefile
@@ -356,6 +356,7 @@ LIB_H += pack-refs.h
 LIB_H += pack-revindex.h
 LIB_H += parse-options.h
 LIB_H += patch-ids.h
+LIB_H += cached-sha1-map.h
 LIB_H += path-list.h
 LIB_H += pkt-line.h
 LIB_H += progress.h
@@ -436,6 +437,7 @@ LIB_OBJS += pager.o
 LIB_OBJS += parse-options.o
 LIB_OBJS += patch-delta.o
 LIB_OBJS += patch-ids.o
+LIB_OBJS += cached-sha1-map.o
 LIB_OBJS += path-list.o
 LIB_OBJS += path.o
 LIB_OBJS += pkt-line.o
diff --git a/builtin-log.c b/builtin-log.c
index 430d876..fbfefbd 100644
--- a/builtin-log.c
+++ b/builtin-log.c
@@ -1081,6 +1081,16 @@ static int add_pending_commit(const char *arg,
struct rev_info *revs, int flags)
 	return -1;
 }

+static int git_cherry_config(const char *var, const char *value, void *cb)
+{
+	if (!strcmp(var, "cherry.cachepatchids")) {
+		cache_patch_ids = git_config_bool(var, value);
+		return 0;
+	}
+
+	return 0;
+}
+
 static const char cherry_usage[] =
 "git-cherry [-v] <upstream> [<head>] [<limit>]";
 int cmd_cherry(int argc, const char **argv, const char *prefix)
@@ -1094,6 +1104,8 @@ int cmd_cherry(int argc, const char **argv,
const char *prefix)
 	const char *limit = NULL;
 	int verbose = 0;

+	git_config(git_cherry_config, NULL);
+
 	if (argc > 1 && !strcmp(argv[1], "-v")) {
 		verbose = 1;
 		argc--;
diff --git a/cached-sha1-map.c b/cached-sha1-map.c
new file mode 100644
index 0000000..9cf7252
--- /dev/null
+++ b/cached-sha1-map.c
@@ -0,0 +1,293 @@
+#include "cached-sha1-map.h"
+
+union cached_sha1_map_header {
+	struct {
+		char signature[4]; /* CS1M */
+		uint32_t version;
+		uint32_t count;
+		uint32_t size;
+		uint32_t pad; /* pad to 20 bytes */
+	} u;
+	/* pad header out to 40 bytes.  As a consistency
+	 * check, pad.value stores the sha1 of pad.key. */
+	struct cached_sha1_entry pad;
+};
+
+static const char *signature = "CS1M";
+static const uint32_t version = 1;
+
+static int init_empty_map(struct cached_sha1_map *cache, uint32_t size)
+{
+	cache->count = 0;
+	cache->size = size;
+	cache->initialized = 1;
+	cache->mmapped = 0;
+	cache->dirty = 1;
+
+	cache->entries = calloc(size, sizeof(struct cached_sha1_entry));
+	if (!cache->entries) {
+		warning("failed to allocate empty map of size %"PRIu32" for %s",
+			size, git_path(cache->filename));
+		cache->size = 0;
+		cache->dirty = 0;
+		return -1;
+	}
+	return 0;
+}
+
+static int grow_map(struct cached_sha1_map *cache)
+{
+	struct cached_sha1_map new_cache;
+	uint32_t i;
+
+	if (cache->size * 2 == 0) {
+		warning("%s overflowed, so resetting to empty",
+			git_path(cache->filename));
+		return init_empty_map(cache, 64);
+	}
+
+	/* allocate cache with twice the size */
+	new_cache.filename = cache->filename;
+	if (init_empty_map(&new_cache, cache->size * 2)) {
+		warning("failed to grow %s to size %"PRIu32,
+			git_path(cache->filename), cache->size * 2);
+		return init_empty_map(cache, 64);
+	}
+
+	/* reinsert all entries */
+ 	for (i = 0; i < cache->size; i++)
+		if (!is_null_sha1(cache->entries[i].key))
+			set_cached_sha1_entry(&new_cache,
+				cache->entries[i].key, cache->entries[i].value);
+	/* finish */
+	free_cached_sha1_map(cache);
+	*cache = new_cache;
+	return 0;
+}
+
+/* Any errors that occur result in the cache being initialized to empty */
+static int init_cached_sha1_map(struct cached_sha1_map *cache)
+{
+	int fd;
+	union cached_sha1_map_header header;
+	const char *filename;
+	size_t map_size;
+	SHA_CTX ctx;
+
+	if (cache->initialized)
+		return cache->size ? 0 : -1;
+
+	filename = git_path(cache->filename);
+	fd = open(filename, O_RDONLY);
+	if (fd < 0) {
+		if (errno != ENOENT)
+			warning("failed to read '%s': %s", filename,
+				strerror(errno));
+		goto empty;
+	}
+
+	if (read_in_full(fd, &header, sizeof(header)) != sizeof(header)) {
+		warning("cannot read %s header", filename);
+		goto empty;
+	}
+
+	if (memcmp(header.u.signature, signature, 4)) {
+		warning("%s has invalid header", filename);
+		goto empty;
+	}
+
+	if (ntohl(header.u.version) != version) {
+		warning("%s has unrecognized version %"PRIu32, filename,
+			ntohl(header.u.version));
+		goto empty;
+	}
+
+	cache->count = ntohl(header.u.count);
+	cache->size = ntohl(header.u.size);
+
+	if (cache->size & (cache->size-1)) {
+		warning("%s is corrupt: size %"PRIu32" is not a power of two",
+			filename, cache->size);
+		goto empty;
+	}
+
+	if (cache->count >= cache->size) {
+		warning("%s is corrupt: count %"PRIu32" >= size %"PRIu32,
+			filename, cache->count, cache->size);
+		goto empty;
+	}
+
+	SHA1_Init(&ctx);
+	SHA1_Update(&ctx, header.pad.key, 20);
+	SHA1_Final(header.pad.key, &ctx); /* reuse pad.key to store its sha1 */
+	if (hashcmp(header.pad.key, header.pad.value)) {
+		warning("%s header has invalid sha1", filename);
+		goto empty;
+	}
+
+	cache->dirty = 0;
+	cache->initialized = 1;
+	cache->mmapped = 1;
+
+	/* mmap entire file so that file / memory blocks are aligned */
+	map_size = sizeof(struct cached_sha1_entry) * (cache->size + 1);
+	cache->entries = mmap(NULL, map_size,
+		PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
+	if (cache->entries == MAP_FAILED) {
+		/* this is just a cache, so don't free pack memory and retry */
+		warning("%s mmap failed: %s", filename, strerror(errno));
+		goto empty;
+	}
+	cache->entries += 1; /* skip header */
+	return 0;
+
+empty:
+	if (fd >= 0)
+		close(fd);
+	return init_empty_map(cache, 64);
+}
+
+int write_cached_sha1_map(struct cached_sha1_map *cache)
+{
+	union cached_sha1_map_header header;
+	struct lock_file update_lock;
+	int fd;
+	size_t map_size;
+	const char *filename;
+	SHA_CTX ctx;
+
+	if (!cache->initialized || !cache->dirty)
+		return 0;
+
+	filename = git_path(cache->filename);
+	fd = hold_lock_file_for_update(&update_lock, filename, 0);
+
+	if (fd < 0)
+	{
+		warning("could not construct %s", filename);
+		return -1;
+	}
+
+	/* initialize header */
+	memcpy(header.u.signature, signature, 4);
+	header.u.version = htonl(version);
+	header.u.count = htonl(cache->count);
+	header.u.size = htonl(cache->size);
+	header.u.pad = 0; /* make header deterministic */
+
+	/* compute header sha1 */
+	SHA1_Init(&ctx);
+	SHA1_Update(&ctx, header.pad.key, 20);
+	SHA1_Final(header.pad.value, &ctx);
+
+	map_size = sizeof(struct cached_sha1_entry) * cache->size;
+	if (write_in_full(fd, &header, sizeof(header)) != sizeof(header)
+		|| write_in_full(fd, cache->entries, map_size) != map_size)
+	{
+		warning("could not write %s", filename);
+		return -1;
+	}
+
+	if (commit_lock_file(&update_lock) < 0)
+	{
+		warning("could not write %s", filename);
+		return -1;
+	}
+
+	cache->dirty = 0;
+	return 0;
+}
+
+void free_cached_sha1_map(struct cached_sha1_map *cache)
+{
+	if (!cache->initialized)
+		return;
+
+	if (cache->mmapped)
+		munmap(cache->entries - 1,
+			sizeof(struct cached_sha1_entry) * (cache->size + 1));
+	else
+		free(cache->entries);
+}
+
+/* The fact that size is a power of two means count-1 <= INT32_MAX, so it
+ * is safe to return signed integers here. */
+static int32_t get_hash_index(const unsigned char *sha1)
+{
+	/* this is alignment safe since 40 is a multiple of 4 */
+	return ntohl(*(uint32_t*)sha1);
+}
+
+/*
+ * Returns the index if the entry exists, and the complemented index of
+ * the next free entry otherwise.  If the hash is full, returns the
+ * complement of a nonfree entry and sets count = size (this happens
+ * only if the file is corrupt).
+ */
+static int32_t find_helper(struct cached_sha1_map *cache,
+	const unsigned char *key)
+{
+	int32_t i, mask, full;
+
+	mask = cache->size - 1;
+	i = get_hash_index(key) & mask;
+	full = (i-1) & mask;
+
+	for (; ; i = (i+1) & mask) {
+		if (!hashcmp(key, cache->entries[i].key))
+			return i;
+		else if (is_null_sha1(cache->entries[i].key) || i == full)
+			return ~i;
+		if (i == full) {
+			cache->count = cache->size; /* fix count */
+			return ~1;
+		}
+	}
+}
+
+int get_cached_sha1_entry(struct cached_sha1_map *cache,
+	const unsigned char *key, unsigned char *value)
+{
+	int32_t i;
+
+	if (init_cached_sha1_map(cache))
+		return -1;
+
+	i = find_helper(cache, key);
+	if(i < 0)
+		return -1;
+
+	/* entry found, return value */
+	hashcpy(value, cache->entries[i].value);
+	return 0;
+}
+
+int set_cached_sha1_entry(struct cached_sha1_map *cache,
+	const unsigned char *key, const unsigned char *value)
+{
+	int32_t i;
+	struct cached_sha1_entry *entry;
+
+	if (init_cached_sha1_map(cache))
+		return -1;
+
+	i = find_helper(cache, key);
+
+	if (i < 0) { /* write new entry */
+		entry = cache->entries + ~i;
+		hashcpy(entry->key, key);
+		hashcpy(entry->value, value);
+		cache->count++;
+		cache->dirty = 1;
+	} else { /* overwrite existing entry */
+		entry = cache->entries + i;
+		if (hashcmp(value, entry->value)) {
+			hashcpy(entry->value, value);
+			cache->dirty = 1;
+		}
+	}
+
+	if (cache->count >= cache->size/4*3)
+		return grow_map(cache);
+	return 0;
+}
diff --git a/cached-sha1-map.h b/cached-sha1-map.h
new file mode 100644
index 0000000..296c17c
--- /dev/null
+++ b/cached-sha1-map.h
@@ -0,0 +1,45 @@
+#ifndef CACHED_SHA1_MAP_H
+#define CACHED_SHA1_MAP_H
+
+#include "cache.h"
+
+/*
+ * A cached-sha1-map is a file storing a hash map from sha1 to sha1.
+ *
+ * The file is mmap'ed, updated in memory during operation, and flushed
+ * back to disk when freed.  Currently the entire file is rewritten for
+ * any change.  This could be a significant bottleneck for common uses,
+ * so it would be good to fix this later if possible.
+ *
+ * The performance of a hash map depends highly on a good hashing
+ * algorithm, to avoid collisions.  Lucky us!  SHA-1 is a pretty good
+ * hashing algorithm.
+ */
+
+struct cached_sha1_entry {
+	unsigned char key[20];
+	unsigned char value[20];
+};
+
+struct cached_sha1_map {
+	const char *filename; /* relative to GIT_DIR */
+
+	/* rest is for internal use */
+	uint32_t count, size;
+	unsigned int initialized : 1;
+	unsigned int dirty : 1;
+	unsigned int mmapped : 1;
+	struct cached_sha1_entry *entries; /* pointer to mmap'ed memory + 1 */
+};
+
+extern int get_cached_sha1_entry(struct cached_sha1_map *cache,
+	const unsigned char *key,unsigned char *value);
+
+extern int set_cached_sha1_entry(struct cached_sha1_map *cache,
+	const unsigned char *key, const unsigned char *value);
+
+extern int write_cached_sha1_map(struct cached_sha1_map *cache);
+
+extern void free_cached_sha1_map(struct cached_sha1_map *cache);
+
+#endif
diff --git a/patch-ids.c b/patch-ids.c
index 3be5d31..5ba12fb 100644
--- a/patch-ids.c
+++ b/patch-ids.c
@@ -2,17 +2,49 @@
 #include "diff.h"
 #include "commit.h"
 #include "patch-ids.h"
+#include "cached-sha1-map.h"
+
+int cache_patch_ids = 1;
+static struct cached_sha1_map patch_id_cache;
+
+static struct diff_options default_options;
+#define IGNORED_DIFF_OPTS (DIFF_OPT_HAS_CHANGES | DIFF_OPT_CHECK_FAILED)

 static int commit_patch_id(struct commit *commit, struct diff_options *options,
 		    unsigned char *sha1)
 {
+	int use_cache = 0;
+	int ret;
+
+	/* only cache if diff options are defaults */
+	if (cache_patch_ids) {
+		default_options.found_changes = options->found_changes;
+		default_options.flags = (options->flags & IGNORED_DIFF_OPTS)
+			| (default_options.flags & ~IGNORED_DIFF_OPTS);
+		use_cache = !memcmp(options, &default_options,
+				    sizeof(struct diff_options));
+	}
+
+	/* pull patch-id out of the cache if possible */
+	patch_id_cache.filename = "patch-id-cache";
+	if (use_cache && !get_cached_sha1_entry(&patch_id_cache,
+			commit->object.sha1, sha1))
+		return 0;
+
 	if (commit->parents)
 		diff_tree_sha1(commit->parents->item->object.sha1,
 		               commit->object.sha1, "", options);
 	else
 		diff_root_tree_sha1(commit->object.sha1, "", options);
 	diffcore_std(options);
-	return diff_flush_patch_id(options, sha1);
+	ret = diff_flush_patch_id(options, sha1);
+	if (ret)
+		return ret;
+
+	/* record commit, patch-id pair in cache */
+	if (use_cache)
+		set_cached_sha1_entry(&patch_id_cache, commit->object.sha1, sha1);
+	return 0;
 }

 static uint32_t take2(const unsigned char *id)
@@ -124,6 +156,7 @@ int init_patch_ids(struct patch_ids *ids)
 	DIFF_OPT_SET(&ids->diffopts, RECURSIVE);
 	if (diff_setup_done(&ids->diffopts) < 0)
 		return error("diff_setup_done failed");
+	default_options = ids->diffopts; /* remember defaults */
 	return 0;
 }

@@ -136,6 +169,11 @@ int free_patch_ids(struct patch_ids *ids)
 		next = patches->next;
 		free(patches);
 	}
+
+	/* write cached patch-ids and ignore any errors that arise
+	 * (e.g. if the repository is write protected) */
+	if (cache_patch_ids)
+		write_cached_sha1_map(&patch_id_cache);
 	return 0;
 }

diff --git a/patch-ids.h b/patch-ids.h
index c8c7ca1..c0ebdc1 100644
--- a/patch-ids.h
+++ b/patch-ids.h
@@ -18,4 +18,6 @@ int free_patch_ids(struct patch_ids *);
 struct patch_id *add_commit_patch_id(struct commit *, struct patch_ids *);
 struct patch_id *has_commit_patch_id(struct commit *, struct patch_ids *);

+extern int cache_patch_ids;
+
 #endif /* PATCH_IDS_H */
-- 
1.5.6.2.256.g3ef05.dirty

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH] cherry: cache patch-ids to avoid repeating work
  2008-07-15 16:57                             ` Geoffrey Irving
@ 2008-07-15 21:52                               ` Johannes Schindelin
  2008-07-15 22:14                                 ` Junio C Hamano
  0 siblings, 1 reply; 21+ messages in thread
From: Johannes Schindelin @ 2008-07-15 21:52 UTC (permalink / raw)
  To: Geoffrey Irving; +Cc: Junio C Hamano, git@vger.kernel.org

Hi,

Okay, it seems like I never have time to review this, so I'll just 
take a few minutes to comment on some aspects:

On Tue, 15 Jul 2008, Geoffrey Irving wrote:

> +static int git_cherry_config(const char *var, const char *value, void *cb)
> +{
> +	if (!strcmp(var, "cherry.cachepatchids")) {
> +		cache_patch_ids = git_config_bool(var, value);
> +		return 0;
> +	}
> +
> +	return 0;
> +}
> +
>  static const char cherry_usage[] =
>  "git-cherry [-v] <upstream> [<head>] [<limit>]";
>  int cmd_cherry(int argc, const char **argv, const char *prefix)
> @@ -1094,6 +1104,8 @@ int cmd_cherry(int argc, const char **argv,
> const char *prefix)
>  	const char *limit = NULL;
>  	int verbose = 0;
> 
> +	git_config(git_cherry_config, NULL);
> +
>  	if (argc > 1 && !strcmp(argv[1], "-v")) {
>  		verbose = 1;
>  		argc--;

Is this really purely for cherry, and not at all for "log --cherry-pick"?  
Maybe it should be "cache.patchIds" to begin with.

> diff --git a/cached-sha1-map.c b/cached-sha1-map.c
> new file mode 100644
> index 0000000..9cf7252
> --- /dev/null
> +++ b/cached-sha1-map.c
> @@ -0,0 +1,293 @@
> +#include "cached-sha1-map.h"
> +
> +union cached_sha1_map_header {
> +	struct {
> +		char signature[4]; /* CS1M */
> +		uint32_t version;
> +		uint32_t count;
> +		uint32_t size;
> +		uint32_t pad; /* pad to 20 bytes */
> +	} u;
> +	/* pad header out to 40 bytes.  As a consistency
> +	 * check, pad.value stores the sha1 of pad.key. */
> +	struct cached_sha1_entry pad;

Why does it have to be a union?

> +};
> +
> +static const char *signature = "CS1M";

Carrie Scr*ws 1 Man?

> +static int init_empty_map(struct cached_sha1_map *cache, uint32_t size)
> +{
> +	cache->count = 0;
> +	cache->size = size;

We seem to call this "alloc" (almost) everywhere else.

> +	cache->initialized = 1;

Maybe we do not need that: when size is != 0, it was initialized.

> +	cache->mmapped = 0;
> +	cache->dirty = 1;

Is it already dirty?  I don't think so.

> +	cache->entries = calloc(size, sizeof(struct cached_sha1_entry));
> +	if (!cache->entries) {
> +		warning("failed to allocate empty map of size %"PRIu32" for %s",
> +			size, git_path(cache->filename));

xcalloc() to the rescue.

> +static int grow_map(struct cached_sha1_map *cache)
> +{
> +	struct cached_sha1_map new_cache;
> +	uint32_t i;
> +
> +	if (cache->size * 2 == 0) {
> +		warning("%s overflowed, so resetting to empty",
> +			git_path(cache->filename));

IMHO we can safely ignore that case: If that is true, we have seen at 
least 2^32 objects.  However, each object takes more than 4 bytes, so that 
is a literal impossibility.

I'd rather not bother with this case.

> +	/* allocate cache with twice the size */
> +	new_cache.filename = cache->filename;
> +	if (init_empty_map(&new_cache, cache->size * 2)) {

Really, I think that these checks should be _made_ unnecessary, by 
restricting the size of the cache.  IMO Caching more than 2^10 patch ids 
(completely made up on the spot) is probably even detrimental, and it 
might be better to just scratch them all and start with a new cache then.

Besides, the file would have a substantial size by then.

> +static int init_cached_sha1_map(struct cached_sha1_map *cache)
> +{
>
> [...]
>
> +	SHA1_Init(&ctx);
> +	SHA1_Update(&ctx, header.pad.key, 20);
> +	SHA1_Final(header.pad.key, &ctx); /* reuse pad.key to store its sha1 */
> +	if (hashcmp(header.pad.key, header.pad.value)) {
> +		warning("%s header has invalid sha1", filename);
> +		goto empty;
> +	}

I do not think that it is worth checking that.  If you do not trust your 
hard disk, you might just as well jump out the window.

Checking just takes too much time.

> +	/* mmap entire file so that file / memory blocks are aligned */
> +	map_size = sizeof(struct cached_sha1_entry) * (cache->size + 1);
> +	cache->entries = mmap(NULL, map_size,
> +		PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);

AFAIR there were _serious_ performance issues with mmap() on non-Linux 
platforms.  I chose pread() in my original implementation for a reason.

> +static int32_t find_helper(struct cached_sha1_map *cache,
> +	const unsigned char *key)
> +{
> +	int32_t i, mask, full;
> +
> +	mask = cache->size - 1;
> +	i = get_hash_index(key) & mask;
> +	full = (i-1) & mask;
> +
> +	for (; ; i = (i+1) & mask) {

Wow, that is ugly.

> +struct cached_sha1_map {
> +	const char *filename; /* relative to GIT_DIR */

Why does the map need to know its name?  The index does not.

> +static struct diff_options default_options;
> +#define IGNORED_DIFF_OPTS (DIFF_OPT_HAS_CHANGES | DIFF_OPT_CHECK_FAILED)
> 
>  static int commit_patch_id(struct commit *commit, struct diff_options *options,
>  		    unsigned char *sha1)
>  {
> +	int use_cache = 0;
> +	int ret;
> +
> +	/* only cache if diff options are defaults */
> +	if (cache_patch_ids) {
> +		default_options.found_changes = options->found_changes;
> +		default_options.flags = (options->flags & IGNORED_DIFF_OPTS)
> +			| (default_options.flags & ~IGNORED_DIFF_OPTS);
> +		use_cache = !memcmp(options, &default_options,
> +				    sizeof(struct diff_options));
> +	}

Hmm.

I'd rather set "revs.diff" late, and unset "cache_patch_ids" if it is set.  
IOW let the rev_opt parser decide.

Unfortunately, I do not have time to look into your patch in more detail, 
even if I like the idea.

Ciao,
Dscho

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] cherry: cache patch-ids to avoid repeating work
  2008-07-15 21:52                               ` Johannes Schindelin
@ 2008-07-15 22:14                                 ` Junio C Hamano
  2008-07-16  6:57                                   ` Karl Hasselström
  2008-07-16  7:22                                   ` Johan Herland
  0 siblings, 2 replies; 21+ messages in thread
From: Junio C Hamano @ 2008-07-15 22:14 UTC (permalink / raw)
  To: Johannes Schindelin; +Cc: Geoffrey Irving, git@vger.kernel.org

Johannes Schindelin <Johannes.Schindelin@gmx.de> writes:

> Okay, it seems like I never have time to review this, so I'll just 
> take a few minutes to comment on some aspects:
>
>> @@ -1094,6 +1104,8 @@ int cmd_cherry(int argc, const char **argv,
>> const char *prefix)
>>  	const char *limit = NULL;
>>  	int verbose = 0;
>> 
>> +	git_config(git_cherry_config, NULL);
>> +
>>  	if (argc > 1 && !strcmp(argv[1], "-v")) {
>>  		verbose = 1;
>>  		argc--;
>
> Is this really purely for cherry, and not at all for "log --cherry-pick"?  
> Maybe it should be "cache.patchIds" to begin with.

What other things would we want caches for?

As a general rule, I'd prefer keeping these unproven new features opt
in (i.e. default to false unless explicitly asked for).

>> +union cached_sha1_map_header {
>> +	struct {
>> +		char signature[4]; /* CS1M */
>> +		uint32_t version;
>> +		uint32_t count;
>> +		uint32_t size;
>> +		uint32_t pad; /* pad to 20 bytes */
>> +	} u;
>> +	/* pad header out to 40 bytes.  As a consistency
>> +	 * check, pad.value stores the sha1 of pad.key. */
>> +	struct cached_sha1_entry pad;
>
> Why does it have to be a union?

Hmm.  I think you are right.

	struct cached_sha1_map_header {
        	char signature[4];
                uint32_t version;
                uint32_t count;
                uint32_t size;
                uint32_t unused;
		unsigned char csum[20];
	};

would equally be good, as long as we assume the struct is naturally
packed.  I do agree with you that it may not worth checking only the
header, though. 

>> +static const char *signature = "CS1M";
>
> Carrie Scr*ws 1 Man?

No Idea ;-)

>> +	cache->mmapped = 0;
>> +	cache->dirty = 1;
>
> Is it already dirty?  I don't think so.

This flag is more about "do we need to write it back to file", and when it
starts out without reading from an existing file, we always need to as
long as the table contains something at the end of the processing.

You could instead check (!cache->mmapped && cache->count) for that, I
guess.

>> +	cache->entries = calloc(size, sizeof(struct cached_sha1_entry));
>> +	if (!cache->entries) {
>> +		warning("failed to allocate empty map of size %"PRIu32" for %s",
>> +			size, git_path(cache->filename));
>
> xcalloc() to the rescue.

This is purely optional cache and we would want to degrade to operate
without it if any of these fails.  xcalloc() won't let you do so.

> Really, I think that these checks should be _made_ unnecessary, by 
> restricting the size of the cache.  IMO Caching more than 2^10 patch ids 
> (completely made up on the spot) is probably even detrimental, and it 
> might be better to just scratch them all and start with a new cache then.

Probably.  Or fall back on uncached operation.

>> +static int init_cached_sha1_map(struct cached_sha1_map *cache)
>> +{
>>
>> [...]
>>
>> +	SHA1_Init(&ctx);
>> +	SHA1_Update(&ctx, header.pad.key, 20);
>> +	SHA1_Final(header.pad.key, &ctx); /* reuse pad.key to store its sha1 */
>> +	if (hashcmp(header.pad.key, header.pad.value)) {
>> +		warning("%s header has invalid sha1", filename);
>> +		goto empty;
>> +	}
>
> I do not think that it is worth checking that.  If you do not trust your 
> hard disk, you might just as well jump out the window.
>
> Checking just takes too much time.

This is only checking the header, so it won't take much time, but I tend
to doubt the value of this.

>> +	/* mmap entire file so that file / memory blocks are aligned */
>> +	map_size = sizeof(struct cached_sha1_entry) * (cache->size + 1);
>> +	cache->entries = mmap(NULL, map_size,
>> +		PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
>
> AFAIR there were _serious_ performance issues with mmap() on non-Linux 
> platforms.  I chose pread() in my original implementation for a reason.

That is not a reason to punish users on platforms with working mmap(2) ;-).

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] cherry: cache patch-ids to avoid repeating work
  2008-07-15 22:14                                 ` Junio C Hamano
@ 2008-07-16  6:57                                   ` Karl Hasselström
  2008-07-16  7:22                                   ` Johan Herland
  1 sibling, 0 replies; 21+ messages in thread
From: Karl Hasselström @ 2008-07-16  6:57 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Johannes Schindelin, Geoffrey Irving, git@vger.kernel.org

On 2008-07-15 15:14:38 -0700, Junio C Hamano wrote:

> Johannes Schindelin <Johannes.Schindelin@gmx.de> writes:
>
> > > +static const char *signature = "CS1M";
> >
> > Carrie Scr*ws 1 Man?
>
> No Idea ;-)

Given the "cached_sha1_map_header" union (or struct) earlier in the
patch, I know what my guess is. :-)

-- 
Karl Hasselström, kha@treskal.com
      www.treskal.com/kalle

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] cherry: cache patch-ids to avoid repeating work
  2008-07-15 22:14                                 ` Junio C Hamano
  2008-07-16  6:57                                   ` Karl Hasselström
@ 2008-07-16  7:22                                   ` Johan Herland
  1 sibling, 0 replies; 21+ messages in thread
From: Johan Herland @ 2008-07-16  7:22 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Johannes Schindelin, Geoffrey Irving

On Wednesday 16 July 2008, Junio C Hamano wrote:
> Johannes Schindelin <Johannes.Schindelin@gmx.de> writes:
> > Okay, it seems like I never have time to review this, so I'll just
> >
> > take a few minutes to comment on some aspects:
> >> @@ -1094,6 +1104,8 @@ int cmd_cherry(int argc, const char **argv,
> >> const char *prefix)
> >>  	const char *limit = NULL;
> >>  	int verbose = 0;
> >>
> >> +	git_config(git_cherry_config, NULL);
> >> +
> >>  	if (argc > 1 && !strcmp(argv[1], "-v")) {
> >>  		verbose = 1;
> >>  		argc--;
> >
> > Is this really purely for cherry, and not at all for "log
> > --cherry-pick"? Maybe it should be "cache.patchIds" to begin with.
>
> What other things would we want caches for?

This should be fairly obvious:

- git-notes (uses sha1-to-sha1 cache for storing commit-to-note
  relationship)

- integrated bug trackers (uses sha1-to-sha1 cache for storing
  commit-to-bugreport (or similar) relationships)

- ...any other mechanism that want to quickly map from a git object to some
  associated data

There are probably plenty more ideas and use cases if people start looking.

AFAICS, each different use case would keep its cache in a separate file.

For local-repo-only caches the cache is kept within $GIT_DIR, and for shared 
caches (IF that makes sense in any of the use cases) the cache could be 
located in the working tree (either as a .git_foo file on relevant 
branches, or as a file on a separate domain-specific branch).


Have fun! :)

...Johan

-- 
Johan Herland, <johan@herland.net>
www.herland.net

^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2008-07-16  7:23 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-07-09  3:53 [PATCH 1/3] cherry: cache patch-ids to avoid repeating work Geoffrey Irving
2008-07-09  5:14 ` Junio C Hamano
2008-07-09  5:26   ` Geoffrey Irving
2008-07-09  6:24     ` Junio C Hamano
2008-07-09 12:18       ` Johannes Schindelin
2008-07-10  3:34       ` [PATCH] " Geoffrey Irving
2008-07-10 14:09         ` Geoffrey Irving
2008-07-10 14:28           ` Johannes Schindelin
2008-07-10 14:33             ` Geoffrey Irving
2008-07-10 15:56               ` Johannes Schindelin
2008-07-11  6:54               ` Junio C Hamano
2008-07-11 14:58                 ` Geoffrey Irving
2008-07-11 15:36                   ` Johannes Schindelin
2008-07-11 15:41                     ` Geoffrey Irving
2008-07-11 15:48                       ` Johannes Schindelin
     [not found]                         ` <7vej60jln6.fsf@gitster.siamese.dyndns.org>
2008-07-13  3:14                           ` Geoffrey Irving
2008-07-15 16:57                             ` Geoffrey Irving
2008-07-15 21:52                               ` Johannes Schindelin
2008-07-15 22:14                                 ` Junio C Hamano
2008-07-16  6:57                                   ` Karl Hasselström
2008-07-16  7:22                                   ` Johan Herland

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).