All of lore.kernel.org
 help / color / mirror / Atom feed
From: Duy Nguyen <pclouds@gmail.com>
To: Jeff King <peff@peff.net>
Cc: git@vger.kernel.org, "Shawn O. Pearce" <spearce@spearce.org>
Subject: Re: [PATCH 4/6] introduce a commit metapack
Date: Thu, 31 Jan 2013 18:06:56 +0700	[thread overview]
Message-ID: <20130131110656.GA28093@lanh> (raw)
In-Reply-To: <CACsJy8Bqg6knVtddwaGSqtiXzVDgbO1JjbFOPMbF_RqrxM2rFQ@mail.gmail.com>

On Wed, Jan 30, 2013 at 09:16:29PM +0700, Duy Nguyen wrote:
> Perhaps we could store abbrev sha-1 instead of full sha-1. Nice
> space/time trade-off.

Following the on-disk format experiment yesterday, I changed the
format to:

 - a list a _short_ SHA-1 of cached commits
 - a list of cache entries, each (5 uint32_t) consists of:
   - uint32_t for the index in .idx sha-1 table to get full SHA-1 of
     the commit
   - uint32_t for timestamp
   - uint32_t for tree, 1st and 2nd parents for the index in .idx
     table

The length of SHA-1 is chosen to be able to unambiguously identify any
cached commits. Full SHA-1 check is done after to catch false
positives. For linux-2.6, SHA-1 length is 6 bytes, git and many
moderate-sized projects are 4 bytes. So it's 26 bytes per commit for
linux-2.6.git, or 8MB .commits file. Not as good as revindex approach
(5.5MB) but way better than the current one (27MB). And still good
enough for caching 2 parents unconditionally.

Performance seems to improve a tiny bit, probably because of more
compact search space: 0.6s with my patch vs 0.7s without (vs 3.4s
without cache).

-- 8< --
diff --git a/cache.h b/cache.h
index 1f96f65..8048d5b 100644
--- a/cache.h
+++ b/cache.h
@@ -1069,6 +1069,7 @@ extern struct packed_git *add_packed_git(const char *, int, int);
 extern const unsigned char *nth_packed_object_sha1(struct packed_git *, uint32_t);
 extern off_t nth_packed_object_offset(const struct packed_git *, uint32_t);
 extern off_t find_pack_entry_one(const unsigned char *, struct packed_git *);
+extern int find_pack_entry_pos(const unsigned char *, struct packed_git *);
 extern int is_pack_valid(struct packed_git *);
 extern void *unpack_entry(struct packed_git *, off_t, enum object_type *, unsigned long *);
 extern unsigned long unpack_object_header_buffer(const unsigned char *buf, unsigned long len, enum object_type *type, unsigned long *sizep);
diff --git a/commit-metapack.c b/commit-metapack.c
index 2c19f48..c984b8e 100644
--- a/commit-metapack.c
+++ b/commit-metapack.c
@@ -4,11 +4,21 @@
 #include "commit.h"
 #include "sha1-lookup.h"
 
+struct commit_entry {
+	uint32_t commit;	/* nth_packed_object_sha1 to get own SHA-1 */
+	uint32_t timestamp;
+	uint32_t tree;		/* nth_packed_object_sha1 to get tree SHA-1 */
+	uint32_t parent1; /* nth_packed_object_sha1 to get 1st parent SHA-1 */
+	uint32_t parent2; /* nth_packed_object_sha1 to get 2nd parent SHA-1 */
+};
+
 struct commit_metapack {
 	struct metapack mp;
 	uint32_t nr;
+	uint32_t abbrev_len;
+	struct packed_git *pack;
 	unsigned char *index;
-	unsigned char *data;
+	struct commit_entry *data;
 	struct commit_metapack *next;
 };
 static struct commit_metapack *commit_metapacks;
@@ -41,20 +51,23 @@ static struct commit_metapack *alloc_commit_metapack(struct packed_git *pack)
 	}
 	memcpy(&it->nr, it->mp.data, 4);
 	it->nr = ntohl(it->nr);
+	memcpy(&it->abbrev_len, it->mp.data + 4, 4);
+	it->abbrev_len = ntohl(it->abbrev_len);
+	it->pack = pack;
 
 	/*
-	 * We need 84 bytes for each entry: sha1(20), date(4), tree(20),
-	 * parents(40).
+	 * We need 20+abbrev_len bytes for each entry: abbrev sha-1,
+	 * date(4), tree index(4), parent indexes(8).
 	 */
-	if (it->mp.len < (84 * it->nr + 4)) {
+	if (it->mp.len < ((sizeof(*it->data) + it->abbrev_len) * it->nr + 8)) {
 		warning("commit metapack for '%s' is truncated", pack->pack_name);
 		metapack_close(&it->mp);
 		free(it);
 		return NULL;
 	}
 
-	it->index = it->mp.data + 4;
-	it->data = it->index + 20 * it->nr;
+	it->index = it->mp.data + 8;
+	it->data = (struct commit_entry*)(it->index + it->abbrev_len * it->nr);
 
 	return it;
 }
@@ -83,29 +96,51 @@ static void prepare_commit_metapacks(void)
 
 int commit_metapack(unsigned char *sha1,
 		    uint32_t *timestamp,
-		    unsigned char **tree,
-		    unsigned char **parent1,
-		    unsigned char **parent2)
+		    const unsigned char **tree,
+		    const unsigned char **parent1,
+		    const unsigned char **parent2)
 {
 	struct commit_metapack *p;
 
 	prepare_commit_metapacks();
 	for (p = commit_metapacks; p; p = p->next) {
-		unsigned char *data;
-		int pos = sha1_entry_pos(p->index, 20, 0, 0, p->nr, p->nr, sha1);
+		struct commit_entry *data;
+		uint32_t p1, p2;
+		unsigned lo, hi, mi;
+		int pos;
+
+		/* sha1_entry_pos does not work with abbreviated sha-1 */
+		lo = 0;
+		hi = p->nr;
+		pos = -1;
+		do {
+			unsigned mi = (lo + hi) / 2;
+			int cmp = memcmp(p->index + mi * p->abbrev_len, sha1, p->abbrev_len);
+
+			if (!cmp) {
+				pos = mi;
+				break;
+			}
+			if (cmp > 0)
+				hi = mi;
+			else
+				lo = mi+1;
+		} while (lo < hi);
 		if (pos < 0)
 			continue;
 
-		/* timestamp(4) + tree(20) + parents(40) */
-		data = p->data + 64 * pos;
-		*timestamp = *(uint32_t *)data;
-		*timestamp = ntohl(*timestamp);
-		data += 4;
-		*tree = data;
-		data += 20;
-		*parent1 = data;
-		data += 20;
-		*parent2 = data;
+		data = p->data + pos;
+
+		/* full sha-1 check again */
+		if (hashcmp(nth_packed_object_sha1(p->pack, ntohl(data->commit)), sha1))
+			continue;
+
+		*timestamp = ntohl(data->timestamp);
+		*tree = nth_packed_object_sha1(p->pack, ntohl(data->tree));
+		p1 = ntohl(data->parent1);
+		*parent1 = nth_packed_object_sha1(p->pack, p1);
+		p2 = ntohl(data->parent2);
+		*parent2 = p1 == p2 ? null_sha1 : nth_packed_object_sha1(p->pack, p2);
 
 		return 0;
 	}
@@ -113,13 +148,20 @@ int commit_metapack(unsigned char *sha1,
 	return -1;
 }
 
+struct write_cb {
+	struct commit_list **tail;
+	int abbrev_len;
+	const unsigned char *last_sha1;
+};
+
 static void get_commits(struct metapack_writer *mw,
 			const unsigned char *sha1,
 			void *data)
 {
-	struct commit_list ***tail = data;
+	struct write_cb *write_cb = (struct write_cb *)data;
 	enum object_type type = sha1_object_info(sha1, NULL);
 	struct commit *c;
+	int p1, p2;
 
 	if (type != OBJ_COMMIT)
 		return;
@@ -128,47 +170,95 @@ static void get_commits(struct metapack_writer *mw,
 	if (!c || parse_commit(c))
 		die("unable to read commit %s", sha1_to_hex(sha1));
 
+	if (c->buffer) {
+		free(c->buffer);
+		c->buffer = NULL;
+	}
+
 	/*
 	 * Our fixed-size parent list cannot represent root commits, nor
 	 * octopus merges. Just skip those commits, as we can fallback
 	 * in those rare cases to reading the actual commit object.
 	 */
 	if (!c->parents ||
-	    (c->parents && c->parents->next && c->parents->next->next))
+	    (c->parents && c->parents->next && c->parents->next->next) ||
+	    /* edge commits are out too */
+	    find_pack_entry_pos(c->tree->object.sha1, mw->pack) == -1 ||
+	    (p1 = find_pack_entry_pos(c->parents->item->object.sha1, mw->pack)) == -1 ||
+	    (c->parents->next &&
+	     (p2 = find_pack_entry_pos(c->parents->next->item->object.sha1, mw->pack)) == -1) ||
+	    /*
+	     * we set the 2nd parent the same as 1st parent as an
+	     * indication that 2nd parent does not exist. Normal
+	     * commits should never have two same parents, but just in
+	     * case..
+	     */
+	    p1 == p2)
 		return;
 
-	*tail = &commit_list_insert(c, *tail)->next;
+	/*
+	 * Make sure we store the abbr sha-1 long enough to
+	 * unambiguously identify any cached commits in the pack.
+	 */
+	while (write_cb->abbrev_len < 20 &&
+	       write_cb->last_sha1 &&
+	       !memcmp(write_cb->last_sha1, sha1, write_cb->abbrev_len))
+		write_cb->abbrev_len++;
+	/*
+	 * A bit sensitive to metapack_writer_foreach. "sha1" must not
+	 * be changed even after this function exits.
+	 */
+	write_cb->last_sha1 = sha1;
+
+	write_cb->tail = &commit_list_insert(c, write_cb->tail)->next;
 }
 
 void commit_metapack_write(const char *idx)
 {
 	struct metapack_writer mw;
 	struct commit_list *commits = NULL, *p;
-	struct commit_list **tail = &commits;
+	struct write_cb write_cb;
 	uint32_t nr = 0;
 
 	metapack_writer_init(&mw, idx, "commits", 1);
 
+	write_cb.tail = &commits;
+	write_cb.abbrev_len = 1;
+	write_cb.last_sha1 = NULL;
+
 	/* Figure out how many eligible commits we've got in this pack. */
-	metapack_writer_foreach(&mw, get_commits, &tail);
+	metapack_writer_foreach(&mw, get_commits, &write_cb);
 	for (p = commits; p; p = p->next)
 		nr++;
+
 	metapack_writer_add_uint32(&mw, nr);
+	metapack_writer_add_uint32(&mw, write_cb.abbrev_len);
 
 	/* Then write an index of commit sha1s */
 	for (p = commits; p; p = p->next)
-		metapack_writer_add(&mw, p->item->object.sha1, 20);
+		metapack_writer_add(&mw, p->item->object.sha1, write_cb.abbrev_len);
 
 	/* Followed by the actual date/tree/parents data */
 	for (p = commits; p; p = p->next) {
 		struct commit *c = p->item;
+		int pos;
+
+		pos = find_pack_entry_pos(c->object.sha1, mw.pack);
+		metapack_writer_add_uint32(&mw, pos);
+
 		metapack_writer_add_uint32(&mw, c->date);
-		metapack_writer_add(&mw, c->tree->object.sha1, 20);
-		metapack_writer_add(&mw, c->parents->item->object.sha1, 20);
-		metapack_writer_add(&mw,
-				    c->parents->next ?
-				    c->parents->next->item->object.sha1 :
-				    null_sha1, 20);
+
+		pos = find_pack_entry_pos(c->tree->object.sha1, mw.pack);
+		metapack_writer_add_uint32(&mw, pos);
+
+		pos = find_pack_entry_pos(c->parents->item->object.sha1, mw.pack);
+		metapack_writer_add_uint32(&mw, pos);
+
+		if (c->parents->next) {
+			struct object *o = &c->parents->next->item->object;
+			pos = find_pack_entry_pos(o->sha1, mw.pack);
+		}
+		metapack_writer_add_uint32(&mw, pos);
 	}
 
 	metapack_writer_finish(&mw);
diff --git a/commit-metapack.h b/commit-metapack.h
index 4684573..caf85be 100644
--- a/commit-metapack.h
+++ b/commit-metapack.h
@@ -3,9 +3,9 @@
 
 int commit_metapack(unsigned char *sha1,
 		    uint32_t *timestamp,
-		    unsigned char **tree,
-		    unsigned char **parent1,
-		    unsigned char **parent2);
+		    const unsigned char **tree,
+		    const unsigned char **parent1,
+		    const unsigned char **parent2);
 
 void commit_metapack_write(const char *idx_file);
 
diff --git a/sha1_file.c b/sha1_file.c
index 40b2329..1acab8c 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -1978,8 +1978,8 @@ off_t nth_packed_object_offset(const struct packed_git *p, uint32_t n)
 	}
 }
 
-off_t find_pack_entry_one(const unsigned char *sha1,
-				  struct packed_git *p)
+int find_pack_entry_pos(const unsigned char *sha1,
+			struct packed_git *p)
 {
 	const uint32_t *level1_ofs = p->index_data;
 	const unsigned char *index = p->index_data;
@@ -1992,7 +1992,7 @@ off_t find_pack_entry_one(const unsigned char *sha1,
 
 	if (!index) {
 		if (open_pack_index(p))
-			return 0;
+			return -1;
 		level1_ofs = p->index_data;
 		index = p->index_data;
 	}
@@ -2019,9 +2019,7 @@ off_t find_pack_entry_one(const unsigned char *sha1,
 	if (use_lookup) {
 		int pos = sha1_entry_pos(index, stride, 0,
 					 lo, hi, p->num_objects, sha1);
-		if (pos < 0)
-			return 0;
-		return nth_packed_object_offset(p, pos);
+		return pos;
 	}
 
 	do {
@@ -2032,15 +2030,24 @@ off_t find_pack_entry_one(const unsigned char *sha1,
 			printf("lo %u hi %u rg %u mi %u\n",
 			       lo, hi, hi - lo, mi);
 		if (!cmp)
-			return nth_packed_object_offset(p, mi);
+			return mi;
 		if (cmp > 0)
 			hi = mi;
 		else
 			lo = mi+1;
 	} while (lo < hi);
-	return 0;
+	return -1;
 }
 
+off_t find_pack_entry_one(const unsigned char *sha1,
+				  struct packed_git *p)
+{
+	int pos = find_pack_entry_pos(sha1, p);
+	if (pos < 0)
+		return 0;
+	else
+		return nth_packed_object_offset(p, pos);
+}
 int is_pack_valid(struct packed_git *p)
 {
 	/* An already open pack is known to be valid. */
-- 8< --

  reply	other threads:[~2013-01-31 11:06 UTC|newest]

Thread overview: 43+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-01-29  9:14 [PATCH/RFC 0/6] commit caching Jeff King
2013-01-29  9:15 ` [PATCH 1/6] csum-file: make sha1write const-correct Jeff King
2013-01-29  9:15 ` [PATCH 2/6] strbuf: add string-chomping functions Jeff King
2013-01-29 10:15   ` Michael Haggerty
2013-01-29 11:10     ` Jeff King
2013-01-30  5:00       ` Michael Haggerty
2013-01-29  9:15 ` [PATCH 3/6] introduce pack metadata cache files Jeff King
2013-01-29 17:35   ` Junio C Hamano
2013-01-30  6:47     ` Jeff King
2013-01-30  1:30   ` Duy Nguyen
2013-01-30  6:50     ` Jeff King
2013-01-29  9:16 ` [PATCH 4/6] introduce a commit metapack Jeff King
2013-01-29 10:24   ` Michael Haggerty
2013-01-29 11:13     ` Jeff King
2013-01-29 17:38   ` Junio C Hamano
2013-01-29 18:08     ` Junio C Hamano
2013-01-30  7:12       ` Jeff King
2013-01-30  7:17         ` Junio C Hamano
2013-02-01  9:21           ` Jeff King
2013-01-30 15:56         ` Junio C Hamano
2013-01-31 17:03           ` Shawn Pearce
2013-02-01  9:42             ` Jeff King
2013-02-02 17:49               ` Junio C Hamano
2013-01-30  7:07     ` Jeff King
2013-01-30  3:36   ` Duy Nguyen
2013-01-30  7:12     ` Jeff King
2013-01-30 13:56   ` Duy Nguyen
2013-01-30 14:16     ` Duy Nguyen
2013-01-31 11:06       ` Duy Nguyen [this message]
2013-02-01 10:15         ` Jeff King
2013-02-02  9:49           ` Duy Nguyen
2013-02-01 10:40         ` Jeff King
2013-03-17 13:21         ` Duy Nguyen
2013-03-18 12:20           ` Jeff King
2013-02-01 10:00     ` Jeff King
2013-01-29  9:16 ` [PATCH 5/6] add git-metapack command Jeff King
2013-01-29  9:16 ` [PATCH 6/6] commit: look up commit info in metapack Jeff King
2013-01-30  3:31 ` [PATCH/RFC 0/6] commit caching Duy Nguyen
2013-01-30  7:18   ` Jeff King
2013-01-30  8:32     ` Duy Nguyen
2013-01-31 17:14 ` Shawn Pearce
2013-02-01  9:11   ` Jeff King
2013-02-02 10:04     ` Shawn Pearce

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20130131110656.GA28093@lanh \
    --to=pclouds@gmail.com \
    --cc=git@vger.kernel.org \
    --cc=peff@peff.net \
    --cc=spearce@spearce.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.