From: Duy Nguyen <pclouds@gmail.com>
To: Jeff King <peff@peff.net>
Cc: git@vger.kernel.org, "Shawn O. Pearce" <spearce@spearce.org>
Subject: Re: [PATCH 4/6] introduce a commit metapack
Date: Thu, 31 Jan 2013 18:06:56 +0700 [thread overview]
Message-ID: <20130131110656.GA28093@lanh> (raw)
In-Reply-To: <CACsJy8Bqg6knVtddwaGSqtiXzVDgbO1JjbFOPMbF_RqrxM2rFQ@mail.gmail.com>
On Wed, Jan 30, 2013 at 09:16:29PM +0700, Duy Nguyen wrote:
> Perhaps we could store abbrev sha-1 instead of full sha-1. Nice
> space/time trade-off.
Following the on-disk format experiment yesterday, I changed the
format to:
- a list a _short_ SHA-1 of cached commits
- a list of cache entries, each (5 uint32_t) consists of:
- uint32_t for the index in .idx sha-1 table to get full SHA-1 of
the commit
- uint32_t for timestamp
- uint32_t for tree, 1st and 2nd parents for the index in .idx
table
The length of SHA-1 is chosen to be able to unambiguously identify any
cached commits. Full SHA-1 check is done after to catch false
positives. For linux-2.6, SHA-1 length is 6 bytes, git and many
moderate-sized projects are 4 bytes. So it's 26 bytes per commit for
linux-2.6.git, or 8MB .commits file. Not as good as revindex approach
(5.5MB) but way better than the current one (27MB). And still good
enough for caching 2 parents unconditionally.
Performance seems to improve a tiny bit, probably because of more
compact search space: 0.6s with my patch vs 0.7s without (vs 3.4s
without cache).
-- 8< --
diff --git a/cache.h b/cache.h
index 1f96f65..8048d5b 100644
--- a/cache.h
+++ b/cache.h
@@ -1069,6 +1069,7 @@ extern struct packed_git *add_packed_git(const char *, int, int);
extern const unsigned char *nth_packed_object_sha1(struct packed_git *, uint32_t);
extern off_t nth_packed_object_offset(const struct packed_git *, uint32_t);
extern off_t find_pack_entry_one(const unsigned char *, struct packed_git *);
+extern int find_pack_entry_pos(const unsigned char *, struct packed_git *);
extern int is_pack_valid(struct packed_git *);
extern void *unpack_entry(struct packed_git *, off_t, enum object_type *, unsigned long *);
extern unsigned long unpack_object_header_buffer(const unsigned char *buf, unsigned long len, enum object_type *type, unsigned long *sizep);
diff --git a/commit-metapack.c b/commit-metapack.c
index 2c19f48..c984b8e 100644
--- a/commit-metapack.c
+++ b/commit-metapack.c
@@ -4,11 +4,21 @@
#include "commit.h"
#include "sha1-lookup.h"
+struct commit_entry {
+ uint32_t commit; /* nth_packed_object_sha1 to get own SHA-1 */
+ uint32_t timestamp;
+ uint32_t tree; /* nth_packed_object_sha1 to get tree SHA-1 */
+ uint32_t parent1; /* nth_packed_object_sha1 to get 1st parent SHA-1 */
+ uint32_t parent2; /* nth_packed_object_sha1 to get 2nd parent SHA-1 */
+};
+
struct commit_metapack {
struct metapack mp;
uint32_t nr;
+ uint32_t abbrev_len;
+ struct packed_git *pack;
unsigned char *index;
- unsigned char *data;
+ struct commit_entry *data;
struct commit_metapack *next;
};
static struct commit_metapack *commit_metapacks;
@@ -41,20 +51,23 @@ static struct commit_metapack *alloc_commit_metapack(struct packed_git *pack)
}
memcpy(&it->nr, it->mp.data, 4);
it->nr = ntohl(it->nr);
+ memcpy(&it->abbrev_len, it->mp.data + 4, 4);
+ it->abbrev_len = ntohl(it->abbrev_len);
+ it->pack = pack;
/*
- * We need 84 bytes for each entry: sha1(20), date(4), tree(20),
- * parents(40).
+ * We need 20+abbrev_len bytes for each entry: abbrev sha-1,
+ * date(4), tree index(4), parent indexes(8).
*/
- if (it->mp.len < (84 * it->nr + 4)) {
+ if (it->mp.len < ((sizeof(*it->data) + it->abbrev_len) * it->nr + 8)) {
warning("commit metapack for '%s' is truncated", pack->pack_name);
metapack_close(&it->mp);
free(it);
return NULL;
}
- it->index = it->mp.data + 4;
- it->data = it->index + 20 * it->nr;
+ it->index = it->mp.data + 8;
+ it->data = (struct commit_entry*)(it->index + it->abbrev_len * it->nr);
return it;
}
@@ -83,29 +96,51 @@ static void prepare_commit_metapacks(void)
int commit_metapack(unsigned char *sha1,
uint32_t *timestamp,
- unsigned char **tree,
- unsigned char **parent1,
- unsigned char **parent2)
+ const unsigned char **tree,
+ const unsigned char **parent1,
+ const unsigned char **parent2)
{
struct commit_metapack *p;
prepare_commit_metapacks();
for (p = commit_metapacks; p; p = p->next) {
- unsigned char *data;
- int pos = sha1_entry_pos(p->index, 20, 0, 0, p->nr, p->nr, sha1);
+ struct commit_entry *data;
+ uint32_t p1, p2;
+ unsigned lo, hi, mi;
+ int pos;
+
+ /* sha1_entry_pos does not work with abbreviated sha-1 */
+ lo = 0;
+ hi = p->nr;
+ pos = -1;
+ do {
+ unsigned mi = (lo + hi) / 2;
+ int cmp = memcmp(p->index + mi * p->abbrev_len, sha1, p->abbrev_len);
+
+ if (!cmp) {
+ pos = mi;
+ break;
+ }
+ if (cmp > 0)
+ hi = mi;
+ else
+ lo = mi+1;
+ } while (lo < hi);
if (pos < 0)
continue;
- /* timestamp(4) + tree(20) + parents(40) */
- data = p->data + 64 * pos;
- *timestamp = *(uint32_t *)data;
- *timestamp = ntohl(*timestamp);
- data += 4;
- *tree = data;
- data += 20;
- *parent1 = data;
- data += 20;
- *parent2 = data;
+ data = p->data + pos;
+
+ /* full sha-1 check again */
+ if (hashcmp(nth_packed_object_sha1(p->pack, ntohl(data->commit)), sha1))
+ continue;
+
+ *timestamp = ntohl(data->timestamp);
+ *tree = nth_packed_object_sha1(p->pack, ntohl(data->tree));
+ p1 = ntohl(data->parent1);
+ *parent1 = nth_packed_object_sha1(p->pack, p1);
+ p2 = ntohl(data->parent2);
+ *parent2 = p1 == p2 ? null_sha1 : nth_packed_object_sha1(p->pack, p2);
return 0;
}
@@ -113,13 +148,20 @@ int commit_metapack(unsigned char *sha1,
return -1;
}
+struct write_cb {
+ struct commit_list **tail;
+ int abbrev_len;
+ const unsigned char *last_sha1;
+};
+
static void get_commits(struct metapack_writer *mw,
const unsigned char *sha1,
void *data)
{
- struct commit_list ***tail = data;
+ struct write_cb *write_cb = (struct write_cb *)data;
enum object_type type = sha1_object_info(sha1, NULL);
struct commit *c;
+ int p1, p2;
if (type != OBJ_COMMIT)
return;
@@ -128,47 +170,95 @@ static void get_commits(struct metapack_writer *mw,
if (!c || parse_commit(c))
die("unable to read commit %s", sha1_to_hex(sha1));
+ if (c->buffer) {
+ free(c->buffer);
+ c->buffer = NULL;
+ }
+
/*
* Our fixed-size parent list cannot represent root commits, nor
* octopus merges. Just skip those commits, as we can fallback
* in those rare cases to reading the actual commit object.
*/
if (!c->parents ||
- (c->parents && c->parents->next && c->parents->next->next))
+ (c->parents && c->parents->next && c->parents->next->next) ||
+ /* edge commits are out too */
+ find_pack_entry_pos(c->tree->object.sha1, mw->pack) == -1 ||
+ (p1 = find_pack_entry_pos(c->parents->item->object.sha1, mw->pack)) == -1 ||
+ (c->parents->next &&
+ (p2 = find_pack_entry_pos(c->parents->next->item->object.sha1, mw->pack)) == -1) ||
+ /*
+ * we set the 2nd parent the same as 1st parent as an
+ * indication that 2nd parent does not exist. Normal
+ * commits should never have two same parents, but just in
+ * case..
+ */
+ p1 == p2)
return;
- *tail = &commit_list_insert(c, *tail)->next;
+ /*
+ * Make sure we store the abbr sha-1 long enough to
+ * unambiguously identify any cached commits in the pack.
+ */
+ while (write_cb->abbrev_len < 20 &&
+ write_cb->last_sha1 &&
+ !memcmp(write_cb->last_sha1, sha1, write_cb->abbrev_len))
+ write_cb->abbrev_len++;
+ /*
+ * A bit sensitive to metapack_writer_foreach. "sha1" must not
+ * be changed even after this function exits.
+ */
+ write_cb->last_sha1 = sha1;
+
+ write_cb->tail = &commit_list_insert(c, write_cb->tail)->next;
}
void commit_metapack_write(const char *idx)
{
struct metapack_writer mw;
struct commit_list *commits = NULL, *p;
- struct commit_list **tail = &commits;
+ struct write_cb write_cb;
uint32_t nr = 0;
metapack_writer_init(&mw, idx, "commits", 1);
+ write_cb.tail = &commits;
+ write_cb.abbrev_len = 1;
+ write_cb.last_sha1 = NULL;
+
/* Figure out how many eligible commits we've got in this pack. */
- metapack_writer_foreach(&mw, get_commits, &tail);
+ metapack_writer_foreach(&mw, get_commits, &write_cb);
for (p = commits; p; p = p->next)
nr++;
+
metapack_writer_add_uint32(&mw, nr);
+ metapack_writer_add_uint32(&mw, write_cb.abbrev_len);
/* Then write an index of commit sha1s */
for (p = commits; p; p = p->next)
- metapack_writer_add(&mw, p->item->object.sha1, 20);
+ metapack_writer_add(&mw, p->item->object.sha1, write_cb.abbrev_len);
/* Followed by the actual date/tree/parents data */
for (p = commits; p; p = p->next) {
struct commit *c = p->item;
+ int pos;
+
+ pos = find_pack_entry_pos(c->object.sha1, mw.pack);
+ metapack_writer_add_uint32(&mw, pos);
+
metapack_writer_add_uint32(&mw, c->date);
- metapack_writer_add(&mw, c->tree->object.sha1, 20);
- metapack_writer_add(&mw, c->parents->item->object.sha1, 20);
- metapack_writer_add(&mw,
- c->parents->next ?
- c->parents->next->item->object.sha1 :
- null_sha1, 20);
+
+ pos = find_pack_entry_pos(c->tree->object.sha1, mw.pack);
+ metapack_writer_add_uint32(&mw, pos);
+
+ pos = find_pack_entry_pos(c->parents->item->object.sha1, mw.pack);
+ metapack_writer_add_uint32(&mw, pos);
+
+ if (c->parents->next) {
+ struct object *o = &c->parents->next->item->object;
+ pos = find_pack_entry_pos(o->sha1, mw.pack);
+ }
+ metapack_writer_add_uint32(&mw, pos);
}
metapack_writer_finish(&mw);
diff --git a/commit-metapack.h b/commit-metapack.h
index 4684573..caf85be 100644
--- a/commit-metapack.h
+++ b/commit-metapack.h
@@ -3,9 +3,9 @@
int commit_metapack(unsigned char *sha1,
uint32_t *timestamp,
- unsigned char **tree,
- unsigned char **parent1,
- unsigned char **parent2);
+ const unsigned char **tree,
+ const unsigned char **parent1,
+ const unsigned char **parent2);
void commit_metapack_write(const char *idx_file);
diff --git a/sha1_file.c b/sha1_file.c
index 40b2329..1acab8c 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -1978,8 +1978,8 @@ off_t nth_packed_object_offset(const struct packed_git *p, uint32_t n)
}
}
-off_t find_pack_entry_one(const unsigned char *sha1,
- struct packed_git *p)
+int find_pack_entry_pos(const unsigned char *sha1,
+ struct packed_git *p)
{
const uint32_t *level1_ofs = p->index_data;
const unsigned char *index = p->index_data;
@@ -1992,7 +1992,7 @@ off_t find_pack_entry_one(const unsigned char *sha1,
if (!index) {
if (open_pack_index(p))
- return 0;
+ return -1;
level1_ofs = p->index_data;
index = p->index_data;
}
@@ -2019,9 +2019,7 @@ off_t find_pack_entry_one(const unsigned char *sha1,
if (use_lookup) {
int pos = sha1_entry_pos(index, stride, 0,
lo, hi, p->num_objects, sha1);
- if (pos < 0)
- return 0;
- return nth_packed_object_offset(p, pos);
+ return pos;
}
do {
@@ -2032,15 +2030,24 @@ off_t find_pack_entry_one(const unsigned char *sha1,
printf("lo %u hi %u rg %u mi %u\n",
lo, hi, hi - lo, mi);
if (!cmp)
- return nth_packed_object_offset(p, mi);
+ return mi;
if (cmp > 0)
hi = mi;
else
lo = mi+1;
} while (lo < hi);
- return 0;
+ return -1;
}
+off_t find_pack_entry_one(const unsigned char *sha1,
+ struct packed_git *p)
+{
+ int pos = find_pack_entry_pos(sha1, p);
+ if (pos < 0)
+ return 0;
+ else
+ return nth_packed_object_offset(p, pos);
+}
int is_pack_valid(struct packed_git *p)
{
/* An already open pack is known to be valid. */
-- 8< --
next prev parent reply other threads:[~2013-01-31 11:06 UTC|newest]
Thread overview: 43+ messages / expand[flat|nested] mbox.gz Atom feed top
2013-01-29 9:14 [PATCH/RFC 0/6] commit caching Jeff King
2013-01-29 9:15 ` [PATCH 1/6] csum-file: make sha1write const-correct Jeff King
2013-01-29 9:15 ` [PATCH 2/6] strbuf: add string-chomping functions Jeff King
2013-01-29 10:15 ` Michael Haggerty
2013-01-29 11:10 ` Jeff King
2013-01-30 5:00 ` Michael Haggerty
2013-01-29 9:15 ` [PATCH 3/6] introduce pack metadata cache files Jeff King
2013-01-29 17:35 ` Junio C Hamano
2013-01-30 6:47 ` Jeff King
2013-01-30 1:30 ` Duy Nguyen
2013-01-30 6:50 ` Jeff King
2013-01-29 9:16 ` [PATCH 4/6] introduce a commit metapack Jeff King
2013-01-29 10:24 ` Michael Haggerty
2013-01-29 11:13 ` Jeff King
2013-01-29 17:38 ` Junio C Hamano
2013-01-29 18:08 ` Junio C Hamano
2013-01-30 7:12 ` Jeff King
2013-01-30 7:17 ` Junio C Hamano
2013-02-01 9:21 ` Jeff King
2013-01-30 15:56 ` Junio C Hamano
2013-01-31 17:03 ` Shawn Pearce
2013-02-01 9:42 ` Jeff King
2013-02-02 17:49 ` Junio C Hamano
2013-01-30 7:07 ` Jeff King
2013-01-30 3:36 ` Duy Nguyen
2013-01-30 7:12 ` Jeff King
2013-01-30 13:56 ` Duy Nguyen
2013-01-30 14:16 ` Duy Nguyen
2013-01-31 11:06 ` Duy Nguyen [this message]
2013-02-01 10:15 ` Jeff King
2013-02-02 9:49 ` Duy Nguyen
2013-02-01 10:40 ` Jeff King
2013-03-17 13:21 ` Duy Nguyen
2013-03-18 12:20 ` Jeff King
2013-02-01 10:00 ` Jeff King
2013-01-29 9:16 ` [PATCH 5/6] add git-metapack command Jeff King
2013-01-29 9:16 ` [PATCH 6/6] commit: look up commit info in metapack Jeff King
2013-01-30 3:31 ` [PATCH/RFC 0/6] commit caching Duy Nguyen
2013-01-30 7:18 ` Jeff King
2013-01-30 8:32 ` Duy Nguyen
2013-01-31 17:14 ` Shawn Pearce
2013-02-01 9:11 ` Jeff King
2013-02-02 10:04 ` Shawn Pearce
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20130131110656.GA28093@lanh \
--to=pclouds@gmail.com \
--cc=git@vger.kernel.org \
--cc=peff@peff.net \
--cc=spearce@spearce.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).