From: Duy Nguyen <pclouds@gmail.com>
To: Jeff King <peff@peff.net>
Cc: git@vger.kernel.org, "Shawn O. Pearce" <spearce@spearce.org>
Subject: Re: [PATCH 4/6] introduce a commit metapack
Date: Wed, 30 Jan 2013 20:56:07 +0700 [thread overview]
Message-ID: <20130130135607.GA23154@lanh> (raw)
In-Reply-To: <20130129091610.GD9999@sigill.intra.peff.net>
On Tue, Jan 29, 2013 at 04:16:11AM -0500, Jeff King wrote:
> When we are doing a commit traversal that does not need to
> look at the commit messages themselves (e.g., rev-list,
> merge-base, etc), we spend a lot of time accessing,
> decompressing, and parsing the commit objects just to find
> the parent and timestamp information. We can make a
> space-time tradeoff by caching that information on disk in a
> compact, uncompressed format.
And this is a (messy) patch on top that avoids storing SHA-1
directly. On my linux-2.6.git (575 MB pack, 73 MB index), .commits
file is 5.2 MB and 27 MB with and without my patch respectively. Nice
shrinkage.
However, performance seems to suffer too. Maybe I do more lookups than
necessary, I don't know. I should probably measure the cost of
revindex separately.
git rev-list --all --quiet on vanilla git:
real 0m3.645s
user 0m3.556s
sys 0m0.080s
commit cache without my patch:
real 0m0.723s
user 0m0.677s
sys 0m0.045s
and with my patch:
real 0m1.338s
user 0m1.259s
sys 0m0.075s
Another point, but not really important at this stage, I think we have
memory leak somewhere (lookup_commit??). It used up to 800 MB RES on
linux-2.6.git while generating the cache.
-- 8< --
diff --git a/cache.h b/cache.h
index 1f96f65..8048d5b 100644
--- a/cache.h
+++ b/cache.h
@@ -1069,6 +1069,7 @@ extern struct packed_git *add_packed_git(const char *, int, int);
extern const unsigned char *nth_packed_object_sha1(struct packed_git *, uint32_t);
extern off_t nth_packed_object_offset(const struct packed_git *, uint32_t);
extern off_t find_pack_entry_one(const unsigned char *, struct packed_git *);
+extern int find_pack_entry_pos(const unsigned char *, struct packed_git *);
extern int is_pack_valid(struct packed_git *);
extern void *unpack_entry(struct packed_git *, off_t, enum object_type *, unsigned long *);
extern unsigned long unpack_object_header_buffer(const unsigned char *buf, unsigned long len, enum object_type *type, unsigned long *sizep);
diff --git a/commit-metapack.c b/commit-metapack.c
index 2c19f48..55f7ea9 100644
--- a/commit-metapack.c
+++ b/commit-metapack.c
@@ -3,11 +3,12 @@
#include "metapack.h"
#include "commit.h"
#include "sha1-lookup.h"
+#include "pack-revindex.h"
struct commit_metapack {
struct metapack mp;
- uint32_t nr;
- unsigned char *index;
+ struct packed_git *pack;
+ uint32_t first, last;
unsigned char *data;
struct commit_metapack *next;
};
@@ -16,7 +17,7 @@ static struct commit_metapack *commit_metapacks;
static struct commit_metapack *alloc_commit_metapack(struct packed_git *pack)
{
struct commit_metapack *it = xcalloc(1, sizeof(*it));
- uint32_t version;
+ uint32_t version, nr;
if (metapack_init(&it->mp, pack, "commits", &version) < 0) {
free(it);
@@ -39,22 +40,25 @@ static struct commit_metapack *alloc_commit_metapack(struct packed_git *pack)
free(it);
return NULL;
}
- memcpy(&it->nr, it->mp.data, 4);
- it->nr = ntohl(it->nr);
+ memcpy(&it->first, it->mp.data, 4);
+ it->first = ntohl(it->first);
+ memcpy(&it->last, it->mp.data + 4, 4);
+ it->last = ntohl(it->last);
+ nr = it->last - it->first + 1;
+ it->pack = pack;
/*
- * We need 84 bytes for each entry: sha1(20), date(4), tree(20),
- * parents(40).
+ * We need 16 bytes for each entry: date(4), tree index(4),
+ * parent indexes(8).
*/
- if (it->mp.len < (84 * it->nr + 4)) {
+ if (it->mp.len < (16 * nr + 8)) {
warning("commit metapack for '%s' is truncated", pack->pack_name);
metapack_close(&it->mp);
free(it);
return NULL;
}
- it->index = it->mp.data + 4;
- it->data = it->index + 20 * it->nr;
+ it->data = it->mp.data + 8;
return it;
}
@@ -81,31 +85,61 @@ static void prepare_commit_metapacks(void)
initialized = 1;
}
+static const unsigned char *idx_to_sha1(struct packed_git *p,
+ uint32_t nth)
+{
+ struct revindex_entry *revindex = get_revindex(p);
+ if (!revindex)
+ return NULL;
+ return nth_packed_object_sha1(p, revindex[nth].nr);
+}
+
int commit_metapack(unsigned char *sha1,
uint32_t *timestamp,
- unsigned char **tree,
- unsigned char **parent1,
- unsigned char **parent2)
+ const unsigned char **tree,
+ const unsigned char **parent1,
+ const unsigned char **parent2)
{
struct commit_metapack *p;
prepare_commit_metapacks();
for (p = commit_metapacks; p; p = p->next) {
unsigned char *data;
- int pos = sha1_entry_pos(p->index, 20, 0, 0, p->nr, p->nr, sha1);
- if (pos < 0)
+ uint32_t p1, p2;
+ struct revindex_entry *re, *base;
+ off_t off;
+ uint32_t pos;
+
+ base = get_revindex(p->pack);
+ off = find_pack_entry_one(sha1, p->pack);
+ if (!off)
+ continue;
+ re = find_pack_revindex(p->pack, off);
+ if (!re)
+ continue;
+ pos = re - base;
+ if (pos < p->first || pos > p->last)
continue;
/* timestamp(4) + tree(20) + parents(40) */
- data = p->data + 64 * pos;
+ data = p->data + 16 * (pos - p->first);
*timestamp = *(uint32_t *)data;
*timestamp = ntohl(*timestamp);
+ if (!*timestamp)
+ return -1;
data += 4;
- *tree = data;
- data += 20;
- *parent1 = data;
- data += 20;
- *parent2 = data;
+ *tree = idx_to_sha1(p->pack, ntohl(*(uint32_t*)data));
+ data += 4;
+ p1 = ntohl(*(uint32_t*)data);
+ *parent1 = idx_to_sha1(p->pack, p1);
+ data += 4;
+ p2 = ntohl(*(uint32_t*)data);
+ if (p1 == p2)
+ *parent2 = null_sha1;
+ else
+ *parent2 = idx_to_sha1(p->pack, p2);
+ if (!*tree || !*parent1 || !*parent2)
+ return -1;
return 0;
}
@@ -113,63 +147,114 @@ int commit_metapack(unsigned char *sha1,
return -1;
}
-static void get_commits(struct metapack_writer *mw,
- const unsigned char *sha1,
- void *data)
+static int get_commits(struct metapack_writer *mw,
+ uint32_t nth,
+ int dry_run)
{
- struct commit_list ***tail = data;
+ const unsigned char *sha1 = nth_packed_object_sha1(mw->pack, nth);
enum object_type type = sha1_object_info(sha1, NULL);
struct commit *c;
+ struct revindex_entry *revindex, *ridx;
+ int pt, p1, p2 = -1;
- if (type != OBJ_COMMIT)
- return;
+ if (type != OBJ_COMMIT) {
+ if (dry_run)
+ return -1;
+ metapack_writer_add_uint32(mw, 0); /* date */
+ metapack_writer_add_uint32(mw, 0); /* tree */
+ metapack_writer_add_uint32(mw, 0); /* 1st parent */
+ metapack_writer_add_uint32(mw, 0); /* 2nd tree */
+ return 0;
+ }
c = lookup_commit(sha1);
if (!c || parse_commit(c))
die("unable to read commit %s", sha1_to_hex(sha1));
+ if (c->buffer) {
+ free(c->buffer);
+ c->buffer = NULL;
+ }
+
/*
* Our fixed-size parent list cannot represent root commits, nor
* octopus merges. Just skip those commits, as we can fallback
* in those rare cases to reading the actual commit object.
*/
if (!c->parents ||
- (c->parents && c->parents->next && c->parents->next->next))
- return;
+ (c->parents && c->parents->next && c->parents->next->next) ||
+ /* edge commits are out too */
+ (pt = find_pack_entry_pos(c->tree->object.sha1, mw->pack)) == -1 ||
+ (p1 = find_pack_entry_pos(c->parents->item->object.sha1, mw->pack)) == -1 ||
+ (c->parents->next &&
+ (p2 = find_pack_entry_pos(c->parents->next->item->object.sha1, mw->pack)) == -1) ||
+ /*
+ * we set the 2nd parent the same as 1st parent as an
+ * indication that 2nd parent does not exist. Normal
+ * commits should never have two same parents, but just in
+ * case..
+ */
+ p1 == p2 ||
+ /* zero date is reserved to say this is not a valid entry */
+ c->date == 0) {
+ if (dry_run)
+ return -1;
+ metapack_writer_add_uint32(mw, 0); /* date */
+ metapack_writer_add_uint32(mw, 0); /* tree */
+ metapack_writer_add_uint32(mw, 0); /* 1st parent */
+ metapack_writer_add_uint32(mw, 0); /* 2nd tree */
+ return 0;
+ }
+
+ if (dry_run)
+ return 0;
+
+ revindex = get_revindex(mw->pack);
- *tail = &commit_list_insert(c, *tail)->next;
+ metapack_writer_add_uint32(mw, c->date);
+ ridx = find_pack_revindex(mw->pack,
+ nth_packed_object_offset(mw->pack, pt));
+ metapack_writer_add_uint32(mw, ridx - revindex);
+ ridx = find_pack_revindex(mw->pack,
+ nth_packed_object_offset(mw->pack, p1));
+ metapack_writer_add_uint32(mw, ridx - revindex);
+ if (p2 != -1)
+ ridx = find_pack_revindex(mw->pack,
+ nth_packed_object_offset(mw->pack, p2));
+ metapack_writer_add_uint32(mw, ridx - revindex);
+ return 0;
}
void commit_metapack_write(const char *idx)
{
struct metapack_writer mw;
- struct commit_list *commits = NULL, *p;
- struct commit_list **tail = &commits;
- uint32_t nr = 0;
+ uint32_t i, first = 0xffffffff, last = 0;
+ struct revindex_entry *revidx;
metapack_writer_init(&mw, idx, "commits", 1);
- /* Figure out how many eligible commits we've got in this pack. */
- metapack_writer_foreach(&mw, get_commits, &tail);
- for (p = commits; p; p = p->next)
- nr++;
- metapack_writer_add_uint32(&mw, nr);
+ packed_git = mw.pack;
+ revidx = get_revindex(mw.pack);
- /* Then write an index of commit sha1s */
- for (p = commits; p; p = p->next)
- metapack_writer_add(&mw, p->item->object.sha1, 20);
+ /*
+ * Figure out how many eligible commits we've got in this pack.
+ */
+ for (i = 0; i < mw.pack->num_objects; i++) {
+ int ret = get_commits(&mw, revidx[i].nr, 1);
+ if (ret == -1) /* not cached */
+ continue;
+ if (i < first)
+ first = i;
+ if (i > last)
+ last = i;
+ }
+
+ metapack_writer_add_uint32(&mw, first);
+ metapack_writer_add_uint32(&mw, last);
/* Followed by the actual date/tree/parents data */
- for (p = commits; p; p = p->next) {
- struct commit *c = p->item;
- metapack_writer_add_uint32(&mw, c->date);
- metapack_writer_add(&mw, c->tree->object.sha1, 20);
- metapack_writer_add(&mw, c->parents->item->object.sha1, 20);
- metapack_writer_add(&mw,
- c->parents->next ?
- c->parents->next->item->object.sha1 :
- null_sha1, 20);
- }
+ for (i = first; i <= last; i++)
+ get_commits(&mw, revidx[i].nr, 0);
metapack_writer_finish(&mw);
}
diff --git a/commit-metapack.h b/commit-metapack.h
index 4684573..caf85be 100644
--- a/commit-metapack.h
+++ b/commit-metapack.h
@@ -3,9 +3,9 @@
int commit_metapack(unsigned char *sha1,
uint32_t *timestamp,
- unsigned char **tree,
- unsigned char **parent1,
- unsigned char **parent2);
+ const unsigned char **tree,
+ const unsigned char **parent1,
+ const unsigned char **parent2);
void commit_metapack_write(const char *idx_file);
diff --git a/pack-revindex.c b/pack-revindex.c
index 77a0465..d58dd02 100644
--- a/pack-revindex.c
+++ b/pack-revindex.c
@@ -111,12 +111,10 @@ static void create_pack_revindex(struct pack_revindex *rix)
qsort(rix->revindex, num_ent, sizeof(*rix->revindex), cmp_offset);
}
-struct revindex_entry *find_pack_revindex(struct packed_git *p, off_t ofs)
+struct revindex_entry *get_revindex(struct packed_git *p)
{
int num;
- int lo, hi;
struct pack_revindex *rix;
- struct revindex_entry *revindex;
if (!pack_revindex_hashsz)
init_pack_revindex();
@@ -127,7 +125,13 @@ struct revindex_entry *find_pack_revindex(struct packed_git *p, off_t ofs)
rix = &pack_revindex[num];
if (!rix->revindex)
create_pack_revindex(rix);
- revindex = rix->revindex;
+ return rix->revindex;
+}
+
+struct revindex_entry *find_pack_revindex(struct packed_git *p, off_t ofs)
+{
+ int lo, hi;
+ struct revindex_entry *revindex = get_revindex(p);
lo = 0;
hi = p->num_objects + 1;
diff --git a/pack-revindex.h b/pack-revindex.h
index 8d5027a..cea85db 100644
--- a/pack-revindex.h
+++ b/pack-revindex.h
@@ -6,6 +6,7 @@ struct revindex_entry {
unsigned int nr;
};
+struct revindex_entry *get_revindex(struct packed_git *p);
struct revindex_entry *find_pack_revindex(struct packed_git *p, off_t ofs);
void discard_revindex(void);
diff --git a/sha1_file.c b/sha1_file.c
index 40b2329..1acab8c 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -1978,8 +1978,8 @@ off_t nth_packed_object_offset(const struct packed_git *p, uint32_t n)
}
}
-off_t find_pack_entry_one(const unsigned char *sha1,
- struct packed_git *p)
+int find_pack_entry_pos(const unsigned char *sha1,
+ struct packed_git *p)
{
const uint32_t *level1_ofs = p->index_data;
const unsigned char *index = p->index_data;
@@ -1992,7 +1992,7 @@ off_t find_pack_entry_one(const unsigned char *sha1,
if (!index) {
if (open_pack_index(p))
- return 0;
+ return -1;
level1_ofs = p->index_data;
index = p->index_data;
}
@@ -2019,9 +2019,7 @@ off_t find_pack_entry_one(const unsigned char *sha1,
if (use_lookup) {
int pos = sha1_entry_pos(index, stride, 0,
lo, hi, p->num_objects, sha1);
- if (pos < 0)
- return 0;
- return nth_packed_object_offset(p, pos);
+ return pos;
}
do {
@@ -2032,15 +2030,24 @@ off_t find_pack_entry_one(const unsigned char *sha1,
printf("lo %u hi %u rg %u mi %u\n",
lo, hi, hi - lo, mi);
if (!cmp)
- return nth_packed_object_offset(p, mi);
+ return mi;
if (cmp > 0)
hi = mi;
else
lo = mi+1;
} while (lo < hi);
- return 0;
+ return -1;
}
+off_t find_pack_entry_one(const unsigned char *sha1,
+ struct packed_git *p)
+{
+ int pos = find_pack_entry_pos(sha1, p);
+ if (pos < 0)
+ return 0;
+ else
+ return nth_packed_object_offset(p, pos);
+}
int is_pack_valid(struct packed_git *p)
{
/* An already open pack is known to be valid. */
--
1.8.1.1.459.g5970e58
-- 8< --
next prev parent reply other threads:[~2013-01-30 13:56 UTC|newest]
Thread overview: 43+ messages / expand[flat|nested] mbox.gz Atom feed top
2013-01-29 9:14 [PATCH/RFC 0/6] commit caching Jeff King
2013-01-29 9:15 ` [PATCH 1/6] csum-file: make sha1write const-correct Jeff King
2013-01-29 9:15 ` [PATCH 2/6] strbuf: add string-chomping functions Jeff King
2013-01-29 10:15 ` Michael Haggerty
2013-01-29 11:10 ` Jeff King
2013-01-30 5:00 ` Michael Haggerty
2013-01-29 9:15 ` [PATCH 3/6] introduce pack metadata cache files Jeff King
2013-01-29 17:35 ` Junio C Hamano
2013-01-30 6:47 ` Jeff King
2013-01-30 1:30 ` Duy Nguyen
2013-01-30 6:50 ` Jeff King
2013-01-29 9:16 ` [PATCH 4/6] introduce a commit metapack Jeff King
2013-01-29 10:24 ` Michael Haggerty
2013-01-29 11:13 ` Jeff King
2013-01-29 17:38 ` Junio C Hamano
2013-01-29 18:08 ` Junio C Hamano
2013-01-30 7:12 ` Jeff King
2013-01-30 7:17 ` Junio C Hamano
2013-02-01 9:21 ` Jeff King
2013-01-30 15:56 ` Junio C Hamano
2013-01-31 17:03 ` Shawn Pearce
2013-02-01 9:42 ` Jeff King
2013-02-02 17:49 ` Junio C Hamano
2013-01-30 7:07 ` Jeff King
2013-01-30 3:36 ` Duy Nguyen
2013-01-30 7:12 ` Jeff King
2013-01-30 13:56 ` Duy Nguyen [this message]
2013-01-30 14:16 ` Duy Nguyen
2013-01-31 11:06 ` Duy Nguyen
2013-02-01 10:15 ` Jeff King
2013-02-02 9:49 ` Duy Nguyen
2013-02-01 10:40 ` Jeff King
2013-03-17 13:21 ` Duy Nguyen
2013-03-18 12:20 ` Jeff King
2013-02-01 10:00 ` Jeff King
2013-01-29 9:16 ` [PATCH 5/6] add git-metapack command Jeff King
2013-01-29 9:16 ` [PATCH 6/6] commit: look up commit info in metapack Jeff King
2013-01-30 3:31 ` [PATCH/RFC 0/6] commit caching Duy Nguyen
2013-01-30 7:18 ` Jeff King
2013-01-30 8:32 ` Duy Nguyen
2013-01-31 17:14 ` Shawn Pearce
2013-02-01 9:11 ` Jeff King
2013-02-02 10:04 ` Shawn Pearce
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20130130135607.GA23154@lanh \
--to=pclouds@gmail.com \
--cc=git@vger.kernel.org \
--cc=peff@peff.net \
--cc=spearce@spearce.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).