From: "Nguyễn Thái Ngọc Duy" <pclouds@gmail.com>
To: git@vger.kernel.org
Cc: "Martin Fick" <mfick@codeaurora.org>,
"Nguyễn Thái Ngọc Duy" <pclouds@gmail.com>
Subject: [PATCH 2/3] Add commit cache to help speed up commit traversal
Date: Tue, 3 Apr 2012 13:55:08 +0700 [thread overview]
Message-ID: <1333436109-16526-3-git-send-email-pclouds@gmail.com> (raw)
In-Reply-To: <53707c0a-3782-47a4-8a35-da7136ff4822@email.android.com>
This patch extracts tree, parent sha-1 and committer date of packed
commits that have exactly one parent out. So that revision traversal
code can avoid inflating commit for these sha-1. The assumption is
commits with one parent dominate.
Two new files are created per pack: one file contains sha-1 and dates.
The other file contains commit sha-1 mapping to the offset in the former
file.
Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
Makefile | 3 +
builtin/index-pack.c | 113 ++++++++++++++++++++++++++++++++++-
cache.h | 9 +++
pack-write.c | 11 +++-
pack.h | 1 +
sha1_cache.c | 161 ++++++++++++++++++++++++++++++++++++++++++++++++++
sha1_cache.h | 6 ++
sha1_file.c | 12 ++++-
test-sha1-cache.c | 19 ++++++
9 files changed, 331 insertions(+), 4 deletions(-)
create mode 100644 sha1_cache.c
create mode 100644 sha1_cache.h
create mode 100644 test-sha1-cache.c
diff --git a/Makefile b/Makefile
index 87fb30a..fced758 100644
--- a/Makefile
+++ b/Makefile
@@ -481,6 +481,7 @@ TEST_PROGRAMS_NEED_X += test-sha1
TEST_PROGRAMS_NEED_X += test-sigchain
TEST_PROGRAMS_NEED_X += test-subprocess
TEST_PROGRAMS_NEED_X += test-svn-fe
+TEST_PROGRAMS_NEED_X += test-sha1-cache
TEST_PROGRAMS = $(patsubst %,%$X,$(TEST_PROGRAMS_NEED_X))
@@ -606,6 +607,7 @@ LIB_H += run-command.h
LIB_H += sequencer.h
LIB_H += sha1-array.h
LIB_H += sha1-lookup.h
+LIB_H += sha1_cache.h
LIB_H += sideband.h
LIB_H += sigchain.h
LIB_H += strbuf.h
@@ -722,6 +724,7 @@ LIB_OBJS += setup.o
LIB_OBJS += sequencer.o
LIB_OBJS += sha1-array.o
LIB_OBJS += sha1-lookup.o
+LIB_OBJS += sha1_cache.o
LIB_OBJS += sha1_file.o
LIB_OBJS += sha1_name.o
LIB_OBJS += shallow.o
diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index dd1c5c9..67673ef 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -15,6 +15,7 @@ static const char index_pack_usage[] =
struct object_entry {
struct pack_idx_entry idx;
+ off_t commit_offset;
unsigned long size;
unsigned int hdr_size;
enum object_type type;
@@ -63,6 +64,7 @@ static int nr_resolved_deltas;
static int from_stdin;
static int strict;
static int verbose;
+static int write_cache = 1;
static struct progress *progress;
@@ -75,6 +77,13 @@ static git_SHA_CTX input_ctx;
static uint32_t input_crc32;
static int input_fd, output_fd, pack_fd;
+static int cache_fd;
+static int cache_written;
+static uint32_t cache_nr;
+static char cache_filename[PATH_MAX];
+static const char *cache_idxname;
+static unsigned char cache_sha1[20];
+
static int mark_link(struct object *obj, int type, void *data)
{
if (!obj)
@@ -682,6 +691,58 @@ static int compare_delta_entry(const void *a, const void *b)
objects[delta_b->obj_no].type);
}
+static unsigned long parse_commit_date(const char *buf, const char *tail)
+{
+ const char *dateptr;
+
+ if (buf + 6 >= tail)
+ return 0;
+ if (memcmp(buf, "author", 6))
+ return 0;
+ while (buf < tail && *buf++ != '\n')
+ /* nada */;
+ if (buf + 9 >= tail)
+ return 0;
+ if (memcmp(buf, "committer", 9))
+ return 0;
+ while (buf < tail && *buf++ != '>')
+ /* nada */;
+ if (buf >= tail)
+ return 0;
+ dateptr = buf;
+ while (buf < tail && *buf++ != '\n')
+ /* nada */;
+ if (buf >= tail)
+ return 0;
+ /* dateptr < buf && buf[-1] == '\n', so strtoul will stop at buf-1 */
+ return strtoul(dateptr, NULL, 10);
+}
+
+static void write_commit_sha1(struct object_entry *obj, const char *data)
+{
+ unsigned char sha1[20];
+ if (!obj->commit_offset &&
+ !memcmp(data, "tree ", 5) &&
+ !memcmp(data + 45, "\nparent ", 8) &&
+ memcmp(data + 45 + 48, "\nparent ", 8)) {
+ uint32_t date;
+ obj->commit_offset = cache_written;
+ cache_nr++;
+
+ get_sha1_hex(data + 5, sha1);
+ cache_written += xwrite(cache_fd, sha1, 20);
+ get_sha1_hex(data + 5 + 48, sha1);
+ cache_written += xwrite(cache_fd, sha1, 20);
+ date = parse_commit_date(data + 5 + 48 + 41, data + obj->size);
+ date = htonl(date);
+ cache_written += xwrite(cache_fd, &date, 4);
+ }
+#if 0
+ hashclr(sha1);
+ cache_written += xwrite(cache_fd, sha1, 20);
+#endif
+}
+
/* Parse all objects and return the pack content SHA1 hash */
static void parse_pack_objects(unsigned char *sha1)
{
@@ -707,8 +768,13 @@ static void parse_pack_objects(unsigned char *sha1)
nr_deltas++;
delta->obj_no = i;
delta++;
- } else
+ } else {
sha1_object(data, obj->size, obj->type, obj->idx.sha1);
+
+ /* We assume that commits are never deltified */
+ if (write_cache && obj->real_type == OBJ_COMMIT)
+ write_commit_sha1(obj, data);
+ }
free(data);
display_progress(progress, i+1);
}
@@ -919,6 +985,18 @@ static void final(const char *final_pack_name, const char *curr_pack_name,
} else if (from_stdin)
chmod(final_pack_name, 0444);
+ if (write_cache) {
+ snprintf(name, sizeof(name), "%s/pack/pack-%s.sha1",
+ get_object_directory(), sha1_to_hex(sha1));
+ if (move_temp_to_file(cache_filename, name))
+ die("cannot store commit cache file");
+
+ snprintf(name, sizeof(name), "%s/pack/pack-%s.sidx",
+ get_object_directory(), sha1_to_hex(sha1));
+ if (move_temp_to_file(cache_idxname, name))
+ die("cannot store commit cache file");
+ }
+
if (final_index_name != curr_index_name) {
if (!final_index_name) {
snprintf(name, sizeof(name), "%s/pack/pack-%s.idx",
@@ -1120,6 +1198,8 @@ int cmd_index_pack(int argc, const char **argv, const char *prefix)
keep_msg = "";
} else if (!prefixcmp(arg, "--keep=")) {
keep_msg = arg + 7;
+ } else if (!strcmp(arg, "--no-cache")) {
+ write_cache = 0;
} else if (!prefixcmp(arg, "--pack_header=")) {
struct pack_header *hdr;
char *c;
@@ -1191,6 +1271,17 @@ int cmd_index_pack(int argc, const char **argv, const char *prefix)
if (strict)
opts.flags |= WRITE_IDX_STRICT;
+ if (verify)
+ write_cache = 0;
+ if (write_cache) {
+ unsigned char sha1[20];
+ cache_fd = odb_mkstemp(cache_filename,
+ sizeof(cache_filename),
+ "pack/tmp_sha1_XXXXXX");
+ hashclr(sha1);
+ cache_written = xwrite(cache_fd, sha1, 20);
+ }
+
curr_pack = open_pack_file(pack_name);
parse_pack_header();
objects = xcalloc(nr_objects + 1, sizeof(struct object_entry));
@@ -1243,6 +1334,26 @@ int cmd_index_pack(int argc, const char **argv, const char *prefix)
curr_index = write_idx_file(index_name, idx_objects, nr_objects, &opts, pack_sha1);
free(idx_objects);
+ if (write_cache) {
+ int nr;
+ struct pack_idx_entry **idx;
+
+ close(cache_fd);
+
+ idx = idx_objects = xmalloc((cache_nr) * sizeof(struct pack_idx_entry *));
+ for (nr = i = 0; i < nr_objects; i++) {
+ if (objects[i].commit_offset) {
+ objects[i].idx.offset = objects[i].commit_offset;
+ *idx++ = &objects[i].idx;
+ nr++;
+ }
+ }
+ assert(nr == cache_nr);
+ opts.flags |= WRITE_IDX_SHA1_CACHE;
+ cache_idxname = write_idx_file(NULL, idx_objects, cache_nr, &opts, cache_sha1);
+ free(idx_objects);
+ }
+
if (!verify)
final(pack_name, curr_pack,
index_name, curr_index,
diff --git a/cache.h b/cache.h
index 9bd8c2d..fd3953f 100644
--- a/cache.h
+++ b/cache.h
@@ -972,6 +972,14 @@ struct pack_window {
unsigned int inuse_cnt;
};
+struct sha1_cache {
+ const void *idx;
+ const void *data;
+ size_t idx_size;
+ size_t data_size;
+ uint32_t nr;
+};
+
extern struct packed_git {
struct packed_git *next;
struct pack_window *windows;
@@ -984,6 +992,7 @@ extern struct packed_git {
int index_version;
time_t mtime;
int pack_fd;
+ struct sha1_cache commit_cache;
unsigned pack_local:1,
pack_keep:1,
do_not_close:1;
diff --git a/pack-write.c b/pack-write.c
index ca9e63b..6da977a 100644
--- a/pack-write.c
+++ b/pack-write.c
@@ -85,8 +85,11 @@ const char *write_idx_file(const char *index_name, struct pack_idx_entry **objec
f = sha1fd(fd, index_name);
}
- /* if last object's offset is >= 2^31 we should use index V2 */
- index_version = need_large_offset(last_obj_offset, opts) ? 2 : opts->version;
+ if (opts->flags & WRITE_IDX_SHA1_CACHE)
+ index_version = 2;
+ else
+ /* if last object's offset is >= 2^31 we should use index V2 */
+ index_version = need_large_offset(last_obj_offset, opts) ? 2 : opts->version;
/* index versions 2 and above need a header */
if (index_version >= 2) {
@@ -138,6 +141,9 @@ const char *write_idx_file(const char *index_name, struct pack_idx_entry **objec
if (index_version >= 2) {
unsigned int nr_large_offset = 0;
+ if (opts->flags & WRITE_IDX_SHA1_CACHE)
+ goto skip_crc32;
+
/* write the crc32 table */
list = sorted_by_sha;
for (i = 0; i < nr_objects; i++) {
@@ -146,6 +152,7 @@ const char *write_idx_file(const char *index_name, struct pack_idx_entry **objec
sha1write(f, &crc32_val, 4);
}
+skip_crc32:
/* write the 32-bit offset table */
list = sorted_by_sha;
for (i = 0; i < nr_objects; i++) {
diff --git a/pack.h b/pack.h
index aa6ee7d..f002b6a 100644
--- a/pack.h
+++ b/pack.h
@@ -40,6 +40,7 @@ struct pack_idx_option {
/* flag bits */
#define WRITE_IDX_VERIFY 01 /* verify only, do not write the idx file */
#define WRITE_IDX_STRICT 02
+#define WRITE_IDX_SHA1_CACHE 04
uint32_t version;
uint32_t off32_limit;
diff --git a/sha1_cache.c b/sha1_cache.c
new file mode 100644
index 0000000..3f244bb
--- /dev/null
+++ b/sha1_cache.c
@@ -0,0 +1,161 @@
+#include "cache.h"
+#include "pack.h"
+#include "sha1_cache.h"
+
+static int git_open_noatime(const char *name)
+{
+ static int sha1_file_open_flag = O_NOATIME;
+
+ for (;;) {
+ int fd = open(name, O_RDONLY | sha1_file_open_flag);
+ if (fd >= 0)
+ return fd;
+
+ /* Might the failure be due to O_NOATIME? */
+ if (errno != ENOENT && sha1_file_open_flag) {
+ sha1_file_open_flag = 0;
+ continue;
+ }
+
+ return -1;
+ }
+}
+
+/*
+ * Commit cache format is basically pack index v2 except that:
+ * - no crc32 table
+ * - no large offset support
+ * - offset table contains offset in _this_ file, in the commit
+ * cache section
+ * - the commit cache section follows offset table, each entry
+ * consists of tree sha1, parent sha1 if any, and terminated
+ * by null sha-1.
+ */
+
+int open_sha1_cache(struct sha1_cache *cache,
+ const char *data_path, const char *idx_path)
+{
+ void *idx_map, *data;
+ struct pack_idx_header *hdr;
+ size_t idx_size, data_size;
+ uint32_t nr, i, *index;
+ int fd = git_open_noatime(idx_path);
+ struct stat st;
+
+ if (fd < 0)
+ return -1;
+ if (fstat(fd, &st)) {
+ close(fd);
+ return -1;
+ }
+ idx_size = xsize_t(st.st_size);
+ if (idx_size < 4 * 256 + 20 + 20) {
+ close(fd);
+ return error("index file %s is too small", idx_path);
+ }
+ idx_map = xmmap(NULL, idx_size, PROT_READ, MAP_PRIVATE, fd, 0);
+ close(fd);
+
+ fd = git_open_noatime(data_path);
+ if (fd < 0)
+ return -1;
+ if (fstat(fd, &st)) {
+ close(fd);
+ return -1;
+ }
+ data_size = xsize_t(st.st_size);
+ data = xmmap(NULL, data_size, PROT_READ, MAP_PRIVATE, fd, 0);
+ close(fd);
+
+ hdr = idx_map;
+ if (hdr->idx_signature != htonl(PACK_IDX_SIGNATURE))
+ return error("not a commit cache file %s", idx_path);
+ if (ntohl(hdr->idx_version) != 2)
+ return error("wrong version %s %d", idx_path, ntohl(hdr->idx_version));
+
+ nr = 0;
+ index = idx_map;
+ index += 2; /* skip index header */
+ for (i = 0; i < 256; i++) {
+ uint32_t n = ntohl(index[i]);
+ if (n < nr) {
+ munmap(idx_map, idx_size);
+ return error("non-monotonic index %s", idx_path);
+ }
+ nr = n;
+ }
+
+ cache->idx = idx_map;
+ cache->idx_size = idx_size;
+ cache->nr = nr;
+ cache->data = data;
+ cache->data_size = data_size;
+ return 0;
+}
+
+static const void *object_offset(const struct sha1_cache *cache, uint32_t n)
+{
+ const unsigned char *index = cache->idx;
+ uint32_t off;
+ index += 4 * 256;
+ index += 8 + cache->nr * 20;
+ off = ntohl(*((uint32_t *)(index + 4 * n)));
+ return (const char *)cache->data + off;
+}
+
+const void *find_sha1_in_cache(const unsigned char *sha1)
+{
+ const uint32_t *level1_ofs;
+ const unsigned char *index;
+ unsigned hi, lo, stride;
+
+ struct packed_git *p = find_sha1_pack(sha1, packed_git);
+ if (!p)
+ return 0;
+
+ open_pack_index(p);
+ if (!p->commit_cache.nr)
+ return 0;
+
+ level1_ofs = p->commit_cache.idx;
+ index = p->commit_cache.idx;
+
+ /* skip header */
+ level1_ofs += 2;
+ index += 8;
+
+ /* fanout table */
+ index += 4 * 256;
+ hi = ntohl(level1_ofs[*sha1]);
+ lo = ((*sha1 == 0x0) ? 0 : ntohl(level1_ofs[*sha1 - 1]));
+ stride = 20;
+
+ do {
+ unsigned mi = (lo + hi) / 2;
+ int cmp = hashcmp(index + mi * stride, sha1);
+
+ if (!cmp)
+ return object_offset(&p->commit_cache, mi);
+ if (cmp > 0)
+ hi = mi;
+ else
+ lo = mi+1;
+ } while (lo < hi);
+ return NULL;
+}
+
+int has_commit_cache(const unsigned char *sha1,
+ unsigned char *tree,
+ unsigned char *parent,
+ unsigned long *date)
+{
+ const unsigned char *data;
+ data = find_sha1_in_cache(sha1);
+ if (!data)
+ return 0;
+
+ hashcpy(tree, data);
+ hashcpy(parent, data + 20);
+ *date = ntohl(*((uint32_t*)(data + 40)));
+ return 1;
+}
diff --git a/sha1_cache.h b/sha1_cache.h
new file mode 100644
index 0000000..59823cf
--- /dev/null
+++ b/sha1_cache.h
@@ -0,0 +1,6 @@
+extern int open_sha1_cache(struct sha1_cache *cache,
+ const char *data_path, const char *idx_path);
+extern const void *find_sha1_in_cache(const unsigned char *sha1);
+extern int has_commit_cache(const unsigned char *sha1,
+ unsigned char *tree, unsigned char *parent,
+ unsigned long *date);
diff --git a/sha1_file.c b/sha1_file.c
index 88f2151..fc2f460 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -19,6 +19,7 @@
#include "pack-revindex.h"
#include "sha1-lookup.h"
#include "bulk-checkin.h"
+#include "sha1_cache.h"
#ifndef O_NOATIME
#if defined(__linux__) && (defined(__i386__) || defined(__PPC__))
@@ -578,14 +579,23 @@ static int check_packed_git_idx(const char *path, struct packed_git *p)
int open_pack_index(struct packed_git *p)
{
char *idx_name;
+ int baselen = strlen(p->pack_name) - strlen(".pack");
int ret;
if (p->index_data)
return 0;
idx_name = xstrdup(p->pack_name);
- strcpy(idx_name + strlen(idx_name) - strlen(".pack"), ".idx");
+ strcpy(idx_name + baselen, ".idx");
ret = check_packed_git_idx(idx_name, p);
+ if (!ret) {
+ char *cache_name;
+ cache_name = xstrdup(p->pack_name);
+ strcpy(idx_name + baselen, ".sidx");
+ strcpy(cache_name + baselen, ".sha1");
+ open_sha1_cache(&p->commit_cache, cache_name, idx_name);
+ free(cache_name);
+ }
free(idx_name);
return ret;
}
diff --git a/test-sha1-cache.c b/test-sha1-cache.c
new file mode 100644
index 0000000..7b7f32c
--- /dev/null
+++ b/test-sha1-cache.c
@@ -0,0 +1,19 @@
+#include "cache.h"
+#include "sha1_cache.h"
+
+int main(int argc, char **argv)
+{
+ unsigned char sha1[20];
+ unsigned char tree[20];
+ unsigned char parent[20];
+ unsigned long date;
+
+ setup_git_directory();
+ prepare_packed_git();
+ get_sha1_hex(argv[1], sha1);
+ if (!has_commit_cache(sha1, tree, parent, &date))
+ return 1;
+ printf("tree %s\nparent %s\ndate %ld\n",
+ sha1_to_hex(tree), sha1_to_hex(parent), date);
+ return 0;
+}
--
1.7.3.1.256.g2539c.dirty
next prev parent reply other threads:[~2012-04-03 6:56 UTC|newest]
Thread overview: 37+ messages / expand[flat|nested] mbox.gz Atom feed top
2012-03-30 0:18 Git push performance problems with ~100K refs Martin Fick
2012-03-30 2:12 ` Junio C Hamano
2012-03-30 2:43 ` Martin Fick
2012-03-30 9:32 ` Jeff King
2012-03-30 9:40 ` Jeff King
2012-03-30 14:22 ` Martin Fick
2012-03-31 22:10 ` [PATCH 1/3] add mergesort() for linked lists René Scharfe
2012-04-05 19:17 ` Junio C Hamano
2012-04-08 20:32 ` René Scharfe
2012-04-09 18:26 ` Junio C Hamano
2012-04-11 6:19 ` Stephen Boyd
2012-04-11 16:44 ` Junio C Hamano
2012-03-31 22:10 ` [PATCH 2/3] commit: use mergesort() in commit_list_sort_by_date() René Scharfe
2012-03-31 22:11 ` [PATCH 3/3] revision: insert unsorted, then sort in prepare_revision_walk() René Scharfe
2012-03-31 22:36 ` Martin Fick
2012-03-31 23:45 ` Junio C Hamano
2012-04-02 16:24 ` Martin Fick
2012-04-02 16:39 ` Shawn Pearce
2012-04-02 16:49 ` Martin Fick
2012-04-02 16:51 ` Shawn Pearce
2012-04-02 20:37 ` Jeff King
2012-04-02 20:51 ` Jeff King
2012-04-02 23:16 ` Martin Fick
2012-04-03 3:49 ` Nguyen Thai Ngoc Duy
2012-04-03 5:55 ` Martin Fick
2012-04-03 6:55 ` [PATCH 0/3] Commit cache Nguyễn Thái Ngọc Duy
2012-04-03 6:55 ` [PATCH 1/3] parse_commit_buffer: rename a confusing variable name Nguyễn Thái Ngọc Duy
2012-04-03 6:55 ` Nguyễn Thái Ngọc Duy [this message]
2012-04-03 6:55 ` [PATCH 3/3] Add parse_commit_for_rev() to take advantage of sha1-cache Nguyễn Thái Ngọc Duy
2012-04-05 13:02 ` [PATCH 3/3] revision: insert unsorted, then sort in prepare_revision_walk() Nguyen Thai Ngoc Duy
2012-04-06 19:21 ` Shawn Pearce
2012-04-07 4:20 ` Nguyen Thai Ngoc Duy
2012-04-03 3:44 ` Nguyen Thai Ngoc Duy
2012-04-02 20:14 ` Jeff King
2012-04-02 22:54 ` René Scharfe
2012-04-03 8:40 ` Jeff King
2012-04-03 9:19 ` Jeff King
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1333436109-16526-3-git-send-email-pclouds@gmail.com \
--to=pclouds@gmail.com \
--cc=git@vger.kernel.org \
--cc=mfick@codeaurora.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).