All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Nguyễn Thái Ngọc Duy" <pclouds@gmail.com>
To: git@vger.kernel.org
Cc: "Martin Fick" <mfick@codeaurora.org>,
	"Nguyễn Thái Ngọc Duy" <pclouds@gmail.com>
Subject: [PATCH 2/3] Add commit cache to help speed up commit traversal
Date: Tue,  3 Apr 2012 13:55:08 +0700	[thread overview]
Message-ID: <1333436109-16526-3-git-send-email-pclouds@gmail.com> (raw)
In-Reply-To: <53707c0a-3782-47a4-8a35-da7136ff4822@email.android.com>

This patch extracts tree, parent sha-1 and committer date of packed
commits that have exactly one parent out. So that revision traversal
code can avoid inflating commit for these sha-1. The assumption is
commits with one parent dominate.

Two new files are created per pack: one file contains sha-1 and dates.
The other file contains commit sha-1 mapping to the offset in the former
file.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 Makefile             |    3 +
 builtin/index-pack.c |  113 ++++++++++++++++++++++++++++++++++-
 cache.h              |    9 +++
 pack-write.c         |   11 +++-
 pack.h               |    1 +
 sha1_cache.c         |  161 ++++++++++++++++++++++++++++++++++++++++++++++++++
 sha1_cache.h         |    6 ++
 sha1_file.c          |   12 ++++-
 test-sha1-cache.c    |   19 ++++++
 9 files changed, 331 insertions(+), 4 deletions(-)
 create mode 100644 sha1_cache.c
 create mode 100644 sha1_cache.h
 create mode 100644 test-sha1-cache.c

diff --git a/Makefile b/Makefile
index 87fb30a..fced758 100644
--- a/Makefile
+++ b/Makefile
@@ -481,6 +481,7 @@ TEST_PROGRAMS_NEED_X += test-sha1
 TEST_PROGRAMS_NEED_X += test-sigchain
 TEST_PROGRAMS_NEED_X += test-subprocess
 TEST_PROGRAMS_NEED_X += test-svn-fe
+TEST_PROGRAMS_NEED_X += test-sha1-cache
 
 TEST_PROGRAMS = $(patsubst %,%$X,$(TEST_PROGRAMS_NEED_X))
 
@@ -606,6 +607,7 @@ LIB_H += run-command.h
 LIB_H += sequencer.h
 LIB_H += sha1-array.h
 LIB_H += sha1-lookup.h
+LIB_H += sha1_cache.h
 LIB_H += sideband.h
 LIB_H += sigchain.h
 LIB_H += strbuf.h
@@ -722,6 +724,7 @@ LIB_OBJS += setup.o
 LIB_OBJS += sequencer.o
 LIB_OBJS += sha1-array.o
 LIB_OBJS += sha1-lookup.o
+LIB_OBJS += sha1_cache.o
 LIB_OBJS += sha1_file.o
 LIB_OBJS += sha1_name.o
 LIB_OBJS += shallow.o
diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index dd1c5c9..67673ef 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -15,6 +15,7 @@ static const char index_pack_usage[] =
 
 struct object_entry {
 	struct pack_idx_entry idx;
+	off_t commit_offset;
 	unsigned long size;
 	unsigned int hdr_size;
 	enum object_type type;
@@ -63,6 +64,7 @@ static int nr_resolved_deltas;
 static int from_stdin;
 static int strict;
 static int verbose;
+static int write_cache = 1;
 
 static struct progress *progress;
 
@@ -75,6 +77,13 @@ static git_SHA_CTX input_ctx;
 static uint32_t input_crc32;
 static int input_fd, output_fd, pack_fd;
 
+static int cache_fd;
+static int cache_written;
+static uint32_t cache_nr;
+static char cache_filename[PATH_MAX];
+static const char *cache_idxname;
+static unsigned char cache_sha1[20];
+
 static int mark_link(struct object *obj, int type, void *data)
 {
 	if (!obj)
@@ -682,6 +691,58 @@ static int compare_delta_entry(const void *a, const void *b)
 				   objects[delta_b->obj_no].type);
 }
 
+static unsigned long parse_commit_date(const char *buf, const char *tail)
+{
+	const char *dateptr;
+
+	if (buf + 6 >= tail)
+		return 0;
+	if (memcmp(buf, "author", 6))
+		return 0;
+	while (buf < tail && *buf++ != '\n')
+		/* nada */;
+	if (buf + 9 >= tail)
+		return 0;
+	if (memcmp(buf, "committer", 9))
+		return 0;
+	while (buf < tail && *buf++ != '>')
+		/* nada */;
+	if (buf >= tail)
+		return 0;
+	dateptr = buf;
+	while (buf < tail && *buf++ != '\n')
+		/* nada */;
+	if (buf >= tail)
+		return 0;
+	/* dateptr < buf && buf[-1] == '\n', so strtoul will stop at buf-1 */
+	return strtoul(dateptr, NULL, 10);
+}
+
+static void write_commit_sha1(struct object_entry *obj, const char *data)
+{
+	unsigned char sha1[20];
+	if (!obj->commit_offset &&
+	    !memcmp(data, "tree ", 5) &&
+	    !memcmp(data + 45, "\nparent ", 8) &&
+	    memcmp(data + 45 + 48, "\nparent ", 8)) {
+		uint32_t date;
+		obj->commit_offset = cache_written;
+		cache_nr++;
+
+		get_sha1_hex(data + 5, sha1);
+		cache_written += xwrite(cache_fd, sha1, 20);
+		get_sha1_hex(data + 5 + 48, sha1);
+		cache_written += xwrite(cache_fd, sha1, 20);
+		date = parse_commit_date(data + 5 + 48 + 41, data + obj->size);
+		date = htonl(date);
+		cache_written += xwrite(cache_fd, &date, 4);
+	}
+#if 0
+	hashclr(sha1);
+	cache_written += xwrite(cache_fd, sha1, 20);
+#endif
+}
+
 /* Parse all objects and return the pack content SHA1 hash */
 static void parse_pack_objects(unsigned char *sha1)
 {
@@ -707,8 +768,13 @@ static void parse_pack_objects(unsigned char *sha1)
 			nr_deltas++;
 			delta->obj_no = i;
 			delta++;
-		} else
+		} else {
 			sha1_object(data, obj->size, obj->type, obj->idx.sha1);
+
+			/* We assume that commits are never deltified */
+			if (write_cache && obj->real_type == OBJ_COMMIT)
+				write_commit_sha1(obj, data);
+		}
 		free(data);
 		display_progress(progress, i+1);
 	}
@@ -919,6 +985,18 @@ static void final(const char *final_pack_name, const char *curr_pack_name,
 	} else if (from_stdin)
 		chmod(final_pack_name, 0444);
 
+	if (write_cache) {
+		snprintf(name, sizeof(name), "%s/pack/pack-%s.sha1",
+			 get_object_directory(), sha1_to_hex(sha1));
+		if (move_temp_to_file(cache_filename, name))
+			die("cannot store commit cache file");
+
+		snprintf(name, sizeof(name), "%s/pack/pack-%s.sidx",
+			 get_object_directory(), sha1_to_hex(sha1));
+		if (move_temp_to_file(cache_idxname, name))
+			die("cannot store commit cache file");
+	}
+
 	if (final_index_name != curr_index_name) {
 		if (!final_index_name) {
 			snprintf(name, sizeof(name), "%s/pack/pack-%s.idx",
@@ -1120,6 +1198,8 @@ int cmd_index_pack(int argc, const char **argv, const char *prefix)
 				keep_msg = "";
 			} else if (!prefixcmp(arg, "--keep=")) {
 				keep_msg = arg + 7;
+			} else if (!strcmp(arg, "--no-cache")) {
+				write_cache = 0;
 			} else if (!prefixcmp(arg, "--pack_header=")) {
 				struct pack_header *hdr;
 				char *c;
@@ -1191,6 +1271,17 @@ int cmd_index_pack(int argc, const char **argv, const char *prefix)
 	if (strict)
 		opts.flags |= WRITE_IDX_STRICT;
 
+	if (verify)
+		write_cache = 0;
+	if (write_cache) {
+		unsigned char sha1[20];
+		cache_fd = odb_mkstemp(cache_filename,
+				       sizeof(cache_filename),
+				       "pack/tmp_sha1_XXXXXX");
+		hashclr(sha1);
+		cache_written = xwrite(cache_fd, sha1, 20);
+	}
+
 	curr_pack = open_pack_file(pack_name);
 	parse_pack_header();
 	objects = xcalloc(nr_objects + 1, sizeof(struct object_entry));
@@ -1243,6 +1334,26 @@ int cmd_index_pack(int argc, const char **argv, const char *prefix)
 	curr_index = write_idx_file(index_name, idx_objects, nr_objects, &opts, pack_sha1);
 	free(idx_objects);
 
+	if (write_cache) {
+		int nr;
+		struct pack_idx_entry **idx;
+
+		close(cache_fd);
+
+		idx = idx_objects = xmalloc((cache_nr) * sizeof(struct pack_idx_entry *));
+		for (nr = i = 0; i < nr_objects; i++) {
+			if (objects[i].commit_offset) {
+				objects[i].idx.offset = objects[i].commit_offset;
+				*idx++ = &objects[i].idx;
+				nr++;
+			}
+		}
+		assert(nr == cache_nr);
+		opts.flags |= WRITE_IDX_SHA1_CACHE;
+		cache_idxname = write_idx_file(NULL, idx_objects, cache_nr, &opts, cache_sha1);
+		free(idx_objects);
+	}
+
 	if (!verify)
 		final(pack_name, curr_pack,
 		      index_name, curr_index,
diff --git a/cache.h b/cache.h
index 9bd8c2d..fd3953f 100644
--- a/cache.h
+++ b/cache.h
@@ -972,6 +972,14 @@ struct pack_window {
 	unsigned int inuse_cnt;
 };
 
+struct sha1_cache {
+	const void *idx;
+	const void *data;
+	size_t idx_size;
+	size_t data_size;
+	uint32_t nr;
+};
+
 extern struct packed_git {
 	struct packed_git *next;
 	struct pack_window *windows;
@@ -984,6 +992,7 @@ extern struct packed_git {
 	int index_version;
 	time_t mtime;
 	int pack_fd;
+	struct sha1_cache commit_cache;
 	unsigned pack_local:1,
 		 pack_keep:1,
 		 do_not_close:1;
diff --git a/pack-write.c b/pack-write.c
index ca9e63b..6da977a 100644
--- a/pack-write.c
+++ b/pack-write.c
@@ -85,8 +85,11 @@ const char *write_idx_file(const char *index_name, struct pack_idx_entry **objec
 		f = sha1fd(fd, index_name);
 	}
 
-	/* if last object's offset is >= 2^31 we should use index V2 */
-	index_version = need_large_offset(last_obj_offset, opts) ? 2 : opts->version;
+	if (opts->flags & WRITE_IDX_SHA1_CACHE)
+		index_version = 2;
+	else
+		/* if last object's offset is >= 2^31 we should use index V2 */
+		index_version = need_large_offset(last_obj_offset, opts) ? 2 : opts->version;
 
 	/* index versions 2 and above need a header */
 	if (index_version >= 2) {
@@ -138,6 +141,9 @@ const char *write_idx_file(const char *index_name, struct pack_idx_entry **objec
 	if (index_version >= 2) {
 		unsigned int nr_large_offset = 0;
 
+		if (opts->flags & WRITE_IDX_SHA1_CACHE)
+			goto skip_crc32;
+
 		/* write the crc32 table */
 		list = sorted_by_sha;
 		for (i = 0; i < nr_objects; i++) {
@@ -146,6 +152,7 @@ const char *write_idx_file(const char *index_name, struct pack_idx_entry **objec
 			sha1write(f, &crc32_val, 4);
 		}
 
+skip_crc32:
 		/* write the 32-bit offset table */
 		list = sorted_by_sha;
 		for (i = 0; i < nr_objects; i++) {
diff --git a/pack.h b/pack.h
index aa6ee7d..f002b6a 100644
--- a/pack.h
+++ b/pack.h
@@ -40,6 +40,7 @@ struct pack_idx_option {
 	/* flag bits */
 #define WRITE_IDX_VERIFY 01 /* verify only, do not write the idx file */
 #define WRITE_IDX_STRICT 02
+#define WRITE_IDX_SHA1_CACHE 04
 
 	uint32_t version;
 	uint32_t off32_limit;
diff --git a/sha1_cache.c b/sha1_cache.c
new file mode 100644
index 0000000..3f244bb
--- /dev/null
+++ b/sha1_cache.c
@@ -0,0 +1,161 @@
+#include "cache.h"
+#include "pack.h"
+#include "sha1_cache.h"
+
+static int git_open_noatime(const char *name)
+{
+	static int sha1_file_open_flag = O_NOATIME;
+
+	for (;;) {
+		int fd = open(name, O_RDONLY | sha1_file_open_flag);
+		if (fd >= 0)
+			return fd;
+
+		/* Might the failure be due to O_NOATIME? */
+		if (errno != ENOENT && sha1_file_open_flag) {
+			sha1_file_open_flag = 0;
+			continue;
+		}
+
+		return -1;
+	}
+}
+
+/*
+ * Commit cache format is basically pack index v2 except that:
+ * - no crc32 table
+ * - no large offset support
+ * - offset table contains offset in _this_ file, in the commit
+ *   cache section
+ * - the commit cache section follows offset table, each entry
+ *   consists of tree sha1, parent sha1 if any, and terminated
+ *   by null sha-1.
+ */
+
+int open_sha1_cache(struct sha1_cache *cache,
+		    const char *data_path, const char *idx_path)
+{
+	void *idx_map, *data;
+	struct pack_idx_header *hdr;
+	size_t idx_size, data_size;
+	uint32_t nr, i, *index;
+	int fd = git_open_noatime(idx_path);
+	struct stat st;
+
+	if (fd < 0)
+		return -1;
+	if (fstat(fd, &st)) {
+		close(fd);
+		return -1;
+	}
+	idx_size = xsize_t(st.st_size);
+	if (idx_size < 4 * 256 + 20 + 20) {
+		close(fd);
+		return error("index file %s is too small", idx_path);
+	}
+	idx_map = xmmap(NULL, idx_size, PROT_READ, MAP_PRIVATE, fd, 0);
+	close(fd);
+
+	fd = git_open_noatime(data_path);
+	if (fd < 0)
+		return -1;
+	if (fstat(fd, &st)) {
+		close(fd);
+		return -1;
+	}
+	data_size = xsize_t(st.st_size);
+	data = xmmap(NULL, data_size, PROT_READ, MAP_PRIVATE, fd, 0);
+	close(fd);
+
+	hdr = idx_map;
+	if (hdr->idx_signature != htonl(PACK_IDX_SIGNATURE))
+		return error("not a commit cache file %s", idx_path);
+	if (ntohl(hdr->idx_version) != 2)
+		return error("wrong version %s %d", idx_path, ntohl(hdr->idx_version));
+
+	nr = 0;
+	index = idx_map;
+	index += 2;  /* skip index header */
+	for (i = 0; i < 256; i++) {
+		uint32_t n = ntohl(index[i]);
+		if (n < nr) {
+			munmap(idx_map, idx_size);
+			return error("non-monotonic index %s", idx_path);
+		}
+		nr = n;
+	}
+
+	cache->idx = idx_map;
+	cache->idx_size = idx_size;
+	cache->nr = nr;
+	cache->data = data;
+	cache->data_size = data_size;
+	return 0;
+}
+
+static const void *object_offset(const struct sha1_cache *cache, uint32_t n)
+{
+	const unsigned char *index = cache->idx;
+	uint32_t off;
+	index += 4 * 256;
+	index += 8 + cache->nr * 20;
+	off = ntohl(*((uint32_t *)(index + 4 * n)));
+	return (const char *)cache->data + off;
+}
+
+const void *find_sha1_in_cache(const unsigned char *sha1)
+{
+	const uint32_t *level1_ofs;
+	const unsigned char *index;
+	unsigned hi, lo, stride;
+
+	struct packed_git *p = find_sha1_pack(sha1, packed_git);
+	if (!p)
+		return 0;
+
+	open_pack_index(p);
+	if (!p->commit_cache.nr)
+		return 0;
+
+	level1_ofs = p->commit_cache.idx;
+	index = p->commit_cache.idx;
+
+	/* skip header */
+	level1_ofs += 2;
+	index += 8;
+
+	/* fanout table */
+	index += 4 * 256;
+	hi = ntohl(level1_ofs[*sha1]);
+	lo = ((*sha1 == 0x0) ? 0 : ntohl(level1_ofs[*sha1 - 1]));
+	stride = 20;
+
+	do {
+		unsigned mi = (lo + hi) / 2;
+		int cmp = hashcmp(index + mi * stride, sha1);
+
+		if (!cmp)
+			return object_offset(&p->commit_cache, mi);
+		if (cmp > 0)
+			hi = mi;
+		else
+			lo = mi+1;
+	} while (lo < hi);
+	return NULL;
+}
+
+int has_commit_cache(const unsigned char *sha1,
+		     unsigned char *tree,
+		     unsigned char *parent,
+		     unsigned long *date)
+{
+	const unsigned char *data;
+	data = find_sha1_in_cache(sha1);
+	if (!data)
+		return 0;
+
+	hashcpy(tree, data);
+	hashcpy(parent, data + 20);
+	*date = ntohl(*((uint32_t*)(data + 40)));
+	return 1;
+}
diff --git a/sha1_cache.h b/sha1_cache.h
new file mode 100644
index 0000000..59823cf
--- /dev/null
+++ b/sha1_cache.h
@@ -0,0 +1,6 @@
+extern int open_sha1_cache(struct sha1_cache *cache,
+			   const char *data_path, const char *idx_path);
+extern const void *find_sha1_in_cache(const unsigned char *sha1);
+extern int has_commit_cache(const unsigned char *sha1,
+			    unsigned char *tree, unsigned char *parent,
+			    unsigned long *date);
diff --git a/sha1_file.c b/sha1_file.c
index 88f2151..fc2f460 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -19,6 +19,7 @@
 #include "pack-revindex.h"
 #include "sha1-lookup.h"
 #include "bulk-checkin.h"
+#include "sha1_cache.h"
 
 #ifndef O_NOATIME
 #if defined(__linux__) && (defined(__i386__) || defined(__PPC__))
@@ -578,14 +579,23 @@ static int check_packed_git_idx(const char *path,  struct packed_git *p)
 int open_pack_index(struct packed_git *p)
 {
 	char *idx_name;
+	int baselen = strlen(p->pack_name) - strlen(".pack");
 	int ret;
 
 	if (p->index_data)
 		return 0;
 
 	idx_name = xstrdup(p->pack_name);
-	strcpy(idx_name + strlen(idx_name) - strlen(".pack"), ".idx");
+	strcpy(idx_name + baselen, ".idx");
 	ret = check_packed_git_idx(idx_name, p);
+	if (!ret) {
+		char *cache_name;
+		cache_name = xstrdup(p->pack_name);
+		strcpy(idx_name + baselen, ".sidx");
+		strcpy(cache_name + baselen, ".sha1");
+		open_sha1_cache(&p->commit_cache, cache_name, idx_name);
+		free(cache_name);
+	}
 	free(idx_name);
 	return ret;
 }
diff --git a/test-sha1-cache.c b/test-sha1-cache.c
new file mode 100644
index 0000000..7b7f32c
--- /dev/null
+++ b/test-sha1-cache.c
@@ -0,0 +1,19 @@
+#include "cache.h"
+#include "sha1_cache.h"
+
+int main(int argc, char **argv)
+{
+	unsigned char sha1[20];
+	unsigned char tree[20];
+	unsigned char parent[20];
+	unsigned long date;
+
+	setup_git_directory();
+	prepare_packed_git();
+	get_sha1_hex(argv[1], sha1);
+	if (!has_commit_cache(sha1, tree, parent, &date))
+		return 1;
+	printf("tree %s\nparent %s\ndate %ld\n",
+	       sha1_to_hex(tree), sha1_to_hex(parent), date);
+	return 0;
+}
-- 
1.7.3.1.256.g2539c.dirty

  parent reply	other threads:[~2012-04-03  6:56 UTC|newest]

Thread overview: 37+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-03-30  0:18 Git push performance problems with ~100K refs Martin Fick
2012-03-30  2:12 ` Junio C Hamano
2012-03-30  2:43   ` Martin Fick
2012-03-30  9:32     ` Jeff King
2012-03-30  9:40       ` Jeff King
2012-03-30 14:22         ` Martin Fick
2012-03-31 22:10         ` [PATCH 1/3] add mergesort() for linked lists René Scharfe
2012-04-05 19:17           ` Junio C Hamano
2012-04-08 20:32             ` René Scharfe
2012-04-09 18:26               ` Junio C Hamano
2012-04-11  6:19           ` Stephen Boyd
2012-04-11 16:44             ` Junio C Hamano
2012-03-31 22:10         ` [PATCH 2/3] commit: use mergesort() in commit_list_sort_by_date() René Scharfe
2012-03-31 22:11         ` [PATCH 3/3] revision: insert unsorted, then sort in prepare_revision_walk() René Scharfe
2012-03-31 22:36           ` Martin Fick
2012-03-31 23:45           ` Junio C Hamano
2012-04-02 16:24           ` Martin Fick
2012-04-02 16:39             ` Shawn Pearce
2012-04-02 16:49               ` Martin Fick
2012-04-02 16:51                 ` Shawn Pearce
2012-04-02 20:37                   ` Jeff King
2012-04-02 20:51                     ` Jeff King
2012-04-02 23:16                     ` Martin Fick
2012-04-03  3:49                     ` Nguyen Thai Ngoc Duy
2012-04-03  5:55                       ` Martin Fick
2012-04-03  6:55                         ` [PATCH 0/3] Commit cache Nguyễn Thái Ngọc Duy
2012-04-03  6:55                         ` [PATCH 1/3] parse_commit_buffer: rename a confusing variable name Nguyễn Thái Ngọc Duy
2012-04-03  6:55                         ` Nguyễn Thái Ngọc Duy [this message]
2012-04-03  6:55                         ` [PATCH 3/3] Add parse_commit_for_rev() to take advantage of sha1-cache Nguyễn Thái Ngọc Duy
2012-04-05 13:02                       ` [PATCH 3/3] revision: insert unsorted, then sort in prepare_revision_walk() Nguyen Thai Ngoc Duy
2012-04-06 19:21                         ` Shawn Pearce
2012-04-07  4:20                           ` Nguyen Thai Ngoc Duy
2012-04-03  3:44                   ` Nguyen Thai Ngoc Duy
2012-04-02 20:14           ` Jeff King
2012-04-02 22:54             ` René Scharfe
2012-04-03  8:40               ` Jeff King
2012-04-03  9:19                 ` Jeff King

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1333436109-16526-3-git-send-email-pclouds@gmail.com \
    --to=pclouds@gmail.com \
    --cc=git@vger.kernel.org \
    --cc=mfick@codeaurora.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.