* Another slash at index size
@ 2013-02-21 9:45 Nguyễn Thái Ngọc Duy
0 siblings, 0 replies; only message in thread
From: Nguyễn Thái Ngọc Duy @ 2013-02-21 9:45 UTC (permalink / raw)
To: git; +Cc: Nguyễn Thái Ngọc Duy
I noticed that even with v4, we still duplicate a lot of info in the
remaining fields. ce_uid and ce_guid for instance are unlikely to
change ever between entries. So I attempt to store offsets between the
previous entry instead. The result looks good. This is webkit index:
25M index-v2
14M index-v4
7.7M index-v5
4.5M index-v5.gz
gzip beats me naturally, we still have a lot of spare bits and we
don't use dictionaries. But the code is simpler and should run faster
than gzip.
Performance is improved too:
$ time GIT_INDEX_FILE=index-v2 ./git ls-files |head -n1 >/dev/null
real 0m0.437s
user 0m0.385s
sys 0m0.048s
$ time GIT_INDEX_FILE=index-v4 ./git ls-files |head -n1 >/dev/null
real 0m0.319s
user 0m0.277s
sys 0m0.040s
$ time GIT_INDEX_FILE=index-v5 ./git ls-files |head -n1 >/dev/null
real 0m0.250s
user 0m0.213s
sys 0m0.036s
Some details on the new format:
- in general varint is used to store numbers, unless we know the
numbers are really big.
- flags is the first field on disk, it has extra bits to let git know
what to do with the rest of the fields
- Many fields like ctime, mtime, uid, gid, dev, ino are stored as
offsets
- ce_mode's special values 100644 and 100755 are stored in flags. So
unless you use gitlinks or something else, ce_mode will not be
stored.
- ce_namelen is no longer stored on disk
- pathname is compressed just like in v4
---
cache.h | 2 +-
read-cache.c | 273 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
2 files changed, 269 insertions(+), 6 deletions(-)
diff --git a/cache.h b/cache.h
index e493563..6ab53e7 100644
--- a/cache.h
+++ b/cache.h
@@ -106,7 +106,7 @@ struct cache_header {
};
#define INDEX_FORMAT_LB 2
-#define INDEX_FORMAT_UB 4
+#define INDEX_FORMAT_UB 5
/*
* The "cache_time" is just the low 32 bits of the
diff --git a/read-cache.c b/read-cache.c
index 827ae55..147ace1 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1260,7 +1260,7 @@ static int verify_hdr(struct cache_header *hdr, unsigned long size)
if (hdr->hdr_signature != htonl(CACHE_SIGNATURE))
return error("bad signature");
hdr_version = ntohl(hdr->hdr_version);
- if (hdr_version < 2 || 4 < hdr_version)
+ if (hdr_version < 2 || 5 < hdr_version)
return error("bad index version %d", hdr_version);
git_SHA1_Init(&c);
git_SHA1_Update(&c, hdr, size - 20);
@@ -1407,6 +1407,137 @@ static struct cache_entry *create_from_disk(struct ondisk_cache_entry *ondisk,
return ce;
}
+/*
+ * Often used flags come first to keep flags in common case small so
+ * that encode_varint() produces fewer bytes
+ */
+#define CE5_OFS_MTIME (1 << 0)
+#define CE5_MODEMASK (3 << 1)
+#define CE5_MODE_644 0
+#define CE5_MODE_755 (1 << 1)
+#define CE5_MODE_FULL (2 << 1)
+#define CE5_FULL_INO (1 << 3)
+#define CE5_STAGESHIFT 4
+#define CE5_STAGEMASK (3 << CE5_STAGESHIFT)
+#define CE5_ITA (1 << 6)
+#define CE5_VALID (1 << 7)
+#define CE5_SWT (1 << 8)
+#define CE5_FULL_UID (1 << 9)
+#define CE5_FULL_GID (1 << 10)
+#define CE5_FULL_TIME (1 << 11)
+#define CE5_FULL_DEV (1 << 12)
+
+static uintmax_t decode_offset(const unsigned char **bufp, uintmax_t base)
+{
+ uintmax_t offset = decode_varint(bufp);
+ if (offset & 1) /* negative */
+ return base - (offset >> 1);
+ else
+ return base + (offset >> 1);
+
+}
+
+static struct cache_entry *create_from_disk_v5(const unsigned char *data,
+ unsigned long *consumed,
+ struct strbuf *previous_name,
+ const struct cache_entry *previous_ce)
+{
+ const unsigned char *orig = data;
+ struct cache_entry ce_tmp;
+ struct cache_entry *ce;
+ unsigned int flags;
+
+ flags = decode_varint(&data);
+ ce_tmp.ce_flags = ((flags & CE5_STAGEMASK) >> CE5_STAGESHIFT) << CE_STAGESHIFT;
+ if (flags & CE5_ITA)
+ ce_tmp.ce_flags |= CE_INTENT_TO_ADD;
+ if (flags & CE5_VALID)
+ ce_tmp.ce_flags |= CE_VALID;
+ if (flags & CE5_SWT)
+ ce_tmp.ce_flags |= CE_SKIP_WORKTREE;
+
+ if (flags & CE5_FULL_TIME) { /* full format, 16 bytes */
+ ce_tmp.ce_ctime.sec = ntoh_l(*(uint32_t*)data);
+ data += sizeof(uint32_t);
+ ce_tmp.ce_ctime.nsec = ntoh_l(*(uint32_t*)data);
+ data += sizeof(uint32_t);
+
+ ce_tmp.ce_mtime.sec = ntoh_l(*(uint32_t*)data);
+ data += sizeof(uint32_t);
+ ce_tmp.ce_mtime.nsec = ntoh_l(*(uint32_t*)data);
+ data += sizeof(uint32_t);
+ } else {
+ /* offset from previous ce */
+ ce_tmp.ce_ctime.sec = decode_offset(&data, previous_ce->ce_ctime.sec);
+ ce_tmp.ce_ctime.nsec = decode_offset(&data, previous_ce->ce_ctime.nsec);
+
+ if (flags & CE5_OFS_MTIME) {
+ /* offset from previous ce */
+ ce_tmp.ce_mtime.sec = decode_offset(&data, previous_ce->ce_mtime.sec);
+ ce_tmp.ce_mtime.nsec = decode_offset(&data, previous_ce->ce_mtime.nsec);
+ } else
+ ce_tmp.ce_mtime = ce_tmp.ce_ctime;
+ }
+
+
+ if (flags & CE5_FULL_DEV)
+ ce_tmp.ce_dev = decode_varint(&data);
+ else
+ ce_tmp.ce_dev = previous_ce->ce_dev;
+ if (flags & CE5_FULL_INO) {
+ ce_tmp.ce_ino = ntoh_l(*(uint32_t*)data);
+ data += sizeof(uint32_t);
+ } else {
+ uintmax_t offset = decode_varint(&data);
+ if (offset & 1) /* negative */
+ ce_tmp.ce_ino = previous_ce->ce_ino - (offset >> 1);
+ else
+ ce_tmp.ce_ino = previous_ce->ce_ino + (offset >> 1);
+ }
+ switch (flags & CE5_MODEMASK) {
+ case CE5_MODE_644:
+ ce_tmp.ce_mode = 0100644;
+ break;
+ case CE5_MODE_755:
+ ce_tmp.ce_mode = 0100755;
+ break;
+ case CE5_MODE_FULL:
+ ce_tmp.ce_mode = decode_varint(&data);
+ break;
+ default:
+ die("Huh?");
+ }
+ if (flags & CE5_FULL_UID)
+ ce_tmp.ce_uid = decode_varint(&data);
+ else
+ ce_tmp.ce_uid = previous_ce->ce_uid;
+ if (flags & CE5_FULL_GID)
+ ce_tmp.ce_gid = decode_varint(&data);
+ else
+ ce_tmp.ce_gid = previous_ce->ce_gid;
+ ce_tmp.ce_size = decode_varint(&data);
+ hashcpy(ce_tmp.sha1, data);
+ data += 20;
+
+ *consumed = data - orig;
+ *consumed += expand_name_field(previous_name, (const char *)data);
+
+ ce = xmalloc(cache_entry_size(previous_name->len));
+ ce->ce_ctime = ce_tmp.ce_ctime;
+ ce->ce_mtime = ce_tmp.ce_mtime;
+ ce->ce_dev = ce_tmp.ce_dev;
+ ce->ce_ino = ce_tmp.ce_ino;
+ ce->ce_mode = ce_tmp.ce_mode;
+ ce->ce_uid = ce_tmp.ce_uid;
+ ce->ce_gid = ce_tmp.ce_gid;
+ ce->ce_size = ce_tmp.ce_size;
+ ce->ce_flags = ce_tmp.ce_flags;
+ ce->ce_namelen = previous_name->len;
+ hashcpy(ce->sha1, ce_tmp.sha1);
+ memcpy(ce->name, previous_name->buf, previous_name->len + 1);
+ return ce;
+}
+
/* remember to discard_cache() before reading a different cache! */
int read_index_from(struct index_state *istate, const char *path)
{
@@ -1452,7 +1583,7 @@ int read_index_from(struct index_state *istate, const char *path)
istate->cache = xcalloc(istate->cache_alloc, sizeof(struct cache_entry *));
istate->initialized = 1;
- if (istate->version == 4)
+ if (istate->version >= 4)
previous_name = &previous_name_buf;
else
previous_name = NULL;
@@ -1464,7 +1595,13 @@ int read_index_from(struct index_state *istate, const char *path)
unsigned long consumed;
disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
- ce = create_from_disk(disk_ce, &consumed, previous_name);
+ if (istate->version == 5)
+ ce = create_from_disk_v5((const unsigned char*)disk_ce,
+ &consumed,
+ previous_name,
+ i > 0 ? istate->cache[i - 1] : NULL);
+ else
+ ce = create_from_disk(disk_ce, &consumed, previous_name);
set_index_entry(istate, i, ce);
src_offset += consumed;
@@ -1652,6 +1789,125 @@ static void ce_smudge_racily_clean_entry(struct cache_entry *ce)
}
}
+static void strbuf_encode_varint(struct strbuf *sb, uintmax_t value)
+{
+ static unsigned char varint[16];
+ int varint_len = encode_varint(value, varint);
+ strbuf_add(sb, varint, varint_len);
+}
+
+static void strbuf_encode_offset(struct strbuf *sb, unsigned int v1, unsigned int v2)
+{
+ if (v1 < v2)
+ strbuf_encode_varint(sb, (v2 - v1) << 1);
+ else
+ strbuf_encode_varint(sb, 1 | ((v1 - v2) << 1));
+}
+
+static int ce_write_entry_v5(git_SHA_CTX *c, int fd,
+ struct cache_entry *ce,
+ struct strbuf *previous_name,
+ struct cache_entry *previous_ce)
+{
+ uint32_t data[4];
+ unsigned int flags = 0;
+ struct strbuf sb = STRBUF_INIT;
+ int result;
+
+ if (ce->ce_flags & CE_INTENT_TO_ADD)
+ flags |= CE5_ITA;
+ if (ce->ce_flags & CE_VALID)
+ flags |= CE5_VALID;
+ if (ce->ce_flags & CE_SKIP_WORKTREE)
+ flags |= CE5_SWT;
+ flags |= ce_stage(ce) << CE5_STAGESHIFT;
+
+ if (previous_ce) {
+ if (ce->ce_ctime.sec != ce->ce_mtime.sec ||
+ ce->ce_ctime.nsec != ce->ce_mtime.nsec)
+ flags |= CE5_OFS_MTIME;
+ if (ce->ce_uid != previous_ce->ce_uid)
+ flags |= CE5_FULL_UID;
+ if (ce->ce_gid != previous_ce->ce_gid)
+ flags |= CE5_FULL_GID;
+ if (ce->ce_dev != previous_ce->ce_dev)
+ flags |= CE5_FULL_DEV;
+ } else
+ flags |= CE5_FULL_TIME | CE5_FULL_UID | CE5_FULL_GID | CE5_FULL_DEV;
+ if (ce->ce_mode == 0100644)
+ flags |= CE5_MODE_644; /* no bit sets actually */
+ else if (ce->ce_mode == 0100755)
+ flags |= CE5_MODE_755;
+ else
+ flags |= CE5_MODE_FULL;
+ if (previous_ce &&
+ (ce->ce_ino - previous_ce->ce_ino < 16000 ||
+ previous_ce->ce_ino - ce->ce_ino < 16000))
+ ; /* inode offset */
+ else
+ flags |= CE5_FULL_INO;
+
+ strbuf_encode_varint(&sb, flags);
+
+ if (flags & CE5_FULL_TIME) { /* full format, 16 bytes */
+ data[0] = htonl(ce->ce_ctime.sec);
+ data[1] = htonl(ce->ce_ctime.nsec);
+ data[2] = htonl(ce->ce_mtime.sec);
+ data[3] = htonl(ce->ce_mtime.nsec);
+ strbuf_add(&sb, data, sizeof(data));
+ } else {
+ /* offset from previous ce */
+ strbuf_encode_offset(&sb, previous_ce->ce_ctime.sec, ce->ce_ctime.sec);
+ strbuf_encode_offset(&sb, previous_ce->ce_ctime.nsec, ce->ce_ctime.nsec);
+
+ if (flags & CE5_OFS_MTIME) {
+ /* offset from previous ce */
+ strbuf_encode_offset(&sb, previous_ce->ce_mtime.sec, ce->ce_mtime.sec);
+ strbuf_encode_offset(&sb, previous_ce->ce_mtime.nsec, ce->ce_mtime.nsec);
+ }
+ }
+
+ if (flags & CE5_FULL_DEV)
+ strbuf_encode_varint(&sb, ce->ce_dev);
+ if (flags & CE5_FULL_INO) {
+ data[0] = htonl(ce->ce_ino);
+ strbuf_add(&sb, data, sizeof(*data));
+ } else
+ strbuf_encode_offset(&sb, previous_ce->ce_ino, ce->ce_ino);
+ if (flags & CE5_MODE_FULL)
+ strbuf_encode_varint(&sb, ce->ce_mode);
+ if (flags & CE5_FULL_UID)
+ strbuf_encode_varint(&sb, ce->ce_uid);
+ if (flags & CE5_FULL_GID)
+ strbuf_encode_varint(&sb, ce->ce_gid);
+ strbuf_encode_varint(&sb, ce->ce_size);
+ strbuf_add(&sb, ce->sha1, 20);
+
+ if (previous_name) {
+ int common, to_remove, prefix_size;
+ unsigned char to_remove_vi[16];
+ for (common = 0;
+ (ce->name[common] &&
+ common < previous_name->len &&
+ ce->name[common] == previous_name->buf[common]);
+ common++)
+ ; /* still matching */
+ to_remove = previous_name->len - common;
+ prefix_size = encode_varint(to_remove, to_remove_vi);
+
+ strbuf_add(&sb, to_remove_vi, prefix_size);
+ strbuf_add(&sb, ce->name + common, ce_namelen(ce) - common);
+
+ strbuf_splice(previous_name, common, to_remove,
+ ce->name + common, ce_namelen(ce) - common);
+ } else
+ strbuf_add(&sb, ce->name, ce_namelen(ce));
+
+ result = ce_write(c, fd, sb.buf, sb.len + 1);
+ strbuf_release(&sb);
+ return result;
+}
+
/* Copy miscellaneous fields but not the name */
static char *copy_cache_entry_to_ondisk(struct ondisk_cache_entry *ondisk,
struct cache_entry *ce)
@@ -1793,16 +2049,23 @@ int write_index(struct index_state *istate, int newfd)
if (ce_write(&c, newfd, &hdr, sizeof(hdr)) < 0)
return -1;
- previous_name = (hdr_version == 4) ? &previous_name_buf : NULL;
+ previous_name = (hdr_version >= 4) ? &previous_name_buf : NULL;
for (i = 0; i < entries; i++) {
struct cache_entry *ce = cache[i];
+ int ret;
if (ce->ce_flags & CE_REMOVE)
continue;
if (!ce_uptodate(ce) && is_racy_timestamp(istate, ce))
ce_smudge_racily_clean_entry(ce);
if (is_null_sha1(ce->sha1))
return error("cache entry has null sha1: %s", ce->name);
- if (ce_write_entry(&c, newfd, ce, previous_name) < 0)
+ if (hdr_version == 5)
+ ret = ce_write_entry_v5(&c, newfd, ce,
+ previous_name,
+ i > 0 ? cache[i-1] : NULL);
+ else
+ ret = ce_write_entry(&c, newfd, ce, previous_name);
+ if (ret < 0)
return -1;
}
strbuf_release(&previous_name_buf);
--
1.8.1.2.495.g3fdf5d5.dirty
^ permalink raw reply related [flat|nested] only message in thread
only message in thread, other threads:[~2013-02-21 9:46 UTC | newest]
Thread overview: (only message) (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2013-02-21 9:45 Another slash at index size Nguyễn Thái Ngọc Duy
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).