* [PATCH 1/6] read-cache: use sha1file for sha1 calculation
@ 2012-02-06 5:48 Nguyễn Thái Ngọc Duy
2012-02-06 5:48 ` [PATCH 2/6] csum-file: make sha1 calculation optional Nguyễn Thái Ngọc Duy
` (5 more replies)
0 siblings, 6 replies; 20+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2012-02-06 5:48 UTC (permalink / raw)
To: git; +Cc: Thomas Rast, Joshua Redstone,
Nguyễn Thái Ngọc Duy
Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
read-cache.c | 90 +++++++++++++++-------------------------------------------
1 files changed, 23 insertions(+), 67 deletions(-)
diff --git a/read-cache.c b/read-cache.c
index a51bba1..e9a20b6 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -12,6 +12,7 @@
#include "commit.h"
#include "blob.h"
#include "resolve-undo.h"
+#include "csum-file.h"
static struct cache_entry *refresh_cache_entry(struct cache_entry *ce, int really);
@@ -1395,73 +1396,28 @@ int unmerged_index(const struct index_state *istate)
return 0;
}
-#define WRITE_BUFFER_SIZE 8192
-static unsigned char write_buffer[WRITE_BUFFER_SIZE];
-static unsigned long write_buffer_len;
-
-static int ce_write_flush(git_SHA_CTX *context, int fd)
-{
- unsigned int buffered = write_buffer_len;
- if (buffered) {
- git_SHA1_Update(context, write_buffer, buffered);
- if (write_in_full(fd, write_buffer, buffered) != buffered)
- return -1;
- write_buffer_len = 0;
- }
- return 0;
-}
-
-static int ce_write(git_SHA_CTX *context, int fd, void *data, unsigned int len)
+static int ce_write(struct sha1file *f, void *data, unsigned int len)
{
- while (len) {
- unsigned int buffered = write_buffer_len;
- unsigned int partial = WRITE_BUFFER_SIZE - buffered;
- if (partial > len)
- partial = len;
- memcpy(write_buffer + buffered, data, partial);
- buffered += partial;
- if (buffered == WRITE_BUFFER_SIZE) {
- write_buffer_len = buffered;
- if (ce_write_flush(context, fd))
- return -1;
- buffered = 0;
- }
- write_buffer_len = buffered;
- len -= partial;
- data = (char *) data + partial;
- }
- return 0;
+ return sha1write(f, data, len);
}
-static int write_index_ext_header(git_SHA_CTX *context, int fd,
+static int write_index_ext_header(struct sha1file *f,
unsigned int ext, unsigned int sz)
{
ext = htonl(ext);
sz = htonl(sz);
- return ((ce_write(context, fd, &ext, 4) < 0) ||
- (ce_write(context, fd, &sz, 4) < 0)) ? -1 : 0;
+ return ((ce_write(f, &ext, 4) < 0) ||
+ (ce_write(f, &sz, 4) < 0)) ? -1 : 0;
}
-static int ce_flush(git_SHA_CTX *context, int fd)
+static int ce_flush(struct sha1file *f)
{
- unsigned int left = write_buffer_len;
-
- if (left) {
- write_buffer_len = 0;
- git_SHA1_Update(context, write_buffer, left);
- }
-
- /* Flush first if not enough space for SHA1 signature */
- if (left + 20 > WRITE_BUFFER_SIZE) {
- if (write_in_full(fd, write_buffer, left) != left)
- return -1;
- left = 0;
- }
+ unsigned char sha1[20];
+ int fd = sha1close(f, sha1, 0);
- /* Append the SHA1 signature at the end */
- git_SHA1_Final(write_buffer + left, context);
- left += 20;
- return (write_in_full(fd, write_buffer, left) != left) ? -1 : 0;
+ if (fd < 0)
+ return -1;
+ return (write_in_full(fd, sha1, 20) != 20) ? -1 : 0;
}
static void ce_smudge_racily_clean_entry(struct cache_entry *ce)
@@ -1513,7 +1469,7 @@ static void ce_smudge_racily_clean_entry(struct cache_entry *ce)
}
}
-static int ce_write_entry(git_SHA_CTX *c, int fd, struct cache_entry *ce)
+static int ce_write_entry(struct sha1file *f, struct cache_entry *ce)
{
int size = ondisk_ce_size(ce);
struct ondisk_cache_entry *ondisk = xcalloc(1, size);
@@ -1542,7 +1498,7 @@ static int ce_write_entry(git_SHA_CTX *c, int fd, struct cache_entry *ce)
name = ondisk->name;
memcpy(name, ce->name, ce_namelen(ce));
- result = ce_write(c, fd, ondisk, size);
+ result = ce_write(f, ondisk, size);
free(ondisk);
return result;
}
@@ -1574,7 +1530,7 @@ void update_index_if_able(struct index_state *istate, struct lock_file *lockfile
int write_index(struct index_state *istate, int newfd)
{
- git_SHA_CTX c;
+ struct sha1file *f;
struct cache_header hdr;
int i, err, removed, extended;
struct cache_entry **cache = istate->cache;
@@ -1598,8 +1554,8 @@ int write_index(struct index_state *istate, int newfd)
hdr.hdr_version = htonl(extended ? 3 : 2);
hdr.hdr_entries = htonl(entries - removed);
- git_SHA1_Init(&c);
- if (ce_write(&c, newfd, &hdr, sizeof(hdr)) < 0)
+ f = sha1fd(newfd, NULL);
+ if (ce_write(f, &hdr, sizeof(hdr)) < 0)
return -1;
for (i = 0; i < entries; i++) {
@@ -1608,7 +1564,7 @@ int write_index(struct index_state *istate, int newfd)
continue;
if (!ce_uptodate(ce) && is_racy_timestamp(istate, ce))
ce_smudge_racily_clean_entry(ce);
- if (ce_write_entry(&c, newfd, ce) < 0)
+ if (ce_write_entry(f, ce) < 0)
return -1;
}
@@ -1617,8 +1573,8 @@ int write_index(struct index_state *istate, int newfd)
struct strbuf sb = STRBUF_INIT;
cache_tree_write(&sb, istate->cache_tree);
- err = write_index_ext_header(&c, newfd, CACHE_EXT_TREE, sb.len) < 0
- || ce_write(&c, newfd, sb.buf, sb.len) < 0;
+ err = write_index_ext_header(f, CACHE_EXT_TREE, sb.len) < 0
+ || ce_write(f, sb.buf, sb.len) < 0;
strbuf_release(&sb);
if (err)
return -1;
@@ -1627,15 +1583,15 @@ int write_index(struct index_state *istate, int newfd)
struct strbuf sb = STRBUF_INIT;
resolve_undo_write(&sb, istate->resolve_undo);
- err = write_index_ext_header(&c, newfd, CACHE_EXT_RESOLVE_UNDO,
+ err = write_index_ext_header(f, CACHE_EXT_RESOLVE_UNDO,
sb.len) < 0
- || ce_write(&c, newfd, sb.buf, sb.len) < 0;
+ || ce_write(f, sb.buf, sb.len) < 0;
strbuf_release(&sb);
if (err)
return -1;
}
- if (ce_flush(&c, newfd) || fstat(newfd, &st))
+ if (ce_flush(f) || fstat(newfd, &st))
return -1;
istate->timestamp.sec = (unsigned int)st.st_mtime;
istate->timestamp.nsec = ST_MTIME_NSEC(st);
--
1.7.8.36.g69ee2
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [PATCH 2/6] csum-file: make sha1 calculation optional
2012-02-06 5:48 [PATCH 1/6] read-cache: use sha1file for sha1 calculation Nguyễn Thái Ngọc Duy
@ 2012-02-06 5:48 ` Nguyễn Thái Ngọc Duy
2012-02-06 5:48 ` [PATCH 3/6] Stop producing index version 2 Nguyễn Thái Ngọc Duy
` (4 subsequent siblings)
5 siblings, 0 replies; 20+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2012-02-06 5:48 UTC (permalink / raw)
To: git; +Cc: Thomas Rast, Joshua Redstone,
Nguyễn Thái Ngọc Duy
Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
csum-file.c | 16 +++++++++++++---
csum-file.h | 2 +-
2 files changed, 14 insertions(+), 4 deletions(-)
diff --git a/csum-file.c b/csum-file.c
index 53f5375..4c517d1 100644
--- a/csum-file.c
+++ b/csum-file.c
@@ -11,6 +11,12 @@
#include "progress.h"
#include "csum-file.h"
+static void sha1update(struct sha1file *f, const void *data, unsigned offset)
+{
+ if (f->do_sha1)
+ git_SHA1_Update(&f->ctx, data, offset);
+}
+
static void flush(struct sha1file *f, void *buf, unsigned int count)
{
if (0 <= f->check_fd && count) {
@@ -47,7 +53,7 @@ void sha1flush(struct sha1file *f)
unsigned offset = f->offset;
if (offset) {
- git_SHA1_Update(&f->ctx, f->buffer, offset);
+ sha1update(f, f->buffer, offset);
flush(f, f->buffer, offset);
f->offset = 0;
}
@@ -58,7 +64,10 @@ int sha1close(struct sha1file *f, unsigned char *result, unsigned int flags)
int fd;
sha1flush(f);
- git_SHA1_Final(f->buffer, &f->ctx);
+ if (f->do_sha1)
+ git_SHA1_Final(f->buffer, &f->ctx);
+ else
+ hashclr(f->buffer);
if (result)
hashcpy(result, f->buffer);
if (flags & (CSUM_CLOSE | CSUM_FSYNC)) {
@@ -110,7 +119,7 @@ int sha1write(struct sha1file *f, void *buf, unsigned int count)
buf = (char *) buf + nr;
left -= nr;
if (!left) {
- git_SHA1_Update(&f->ctx, data, offset);
+ sha1update(f, data, offset);
flush(f, data, offset);
offset = 0;
}
@@ -154,6 +163,7 @@ struct sha1file *sha1fd_throughput(int fd, const char *name, struct progress *tp
f->tp = tp;
f->name = name;
f->do_crc = 0;
+ f->do_sha1 = 1;
git_SHA1_Init(&f->ctx);
return f;
}
diff --git a/csum-file.h b/csum-file.h
index 3b540bd..c23ea62 100644
--- a/csum-file.h
+++ b/csum-file.h
@@ -12,7 +12,7 @@ struct sha1file {
off_t total;
struct progress *tp;
const char *name;
- int do_crc;
+ int do_crc, do_sha1;
uint32_t crc32;
unsigned char buffer[8192];
};
--
1.7.8.36.g69ee2
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [PATCH 3/6] Stop producing index version 2
2012-02-06 5:48 [PATCH 1/6] read-cache: use sha1file for sha1 calculation Nguyễn Thái Ngọc Duy
2012-02-06 5:48 ` [PATCH 2/6] csum-file: make sha1 calculation optional Nguyễn Thái Ngọc Duy
@ 2012-02-06 5:48 ` Nguyễn Thái Ngọc Duy
2012-02-06 7:10 ` Junio C Hamano
2012-02-06 5:48 ` [PATCH 4/6] Introduce index version 4 with global flags Nguyễn Thái Ngọc Duy
` (3 subsequent siblings)
5 siblings, 1 reply; 20+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2012-02-06 5:48 UTC (permalink / raw)
To: git; +Cc: Thomas Rast, Joshua Redstone,
Nguyễn Thái Ngọc Duy
read-cache.c learned to produce version 2 or 3 depending on whether
extended cache entries exist in 06aaaa0 (Extend index to save more flags
- 2008-10-01), first released in 1.6.1. The purpose is to keep
compatibility with older git. It's been more than three years since
then and git has reached 1.7.9. Drop support for older git.
Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
read-cache.c | 8 +++-----
t/t2104-update-index-skip-worktree.sh | 12 ------------
2 files changed, 3 insertions(+), 17 deletions(-)
diff --git a/read-cache.c b/read-cache.c
index e9a20b6..fe6b0e0 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1532,26 +1532,24 @@ int write_index(struct index_state *istate, int newfd)
{
struct sha1file *f;
struct cache_header hdr;
- int i, err, removed, extended;
+ int i, err, removed;
struct cache_entry **cache = istate->cache;
int entries = istate->cache_nr;
struct stat st;
- for (i = removed = extended = 0; i < entries; i++) {
+ for (i = removed = 0; i < entries; i++) {
if (cache[i]->ce_flags & CE_REMOVE)
removed++;
/* reduce extended entries if possible */
cache[i]->ce_flags &= ~CE_EXTENDED;
if (cache[i]->ce_flags & CE_EXTENDED_FLAGS) {
- extended++;
cache[i]->ce_flags |= CE_EXTENDED;
}
}
hdr.hdr_signature = htonl(CACHE_SIGNATURE);
- /* for extended format, increase version so older git won't try to read it */
- hdr.hdr_version = htonl(extended ? 3 : 2);
+ hdr.hdr_version = htonl(3);
hdr.hdr_entries = htonl(entries - removed);
f = sha1fd(newfd, NULL);
diff --git a/t/t2104-update-index-skip-worktree.sh b/t/t2104-update-index-skip-worktree.sh
index 1d0879b..8221ffa 100755
--- a/t/t2104-update-index-skip-worktree.sh
+++ b/t/t2104-update-index-skip-worktree.sh
@@ -28,19 +28,11 @@ test_expect_success 'setup' '
git ls-files -t | test_cmp expect.full -
'
-test_expect_success 'index is at version 2' '
- test "$(test-index-version < .git/index)" = 2
-'
-
test_expect_success 'update-index --skip-worktree' '
git update-index --skip-worktree 1 sub/1 &&
git ls-files -t | test_cmp expect.skip -
'
-test_expect_success 'index is at version 3 after having some skip-worktree entries' '
- test "$(test-index-version < .git/index)" = 3
-'
-
test_expect_success 'ls-files -t' '
git ls-files -t | test_cmp expect.skip -
'
@@ -50,8 +42,4 @@ test_expect_success 'update-index --no-skip-worktree' '
git ls-files -t | test_cmp expect.full -
'
-test_expect_success 'index version is back to 2 when there is no skip-worktree entry' '
- test "$(test-index-version < .git/index)" = 2
-'
-
test_done
--
1.7.8.36.g69ee2
^ permalink raw reply related [flat|nested] 20+ messages in thread
* Re: [PATCH 3/6] Stop producing index version 2
2012-02-06 5:48 ` [PATCH 3/6] Stop producing index version 2 Nguyễn Thái Ngọc Duy
@ 2012-02-06 7:10 ` Junio C Hamano
2012-02-07 3:09 ` Shawn Pearce
0 siblings, 1 reply; 20+ messages in thread
From: Junio C Hamano @ 2012-02-06 7:10 UTC (permalink / raw)
To: spearce
Cc: Nguyễn Thái Ngọc Duy, git, Thomas Rast,
Joshua Redstone
Nguyễn Thái Ngọc Duy <pclouds@gmail.com> writes:
> read-cache.c learned to produce version 2 or 3 depending on whether
> extended cache entries exist in 06aaaa0 (Extend index to save more flags
> - 2008-10-01), first released in 1.6.1. The purpose is to keep
> compatibility with older git. It's been more than three years since
> then and git has reached 1.7.9. Drop support for older git.
Cc'ing this, as I suspect this would surely raise eyebrows of some people
who wanted to get rid of the version 3 format.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 3/6] Stop producing index version 2
2012-02-06 7:10 ` Junio C Hamano
@ 2012-02-07 3:09 ` Shawn Pearce
2012-02-07 4:50 ` Nguyen Thai Ngoc Duy
` (2 more replies)
0 siblings, 3 replies; 20+ messages in thread
From: Shawn Pearce @ 2012-02-07 3:09 UTC (permalink / raw)
To: Junio C Hamano
Cc: Nguyễn Thái Ngọc, git, Thomas Rast,
Joshua Redstone
2012/2/5 Junio C Hamano <gitster@pobox.com>:
> Nguyễn Thái Ngọc Duy <pclouds@gmail.com> writes:
>
>> read-cache.c learned to produce version 2 or 3 depending on whether
>> extended cache entries exist in 06aaaa0 (Extend index to save more flags
>> - 2008-10-01), first released in 1.6.1. The purpose is to keep
>> compatibility with older git. It's been more than three years since
>> then and git has reached 1.7.9. Drop support for older git.
>
> Cc'ing this, as I suspect this would surely raise eyebrows of some people
> who wanted to get rid of the version 3 format.
Version 3 was a mistake because of the variable length record sizes.
Saving 2 bytes on some records that don't use the extended flags makes
the index file *MUCH* harder to parse. So much so that we should take
version 3 and kill it, not encourage it as the default!
IMHO, when these extended flags were added to make version 3 the
following should have happened:
- All records use the larger structure format with 4 bytes for the
flags, not 2 bytes.
- Change the trailing padding after the name to be a *SINGLE* \0 byte,
and do not pad out to an 8 byte boundary.
Both make it really hard to process the file, and the latter happens
only for direct mmap usage, which we don't do anymore.
We also have to consider the EGit and JGit user base as part of the
ecosystem. We can't just kill a file format because git-core has been
capable of reading its alternative since some arbitrary YYYY-MM-DD
release date. We need to also consider when did some other major tools
catch up and also support this format?
FWIW JGit released index version 3 support in version 0.9.1, which
shipped Sep 15, 2010. JGit/EGit were more than 2 years behind here.
<thinking type="wishful" probability="never-happen"
probably-inflating-flame-from="linus">
I have long wanted to scrap the current index format. I unfortunately
don't have the time to do it myself. But I suspect there may be a lot
of gains by making the index format match the canonical tree format
better by keeping the tree structure within a single file stream,
nesting entries below their parent directory, and keeping tree SHA-1
data along with the directory entry. For one thing the index would be
able to register an empty subdirectory, rather than ignoring them. It
would also better line up with the filesystem's readdir() handling,
giving us more sane logic to compare what readdir() tells us exists
against what the index thinks should be in the same file. And the
overall index should be smaller, because we don't have to repeat the
same path/to/a/file/for/every/file/in/that/same/directory/tree.
Reconstructing the path strings at read time into a flat list should
be pretty trivial, and still keep the parallel lstat calls running off
a flat list working well for fast status operations.
</thinking>
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 3/6] Stop producing index version 2
2012-02-07 3:09 ` Shawn Pearce
@ 2012-02-07 4:50 ` Nguyen Thai Ngoc Duy
2012-02-07 8:51 ` Nguyen Thai Ngoc Duy
2012-02-07 5:21 ` Junio C Hamano
2012-02-07 17:25 ` Thomas Rast
2 siblings, 1 reply; 20+ messages in thread
From: Nguyen Thai Ngoc Duy @ 2012-02-07 4:50 UTC (permalink / raw)
To: Shawn Pearce; +Cc: Junio C Hamano, git, Thomas Rast, Joshua Redstone
On Tue, Feb 7, 2012 at 10:09 AM, Shawn Pearce <spearce@spearce.org> wrote:
> 2012/2/5 Junio C Hamano <gitster@pobox.com>:
>> Nguyễn Thái Ngọc Duy <pclouds@gmail.com> writes:
>>
>>> read-cache.c learned to produce version 2 or 3 depending on whether
>>> extended cache entries exist in 06aaaa0 (Extend index to save more flags
>>> - 2008-10-01), first released in 1.6.1. The purpose is to keep
>>> compatibility with older git. It's been more than three years since
>>> then and git has reached 1.7.9. Drop support for older git.
>>
>> Cc'ing this, as I suspect this would surely raise eyebrows of some people
>> who wanted to get rid of the version 3 format.
>
> Version 3 was a mistake because of the variable length record sizes.
> Saving 2 bytes on some records that don't use the extended flags makes
> the index file *MUCH* harder to parse. So much so that we should take
> version 3 and kill it, not encourage it as the default!
Probably too late for that, but it's good to know there are strong
user base for v2.
> <thinking type="wishful" probability="never-happen"
> probably-inflating-flame-from="linus">
>
> I have long wanted to scrap the current index format. I unfortunately
> don't have the time to do it myself. But I suspect there may be a lot
> of gains by making the index format match the canonical tree format
> better by keeping the tree structure within a single file stream,
> nesting entries below their parent directory, and keeping tree SHA-1
> data along with the directory entry. For one thing the index would be
> able to register an empty subdirectory, rather than ignoring them. It
> would also better line up with the filesystem's readdir() handling,
> giving us more sane logic to compare what readdir() tells us exists
> against what the index thinks should be in the same file. And the
> overall index should be smaller, because we don't have to repeat the
> same path/to/a/file/for/every/file/in/that/same/directory/tree.
> Reconstructing the path strings at read time into a flat list should
> be pretty trivial, and still keep the parallel lstat calls running off
> a flat list working well for fast status operations.
>
> </thinking>
Haven't really thought through, but I suppose we could create extended
tree object format (there is info in cache entry that's not in tree
entry), store index in this format, then pack together and store the
pack as part of index file. Append-only access to index would be
possible by appending a new pack of new trees to index) I think with
tree-based index, we could kill a big chunk of code (merging trees and
index together) in unpack_trees(). With further efforts to remove
list-based index usage, we could even kill match_pathspec_depth(),
making tree_entry_interesting() the only function to match patchspec.
But dreams probably never come true.
--
Duy
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 3/6] Stop producing index version 2
2012-02-07 4:50 ` Nguyen Thai Ngoc Duy
@ 2012-02-07 8:51 ` Nguyen Thai Ngoc Duy
0 siblings, 0 replies; 20+ messages in thread
From: Nguyen Thai Ngoc Duy @ 2012-02-07 8:51 UTC (permalink / raw)
To: Shawn Pearce; +Cc: Junio C Hamano, git, Thomas Rast, Joshua Redstone
On Tue, Feb 7, 2012 at 11:50 AM, Nguyen Thai Ngoc Duy <pclouds@gmail.com> wrote:
>> Version 3 was a mistake because of the variable length record sizes.
>> Saving 2 bytes on some records that don't use the extended flags makes
>> the index file *MUCH* harder to parse. So much so that we should take
>> version 3 and kill it, not encourage it as the default!
>
> Probably too late for that, but it's good to know there are strong
> user base for v2.
OK probably not too late. We cannot kill it, but we could deprecate
it. We can introduce a mandatory extension to store extra flags. The
extension is basically an array of
struct ce_extended_flags {
int ce_index; /* points to istate->cache[ce_index] */
unsigned long flags;
};
On reading the extension, extra flags is applied back in mem, the
extension is created again when new index is written. There are only
two users of index v3: skip-worktree and intent-to-add bits, which are
not used often, I think. Still want to kill it?
Switching from sha-1 to crc32 could be done the same way (i.e. new
mandatory extension _at the end_ that contains crc32 checksum and skip
sha-1 check on reading if it's all zero) if we agree to move to crc32.
--
Duy
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 3/6] Stop producing index version 2
2012-02-07 3:09 ` Shawn Pearce
2012-02-07 4:50 ` Nguyen Thai Ngoc Duy
@ 2012-02-07 5:21 ` Junio C Hamano
2012-02-07 17:25 ` Thomas Rast
2 siblings, 0 replies; 20+ messages in thread
From: Junio C Hamano @ 2012-02-07 5:21 UTC (permalink / raw)
To: Shawn Pearce
Cc: Nguyễn Thái Ngọc, git, Thomas Rast,
Joshua Redstone
Shawn Pearce <spearce@spearce.org> writes:
> <thinking type="wishful" probability="never-happen"
> probably-inflating-flame-from="linus">
>
> I have long wanted to scrap the current index format. I unfortunately
> don't have the time to do it myself. But I suspect there may be a lot
> of gains by making the index format match the canonical tree format
> better by keeping the tree structure within a single file stream,
> nesting entries below their parent directory, and keeping tree SHA-1
> data along with the directory entry.
I suspect that is not so "never-happen wishful thinking".
In an earlier message, I alluded to a data structure that starts with a
single top-level tree entry that is lazily expanded as the index entries
are updated. The above shows that at least two of us share the same (day)
dream, and I suspect there are others that share the same "gut feeling"
that such a tree-based structure would be the way to do large index right.
It would be a large and possibly painful change, but the good thing is
that the index is a local matter and we won't have to worry too much about
a flag day event.
</thinking>
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 3/6] Stop producing index version 2
2012-02-07 3:09 ` Shawn Pearce
2012-02-07 4:50 ` Nguyen Thai Ngoc Duy
2012-02-07 5:21 ` Junio C Hamano
@ 2012-02-07 17:25 ` Thomas Rast
2 siblings, 0 replies; 20+ messages in thread
From: Thomas Rast @ 2012-02-07 17:25 UTC (permalink / raw)
To: Shawn Pearce
Cc: Junio C Hamano, Nguyễn Thái Ngọc, git,
Joshua Redstone
Shawn Pearce <spearce@spearce.org> writes:
> I have long wanted to scrap the current index format. I unfortunately
> don't have the time to do it myself. But I suspect there may be a lot
> of gains by making the index format match the canonical tree format
> better by keeping the tree structure within a single file stream,
> nesting entries below their parent directory, and keeping tree SHA-1
> data along with the directory entry.
If I may add to this: the one thing that I would like to see fixed about
the index is that it's flat out impossible to change a single thing in
it without re"writing" it from scratch.
I'm saying "writing" because it is possible to change a few things
around, but recomputing the trailing SHA1 swamps that by a large margin
unless you are writing to a floppy disk, so it doesn't matter. I'm sure
using a CRC32 helps here, but if we're going to make an incompatible
change, why not go all the way?
A tree layout can fix that if it is properly arranged so that if you
'git add path/to/file', it only updates the SHA1s for path/to/file,
path/to and path. For this to work, the checks would have to correspond
to the trees, perhaps even directly use the actual tree SHA1. This
would at least be natural in some sense; getting to actual log(n)
complexity for hilariously large directories would require dynamically
splitting directories where appropriate.
Along the same lines the format should allow for changing the extension
data for a single extension while only rehashing the new data.
When I worked on cache-tree, I considered making a change to the latter
effect, but thought the impact too great for a little gain. Now from
this thread, I'm getting the impression that such a change would be ok,
even if users would have to scrap the index if they downgrade. Is that
right?
--
Thomas Rast
trast@{inf,student}.ethz.ch
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 4/6] Introduce index version 4 with global flags
2012-02-06 5:48 [PATCH 1/6] read-cache: use sha1file for sha1 calculation Nguyễn Thái Ngọc Duy
2012-02-06 5:48 ` [PATCH 2/6] csum-file: make sha1 calculation optional Nguyễn Thái Ngọc Duy
2012-02-06 5:48 ` [PATCH 3/6] Stop producing index version 2 Nguyễn Thái Ngọc Duy
@ 2012-02-06 5:48 ` Nguyễn Thái Ngọc Duy
2012-02-06 5:48 ` [PATCH 5/6] Allow to use crc32 as a lighter checksum on index Nguyễn Thái Ngọc Duy
` (2 subsequent siblings)
5 siblings, 0 replies; 20+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2012-02-06 5:48 UTC (permalink / raw)
To: git; +Cc: Thomas Rast, Joshua Redstone,
Nguyễn Thái Ngọc Duy
v4 adds 32-bit field to cache header after 32-bit number of entries.
If this field is zero, fall back to v3.
Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
Documentation/technical/index-format.txt | 4 ++-
cache.h | 6 +++++
read-cache.c | 31 ++++++++++++++++++++++-------
3 files changed, 32 insertions(+), 9 deletions(-)
diff --git a/Documentation/technical/index-format.txt b/Documentation/technical/index-format.txt
index 8930b3f..2b6a38e 100644
--- a/Documentation/technical/index-format.txt
+++ b/Documentation/technical/index-format.txt
@@ -12,10 +12,12 @@ GIT index format
The signature is { 'D', 'I', 'R', 'C' } (stands for "dircache")
4-byte version number:
- The current supported versions are 2 and 3.
+ The current supported versions are 2, 3 and 4.
32-bit number of index entries.
+ 32-bit flags (version 4 only).
+
- A number of sorted index entries (see below).
- Extensions
diff --git a/cache.h b/cache.h
index 9bd8c2d..c2e884a 100644
--- a/cache.h
+++ b/cache.h
@@ -105,6 +105,11 @@ struct cache_header {
unsigned int hdr_entries;
};
+struct ext_cache_header {
+ struct cache_header h;
+ unsigned int hdr_flags;
+};
+
/*
* The "cache_time" is just the low 32 bits of the
* time. It doesn't matter if it overflows - we only
@@ -314,6 +319,7 @@ static inline unsigned int canon_mode(unsigned int mode)
struct index_state {
struct cache_entry **cache;
unsigned int cache_nr, cache_alloc, cache_changed;
+ unsigned int hdr_flags;
struct string_list *resolve_undo;
struct cache_tree *cache_tree;
struct cache_time timestamp;
diff --git a/read-cache.c b/read-cache.c
index fe6b0e0..fd21af6 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1190,7 +1190,9 @@ static int verify_hdr(struct cache_header *hdr, unsigned long size)
if (hdr->hdr_signature != htonl(CACHE_SIGNATURE))
return error("bad signature");
- if (hdr->hdr_version != htonl(2) && hdr->hdr_version != htonl(3))
+ if (hdr->hdr_version != htonl(2) &&
+ hdr->hdr_version != htonl(3) &&
+ hdr->hdr_version != htonl(4))
return error("bad index version");
git_SHA1_Init(&c);
git_SHA1_Update(&c, hdr, size - 20);
@@ -1320,7 +1322,12 @@ int read_index_from(struct index_state *istate, const char *path)
istate->cache = xcalloc(istate->cache_alloc, sizeof(struct cache_entry *));
istate->initialized = 1;
- src_offset = sizeof(*hdr);
+ if (ntohl(hdr->hdr_version) >= 4) {
+ struct ext_cache_header *ehdr = mmap;
+ istate->hdr_flags = ntohl(ehdr->hdr_flags);
+ src_offset = sizeof(*ehdr);
+ } else
+ src_offset = sizeof(*hdr);
for (i = 0; i < istate->cache_nr; i++) {
struct ondisk_cache_entry *disk_ce;
struct cache_entry *ce;
@@ -1375,6 +1382,7 @@ int discard_index(struct index_state *istate)
resolve_undo_clear_index(istate);
istate->cache_nr = 0;
istate->cache_changed = 0;
+ istate->hdr_flags = 0;
istate->timestamp.sec = 0;
istate->timestamp.nsec = 0;
istate->name_hash_initialized = 0;
@@ -1531,8 +1539,8 @@ void update_index_if_able(struct index_state *istate, struct lock_file *lockfile
int write_index(struct index_state *istate, int newfd)
{
struct sha1file *f;
- struct cache_header hdr;
- int i, err, removed;
+ struct ext_cache_header hdr;
+ int i, err, removed, hdr_size;
struct cache_entry **cache = istate->cache;
int entries = istate->cache_nr;
struct stat st;
@@ -1548,12 +1556,19 @@ int write_index(struct index_state *istate, int newfd)
}
}
- hdr.hdr_signature = htonl(CACHE_SIGNATURE);
- hdr.hdr_version = htonl(3);
- hdr.hdr_entries = htonl(entries - removed);
+ hdr.h.hdr_signature = htonl(CACHE_SIGNATURE);
+ if (istate->hdr_flags) {
+ hdr.h.hdr_version = htonl(4);
+ hdr.hdr_flags = htonl(istate->hdr_flags);
+ hdr_size = sizeof(hdr);
+ } else {
+ hdr.h.hdr_version = htonl(3);
+ hdr_size = sizeof(hdr.h);
+ }
+ hdr.h.hdr_entries = htonl(entries - removed);
f = sha1fd(newfd, NULL);
- if (ce_write(f, &hdr, sizeof(hdr)) < 0)
+ if (ce_write(f, &hdr, hdr_size) < 0)
return -1;
for (i = 0; i < entries; i++) {
--
1.7.8.36.g69ee2
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [PATCH 5/6] Allow to use crc32 as a lighter checksum on index
2012-02-06 5:48 [PATCH 1/6] read-cache: use sha1file for sha1 calculation Nguyễn Thái Ngọc Duy
` (2 preceding siblings ...)
2012-02-06 5:48 ` [PATCH 4/6] Introduce index version 4 with global flags Nguyễn Thái Ngọc Duy
@ 2012-02-06 5:48 ` Nguyễn Thái Ngọc Duy
2012-02-07 3:17 ` Shawn Pearce
2012-02-06 5:48 ` [PATCH 6/6] Automatically switch to crc32 checksum for index when it's too large Nguyễn Thái Ngọc Duy
2012-02-06 7:34 ` [PATCH 1/6] read-cache: use sha1file for sha1 calculation Junio C Hamano
5 siblings, 1 reply; 20+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2012-02-06 5:48 UTC (permalink / raw)
To: git; +Cc: Thomas Rast, Joshua Redstone,
Nguyễn Thái Ngọc Duy
Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
Documentation/git-update-index.txt | 12 +++++++-
builtin/update-index.c | 11 +++++++
cache.h | 2 +
read-cache.c | 54 ++++++++++++++++++++++++++++--------
4 files changed, 66 insertions(+), 13 deletions(-)
diff --git a/Documentation/git-update-index.txt b/Documentation/git-update-index.txt
index a3081f4..2574a4e 100644
--- a/Documentation/git-update-index.txt
+++ b/Documentation/git-update-index.txt
@@ -13,7 +13,7 @@ SYNOPSIS
[--add] [--remove | --force-remove] [--replace]
[--refresh] [-q] [--unmerged] [--ignore-missing]
[(--cacheinfo <mode> <object> <file>)...]
- [--chmod=(+|-)x]
+ [--chmod=(+|-)x] [--[no-]crc32]
[--assume-unchanged | --no-assume-unchanged]
[--skip-worktree | --no-skip-worktree]
[--ignore-submodules]
@@ -109,6 +109,16 @@ you will need to handle the situation manually.
set and unset the "skip-worktree" bit for the paths. See
section "Skip-worktree bit" below for more information.
+--crc32::
+--no-crc32::
+ Normally SHA-1 is used to check for index integrity. When the
+ index is large, SHA-1 computation cost can be significant.
+ --crc32 will convert current index to use (cheaper) crc32
+ instead. Note that later writes to index by other commands can
+ convert the index back to SHA-1. Older git versions may not
+ understand crc32 index, --no-crc32 can be used to convert it
+ back to SHA-1.
+
-g::
--again::
Runs 'git update-index' itself on the paths whose index
diff --git a/builtin/update-index.c b/builtin/update-index.c
index a6a23fa..6913226 100644
--- a/builtin/update-index.c
+++ b/builtin/update-index.c
@@ -707,6 +707,7 @@ int cmd_update_index(int argc, const char **argv, const char *prefix)
{
int newfd, entries, has_errors = 0, line_termination = '\n';
int read_from_stdin = 0;
+ int do_crc = -1;
int prefix_length = prefix ? strlen(prefix) : 0;
char set_executable_bit = 0;
struct refresh_params refresh_args = {0, &has_errors};
@@ -791,6 +792,8 @@ int cmd_update_index(int argc, const char **argv, const char *prefix)
"(for porcelains) forget saved unresolved conflicts",
PARSE_OPT_NOARG | PARSE_OPT_NONEG,
resolve_undo_clear_callback},
+ OPT_BOOL(0, "crc32", &do_crc,
+ "use crc32 as checksum instead of sha1"),
OPT_END()
};
@@ -852,6 +855,14 @@ int cmd_update_index(int argc, const char **argv, const char *prefix)
}
argc = parse_options_end(&ctx);
+ if (do_crc != -1) {
+ if (do_crc)
+ the_index.hdr_flags |= CACHE_F_CRC;
+ else
+ the_index.hdr_flags &= ~CACHE_F_CRC;
+ active_cache_changed = 1;
+ }
+
if (read_from_stdin) {
struct strbuf buf = STRBUF_INIT, nbuf = STRBUF_INIT;
diff --git a/cache.h b/cache.h
index c2e884a..7352402 100644
--- a/cache.h
+++ b/cache.h
@@ -105,6 +105,8 @@ struct cache_header {
unsigned int hdr_entries;
};
+#define CACHE_F_CRC 1 /* use crc32 instead of sha1 for index checksum */
+
struct ext_cache_header {
struct cache_header h;
unsigned int hdr_flags;
diff --git a/read-cache.c b/read-cache.c
index fd21af6..a34878e 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1185,20 +1185,33 @@ static struct cache_entry *refresh_cache_entry(struct cache_entry *ce, int reall
static int verify_hdr(struct cache_header *hdr, unsigned long size)
{
+ int do_crc;
git_SHA_CTX c;
unsigned char sha1[20];
if (hdr->hdr_signature != htonl(CACHE_SIGNATURE))
return error("bad signature");
- if (hdr->hdr_version != htonl(2) &&
- hdr->hdr_version != htonl(3) &&
- hdr->hdr_version != htonl(4))
+ if (hdr->hdr_version == htonl(2) ||
+ hdr->hdr_version == htonl(3))
+ do_crc = 0;
+ else if (hdr->hdr_version == htonl(4)) {
+ struct ext_cache_header *ehdr = (struct ext_cache_header *)hdr;
+ do_crc = ntohl(ehdr->hdr_flags) & CACHE_F_CRC;
+ }
+ else
return error("bad index version");
- git_SHA1_Init(&c);
- git_SHA1_Update(&c, hdr, size - 20);
- git_SHA1_Final(sha1, &c);
- if (hashcmp(sha1, (unsigned char *)hdr + size - 20))
- return error("bad index file sha1 signature");
+ if (do_crc) {
+ uint32_t crc = crc32(0, NULL, 0);
+ crc = crc32(crc,(void *) hdr, size - sizeof(uint32_t));
+ if (crc != *(uint32_t*)((unsigned char *)hdr + size - sizeof(uint32_t)))
+ return error("bad index file crc32 signature");
+ } else {
+ git_SHA1_Init(&c);
+ git_SHA1_Update(&c, hdr, size - 20);
+ git_SHA1_Final(sha1, &c);
+ if (hashcmp(sha1, (unsigned char *)hdr + size - 20))
+ return error("bad index file sha1 signature");
+ }
return 0;
}
@@ -1421,11 +1434,24 @@ static int write_index_ext_header(struct sha1file *f,
static int ce_flush(struct sha1file *f)
{
unsigned char sha1[20];
- int fd = sha1close(f, sha1, 0);
+ int fd;
- if (fd < 0)
- return -1;
- return (write_in_full(fd, sha1, 20) != 20) ? -1 : 0;
+ if (f->do_crc) {
+ uint32_t crc;
+
+ assert(f->do_sha1 == 0);
+ sha1flush(f);
+ crc = crc32_end(f);
+ fd = sha1close(f, sha1, 0);
+ if (fd < 0)
+ return -1;
+ return (write_in_full(fd, &crc, sizeof(crc)) != sizeof(crc)) ? -1 : 0;
+ } else {
+ fd = sha1close(f, sha1, 0);
+ if (fd < 0)
+ return -1;
+ return (write_in_full(fd, sha1, 20) != 20) ? -1 : 0;
+ }
}
static void ce_smudge_racily_clean_entry(struct cache_entry *ce)
@@ -1568,6 +1594,10 @@ int write_index(struct index_state *istate, int newfd)
hdr.h.hdr_entries = htonl(entries - removed);
f = sha1fd(newfd, NULL);
+ if (istate->hdr_flags & CACHE_F_CRC) {
+ crc32_begin(f);
+ f->do_sha1 = 0;
+ }
if (ce_write(f, &hdr, hdr_size) < 0)
return -1;
--
1.7.8.36.g69ee2
^ permalink raw reply related [flat|nested] 20+ messages in thread
* Re: [PATCH 5/6] Allow to use crc32 as a lighter checksum on index
2012-02-06 5:48 ` [PATCH 5/6] Allow to use crc32 as a lighter checksum on index Nguyễn Thái Ngọc Duy
@ 2012-02-07 3:17 ` Shawn Pearce
2012-02-07 4:04 ` Dave Zarzycki
0 siblings, 1 reply; 20+ messages in thread
From: Shawn Pearce @ 2012-02-07 3:17 UTC (permalink / raw)
To: Nguyễn Thái Ngọc Duy; +Cc: git, Thomas Rast, Joshua Redstone
2012/2/5 Nguyễn Thái Ngọc Duy <pclouds@gmail.com>:
> if (hdr->hdr_signature != htonl(CACHE_SIGNATURE))
> return error("bad signature");
> - if (hdr->hdr_version != htonl(2) &&
> - hdr->hdr_version != htonl(3) &&
> - hdr->hdr_version != htonl(4))
> + if (hdr->hdr_version == htonl(2) ||
> + hdr->hdr_version == htonl(3))
> + do_crc = 0;
> + else if (hdr->hdr_version == htonl(4)) {
> + struct ext_cache_header *ehdr = (struct ext_cache_header *)hdr;
> + do_crc = ntohl(ehdr->hdr_flags) & CACHE_F_CRC;
> + }
> + else
Ick. Ick. Ick. Please $DEITY no.
When it comes to data integrity codes in Git... PICK ONE AND STICK WITH IT.
If CRC-32 is good enough to protect the index content such that disk
corruption is probably detectable with it, lets just switch to CRC-32
in index version 4. Don't make it optional with a new header field
that wasn't there in version 3 and is now only able to accept 32 bits
of flags before we have to go and create index version 5. We already
have a cache extension system available with extension codes in the
footer of the index file. We don't need YET ANOTHER EXTENSION SYSTEM.
If CRC-32 is not good enough, and we don't want to trust it (or
really, YOU don't want to trust it) please do not then go and propose
that a less knowledgeable user should switch to CRC-32 "because it is
faster". If we don't want to rely on the error detection of CRC-32,
then we should be using SHA-1. Or SHA-256.
I haven't really put a lot of thought into this. But I suspect CRC-32
is sufficient on the index file, until it gets so big that the
probability of a bit flip going undetected is too high due to the size
of the file, but then we are into the "huge" index size range that has
you trying to swap out SHA-1 for CRC-32 because SHA-1 is too slow. Uhm
no.
CRC-32 may be good enough, we use it inside of the pack-objects when
doing repacking locally and don't want to inflate objects to check
SHA-1, but do want to try and detect a random bit flip caused by a
broken file copier. Thus far its held up well there. Given the very
transient nature of the index file (and how it can be mostly rebuilt
from a tree object and the working directory), CRC-32 might be good
enough. But please pick one.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 5/6] Allow to use crc32 as a lighter checksum on index
2012-02-07 3:17 ` Shawn Pearce
@ 2012-02-07 4:04 ` Dave Zarzycki
2012-02-07 4:29 ` Dave Zarzycki
0 siblings, 1 reply; 20+ messages in thread
From: Dave Zarzycki @ 2012-02-07 4:04 UTC (permalink / raw)
To: Shawn Pearce
Cc: Nguyễn Thái Ngọc Duy, git, Thomas Rast,
Joshua Redstone
On Feb 6, 2012, at 7:17 PM, Shawn Pearce <spearce@spearce.org> wrote:
> I haven't really put a lot of thought into this. But I suspect CRC-32
> is sufficient on the index file, until it gets so big that the
> probability of a bit flip going undetected is too high due to the size
> of the file, but then we are into the "huge" index size range that has
> you trying to swap out SHA-1 for CRC-32 because SHA-1 is too slow. Uhm
> no.
CRCs are designed to be implemented in hardware and provide basic single-bit error checking for networking packets of disk blocks. With a good polynomial, they're reasonably effective at detecting a single-bit error within 8 or 16 kilobytes:
http://www.ece.cmu.edu/~koopman/networks/dsn02/dsn02_koopman.pdf
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 5/6] Allow to use crc32 as a lighter checksum on index
2012-02-07 4:04 ` Dave Zarzycki
@ 2012-02-07 4:29 ` Dave Zarzycki
0 siblings, 0 replies; 20+ messages in thread
From: Dave Zarzycki @ 2012-02-07 4:29 UTC (permalink / raw)
To: Shawn Pearce
Cc: Nguyễn Thái Ngọc Duy, git, Thomas Rast,
Joshua Redstone
On Feb 6, 2012, at 8:04 PM, Dave Zarzycki <zarzycki@apple.com> wrote:
> On Feb 6, 2012, at 7:17 PM, Shawn Pearce <spearce@spearce.org> wrote:
>
>> I haven't really put a lot of thought into this. But I suspect CRC-32
>> is sufficient on the index file, until it gets so big that the
>> probability of a bit flip going undetected is too high due to the size
>> of the file, but then we are into the "huge" index size range that has
>> you trying to swap out SHA-1 for CRC-32 because SHA-1 is too slow. Uhm
>> no.
>
> CRCs are designed to be implemented in hardware and provide basic single-bit error checking for networking packets of disk blocks. With a good polynomial, they're reasonably effective at detecting a single-bit error within 8 or 16 kilobytes:
>
> http://www.ece.cmu.edu/~koopman/networks/dsn02/dsn02_koopman.pdf
s/packets of disk blocks/packets or disk blocks/g
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 6/6] Automatically switch to crc32 checksum for index when it's too large
2012-02-06 5:48 [PATCH 1/6] read-cache: use sha1file for sha1 calculation Nguyễn Thái Ngọc Duy
` (3 preceding siblings ...)
2012-02-06 5:48 ` [PATCH 5/6] Allow to use crc32 as a lighter checksum on index Nguyễn Thái Ngọc Duy
@ 2012-02-06 5:48 ` Nguyễn Thái Ngọc Duy
2012-02-06 8:50 ` Dave Zarzycki
2012-02-06 7:34 ` [PATCH 1/6] read-cache: use sha1file for sha1 calculation Junio C Hamano
5 siblings, 1 reply; 20+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2012-02-06 5:48 UTC (permalink / raw)
To: git; +Cc: Thomas Rast, Joshua Redstone,
Nguyễn Thái Ngọc Duy
An experiment with -O3 is done on Intel D510@1.66GHz. At around 250k
entries, index reading time exceeds 0.5s. Switching to crc32 brings it
back lower than 0.2s.
On 4M files index, reading time with SHA-1 takes ~8.4, crc32 2.8s.
Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
I know no real repositories this size though. gentoo-x86 is "only"
120k. Haven't checked libreoffice repo yet.
On 2M files index, allocating one big block (i.e. reverting debed2a
(read-cache.c: allocate index entries individually - 2011-10-24)
saves about 0.3s. Maybe we can allocate one big block, then malloc
separately when the block is fully used.
Writing time is still high. "git update-index --crc32" on crc32 250k index
takes 0.9s (so writing time is about 0.5s)
A better solution may be narrow clone (or just the narrow checkout
part), where index only contains entries from checked out
subdirectories.
Documentation/config.txt | 7 +++++++
builtin/update-index.c | 1 +
cache.h | 1 +
config.c | 5 +++++
environment.c | 1 +
read-cache.c | 8 ++++++++
6 files changed, 23 insertions(+), 0 deletions(-)
diff --git a/Documentation/config.txt b/Documentation/config.txt
index abeb82b..55b7596 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -540,6 +540,13 @@ relatively high IO latencies. With this set to 'true', git will do the
index comparison to the filesystem data in parallel, allowing
overlapping IO's.
+core.crc32IndexThreshold::
+ Usually SHA-1 is used to check for index integerity. When the
+ number of entries in index exceeds this threshold, crc32 will
+ be used instead. Zero means SHA-1 always be used. Negative
+ value disables this threshold (i.e. crc32 or SHA-1 is decided
+ by other means).
+
core.createObject::
You can set this to 'link', in which case a hardlink followed by
a delete of the source are used to make sure that object creation
diff --git a/builtin/update-index.c b/builtin/update-index.c
index 6913226..5cb51c7 100644
--- a/builtin/update-index.c
+++ b/builtin/update-index.c
@@ -856,6 +856,7 @@ int cmd_update_index(int argc, const char **argv, const char *prefix)
argc = parse_options_end(&ctx);
if (do_crc != -1) {
+ core_crc32_index_threshold = -1;
if (do_crc)
the_index.hdr_flags |= CACHE_F_CRC;
else
diff --git a/cache.h b/cache.h
index 7352402..d05856b 100644
--- a/cache.h
+++ b/cache.h
@@ -610,6 +610,7 @@ extern unsigned long pack_size_limit_cfg;
extern int read_replace_refs;
extern int fsync_object_files;
extern int core_preload_index;
+extern int core_crc32_index_threshold;
extern int core_apply_sparse_checkout;
enum branch_track {
diff --git a/config.c b/config.c
index 40f9c6d..905e071 100644
--- a/config.c
+++ b/config.c
@@ -671,6 +671,11 @@ static int git_default_core_config(const char *var, const char *value)
return 0;
}
+ if (!strcmp(var, "core.crc32indexthreshold")) {
+ core_crc32_index_threshold = git_config_int(var, value);
+ return 0;
+ }
+
if (!strcmp(var, "core.createobject")) {
if (!strcmp(value, "rename"))
object_creation_mode = OBJECT_CREATION_USES_RENAMES;
diff --git a/environment.c b/environment.c
index c93b8f4..9d9dfc2 100644
--- a/environment.c
+++ b/environment.c
@@ -66,6 +66,7 @@ unsigned long pack_size_limit_cfg;
/* Parallel index stat data preload? */
int core_preload_index = 0;
+int core_crc32_index_threshold = 250000;
/* This is set by setup_git_dir_gently() and/or git_default_config() */
char *git_work_tree_cfg;
diff --git a/read-cache.c b/read-cache.c
index a34878e..fd032d8 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1582,6 +1582,14 @@ int write_index(struct index_state *istate, int newfd)
}
}
+ if (core_crc32_index_threshold >= 0) {
+ if (core_crc32_index_threshold > 0 &&
+ istate->cache_nr >= core_crc32_index_threshold)
+ istate->hdr_flags |= CACHE_F_CRC;
+ else
+ istate->hdr_flags &= ~CACHE_F_CRC;
+ }
+
hdr.h.hdr_signature = htonl(CACHE_SIGNATURE);
if (istate->hdr_flags) {
hdr.h.hdr_version = htonl(4);
--
1.7.8.36.g69ee2
^ permalink raw reply related [flat|nested] 20+ messages in thread
* Re: [PATCH 6/6] Automatically switch to crc32 checksum for index when it's too large
2012-02-06 5:48 ` [PATCH 6/6] Automatically switch to crc32 checksum for index when it's too large Nguyễn Thái Ngọc Duy
@ 2012-02-06 8:50 ` Dave Zarzycki
2012-02-06 8:54 ` Nguyen Thai Ngoc Duy
0 siblings, 1 reply; 20+ messages in thread
From: Dave Zarzycki @ 2012-02-06 8:50 UTC (permalink / raw)
To: Nguyễn Thái Ngọc Duy; +Cc: git, Thomas Rast, Joshua Redstone
Which crc32 polynomial is being used? crc32c (a.k.a. Castagnoli)? It would be great if this were the same polynomial that Intel implements in hardware via SSE4.2.
On Feb 5, 2012, at 9:48 PM, Nguyễn Thái Ngọc Duy <pclouds@gmail.com> wrote:
> An experiment with -O3 is done on Intel D510@1.66GHz. At around 250k
> entries, index reading time exceeds 0.5s. Switching to crc32 brings it
> back lower than 0.2s.
>
> On 4M files index, reading time with SHA-1 takes ~8.4, crc32 2.8s.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 6/6] Automatically switch to crc32 checksum for index when it's too large
2012-02-06 8:50 ` Dave Zarzycki
@ 2012-02-06 8:54 ` Nguyen Thai Ngoc Duy
2012-02-06 9:07 ` Dave Zarzycki
0 siblings, 1 reply; 20+ messages in thread
From: Nguyen Thai Ngoc Duy @ 2012-02-06 8:54 UTC (permalink / raw)
To: Dave Zarzycki; +Cc: git, Thomas Rast, Joshua Redstone
2012/2/6 Dave Zarzycki <zarzycki@apple.com>:
> Which crc32 polynomial is being used? crc32c (a.k.a. Castagnoli)? It would be great if this were the same polynomial that Intel implements in hardware via SSE4.2.
It's zlib's crc32.
> On Feb 5, 2012, at 9:48 PM, Nguyễn Thái Ngọc Duy <pclouds@gmail.com> wrote:
>
>> An experiment with -O3 is done on Intel D510@1.66GHz. At around 250k
>> entries, index reading time exceeds 0.5s. Switching to crc32 brings it
>> back lower than 0.2s.
>>
>> On 4M files index, reading time with SHA-1 takes ~8.4, crc32 2.8s.
--
Duy
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 6/6] Automatically switch to crc32 checksum for index when it's too large
2012-02-06 8:54 ` Nguyen Thai Ngoc Duy
@ 2012-02-06 9:07 ` Dave Zarzycki
0 siblings, 0 replies; 20+ messages in thread
From: Dave Zarzycki @ 2012-02-06 9:07 UTC (permalink / raw)
To: Nguyen Thai Ngoc Duy; +Cc: git, Thomas Rast, Joshua Redstone
On Feb 6, 2012, at 12:54 AM, Nguyen Thai Ngoc Duy <pclouds@gmail.com> wrote:
> 2012/2/6 Dave Zarzycki <zarzycki@apple.com>:
>> Which crc32 polynomial is being used? crc32c (a.k.a. Castagnoli)? It would be great if this were the same polynomial that Intel implements in hardware via SSE4.2.
>
> It's zlib's crc32.
That's too bad. Zlib uses crc32, not crc32c. The Intel instruction is crc32c and is 2-3 times faster than the best software based implementation.
http://www.strchr.com/crc32_popcnt
>
>> On Feb 5, 2012, at 9:48 PM, Nguyễn Thái Ngọc Duy <pclouds@gmail.com> wrote:
>>
>>> An experiment with -O3 is done on Intel D510@1.66GHz. At around 250k
>>> entries, index reading time exceeds 0.5s. Switching to crc32 brings it
>>> back lower than 0.2s.
>>>
>>> On 4M files index, reading time with SHA-1 takes ~8.4, crc32 2.8s.
> --
> Duy
> --
> To unsubscribe from this list: send the line "unsubscribe git" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 1/6] read-cache: use sha1file for sha1 calculation
2012-02-06 5:48 [PATCH 1/6] read-cache: use sha1file for sha1 calculation Nguyễn Thái Ngọc Duy
` (4 preceding siblings ...)
2012-02-06 5:48 ` [PATCH 6/6] Automatically switch to crc32 checksum for index when it's too large Nguyễn Thái Ngọc Duy
@ 2012-02-06 7:34 ` Junio C Hamano
2012-02-06 8:36 ` Nguyen Thai Ngoc Duy
5 siblings, 1 reply; 20+ messages in thread
From: Junio C Hamano @ 2012-02-06 7:34 UTC (permalink / raw)
To: Nguyễn Thái Ngọc Duy; +Cc: git, Thomas Rast, Joshua Redstone
Nguyễn Thái Ngọc Duy <pclouds@gmail.com> writes:
> Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
Having no explanation on any of the patch in the series without cover
letter makes it hard to comment on anything, and not having any numbers
makes it even harder after guessing that this is about some performance
tweaks for 2M entry index cases.
This is open source, and I wouldn't stop you from spending time on anything
that interests you.
But having said that, if you have extra Git time, I would still rather see
you spend it first on tying up loose ends of your topics in flight and
on helping others that touch parts that are related to areas that you have
already thought about, namely:
(1) nd/commit-ignore-i-t-a, which I think should be marketted as fixing
an earlier UI mistake and presented with a clean migration path to
make the updated behaviour the default in the future; and
(2) the negative pathspec thing that resurfaced in disguise as Albert
Yale's "grep --exclude" series.
than playing with the approach of this series. The two reasons I suspect
that spending your time on this series will give us much less value than
the above two topics out of you are:
(1) While I think 2M-entry index is an interesting issue, it does not
affect most of the people; and more importantly
(2) I think the proper way to handle 2M-entry index case is to avoid
having to write and read the whole 2M-entry as a flat table in the
first place, not by weakening how its integrity is assured in order
to micro-tweak the read/write efficiency without re-examining the
flatness of the current in-core index [*1*].
The first patch that reuses the existing csum-file API to older code that
was written before csum-file was invented is probably a good thing to do,
though, independent of the 2M-entry issue.
Thanks.
[Footnote]
*1* A possible approach might be to stuff unmodified trees in the index
without exploding them into its components, and as entries are modified,
lazily expand these "tree" entries, while ensuring the "unmodified" parts
remain unmodified by turning the files in the working tree read-only and
requiring the user to say "git edit" or "git open" or something before
starting to edit. But as I said, I consider this not an ultra-urgent
issue, so I haven't thought things through yet.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 1/6] read-cache: use sha1file for sha1 calculation
2012-02-06 7:34 ` [PATCH 1/6] read-cache: use sha1file for sha1 calculation Junio C Hamano
@ 2012-02-06 8:36 ` Nguyen Thai Ngoc Duy
0 siblings, 0 replies; 20+ messages in thread
From: Nguyen Thai Ngoc Duy @ 2012-02-06 8:36 UTC (permalink / raw)
To: Junio C Hamano; +Cc: git
2012/2/6 Junio C Hamano <gitster@pobox.com>:
> This is open source, and I wouldn't stop you from spending time on anything
> that interests you.
Well fun stuff is more interesting to do.. but I guess that's about
it. More time cut down requires bigger changes that does not fit in a
few hours of work.
> But having said that, if you have extra Git time, I would still rather see
> you spend it first on tying up loose ends of your topics in flight and
> on helping others that touch parts that are related to areas that you have
> already thought about, namely:
>
> (1) nd/commit-ignore-i-t-a, which I think should be marketted as fixing
> an earlier UI mistake and presented with a clean migration path to
> make the updated behaviour the default in the future; and
Yeah, I was avoiding the deprecation procedure (plus providing a
convincing argument to push it forward). Need to look up old emails..
> (2) the negative pathspec thing that resurfaced in disguise as Albert
> Yale's "grep --exclude" series.
This is pure headache. Can't avoid it forever, I guess.
> *1* A possible approach might be to stuff unmodified trees in the index
> without exploding them into its components, and as entries are modified,
> lazily expand these "tree" entries, while ensuring the "unmodified" parts
> remain unmodified by turning the files in the working tree read-only and
> requiring the user to say "git edit" or "git open" or something before
> starting to edit. But as I said, I consider this not an ultra-urgent
> issue, so I haven't thought things through yet.
A sparse index is something that may be achieved with narrow clone (or
narrow checkout in full clone) because by nature we can't have full
index in narrow clone. That may be the right way to go.
--
Duy
^ permalink raw reply [flat|nested] 20+ messages in thread
end of thread, other threads:[~2012-02-07 17:25 UTC | newest]
Thread overview: 20+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2012-02-06 5:48 [PATCH 1/6] read-cache: use sha1file for sha1 calculation Nguyễn Thái Ngọc Duy
2012-02-06 5:48 ` [PATCH 2/6] csum-file: make sha1 calculation optional Nguyễn Thái Ngọc Duy
2012-02-06 5:48 ` [PATCH 3/6] Stop producing index version 2 Nguyễn Thái Ngọc Duy
2012-02-06 7:10 ` Junio C Hamano
2012-02-07 3:09 ` Shawn Pearce
2012-02-07 4:50 ` Nguyen Thai Ngoc Duy
2012-02-07 8:51 ` Nguyen Thai Ngoc Duy
2012-02-07 5:21 ` Junio C Hamano
2012-02-07 17:25 ` Thomas Rast
2012-02-06 5:48 ` [PATCH 4/6] Introduce index version 4 with global flags Nguyễn Thái Ngọc Duy
2012-02-06 5:48 ` [PATCH 5/6] Allow to use crc32 as a lighter checksum on index Nguyễn Thái Ngọc Duy
2012-02-07 3:17 ` Shawn Pearce
2012-02-07 4:04 ` Dave Zarzycki
2012-02-07 4:29 ` Dave Zarzycki
2012-02-06 5:48 ` [PATCH 6/6] Automatically switch to crc32 checksum for index when it's too large Nguyễn Thái Ngọc Duy
2012-02-06 8:50 ` Dave Zarzycki
2012-02-06 8:54 ` Nguyen Thai Ngoc Duy
2012-02-06 9:07 ` Dave Zarzycki
2012-02-06 7:34 ` [PATCH 1/6] read-cache: use sha1file for sha1 calculation Junio C Hamano
2012-02-06 8:36 ` Nguyen Thai Ngoc Duy
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).