* [PATCH] multi item packed files
@ 2005-04-21 15:13 Chris Mason
2005-04-21 15:41 ` Linus Torvalds
0 siblings, 1 reply; 17+ messages in thread
From: Chris Mason @ 2005-04-21 15:13 UTC (permalink / raw)
To: git, torvalds
[-- Attachment #1: Type: text/plain, Size: 1797 bytes --]
Hello,
There have been a few threads on making git more space efficient, and
eventually someone mentions tiny files and space fragmentation. Now that git
object names are decoupled from their compression, it's easier to consider a
variety of compression algorithms. I whipped up a really silly "pack files
together" compression.
This would maintain the write-once semantics but allow a simple mechanism
where objects are combined together. Choosing which objects to combine is
easy: things put into update-cache together go together. This gives us more
space efficiency and no seeks when reading that packed file off disk.
A natural extension to this is to make update-cache --commit-tree, which
includes the files produced by write-tree and commit-tree into the same
packed file. (I haven't coded this).
The layout works like this:
1) a new object type "packed" is added.
2) new objects are buffered into a packed object until it gets to around 32k
in size. This is completely arbitrary but felt about right.
3) The packed object is written to git storage and then hard links are made
to the packed object from the sha1 filename of each object inside.
4) read_sha1_file is changed to recognize the packed object and search inside.
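To make the layout concrete, here is a minimal sketch (illustration only, not
the code in the attached patch) of walking the decompressed payload of a packed
object: each record is an ASCII "<sha1 hex> <length>" header, a NUL, and then
<length> bytes holding an ordinary "type size" object body, which is the layout
the patch below writes and parses.

#include <stdio.h>
#include <string.h>

/* Walk a decompressed "packed" payload and return a pointer to the
 * embedded "type size\0data" body whose name matches want_hex.
 * Purely a sketch of the record layout described above. */
static void *find_in_packed(char *buf, unsigned long size,
                            const char *want_hex, unsigned long *body_len)
{
        char *p = buf;

        while (p < buf + size) {
                char hex[50];
                unsigned long item_len;

                /* record header: "<40-char sha1 hex> <length>" + NUL */
                if (sscanf(p, "%40s %lu", hex, &item_len) != 2)
                        return NULL;                  /* corrupt record */
                if (!strcmp(hex, want_hex)) {
                        *body_len = item_len;
                        return p + strlen(p) + 1;     /* the object body */
                }
                p += strlen(p) + 1 + item_len;        /* skip to next record */
        }
        return NULL;                                  /* not in this pack */
}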
I did a simple test on the 2.6.11 tree with my 100 patches applied. Without
packing, .git is 99MB. With packing it only needs 62MB.
Read speeds don't suffer with this: the time for read-tree ; checkout-cache -a -f
from a cold cache was the same. I could get the times lower with the patch
by caching the uncompressed data, since in theory it should be faster here.
Using this on data you care about would be a really bad idea right now. I'm
only posting the patch to get the basic idea across for benchmarking and
discussion.
-chris
[-- Attachment #2: comp-tree.diff --]
[-- Type: text/x-diff, Size: 9944 bytes --]
diff -ur linus.back/cache.h linus/cache.h
--- linus.back/cache.h 2005-04-21 11:05:27.971607944 -0400
+++ linus/cache.h 2005-04-21 09:35:47.173613576 -0400
@@ -109,7 +109,7 @@
/* Read and unpack a sha1 file into memory, write memory to a sha1 file */
extern void * map_sha1_file(const unsigned char *sha1, unsigned long *size);
-extern void * unpack_sha1_file(void *map, unsigned long mapsize, char *type, unsigned long *size);
+extern void * unpack_sha1_file(const unsigned char *sha1, void *map, unsigned long mapsize, char *type, unsigned long *size);
extern void * read_sha1_file(const unsigned char *sha1, char *type, unsigned long *size);
extern int write_sha1_file(char *buf, unsigned len, unsigned char *return_sha1);
extern int check_sha1_signature(unsigned char *sha1, void *buf, unsigned long size, const char *type);
@@ -117,6 +117,10 @@
/* Convert to/from hex/sha1 representation */
extern int get_sha1_hex(const char *hex, unsigned char *sha1);
extern char *sha1_to_hex(const unsigned char *sha1); /* static buffer result! */
+extern int pack_sha1_buffer(void *buf, unsigned long buf_len,
+ unsigned char *returnsha1, char **dest,
+ unsigned long *dest_size);
+int write_packed_buffer(void *buf, unsigned long len);
/* General helper functions */
extern void usage(const char *err);
diff -ur linus.back/cat-file.c linus/cat-file.c
--- linus.back/cat-file.c 2005-04-21 11:05:27.971607944 -0400
+++ linus/cat-file.c 2005-04-21 10:04:29.871723656 -0400
@@ -23,7 +23,7 @@
type[size] = '\n';
size++;
} else if (strcmp(type, argv[1])) {
- die("cat-file %s: bad tag", argv[2]);
+ die("cat-file %s: bad tag (%s: %s)", argv[2], type, argv[1]);
}
while (size > 0) {
diff -ur linus.back/fsck-cache.c linus/fsck-cache.c
--- linus.back/fsck-cache.c 2005-04-21 11:05:27.974607488 -0400
+++ linus/fsck-cache.c 2005-04-21 09:14:03.139856840 -0400
@@ -85,7 +85,7 @@
if (map) {
char type[100];
unsigned long size;
- void *buffer = unpack_sha1_file(map, mapsize, type, &size);
+ void *buffer = unpack_sha1_file(sha1, map, mapsize, type, &size);
if (!buffer)
return -1;
if (check_sha1_signature(sha1, buffer, size, type) < 0)
diff -ur linus.back/sha1_file.c linus/sha1_file.c
--- linus.back/sha1_file.c 2005-04-21 11:05:27.978606880 -0400
+++ linus/sha1_file.c 2005-04-21 10:41:51.280977656 -0400
@@ -116,7 +116,8 @@
return map;
}
-void * unpack_sha1_file(void *map, unsigned long mapsize, char *type, unsigned long *size)
+void * unpack_sha1_file(const unsigned char *sha1, void *map,
+ unsigned long mapsize, char *type, unsigned long *size)
{
int ret, bytes;
z_stream stream;
@@ -134,12 +135,12 @@
ret = inflate(&stream, 0);
if (sscanf(buffer, "%10s %lu", type, size) != 2)
return NULL;
-
bytes = strlen(buffer) + 1;
buf = malloc(*size);
- if (!buf)
+ if (!buf) {
+ perror("malloc");
return NULL;
-
+ }
memcpy(buf, buffer + bytes, stream.total_out - bytes);
bytes = stream.total_out - bytes;
if (bytes < *size && ret == Z_OK) {
@@ -149,6 +150,36 @@
/* nothing */;
}
inflateEnd(&stream);
+
+ /* we've found a packed object */
+ if (strcmp(type, "packed") == 0) {
+ char *p = buf;
+ if (!sha1)
+ return NULL;
+ while(p < buf + *size) {
+ unsigned long item_len;
+ unsigned char sha1_hex[50];
+ unsigned char item_sha[20];
+ sscanf(p, "%50s %lu", sha1_hex, &item_len);
+ if (get_sha1_hex(sha1_hex, item_sha))
+ die("packed file corruption");
+ if (memcmp(item_sha, sha1, 20) == 0) {
+ char *temp;
+ char *r;
+ temp = p + strlen(p) + 1;
+ if (sscanf(temp, "%10s %lu", type, size) != 2)
+ return NULL;
+ r = malloc(*size);
+ if (!r)
+ return NULL;
+ memcpy(r, temp + strlen(temp) + 1, *size);
+ free(buf);
+ return r;
+ }
+ p += strlen(p) + 1 + item_len;
+ }
+ return NULL;
+ }
return buf;
}
@@ -159,7 +190,7 @@
map = map_sha1_file(sha1, &mapsize);
if (map) {
- buf = unpack_sha1_file(map, mapsize, type, size);
+ buf = unpack_sha1_file(sha1, map, mapsize, type, size);
munmap(map, mapsize);
return buf;
}
@@ -305,3 +336,111 @@
close(fd);
return 0;
}
+
+int pack_sha1_buffer(void *buf, unsigned long buf_len,
+ unsigned char *returnsha1, char **dest,
+ unsigned long *dest_size)
+{
+ unsigned char sha1[20];
+ SHA_CTX c;
+ char *filename;
+ struct stat st;
+ void *p;
+ int metadata_size;
+
+ /* Sha1.. */
+ SHA1_Init(&c);
+ SHA1_Update(&c, buf, buf_len);
+ SHA1_Final(sha1, &c);
+
+ if (returnsha1)
+ memcpy(returnsha1, sha1, 20);
+
+ filename = sha1_file_name(sha1);
+ if (stat(filename, &st) == 0)
+ return 0;
+
+ p = realloc(*dest, *dest_size + buf_len + 250);
+ if (!p)
+ return -1;
+ *dest = p;
+ p += *dest_size;
+ metadata_size = 1 + sprintf(p, "%s %lu", sha1_to_hex(sha1), buf_len);
+ p += metadata_size;
+ memcpy(p, buf, buf_len);
+ *dest_size += buf_len + metadata_size;
+ return 0;
+}
+
+int write_packed_buffer(void *buf, unsigned long len)
+{
+ unsigned char sha1[20];
+ SHA_CTX c;
+ char *filename;
+ char *p;
+ char *metadata = malloc(200);
+ unsigned char sha1_hex[50];
+ int metadata_size;
+ int fd;
+ int ret = 0;
+
+ metadata_size = 1+sprintf(metadata, "packed %lu", len);
+
+ SHA1_Init(&c);
+ SHA1_Update(&c, metadata, metadata_size);
+ SHA1_Update(&c, buf, len);
+ SHA1_Final(sha1, &c);
+
+ filename = strdup(sha1_file_name(sha1));
+ fd = open(filename, O_WRONLY | O_CREAT | O_EXCL, 0666);
+ if (fd < 0) {
+ if (errno != EEXIST)
+ return -1;
+ /* add collision check! */
+ } else {
+ char *compressed;
+ z_stream stream;
+ unsigned long size;
+ /* Set it up */
+ memset(&stream, 0, sizeof(stream));
+ deflateInit(&stream, Z_BEST_COMPRESSION);
+ size = deflateBound(&stream, len + metadata_size);
+ compressed = malloc(size);
+
+ /* Compress it */
+ stream.next_in = metadata;
+ stream.avail_in = metadata_size;
+ stream.next_out = compressed;
+ stream.avail_out = size;
+ while (deflate(&stream, 0) == Z_OK)
+ /* nothing */;
+ stream.next_in = buf;
+ stream.avail_in = len;
+ while (deflate(&stream, Z_FINISH) == Z_OK)
+ /* nothing */;
+ deflateEnd(&stream);
+ write(fd, compressed, stream.total_out);
+ close(fd);
+ }
+ free(metadata);
+ /* now we have the packed blob on disk, lets link to it */
+ p = buf;
+ while(p < (char *)buf + len) {
+ unsigned long item_len;
+ char *item_file;
+ sscanf(p, "%50s %lu\n", sha1_hex, &item_len);
+ /* + 1 for the null at the end of p */
+ p += strlen(p) + item_len + 1;
+
+ if (get_sha1_hex(sha1_hex, sha1))
+ die("packed file corruption");
+
+ item_file = sha1_file_name(sha1);
+ if (link(filename, item_file) && errno != EEXIST) {
+ ret = -errno;
+ break;
+ }
+ }
+ free(filename);
+ return ret;
+}
diff -ur linus.back/update-cache.c linus/update-cache.c
--- linus.back/update-cache.c 2005-04-21 11:05:27.979606728 -0400
+++ linus/update-cache.c 2005-04-21 10:42:08.109419344 -0400
@@ -14,55 +14,33 @@
*/
static int allow_add = 0, allow_remove = 0;
-static int index_fd(unsigned char *sha1, int fd, struct stat *st)
+static int index_fd(unsigned char *sha1, int fd, struct stat *st, char **packed_buffer, unsigned long *packed_len)
{
- z_stream stream;
unsigned long size = st->st_size;
- int max_out_bytes = size + 200;
- void *out = malloc(max_out_bytes);
void *metadata = malloc(200);
int metadata_size;
void *in;
- SHA_CTX c;
+ char *copy;
+ int ret;
in = "";
if (size)
in = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
close(fd);
- if (!out || (int)(long)in == -1)
+ if (!metadata || (int)(long)in == -1)
return -1;
metadata_size = 1+sprintf(metadata, "blob %lu", size);
-
- SHA1_Init(&c);
- SHA1_Update(&c, metadata, metadata_size);
- SHA1_Update(&c, in, size);
- SHA1_Final(sha1, &c);
-
- memset(&stream, 0, sizeof(stream));
- deflateInit(&stream, Z_BEST_COMPRESSION);
-
- /*
- * ASCII size + nul byte
- */
- stream.next_in = metadata;
- stream.avail_in = metadata_size;
- stream.next_out = out;
- stream.avail_out = max_out_bytes;
- while (deflate(&stream, 0) == Z_OK)
- /* nothing */;
-
- /*
- * File content
- */
- stream.next_in = in;
- stream.avail_in = size;
- while (deflate(&stream, Z_FINISH) == Z_OK)
- /*nothing */;
-
- deflateEnd(&stream);
-
- return write_sha1_buffer(sha1, out, stream.total_out);
+ copy = malloc(metadata_size + size);
+ if (!copy)
+ return -1;
+ memcpy(copy, metadata, metadata_size);
+ memcpy(copy + metadata_size, in, size);
+ ret = pack_sha1_buffer(copy, metadata_size + size,
+ sha1, packed_buffer, packed_len);
+ munmap(in, size);
+ free(copy);
+ return ret;
}
/*
@@ -85,7 +63,7 @@
ce->ce_size = htonl(st->st_size);
}
-static int add_file_to_cache(char *path)
+static int add_file_to_cache(char *path, char **packed_buffer, unsigned long *packed_len)
{
int size, namelen;
struct cache_entry *ce;
@@ -113,9 +91,14 @@
ce->ce_mode = create_ce_mode(st.st_mode);
ce->ce_flags = htons(namelen);
- if (index_fd(ce->sha1, fd, &st) < 0)
+ if (index_fd(ce->sha1, fd, &st, packed_buffer, packed_len) < 0)
return -1;
+ if (*packed_len > 32768) {
+ if (write_packed_buffer(*packed_buffer, *packed_len))
+ return -1;
+ *packed_len = 0;
+ }
return add_cache_entry(ce, allow_add);
}
@@ -286,6 +269,8 @@
{
int i, newfd, entries;
int allow_options = 1;
+ char *packed_buffer = NULL;
+ unsigned long packed_len = 0;
newfd = open(".git/index.lock", O_RDWR | O_CREAT | O_EXCL, 0600);
if (newfd < 0)
@@ -330,9 +315,14 @@
fprintf(stderr, "Ignoring path %s\n", argv[i]);
continue;
}
- if (add_file_to_cache(path))
+ if (add_file_to_cache(path, &packed_buffer, &packed_len))
die("Unable to add %s to database", path);
}
+ if (packed_buffer) {
+ if (packed_len)
+ if (write_packed_buffer(packed_buffer, packed_len))
+ free(packed_buffer);
+ }
if (write_cache(newfd, active_cache, active_nr) ||
rename(".git/index.lock", ".git/index"))
die("Unable to write new cachefile");
* Re: [PATCH] multi item packed files
2005-04-21 15:13 [PATCH] multi item packed files Chris Mason
@ 2005-04-21 15:41 ` Linus Torvalds
2005-04-21 16:23 ` Chris Mason
2005-04-21 19:28 ` Krzysztof Halasa
0 siblings, 2 replies; 17+ messages in thread
From: Linus Torvalds @ 2005-04-21 15:41 UTC (permalink / raw)
To: Chris Mason; +Cc: git
On Thu, 21 Apr 2005, Chris Mason wrote:
>
> There have been a few threads on making git more space efficient, and
> eventually someone mentions tiny files and space fragmentation. Now that git
> object names are decoupled from their compression, it's easier to consider a
> a variety of compression algorithms. I whipped up a really silly "pack files
> together" compression.
Careful.
This is something that needs history to tell whether it's effective. In
particular, if one file changes and another one does not, your packed
archive now ends up being a new blob, so while you "saved space" by having
just one blob for the object, in reality you didn't save any space at all
because with the <x> files changing, you just guaranteed that the packed
blob changes <x> times more often.
See? Your "packing in space" ends up also resulting in "packing in time",
and you didn't actually win anything.
(If you did a good job of packing, you hopefully didn't _lose_ anything
either - you needed 1:<x> number of objects that took 1:<x> the space if
the packing ended up perfect - but since you needed <x> times more of
these objects unless they all change together, you end up with exactly the
same space usage).
So the argument is: you can't lose with the method, and you _can_ win.
Right?
Wrong. You most definitely _can_ lose: you end up having to optimize for
one particular filesystem blocking size, and you'll lose on any other
filesystem. And you'll lose on the special filesystem of "network
traffic", which is byte-granular.
I don't want to pee on people's parades, and I'm all for gathering numbers,
but the thing is, the current git isn't actually all that bad, and I
guarantee that it's hard to make it better without using delta
representation. And the current thing is really really simple.
Linus
* Re: [PATCH] multi item packed files
2005-04-21 15:41 ` Linus Torvalds
@ 2005-04-21 16:23 ` Chris Mason
2005-04-21 19:28 ` Krzysztof Halasa
1 sibling, 0 replies; 17+ messages in thread
From: Chris Mason @ 2005-04-21 16:23 UTC (permalink / raw)
To: Linus Torvalds; +Cc: git
On Thursday 21 April 2005 11:41, Linus Torvalds wrote:
> On Thu, 21 Apr 2005, Chris Mason wrote:
> > There have been a few threads on making git more space efficient, and
> > eventually someone mentions tiny files and space fragmentation. Now that
> > git object names are decoupled from their compression, it's easier to
> > consider a a variety of compression algorithms. I whipped up a really
> > silly "pack files together" compression.
>
> Careful.
>
> This is something that needs history to tell whether it's effective. In
> particular, if one file changes and another one does not, your packed
> archive now ends up being a new blob, so while you "saved space" by having
> just one blob for the object, in reality you didn't save any space at all
> because with the <x> files changing, you just guaranteed that the packed
> blob changes <x> times more often.
The packed blob lives in git but never makes it into a tree. Let's say that I
have a packed blob with files "a, b, c", and another packed blob with files
"x, y, z". Someone changes files, b and z and then runs update-cache b z.
Now we have 2 unchanged packed blobs: "a, b, c", "x, y, z", and one new
packed blob: "b_new, z_new". This means that in order for the packing to
help, we have to change more than one file at a time. That's why it would be
good to have update-cache include the write-tree and commit-tree.
>
> See? Your "packing in space" ends up also resulting in "packing in time",
> and you didn't actually win anything.
>
> (If you did a good job of packing, you hopefully didn't _lose_ anything
> either - you needed 1:<x> number of objects that took 1:<x> the space if
> the packing ended up perfect - but since you needed <x> times more of
> these objects unless they all change together, you end up with exactly the
> same space usage).
>
> So the argument is: you can't lose with the method, and you _can_ win.
> Right?
>
> Wrong. You most definitely _can_ lose: you end up having to optimize for
> one particular filesystem blocking size, and you'll lose on any other
> filesystem. And you'll lose on the special filesystem of "network
> traffic", which is byte-granular.
>
The patch does have one extra directory entry (for the packed blob), but from
a network point of view roughly the same number of bytes should be copied.
The hard links won't play nice with rsync though; soft links might be better.
Packing isn't just about filesystem block sizes; it's about locality. All the
hashing means pretty much every access in git is random. With packing we can
at least try to put a single changeset together on disk. Right now it
doesn't matter much, but when the git tree is 6GB in two years we'll feel the
pain.
> I don't want to pee on peoples parades, and I'm all for gathering numbers,
> but the thing is, the current git isn't actually all that bad, and I
> guarantee that it's hard to make it better without using delta
> representation. And the current thing is really really simple.
>
Grin, if I thought you wanted the patch I might have tried to pretty it up a
little. The point is that all the discussions about ways to make git use
less space end up stuck in "but wait, that'll make a bunch of tiny files and
filesystems aren't good at that". So I believe some kind of packing is a
required building block for any kind of delta storage.
-chris
* Re: [PATCH] multi item packed files
2005-04-21 15:41 ` Linus Torvalds
2005-04-21 16:23 ` Chris Mason
@ 2005-04-21 19:28 ` Krzysztof Halasa
2005-04-21 20:07 ` Linus Torvalds
2005-04-21 20:22 ` Chris Mason
1 sibling, 2 replies; 17+ messages in thread
From: Krzysztof Halasa @ 2005-04-21 19:28 UTC (permalink / raw)
To: Linus Torvalds; +Cc: Chris Mason, git
Linus Torvalds <torvalds@osdl.org> writes:
> Wrong. You most definitely _can_ lose: you end up having to optimize for
> one particular filesystem blocking size, and you'll lose on any other
> filesystem. And you'll lose on the special filesystem of "network
> traffic", which is byte-granular.
If someone needs a better on-disk ratio, (s)he can go with a 1 KB filesystem
or something like that, without all the added complexity of packing.
If we want to optimize that further, I would try doing it at the
underlying filesystem level. For example, loop-mounted one.
--
Krzysztof Halasa
* Re: [PATCH] multi item packed files
2005-04-21 19:28 ` Krzysztof Halasa
@ 2005-04-21 20:07 ` Linus Torvalds
2005-04-22 9:40 ` Krzysztof Halasa
2005-04-21 20:22 ` Chris Mason
1 sibling, 1 reply; 17+ messages in thread
From: Linus Torvalds @ 2005-04-21 20:07 UTC (permalink / raw)
To: Krzysztof Halasa; +Cc: Chris Mason, git
On Thu, 21 Apr 2005, Krzysztof Halasa wrote:
>
> If someone needs a better on-disk ratio, (s)he can go with a 1 KB filesystem
> or something like that, without all the added complexity of packing.
I really think the argument that "you can use filesystem feature XYZ" is
bogus.
I know that I'm not willing to switch filesystems on a whim. I suspect
nobody else is either. I'm not going to create a loopback filesystem just
for git, it's just too much pain.
And dammit, if I'm the original author and likely biggest power-user, and
_I_ can't be bothered to use special filesystems, then who can? Nobody.
This is why I absolutely do not believe in arguments like "if your
filesystem doesn't do tail packing, you shouldn't use it" or "if you
don't have name hashing enabled in your filesystem it's broken".
I'm perfectly willing to optimize for the common case, but that's as far
as it goes. I do not want to make fundamental design decisions that depend
on the target filesystem having some particular feature.
(I'll happily make decisions that say that the target _OS_ has to have a
particular feature, though. I'll require a sane base-level for
functionality, but not something like filesystem details).
Linus
* Re: [PATCH] multi item packed files
2005-04-21 19:28 ` Krzysztof Halasa
2005-04-21 20:07 ` Linus Torvalds
@ 2005-04-21 20:22 ` Chris Mason
2005-04-21 22:47 ` Linus Torvalds
2005-04-22 9:48 ` Krzysztof Halasa
1 sibling, 2 replies; 17+ messages in thread
From: Chris Mason @ 2005-04-21 20:22 UTC (permalink / raw)
To: Krzysztof Halasa; +Cc: Linus Torvalds, git
On Thursday 21 April 2005 15:28, Krzysztof Halasa wrote:
> Linus Torvalds <torvalds@osdl.org> writes:
> > Wrong. You most definitely _can_ lose: you end up having to optimize for
> > one particular filesystem blocking size, and you'll lose on any other
> > filesystem. And you'll lose on the special filesystem of "network
> > traffic", which is byte-granular.
>
> If someone needs a better on-disk ratio, (s)he can go with a 1 KB filesystem
> or something like that, without all the added complexity of packing.
>
> If we want to optimize that further, I would try doing it at the
> underlying filesystem level. For example, loop-mounted one.
Shrug, we shouldn't need help from the kernel for something like this. git as
a database hits worst case scenarios for almost every FS.
We've got:
1) subdirectories with lots of files
2) wasted space for tiny files
3) files that are likely to be accessed together spread across the whole disk
One compromise for SCM use would be one packed file per commit, with an index
that lets us quickly figure out which commit has a particular version of a
given file. My hack gets something close to that (broken into 32k chunks for
no good reason), and the index to find a given file is just the git directory
tree.
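As a purely illustrative sketch of such an index (nothing like it exists in the
patch), each record could be as small as a fixed-size mapping from an object
name to the per-commit packed file, and an offset inside it:

struct pack_index_entry {
        unsigned char object_sha1[20];  /* name of the blob/tree/commit */
        unsigned char pack_sha1[20];    /* per-commit packed file holding it */
        unsigned long offset;           /* byte offset of the item in that pack */
};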
But my code does hide the fact that we're packing things from most of the git
interfaces. So I can almost keep a straight face while claiming to be true
to the original git design...almost. The whole setup is far from perfect,
but it is one option for addressing points 2 & 3 above.
-chris
* Re: [PATCH] multi item packed files
2005-04-21 20:22 ` Chris Mason
@ 2005-04-21 22:47 ` Linus Torvalds
2005-04-22 0:16 ` Chris Mason
2005-04-22 9:48 ` Krzysztof Halasa
1 sibling, 1 reply; 17+ messages in thread
From: Linus Torvalds @ 2005-04-21 22:47 UTC (permalink / raw)
To: Chris Mason; +Cc: Krzysztof Halasa, git
On Thu, 21 Apr 2005, Chris Mason wrote:
>
> Shrug, we shouldn't need help from the kernel for something like this. git as
> a database hits worst case scenarios for almost every FS.
I really disagree.
> We've got:
>
> 1) subdirectories with lots of files
> 2) wasted space for tiny files
> 3) files that are likely to be accessed together spread across the whole disk
On the other hand, git does a lot of things that are just _lovely_ for a
filesystem:
- it never rewrites a file. Rewriting a file is unquestionably _the_
single worst access pattern for any filesystem. In contrast, never
writing to a file again means that filesystems can optimize their
layout and that things like defragmentation actually work.
- it caches beautifully, and efficiently. Part of it comes from never
modifying files after they are written (which means that any network
filesystem automatically breathes a huge sign of relief), but part of
it is that it always reads full files, and the layout is done so that
it really actually _uses_ everything it reads.
It also caches beautifully on a memory subsystem level, largely for the
same reasons.
- it doesn't use tons of directories.
You say that "subdirectories with lots of files" is painful, but that's
not really the whole story. A _deep_ directory structure tends to
actually be worse in many ways, because it's much easier to optimize a
flat directory structure than a deep one. In other words, git ends up
making name hashing etc _productive_.
So yes, it's a bit wasteful. But it's wasteful of what is absolutely the
cheapest resource around: disk space. It's not a huge downside, and in
fact I really do believe that the biggest downside _by_far_ in diskspace
utilization is the _seek_ costs, not the space itself. Let's face it,
anybody who wants three years of kernel archives and thinks that 3GB of
disk is too much, has some serious problems.
The _seek_ issue is real, but git actually has a very nice architecture
even there: not only does it cache really really well (and you can do a
simple "ls-tree $(cat .git/HEAD)" and populate the cache from the results),
but the low level of indirection in a git archive means that it's almost
totally prefetchable with near-perfect access patterns.
In seeking, the real cost is synchronization, and the git model actually
means that there are very few seeks that have to be synchronized. You
could literally do the "ls-tree" thing and make an absolutely trivial
prefetcher that did the prefetching with enough parallelism that the
filesystem could probably get decent IO performance out of a disk.
In other words, we really could have a "git prefetch" command that would
populate the cache of the current head quite efficiently. Because the data
layout supports that.
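(A minimal sketch of such a trivial prefetcher, purely for illustration and not
from this mail: it assumes loose objects live under .git/objects/xx/..., and
reads one 40-character object name per line on stdin, e.g. extracted from
ls-tree output.)

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
        char hex[128], path[256], buf[65536];
        int fd;

        while (fgets(hex, sizeof(hex), stdin)) {
                hex[strcspn(hex, "\n")] = 0;
                if (strlen(hex) != 40)
                        continue;                     /* not an object name */
                snprintf(path, sizeof(path), ".git/objects/%.2s/%s", hex, hex + 2);
                fd = open(path, O_RDONLY);
                if (fd < 0)
                        continue;
                /* read the whole object so it lands in the page cache */
                while (read(fd, buf, sizeof(buf)) > 0)
                        /* nothing */;
                close(fd);
        }
        return 0;
}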
Linus
* Re: [PATCH] multi item packed files
2005-04-21 22:47 ` Linus Torvalds
@ 2005-04-22 0:16 ` Chris Mason
2005-04-22 16:22 ` Linus Torvalds
0 siblings, 1 reply; 17+ messages in thread
From: Chris Mason @ 2005-04-22 0:16 UTC (permalink / raw)
To: Linus Torvalds; +Cc: Krzysztof Halasa, git
On Thursday 21 April 2005 18:47, Linus Torvalds wrote:
> On Thu, 21 Apr 2005, Chris Mason wrote:
> > Shrug, we shouldn't need help from the kernel for something like this.
> > git as a database hits worst case scenarios for almost every FS.
[ ... ]
We somewhat agree on most of this, so I snipped out the parts that aren't worth
nitpicking over. git is really fast right now, and I'm all for throwing
drive space at things to solve problems. I just don't think we have to throw
as much space at it as we are.
> The _seek_ issue is real, but git actually has a very nice architecture
> even there: not only does it cache really really well (and you can do a
> simple "ls-tree $(cat .git/HEAD)" and populate the cache from the results),
> but the low level of indirection in a git archive means that it's almost
> totally prefetchable with near-perfect access patterns.
We can sort by the files before reading them in, but even if we order things
perfectly, we're spreading the io out too much across the drive. It works
right now because the git archive is relatively dense. At a few hundred MB
when we order things properly the drive head isn't moving that much.
At 3-6 GB this hurts more. The data gets farther apart as things age, and
drive performance rots away. I'll never convince you without numbers, which
means I'll have to wait for the full load of old history and try it out ;)
-chris
* Re: [PATCH] multi item packed files
2005-04-21 20:07 ` Linus Torvalds
@ 2005-04-22 9:40 ` Krzysztof Halasa
2005-04-22 18:12 ` Martin Uecker
0 siblings, 1 reply; 17+ messages in thread
From: Krzysztof Halasa @ 2005-04-22 9:40 UTC (permalink / raw)
To: Linus Torvalds; +Cc: Chris Mason, git
Linus Torvalds <torvalds@osdl.org> writes:
> And dammit, if I'm the original author and likely biggest power-user, and
> _I_ can't be bothered to use special filesystems, then who can? Nobody.
If someone is motivated enough, and if the task is quite trivial (as it
seems to be), someone may try it. I can see nothing wrong with it as long
as it doesn't affect other people.
> This is why I absolutely do not believe in arguments like "if your
> filesystem doesn't do tail packing, you shouldn't use it" or "if your
> don't have name hashing enabled in your filesystem it's broken".
Of course. But one may consider using a filesystem with, say, different
settings. Or a special filesystem for this task, such as CNFS used by
news servers (it seems news servers do much the same as git does,
except they also purge old contents, i.e., container files don't keep growing).
> I'm perfectly willing to optimize for the common case, but that's as far
> as it goes. I do not want to make fundamental design decisions that depend
> on the target filesystem having some particular feature.
The optimization would be (in) the underlying filesystem (i.e., the OS
thing, or possibly a shared preloaded library?), not git itself.
--
Krzysztof Halasa
* Re: [PATCH] multi item packed files
2005-04-21 20:22 ` Chris Mason
2005-04-21 22:47 ` Linus Torvalds
@ 2005-04-22 9:48 ` Krzysztof Halasa
1 sibling, 0 replies; 17+ messages in thread
From: Krzysztof Halasa @ 2005-04-22 9:48 UTC (permalink / raw)
To: Chris Mason; +Cc: Linus Torvalds, git
Chris Mason <mason@suse.com> writes:
> Shrug, we shouldn't need help from the kernel for something like this.
> git as
> a database hits worst case scenarios for almost every FS.
Not sure.
> 1) subdirectories with lots of files
Correct. But git doesn't search dirs so it's not that bad.
> 2) wasted space for tiny files
... depends on block size. With 2 KB:
defiant:~$ du -s /pub/mirror/linux-2.6.git
88366 /pub/mirror/linux-2.6.git
defiant:~$ du -s --apparent-size /pub/mirror/linux-2.6.git
63400 /pub/mirror/linux-2.6.git
Not bad, is it?
> 3) files that are likely to be accessed together spread across the whole disk
... across the whole filesystem.
Well, probably it isn't best to have git and .iso archives on the same
filesystem.
--
Krzysztof Halasa
* Re: [PATCH] multi item packed files
2005-04-22 0:16 ` Chris Mason
@ 2005-04-22 16:22 ` Linus Torvalds
2005-04-22 18:58 ` Chris Mason
0 siblings, 1 reply; 17+ messages in thread
From: Linus Torvalds @ 2005-04-22 16:22 UTC (permalink / raw)
To: Chris Mason; +Cc: Krzysztof Halasa, git
On Thu, 21 Apr 2005, Chris Mason wrote:
>
> We can sort by the files before reading them in, but even if we order things
> perfectly, we're spreading the io out too much across the drive.
No we don't.
It's easy to just copy the repository in a way where this just isn't true:
you sort the objects by how far they are from the current HEAD, and you
just copy the repository in that order ("furthest" objects first - commits
last).
That's what I meant by defragmentation - you can actually do this on your
own, even if your filesystem doesn't support it.
Do it twice a year, and I pretty much guarantee that your performance will
stay pretty constant over time. The one exception is fsck, which doesn't
seek in "history order".
And this works exactly because:
- we don't do no steenking delta's, and don't have deep "chains" of data
to follow. The longest chain we ever have is just a few deep, and it's
trivial to just encourage the filesystem to have recent things together.
- we have an append-only mentality.
In fact, it works for exactly the same reason that makes us able to drop
old history if we want to. We essentially "drop" the history to another
part of the disk.
Linus
* Re: [PATCH] multi item packed files
2005-04-22 9:40 ` Krzysztof Halasa
@ 2005-04-22 18:12 ` Martin Uecker
0 siblings, 0 replies; 17+ messages in thread
From: Martin Uecker @ 2005-04-22 18:12 UTC (permalink / raw)
To: git
[-- Attachment #1: Type: text/plain, Size: 760 bytes --]
On Fri, Apr 22, 2005 at 11:40:29AM +0200, Krzysztof Halasa wrote:
> > This is why I absolutely do not believe in arguments like "if your
> > filesystem doesn't do tail packing, you shouldn't use it" or "if you
> > don't have name hashing enabled in your filesystem it's broken".
>
> Of course. But one may consider using a filesystem with, say, different
> settings. Or a special filesystem for this task, such as CNFS used by
> news servers (it seems news servers do quite the same what git does,
> except they also purge old contents, i.e., container files don't grow up).
and NNTP would give a nice transfer method for git objects...
Martin
--
One night, when little Giana from Milano was fast asleep,
she had a strange dream.
[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]
* Re: [PATCH] multi item packed files
2005-04-22 16:22 ` Linus Torvalds
@ 2005-04-22 18:58 ` Chris Mason
2005-04-22 19:43 ` Linus Torvalds
0 siblings, 1 reply; 17+ messages in thread
From: Chris Mason @ 2005-04-22 18:58 UTC (permalink / raw)
To: Linus Torvalds; +Cc: Krzysztof Halasa, git
On Friday 22 April 2005 12:22, Linus Torvalds wrote:
> On Thu, 21 Apr 2005, Chris Mason wrote:
> > We can sort by the files before reading them in, but even if we order
> > things perfectly, we're spreading the io out too much across the drive.
>
> No we don't.
>
> It's easy to just copy the repository in a way where this just isn't true:
> you sort the objects by how far they are from the current HEAD, and you
> just copy the repository in that order ("furthest" objects first - commits
> last).
>
> That's what I meant by defragmentation - you can actually do this on your
> own, even if your filesystem doesn't support it.
This certainly can help. Based on some ideas from Andrea I made a poor man's
defrag script last year that was similar. It worked by copying files into a
flat dir in the order you expected to read them in, deleting the original,
then hard linking them into their original name.
Copying in order straight into a new git tree doesn't help much when the
filesystem is using the subdirectory as a hint to block allocation. So
you'll probably have to copy them all into a flat directory and then hard
link back into the git tree (the flat dir can then be deleted of course).
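(For illustration, a rough sketch of that copy-then-hardlink trick, with a
hypothetical ".defrag" staging directory and only minimal error handling: it
reads file names on stdin in the desired read order, copies each into the flat
directory on the same filesystem so the new blocks land close together, removes
the original, and hard links the fresh copy back under the old name.)

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

static int copy_file(const char *src, const char *dst)
{
        char buf[65536];
        ssize_t n;
        int in = open(src, O_RDONLY);
        int out = in < 0 ? -1 : open(dst, O_WRONLY | O_CREAT | O_EXCL, 0444);

        if (in < 0 || out < 0) {
                if (in >= 0)
                        close(in);
                return -1;
        }
        while ((n = read(in, buf, sizeof(buf))) > 0) {
                if (write(out, buf, n) != n) {
                        n = -1;                 /* short or failed write */
                        break;
                }
        }
        close(in);
        close(out);
        return n < 0 ? -1 : 0;
}

int main(void)
{
        char path[4096], staged[4200];
        unsigned long seq = 0;

        while (fgets(path, sizeof(path), stdin)) {
                path[strcspn(path, "\n")] = 0;
                snprintf(staged, sizeof(staged), ".defrag/%lu", seq++);
                if (copy_file(path, staged))
                        continue;
                unlink(path);           /* drop the fragmented copy */
                link(staged, path);     /* old name now points at the new blocks */
        }
        return 0;
}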
The problem I see for git is that once you have enough data, it should degrade
over and over again somewhat quickly. My own guess is that you'll need to
run the script at least monthly. If we're designing the thing now and say
'wow, that's going to be really slow without help', it doesn't hurt to look
at alternatives.
I grabbed Ingo's tarball of 28,000 patches since 2.4.0 and applied them all
into git on ext3 (htree). It only took ~2.5 hrs to apply. I did use my
write-tree patch where you had to give write-tree a list of directories to
search, but I don't think this helped much since the operation was mostly
disk write bound.
Anyway, I ended up with a 2.6GB .git directory. Then I:
rm .git/index
umount ; mount again
time read-tree `tree-id` (24.45s)
time checkout-cache --prefix=../checkout/ -a -f (4m30s)
--prefix is neat ;)
The tree that ended up in checkout was 239456k, giving us an effective io rate
for checkout-cache of 885k/s. (this drive gets 24MB/s sequential reads).
I'll have numbers for the packed files later on today. No, I don't really
expect the numbers will convince you to implement some kind of packing ;)
But it's still a good data point to have, and generating them here is just
poking the box every 2 hours or so.
-chris
* Re: [PATCH] multi item packed files
2005-04-22 18:58 ` Chris Mason
@ 2005-04-22 19:43 ` Linus Torvalds
2005-04-22 20:32 ` Chris Mason
0 siblings, 1 reply; 17+ messages in thread
From: Linus Torvalds @ 2005-04-22 19:43 UTC (permalink / raw)
To: Chris Mason; +Cc: Krzysztof Halasa, git
On Fri, 22 Apr 2005, Chris Mason wrote:
>
> The problem I see for git is that once you have enough data, it should degrade
> over and over again somewhat quickly.
I really doubt that.
There's a more or less constant amount of new data added all the time: the
number of changes does _not_ grow with history. The number of changes
grows with the amount of changes going on in the tree, and while that
isn't exactly constant, it definitely is not something that grows very
fast.
Btw, this is how git is able to be so fast in the first place. Git is fast
because it knows that the "size of the change" is a lot smaller than the
"size of the repository", so it fundamentally at all points tries to make
sure that it only ever bothers with stuff that has changed.
Stuff that hasn't changed, it ignores very _very_ efficiently.
That's really the whole point of the index file: it's a way to quickly
ignore the stuff that hasn't changed - both for simple operations like
"show-diff", but also for complex operations like "merge these three
trees".
And it works exactly because the number of changes does _not_ grow at all
linearly with the history of the project. In fact, in most projects, the
rate of change goes _down_ when the project grows, because the project
matures and generally gets more complicated and thus harder to change.
(The kernel _really_ is pretty special. I am willing to bet that there are
not a lot of big projects that have been able to continue to take changes
at the kind of pace that the kernel does. But we've had to work at it a
lot, including obviously using SCM tools that are very much geared towards
scaling. Why do you think the kernel puts more pressure on SCM's than
other projects? It's exactly because we're trying to scale our change
acceptance to bigger numbers).
So when you say "once you have enough data, it will degrade quickly"
ignores the fact that the rate of change isn't (the "second derivative of
the size of the project in time") really isn't that high.
> I grabbed Ingo's tarball of 28,000 patches since 2.4.0 and applied them all
> into git on ext3 (htree). It only took ~2.5 hrs to apply.
Ok, I'd actually wish it took even less, but that's still a pretty
impressive average of three patches a second.
> Anyway, I ended up with a 2.6GB .git directory. Then I:
>
> rm .git/index
> umount ; mount again
> time read-tree `tree-id` (24.45s)
> time checkout-cache --prefix=../checkout/ -a -f (4m30s)
>
> --prefix is neat ;)
That sounds pretty acceptable. Four minutes is a long time, but I assume
that the whole point of the exercise was to try to test worst-case
behaviour. We can certainly make sure that real usage gets lower numbers
than that (in particular, my "real usage" ends up being 100% in the disk
cache ;)
Linus
* Re: [PATCH] multi item packed files
2005-04-22 19:43 ` Linus Torvalds
@ 2005-04-22 20:32 ` Chris Mason
2005-04-22 23:55 ` Chris Mason
0 siblings, 1 reply; 17+ messages in thread
From: Chris Mason @ 2005-04-22 20:32 UTC (permalink / raw)
To: Linus Torvalds; +Cc: Krzysztof Halasa, git
On Friday 22 April 2005 15:43, Linus Torvalds wrote:
> On Fri, 22 Apr 2005, Chris Mason wrote:
> > The problem I see for git is that once you have enough data, it should
> > degrade over and over again somewhat quickly.
>
> I really doubt that.
>
> There's a more or less constant amount of new data added all the time: the
> number of changes does _not_ grow with history. The number of changes
> grows with the amount of changes going on in the tree, and while that
> isn't exactly constant, it definitely is not something that grows very
> fast.
From a filesystem point of view, it's not the number of changes that matters,
it's the distance between them. The amount of new data is constant, but the
speed of accessing the new data is affected by the bulk of old data on disk.
Even with defragging you hopefully end up with a big chunk of the disk where
everything is in order. Then you add a new file and it goes either somewhere
behind that big chunk or in front of it. The next new file might go
somewhere behind or in front etc etc. Having a big chunk just means the new
files are likely to be farther apart making reads of the new data very seeky.
>
> Btw, this is how git is able to be so fast in the first place. Git is fast
> because it knows that the "size of the change" is a lot smaller than the
> "size of the repository", so it fundamentally at all points tries to make
> sure that it only ever bothers with stuff that has changed.
>
> Stuff that hasn't changed, it ignores very _very_ efficiently.
>
git as a write engine is very fast, and we definitely write more than we read.
> > I grabbed Ingo's tarball of 28,000 patches since 2.4.0 and applied them
> > all into git on ext3 (htree). It only took ~2.5 hrs to apply.
>
> Ok, I'd actually wish it took even less, but that's still a pretty
> impressive average of three patches a second.
Yeah, and this was a relatively old machine with slowish drives. One run to
apply into my packed tree is finished and only took 2 hours. But, I had
'tuned' it to make bigger packed files, and the end result is 2MB compressed
objects. Great for compression rate, but my dumb format doesn't hold up
well for reading it back.
If I pack every 64k (uncompressed), the checkout-tree time goes down to 3m14s.
That's a very big difference considering how stupid my code is. .git was only
20% smaller with 64k chunks. I should be able to do better...I'll do one
more run.
>
> > Anyway, I ended up with a 2.6GB .git directory. Then I:
> >
> > rm .git/index
> > umount ; mount again
> > time read-tree `tree-id` (24.45s)
> > time checkout-cache --prefix=../checkout/ -a -f (4m30s)
> >
> > --prefix is neat ;)
>
> That sounds pretty acceptable. Four minutes is a long time, but I assume
> that the whole point of the exercise was to try to test worst-case
> behaviour. We can certainly make sure that real usage gets lower numbers
> than that (in particular, my "real usage" ends up being 100% in the disk
> cache ;)
I had a tree with 28,000 patches. If we pretend that one bk changeset will
equal one git changeset, we'd have 64,000 patches (57k without empty
mergesets), and it probably wouldn't fit into ram anymore ;) Our bk cset
rate was about 24k/year, so we'll have to trim very aggressively to have
reasonable performance.
For a working tree that's fine, but we need some fast central place to pull
the working .git trees from, and we're really going to feel the random io
there.
-chris
* Re: [PATCH] multi item packed files
2005-04-22 20:32 ` Chris Mason
@ 2005-04-22 23:55 ` Chris Mason
2005-04-25 22:20 ` Chris Mason
0 siblings, 1 reply; 17+ messages in thread
From: Chris Mason @ 2005-04-22 23:55 UTC (permalink / raw)
To: Linus Torvalds; +Cc: Krzysztof Halasa, git
On Friday 22 April 2005 16:32, Chris Mason wrote:
> If I pack every 64k (uncompressed), the checkout-tree time goes down to
> 3m14s. That's a very big difference considering how stupid my code is. .git
> was only 20% smaller with 64k chunks. I should be able to do better...I'll
> do one more run.
>
This run also packed tree files together (everything produced by write-tree
went into a packed file), but not the commits. I estimate I could save about
another 168m by packing the tree files and commits into the same file with
the blobs, but this wouldn't make any of the times below faster.
                  git - original (28k commits)    packed
FS size           2,675,408k                      1,723,820k
read-tree         24.45s                          18.9s
checkout-cache    4m30s                           3m5s
patch time        2h30m                           1h55m
The format for the packed files could be smarter, such that it didn't require
decompressing the whole packed file to read one item. I would guess I could
get another 20% checkout-cache performance out of it via more tuning, and
probably another 10% of space savings.
Of course, none of this is likely to convince you ;) If you decide later on
it's worthwhile, I don't think it would be difficult to add then.
-chris
* Re: [PATCH] multi item packed files
2005-04-22 23:55 ` Chris Mason
@ 2005-04-25 22:20 ` Chris Mason
0 siblings, 0 replies; 17+ messages in thread
From: Chris Mason @ 2005-04-25 22:20 UTC (permalink / raw)
To: Linus Torvalds; +Cc: Krzysztof Halasa, git
[-- Attachment #1: Type: text/plain, Size: 2897 bytes --]
On Friday 22 April 2005 19:55, Chris Mason wrote:
> On Friday 22 April 2005 16:32, Chris Mason wrote:
> > If I pack every 64k (uncompressed), the checkout-tree time goes down to
> > 3m14s. That's a very big difference considering how stupid my code is.
> > .git was only 20% smaller with 64k chunks. I should be able to do
> > better...I'll do one more run.
>
> This run also packed tree files together (everything produced by write-tree
> went into a packed file), but not the commits. I estimate I could save
> about another 168m by packing the tree files and commits into the same file
> with the blobs, but this wouldn't make any of the times below faster.
>
>                   git - original (28k commits)    packed
> FS size           2,675,408k                      1,723,820k
> read-tree         24.45s                          18.9s
> checkout-cache    4m30s                           3m5s
> patch time        2h30m                           1h55m
>
It was a rainy weekend, so I took a break from lawn care and hacked in some
simple changes to the packed file format. There's now a header listing the
sha1 for each subfile and the offset where to find it in the main file. Each
subfile is compressed individually so you don't have to decompress the whole
packed file to find one. Commits were added into the packed files as well.
Some results were about what I expected:
FS size -- 1,614,376k
read-tree -- 18s
checkout-cache -- 2m35s (cold cache)
checkout-cache -- 18s (hot cache)
patch time -- 96m
Vanilla git needs 56s to checkout with a hot cache. The hot cache numbers
weren't done before because I hadn't expected my patch to help at all. Even
though we both do things entirely from cache, vanilla git is much slower at
writing the checked out files back to the drive. I've made no optimizations
to that code, and the drive is only 30% full, so this seems to just be a bad
interaction with filesystem layout.
I also expected vanilla git to perform pretty well when there were no commits
in the tree. My test was to put a copy of 2.6.11 under git.
                               vanilla    packed
update-cache (for all files)   2m1s       48s
checkout-cache (cold)          1m23s      28s
checkout-cache (hot)           12s        15s
The difference in hot cache checkout time is userland cpu time. It could be
avoided with smarter caching of the packed file header. Right now I'm
decompressing it over and over again for each checkout. Still, the
performance hit is pretty small because I try to limit the number of subfiles
that get packed together.
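(One illustrative way to do that caching, not in the attached patch: since all
the items in a pack are hard links to the same file, the (device, inode) pair
from fstat identifies the pack, so a tiny single-entry cache of the last
inflated header would already cover a checkout that walks one pack at a time.)

#include <sys/stat.h>
#include <string.h>

/* Hypothetical single-entry cache for the inflated pack header;
 * illustration only. */
static struct {
        dev_t dev;
        ino_t ino;
        char *header;                 /* inflated header bytes */
        unsigned long len;
} last_header;

static char *cached_pack_header(const struct stat *st, unsigned long *len)
{
        if (last_header.header &&
            last_header.dev == st->st_dev && last_header.ino == st->st_ino) {
                *len = last_header.len;
                return last_header.header;
        }
        return NULL;    /* caller inflates the header and refills the cache */
}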
My current patch is attached for reference; it's against a git from late last
week. I wouldn't suggest using this for anything other than benchmarking,
and since I don't think I can get much better numbers easily, I'll stop
playing around with this for a while.
-chris
[-- Attachment #2: comp-tree-4.diff --]
[-- Type: text/x-diff, Size: 26388 bytes --]
diff -ur linus.back/cache.h linus/cache.h
--- linus.back/cache.h 2005-04-25 17:30:21.616654304 -0400
+++ linus/cache.h 2005-04-25 10:56:15.000000000 -0400
@@ -64,6 +64,16 @@
char name[0];
};
+struct packed_item {
+ /* length of compressed data */
+ unsigned long len;
+ struct packed_item *next;
+ /* sha1 of uncompressed data */
+ char sha1[20];
+ /* compressed data */
+ char *data;
+};
+
#define CE_NAMEMASK (0x0fff)
#define CE_STAGEMASK (0x3000)
#define CE_STAGESHIFT 12
@@ -117,7 +127,7 @@
/* Read and unpack a sha1 file into memory, write memory to a sha1 file */
extern void * map_sha1_file(const unsigned char *sha1, unsigned long *size);
-extern void * unpack_sha1_file(void *map, unsigned long mapsize, char *type, unsigned long *size);
+extern void * unpack_sha1_file(const unsigned char *sha1, void *map, unsigned long mapsize, char *type, unsigned long *size);
extern void * read_sha1_file(const unsigned char *sha1, char *type, unsigned long *size);
extern int write_sha1_file(char *buf, unsigned len, unsigned char *return_sha1);
extern int check_sha1_signature(unsigned char *sha1, void *buf, unsigned long size, const char *type);
@@ -125,6 +135,9 @@
/* Convert to/from hex/sha1 representation */
extern int get_sha1_hex(const char *hex, unsigned char *sha1);
extern char *sha1_to_hex(const unsigned char *sha1); /* static buffer result! */
+extern int pack_sha1_buffer(void *buf, unsigned long buf_len,
+ unsigned char *returnsha1, struct packed_item **);
+int write_packed_buffer(struct packed_item *head);
/* General helper functions */
extern void usage(const char *err);
@@ -137,4 +150,9 @@
unsigned long *size,
unsigned char *tree_sha1_ret);
+extern int write_tree(struct cache_entry **cachep, int maxentries, const char *base, int baselen, unsigned char *returnsha1, struct packed_item **head);
+
+#define MAXPARENT 16
+extern int commit_tree(char *tree_sha1_hex, unsigned char parent_sha1[MAXPARENT][20], int num_parents, struct packed_item **head);
+extern void check_valid_sha1_file(unsigned char *sha1, const char *expect);
#endif /* CACHE_H */
diff -ur linus.back/commit-tree.c linus/commit-tree.c
--- linus.back/commit-tree.c 2005-04-25 17:30:21.626652784 -0400
+++ linus/commit-tree.c 2005-04-25 10:58:15.000000000 -0400
@@ -4,360 +4,32 @@
* Copyright (C) Linus Torvalds, 2005
*/
#include "cache.h"
-
-#include <pwd.h>
-#include <time.h>
-#include <string.h>
-#include <ctype.h>
-#include <time.h>
-
-#define BLOCKING (1ul << 14)
-#define ORIG_OFFSET (40)
-
-/*
- * Leave space at the beginning to insert the tag
- * once we know how big things are.
- *
- * FIXME! Share the code with "write-tree.c"
- */
-static void init_buffer(char **bufp, unsigned int *sizep)
-{
- char *buf = malloc(BLOCKING);
- memset(buf, 0, ORIG_OFFSET);
- *sizep = ORIG_OFFSET;
- *bufp = buf;
-}
-
-static void add_buffer(char **bufp, unsigned int *sizep, const char *fmt, ...)
-{
- char one_line[2048];
- va_list args;
- int len;
- unsigned long alloc, size, newsize;
- char *buf;
-
- va_start(args, fmt);
- len = vsnprintf(one_line, sizeof(one_line), fmt, args);
- va_end(args);
- size = *sizep;
- newsize = size + len;
- alloc = (size + 32767) & ~32767;
- buf = *bufp;
- if (newsize > alloc) {
- alloc = (newsize + 32767) & ~32767;
- buf = realloc(buf, alloc);
- *bufp = buf;
- }
- *sizep = newsize;
- memcpy(buf + size, one_line, len);
-}
-
-static int prepend_integer(char *buffer, unsigned val, int i)
-{
- buffer[--i] = '\0';
- do {
- buffer[--i] = '0' + (val % 10);
- val /= 10;
- } while (val);
- return i;
-}
-
-static void finish_buffer(char *tag, char **bufp, unsigned int *sizep)
-{
- int taglen;
- int offset;
- char *buf = *bufp;
- unsigned int size = *sizep;
-
- offset = prepend_integer(buf, size - ORIG_OFFSET, ORIG_OFFSET);
- taglen = strlen(tag);
- offset -= taglen;
- buf += offset;
- size -= offset;
- memcpy(buf, tag, taglen);
-
- *bufp = buf;
- *sizep = size;
-}
-
-static void remove_special(char *p)
-{
- char c;
- char *dst = p, *src = p;
-
- for (;;) {
- c = *src;
- src++;
- switch(c) {
- case '\n': case '<': case '>':
- continue;
- }
- *dst++ = c;
- if (!c)
- break;
- }
-
- /*
- * Go back, and remove crud from the end: some people
- * have commas etc in their gecos field
- */
- dst--;
- while (--dst >= p) {
- unsigned char c = *dst;
- switch (c) {
- case ',': case ';': case '.':
- *dst = 0;
- continue;
- }
- break;
- }
-}
-
-static const char *month_names[] = {
- "Jan", "Feb", "Mar", "Apr", "May", "Jun",
- "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
-};
-
-static const char *weekday_names[] = {
- "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"
-};
-
-
-static char *skipfws(char *str)
-{
- while (isspace(*str))
- str++;
- return str;
-}
-
-
-/* Gr. strptime is crap for this; it doesn't have a way to require RFC2822
- (i.e. English) day/month names, and it doesn't work correctly with %z. */
-static void parse_rfc2822_date(char *date, char *result, int maxlen)
-{
- struct tm tm;
- char *p;
- int i, offset;
- time_t then;
-
- memset(&tm, 0, sizeof(tm));
-
- /* Skip day-name */
- p = skipfws(date);
- if (!isdigit(*p)) {
- for (i=0; i<7; i++) {
- if (!strncmp(p,weekday_names[i],3) && p[3] == ',') {
- p = skipfws(p+4);
- goto day;
- }
- }
- return;
- }
-
- /* day */
- day:
- tm.tm_mday = strtoul(p, &p, 10);
-
- if (tm.tm_mday < 1 || tm.tm_mday > 31)
- return;
-
- if (!isspace(*p))
- return;
-
- p = skipfws(p);
-
- /* month */
-
- for (i=0; i<12; i++) {
- if (!strncmp(p, month_names[i], 3) && isspace(p[3])) {
- tm.tm_mon = i;
- p = skipfws(p+strlen(month_names[i]));
- goto year;
- }
- }
- return; /* Error -- bad month */
-
- /* year */
- year:
- tm.tm_year = strtoul(p, &p, 10);
-
- if (!tm.tm_year && !isspace(*p))
- return;
-
- if (tm.tm_year > 1900)
- tm.tm_year -= 1900;
-
- p=skipfws(p);
-
- /* hour */
- if (!isdigit(*p))
- return;
- tm.tm_hour = strtoul(p, &p, 10);
-
- if (!tm.tm_hour > 23)
- return;
-
- if (*p != ':')
- return; /* Error -- bad time */
- p++;
-
- /* minute */
- if (!isdigit(*p))
- return;
- tm.tm_min = strtoul(p, &p, 10);
-
- if (!tm.tm_min > 59)
- return;
-
- if (isspace(*p))
- goto zone;
-
- if (*p != ':')
- return; /* Error -- bad time */
- p++;
-
- /* second */
- if (!isdigit(*p))
- return;
- tm.tm_sec = strtoul(p, &p, 10);
-
- if (!tm.tm_sec > 59)
- return;
-
- if (!isspace(*p))
- return;
-
- zone:
- p = skipfws(p);
-
- if (*p == '-')
- offset = -60;
- else if (*p == '+')
- offset = 60;
- else
- return;
-
- if (!isdigit(p[1]) || !isdigit(p[2]) || !isdigit(p[3]) || !isdigit(p[4]))
- return;
-
- i = strtoul(p+1, NULL, 10);
- offset *= ((i % 100) + ((i / 100) * 60));
-
- if (*(skipfws(p + 5)))
- return;
-
- then = mktime(&tm); /* mktime appears to ignore the GMT offset, stupidly */
- if (then == -1)
- return;
-
- then -= offset;
-
- snprintf(result, maxlen, "%lu %5.5s", then, p);
-}
-
-static void check_valid(unsigned char *sha1, const char *expect)
-{
- void *buf;
- char type[20];
- unsigned long size;
-
- buf = read_sha1_file(sha1, type, &size);
- if (!buf || strcmp(type, expect))
- die("%s is not a valid '%s' object", sha1_to_hex(sha1), expect);
- free(buf);
-}
-
/*
* Having more than two parents is not strange at all, and this is
* how multi-way merges are represented.
*/
-#define MAXPARENT (16)
static char *commit_tree_usage = "commit-tree <sha1> [-p <sha1>]* < changelog";
int main(int argc, char **argv)
{
- int i, len;
+ int i;
int parents = 0;
unsigned char tree_sha1[20];
unsigned char parent_sha1[MAXPARENT][20];
- unsigned char commit_sha1[20];
- char *gecos, *realgecos, *commitgecos;
- char *email, *commitemail, realemail[1000];
- char date[20], realdate[20];
- char *audate;
- char comment[1000];
- struct passwd *pw;
- time_t now;
- struct tm *tm;
- char *buffer;
- unsigned int size;
if (argc < 2 || get_sha1_hex(argv[1], tree_sha1) < 0)
usage(commit_tree_usage);
- check_valid(tree_sha1, "tree");
+ check_valid_sha1_file(tree_sha1, "tree");
for (i = 2; i < argc; i += 2) {
char *a, *b;
a = argv[i]; b = argv[i+1];
if (!b || strcmp(a, "-p") || get_sha1_hex(b, parent_sha1[parents]))
usage(commit_tree_usage);
- check_valid(parent_sha1[parents], "commit");
+ check_valid_sha1_file(parent_sha1[parents], "commit");
parents++;
}
- if (!parents)
- fprintf(stderr, "Committing initial tree %s\n", argv[1]);
- pw = getpwuid(getuid());
- if (!pw)
- die("You don't exist. Go away!");
- realgecos = pw->pw_gecos;
- len = strlen(pw->pw_name);
- memcpy(realemail, pw->pw_name, len);
- realemail[len] = '@';
- gethostname(realemail+len+1, sizeof(realemail)-len-1);
- if (!strchr(realemail+len+1, '.')) {
- strcat(realemail, ".");
- getdomainname(realemail+strlen(realemail), sizeof(realemail)-strlen(realemail)-1);
- }
- time(&now);
- tm = localtime(&now);
-
- strftime(realdate, sizeof(realdate), "%s %z", tm);
- strcpy(date, realdate);
-
- commitgecos = getenv("COMMIT_AUTHOR_NAME") ? : realgecos;
- commitemail = getenv("COMMIT_AUTHOR_EMAIL") ? : realemail;
- gecos = getenv("AUTHOR_NAME") ? : realgecos;
- email = getenv("AUTHOR_EMAIL") ? : realemail;
- audate = getenv("AUTHOR_DATE");
- if (audate)
- parse_rfc2822_date(audate, date, sizeof(date));
-
- remove_special(gecos); remove_special(realgecos); remove_special(commitgecos);
- remove_special(email); remove_special(realemail); remove_special(commitemail);
-
- init_buffer(&buffer, &size);
- add_buffer(&buffer, &size, "tree %s\n", sha1_to_hex(tree_sha1));
-
- /*
- * NOTE! This ordering means that the same exact tree merged with a
- * different order of parents will be a _different_ changeset even
- * if everything else stays the same.
- */
- for (i = 0; i < parents; i++)
- add_buffer(&buffer, &size, "parent %s\n", sha1_to_hex(parent_sha1[i]));
-
- /* Person/date information */
- add_buffer(&buffer, &size, "author %s <%s> %s\n", gecos, email, date);
- add_buffer(&buffer, &size, "committer %s <%s> %s\n\n", commitgecos, commitemail, realdate);
-
- /* And add the comment */
- while (fgets(comment, sizeof(comment), stdin) != NULL)
- add_buffer(&buffer, &size, "%s", comment);
-
- finish_buffer("commit ", &buffer, &size);
-
- write_sha1_file(buffer, size, commit_sha1);
- printf("%s\n", sha1_to_hex(commit_sha1));
+ commit_tree(tree_sha1, parent_sha1, parents, NULL);
return 0;
}
diff -ur linus.back/fsck-cache.c linus/fsck-cache.c
--- linus.back/fsck-cache.c 2005-04-25 17:30:21.630652176 -0400
+++ linus/fsck-cache.c 2005-04-22 10:25:07.000000000 -0400
@@ -85,7 +85,7 @@
if (map) {
char type[100];
unsigned long size;
- void *buffer = unpack_sha1_file(map, mapsize, type, &size);
+ void *buffer = unpack_sha1_file(sha1, map, mapsize, type, &size);
if (!buffer)
return -1;
if (check_sha1_signature(sha1, buffer, size, type) < 0)
diff -ur linus.back/Makefile linus/Makefile
--- linus.back/Makefile 2005-04-25 17:30:21.631652024 -0400
+++ linus/Makefile 2005-04-25 10:03:53.000000000 -0400
@@ -23,7 +23,7 @@
install: $(PROG)
install $(PROG) $(HOME)/bin/
-LIB_OBJS=read-cache.o sha1_file.o usage.o object.o commit.o tree.o blob.o
+LIB_OBJS=read-cache.o sha1_file.o usage.o object.o commit.o tree.o blob.o lib-tree.o
LIB_FILE=libgit.a
LIB_H=cache.h object.h
@@ -71,6 +71,7 @@
show-diff.o: $(LIB_H)
show-files.o: $(LIB_H)
tree.o: $(LIB_H)
+lib-tree.o: $(LIB_H)
update-cache.o: $(LIB_H)
usage.o: $(LIB_H)
unpack-file.o: $(LIB_H)
diff -ur linus.back/sha1_file.c linus/sha1_file.c
--- linus.back/sha1_file.c 2005-04-25 17:30:21.633651720 -0400
+++ linus/sha1_file.c 2005-04-25 17:15:53.050696400 -0400
@@ -116,12 +116,14 @@
return map;
}
-void * unpack_sha1_file(void *map, unsigned long mapsize, char *type, unsigned long *size)
+void * unpack_sha1_file(const unsigned char *sha1, void *map,
+ unsigned long mapsize, char *type, unsigned long *size)
{
int ret, bytes;
z_stream stream;
char buffer[8192];
char *buf;
+ unsigned long offset;
/* Get the data stream */
memset(&stream, 0, sizeof(stream));
@@ -134,12 +136,12 @@
ret = inflate(&stream, 0);
if (sscanf(buffer, "%10s %lu", type, size) != 2)
return NULL;
-
bytes = strlen(buffer) + 1;
buf = malloc(*size);
- if (!buf)
+ if (!buf) {
+ perror("malloc");
return NULL;
-
+ }
memcpy(buf, buffer + bytes, stream.total_out - bytes);
bytes = stream.total_out - bytes;
if (bytes < *size && ret == Z_OK) {
@@ -149,6 +151,56 @@
/* nothing */;
}
inflateEnd(&stream);
+
+ /* we've found a packed object */
+ if (strcmp(type, "packed") == 0) {
+ char *p = buf;
+ unsigned long header_len = *size;
+ offset = stream.total_in;
+ if (!sha1)
+ return NULL;
+ while(p < buf + header_len) {
+ unsigned long item_len;
+ unsigned char sha1_hex[50];
+ unsigned char item_sha[20];
+ memcpy(item_sha, p, 20);
+ sscanf(p + 20, "%lu ", &item_len);
+ p += 20 + strlen(p + 20) + 1;
+ if (memcmp(item_sha, sha1, 20) == 0) {
+ /* Get the data stream */
+ free(buf);
+ memset(&stream, 0, sizeof(stream));
+ stream.next_in = map + offset;
+ stream.avail_in = mapsize - offset;
+ stream.next_out = buffer;
+ stream.avail_out = sizeof(buffer);
+
+ inflateInit(&stream);
+ ret = inflate(&stream, 0);
+ if (sscanf(buffer, "%10s %lu", type, size) != 2)
+ return NULL;
+ bytes = strlen(buffer) + 1;
+ buf = malloc(*size);
+ if (!buf) {
+ perror("malloc");
+ return NULL;
+ }
+ memcpy(buf, buffer + bytes,
+ stream.total_out - bytes);
+ bytes = stream.total_out - bytes;
+ if (bytes < *size && ret == Z_OK) {
+ stream.next_out = buf + bytes;
+ stream.avail_out = *size - bytes;
+ while (inflate(&stream, Z_FINISH) == Z_OK)
+ /* nothing */;
+ }
+ inflateEnd(&stream);
+ return buf;
+ }
+ offset += item_len;
+ }
+ return NULL;
+ }
return buf;
}
@@ -159,7 +211,7 @@
map = map_sha1_file(sha1, &mapsize);
if (map) {
- buf = unpack_sha1_file(map, mapsize, type, size);
+ buf = unpack_sha1_file(sha1, map, mapsize, type, size);
munmap(map, mapsize);
return buf;
}
@@ -305,3 +357,166 @@
close(fd);
return 0;
}
+
+int pack_sha1_buffer(void *buf, unsigned long buf_len,
+ unsigned char *returnsha1,
+ struct packed_item **packed_item)
+{
+ unsigned char sha1[20];
+ SHA_CTX c;
+ char *filename;
+ struct stat st;
+ char *compressed;
+ z_stream stream;
+ unsigned long size;
+ struct packed_item *item;
+
+ *packed_item = NULL;
+
+ /* Sha1.. */
+ SHA1_Init(&c);
+ SHA1_Update(&c, buf, buf_len);
+ SHA1_Final(sha1, &c);
+
+ if (returnsha1)
+ memcpy(returnsha1, sha1, 20);
+
+ filename = sha1_file_name(sha1);
+ if (stat(filename, &st) == 0)
+ return 0;
+
+ /* Set it up */
+ memset(&stream, 0, sizeof(stream));
+ deflateInit(&stream, Z_BEST_COMPRESSION);
+ size = deflateBound(&stream, buf_len);
+ compressed = malloc(size);
+
+ /*
+ * ASCII size + nul byte
+ */
+ stream.next_in = buf;
+ stream.avail_in = buf_len;
+ stream.next_out = compressed;
+ stream.avail_out = size;
+ /* Compress it */
+ while (deflate(&stream, Z_FINISH) == Z_OK)
+ /* nothing */;
+ deflateEnd(&stream);
+ size = stream.total_out;
+
+ item = malloc(sizeof(struct packed_item));
+ if (!item) {
+ free(compressed);
+ return -1;
+ }
+ memcpy(item->sha1, sha1, 20);
+ item->len = size;
+ item->next = NULL;
+ item->data = compressed;
+ *packed_item = item;
+ return 0;
+}
+
+static char *create_packed_header(struct packed_item *head, unsigned long *size)
+{
+ char *metadata = NULL;
+ int metadata_size = 0;
+ *size = 0;
+
+ while(head) {
+ char *p;
+ metadata = realloc(metadata, metadata_size + 220);
+ if (!metadata)
+ return NULL;
+ p = metadata+metadata_size;
+ memcpy(p, head->sha1, 20);
+ p += 20;
+ metadata_size += 1 + sprintf(p, "%lu ", head->len) + 20;
+ head = head->next;
+ }
+ *size = metadata_size;
+ return metadata;
+}
+
+int write_packed_buffer(struct packed_item *head)
+{
+ unsigned char sha1[20];
+ SHA_CTX c;
+ char *filename;
+ char *metadata = malloc(200);
+ char *header;
+ int metadata_size;
+ int fd;
+ int ret = 0;
+ unsigned long header_len;
+ struct packed_item *item;
+ char *compressed;
+ z_stream stream;
+ unsigned long size;
+ int nr = 0;
+
+ header = create_packed_header(head, &header_len);
+ metadata_size = 1+sprintf(metadata, "packed %lu", header_len);
+
+ SHA1_Init(&c);
+ SHA1_Update(&c, metadata, metadata_size);
+ SHA1_Update(&c, header, header_len);
+ item = head;
+ while(item) {
+ SHA1_Update(&c, item->data, item->len);
+ item = item->next;
+ nr++;
+ }
+ SHA1_Final(sha1, &c);
+
+ filename = strdup(sha1_file_name(sha1));
+ fd = open(filename, O_WRONLY | O_CREAT | O_EXCL, 0666);
+ if (fd < 0) {
+ /* add collision check! */
+ if (errno != EEXIST) {
+ ret = -errno;
+ }
+ goto out;
+ }
+ /* compress just the header info */
+ memset(&stream, 0, sizeof(stream));
+ deflateInit(&stream, Z_BEST_COMPRESSION);
+ size = deflateBound(&stream, header_len + metadata_size);
+ compressed = malloc(size);
+
+ stream.next_in = metadata;
+ stream.avail_in = metadata_size;
+ stream.next_out = compressed;
+ stream.avail_out = size;
+ while (deflate(&stream, 0) == Z_OK)
+ /* nothing */;
+ stream.next_in = header;
+ stream.avail_in = header_len;
+ while (deflate(&stream, Z_FINISH) == Z_OK)
+ /* nothing */;
+ deflateEnd(&stream);
+ size = stream.total_out;
+
+ write(fd, compressed, size);
+ free(compressed);
+
+ item = head;
+ while(item) {
+ char *item_file;
+ struct packed_item *next = item->next;
+ write(fd, item->data, item->len);
+ item_file = sha1_file_name(item->sha1);
+ if (link(filename, item_file) && errno != EEXIST) {
+ ret = -errno;
+ break;
+ }
+ free(item->data);
+ free(item);
+ item = next;
+ }
+out:
+ free(header);
+ free(metadata);
+ free(filename);
+ return ret;
+}
diff -ur linus.back/update-cache.c linus/update-cache.c
--- linus.back/update-cache.c 2005-04-25 17:30:21.635651416 -0400
+++ linus/update-cache.c 2005-04-25 14:24:14.000000000 -0400
@@ -12,57 +12,48 @@
* like "update-cache *" and suddenly having all the object
* files be revision controlled.
*/
-static int allow_add = 0, allow_remove = 0;
+static int allow_add = 0, allow_remove = 0, commit = 0;
-static int index_fd(unsigned char *sha1, int fd, struct stat *st)
+static int index_fd(unsigned char *sha1, int fd, struct stat *st, struct packed_item **head, struct packed_item **tail, unsigned long *packed_size)
{
- z_stream stream;
unsigned long size = st->st_size;
- int max_out_bytes = size + 200;
- void *out = malloc(max_out_bytes);
void *metadata = malloc(200);
int metadata_size;
void *in;
- SHA_CTX c;
+ char *copy;
+ int ret;
+ struct packed_item *new_item;
in = "";
if (size)
in = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
close(fd);
- if (!out || (int)(long)in == -1)
+ if (!metadata || (int)(long)in == -1)
return -1;
-
metadata_size = 1+sprintf(metadata, "blob %lu", size);
-
- SHA1_Init(&c);
- SHA1_Update(&c, metadata, metadata_size);
- SHA1_Update(&c, in, size);
- SHA1_Final(sha1, &c);
-
- memset(&stream, 0, sizeof(stream));
- deflateInit(&stream, Z_BEST_COMPRESSION);
-
- /*
- * ASCII size + nul byte
- */
- stream.next_in = metadata;
- stream.avail_in = metadata_size;
- stream.next_out = out;
- stream.avail_out = max_out_bytes;
- while (deflate(&stream, 0) == Z_OK)
- /* nothing */;
-
- /*
- * File content
- */
- stream.next_in = in;
- stream.avail_in = size;
- while (deflate(&stream, Z_FINISH) == Z_OK)
- /*nothing */;
-
- deflateEnd(&stream);
-
- return write_sha1_buffer(sha1, out, stream.total_out);
+ copy = malloc(metadata_size + size);
+ if (!copy)
+ return -1;
+ memcpy(copy, metadata, metadata_size);
+ memcpy(copy + metadata_size, in, size);
+ ret = pack_sha1_buffer(copy, metadata_size + size, sha1, &new_item);
+ if (new_item) {
+ if (*tail)
+ (*tail)->next = new_item;
+ *tail = new_item;
+ if (!*head)
+ *head = new_item;
+ *packed_size += new_item->len;
+ if (*packed_size > (512 * 1024)) {
+ write_packed_buffer(*head);
+ *head = NULL;
+ *tail = NULL;
+ *packed_size = 0;
+ }
+ }
+ munmap(in, size);
+ free(copy);
+ return ret;
}
/*
@@ -85,7 +76,7 @@
ce->ce_size = htonl(st->st_size);
}
-static int add_file_to_cache(char *path)
+static int add_file_to_cache(char *path, struct packed_item **packed_head, struct packed_item **packed_tail, unsigned long *packed_size)
{
int size, namelen;
struct cache_entry *ce;
@@ -113,7 +104,8 @@
ce->ce_mode = create_ce_mode(st.st_mode);
ce->ce_flags = htons(namelen);
- if (index_fd(ce->sha1, fd, &st) < 0)
+ if (index_fd(ce->sha1, fd, &st, packed_head,
+ packed_tail, packed_size) < 0)
return -1;
return add_cache_entry(ce, allow_add);
@@ -282,12 +274,30 @@
unlink(lockfile_name);
}
+static int path_comp(const void *p1, const void *p2)
+{
+ const char *s1 = *(char **)p1;
+ const char *s2 = *(char **)p2;
+ int len1 = strlen(s1);
+ int len2 = strlen(s2);
+ int ret;
+ ret = cache_name_compare(s1, len1, s2, len2);
+ return ret;
+}
+
int main(int argc, char **argv)
{
int i, newfd, entries;
int allow_options = 1;
static char lockfile[MAXPATHLEN+1];
const char *indexfile = get_index_file();
+ struct packed_item *packed_head = NULL;
+ struct packed_item *packed_tail = NULL;
+ unsigned long packed_size = 0;
+ char **paths = malloc(argc * sizeof(char *));
+ int num_paths = 0;
+ unsigned char parent_sha1[20];
+ int parents = 0;
snprintf(lockfile, sizeof(lockfile), "%s.lock", indexfile);
@@ -318,6 +328,17 @@
allow_remove = 1;
continue;
}
+ if (!strcmp(path, "--commit")) {
+ commit = 1;
+ continue;
+ }
+ if (!strcmp(path, "--parent")) {
+ if (i+1 >= argc || get_sha1_hex(argv[i+1], parent_sha1))
+ die("update-cache: --parent sha1");
+ parents = 1;
+ i+=1;
+ continue;
+ }
if (!strcmp(path, "--refresh")) {
refresh_cache();
continue;
@@ -334,8 +355,27 @@
fprintf(stderr, "Ignoring path %s\n", argv[i]);
continue;
}
- if (add_file_to_cache(path))
- die("Unable to add %s to database", path);
+ paths[num_paths++] = path;
+
+ }
+ // qsort(paths, num_paths, sizeof(char *), path_comp);
+ for(i = 0 ; i < num_paths ; i++) {
+ if (add_file_to_cache(paths[i], &packed_head, &packed_tail, &packed_size))
+ die("Unable to add %s to database", paths[i]);
+
+ }
+ if (commit) {
+ char tree_sha1[20];
+ if (write_tree(active_cache, active_nr, "", 0, tree_sha1, &packed_head) != active_nr)
+ die("write-tree failed");
+fprintf(stderr, "write_tree gave us %s\n", sha1_to_hex(tree_sha1));
+
+ if (commit_tree(tree_sha1, &parent_sha1, parents, &packed_head))
+ die("commit-tree failed");
+ }
+ if (packed_head) {
+ if (write_packed_buffer(packed_head))
+ die("write packed buffer failed");
}
if (write_cache(newfd, active_cache, active_nr) || rename(lockfile, indexfile))
die("Unable to write new cachefile");
diff -ur linus.back/write-tree.c linus/write-tree.c
--- linus.back/write-tree.c 2005-04-25 17:30:21.635651416 -0400
+++ linus/write-tree.c 2005-04-25 10:01:30.000000000 -0400
@@ -3,106 +3,15 @@
*
* Copyright (C) Linus Torvalds, 2005
*/
-#include "cache.h"
-
-static int check_valid_sha1(unsigned char *sha1)
-{
- char *filename = sha1_file_name(sha1);
- int ret;
-
- /* If we were anal, we'd check that the sha1 of the contents actually matches */
- ret = access(filename, R_OK);
- if (ret)
- perror(filename);
- return ret;
-}
-
-static int prepend_integer(char *buffer, unsigned val, int i)
-{
- buffer[--i] = '\0';
- do {
- buffer[--i] = '0' + (val % 10);
- val /= 10;
- } while (val);
- return i;
-}
-
-#define ORIG_OFFSET (40) /* Enough space to add the header of "tree <size>\0" */
-
-static int write_tree(struct cache_entry **cachep, int maxentries, const char *base, int baselen, unsigned char *returnsha1)
-{
- unsigned char subdir_sha1[20];
- unsigned long size, offset;
- char *buffer;
- int i, nr;
-
- /* Guess at some random initial size */
- size = 8192;
- buffer = malloc(size);
- offset = ORIG_OFFSET;
-
- nr = 0;
- do {
- struct cache_entry *ce = cachep[nr];
- const char *pathname = ce->name, *filename, *dirname;
- int pathlen = ce_namelen(ce), entrylen;
- unsigned char *sha1;
- unsigned int mode;
-
- /* Did we hit the end of the directory? Return how many we wrote */
- if (baselen >= pathlen || memcmp(base, pathname, baselen))
- break;
- sha1 = ce->sha1;
- mode = ntohl(ce->ce_mode);
-
- /* Do we have _further_ subdirectories? */
- filename = pathname + baselen;
- dirname = strchr(filename, '/');
- if (dirname) {
- int subdir_written;
-
- subdir_written = write_tree(cachep + nr, maxentries - nr, pathname, dirname-pathname+1, subdir_sha1);
- nr += subdir_written;
-
- /* Now we need to write out the directory entry into this tree.. */
- mode = S_IFDIR;
- pathlen = dirname - pathname;
-
- /* ..but the directory entry doesn't count towards the total count */
- nr--;
- sha1 = subdir_sha1;
- }
-
- if (check_valid_sha1(sha1) < 0)
- exit(1);
-
- entrylen = pathlen - baselen;
- if (offset + entrylen + 100 > size) {
- size = alloc_nr(offset + entrylen + 100);
- buffer = realloc(buffer, size);
- }
- offset += sprintf(buffer + offset, "%o %.*s", mode, entrylen, filename);
- buffer[offset++] = 0;
- memcpy(buffer + offset, sha1, 20);
- offset += 20;
- nr++;
- } while (nr < maxentries);
-
- i = prepend_integer(buffer, offset - ORIG_OFFSET, ORIG_OFFSET);
- i -= 5;
- memcpy(buffer+i, "tree ", 5);
-
- write_sha1_file(buffer + i, offset - i, returnsha1);
- free(buffer);
- return nr;
-}
+#include "cache.h"
int main(int argc, char **argv)
{
int i, unmerged;
int entries = read_cache();
unsigned char sha1[20];
+ struct packed_item *head = NULL;
if (entries <= 0)
die("write-tree: no cache contents to write");
@@ -123,8 +32,12 @@
die("write-tree: not able to write tree");
/* Ok, write it out */
- if (write_tree(active_cache, entries, "", 0, sha1) != entries)
+ if (write_tree(active_cache, entries, "", 0, sha1, &head) != entries)
die("write-tree: internal error");
+ if (head) {
+ if (write_packed_buffer(head))
+ die("write_packed_buffer failed");
+ }
printf("%s\n", sha1_to_hex(sha1));
return 0;
}
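For readers following the on-disk layout above: the short sketch below is not part of the patch (the function and variable names are made up for illustration); it simply walks a packed object's header the same way create_packed_header() lays it out and unpack_sha1_file() parses it. Each header entry is a raw 20-byte SHA1 followed by the item's deflated length as ASCII digits, a space, and a nul; the individually deflated items are concatenated after the header in the same order, so summing the lengths as you go gives each item's offset into the payload that follows the header.

/*
 * Standalone sketch (hypothetical names, not part of the patch): walk the
 * inflated header of a "packed" object.  Layout per create_packed_header():
 * [20-byte raw sha1][ASCII length][' ']['\0'], repeated once per item; the
 * individually deflated items follow the header in the same order.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void show_packed_header(const unsigned char *hdr, unsigned long header_len)
{
	const unsigned char *p = hdr;
	unsigned long offset = 0;	/* offset of the item within the payload after the header */

	while (p < hdr + header_len) {
		unsigned long item_len;
		int i;

		for (i = 0; i < 20; i++)
			printf("%02x", p[i]);
		item_len = strtoul((const char *)(p + 20), NULL, 10);
		printf("  %lu bytes deflated, payload offset %lu\n", item_len, offset);

		offset += item_len;
		/* advance past 20-byte sha1 + ASCII length + trailing space + nul */
		p += 20 + strlen((const char *)(p + 20)) + 1;
	}
}

int main(void)
{
	/* build a toy two-entry header the same way create_packed_header() does */
	unsigned char hdr[128];
	unsigned long off = 0;
	unsigned long fake_len[2] = { 1536, 987 };
	int i;

	for (i = 0; i < 2; i++) {
		memset(hdr + off, i + 1, 20);	/* fake sha1 */
		off += 20;
		off += sprintf((char *)(hdr + off), "%lu ", fake_len[i]) + 1;
	}
	show_packed_header(hdr, off);
	return 0;
}

Compiled on its own, this prints each entry's SHA1, deflated size, and relative payload offset for a toy two-entry header; in the patch itself the starting offset is stream.total_in, i.e. the end of the compressed header stream within the mapped file.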
Thread overview: 17+ messages
2005-04-21 15:13 [PATCH] multi item packed files Chris Mason
2005-04-21 15:41 ` Linus Torvalds
2005-04-21 16:23 ` Chris Mason
2005-04-21 19:28 ` Krzysztof Halasa
2005-04-21 20:07 ` Linus Torvalds
2005-04-22 9:40 ` Krzysztof Halasa
2005-04-22 18:12 ` Martin Uecker
2005-04-21 20:22 ` Chris Mason
2005-04-21 22:47 ` Linus Torvalds
2005-04-22 0:16 ` Chris Mason
2005-04-22 16:22 ` Linus Torvalds
2005-04-22 18:58 ` Chris Mason
2005-04-22 19:43 ` Linus Torvalds
2005-04-22 20:32 ` Chris Mason
2005-04-22 23:55 ` Chris Mason
2005-04-25 22:20 ` Chris Mason
2005-04-22 9:48 ` Krzysztof Halasa