git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Shawn Pearce <spearce@spearce.org>
To: Jon Smirl <jonsmirl@gmail.com>
Cc: Martin Langhoff <martin.langhoff@gmail.com>,
	Linus Torvalds <torvalds@osdl.org>, git <git@vger.kernel.org>
Subject: Re: Creating objects manually and repack
Date: Sat, 5 Aug 2006 01:46:55 -0400	[thread overview]
Message-ID: <20060805054655.GB18679@spearce.org> (raw)
In-Reply-To: <20060805052135.GA18679@spearce.org>

[-- Attachment #1: Type: text/plain, Size: 2432 bytes --]

Shawn Pearce <spearce@spearce.org> wrote:
> Jon Smirl <jonsmirl@gmail.com> wrote:
> > On 8/5/06, Martin Langhoff <martin.langhoff@gmail.com> wrote:
> > >On 8/5/06, Jon Smirl <jonsmirl@gmail.com> wrote:
> > >> On 8/4/06, Linus Torvalds <torvalds@osdl.org> wrote:
> > > >> > and you're basically all done. The above would turn each *,v file into a
> > >> > *-<sha>.pack/*-<sha>.idx file pair, so you'd have exactly as many
> > >> > pack-files as you have *,v files.
> > >>
> > >> I'll end up with 110,000 pack files.
> > >
> > >Then just do it every 100 files, and you'll only have 1,100 pack
> > >files, and it'll be fine.
> > 
> > This is something that has to be tuned. If you wait too long
> > everything spills out of RAM and you go totally IO bound for days. If
> > you do it too often you end up with too many packs and it takes a day
> > to repack them.
> > 
> > > If I had a way to pipe all of the objects into repack one at a
> > time without repack doing multiple passes none of this tuning would be
> > necessary. In this model the standalone objects never get created in
> > the first place. The fastest IO is IO that has been eliminated.
> 
> I'm almost done with what I'm calling `git-fast-import`.

OK, now I'm done.  I'm attaching the code.  Toss it into the Makefile
as git-fast-import and recompile.

I tested it with the following Perl script, feeding the Perl script
a list of files that I wanted blobs for on STDIN:

	# For every path read from STDIN, emit a native 32-bit length
	# followed by the raw bytes of that file.
	binmode STDOUT;
	while (<>) {
		# chomp strips only a trailing newline; chop would eat a real
		# character if the last line is not newline-terminated.
		chomp;
		# Open before printing the length so a failure cannot leave a
		# length record without its data in the output stream.
		open(F, '<', $_) or die "Can't open $_: $!";
		binmode F;
		print pack('L', -s F);
		my $buf;
		print $buf while read(F,$buf,128*1024) > 0;
		close F;
	}

This gave me an execution order of:

	find . -name '*.c' | perl test.pl | git-fast-import in.pack
	git-index-pack in.pack

at which point in.pack claims to be a completely valid pack with an
index of in.idx.  Move these into .git/objects/pack, generate trees
and commits, and run git-repack -a -d.  If the order you feed the
objects to git-fast-import in is reasonable (do one RCS file at a
time, feed most recent to least recent revisions) you may not get
any major benefit from using -f during your final repack.

The code for git-fast-import could probably be tweaked to accept
trees and commits too, which would permit you to stream the entire
CVS repository into a single pack file.  :-)

I can't help you decompress the RCS files faster, but hopefully
this will help you generate the GIT pack faster.  Hopefully you
can make use of it!

-- 
Shawn.

[-- Attachment #2: fast-import.c --]
[-- Type: text/x-csrc, Size: 4659 bytes --]

#include "builtin.h"
#include "cache.h"
#include "object.h"
#include "blob.h"
#include "delta.h"
#include "pack.h"
#include "csum-file.h"

/* Upper bound on current_depth; write_blob() only deltas while below it. */
static int max_depth = 10;
/* Number of blobs written; stored in the pack header by fixup_header_footer(). */
static unsigned long object_count;
/* File descriptor of the pack file being written (opened in main). */
static int packfd;
/* Incremented for each delta written, reset to 0 when a full blob is written. */
static int current_depth;
/* Data of the previously written blob; used as the delta base for the next one. */
static void *lastdat;
/* Length in bytes of lastdat. */
static unsigned long lastdatlen;
/* SHA-1 of the previously written blob; emitted as the base id of a delta. */
static unsigned char lastsha1[20];

/*
 * Read up to `length` bytes from `fd` into `buffer`, calling xread()
 * repeatedly until the request is satisfied.
 *
 * Returns the number of bytes read, which is less than `length` only if
 * xread() reported 0 bytes (end of input) first.  If xread() returns a
 * negative value, that value is returned immediately.
 */
static ssize_t yread(int fd, void *buffer, size_t length)
{
	char *cursor = buffer;
	size_t remaining = length;

	while (remaining > 0) {
		ssize_t got = xread(fd, cursor, remaining);

		if (got < 0)
			return got;
		if (got == 0)
			break;
		cursor += got;
		remaining -= got;
	}
	return (ssize_t)(length - remaining);
}

/*
 * Write `length` bytes from `buffer` to `fd`, calling xwrite() repeatedly
 * until everything has been written.
 *
 * Returns the number of bytes written, which is less than `length` only if
 * xwrite() reported 0 bytes first.  If xwrite() returns a negative value,
 * that value is returned immediately.
 */
static ssize_t ywrite(int fd, void *buffer, size_t length)
{
	char *cursor = buffer;
	size_t remaining = length;

	while (remaining > 0) {
		ssize_t put = xwrite(fd, cursor, remaining);

		if (put < 0)
			return put;
		if (put == 0)
			break;
		cursor += put;
		remaining -= put;
	}
	return (ssize_t)(length - remaining);
}

/*
 * Encode an object type and size into `hdr` as a variable-length header.
 *
 * The first byte carries the type in bits 4-6 and the low 4 bits of the
 * size; each following byte carries the next 7 bits of the size.  Every
 * byte except the last has its high bit (0x80) set.
 *
 * Dies if `type` is outside the range OBJ_COMMIT..OBJ_DELTA.
 * Returns the number of bytes stored in `hdr`.
 */
static unsigned long encode_header(enum object_type type, unsigned long size, unsigned char *hdr)
{
	unsigned long used = 0;
	unsigned char cur;

	if (!(type >= OBJ_COMMIT && type <= OBJ_DELTA))
		die("bad type %d", type);

	cur = (type << 4) | (size & 15);
	for (size >>= 4; size; size >>= 7) {
		/* more size bits follow: flag this byte as a continuation */
		hdr[used++] = cur | 0x80;
		cur = size & 0x7f;
	}
	hdr[used++] = cur;
	return used;
}

static void write_blob (void *dat, unsigned long datlen)
{
	z_stream s;
	void *out, *delta;
	unsigned char hdr[64];
	unsigned long hdrlen, deltalen;

	if (lastdat && current_depth < max_depth) {
		delta = diff_delta(lastdat, lastdatlen,
			dat, datlen,
			&deltalen, 0);
	} else
		delta = 0;

	memset(&s, 0, sizeof(s));
	deflateInit(&s, zlib_compression_level);

	if (delta) {
		current_depth++;
		s.next_in = delta;
		s.avail_in = deltalen;
		hdrlen = encode_header(OBJ_DELTA, deltalen, hdr);
		if (ywrite(packfd, hdr, hdrlen) != hdrlen)
			die("Can't write object header: %s", strerror(errno));
		if (ywrite(packfd, lastsha1, sizeof(lastsha1)) != sizeof(lastsha1))
			die("Can't write object base: %s", strerror(errno));
	} else {
		current_depth = 0;
		s.next_in = dat;
		s.avail_in = datlen;
		hdrlen = encode_header(OBJ_BLOB, datlen, hdr);
		if (ywrite(packfd, hdr, hdrlen) != hdrlen)
			die("Can't write object header: %s", strerror(errno));
	}

	s.avail_out = deflateBound(&s, s.avail_in);
	s.next_out = out = xmalloc(s.avail_out);
	while (deflate(&s, Z_FINISH) == Z_OK)
		/* nothing */;
	deflateEnd(&s);

	if (ywrite(packfd, out, s.total_out) != s.total_out)
		die("Failed writing compressed data %s", strerror(errno));

	free(out);
	if (delta)
		free(delta);
}

/*
 * Write the 12-byte pack header to packfd: the "PACK" magic, version 2
 * and an object count of 0.  The real object count is patched in later
 * by fixup_header_footer().
 *
 * The version and count are spelled out as big-endian byte arrays rather
 * than written from the first 4 bytes of an unsigned long, which emits
 * the wrong bytes on platforms where unsigned long is 8 bytes and
 * big-endian.
 *
 * Dies if any write fails.
 */
static void init_pack_header (void)
{
	unsigned char magic[4] = { 'P', 'A', 'C', 'K' };
	unsigned char version[4] = { 0, 0, 0, 2 };
	unsigned char zero[4] = { 0, 0, 0, 0 };

	if (ywrite(packfd, magic, 4) != 4)
		die("Can't write pack magic: %s", strerror(errno));
	if (ywrite(packfd, version, 4) != 4)
		die("Can't write pack version: %s", strerror(errno));
	if (ywrite(packfd, zero, 4) != 4)
		die("Can't write 0 object count: %s", strerror(errno));
}

/*
 * Finish the pack file open on packfd.
 *
 * Seeks back to the start, re-reads the first 8 header bytes, overwrites
 * the following 4 bytes with object_count in big-endian order, then reads
 * the rest of the file to compute the SHA-1 of everything written and
 * appends that 20-byte checksum at the end of the file.
 *
 * Also reports the object count on stderr.  Dies on any I/O failure.
 */
static void fixup_header_footer (void)
{
	SHA_CTX c;
	char hdr[8];
	unsigned char cnt[4];
	unsigned char sha1[20];
	char *buf;
	ssize_t n;

	if (lseek(packfd, 0, SEEK_SET) != 0)
		die("Failed seeking to start: %s", strerror(errno));

	SHA1_Init(&c);
	if (yread(packfd, hdr, 8) != 8)
		die("Failed reading header: %s", strerror(errno));
	SHA1_Update(&c, hdr, 8);

	fprintf(stderr, "%lu objects\n", object_count);

	/*
	 * Build the 32-bit big-endian count byte by byte; writing the first
	 * 4 bytes of an unsigned long is wrong where that type is 8 bytes
	 * and big-endian.
	 */
	cnt[0] = (object_count >> 24) & 0xff;
	cnt[1] = (object_count >> 16) & 0xff;
	cnt[2] = (object_count >> 8) & 0xff;
	cnt[3] = object_count & 0xff;
	SHA1_Update(&c, cnt, 4);
	if (ywrite(packfd, cnt, 4) != 4)
		die("Failed writing object count: %s", strerror(errno));

	buf = xmalloc(128 * 1024);
	for (;;) {
		/* n must be signed so that a read error (-1) is detectable */
		n = xread(packfd, buf, 128 * 1024);
		if (n < 0)
			die("Failed reading pack data: %s", strerror(errno));
		if (n == 0)
			break;
		SHA1_Update(&c, buf, n);
	}
	free(buf);

	/* The read loop left the file offset at EOF, so this appends. */
	SHA1_Final(sha1, &c);
	if (ywrite(packfd, sha1, sizeof(sha1)) != sizeof(sha1))
		die("Failed writing pack checksum: %s", strerror(errno));
}

/*
 * Entry point: build a pack file from a stream of blobs on stdin.
 *
 * argv[1] names the pack file to create (truncated if it exists).
 * stdin carries a sequence of records, each a 4-byte length in host byte
 * order followed by that many bytes of blob data.  For every record the
 * blob is appended to the pack and its SHA-1 is printed in hex on stdout.
 * Input ends at a clean end-of-file on a record boundary; a record that
 * is cut short is a fatal error.
 *
 * Returns 0 on success; dies on any error.
 */
int main (int argc, const char **argv)
{
	if (argc < 2)
		die("usage: git-fast-import <packfile>");

	packfd = open(argv[1], O_RDWR|O_CREAT|O_TRUNC, 0666);
	if (packfd < 0)
		die("Can't create pack file %s: %s", argv[1], strerror(errno));

	init_pack_header();
	for (;;) {
		/*
		 * The length is read into a zero-initialised 4-byte integer
		 * (assumes unsigned int is 32 bits wide).  Reading 4 bytes
		 * straight into an unsigned long leaves the remaining bytes
		 * uninitialised where that type is 8 bytes.
		 */
		unsigned int rawlen = 0;
		unsigned long datlen;
		int hdrlen;
		ssize_t got;
		void *dat;
		char hdr[128];
		unsigned char sha1[20];
		SHA_CTX c;

		got = yread(0, &rawlen, 4);
		if (got == 0)
			break;
		if (got < 0)
			die("Can't read blob length: %s", strerror(errno));
		if (got != 4)
			die("Truncated blob length on stdin");
		datlen = rawlen;

		dat = xmalloc(datlen);
		got = yread(0, dat, datlen);
		if (got < 0)
			die("Can't read blob data: %s", strerror(errno));
		if ((unsigned long) got != datlen)
			die("Truncated blob data on stdin: expected %lu bytes, got %lu",
				datlen, (unsigned long) got);

		/* Object id: SHA-1 over "blob <len>" plus its NUL, then the data. */
		hdrlen = sprintf(hdr, "blob %lu", datlen) + 1;
		SHA1_Init(&c);
		SHA1_Update(&c, hdr, hdrlen);
		SHA1_Update(&c, dat, datlen);
		SHA1_Final(sha1, &c);

		write_blob(dat, datlen);
		object_count++;
		printf("%s\n", sha1_to_hex(sha1));
		fflush(stdout);

		/* Keep this blob as the delta base for the next one. */
		free(lastdat);
		lastdat = dat;
		lastdatlen = datlen;
		memcpy(lastsha1, sha1, sizeof(sha1));
	}
	free(lastdat);
	lastdat = NULL;

	fixup_header_footer();
	if (close(packfd))
		die("Failed closing pack file: %s", strerror(errno));

	return 0;
}

      parent reply	other threads:[~2006-08-05  5:47 UTC|newest]

Thread overview: 36+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2006-08-04  3:43 Creating objects manually and repack Jon Smirl
2006-08-04  3:58 ` Jeff King
2006-08-04  4:01 ` Linus Torvalds
2006-08-04  4:24   ` Jon Smirl
2006-08-04  4:46     ` Linus Torvalds
2006-08-04  5:01       ` Linus Torvalds
2006-08-04  5:11         ` Jon Smirl
2006-08-04 14:40         ` Jon Smirl
2006-08-04 14:50           ` Jon Smirl
2006-08-04 15:22             ` Linus Torvalds
2006-08-04 15:41               ` Jon Smirl
2006-08-04 16:01                 ` A Large Angry SCM
2006-08-04 16:11                   ` Jon Smirl
2006-08-04 16:32                     ` Linus Torvalds
2006-08-04 16:56                   ` Linus Torvalds
2006-08-04 16:39                 ` Rogan Dawes
2006-08-04 16:53                   ` Jon Smirl
2006-08-04 16:53                 ` Linus Torvalds
2006-08-04 17:17                   ` Jon Smirl
2006-08-04 17:29                     ` Linus Torvalds
2006-08-04 18:06                       ` Linus Torvalds
2006-08-04 18:24                         ` Junio C Hamano
2006-08-04 19:20                           ` Linus Torvalds
2006-08-04 19:31                             ` Carl Worth
2006-08-04 19:57                               ` Junio C Hamano
2006-08-04 20:08                                 ` Carl Worth
2006-08-04 20:08                                 ` Carl Worth
2006-08-04 20:12                                 ` Jakub Narebski
2006-08-04 20:30                                   ` Junio C Hamano
2006-08-04 20:37                                     ` Jakub Narebski
2006-08-05  4:15                     ` Martin Langhoff
2006-08-05  5:12                       ` Jon Smirl
2006-08-05  5:21                         ` Shawn Pearce
2006-08-05  5:40                           ` Jon Smirl
2006-08-05  5:52                             ` Shawn Pearce
2006-08-05  5:46                           ` Shawn Pearce [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20060805054655.GB18679@spearce.org \
    --to=spearce@spearce.org \
    --cc=git@vger.kernel.org \
    --cc=jonsmirl@gmail.com \
    --cc=martin.langhoff@gmail.com \
    --cc=torvalds@osdl.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).