Re: Creating objects manually and repack

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Shawn Pearce <spearce@spearce.org>
To: Jon Smirl <jonsmirl@gmail.com>
Cc: Martin Langhoff <martin.langhoff@gmail.com>,
	Linus Torvalds <torvalds@osdl.org>, git <git@vger.kernel.org>
Subject: Re: Creating objects manually and repack
Date: Sat, 5 Aug 2006 01:46:55 -0400	[thread overview]
Message-ID: <20060805054655.GB18679@spearce.org> (raw)
In-Reply-To: <20060805052135.GA18679@spearce.org>

[-- Attachment #1: Type: text/plain, Size: 2432 bytes --]

Shawn Pearce <spearce@spearce.org> wrote:
> Jon Smirl <jonsmirl@gmail.com> wrote:
> > On 8/5/06, Martin Langhoff <martin.langhoff@gmail.com> wrote:
> > >On 8/5/06, Jon Smirl <jonsmirl@gmail.com> wrote:
> > >> On 8/4/06, Linus Torvalds <torvalds@osdl.org> wrote:
> > >> > and you're basically all done. The above would turn each *,v file into 
> > >a
> > >> > *-<sha>.pack/*-<sha>.idx file pair, so you'd have exactly as many
> > >> > pack-files as you have *,v files.
> > >>
> > >> I'll end up with 110,000 pack files.
> > >
> > >Then just do it every 100 files, and you'll only have 1,100 pack
> > >files, and it'll be fine.
> > 
> > This is something that has to be tuned. If you wait too long
> > everything spills out of RAM and you go totally IO bound for days. If
> > you do it too often you end up with too many packs and it takes a day
> > to repack them.
> > 
> > If I had a way to pipe the all of the objects into repack one at a
> > time without repack doing multiple passes none of this tuning would be
> > necessary. In this model the standalone objects never get created in
> > the first place. The fastest IO is IO that has been eliminated.
> 
> I'm almost done with what I'm calling `git-fast-import`.

OK, now I'm done.  I'm attaching the code.  Toss it into the Makefile
as git-fast-import and recompile.

I tested it with the following Perl script, feeding the Perl script
a list of files that I wanted blobs for on STDIN:

	while (<>) {
		chop;
		print pack('L', -s $_);
		open(F, $_);
		my $buf;
		print $buf while read(F,$buf,128*1024) > 0;
		close F;
	}

This gave me an execution order of:

	find . -name '*.c' | perl test.pl | git-fast-import in.pack
	git-index-pack in.pack

at which point in.pack claims to be a completely valid pack with an
index of in.idx.  Move these into .git/objects/pack, generate trees
and commits, and run git-repack -a -d.  If the order you feed the
objects to git-fast-import in is reasonable (do one RCS file at a
time, feed most recent to least recent revisions) you may not get
any major benefit from using -f during your final repack.

The code for git-fast-import could probably be tweaked to accept
trees and commits too, which would permit you to stream the entire
CVS repository into a single pack file.  :-)

I can't help you decompress the RCS files faster, but hopefully
this will help you generate the GIT pack faster.  Hopefully you
can make use of it!

-- 
Shawn.

[-- Attachment #2: fast-import.c --]
[-- Type: text/x-csrc, Size: 4659 bytes --]

#include "builtin.h"
#include "cache.h"
#include "object.h"
#include "blob.h"
#include "delta.h"
#include "pack.h"
#include "csum-file.h"

static int max_depth = 10;
static unsigned long object_count;
static int packfd;
static int current_depth;
static void *lastdat;
static unsigned long lastdatlen;
static unsigned char lastsha1[20];

static ssize_t yread(int fd, void *buffer, size_t length)
{
	ssize_t ret = 0;
	while (ret < length) {
		ssize_t size = xread(fd, (char *) buffer + ret, length - ret);
		if (size < 0) {
			return size;
		}
		if (size == 0) {
			return ret;
		}
		ret += size;
	}
	return ret;
}

static ssize_t ywrite(int fd, void *buffer, size_t length)
{
	ssize_t ret = 0;
	while (ret < length) {
		ssize_t size = xwrite(fd, (char *) buffer + ret, length - ret);
		if (size < 0) {
			return size;
		}
		if (size == 0) {
			return ret;
		}
		ret += size;
	}
	return ret;
}

static unsigned long encode_header(enum object_type type, unsigned long size, unsigned char *hdr)
{
	int n = 1;
	unsigned char c;

	if (type < OBJ_COMMIT || type > OBJ_DELTA)
		die("bad type %d", type);

	c = (type << 4) | (size & 15);
	size >>= 4;
	while (size) {
		*hdr++ = c | 0x80;
		c = size & 0x7f;
		size >>= 7;
		n++;
	}
	*hdr = c;
	return n;
}

static void write_blob (void *dat, unsigned long datlen)
{
	z_stream s;
	void *out, *delta;
	unsigned char hdr[64];
	unsigned long hdrlen, deltalen;

	if (lastdat && current_depth < max_depth) {
		delta = diff_delta(lastdat, lastdatlen,
			dat, datlen,
			&deltalen, 0);
	} else
		delta = 0;

	memset(&s, 0, sizeof(s));
	deflateInit(&s, zlib_compression_level);

	if (delta) {
		current_depth++;
		s.next_in = delta;
		s.avail_in = deltalen;
		hdrlen = encode_header(OBJ_DELTA, deltalen, hdr);
		if (ywrite(packfd, hdr, hdrlen) != hdrlen)
			die("Can't write object header: %s", strerror(errno));
		if (ywrite(packfd, lastsha1, sizeof(lastsha1)) != sizeof(lastsha1))
			die("Can't write object base: %s", strerror(errno));
	} else {
		current_depth = 0;
		s.next_in = dat;
		s.avail_in = datlen;
		hdrlen = encode_header(OBJ_BLOB, datlen, hdr);
		if (ywrite(packfd, hdr, hdrlen) != hdrlen)
			die("Can't write object header: %s", strerror(errno));
	}

	s.avail_out = deflateBound(&s, s.avail_in);
	s.next_out = out = xmalloc(s.avail_out);
	while (deflate(&s, Z_FINISH) == Z_OK)
		/* nothing */;
	deflateEnd(&s);

	if (ywrite(packfd, out, s.total_out) != s.total_out)
		die("Failed writing compressed data %s", strerror(errno));

	free(out);
	if (delta)
		free(delta);
}

static void init_pack_header ()
{
	const char* magic = "PACK";
	unsigned long version = 2;
	unsigned long zero = 0;

	version = htonl(version);

	if (ywrite(packfd, (char*)magic, 4) != 4)
		die("Can't write pack magic: %s", strerror(errno));
	if (ywrite(packfd, &version, 4) != 4)
		die("Can't write pack version: %s", strerror(errno));
	if (ywrite(packfd, &zero, 4) != 4)
		die("Can't write 0 object count: %s", strerror(errno));
}

static void fixup_header_footer ()
{
	SHA_CTX c;
	char hdr[8];
	unsigned char sha1[20];
	unsigned long cnt;
	char *buf;
	size_t n;

	if (lseek(packfd, 0, SEEK_SET) != 0)
		die("Failed seeking to start: %s", strerror(errno));

	SHA1_Init(&c);
	if (yread(packfd, hdr, 8) != 8)
		die("Failed reading header: %s", strerror(errno));
	SHA1_Update(&c, hdr, 8);

fprintf(stderr, "%lu objects\n", object_count);
	cnt = htonl(object_count);
	SHA1_Update(&c, &cnt, 4);
	if (ywrite(packfd, &cnt, 4) != 4)
		die("Failed writing object count: %s", strerror(errno));

	buf = xmalloc(128 * 1024);
	for (;;) {
		n = xread(packfd, buf, 128 * 1024);
		if (n <= 0)
			break;
		SHA1_Update(&c, buf, n);
	}
	free(buf);

	SHA1_Final(sha1, &c);
	if (ywrite(packfd, sha1, sizeof(sha1)) != sizeof(sha1))
		die("Failed writing pack checksum: %s", strerror(errno));
}

int main (int argc, const char **argv)
{
	packfd = open(argv[1], O_RDWR|O_CREAT|O_TRUNC, 0666);
	if (packfd < 0)
		die("Can't create pack file %s: %s", argv[1], strerror(errno));

	init_pack_header();
	for (;;) {
		unsigned long datlen;
		int hdrlen;
		void *dat;
		char hdr[128];
		unsigned char sha1[20];
		SHA_CTX c;

		if (yread(0, &datlen, 4) != 4)
			break;

		dat = xmalloc(datlen);
		if (yread(0, dat, datlen) != datlen)
			break;

		hdrlen = sprintf(hdr, "blob %lu", datlen) + 1;
		SHA1_Init(&c);
		SHA1_Update(&c, hdr, hdrlen);
		SHA1_Update(&c, dat, datlen);
		SHA1_Final(sha1, &c);

		write_blob(dat, datlen);
		object_count++;
		printf("%s\n", sha1_to_hex(sha1));
		fflush(stdout);

		if (lastdat)
			free(lastdat);
		lastdat = dat;
		lastdatlen = datlen;
		memcpy(lastsha1, sha1, sizeof(sha1));
	}
	fixup_header_footer();
	close(packfd);

	return 0;
}

     prev parent reply	other threads:[~2006-08-05  5:47 UTC|newest]

Thread overview: 36+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2006-08-04  3:43 Creating objects manually and repack Jon Smirl
2006-08-04  3:58 ` Jeff King
2006-08-04  4:01 ` Linus Torvalds
2006-08-04  4:24   ` Jon Smirl
2006-08-04  4:46     ` Linus Torvalds
2006-08-04  5:01       ` Linus Torvalds
2006-08-04  5:11         ` Jon Smirl
2006-08-04 14:40         ` Jon Smirl
2006-08-04 14:50           ` Jon Smirl
2006-08-04 15:22             ` Linus Torvalds
2006-08-04 15:41               ` Jon Smirl
2006-08-04 16:01                 ` A Large Angry SCM
2006-08-04 16:11                   ` Jon Smirl
2006-08-04 16:32                     ` Linus Torvalds
2006-08-04 16:56                   ` Linus Torvalds
2006-08-04 16:39                 ` Rogan Dawes
2006-08-04 16:53                   ` Jon Smirl
2006-08-04 16:53                 ` Linus Torvalds
2006-08-04 17:17                   ` Jon Smirl
2006-08-04 17:29                     ` Linus Torvalds
2006-08-04 18:06                       ` Linus Torvalds
2006-08-04 18:24                         ` Junio C Hamano
2006-08-04 19:20                           ` Linus Torvalds
2006-08-04 19:31                             ` Carl Worth
2006-08-04 19:57                               ` Junio C Hamano
2006-08-04 20:08                                 ` Carl Worth
2006-08-04 20:08                                 ` Carl Worth
2006-08-04 20:12                                 ` Jakub Narebski
2006-08-04 20:30                                   ` Junio C Hamano
2006-08-04 20:37                                     ` Jakub Narebski
2006-08-05  4:15                     ` Martin Langhoff
2006-08-05  5:12                       ` Jon Smirl
2006-08-05  5:21                         ` Shawn Pearce
2006-08-05  5:40                           ` Jon Smirl
2006-08-05  5:52                             ` Shawn Pearce
2006-08-05  5:46                           ` Shawn Pearce [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20060805054655.GB18679@spearce.org \
    --to=spearce@spearce.org \
    --cc=git@vger.kernel.org \
    --cc=jonsmirl@gmail.com \
    --cc=martin.langhoff@gmail.com \
    --cc=torvalds@osdl.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.