git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* git-archimport-script - 2nd iteration
@ 2005-08-24 12:14 Martin Langhoff
  0 siblings, 0 replies; only message in thread
From: Martin Langhoff @ 2005-08-24 12:14 UTC (permalink / raw)
  To: GIT

[-- Attachment #1: Type: text/plain, Size: 492 bytes --]

Gave the code another pass. The code should be more readable, and make
a bit more sense.

It now:
 - handles commit timestamps correctly
 - handles binary files correctly
 - uses parselog() to tell git-update-cache what's been
added/deleted/modified - much faster commits on large trees
 - gets the commit msgs mostly ok

In my immediate TODO:
 - handling renames
 - branches
 - incremental import

A bit further off:
 - file modes
 - merge detection

cheers,


martin

[-- Attachment #2: git-archimport-script --]
[-- Type: application/octet-stream, Size: 11396 bytes --]

#!/usr/bin/perl -w

# This tool is copyright (c) 2005, Matthias Urlichs.
# It is released under the Gnu Public License, version 2.
#
# The basic idea is to walk the output of tla abrowse, 
# fetch the changesets and apply them. 
#

=head1 Devel tricks

Add print in front of the shell commands invoked via backticks. 

=head2 TODO

 - deal with renames correctly
 - deal with branches correctly
 - keep track of merged patches, and mark a git merge when it happens
 - smarter rules to parse the archive history "up" and "down"
 - be able to continue an import where we left off

=cut

use strict;
use warnings;
use Getopt::Std;
use File::Spec;
use File::Temp qw(tempfile);
use File::Path qw(mkpath);
use File::Basename qw(basename dirname);
use String::ShellQuote;
use Time::Local;
use IO::Socket;
use IO::Pipe;
use POSIX qw(strftime dup2);
use Data::Dumper qw/ Dumper /;
use IPC::Open2;

# Ignore SIGPIPE instead of letting it terminate the script.
$SIG{PIPE} = 'IGNORE';

# Everything below, including the commands we spawn, runs with TZ=UTC.
$ENV{TZ} = 'UTC';

# Package globals that Getopt::Std's getopts() fills in, one per
# option letter ($opt_<letter>).
our (
    $opt_h, $opt_A, $opt_v, $opt_k, $opt_d,
    $opt_p, $opt_C, $opt_z, $opt_i, $opt_t,
);

# Print the command-line synopsis to STDERR and exit with status 1.
# Never returns.
#
# The synopsis lists -i as well: getopts() accepts it and the script
# treats it as the "initial import" switch.
sub usage() {
	print STDERR <<END;
Usage: ${\basename $0}     # fetch/update GIT from Arch
       [ -h ] [ -v ] [ -i ] [ -A archive ]
       [ -C GIT_repository ] [ -t tempdir ] 
       [ arch-branch ]
END
	exit(1);
}

# Parse the command line: -h, -v and -i are flags; -A, -C and -t take
# a value.  Any unknown option, or -h, prints the usage text and exits.
getopts("hviA:C:t:") or usage();
usage() if $opt_h;

# At most one positional argument (the arch branch) is accepted.
usage() if @ARGV > 1;

# Scratch directory: <tempdir>/git-archimport/, where <tempdir> is the
# value of -t, or /tmp when -t is absent.
my $tmp = ($opt_t || '/tmp') . '/git-archimport/';

# GIT repository given with -C; defaults to the current directory.
my $git_tree = $opt_C || ".";

# Optional arch branch handed to tla; empty when none was given.
my $arch_branch = @ARGV == 1 ? $ARGV[0] : '';

# TODO: handle more than one repo
#
# Run "tla abrowse" and parse its indented output into @psets: one hash
# per patchset holding, as far as the output provides them, the keys
#   id, type ('s' simple changeset, 'i' initial import, 't' tag), tag,
#   date, author, email, subj and merges (array ref).
#
# tla is started from an argument list instead of an interpolated shell
# string, so the archive and branch names reach it verbatim, and -A is
# only passed when an archive was actually given.
my @abrowse_cmd = ('tla', 'abrowse', '-f');
push @abrowse_cmd, '-A', $opt_A if defined $opt_A;
push @abrowse_cmd, '--desc', '--merges';
push @abrowse_cmd, $arch_branch if length $arch_branch;

open(my $abrowse, '-|', @abrowse_cmd)
    or die "Problems with tla abrowse: $!";

my @psets  = (); # the collection
my %ps        = (); # the current one
my $mode      = ''; # not used by the parser below
my $lastseen  = ''; # last field parsed: id, date, subj or merges

while (my $line = <$abrowse>) {
    chomp $line;

    # a patchset id line is indented by exactly 8 spaces
    if ($line =~ s/^\s{8}\b//) {

        # store the record we just captured
        if (%ps) {
            push @psets, { %ps };   # copy, so that %ps can be reused
            %ps = ();
        }

        my ($id, $type) = split(m/\s{3}/, $line);
        $type = '' unless defined $type;    # id line without a type
        $ps{id} = $id;

        # deal with types
        if ($type =~ m/^\(simple changeset\)/) {
            $ps{type} = 's';
        } elsif ($type eq '(initial import)') {
            $ps{type} = 'i';
        } elsif ($type =~ m/^\(tag revision of (.+)\)/) {
            $ps{type} = 't';
            $ps{tag}  = $1;
        } else {
            warn "Unknown type $type";
        }
        $lastseen = 'id';
    }

    # 10 leading spaces or more indicate commit metadata
    if ($line =~ s/^\s{10}//) {

        # date & author
        if ($lastseen eq 'id' && $line =~ m/^\d{4}-\d{2}-\d{2}/) {

            my ($date, $authoremail) = split(m/\s{2,}/, $line);
            $ps{date}   = $date;
            $ps{date}   =~ s/\bGMT$//; # strip off trailing GMT
            if ($ps{date} =~ m/\b\w+$/) {
                warn 'Arch dates not in GMT?! - imported dates will be wrong';
            }

            # only trust $1/$2 when the match really succeeded
            if (defined $authoremail
                && $authoremail =~ m/^(.+)\s(\S+)$/) {
                $ps{author} = $1;
                $ps{email}  = $2;
            } else {
                warn "Cannot parse author and email for $ps{id}";
            }

            $lastseen = 'date';

        } elsif ($lastseen eq 'date') {
            # the only hint is position
            # subject is after date
            $ps{subj} = $line;
            $lastseen = 'subj';

        } elsif ($lastseen eq 'subj' && $line eq 'merges in:') {
            $ps{merges} = [];
            $lastseen = 'merges';

        } elsif ($lastseen eq 'merges' && $line =~ s/^\s{2}//) {
            push (@{$ps{merges}}, $line);
        } else {
            warn 'more metadata after merges!?';
        }

    }
}

# store the last record
if (%ps) {
    push @psets, { %ps };
    %ps = ();
}

# close() on a pipe also reports the command's exit status: it returns
# false with $! == 0 when the command itself exited non-zero.
close $abrowse
    or die($! ? "Problems reading from tla abrowse: $!"
              : "tla abrowse failed with exit status " . ($? >> 8));

## Order patches by time; the patch id breaks ties between patches
## that carry the same date (each side is keyed on its OWN id)
@psets = sort {$a->{date}.$a->{id} cmp $b->{date}.$b->{id}} @psets;

#print Dumper \@psets;

##
## TODO cleanup irrelevant patches
##      and put an initial import
##      or a full tag

# With -i this run is an initial import: the first patchset must be an
# arch import or a tag, and git-init-db is run in the current directory.
if ($opt_i) { # initial import 
    # looking at $psets[0] on an empty list would autovivify an entry
    @psets or die "No patchsets found -- nothing to import";

    my $first = $psets[0];
    if ($first->{type} eq 'i' || $first->{type} eq 't') {
        print "Starting import from $first->{id}\n";
    } else {
        die "Need to start from an import or a tag -- cannot use $first->{id}";
    }

    `git-init-db`;
    # $? holds the child's wait status; $! only means something when
    # the command could not be started at all ($? == -1)
    die "git-init-db failed: "
        . ($? == -1 ? "could not run: $!" : 'exit status ' . ($? >> 8))
        if $?;
}

# process
#
# Replay the patchsets in @psets order.  For each one: check that the
# working tree is clean, switch to (or create) its branch, apply the
# import or changeset, update the index from what the arch commit log
# says was added/removed/modified, write a tree, commit it, and point
# the branch ref and .git/HEAD at the new commit.

# Explain the failure recorded in $? by the last backtick or waitpid.
# $! is only meaningful when the command could not be started at all.
my $child_error = sub {
    return "could not run: $!"              if $? == -1;
    return 'killed by signal ' . ($? & 127) if $? & 127;
    return 'exit status ' . ($? >> 8);
};

foreach my $ps (@psets) {

    $ps->{branch} = branchname($ps->{id});
    my $headfile  = ".git/refs/heads/$ps->{branch}";

    #
    # ensure we have a clean state
    #
    if (`git diff-files`) {
        die "Unclean tree when about to process $ps->{id} " .
            " - did we fail to commit cleanly before?";
    }
    die "git diff-files failed: " . $child_error->() if $?;

    #
    # create the branch if needed
    # TODO: Find the ancestor!
    #
    if (-e $headfile) {
        `git checkout    $ps->{branch}`;
    } else {
        `git checkout -b $ps->{branch}`;
    }
    die "git checkout of $ps->{branch} failed: " . $child_error->() if $?;

    #
    # Apply the import/changeset/merge into the working tree
    #
    if ($ps->{type} eq 'i') {
        apply_import($ps) or die "Cannot apply import $ps->{id}";
    } elsif ($ps->{type} eq 's') {
        apply_cset($ps);
    }

    #
    # prepare update git's index, based on what arch knows
    # about the pset, resolve parents, etc
    #
    my $commitlog = `tla cat-archive-log -A $opt_A $ps->{id}`;
    die "Error in cat-archive-log: " . $child_error->() if $?;

    # parselog tells us which files to git-add/rm
    # NOTE: parselog will shell-quote filenames!
    my ($sum, $msg, $add, $del, $mod) = parselog($commitlog);
    # a log without summary or body must not put undef into the message
    my $logmessage = join("\n", map { defined $_ ? $_ : '' } $sum, $msg);

    # imports don't give us good info
    # on added files. Shame on them
    if ($ps->{type} eq 'i') {
        # prune the .git directory itself rather than filtering the file
        # list with a regex; the old pattern '^./.git' had unescaped dots
        # and also dropped unrelated top-level names such as ./agit
        `find . -path ./.git -prune -o -type f -print0 | xargs -0 git-update-cache --add`;
        die "Error adding imported files: " . $child_error->() if $?;
    }

    if (@$add) {
        $add = join(' ', @$add);
        `git-update-cache --add $add`;
        die "Error in git-update-cache --add: " . $child_error->() if $?;
    }
    if (@$del) {
        # the names are shell-quoted, so they must be removed through
        # the shell; unlink() would look for the quote characters
        $del = join(' ', @$del);
        `rm -- $del`;
        die "Problems deleting $del : " . $child_error->() if $?;
        `git-update-cache --remove $del`;
        die "Error in git-update-cache --remove: " . $child_error->() if $?;
    }
    if (@$mod) {
        $mod = join(' ', @$mod);
        `git-update-cache $mod`;
        die "Error in git-update-cache: " . $child_error->() if $?;
    }

    my $tree = `git-write-tree`;
    die "cannot write tree: " . $child_error->() if $?;
    chomp $tree;

    #
    # Commit and clean state
    #
    # the current head of the branch, if any, becomes the parent
    my @par;
    if (-e $headfile) {
        if (open(my $head, '<', $headfile)) {
            my $p = <$head>;
            close $head;
            if (defined $p) {
                chomp $p;
                push @par, '-p', $p if length $p;
            }
        } elsif ($ps->{type} eq 's') {
            warn "Could not find the right head for the branch $ps->{branch}";
        }
    }

    $ENV{TZ}                  = 'GMT';
    $ENV{GIT_AUTHOR_NAME}     = $ps->{author};
    $ENV{GIT_AUTHOR_EMAIL}    = $ps->{email};
    $ENV{GIT_AUTHOR_DATE}     = $ps->{date};
    $ENV{GIT_COMMITTER_NAME}  = $ps->{author};
    $ENV{GIT_COMMITTER_EMAIL} = $ps->{email};
    $ENV{GIT_COMMITTER_DATE}  = $ps->{date};
    print "\t+ commit date is  $ps->{date} \n";

    # git-commit-tree reads the log message on stdin and prints the id
    # of the new commit on stdout
    my $pid = open2(my $reader, my $writer, 'git-commit-tree', $tree, @par)
        or die "Cannot run git-commit-tree: $!";
    print {$writer} $logmessage;    # write
    close $writer;
    my $commitid = <$reader>;       # read
    close $reader;
    waitpid $pid, 0;                # reap; sets $?

    $commitid = '' unless defined $commitid;
    chomp $commitid;
    if ($? || length($commitid) != 40) {
        die "Something went wrong with the commit of $ps->{id} ("
            . $child_error->() . "): $commitid";
    }

    #
    # Update the branch
    #
    open(my $ref, '>', $headfile)
        or die "Cannot write $headfile: $!";
    print {$ref} $commitid
        or die "Cannot write $headfile: $!";
    close $ref
        or die "Cannot write $headfile: $!";
    unlink('.git/HEAD');    # may legitimately not exist yet
    symlink("refs/heads/$ps->{branch}", '.git/HEAD')
        or die "Cannot point .git/HEAD at refs/heads/$ps->{branch}: $!";

    print " * Committed $ps->{id}\n";
    print "   + tree   $tree\n";
    print "   + commit $commitid\n";
}

# Derive the git branch name from an arch revision id: everything up to
# and including the first "/" is dropped, and the first two "--"
# separated fields of the remainder are joined back with "--".
sub branchname {
    my ($id) = @_;

    (my $revision = $id) =~ s#^.+?/##;
    my @fields = split(m/--/, $revision);

    return join('--', $fields[0], $fields[1]);
}

# Bring the working tree in line with an arch import.
#
# Fetches revision $ps->{id} from archive $opt_A into "$tmp/import" with
# "tla get", rsyncs its contents over the current directory (leaving
# .git, .arch-ids and {arch} out) and removes the scratch copy again.
#
# Takes the patchset hash ref; returns 1.  Dies when any step fails.
sub apply_import {
    my $ps = shift;

    # Explain the failure recorded in $? by the last backtick command;
    # $! only means something when the command could not be started.
    my $why = sub {
        return "could not run: $!" if $? == -1;
        return 'exit status ' . ($? >> 8);
    };

    mkpath($tmp);   # croaks if the directory cannot be created

    `tla get -s --no-pristine -A $opt_A $ps->{id} $tmp/import`;
    die "Cannot get import: " . $why->() if $?;
    `rsync -v --archive --delete --exclude '.git' --exclude '.arch-ids' --exclude '{arch}' $tmp/import/* ./`;
    die "Cannot rsync import: " . $why->() if $?;

    `rm -fr $tmp/import`;
    die "Cannot remove tempdir: " . $why->() if $?;

    return 1;
}

# Apply an arch changeset to the working tree.
#
# Fetches changeset $ps->{id} from archive $opt_A into "$tmp/changeset",
# applies its *.patch files with patch(1), copies its *.modified
# (binary) files over their originals, rsyncs in the new files and
# removes the scratch copy.  Deleted files are not handled here.
#
# Takes the patchset hash ref; returns 1.  Dies when fetching, patching
# or copying a binary file fails.
sub apply_cset {
    my $ps = shift;

    # Explain the failure recorded in $? by the last backtick command;
    # $! only means something when the command could not be started.
    my $why = sub {
        return "could not run: $!" if $? == -1;
        return 'exit status ' . ($? >> 8);
    };

    mkpath($tmp);   # croaks if the directory cannot be created

    # get the changeset
    `tla get-changeset  -A $opt_A  $ps->{id} $tmp/changeset`;
    die "Cannot get changeset: " . $why->() if $?;

    # apply patches
    if (`find $tmp/changeset/patches -type f -name '*.patch'`) {
        # this can be sped up considerably by doing
        #    (find | xargs cat) | patch
        # but that can get mucked up by patches
        # with missing trailing newlines or the standard
        # 'missing newline' flag in the patch - possibly
        # produced with an old/buggy diff.
        # slow and safe, we invoke patch once per patchfile
        `find $tmp/changeset/patches -type f -name '*.patch' -print0 | grep -zv '{arch}' | xargs -iFILE -0 --no-run-if-empty patch -p1 --forward -iFILE`;
        die "Problem applying patches! " . $why->() if $?;
    }

    # apply changed binary files
    my @modified = `find $tmp/changeset/patches -type f -name '*.modified'`;
    foreach my $mod (@modified) {
        chomp $mod;
        # path inside the tree: drop the suffix and the scratch prefix
        (my $orig = $mod) =~ s/\.modified$//; # lazy
        $orig =~ s!^\Q$tmp\E/changeset/patches/!!;
        print "rsync -p '$mod' '$orig'\n";
        `rsync -p $mod ./$orig`;
        die "Problem applying binary changes! " . $why->() if $?;
    }

    # bring in new files
    # NOTE(review): the status is not checked, as before -- presumably
    # because a changeset without new files makes this rsync fail; confirm
    `rsync --archive --exclude '.git' --exclude '.arch-ids' --exclude '{arch}' $tmp/changeset/new-files-archive/* ./`;

    # deleted files are hinted from the commitlog processing

    `rm -fr $tmp/changeset`;

    return 1;
}


# =for reference
# A log entry looks like 
# Revision: moodle-org--moodle--1.3.3--patch-15
# Archive: arch-eduforge@catalyst.net.nz--2004
# Creator: Penny Leach <penny@catalyst.net.nz>
# Date: Wed May 25 14:15:34 NZST 2005
# Standard-date: 2005-05-25 02:15:34 GMT
# New-files: lang/de/.arch-ids/block_glossary_random.php.id
#     lang/de/.arch-ids/block_html.php.id
# New-directories: lang/de/help/questionnaire
#     lang/de/help/questionnaire/.arch-ids
# Removed-files: lang/be/docs/.arch-ids/release.html.id
#     lang/be/docs/.arch-ids/releaseold.html.id
# Modified-files: admin/cron.php admin/delete.php
#     admin/editor.html backup/lib.php backup/restore.php
# New-patches: arch-eduforge@catalyst.net.nz--2004/moodle-org--moodle--1.3.3--patch-15
# Summary: Updating to latest from MOODLE_14_STABLE (1.4.5+)
# Keywords:
#
# Updating yadda tadda tadda madda
# Parse an arch commit log as printed by "tla cat-archive-log".
#
# The log is a block of "Name: value" headers (a value may continue on
# indented lines), a blank line, and the free-form message body.
#
# Takes the log text.  Returns the list
#   ($summary, $message, \@added, \@removed, \@modified)
# where $summary is the trimmed Summary: header, $message the trimmed
# body (each '' when absent), and the three arrays hold the entries of
# New-files:, Removed-files: and Modified-files:.  Entries below {arch}/
# or .arch-ids/ are dropped, the remaining names are shell-quoted, and
# each list is in reverse log order (as callers have always received it).
#
# Prints the raw log and a dump of the result to STDOUT.
sub parselog {
    my $log = shift;
    print $log;

    # headers end at the first blank line; only they are searched for
    # fields, so header look-alikes in the body are not picked up
    my ($headers, $body) = split(m/\n\n/, $log, 2);
    $headers = '' unless defined $headers;
    $body    = '' unless defined $body;

    my %files_of;
    foreach my $header (qw(New-files Removed-files Modified-files)) {
        my @names;
        # the value runs until the next line that does not start with
        # whitespace, or until the end of the headers
        if ($headers =~ m/(?:\n|^)\Q$header\E:(.*?)(?=\n\S|\z)/s) {
            @names = split(' ', $1);
        }
        # filter with grep rather than "while (my $t = pop ...)", which
        # stopped at the first false name (a file called "0")
        @names = grep {
            length($_) && !m!\{arch\}/! && !m!\.arch-ids/!
        } @names;
        $files_of{$header} = [ reverse map { shell_quote($_) } @names ];
    }
    my @add = @{ $files_of{'New-files'} };
    my @del = @{ $files_of{'Removed-files'} };
    my @mod = @{ $files_of{'Modified-files'} };

    my $sum = '';
    if ($headers =~ m/^Summary:(.+?)$/m) {
        $sum = $1;
        $sum =~ s/^\s+//;
        $sum =~ s/\s+$//;
    }

    my $msg = $body;
    $msg =~ s/^\s+//;
    $msg =~ s/\s+$//;

    print Dumper [$sum, $msg, \@add, \@del, \@mod]; 
    return       ($sum, $msg, \@add, \@del, \@mod); 
}

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2005-08-24 12:14 UTC | newest]

Thread overview: (only message) (download: mbox.gz follow: Atom feed)
-- links below jump to the message on this page --
2005-08-24 12:14 git-archimport-script - 2nd iteration Martin Langhoff

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).