git-archimport-script - 2nd iteration

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Martin Langhoff <martin.langhoff@gmail.com>
To: GIT <git@vger.kernel.org>
Subject: git-archimport-script - 2nd iteration
Date: Thu, 25 Aug 2005 00:14:08 +1200	[thread overview]
Message-ID: <46a038f90508240514aa918ac@mail.gmail.com> (raw)

[-- Attachment #1: Type: text/plain, Size: 492 bytes --]

Gave the code another pass. The code should be more readable, and make
a bit more sense.

It now:
 - handles commit timestamps correctly
 - handles binary files correctly
 - uses parselog() to tell git-update-cache what's been
added/deleted/modified - much faster commits on large trees
 - gets the commit msgs mostly ok

In my immediate TODO:
 - handling renames
 - branches
 - incremental import

A bit further off:
 - file modes
 - merge detection

cheers,


martin

[-- Attachment #2: git-archimport-script --]
[-- Type: application/octet-stream, Size: 11396 bytes --]

#!/usr/bin/perl -w

# This tool is copyright (c) 2005, Matthias Urlichs.
# It is released under the Gnu Public License, version 2.
#
# The basic idea is to walk the output of tla abrowse, 
# fetch the changesets and apply them. 
#

=head1 Devel tricks

Add print in front of the shell commands invoked via backticks. 

=head2 TODO

 - deal with renames correctly
 - deal with branches correctly
 - keep track of merged patches, and mark a git merge when it happens
 - smarter rules to parse the archive history "up" and "down"
 - be able to continue an import where we left off

=cut

use strict;
use warnings;
use Getopt::Std;
use File::Spec;
use File::Temp qw(tempfile);
use File::Path qw(mkpath);
use File::Basename qw(basename dirname);
use String::ShellQuote;
use Time::Local;
use IO::Socket;
use IO::Pipe;
use POSIX qw(strftime dup2);
use Data::Dumper qw/ Dumper /;
use IPC::Open2;

$SIG{'PIPE'}="IGNORE";
$ENV{'TZ'}="UTC";

our($opt_h,$opt_A,$opt_v,$opt_k,
    $opt_d,$opt_p,$opt_C,$opt_z,
    $opt_i,$opt_t);

# Print the command-line synopsis to STDERR and leave with exit status 1.
sub usage() {
    my $script = basename($0);
    print STDERR
        "Usage: $script     # fetch/update GIT from Arch\n",
        "       [ -h ] [ -v ] [ -A archive ]\n",
        "       [ -C GIT_repository ] [ -t tempdir ] \n",
        "       [ arch-branch ]\n";
    exit(1);
}

# Parse the command line: boolean flags -h, -v, -i and the value-taking
# options -A, -C and -t. Any parse error, or -h, prints the usage text.
getopts("hviA:C:t:") or usage();
usage if $opt_h;

# At most one positional argument (the arch branch) is accepted.
usage() unless @ARGV <= 1;

# Scratch area: <-t tempdir, or /tmp>/git-archimport/
my $tmp = ($opt_t || '/tmp') . '/git-archimport/';

# NOTE(review): $git_tree is assigned here but is not read by any code
# visible in this file - confirm whether -C is meant to be honoured.
my $git_tree = $opt_C || ".";

# Optional arch branch given on the command line; empty string otherwise.
my $arch_branch = (@ARGV == 1) ? $ARGV[0] : '';

# TODO: handle more than one repo
#
# Run "tla abrowse" for the requested archive/branch and collect one hash
# per patchset in @psets. Keys filled in: id, type ('s' simple changeset,
# 'i' initial import, 't' tag revision), tag (tag revisions only), date,
# author, email, subj, and merges (array ref, only when a "merges in:"
# section follows the subject).

# Every tla invocation in this script passes -A, so refuse to go on
# without an archive name instead of building a malformed command line.
defined $opt_A
    or die "No archive given: please pass -A archive\n";

# List-form pipe open: the archive and branch names reach tla verbatim
# instead of being re-parsed by a shell.
my @abrowse_cmd = ('tla', 'abrowse', '-f', '-A', $opt_A, '--desc', '--merges');
push @abrowse_cmd, $arch_branch if length $arch_branch;
open(my $abrowse, '-|', @abrowse_cmd)
    or die "Problems with tla abrowse: $!";

my @psets  = (); # the collection
my %ps        = (); # the current one
my $mode      = '';
my $lastseen  = ''; # which field of the current record was parsed last

while (<$abrowse>) {
    chomp;

    # first record padded w 8 spaces
    if (s/^\s{8}\b//) {

        # store the record we just captured
        if (%ps) {
            my %temp = %ps; # break references
            push (@psets, \%temp);
            %ps = ();
        }

        my ($id, $type) = split(m/\s{3}/, $_);
        $type = '' unless defined $type; # id line without a type column
        $ps{id} = $id;

        # deal with types
        if ($type =~ m/^\(simple changeset\)/) {
            $ps{type} = 's';
        } elsif ($type eq '(initial import)') {
            $ps{type} = 'i';
        } elsif ($type =~ m/^\(tag revision of (.+)\)/) {
            $ps{type} = 't';
            $ps{tag}  = $1;
        } else {
            warn "Unknown type $type";
        }
        $lastseen = 'id';
    }

    if (s/^\s{10}//) {
        # 10 leading spaces or more
        # indicate commit metadata

        # date & author
        if ($lastseen eq 'id' && m/^\d{4}-\d{2}-\d{2}/) {

            my ($date, $authoremail) = split(m/\s{2,}/, $_);
            $ps{date}   = $date;
            $ps{date}   =~ s/\bGMT$//; # strip off trailing GMT
            if ($ps{date} =~ m/\b\w+$/) {
                warn 'Arch dates not in GMT?! - imported dates will be wrong';
            }

            # Only use the captures when the match succeeded; after a
            # failed match $1/$2 do not describe this line.
            if (defined $authoremail
                && $authoremail =~ m/^(.+)\s(\S+)$/) {
                $ps{author} = $1;
                $ps{email}  = $2;
            } else {
                warn "Cannot parse author and email for $ps{id}";
            }

            $lastseen = 'date';

        } elsif ($lastseen eq 'date') {
            # the only hint is position
            # subject is after date
            $ps{subj} = $_;
            $lastseen = 'subj';

        } elsif ($lastseen eq 'subj' && $_ eq 'merges in:') {
            $ps{merges} = [];
            $lastseen = 'merges';

        } elsif ($lastseen eq 'merges' && s/^\s{2}//) {
            push (@{$ps{merges}}, $_);
        } else {
            warn 'more metadata after merges!?';
        }

    }
}

# flush the record that was still being collected at end of input
if (%ps) {
    my %temp = %ps; # break references
    push (@psets, \%temp);
    %ps = ();
}

# close() on a pipe waits for the child and reports its exit status, so
# this is where a failing tla becomes visible.
close($abrowse)
    or die "tla abrowse failed: "
         . ($! ? $! : 'exit status ' . ($? >> 8)) . "\n";

## Order patches by time; patchsets sharing a timestamp are ordered by id
## so the result is deterministic. (The tie-break has to compare $a's id
## with $b's id - using $b's id on both sides makes it a no-op.)
@psets = sort { $a->{date} cmp $b->{date} || $a->{id} cmp $b->{id} } @psets;

#print Dumper \@psets;

##
## TODO cleanup irrelevant patches
##      and put an initial import
##      or a full tag

# With -i a fresh git repository is created, which only makes sense when
# the oldest patchset is an initial import or a tag revision.
if ($opt_i) { # initial import
    @psets
        or die "Nothing to import: tla abrowse reported no patchsets\n";

    my $first_type = defined $psets[0]{type} ? $psets[0]{type} : '';
    if ($first_type eq 'i' || $first_type eq 't') {
        print "Starting import from $psets[0]{id}\n";
    } else {
        die "Need to start from an import or a tag -- cannot use $psets[0]{id}";
    }

    `git-init-db`;
    # $? holds the child's wait status; $! is only meaningful when the
    # command could not be started at all ($? == -1).
    die "git-init-db failed: "
      . ($? == -1 ? $! : 'exit status ' . ($? >> 8)) . "\n" if $?;
}

# process
my $lastbranch = @psets ? branchname($psets[0]{id}) : ''; # only good for initial import

# Describe why the most recent external command failed: $! when it could
# not be started ($? == -1), otherwise the signal or exit status from $?.
my $cmd_failure = sub {
    return $! if $? == -1;
    return 'killed by signal ' . ($? & 127) if $? & 127;
    return 'exit status ' . ($? >> 8);
};

# Replay every patchset in order: check out its branch, apply the import
# or changeset to the working tree, update the index from the arch log,
# then write a tree and a commit and advance the branch ref.
foreach my $ps (@psets) {

    $ps->{branch} =  branchname($ps->{id});
    my $headfile  = ".git/refs/heads/$ps->{branch}";

    #
    # ensure we have a clean state
    #
    my $dirty = `git diff-files`;
    die "Cannot run git diff-files: " . $cmd_failure->() if $?;
    if (defined $dirty && length $dirty) {
        die "Unclean tree when about to process $ps->{id} " .
            " - did we fail to commit cleanly before?";
    }

    #
    # create the branch if needed
    # TODO: Find the ancestor!
    #
    if (-e $headfile) {
        `git checkout    $ps->{branch}`;
    } else {
        `git checkout -b $ps->{branch}`;
    }
    die "git checkout of $ps->{branch} failed: " . $cmd_failure->() if $?;

    #
    # Apply the import/changeset/merge into the working tree
    #
    if ($ps->{type} eq 'i') {
        apply_import($ps) or die "Import of $ps->{id} failed";
    } elsif ($ps->{type} eq 's') {
        apply_cset($ps);
    }

    #
    # prepare update git's index, based on what arch knows
    # about the pset, resolve parents, etc
    #
    my $commitlog = `tla cat-archive-log -A $opt_A $ps->{id}`;
    die "Error in cat-archive-log: " . $cmd_failure->() if $?;

    # parselog extracts the summary, the message body and the lists of
    # added/removed/modified files from the log.
    # NOTE: parselog will shell-quote filenames!
    my ($sum, $msg, $add, $del, $mod) = parselog($commitlog);

    # Either part may be missing from the log; treat it as empty rather
    # than interpolating undef.
    my $logmessage = join("\n", map { defined $_ ? $_ : '' } $sum, $msg);

    # imports don't give us good info
    # on added files. Shame on them
    if ($ps->{type} eq 'i') {
        # Prune the .git directory itself instead of filtering with an
        # unanchored regex, which also dropped files such as ./.gitignore.
        `find . -path ./.git -prune -o -type f -print0 | xargs -0 git-update-cache --add`;
        die "Error adding imported files: " . $cmd_failure->() if $?;
    }

    if (@$add) {
        my $files = join(' ', @$add);
        `git-update-cache --add $files`;
        die "Error in git-update-cache --add: " . $cmd_failure->() if $?;
    }
    if (@$del) {
        foreach my $file (@$del) {
            # The names are already shell-quoted, so they cannot be handed
            # to unlink() as they are; let the shell undo the quoting.
            `rm -- $file`;
            die "Problems deleting $file : " . $cmd_failure->() if $?;
        }
        my $files = join(' ', @$del);
        `git-update-cache --remove $files`;
        die "Error in git-update-cache --remove: " . $cmd_failure->() if $?;
    }
    if (@$mod) {
        my $files = join(' ', @$mod);
        `git-update-cache $files`;
        die "Error in git-update-cache: " . $cmd_failure->() if $?;
    }

    my $tree = `git-write-tree`;
    die "cannot write tree " . $cmd_failure->() if $?;
    chomp $tree;

    #
    # Commit and clean state
    #
    # The current branch head, when there is one, becomes the parent.
    my @par;
    if (-e $headfile) {
        if (open(my $head_in, '<', $headfile)) {
            my $p = <$head_in>;
            close $head_in;
            if (defined $p) {
                chomp $p;
                push @par, '-p', $p if length $p;
            }
        } elsif ($ps->{type} eq 's') {
            warn "Could not find the right head for the branch $ps->{branch}";
        }
    }
    my $par = join (' ', @par);
    $ENV{TZ}                  = 'GMT';
    $ENV{GIT_AUTHOR_NAME}     = $ps->{author};
    $ENV{GIT_AUTHOR_EMAIL}    = $ps->{email};
    $ENV{GIT_AUTHOR_DATE}     = $ps->{date};
    $ENV{GIT_COMMITTER_NAME}  = $ps->{author};
    $ENV{GIT_COMMITTER_EMAIL} = $ps->{email};
    $ENV{GIT_COMMITTER_DATE}  = $ps->{date};
    print "\t+ commit date is  $ps->{date} \n";

    # git-commit-tree reads the log message on stdin and prints the id of
    # the new commit on stdout.
    my $pid = open2(*READER, *WRITER, "git-commit-tree $tree $par")
        or die $!;
    print WRITER $logmessage; # write
    close WRITER;
    my $commitid = <READER>; # read
    close READER;
    waitpid $pid,0;           # reap the child; sets $?

    $commitid = '' unless defined $commitid;
    chomp $commitid;

    if ($? || length $commitid != 40) {
        die "Something went wrong with the commit! "
          . $cmd_failure->() . " $commitid";
    }

    #
    # Update the branch
    #
    open(my $head_out, '>', $headfile)
        or die "Cannot open $headfile for writing: $!";
    print {$head_out} $commitid
        or die "Cannot write $headfile: $!";
    close($head_out)
        or die "Cannot write $headfile: $!";

    # Re-point HEAD at the branch; the old link may legitimately be absent.
    unlink ('.git/HEAD');
    symlink("refs/heads/$ps->{branch}",".git/HEAD")
        or die "Cannot point .git/HEAD at refs/heads/$ps->{branch}: $!";

    print " * Committed $ps->{id}\n";
    print "   + tree   $tree\n";
    print "   + commit $commitid\n";
}

# Reduce a patchset id to its first two "--"-separated components.
# Anything up to and including the first "/" is dropped first, so
# "archive/category--branch--version--patch-N" yields "category--branch".
sub branchname {
    my ($revision) = @_;

    (my $unqualified = $revision) =~ s#^.+?/##;
    my @fields = split(m/--/, $unqualified);

    return join('--', $fields[0], $fields[1]);
}

# Fetch the tree of an import revision with "tla get" into a scratch
# directory below $tmp and rsync its contents into the current directory
# (skipping .git, .arch-ids and {arch}); the scratch copy is removed
# afterwards.
#
# Takes the patchset hash ref (only {id} is read) and uses the file-level
# $tmp and $opt_A. Returns 1; dies when an external command fails.
sub apply_import {
    my $ps = shift;

    # Explain a failed command: $! if it could not be started, else its
    # exit status ($! says nothing about a command that ran and failed).
    my $why = sub { $? == -1 ? $! : 'exit status ' . ($? >> 8) };

    # Everything interpolated into a shell command is quoted, so that a
    # temp path or revision name containing spaces or shell metacharacters
    # is passed through intact.
    my $q_tmp     = shell_quote($tmp);
    my $q_import  = shell_quote("$tmp/import");
    my $q_archive = shell_quote($opt_A);
    my $q_id      = shell_quote($ps->{id});

    `mkdir -p $q_tmp`;
    die "Cannot create tempdir: " . $why->() if $?;

    `tla get -s --no-pristine -A $q_archive $q_id $q_import`;
    die "Cannot get import: " . $why->() if $?;

    # The glob is left outside the quoted path so the shell still expands it.
    `rsync -v --archive --delete --exclude '.git' --exclude '.arch-ids' --exclude '{arch}' $q_import/* ./`;
    die "Cannot rsync import:" . $why->() if $?;

    `rm -fr $q_import`;
    die "Cannot remove tempdir: " . $why->() if $?;

    return 1;
}

# Fetch a changeset with "tla get-changeset" into a scratch directory
# below $tmp and apply it to the current directory: text patches through
# patch(1), "*.modified" files by copying them over the originals, and
# new files by rsync. Deletions are not handled here.
#
# Takes the patchset hash ref (only {id} is read) and uses the file-level
# $tmp and $opt_A. Returns 1; dies when a checked external command fails.
sub apply_cset {
    my $ps = shift;

    # Explain a failed command: $! if it could not be started, else its
    # exit status ($! says nothing about a command that ran and failed).
    my $why = sub { $? == -1 ? $! : 'exit status ' . ($? >> 8) };

    # Quote everything that is interpolated into a shell command so that
    # paths containing spaces or metacharacters survive.
    my $q_tmp     = shell_quote($tmp);
    my $q_cset    = shell_quote("$tmp/changeset");
    my $q_archive = shell_quote($opt_A);
    my $q_id      = shell_quote($ps->{id});

    `mkdir -p $q_tmp`;
    die "Cannot create tempdir: " . $why->() if $?;

    # get the changeset
    `tla get-changeset  -A $q_archive  $q_id $q_cset`;
    die "Cannot get changeset: " . $why->() if $?;

    # apply patches
    if (`find $q_cset/patches -type f -name '*.patch'`) {
        # this can be sped up considerably by doing
        #    (find | xargs cat) | patch
        # but that can get mucked up by patches
        # with missing trailing newlines or the standard
        # 'missing newline' flag in the patch - possibly
        # produced with an old/buggy diff.
        # slow and safe, we invoke patch once per patchfile
        `find $q_cset/patches -type f -name '*.patch' -print0 | grep -zv '{arch}' | xargs -iFILE -0 --no-run-if-empty patch -p1 --forward -iFILE`;
        die "Problem applying patches! " . $why->() if $?;
    }

    # apply changed binary files
    if (my @modified = `find $q_cset/patches -type f -name '*.modified'`) {
        foreach my $mod (@modified) {
            chomp $mod;
            # map <changeset>/patches/<path>.modified to <path> in the tree
            my $orig = $mod;
            $orig =~ s/\.modified$//; # lazy
            $orig =~ s!^\Q$tmp\E/changeset/patches/!!;
            print "rsync -p '$mod' '$orig'";
            my $q_mod  = shell_quote($mod);
            my $q_orig = shell_quote("./$orig");
            `rsync -p $q_mod $q_orig`;
            die "Problem applying binary changes! " . $why->() if $?;
        }
    }

    # bring in new files
    # NOTE(review): the status is not checked, as before - presumably
    # because the glob matches nothing for changesets without new files.
    `rsync --archive --exclude '.git' --exclude '.arch-ids' --exclude '{arch}' $q_cset/new-files-archive/* ./`;

    # deleted files are hinted from the commitlog processing

    # A leftover directory would get in the way of the next changeset.
    `rm -fr $q_cset`;
    die "Cannot remove tempdir: " . $why->() if $?;

    return 1;
}


# For reference:
# A log entry looks like
# Revision: moodle-org--moodle--1.3.3--patch-15
# Archive: arch-eduforge@catalyst.net.nz--2004
# Creator: Penny Leach <penny@catalyst.net.nz>
# Date: Wed May 25 14:15:34 NZST 2005
# Standard-date: 2005-05-25 02:15:34 GMT
# New-files: lang/de/.arch-ids/block_glossary_random.php.id
#     lang/de/.arch-ids/block_html.php.id
# New-directories: lang/de/help/questionnaire
#     lang/de/help/questionnaire/.arch-ids
# Removed-files: lang/be/docs/.arch-ids/release.html.id
#     lang/be/docs/.arch-ids/releaseold.html.id
# Modified-files: admin/cron.php admin/delete.php
#     admin/editor.html backup/lib.php backup/restore.php
# New-patches: arch-eduforge@catalyst.net.nz--2004/moodle-org--moodle--1.3.3--patch-15
# Summary: Updating to latest from MOODLE_14_STABLE (1.4.5+)
# Keywords:
#
# Updating yadda tadda tadda madda
# Parse the text of an arch patch log.
#
# Takes the log as one string and returns the list
#   ($summary, $message, \@added, \@removed, \@modified)
# where $summary is the trimmed "Summary:" header, $message is the trimmed
# text after the first blank line, and the three array refs hold the
# entries of the New-files, Removed-files and Modified-files headers.
# Entries under {arch}/ or .arch-ids/ are dropped and every remaining name
# is passed through shell_quote(). $summary and $message are empty strings
# when the log does not contain them.
#
# The log and the parsed result are also printed to STDOUT.
sub parselog {
    my $log = shift;
    print $log;

    my (@add, @del, @mod);
    my ($sum, $msg) = ('', '');

    # A file-list header runs up to the next line that starts with a word
    # character, so indented continuation lines are included.
    if ($log =~ m/(?:\n|^)New-files:(.*?)(?=\n\w)/s ) {
        my $files = $1;
        @add = split(m/\s+/s, $files);
    }

    if ($log =~ m/(?:\n|^)Removed-files:(.*?)(?=\n\w)/s ) {
        my $files = $1;
        @del = split(m/\s+/s, $files);
    }

    if ($log =~ m/(?:\n|^)Modified-files:(.*?)(?=\n\w)/s ) {
        my $files = $1;
        @mod = split(m/\s+/s, $files);
    }

    if ($log =~ m/^Summary:(.+?)$/m ) {
        $sum = $1;
        $sum =~ s/^\s+//;
        $sum =~ s/\s+$//;
    }

    if ($log =~ m/\n\n(.+)$/s) {
        $msg = $1;
        $msg =~ s/^\s+//;
        $msg =~ s/\s+$//;
    }

    # cleanup the arrays
    foreach my $ref ( (\@add, \@del, \@mod) ) {
        my @tmp = ();
        # Visit every entry, last to first (the order this function has
        # always returned). Testing the popped value for truth instead
        # would stop at a file named "0" and lose everything before it.
        foreach my $t (reverse @$ref) {
            next unless length ($t);
            next if $t =~ m!\{arch\}/!;
            next if $t =~ m!\.arch-ids/!;
            push (@tmp, shell_quote($t));
        }
        @$ref = @tmp;
    }

    print Dumper [$sum, $msg, \@add, \@del, \@mod];
    return       ($sum, $msg, \@add, \@del, \@mod);
}

                 reply	other threads:[~2005-08-24 12:14 UTC|newest]

Thread overview: [no followups] expand[flat|nested]  mbox.gz  Atom feed

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=46a038f90508240514aa918ac@mail.gmail.com \
    --to=martin.langhoff@gmail.com \
    --cc=git@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.