* git-archimport-script - 2nd iteration
@ 2005-08-24 12:14 Martin Langhoff
0 siblings, 0 replies; only message in thread
From: Martin Langhoff @ 2005-08-24 12:14 UTC (permalink / raw)
To: GIT
[-- Attachment #1: Type: text/plain, Size: 492 bytes --]
Gave the code another pass. The code should be more readable, and make
a bit more sense.
It now:
- handles commit timestamps correctly
- handles binary files correctly
- uses parselog() to tell git-update-cache what's been
added/deleted/modified - much faster commits on large trees
- gets the commit msgs mostly ok
In my immediate TODO:
- handling renames
- branches
- incremental import
A bit further off:
- file modes
- merge detection
cheers,
martin
[-- Attachment #2: git-archimport-script --]
[-- Type: application/octet-stream, Size: 11396 bytes --]
#!/usr/bin/perl -w
# This tool is copyright (c) 2005, Matthias Urlichs.
# It is released under the Gnu Public License, version 2.
#
# The basic idea is to walk the output of tla abrowse,
# fetch the changesets and apply them.
#
=head1 Devel tricks
Add print in front of the shell commands invoked via backticks.
=head2 TODO
- deal with renames correctly
- deal with branches correctly
- keep track of merged patches, and mark a git merge when it happens
- smarter rules to parse the archive history "up" and "down"
- be able to continue an import where we left off
=cut
use strict;
use warnings;
use Getopt::Std;
use File::Spec;
use File::Temp qw(tempfile);
use File::Path qw(mkpath);
use File::Basename qw(basename dirname);
use String::ShellQuote;
use Time::Local;
use IO::Socket;
use IO::Pipe;
use POSIX qw(strftime dup2);
use Data::Dumper qw/ Dumper /;
use IPC::Open2;
# Process-wide setup: ignore SIGPIPE, force the timezone to UTC, and declare
# the package globals that Getopt::Std fills in for each command-line switch.
$SIG{PIPE} = "IGNORE";
$ENV{TZ}   = "UTC";

our (
    $opt_h, $opt_A, $opt_v, $opt_k,
    $opt_d, $opt_p, $opt_C, $opt_z,
    $opt_i, $opt_t,
);
# Print the command-line synopsis on STDERR and exit with status 1.
#
# The synopsis lists -i (start with an initial import), which getopts accepts
# and the script acts on, and shows -A without brackets because every tla
# command line in this script is built with "-A $opt_A".
sub usage() {
    print STDERR <<END;
Usage: ${\basename $0} # fetch/update GIT from Arch
[ -h ] [ -v ] [ -i ] -A archive
[ -C GIT_repository ] [ -t tempdir ]
[ arch-branch ]
END
    exit(1);
}
# Parse the command line.  Switches land in the $opt_* package globals;
# at most one positional argument (the Arch branch to browse) is accepted.
getopts("hviA:C:t:") or usage();
usage() if $opt_h;
@ARGV <= 1 or usage();

# The tla command lines built later in this script interpolate $opt_A
# directly after "-A"; without a value they are malformed, so refuse to
# start instead of failing later with uninitialized-value warnings.
unless (defined $opt_A && length $opt_A) {
    print STDERR "No archive given: the -A option is required\n";
    usage();
}

# Scratch area for tla checkouts and changesets (default: under /tmp).
my $tmp = $opt_t || '/tmp';
$tmp .= '/git-archimport/';

# Location of the git repository given with -C (default: current directory).
my $git_tree = $opt_C || ".";

# Optional branch limit handed to "tla abrowse"; empty means no limit.
my $arch_branch = @ARGV ? $ARGV[0] : '';
# TODO: handle more than one repo
#
# Read the archive history printed by "tla abrowse" and collect one hash per
# patchset in @psets.  Each record may carry the keys id, type ('s' simple
# changeset, 'i' initial import, 't' tag), tag, date, author, email, subj
# and merges (array ref).
#
# The command is run with list-form open, so no shell is involved and the
# archive and branch names are passed to tla verbatim.
my @abrowse_cmd = ('tla', 'abrowse', '-f');
push @abrowse_cmd, '-A', $opt_A if defined $opt_A;
push @abrowse_cmd, '--desc', '--merges';
push @abrowse_cmd, $arch_branch if length $arch_branch;
open(my $abrowse, '-|', @abrowse_cmd)
    or die "Problems with tla abrowse: $!";

my @psets = ();     # the collection
my %ps = ();        # the current one
my $mode = '';      # currently unused
my $lastseen = '';  # which field of the current record was parsed last

while (my $line = <$abrowse>) {
    chomp $line;

    # first record padded w 8 spaces
    if ($line =~ s/^\s{8}\b//) {
        # store the record we just captured
        if (%ps) {
            push @psets, { %ps };   # copy, so records do not share storage
            %ps = ();
        }

        my ($id, $type) = split(m/\s{3}/, $line);
        $type = '' unless defined $type;    # id line without a type column
        $ps{id} = $id;

        # deal with types
        if ($type =~ m/^\(simple changeset\)/) {
            $ps{type} = 's';
        } elsif ($type eq '(initial import)') {
            $ps{type} = 'i';
        } elsif ($type =~ m/^\(tag revision of (.+)\)/) {
            $ps{type} = 't';
            $ps{tag} = $1;
        } else {
            warn "Unknown type $type";
        }
        $lastseen = 'id';
    }
    elsif ($line =~ s/^\s{10}//) {
        # 10 leading spaces or more
        # indicate commit metadata

        # date & author
        if ($lastseen eq 'id' && $line =~ m/^\d{4}-\d{2}-\d{2}/) {
            my ($date, $authoremail) = split(m/\s{2,}/, $line);
            $ps{date} = $date;
            $ps{date} =~ s/\bGMT$//;    # strip off trailing GMT
            if ($ps{date} =~ m/\b\w+$/) {
                warn 'Arch dates not in GMT?! - imported dates will be wrong';
            }

            # Only trust the captures when the match succeeded; otherwise
            # fall back to the raw text so the commit environment is never
            # populated with undef.
            if (defined $authoremail && $authoremail =~ m/^(.+)\s(\S+)$/) {
                $ps{author} = $1;
                $ps{email} = $2;
            } else {
                $authoremail = '' unless defined $authoremail;
                warn "Cannot split author and email in '$authoremail' for $ps{id}";
                $ps{author} = $authoremail;
                $ps{email} = $authoremail;
            }
            $lastseen = 'date';
        } elsif ($lastseen eq 'date') {
            # the only hint is position
            # subject is after date
            $ps{subj} = $line;
            $lastseen = 'subj';
        } elsif ($lastseen eq 'subj' && $line eq 'merges in:') {
            $ps{merges} = [];
            $lastseen = 'merges';
        } elsif ($lastseen eq 'merges' && $line =~ s/^\s{2}//) {
            push @{ $ps{merges} }, $line;
        } else {
            warn 'more metadata after merges!?';
        }
    }
}

# store the last record
if (%ps) {
    push @psets, { %ps };
    %ps = ();
}

# close() on a pipe reports the exit status of the child: do not go on
# with a partial or empty history if tla failed.
close($abrowse)
    or die "tla abrowse failed: " . ($! ? $! : "exit status " . ($? >> 8));
## Order patches by time, using the revision id to break ties.
## (Both sides of the comparison must use their own id: comparing
## $b->{id} with itself leaves the order of same-date patches undefined.)
@psets = sort { $a->{date} . $a->{id} cmp $b->{date} . $b->{id} } @psets;

#print Dumper \@psets;

##
## TODO cleanup irrelevant patches
## and put an initial import
## or a full tag

if ($opt_i) { # initial import
    # A fresh repository can only be seeded from an import or a tag.
    if ($psets[0]{type} eq 'i' || $psets[0]{type} eq 't') {
        print "Starting import from $psets[0]{id}\n";
    } else {
        die "Need to start from an import or a tag -- cannot use $psets[0]{id}";
    }
    `git-init-db`;
    die $! if $?;
}
# process
#
# Replay every patchset, oldest first: switch to (or create) its git branch,
# bring the working tree up to date from Arch, update the index from the
# file lists in the Arch log, write the tree, commit it with the original
# author and date, and finally move the branch head and .git/HEAD.
my $lastbranch = branchname($psets[0]{id}); # only good for initial import

foreach my $ps (@psets) {

    $ps->{branch} = branchname($ps->{id});
    my $headref = ".git/refs/heads/$ps->{branch}";

    #
    # ensure we have a clean state
    #
    if (`git diff-files`) {
        die "Unclean tree when about to process $ps->{id} " .
            " - did we fail to commit cleanly before?";
    }
    die $! if $?;

    #
    # create the branch if needed
    # TODO: Find the ancestor!
    #
    if (-e $headref) {
        `git checkout $ps->{branch}`;
    } else {
        `git checkout -b $ps->{branch}`;
    }
    die $! if $?;

    #
    # Apply the import/changeset/merge into the working tree
    #
    if ($ps->{type} eq 'i') {
        apply_import($ps) or die $!;
    } elsif ($ps->{type} eq 's') {
        apply_cset($ps);
    }

    #
    # prepare update git's index, based on what arch knows
    # about the pset, resolve parents, etc
    #
    my $commitlog = `tla cat-archive-log -A $opt_A $ps->{id}`;
    die "Error in cat-archive-log: $!" if $?;

    # parselog extracts the summary, the message body and the lists of
    # added/removed/modified files from the log.
    # NOTE: parselog will shell-quote filenames!
    my ($sum, $msg, $add, $del, $mod) = parselog($commitlog);
    $sum = '' unless defined $sum;  # a log without Summary or body must
    $msg = '' unless defined $msg;  # not put undef into the commit message
    my $logmessage = "$sum\n$msg";

    # imports don't give us good info
    # on added files. Shame on them
    if ($ps->{type} eq 'i') {
        `find . -type f -print0 | grep -zv '^./.git' | xargs -0 git-update-cache --add`;
    }

    if (@$add) {
        my $added = join(' ', @$add);
        `git-update-cache --add $added`;
        die "Error in git-update-cache --add: $!" if $?;
    }

    if (@$del) {
        my $removed = join(' ', @$del);
        # The names are shell-quoted, so they must be removed through the
        # shell: Perl's unlink would look for a file whose name literally
        # contains the quote characters.
        `rm -- $removed`;
        die "Problems deleting $removed (rm exit status " . ($? >> 8) . ")"
            if $?;
        `git-update-cache --remove $removed`;
        die "Error in git-update-cache --remove: $!" if $?;
    }

    if (@$mod) {
        my $modified = join(' ', @$mod);
        `git-update-cache $modified`;
        die "Error in git-update-cache: $!" if $?;
    }

    # warn "errors when runnign git-update-cache! $!";
    my $tree = `git-write-tree`;
    die "cannot write tree $!" if $?;
    chomp $tree;

    #
    # Commit and clean state
    #
    # The parent is whatever the branch head currently points at.
    my @par;
    if (-e $headref) {
        if (open(my $head_in, '<', $headref)) {
            my $p = <$head_in>;
            close $head_in;
            chomp $p;
            push @par, '-p', $p;
        } else {
            if ($ps->{type} eq 's') {
                warn "Could not find the right head for the branch $ps->{branch}";
            }
        }
    }
    my $par = join(' ', @par);

    # git-commit-tree takes author, committer and dates from the environment.
    $ENV{TZ} = 'GMT';
    $ENV{GIT_AUTHOR_NAME} = $ps->{author};
    $ENV{GIT_AUTHOR_EMAIL} = $ps->{email};
    $ENV{GIT_AUTHOR_DATE} = $ps->{date};
    $ENV{GIT_COMMITTER_NAME} = $ps->{author};
    $ENV{GIT_COMMITTER_EMAIL} = $ps->{email};
    $ENV{GIT_COMMITTER_DATE} = $ps->{date};

    print "\t+ commit date is $ps->{date} \n";

    # Feed the log message to git-commit-tree and read back the commit id.
    my $pid = open2(*READER, *WRITER, "git-commit-tree $tree $par")
        or die $!;
    print WRITER $logmessage;   # write
    close WRITER;
    my $commitid = <READER>;    # read
    $commitid = '' unless defined $commitid;    # child printed nothing
    chomp $commitid;
    close READER;
    waitpid $pid, 0;            # close;

    if (length($commitid) != 40) {
        die "Something went wrong with the commit! $! $commitid";
    }

    #
    # Update the branch
    #
    # A failure here would leave the branch pointing at the old commit,
    # so every step is checked.
    open(my $head_out, '>', $headref)
        or die "Cannot open $headref for writing: $!";
    print {$head_out} $commitid
        or die "Cannot write $headref: $!";
    close($head_out)
        or die "Cannot close $headref: $!";

    unlink('.git/HEAD');
    symlink("refs/heads/$ps->{branch}", ".git/HEAD")
        or die "Cannot point .git/HEAD at refs/heads/$ps->{branch}: $!";

    print " * Committed $ps->{id}\n";
    print " + tree $tree\n";
    print " + commit $commitid\n";
}
# Derive the branch name from a revision id: drop everything up to and
# including the first "/", then keep the first two "--"-separated fields.
sub branchname {
    my ($revision) = @_;

    $revision =~ s#^.+?/##;
    my @fields = split(m/--/, $revision);

    return join('--', $fields[0], $fields[1]);
}
# Fetch the full tree of an import revision with "tla get" into the scratch
# directory and mirror it into the current directory.
#
# Takes the patchset hash ref (only {id} is read).  Returns 1 on success and
# dies if any of the external commands fails.
sub apply_import {
    my $ps = shift;

    `mkdir -p $tmp`;
    die "Cannot create tempdir: $!" if $?;

    `tla get -s --no-pristine -A $opt_A $ps->{id} $tmp/import`;
    die "Cannot get import: $!" if $?;

    # The source is given as "import/" rather than "import/*": a shell glob
    # skips top-level dotfiles and restricts --delete to subdirectories,
    # while a trailing slash makes rsync mirror the whole directory content.
    # Excluded names (.git, .arch-ids, {arch}) are neither copied nor deleted.
    `rsync -v --archive --delete --exclude '.git' --exclude '.arch-ids' --exclude '{arch}' $tmp/import/ ./`;
    die "Cannot rsync import:$!" if $?;

    `rm -fr $tmp/import`;
    die "Cannot remove tempdir: $!" if $?;

    return 1;
}
# Fetch a changeset with "tla get-changeset" and apply it to the current
# directory: text patches first, then replaced binary files, then new files.
# Deleted files are not handled here; they are hinted from the commitlog
# processing.
#
# Takes the patchset hash ref (only {id} is read).  Returns 1 on success and
# dies if any of the external commands fails.
sub apply_cset {
    my $ps = shift;

    `mkdir -p $tmp`;
    die "Cannot create tempdir: $!" if $?;

    # get the changeset
    `tla get-changeset -A $opt_A $ps->{id} $tmp/changeset`;
    die "Cannot get changeset: $!" if $?;

    # apply patches
    if (`find $tmp/changeset/patches -type f -name '*.patch'`) {
        # this can be sped up considerably by doing
        #    (find | xargs cat) | patch
        # but that can get mucked up by patches
        # with missing trailing newlines or the standard
        # 'missing newline' flag in the patch - possibly
        # produced with an old/buggy diff.
        # slow and safe, we invoke patch once per patchfile
        `find $tmp/changeset/patches -type f -name '*.patch' -print0 | grep -zv '{arch}' | xargs -iFILE -0 --no-run-if-empty patch -p1 --forward -iFILE`;
        die "Problem applying patches! $!" if $?;
    }

    # apply changed binary files
    if (my @modified = `find $tmp/changeset/patches -type f -name '*.modified'`) {
        foreach my $mod (@modified) {
            chomp $mod;
            # The working-tree path is the changeset path without the
            # scratch prefix and the ".modified" suffix.
            my $orig = $mod;
            $orig =~ s/\.modified$//; # lazy
            $orig =~ s!^\Q$tmp\E/changeset/patches/!!;
            print "rsync -p '$mod' '$orig'\n";
            # List-form system: no shell, so file names containing spaces
            # or shell metacharacters are passed through untouched.
            system('rsync', '-p', $mod, "./$orig") == 0
                or die "Problem applying binary changes! $!";
        }
    }

    # bring in new files
    # The source is the directory itself (trailing slash), not a "*" glob,
    # so new top-level dotfiles are copied too and an empty directory is
    # not an error.
    my $newfiles = "$tmp/changeset/new-files-archive";
    if (-d $newfiles) {
        system('rsync', '--archive',
               '--exclude', '.git',
               '--exclude', '.arch-ids',
               '--exclude', '{arch}',
               "$newfiles/", './') == 0
            or die "Problem bringing in new files! $!";
    }

    # deleted files are hinted from the commitlog processing

    # A leftover changeset directory would break the next get-changeset.
    `rm -fr $tmp/changeset`;
    die "Cannot remove tempdir: $!" if $?;

    return 1;
}
# =for reference
# A log entry looks like
# Revision: moodle-org--moodle--1.3.3--patch-15
# Archive: arch-eduforge@catalyst.net.nz--2004
# Creator: Penny Leach <penny@catalyst.net.nz>
# Date: Wed May 25 14:15:34 NZST 2005
# Standard-date: 2005-05-25 02:15:34 GMT
# New-files: lang/de/.arch-ids/block_glossary_random.php.id
# lang/de/.arch-ids/block_html.php.id
# New-directories: lang/de/help/questionnaire
# lang/de/help/questionnaire/.arch-ids
# Removed-files: lang/be/docs/.arch-ids/release.html.id
# lang/be/docs/.arch-ids/releaseold.html.id
# Modified-files: admin/cron.php admin/delete.php
# admin/editor.html backup/lib.php backup/restore.php
# New-patches: arch-eduforge@catalyst.net.nz--2004/moodle-org--moodle--1.3.3--patch-15
# Summary: Updating to latest from MOODLE_14_STABLE (1.4.5+)
# Keywords:
#
# Updating yadda tadda tadda madda
# Parse an Arch patch log (see the sample above).
#
# Takes the log text and returns the list
#   ($summary, $message, \@added, \@deleted, \@modified)
# where $summary is the trimmed Summary: header, $message the trimmed text
# after the first blank line, and the three array refs hold shell-quoted
# file names taken from the New-files:, Removed-files: and Modified-files:
# headers.  $summary and $message are empty strings, never undef, when the
# log has no such part.  The log and the parsed result are also printed on
# STDOUT.
#
# Only the header section (everything before the first blank line) is
# searched for headers, so header-like lines in the message body are not
# mistaken for file lists.
sub parselog {
    my $log = shift;
    $log = '' unless defined $log;
    print $log;

    my ($headers, $body) = split(m/\n\n/, $log, 2);
    $headers = '' unless defined $headers;
    $body    = '' unless defined $body;

    my @add = _parselog_files($headers, 'New-files');
    my @del = _parselog_files($headers, 'Removed-files');
    my @mod = _parselog_files($headers, 'Modified-files');

    my $sum = '';
    if ($headers =~ m/^Summary:(.+?)$/m) {
        $sum = $1;
        $sum =~ s/^\s+//;
        $sum =~ s/\s+$//;
    }

    my $msg = $body;
    $msg =~ s/^\s+//;
    $msg =~ s/\s+$//;

    print Dumper [$sum, $msg, \@add, \@del, \@mod];
    return ($sum, $msg, \@add, \@del, \@mod);
}

# Return the shell-quoted file names listed under header $name in $headers,
# leaving out Arch bookkeeping entries ({arch}/ and .arch-ids/).
sub _parselog_files {
    my ($headers, $name) = @_;

    # The value runs up to the next line that starts a new header, or to
    # the end of the header section; continuation lines are indented.
    return () unless $headers =~ m/(?:\n|^)\Q$name\E:(.*?)(?=\n\w|\z)/s;

    my @files;
    # Every entry is examined: a truth test on the popped value would stop
    # at a file named "0" and silently drop the remaining names.  The list
    # is reversed because callers have always received the names last-first.
    foreach my $file (reverse split(' ', $1)) {
        next if $file =~ m!\{arch\}/!;
        next if $file =~ m!\.arch-ids/!;
        push @files, shell_quote($file);
    }
    return @files;
}
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2005-08-24 12:14 UTC | newest]
Thread overview: (only message) (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2005-08-24 12:14 git-archimport-script - 2nd iteration Martin Langhoff
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.