All of lore.kernel.org
 help / color / mirror / Atom feed
From: Brandon Casey <drafnel@gmail.com>
To: git@vger.kernel.org
Cc: gitster@pobox.com, peff@peff.net, spearce@spearce.org,
	Brandon Casey <drafnel@gmail.com>
Subject: Re: git as an sfc member project
Date: Sat, 23 Oct 2010 17:48:29 -0500	[thread overview]
Message-ID: <1287874109-15565-1-git-send-email-drafnel@gmail.com> (raw)
In-Reply-To: <cs0GhwZZ9W-pJdXPmTo0di_hrUwMa14GE8dSJeIQtOwrvDdl4KxJ_g@cipher.nrlssc.navy.mil>

Here's an updated version of this script if anyone is interested.

It can now do the git-blame calls in parallel.  Use -t #threads.

Here's the usage info:

      ' -c     print count of all lines in all files at the end',
      ' -f     produce file centric output (overrides -l and -s)',
      ' -l     produce longer format',
      ' -s     produce short format, line count and author only',
      ' -ls    Both -l and -s produce an even longer long format',
      ' -t n   set number of threads to use',
      ' -x re  exclude files matching regex',
      ' -v     be more verbose',

Enjoy.

-Brandon

---
 git_blame_stats.perl |  387 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 387 insertions(+), 0 deletions(-)
 create mode 100755 git_blame_stats.perl

diff --git a/git_blame_stats.perl b/git_blame_stats.perl
new file mode 100755
index 0000000..618cf0a
--- /dev/null
+++ b/git_blame_stats.perl
@@ -0,0 +1,387 @@
+#!/usr/bin/perl -w
+
+use lib (split(/:/, $ENV{GITPERLLIB} || '/path/to/git/lib/perl5/site_perl/5.10.0'));
+
+use strict;
+use threads;
+use Thread::Queue;
+use Getopt::Std;
+use Git;
+
+my @LSTREE_OPTS = ('-r', '--name-only');	# switches for the `git ls-tree` call
+my @BLAME_OPTS = ('-C', '-C', '-w', '--incremental');	# switches for `git blame`
+# Per the Getopt::Std docs: --help/--version print to STDOUT, then exit.
+$Getopt::Std::STANDARD_HELP_VERSION = 1;
+$main::VERSION = 1.0;
+
+# Print the one-line usage synopsis to the currently selected filehandle.
+sub usage {
+	# Prefer File::Basename; if it cannot be loaded or yields a false
+	# value, fall back to cutting $0 after its last '/'.
+	my $prog = eval {
+		require File::Basename;
+		File::Basename::basename($0);
+	};
+	$prog = substr($0, rindex($0, '/') + 1) unless $prog;
+
+	print 'Usage: ', $prog, ' [--help] [-cflstvx] <rev> [paths...]', "\n";
+}
+
+# Getopt::Std calls this for --help, passing the handle to print on.
+sub main::HELP_MESSAGE {
+	my $fh = shift;
+	# usage() prints to the selected handle: select $fh for that call only
+	# and restore the old selection, so later prints are not redirected.
+	my $prev = select $fh; eval { usage() }; select $prev;
+	local ($\, $,) = ("\n", "\n");	# one help line per list element
+
+	print $fh '',
+	      'Generate authorship statistics from a git repository.',
+	      '',
+	      'OPTIONS',
+	      ' -c     print count of all lines in all files at the end',
+	      ' -f     produce file centric output (overrides -l and -s)',
+	      ' -l     produce longer format',
+	      ' -s     produce short format, line count and author only',
+	      ' -ls    Both -l and -s produce an even longer long format',
+	      ' -t n   set number of threads to use',
+	      ' -x re  exclude files matching regex',
+	      ' -v     be more verbose',
+	      ' --help this text',
+	      '';
+}
+
+# Read one entry of `git blame --incremental` output from $fh.
+# Returns a key/value list: sha1, sourceline, resultline, lines, plus every
+# following line keyed by its first word.  Empty list at EOF or bad header.
+sub parse_blame_entry {
+	my $fh = shift;
+	# Lexical line buffer, so the caller's global $_ is left untouched.
+	defined(my $line = <$fh>) or return ();
+	chomp $line;
+	my ($sha1, $sourceline, $resultline, $num_lines) = split ' ', $line;
+	return () unless defined $num_lines;
+
+	my %h = (sha1 => $sha1, sourceline => $sourceline,
+		 resultline => $resultline, lines => $num_lines);
+	while (defined($line = <$fh>)) {
+		chomp $line;
+		my ($key, $val) = split ' ', $line, 2;
+		$h{$key} = $val;
+		last if $line =~ m/^filename /;	# the filename line ends an entry
+	}
+	return %h;
+}
+
+# Run `git blame` on $filename at $ref and add the line counts to
+# $authors->{author name}->{$filename}.
+sub blame_file {
+	my ($repo, $ref, $filename, $authors) = @_;
+
+	my ($fh, $ctx) = $repo->command_output_pipe('blame', @BLAME_OPTS,
+		$ref, '--', $filename);
+
+	# sha1 => reference to that commit author's counter for this file.
+	# Only the first entry of a commit is looked up by author name here.
+	my %counter_for;
+	while (my %entry = parse_blame_entry $fh) {
+		my $sha1 = $entry{'sha1'};
+
+		unless (exists $counter_for{$sha1}) {
+			my $per_file = ($authors->{$entry{'author'}} ||= {});
+			$per_file->{$filename} = 0
+				unless exists $per_file->{$filename};
+			$counter_for{$sha1} = \$per_file->{$filename};
+		}
+
+		${$counter_for{$sha1}} += $entry{'lines'};
+	}
+
+	$repo->command_close_pipe($fh, $ctx);
+}
+
+# Sum every per-file line count of every author in $authors
+# ({author => {filename => lines}}).
+sub count_total_lines {
+	my ($authors) = @_;
+	my $total = 0;
+
+	foreach my $per_file (values %{$authors}) {
+		$total += $_ for values %{$per_file};
+	}
+	return $total;
+}
+
+# Total the lines attributed to each author across all files.
+# Returns hash
+# key: author name
+# value: authored lines
+sub count_author_lines {
+	my ($authors) = @_;
+	my %alines;
+
+	foreach my $author (keys %{$authors}) {
+		my $sum = 0;
+		$sum += $_ for values %{$authors->{$author}};
+		$alines{$author} = $sum;
+	}
+
+	return %alines;
+}
+
+# Total the lines of each file across all authors.
+# Returns hash
+# key: filename
+# value: lines in file
+sub count_file_lines {
+	my ($authors) = @_;
+	my %flines;
+
+	foreach my $per_file (values %{$authors}) {
+		foreach my $file (keys %{$per_file}) {
+			$flines{$file} += $per_file->{$file};
+		}
+	}
+
+	return %flines;
+}
+
+# Short format
+#   lines author
+# One row per author, largest line count first.
+sub print_short {
+	my ($authors) = @_;
+
+	my %alines = count_author_lines $authors;
+	my @ranked = sort {$alines{$b} <=> $alines{$a}} keys %alines;
+
+	printf "%6d  %s\n", $alines{$_}, $_ for @ranked;
+}
+
+# Long format (authors and their files by descending line count)
+# author (lines):
+#    file_lines filename
+#    file_lines filename
+sub print_long {
+	my ($authors) = @_;
+
+	my %alines = count_author_lines $authors;
+
+	foreach my $author (sort {$alines{$b} <=> $alines{$a}} keys %alines) {
+		my $per_file = $authors->{$author};
+		my @files = sort {$per_file->{$b} <=> $per_file->{$a}}
+			keys %{$per_file};
+
+		print $author, ' (', $alines{$author}, '):', "\n";
+		foreach my $file (@files) {
+			printf "  %10d %s\n", $per_file->{$file}, $file;
+		}
+	}
+}
+
+# Longer format (authors and their files by descending line count)
+# author (lines, % of all lines):
+#    file_lines (% of author lines) filename
+#    file_lines (% of author lines) filename
+sub print_longer {
+	my ($authors) = @_;
+
+	my %alines = count_author_lines $authors;
+	my $total_lines = count_total_lines $authors;
+
+	foreach my $author (sort {$alines{$b} <=> $alines{$a}} keys %alines) {
+		my $per_file = $authors->{$author};
+		my $mine = $alines{$author};
+
+		printf "%s (%d, %.2f%%):\n", $author, $mine,
+			100. * $mine / $total_lines;
+		foreach my $file (sort {$per_file->{$b} <=> $per_file->{$a}}
+		    keys %{$per_file}) {
+			my $count = $per_file->{$file};
+			printf "  %10d (%5.2f%%) %s\n",
+			       $count, 100. * $count / $mine, $file;
+		}
+	}
+}
+
+# Longest format (authors and their files by descending line count)
+# author (# lines in X files, % of all lines, % of all files):
+#    lines (% of file) of file_lines (% of author lines) filename
+#    lines (% of file) of file_lines (% of author lines) filename
+sub print_with_file_percentage {
+	my ($authors) = @_;
+
+	my %alines = count_author_lines $authors;
+	my %flines = count_file_lines $authors;
+	my $total_lines = count_total_lines $authors;
+	my $total_files = scalar(keys %flines);
+
+	foreach my $author (sort {$alines{$b} <=> $alines{$a}} keys %alines) {
+		my $per_file = $authors->{$author};
+		my $mine = $alines{$author};
+		my $nfiles = scalar(keys %{$per_file});
+
+		printf "%s (%d lines in %d files, " .
+		       "%.2f%% of all lines, %.2f%% of all files):\n",
+		       $author, $mine, $nfiles,
+		       100. * $mine / $total_lines,
+		       100. * $nfiles / $total_files;
+
+		foreach my $file (sort {$per_file->{$b} <=> $per_file->{$a}}
+		    keys %{$per_file}) {
+			my $count = $per_file->{$file};
+			printf "  %10d (%6.2f%%) of %6d (%6.2f%%) %s\n",
+			       $count, 100. * $count / $flines{$file},
+			       $flines{$file},
+			       100. * $count / $mine, $file;
+		}
+	}
+}
+
+# File perspective format (files by name, authors by descending lines)
+# filename (lines):
+#    lines author
+#    lines author
+sub print_with_file_perspective {
+	my ($authors) = @_;
+
+	my %flines = count_file_lines $authors;
+
+	foreach my $file (sort keys %flines) {
+		# Only authors that have an entry for this file.
+		my @ranked = sort
+			{$authors->{$b}->{$file} <=> $authors->{$a}->{$file}}
+			grep {exists $authors->{$_}->{$file}} keys %{$authors};
+
+		print $file, ' (', $flines{$file}, '):', "\n";
+		foreach my $author (@ranked) {
+			printf " %10d %s\n", $authors->{$author}->{$file}, $author;
+		}
+	}
+}
+
+
+# Run-time settings; the defaults apply when a switch is not given.
+my $verbose = 0;
+my $output_format = 0;	# 0 long, 1 short, 2 longer, 3 longest, 4 per file
+my $show_total = 0;
+my $exclude_pattern;
+my $nthreads = 1;
+
+# getopts() leaves each switch in the matching $opt_<letter> variable.
+our ($opt_c, $opt_f, $opt_l, $opt_s, $opt_t, $opt_v, $opt_x);
+getopts('cflst:vx:') or die 'Invalid options specified';
+
+$show_total = 1 if $opt_c;
+
+# -f overrides -l and -s; -l together with -s selects the longest format.
+$output_format = $opt_f           ? 4
+               : $opt_l && $opt_s ? 3
+               : $opt_l           ? 2
+               : $opt_s           ? 1
+               :                    0;
+
+# Thread count (-t n).
+if (defined $opt_t) {
+	$nthreads = $opt_t;
+	if ($nthreads !~ /^\d+$/ || $nthreads < 0) {
+		die 'Error: argument to -t must be integer >= 0';
+	}
+
+	# -t 0: take the CPU count from Sys::CPU, or stay single-threaded
+	# when that module is missing or returns a false value.
+	if ($nthreads == 0) {
+		eval {
+			require Sys::CPU;
+			$nthreads = Sys::CPU::cpu_count();
+		} or $nthreads = 1;
+	}
+}
+
+$verbose = 1 if $opt_v;
+$exclude_pattern = $opt_x if $opt_x;
+
+# A <rev> argument is required: print usage on STDERR and exit 1 without.
+eval {select STDERR; usage; exit 1} unless $#ARGV >= 0;
+
+my %authors;
+my @thr;
+my $repo = Git->repository();
+
+# Spawn ls-tree now, so it can fail before creating the threads.
+# -z asks for NUL-terminated names: per git-ls-tree(1), names with unusual
+# characters are quoted otherwise, and blame would not find such a path.
+my ($fh, $ctx) = $repo->command_output_pipe('ls-tree', @LSTREE_OPTS,
+	'-z', '--', @ARGV);
+
+print STDERR 'Using ', $nthreads, ' thread(s).', "\n" if $verbose;
+
+my $DataQueue = Thread::Queue->new();
+
+# start the threads
+for my $i (0 .. $nthreads - 1) {
+	# List context here makes join() below return the whole hash.
+	($thr[$i]) = threads->create(sub {
+		my $tid = threads->tid();
+		my %a;
+		# Stop only on undef, so a file named "0" is still processed.
+		while (defined(my $f = $DataQueue->dequeue())) {
+			print STDERR "[$tid]Processing file: $f\n" if $verbose;
+			blame_file $repo, $ARGV[0], $f, \%a;
+		}
+		return %a;
+	});
+}
+
+# now queue up the files
+{
+	local $/ = "\0";	# record separator matching ls-tree -z
+	while (my $file = <$fh>) {
+		chomp $file;
+
+		if ($exclude_pattern && $file =~ m/$exclude_pattern/o) {
+			print STDERR "Skipping file: $file\n" if $verbose;
+			next;
+		}
+
+		print STDERR "Queuing file: $file\n" if $verbose;
+		$DataQueue->enqueue($file);
+	}
+}
+$repo->command_close_pipe($fh, $ctx);
+
+# queue up an undef entry for each thread
+$DataQueue->enqueue(undef) for 1 .. $nthreads;
+
+# merge the author hash from each thread
+foreach my $thread (@thr) {
+	my %th_authors = $thread->join;
+
+	foreach my $author (keys %th_authors) {
+		if (! exists $authors{$author}) {
+			$authors{$author} = $th_authors{$author};
+			next;
+		}
+		# += creates the entry when this file is new for the author
+		foreach my $filename (keys %{$th_authors{$author}}) {
+			$authors{$author}->{$filename} +=
+				$th_authors{$author}->{$filename};
+		}
+	}
+}
+
+if ($output_format == 0) {
+	print_long \%authors;
+} elsif ($output_format == 1) {
+	print_short \%authors;
+} elsif ($output_format == 2) {
+	print_longer \%authors;
+} elsif ($output_format == 3) {
+	print_with_file_percentage \%authors;
+} elsif ($output_format == 4) {
+	print_with_file_perspective \%authors;
+}
+
+printf "%6d  total lines\n", count_total_lines(\%authors) if $show_total;
+
+exit;
-- 
1.7.3.1.45.g9855b

  reply	other threads:[~2010-10-23 22:49 UTC|newest]

Thread overview: 37+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-10-22 18:30 git as an sfc member project Jeff King
2010-10-22 19:19 ` Shawn Pearce
2010-10-22 19:35   ` Jeff King
2010-10-22 20:06     ` Shawn Pearce
2010-10-22 20:59       ` Sverre Rabbelier
2010-10-22 21:48   ` Junio C Hamano
2010-10-22 22:59   ` Junio C Hamano
2010-10-22 23:18     ` Jeff King
2010-10-22 23:21       ` Brandon Casey
2010-10-22 23:26         ` Junio C Hamano
     [not found]         ` <hh0bQq8TcM0saDTuJo6qVdOMgn-14aysvhF_S70syB678Of7zQOsY9jLajG2WpeGXid8jtG4kVA@cipher.nrlssc.navy.mil>
2010-10-23  0:09           ` Brandon Casey
2010-10-23  1:30             ` Brandon Casey
2010-10-23 22:48               ` Brandon Casey [this message]
2010-10-22 23:22       ` Junio C Hamano
2010-10-23 11:52       ` Ævar Arnfjörð Bjarmason
2010-10-23 13:39         ` Jeff King
2010-10-23 16:03           ` A Large Angry SCM
2010-10-26 22:39 ` Jeff King
2010-10-27  7:03 ` Tait
2010-10-27 11:08   ` Jeff King
2010-11-02 23:03     ` Bradley M. Kuhn
  -- strict thread matches above, loose matches on Subject: below --
2010-02-24 15:44 Jeff King
2010-02-24 16:07 ` Jakub Narebski
2010-02-26 12:39   ` Jeff King
2010-02-26 15:56     ` Jakub Narebski
2010-03-01 10:58       ` Jeff King
2010-02-24 16:22 ` Shawn O. Pearce
2010-02-26 12:49   ` Jeff King
2010-02-24 17:44 ` Christian Couder
2010-02-26 12:25   ` Jeff King
2010-02-24 18:12 ` Junio C Hamano
2010-02-26 12:29   ` Jeff King
2010-02-26 12:37     ` Jeff King
2010-02-26 12:59 ` Jeff King
2010-02-26 13:14   ` Julian Phillips
2010-03-01 10:53     ` Jeff King
2010-02-27  6:35   ` Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1287874109-15565-1-git-send-email-drafnel@gmail.com \
    --to=drafnel@gmail.com \
    --cc=git@vger.kernel.org \
    --cc=gitster@pobox.com \
    --cc=peff@peff.net \
    --cc=spearce@spearce.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.