git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Brandon Casey <drafnel@gmail.com>
To: git@vger.kernel.org
Cc: gitster@pobox.com, peff@peff.net, spearce@spearce.org,
	Brandon Casey <drafnel@gmail.com>
Subject: Re: git as an sfc member project
Date: Sat, 23 Oct 2010 17:48:29 -0500	[thread overview]
Message-ID: <1287874109-15565-1-git-send-email-drafnel@gmail.com> (raw)
In-Reply-To: <cs0GhwZZ9W-pJdXPmTo0di_hrUwMa14GE8dSJeIQtOwrvDdl4KxJ_g@cipher.nrlssc.navy.mil>

Here's an updated version of this script if anyone is interested.

It can now do the git-blame calls in parallel.  Use -t #threads.

Here's the usage info:

      ' -c     print count of all lines in all files at the end',
      ' -f     produce file centric output (overrides -l and -s)',
      ' -l     produce longer format',
      ' -s     produce short format, line count and author only',
      ' -ls    Both -l and -s produce an even longer long format',
      ' -t n   set number of threads to use',
      ' -x re  exclude files matching regex',
      ' -v     be more verbose',

Enjoy.

-Brandon

---
 git_blame_stats.perl |  387 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 387 insertions(+), 0 deletions(-)
 create mode 100755 git_blame_stats.perl

diff --git a/git_blame_stats.perl b/git_blame_stats.perl
new file mode 100755
index 0000000..618cf0a
--- /dev/null
+++ b/git_blame_stats.perl
@@ -0,0 +1,387 @@
+#!/usr/bin/perl -w
+
+use lib (split(/:/, $ENV{GITPERLLIB} || '/path/to/git/lib/perl5/site_perl/5.10.0'));
+
+use strict;
+use threads;
+use Thread::Queue;
+use Getopt::Std;
+use Git;
+
+my @LSTREE_OPTS = ('-r', '--name-only');	# extra arguments for the 'ls-tree' call that lists the files
+my @BLAME_OPTS = ('-C', '-C', '-w', '--incremental');	# extra arguments for every 'blame' call
+
+$Getopt::Std::STANDARD_HELP_VERSION = 1;	# NOTE(review): presumably makes --help/--version print to STDOUT and exit; confirm in Getopt::Std docs
+$main::VERSION = 1.0;	# presumably the version Getopt::Std reports for --version; confirm
+
+# Print the one-line usage summary to the currently selected output handle.
+sub usage {
+	# Prefer File::Basename for the program name; fall back to cutting $0
+	# after its last '/' when the module fails or yields a false value.
+	my $prog = eval {
+		require File::Basename;
+		File::Basename::basename($0);
+	};
+	$prog ||= substr($0, rindex($0, '/') + 1);
+
+	print 'Usage: ', $prog, ' [--help] [-cflstvx] <rev> [paths...]', "\n";
+}
+
+# Getopt::Std --help hook: print the usage line and option list to $fh.
+sub main::HELP_MESSAGE {
+	my $fh = shift;
+	# usage() writes to the selected handle; select $fh just for that call.
+	my $prev = select $fh;
+	eval { usage() };
+	select $prev;
+	local ($\, $,) = ("\n", "\n");	# newline after and between the items
+	print $fh '',
+	      'Generate authorship statistics from a git repository.',
+	      '',
+	      'OPTIONS',
+	      ' -c     print count of all lines in all files at the end',
+	      ' -f     produce file centric output (overrides -l and -s)',
+	      ' -l     produce longer format',
+	      ' -s     produce short format, line count and author only',
+	      ' -ls    Both -l and -s produce an even longer long format',
+	      ' -t n   set number of threads to use',
+	      ' -x re  exclude files matching regex',
+	      ' -v     be more verbose',
+	      ' --help this text',
+	      '';
+}
+
+# Read one entry of 'git blame --incremental' output from $fh.  Returns a
+# hash with sha1, sourceline, resultline and lines from the entry's first
+# line plus every following "key value" line through "filename", or an
+# empty list at EOF or when the first line has fewer than four fields.
+# Uses lexical line buffers, so the caller's global $_ is left untouched.
+sub parse_blame_entry {
+	my $fh = shift;
+	my $head = <$fh>;
+	return () unless defined $head;
+	my ($sha1, $sourceline, $resultline, $num_lines) = split ' ', $head;
+	return () unless defined $num_lines;
+	my %h = (sha1 => $sha1, sourceline => $sourceline,
+		 resultline => $resultline, lines => $num_lines);
+	while (defined(my $line = <$fh>)) {
+		chomp $line;
+		my ($key, $val) = split ' ', $line, 2;
+		$h{$key} = $val;
+		last if $line =~ m/^filename /;
+	}
+	return %h;
+}
+
+# Run 'git blame' on $filename at $ref through $repo and add the number of
+# lines of each blame entry to $authors->{AUTHOR}->{$filename}.
+sub blame_file {
+	my ($repo, $ref, $filename, $authors) = @_;
+
+	my ($fh, $ctx) = $repo->command_output_pipe('blame', @BLAME_OPTS,
+		$ref, '--', $filename);
+
+	# Maps a commit sha1 to a reference to the line counter of that
+	# commit's author for this file.  Once a sha1 is cached, later
+	# entries for it are counted without consulting $h{'author'} again.
+	my %slot_for;
+	while (my %h = parse_blame_entry $fh) {
+		my $sha1 = $h{'sha1'};
+
+		unless (exists $slot_for{$sha1}) {
+			my $who = $h{'author'};
+			$authors->{$who}->{$filename} = 0
+				unless exists $authors->{$who}->{$filename};
+			$slot_for{$sha1} = \$authors->{$who}->{$filename};
+		}
+		${$slot_for{$sha1}} += $h{'lines'};
+	}
+
+	$repo->command_close_pipe($fh, $ctx);
+}
+
+# Returns the sum of every per-file line count of every author in the
+# hash referenced by $authors (author name => { filename => lines }).
+sub count_total_lines {
+	my $authors = shift;
+
+	my $total = 0;
+	foreach my $files (values %{$authors}) {
+		$total += $_ foreach values %{$files};
+	}
+	return $total;
+}
+
+# Returns hash
+# key: author name
+# value: authored lines, summed over all of that author's files
+sub count_author_lines {
+	my $authors = shift;
+
+	my %alines;
+
+	foreach my $author (keys %{$authors}) {
+		# Start at 0 so an author without files still gets an entry.
+		$alines{$author} = 0;
+		$alines{$author} += $_ for values %{$authors->{$author}};
+	}
+
+	return %alines;
+}
+
+# Returns hash
+# key: filename
+# value: lines in file (the sum of every author's count for it)
+sub count_file_lines {
+	my $authors = shift;
+
+	my %flines;
+
+	# $authors maps author => { filename => lines }; walk each author's
+	# per-file hash and accumulate the counts by filename.
+	foreach my $per_file (values %{$authors}) {
+		$flines{$_} += $per_file->{$_} for keys %{$per_file};
+	}
+
+	return %flines;
+}
+
+# Short format
+#   lines author
+# (one row per author, largest line count first)
+sub print_short {
+	my $authors = shift;
+
+	my %alines = count_author_lines($authors);
+	my @by_lines = sort {$alines{$b} <=> $alines{$a}} keys %alines;
+
+	printf "%6d  %s\n", $alines{$_}, $_ foreach @by_lines;
+}
+
+# Long format, authors and their files both ordered by descending count:
+# author (lines):
+#    file_lines filename
+#    file_lines filename
+#    file_lines filename
+sub print_long {
+	my $authors = shift;
+
+	my %alines = count_author_lines($authors);
+	my @ranked = sort {$alines{$b} <=> $alines{$a}} keys %alines;
+
+	for my $author (@ranked) {
+		my $files = $authors->{$author};
+		print $author, ' (', $alines{$author}, '):', "\n";
+		for my $file (sort {$files->{$b} <=> $files->{$a}}
+		    keys %{$files}) {
+			printf "  %10d %s\n", $files->{$file}, $file;
+		}
+	}
+}
+
+# Longer format
+# author (lines, % of all lines):
+#    file_lines (% of author lines) filename
+#    file_lines (% of author lines) filename
+sub print_longer {
+	my $authors = shift;
+
+	my %alines = count_author_lines($authors);
+	my $total_lines = count_total_lines($authors);
+
+	for my $author (sort {$alines{$b} <=> $alines{$a}} keys %alines) {
+		my $files = $authors->{$author};
+		my $author_lines = $alines{$author};
+
+		printf "%s (%d, %.2f%%):\n", $author, $author_lines,
+			100. * $author_lines / $total_lines;
+		for my $file (sort {$files->{$b} <=> $files->{$a}}
+		    keys %{$files}) {
+			my $count = $files->{$file};
+			printf "  %10d (%5.2f%%) %s\n",
+			       $count, 100. * $count / $author_lines, $file;
+		}
+	}
+}
+
+# Even longer format, used when -l and -s are given together
+# author (# lines in X files, % of all lines, % of all files):
+#    lines (% of file) of file_lines (% of author lines) filename
+#    lines (% of file) of file_lines (% of author lines) filename
+sub print_with_file_percentage {
+	my $authors = shift;
+
+	my %alines = count_author_lines($authors);
+	my %flines = count_file_lines($authors);
+	my $total_lines = count_total_lines($authors);
+	my $total_files = scalar(keys %flines);
+
+	for my $author (sort {$alines{$b} <=> $alines{$a}} keys %alines) {
+		# Per-author figures reused by the header and every file row.
+		my $files = $authors->{$author};
+		my $author_lines = $alines{$author};
+		my $nfiles = scalar(keys %{$files});
+
+		printf "%s (%d lines in %d files, " .
+		       "%.2f%% of all lines, %.2f%% of all files):\n",
+		       $author, $author_lines, $nfiles,
+		       100. * $author_lines / $total_lines,
+		       100. * $nfiles / $total_files;
+		for my $file (sort {$files->{$b} <=> $files->{$a}}
+		    keys %{$files}) {
+			my $count = $files->{$file};
+			printf "  %10d (%6.2f%%) of %6d (%6.2f%%) %s\n",
+			       $count, 100. * $count / $flines{$file},
+			       $flines{$file},
+			       100. * $count / $author_lines, $file;
+		}
+	}
+}
+
+# File perspective format: files in name order, authors by descending count
+# filename (lines):
+#    lines author
+#    lines author
+sub print_with_file_perspective {
+	my $authors = shift;
+
+	my %flines = count_file_lines($authors);
+
+	for my $file (sort keys %flines) {
+		# Only the authors that have an entry for this file.
+		my @ranked =
+		    sort {$authors->{$b}->{$file} <=> $authors->{$a}->{$file}}
+		    grep {exists $authors->{$_}->{$file}}
+		    keys %{$authors};
+
+		print $file, ' (', $flines{$file}, '):', "\n";
+		printf " %10d %s\n", $authors->{$_}->{$file}, $_
+			for @ranked;
+	}
+}
+
+
+# Settings derived from the command line, initialised to their defaults.
+my $verbose = 0;
+my $output_format = 0;
+my $show_total = 0;
+my $exclude_pattern;
+my $nthreads = 1;
+
+# getopts() stores each switch in the matching $opt_X package variable;
+# -t and -x take an argument.
+our ($opt_c, $opt_f, $opt_l, $opt_s, $opt_t, $opt_v, $opt_x);
+getopts('cflst:vx:') or die 'Invalid options specified';
+
+$show_total = 1 if $opt_c;
+$verbose = 1 if $opt_v;
+$exclude_pattern = $opt_x if $opt_x;
+
+# Output format number, later used to pick the print routine:
+# 4 = file centric (-f wins over -l and -s), 3 = -l and -s together,
+# 2 = -l alone, 1 = -s alone, 0 = none of them.
+$output_format = $opt_f ? 4
+	: ($opt_l && $opt_s) ? 3
+	: $opt_l ? 2
+	: $opt_s ? 1
+	: 0;
+
+if (defined $opt_t) {
+	$nthreads = $opt_t;
+	die 'Error: argument to -t must be integer >= 0'
+		if $nthreads !~ /^\d+$/ || $nthreads < 0;
+
+	# -t 0: take the count reported by Sys::CPU, or fall back to one
+	# thread when the module cannot be loaded or reports nothing.
+	if ($nthreads == 0) {
+		eval {
+			require Sys::CPU;
+			$nthreads = Sys::CPU::cpu_count();
+		} or $nthreads = 1;
+	}
+}
+
+eval {select STDERR; usage; exit 1} unless $#ARGV >= 0;	# no arguments left: usage to STDERR, exit status 1
+# Per-author results merged from all worker threads, and the thread handles.
+my %authors;
+my @thr;
+my $repo = Git->repository();
+
+# Spawn ls-tree now, so it can fail before creating the threads
+my ($fh, $ctx) = $repo->command_output_pipe('ls-tree', @LSTREE_OPTS,
+	'--', @ARGV);
+
+print STDERR 'Using ', $nthreads, ' thread(s).', "\n" if $verbose;
+# Queue through which the main thread hands filenames to the workers.
+my $DataQueue = Thread::Queue->new();
+
+# Start the threads; a worker stops at undef only (a file named "0" is valid).
+for my $i (0 .. $nthreads - 1) {
+	($thr[$i]) = threads->create(sub {
+		my $tid = threads->tid();
+		my %a;
+		while (defined(my $f = $DataQueue->dequeue())) {
+			print STDERR "[$tid]Processing file: $f\n" if $verbose;
+			blame_file $repo, $ARGV[0], $f, \%a;
+		}
+		return %a;
+	});
+}
+
+# Read the file list from ls-tree and queue every file for the workers,
+# except those matching the -x exclude pattern.
+while (my $path = <$fh>) {
+	chomp $path;
+
+	if ($exclude_pattern && $path =~ m/$exclude_pattern/o) {
+		print STDERR "Skipping file: $path\n" if $verbose;
+		next;
+	}
+
+	print STDERR "Queuing file: $path\n" if $verbose;
+
+	$DataQueue->enqueue($path);
+}
+$repo->command_close_pipe($fh, $ctx);
+
+# Queue one undef per worker; each worker stops at the first undef
+# it dequeues.
+$DataQueue->enqueue(undef) for 1 .. $nthreads;
+
+# Wait for every worker in turn and fold the author hash it returns
+# into %authors.
+foreach my $worker (@thr) {
+	my %th_authors = $worker->join;
+
+	foreach my $author (keys %th_authors) {
+		my $files = $th_authors{$author};
+
+		# First sighting of this author: adopt the worker's hash.
+		unless (exists $authors{$author}) {
+			$authors{$author} = $files;
+			next;
+		}
+
+		# Known author: add the counts file by file.
+		foreach my $filename (keys %{$files}) {
+			$authors{$author}->{$filename} += $files->{$filename};
+		}
+	}
+}
+
+
+# Output routines indexed by $output_format (0 is the default, 1 = -s,
+# 2 = -l, 3 = -l and -s together, 4 = -f).
+my @printers = (
+	\&print_long,
+	\&print_short,
+	\&print_longer,
+	\&print_with_file_percentage,
+	\&print_with_file_perspective,
+);
+$printers[$output_format]->(\%authors);
+
+# -c: append the grand total of all attributed lines
+printf "%6d  total lines\n", count_total_lines(\%authors) if $show_total;
+
+exit;
-- 
1.7.3.1.45.g9855b

  reply	other threads:[~2010-10-23 22:49 UTC|newest]

Thread overview: 37+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-10-22 18:30 git as an sfc member project Jeff King
2010-10-22 19:19 ` Shawn Pearce
2010-10-22 19:35   ` Jeff King
2010-10-22 20:06     ` Shawn Pearce
2010-10-22 20:59       ` Sverre Rabbelier
2010-10-22 21:48   ` Junio C Hamano
2010-10-22 22:59   ` Junio C Hamano
2010-10-22 23:18     ` Jeff King
2010-10-22 23:21       ` Brandon Casey
2010-10-22 23:26         ` Junio C Hamano
     [not found]         ` <hh0bQq8TcM0saDTuJo6qVdOMgn-14aysvhF_S70syB678Of7zQOsY9jLajG2WpeGXid8jtG4kVA@cipher.nrlssc.navy.mil>
2010-10-23  0:09           ` Brandon Casey
2010-10-23  1:30             ` Brandon Casey
2010-10-23 22:48               ` Brandon Casey [this message]
2010-10-22 23:22       ` Junio C Hamano
2010-10-23 11:52       ` Ævar Arnfjörð Bjarmason
2010-10-23 13:39         ` Jeff King
2010-10-23 16:03           ` A Large Angry SCM
2010-10-26 22:39 ` Jeff King
2010-10-27  7:03 ` Tait
2010-10-27 11:08   ` Jeff King
2010-11-02 23:03     ` Bradley M. Kuhn
  -- strict thread matches above, loose matches on Subject: below --
2010-02-24 15:44 Jeff King
2010-02-24 16:07 ` Jakub Narebski
2010-02-26 12:39   ` Jeff King
2010-02-26 15:56     ` Jakub Narebski
2010-03-01 10:58       ` Jeff King
2010-02-24 16:22 ` Shawn O. Pearce
2010-02-26 12:49   ` Jeff King
2010-02-24 17:44 ` Christian Couder
2010-02-26 12:25   ` Jeff King
2010-02-24 18:12 ` Junio C Hamano
2010-02-26 12:29   ` Jeff King
2010-02-26 12:37     ` Jeff King
2010-02-26 12:59 ` Jeff King
2010-02-26 13:14   ` Julian Phillips
2010-03-01 10:53     ` Jeff King
2010-02-27  6:35   ` Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1287874109-15565-1-git-send-email-drafnel@gmail.com \
    --to=drafnel@gmail.com \
    --cc=git@vger.kernel.org \
    --cc=gitster@pobox.com \
    --cc=peff@peff.net \
    --cc=spearce@spearce.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).