* Re: [SCRIPT] chomp: trim trailing whitespace
2006-05-27 11:32 ` Jeff Garzik
@ 2006-05-27 11:48 ` Dmitry Fedorov
2006-05-27 12:42 ` Jan Engelhardt
1 sibling, 0 replies; 11+ messages in thread
From: Dmitry Fedorov @ 2006-05-27 11:48 UTC (permalink / raw)
To: Git Mailing List, Linux Kernel
[-- Attachment #1: Type: text/plain, Size: 226 bytes --]
Jan Engelhardt wrote:
>> Attached to this email is chomp.pl, a Perl script which removes trailing
>> whitespace from several files. I've had this for years, as
trailing whitespace
>> is one of my pet peeves.
And my scripts.
[-- Attachment #2: find-text-files --]
[-- Type: application/octet-stream, Size: 7833 bytes --]
#!/usr/bin/perl -w
=head1 NAME
find-text-files - traverse a file tree and guess plain text files
=head1 SYNOPSIS
find-text-files [options] dir ...
=head1 DESCRIPTION
This program traverses a file tree, guesses which files are plain text,
and prints their names to STDOUT.
=cut
require 5.004;
use strict;
use integer;
use File::Find;
use Getopt::Long;
use IPC::Open2;
# Report an optional complaint followed by the usage summary on STDERR,
# then terminate the program with exit status 1.
#
# Arguments: zero or more words; when present they are joined with single
# spaces and printed (surrounded by newlines) before the summary.
sub usage {
    my @complaint = @_;

    if (@complaint) {
        warn "\n" . join(" ", @complaint) . "\n";
    }

    warn <<EOF;
Usage:
find-text-files [-exclude='perlre' ...] [-include='perlre' ...] \
[-total] [-excluded] [-included] [-selectors] \
dir ...
EOF

    exit(1);
}
=head1 PARAMETERS
=over 4
=item dir ...
Directories list.
=back
=cut
=head1 OPTIONS
=over 4
=item -exclude='perlre' ...
Perl regular expression, case insensitive.
Matched file names excluded from output list.
=item -include='perlre' ...
Perl regular expression, case insensitive.
Matched file names included to output list.
=head2 Note
Directory part of the file name stripped before match,
'^filename\.ext$' will be matched exactly to filename.ext
with any directory prepended.
=item -total
print statistic counters to STDERR.
=item -excluded
print to STDERR what files are excluded and why.
=item -included
print to STDERR what files are included and why.
=item -selectors
Prints the exclude/include regular expressions and file suffixes, then exits.
=back
=head1 HOW IT WORKS
Each file name is checked in this order:
* check against exclude RE; matched file excluded (see -exclude option);
if not matched, then:
* check against include RE; matched file included (see -include option);
if not matched, then:
* check against binary suffices table; matched file excluded;
if not matched, then:
* check against text suffices table; matched file included;
if not matched, then:
* checked by file(1)
This avoids file(1)'s misdetection of some text files
and reduces the time spent on file(1) calls.
=head1 NOTES
Does not follow symlinks.
Zero size files are skipped.
=cut
# Storage for the command-line switches.  Boolean switches start out
# false; -exclude/-include may be repeated and collect every value given.
my $help_option      = 0;
my $total_option     = 0;
my $excluded_option  = 0;
my $included_option  = 0;
my $selectors_option = 0;
my @exclude_options;
my @include_options;

# An unparsable command line and an explicit -help both end in usage().
usage() unless GetOptions(
    'help'      => \$help_option,
    'exclude=s' => \@exclude_options,
    'include=s' => \@include_options,
    'total'     => \$total_option,
    'excluded'  => \$excluded_option,
    'included'  => \$included_option,
    'selectors' => \$selectors_option,
);

usage() if $help_option;
# Lookup tables keyed by lower-case file suffix (without the dot).
# Only the keys matter: every value is undef and lookups use exists().
my %bin_suffices;
my %txt_suffices;

BEGIN
{
    # Suffixes of files that are excluded without asking file(1).
    # diff/patch files could have EOL spaces!
    @bin_suffices{ qw(
        gif tif tiff png jpg jpeg
        avi mpg mpeg
        o obj exe
        cab a rar arj zip tar cpio
        z gz bz bz2 tgz tbz tbz2
        iso bin img imag image
        diff patch
    ) } = ();

    # Suffixes of files that are included without asking file(1).
    @txt_suffices{ qw(
        txt text html htm xml php
        c cpp c++ cc cxx
        h hpp h++ hh hxx
        asm inc mod
        for f77 g77
        java jav
        bas vb
        pl pm pod
        make mak mk
        awk sh bat cmd rexx rex
        sql def man
        cvsignore
    ) } = ();
}
# Selector regular expressions, matched against the lower-cased base name
# of each file.  Each starts with a built-in alternative and gains one
# alternative per -exclude / -include option.
my $exclude_re = _build_selector_re('(,v$)',        @exclude_options);
my $include_re = _build_selector_re('(^makefile$)', @include_options);

# Append every user pattern to $builtin as a separate '|(...)' alternative
# and return the combined regular expression source.
#
# The user pattern is made case insensitive with an embedded (?i) instead
# of being passed through lc(): lower-casing the pattern text also rewrites
# escapes such as \S, \W, \D and \B into \s, \w, \d and \b, which changes
# what the pattern matches.
sub _build_selector_re
{
    my ($builtin, @user_patterns) = @_;

    my $re = $builtin;
    foreach my $pattern (@user_patterns)
    {
        $re .= '|((?i)' . $pattern . ')';
    }

    return $re;
}
# -selectors: show the effective regular expressions and suffix tables on
# STDERR and exit successfully without scanning anything.
if ($selectors_option)
{
    my @report = (
        [ "Exclude RE: ",       $exclude_re ],
        [ "Include RE: ",       $include_re ],
        [ "Exclude suffices: ", join(" ", sort keys %bin_suffices) ],
        [ "Include suffices: ", join(" ", sort keys %txt_suffices) ],
    );

    # each entry is preceded by an empty line; one more closes the report
    foreach my $entry (@report)
    {
        print STDERR "\n", $entry->[0], $entry->[1], "\n";
    }
    print STDERR "\n";

    exit 0;
}
# At least one directory must remain after option parsing.
usage("no directory specified") unless @ARGV;

# Statistic counters, reported by -total; all start at zero.
my (
    $total_files_checked,
    $total_files_empty,
    $total_files_excluded_by_re,
    $total_files_included_by_re,
    $total_files_excluded_by_suffix,
    $total_files_included_by_suffix,
    $total_files_excluded_by_file,
    $total_files_included_by_file
) = (0) x 8;
# Print one "<in|ex>cluded by <reason>: <file>" diagnostic line to STDERR.
#
# Arguments:
#   1. flag    - the line is printed only when this is true
#   2. prefix  - text placed directly in front of "cluded" ('in' or 'ex')
#   3. reason  - what made the decision; right-aligned in 13 columns
#   4. name    - file name to report
sub _by($$$$)
{
    my ($enabled, $prefix, $reason, $path) = @_;

    return unless $enabled;

    printf STDERR "%scluded by %13s: %s\n", $prefix, $reason, $path;
}
# Report an included file (reason, file name); printed only with -included.
sub inby($$)
{
    my ($reason, $path) = @_;
    _by($included_option, 'in', $reason, $path);
}
# Report an excluded file (reason, file name); printed only with -excluded.
sub exby($$)
{
    my ($reason, $path) = @_;
    _by($excluded_option, 'ex', $reason, $path);
}
# Start one long-running file(1) child that is shared by all onfile() calls:
# onfile() prints a file name to FILE_WH and reads one answer line back
# from FILE_RH.
local *FILE_RH;
local *FILE_WH;
# pid of the file(1) child; needed by the SIGPIPE handler and the final waitpid
my $file_pid;
# A write to a dead file(1) raises SIGPIPE: close our end of the pipe,
# wait for the child, then abort the whole run.
$SIG{PIPE} = sub
{
close FILE_WH;
waitpid $file_pid, 0;
die "file(1) pipe broken"
};
# "-f -" presumably makes file(1) read the names to test from its standard
# input and "-n" makes it flush its output after each file, which the
# one-line-out / one-line-in exchange in onfile() depends on -- confirm
# against the local file(1) manual.
# NOTE(review): IPC::Open2 documents that open2() raises an exception on
# failure instead of returning false, so the "or die" below is probably
# never reached -- confirm.
$file_pid = open2(\*FILE_RH, \*FILE_WH, "file -n -f -" )
or die "can't fork: $!";
#+ main work
# Unbuffered STDOUT, so every accepted name is printed as soon as it is found.
$| = 1;
# Walk every directory named on the command line; onfile() classifies
# each entry.
find({ wanted => \&onfile }, @ARGV);
#- main work

# No more names to ask about: close the request pipe and wait for the
# file(1) child to finish.
close(FILE_WH);
waitpid($file_pid, 0);
# Report template for the -total statistics.  Each picture line holding
# @>>>>>> fields (right-justified, 7 characters wide) is followed by the
# line listing the counters that fill those fields; the single "." ends
# the template.
format STDERR =
Total files: checked empty
------- -------
@>>>>>> @>>>>>>
$total_files_checked, $total_files_empty
suffix re file(1)
------- ------- -------
excluded by: @>>>>>> @>>>>>> @>>>>>>
$total_files_excluded_by_suffix, $total_files_excluded_by_re, $total_files_excluded_by_file
included by: @>>>>>> @>>>>>> @>>>>>>
$total_files_included_by_suffix, $total_files_included_by_re, $total_files_included_by_file
.
# Emit the report only when -total was given, then finish successfully.
write STDERR if $total_option;
exit 0;
# File::Find callback (installed via find(\&onfile, ...)): classify one
# directory entry and print its full name to STDOUT when it is judged to
# be a plain text file.
#
# Input:  $_ holds the entry's base name, $File::Find::name its full name.
# Output: one line on STDOUT per accepted file; the $total_files_*
#         counters are updated and inby()/exby() report each decision.
# Dies:   when the exchange with the file(1) child fails or its answer
#         does not start with the file name that was sent.
#
# Checks, in order: plain file, non-empty, exclude RE, include RE,
# binary suffix table, text suffix table, and finally file(1).
sub onfile()
{
    my $shortname = $_;
    my $fullname  = "$File::Find::name";

    return unless -f $shortname;

    $total_files_checked++;

    if ( ! -s $shortname )
    {
        $total_files_empty++;
        return;
    }

    # the selector REs and the suffix tables are all lower case
    my $lcshortname = lc $shortname;

    if ( $lcshortname =~ m/$exclude_re/o )
    {
        exby('RE', $fullname);
        $total_files_excluded_by_re++;
        return;
    }

    if ( $lcshortname =~ m/$include_re/o )
    {
        inby('RE', $fullname);
        $total_files_included_by_re++;
    }
    else # check by suffix
    {
        # Declared and assigned separately: "my $x = ... if COND" has
        # undefined behaviour according to perlsyn.
        my $suffix;
        $suffix = $1 if $lcshortname =~ m/\.([^\.]+)$/;

        if ( defined $suffix and length $suffix and
             exists $bin_suffices{$suffix} )
        {
            exby('binary suffix', $fullname);
            $total_files_excluded_by_suffix++;
            return;
        }

        if ( defined $suffix and length $suffix and
             exists $txt_suffices{$suffix} )
        {
            inby('text suffix', $fullname);
            $total_files_included_by_suffix++;
        }
        else # check by file(1)
        {
            # one name out, one answer line back
            print FILE_WH $fullname."\n"
                or die "bad write to file(1) pipe: $! $?";

            my $fread = <FILE_RH>;
            defined $fread or die "bad read from file(1) pipe: $! $?";
            chomp $fread;

            # The answer must start with the name we sent followed by ':'.
            # Anchoring on the known name (instead of a greedy "(.+):"
            # match) keeps descriptions that themselves contain ": "
            # from being mistaken for part of the file name.
            my $prefix = $fullname . ':';

            die "file name after file(1) does not match the original one:\n".
                "\tbefore: $fullname\n".
                "\tafter : $fread\n"
                if substr($fread, 0, length $prefix) ne $prefix;

            my $fdesc = substr($fread, length $prefix);
            $fdesc =~ s/^\s+//;

            die "file(1) output does not match pattern:\n$fread\n"
                unless length $fdesc;

            # Whole words only: "resource" must not count as "source".
            if ( $fdesc =~ m/\b(?:text|source)\b/ )
            {
                inby('file(1)', $fullname);
                $total_files_included_by_file++;
            }
            else
            {
                exby('file(1)', $fullname);
                $total_files_excluded_by_file++;
                return;
            }
        }
    }

    # every path that did not return above accepted the file
    print $fullname . "\n";
}
=head1 AUTHOR
Dmitry Fedorov <dm.fedorov@gmail.com>
=head1 COPYRIGHT
Copyright (C) 2003 Dmitry Fedorov <dm.fedorov@gmail.com>
=head1 LICENSE
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License,
or (at your option) any later version.
=head1 DISCLAIMER
The author disclaims any responsibility for any mangling of your system
etc, that this script may cause.
=cut
[-- Attachment #3: truncate-eol-whitespace --]
[-- Type: application/octet-stream, Size: 4916 bytes --]
#!/usr/bin/perl -w
=head1 NAME
truncate-eol-whitespace - truncate white spaces at end of line.
=head1 SYNOPSIS
truncate-eol-whitespace [-total] [-truncated] [-nontruncated] [-dry-run] \
[file ...] [-f files-from]
=head1 DESCRIPTION
This program removes trailing white space at the end of each line
in the specified files. File names can be given as parameters
and/or read from a specified file ('-' for STDIN).
=head1 EXAMPLE
Truncate all text files under DIR:
find-text-files DIR -total | truncate-eol-whitespace -total -f -
=cut
require 5.004;
use strict;
use integer;
use Getopt::Long;
# Report an optional complaint followed by the usage summary on STDERR,
# then terminate the program with exit status 1.
#
# Arguments: zero or more words; when present they are joined with single
# spaces and printed (surrounded by newlines) before the summary.
sub usage {
    my @complaint = @_;

    if (@complaint) {
        warn "\n" . join(" ", @complaint) . "\n";
    }

    warn <<EOF;
Usage:
truncate-eol-whitespace [-total] [-truncated] [-nontruncated] [-dry-run] \
[file ...] [-f files-from]
Warning: this script truncates files! Use -dry-run for test first.
EOF

    exit(1);
}
=head1 OPTIONS
=over 4
=item -total
print statistic counters to STDERR.
=item -truncated
print to STDOUT which files were truncated.
=item -nontruncated
print to STDOUT which files were not truncated.
=item -dry-run
Do not write files, report only
=item file ...
Files to truncate (optional)
=item -f files-from
File name with file names to truncate, one name per line.
Use '-' for STDIN.
=back
=cut
# Storage for the command-line switches.  Boolean switches start out
# false; -f has no default and stays undefined unless given.
my $help_option         = 0;
my $dry_run_option      = 0;
my $total_option        = 0;
my $truncated_option    = 0;
my $nontruncated_option = 0;
my $files_from_option;

# An unparsable command line and an explicit -help both end in usage().
usage() unless GetOptions(
    'help'         => \$help_option,
    'total'        => \$total_option,
    'truncated'    => \$truncated_option,
    'nontruncated' => \$nontruncated_option,
    'dry-run'      => \$dry_run_option,
    'f=s'          => \$files_from_option,
);

usage() if $help_option;

# File names must come from somewhere: from -f, from the command line,
# or from both.
unless ( defined $files_from_option or @ARGV )
{
    usage("no files specified");
}

# Statistic counters, reported by -total; all start at zero.
my (
    $total_files_checked,
    $total_files_empty,
    $total_files_truncated,
    $total_files_no_chars_truncated
) = (0) x 4;

my $total_chars_readed    = 0;
my $total_chars_truncated = 0;
# Remove trailing white space from every line of one file, in place.
#
# Argument: the name of the file to process.
# Effects:  updates the $total_* counters; with -truncated / -nontruncated
#           reports the result on STDOUT; rewrites the file unless
#           -dry-run is in effect or nothing had to be removed.
# Dies:     when the file cannot be opened, read, written or closed.
#
# Names that are not plain files and empty files are reported on STDERR
# and skipped.
sub truncate_file($)
{
    my $fname = shift;

    $total_files_checked++;

    if ( ! -f $fname )
    {
        print STDERR "is not a plain file: ".$fname."\n";
        return;
    }

    if ( ! -s $fname )
    {
        print STDERR "zero size file: ".$fname."\n";
        $total_files_empty++;
        return;
    }

    local $/ = undef;	# no records, slurp mode

    # NOTE(review): 2-arg open ignores leading/trailing white space in
    # $fname; the 3-arg form would avoid that but needs perl 5.6 while
    # this script declares "require 5.004".
    local *IN;
    open IN, "< $fname"
        or die "Can't open $fname: $!";
    my $file = <IN>;
    defined $file or die "Can't read $fname: $!";
    close IN;

    my $length_before = length $file;
    $total_chars_readed += $length_before;

    # Strip runs of blanks/control characters (octal 000-011 and 013-040,
    # i.e. everything up to and including space except the newline) that
    # directly precede a newline.  The class contains CR (015), so a
    # CR LF line ending becomes a bare LF.
    $file =~ s/[\000-\011\013-\040]+\n/\n/g;

    # A last line without a terminating newline can carry trailing
    # white space too; after the substitution above this can only match
    # at the very end of the text.
    $file =~ s/[\000-\011\013-\040]+$//;

    my $length_after = length $file;
    my $chars_truncated = $length_before - $length_after;

    die "size become greater after truncating: ".$fname
        if $chars_truncated < 0;

    if ( $chars_truncated > 0 )
    {
        $total_files_truncated++;
        $total_chars_truncated += $chars_truncated;
    }
    else
    {
        $total_files_no_chars_truncated++;
    }

    # The file name is passed as a %s argument, never as part of the
    # format itself, so a '%' in the name cannot corrupt the report.
    if ( $chars_truncated > 0 and $truncated_option )
    {
        printf(STDOUT "%6u of %6u chars truncated from %s\n",
               $chars_truncated, $length_before, $fname);
    }
    elsif ( $chars_truncated == 0 and $nontruncated_option )
    {
        printf(STDOUT "%-16s chars truncated from %s\n", 'no', $fname);
    }

    if ( ! $dry_run_option and $chars_truncated > 0 )
    {
        local *OUT;
        open OUT, "> $fname" or die "Can't open $fname: $!";
        print OUT $file or die "Can't write $fname: $!";
        # buffered write errors surface at close
        close OUT or die "Error on closing $fname: $!";
    }
}
#+ main work

# do process file names from the @ARGV first
while ( defined( my $arg = shift @ARGV ) )
{
    truncate_file($arg);
}

if (defined $files_from_option) # do process file names from file|STDIN
{
    local *IN;
    open(IN, $files_from_option)
        or die "Can't open $files_from_option: $!";

    # one file name per line
    while ( defined( my $line = <IN> ) )
    {
        chomp $line;
        truncate_file($line) if length $line; # skip empty lines
    }
}

#- main work
# Report template for the -total statistics.  Each picture line holding
# @>>>>>> (right-justified) or @<<<<<< (left-justified) fields is followed
# by the line listing the counters that fill those fields; the single "."
# ends the template.
format STDERR =
Total files: checked empty truncated non-truncated
------- ------- ------- -------
@>>>>>> @>>>>>> @>>>>>> @>>>>>>
$total_files_checked, $total_files_empty, $total_files_truncated, $total_files_no_chars_truncated
Total chars truncated: @>>>>>> of @<<<<<<<<<<<<<<<<<
$total_chars_truncated, $total_chars_readed
.
# Emit the report only when -total was given, then finish successfully.
write STDERR if $total_option;
exit 0;
=head1 AUTHOR
Dmitry Fedorov <dm.fedorov@gmail.com>
=head1 COPYRIGHT
Copyright (C) 2003 Dmitry Fedorov <dm.fedorov@gmail.com>
=head1 LICENSE
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License,
or (at your option) any later version.
=head1 DISCLAIMER
The author disclaims any responsibility for any mangling of your system
etc, that this script may cause.
=cut
^ permalink raw reply [flat|nested] 11+ messages in thread