#!/usr/bin/perl -w use strict; # mbox_anal [-S] # # I suppose you could use this on things other than mbox, but # it might not be of much use. However, it doesn't really assume # anything about the structure of the file(s) it opens. # # It analyses the input file(s) line by line. It will keep track # of how many lines exceed 78 characters in length (the recommended # maximum length of a line in the RFCs), how many lines exceed # 998 characters (at which point, mail services may chop the line # or refuse to accept the message), how many lines have non-standard # EOL behavior. # # The -S flag is used to print out summary stats. # # The applicable license for distributing this module, is that # which is common across much of the perl world, that being you get # to use your choice of either The Artistic License of perl, # or the GPL. You can find copies of the Artistic License at # perl.com (and other places), and you can find copies of the GNU # Public License at Gnu.org (and other places). # # Gordon Haverland, Matter Realisations, perl@materialisations.com # 2003/05/11 use Getopt::Std; use vars qw($opt_S); getopts('S'); my( $fname, # Name of file currently open $lineno, # Line no of data in file $line, # A line of data. $length, # The string length of a line of data. $Illegal, # Number of illegal length (> 998) lines $Unrecommended, # Number of excessive length (> 78) lines $non_rfc_eol, # Flag for having seen non-RFC-2282 EOL behavior %EOL, # Hash of EOL counters %hist, # Hash of histogram data (geometric bin widths) ); while( $fname = shift( @ARGV ) ) { unless( -e $fname ) { print STDERR "Warning: $fname doesn't exist, skipping\n"; } $lineno = 0; $Illegal = 0; $Unrecommended = 0; unless( open( MBOX, "< $fname" ) ) { print STDERR "Warning, Can't open $fname as mbox: $!\n"; } else { while( $line = ) { $lineno++; $line = &examine_eol( $line ); $length = length( $line ); if( $length > 78 ) { if( $length > 988 ) { &ma_warn( $fname, $lineno, "Illegal length $length" ); $Illegal++; } else { &ma_warn( $fname, $lineno, "Unrecommended length $length" ); $Unrecommended++; } &build_hist( $length ); } } if( $opt_S ) { # Spit out summary stats, if desired if( $non_rfc_eol > 0 ) { # Print out EOL stats print "End-of-Line behavior\n"; print "RFC Compliant: $EOL{RFC}\n" if( exists($EOL{RFC}) ); print "UNIX EOL: $EOL{UNIX}\n" if( exists($EOL{UNIX}) ); print "Macintosh EOL: $EOL{Mac}\n" if( exists($EOL{Mac}) ); print "QNX EOL: $EOL{QNX}\n" if( exists($EOL{QNX}) ); print "NEL (EBCDIC?): $EOL{NEL}\n" if( exists($EOL{NEL}) ); print "Strange EOL: $EOL{Strange}\n" if( exists($EOL{Strange}) ); } $length = 0; if( $Unrecommended > 0 ) { print "Excessive (unrecommended) line lengths on $Unrecommended lines\n"; $length++; } if( $Illegal > 0 ) { print "Illegal line lengths on $Illegal lines\n"; $length++; } if( $length > 0 ) { # Print histogram print "Distribution of line lengths\n"; print " Bin Count\n"; printf(" 79-102 %5d\n", $hist{' 79-102'}) if( exists( $hist{' 79-102'} ) ); printf("103-131 %5d\n", $hist{'103-131'}) if( exists( $hist{'103-131'} ) ); printf("132-169 %5d\n", $hist{'132-169'}) if( exists( $hist{'132-169'} ) ); printf("170-218 %5d\n", $hist{'170-218'}) if( exists( $hist{'170-218'} ) ); printf("219-281 %5d\n", $hist{'219-281'}) if( exists( $hist{'219-281'} ) ); printf("282-362 %5d\n", $hist{'282-362'}) if( exists( $hist{'282-362'} ) ); printf("363-466 %5d\n", $hist{'363-466'}) if( exists( $hist{'363-466'} ) ); printf("467-601 %5d\n", $hist{'467-601'}) if( exists( $hist{'467-601'} ) ); printf("602-774 %5d\n", $hist{'602-774'}) if( exists( $hist{'602-774'} ) ); printf("775-998 %5d\n", $hist{'775-998'}) if( exists( $hist{'775-998'} ) ); printf(" 999+ %5d\n", $hist{' 999+'}) if( exists( $hist{' 999+'} ) ); } } close( MBOX ); } } sub examine_eol { my $line = shift; if( $line =~ s/^(.*)\n\r$/$1/ ) { # RFC-2822 email EOL # Do nothing, this is normal $EOL{RFC}++; } elsif( $line =~ s/^(.*)\n$/$1/ ) { # UNIX EOL &ma_warn( $fname, $lineno, "UNIX EOL" ); $EOL{UNIX}++; $non_rfc_eol++; } elsif( $line =~ s/^(.*)\r$/$1/ ) { # Mac EOL &ma_warn( $fname, $lineno, "Mac EOL" ); $EOL{Mac}++; $non_rfc_eol++; } elsif( $line =~ s/^(.*)\x1E$/$1/ ) { # QNX EOL &ma_warn( $fname, $lineno, "QNX EOL" ); $EOL{QNX}++; $non_rfc_eol++; } elsif( $line =~ s/^(.*)\x85$/$1/ ) { # (EBCDIC NEL?) EOL &ma_warn( $fname, $lineno, "NEL EOL" ); $EOL{NEL}++; $non_rfc_eol++; } else { # Bad EOL &ma_warn( $fname, $lineno, "Strange EOL" ); $EOL{Strange}++; $non_rfc_eol++; } return( $line ); } sub ma_warn { my $fname = shift; my $lineno = shift; return if( $opt_S ); # User wants summary, not blow by blow if( $#_ == -1 ) { # No reason print "Warning: $fname($lineno)\n"; } elsif( $#_ == 0 ) { # Reason given print "Warning: $fname($lineno) - $_[0]\n"; } else { die "Strange call to ma_warn\n"; } } sub build_hist { my $length = shift; # Evenly space boundaries from ln(79) chars to ln(998) to make # a geometric progression for bin width. # Build up histogram. ln(79)=4.37 ln(998)=6.91 # 79-102, 103-131, 132-169, 170-218, 219-281, 282-362, 363-466, # 467-601, 602-774, 775-998, 998+ if( $length < 103 ) { # We already know it's > 78 $hist{' 79-102'}++; } elsif( $length < 132 ) { $hist{'103-131'}++; } elsif( $length < 170 ) { $hist{'132-169'}++; } elsif( $length < 219 ) { $hist{'170-218'}++; } elsif( $length < 282 ) { $hist{'219-281'}++; } elsif( $length < 363 ) { $hist{'282-362'}++; } elsif( $length < 467 ) { $hist{'363-466'}++; } elsif( $length < 602 ) { $hist{'467-601'}++; } elsif( $length < 775 ) { $hist{'602-774'}++; } elsif( $length < 998 ) { $hist{'775-998'}++; } else { $hist{' 999+'}++; } }