#!/usr/bin/perl -w use strict; # msort [-y] [-m] [-d] [-r] [-R] [-h] # Sort a Mail Box by times. # # We can split the sorted output by year, or by year and month. We can # (attempt to) pick the Date out of the Date field, or from somewhere # in the Received headers. There can be other Dates in the header as # well (From and possibly X-* headers). My dummy message I put in mboxen # I build from scratch, has Jan 1, 1970 for it. It looks like each # received header has at most, one Date in it. It would be nice if all # Dates only had numeric timezone information, but I suspect some will # have alphabetical (which aren't unique unfortunately). # We are ignoring time zones. It has been mentioned that Date::Manip will # eventually support time zones within its internal data structures. # Dates are stored internally as the packed ISO-like format used by # Date::Manip. # YYYYMMDDHH:MM:SS # I will mention, that the Date::Manip function ParseDate, will parse # its own output. Which is something that is nice to see. # # This script uses the newer Mail::Box::Manager, but people really # should move away from mbox formats anyway. # # The applicable license for distributing this module, is that # which is common across much of the perl world, that being you get # to use your choice of either The Artistic License of perl, # or the GPL. You can find copies of the Artistic License at # perl.com (and other places), and you can find copies of the GNU # Public License at Gnu.org (and other places). # # Gordon Haverland, Matter Realisations, perl@materialisations.com # 2003/05/11 # This code is NOT meant to be efficient. If so, I would scarcely be # using Date::Manip in it. Rather, it is meant to be more or less # obvious. It is also not meant to be production code. It probably # has a few problems in it. Any I have found, I have either documented # or fixed. # # Okay, this program works with the default Date: header, and with # Received: headers (-r tested, -R assumed to work). It has been # tested on mail archives containing at least one message, which has # no date information in the header. It probably splits a file # into sorted, year/month files. Probably the only thing that should # be added, is code to test for the existance of files with the same # name as the new sorted files which are written. Exercise to the # user as far as I am concerned. :-) # I hadn't written any code to handle missing Date's except at beginning # or end of mail folder. Then I have it my composite INBOX of 20-30k messages, # and found O(3000) mails with no Date's. Oh well, that's what you get # for being lazy. use Getopt::Std; use Mail::Box::Manager; use Date::Manip; # Swiss army chainsaw of perl date modules # Parse command line options # y Year # m Month (implies Year) # d Use Date field (or approximation, default) # r Use earliest date in Received headers # R Use latest date in Received headers # h Help, usage message my( %arg, %Msg, $cut, $prog_name ); { my @junk = split(m|/|, $0); $prog_name = $junk[-1]; } getopts('ymdrRh', \%arg); exit 0 unless( @ARGV ); my $inbox = shift( @ARGV ); # By default, use Date. if( exists( $arg{r} ) || exists( $arg{R} ) ) { &usage() if( exists( $arg{r} ) && exists( $arg{R} ) ); } else { $arg{d}++ ; } if( exists( $arg{m} ) ) { # Sort by year/month $cut = 2; } elsif( exists( $arg{y} ) ) { $cut = 1; } else { $cut = 0; } # We need 1 manager, and 2 folders. Open our inbox here. Wait for later # to "publish" our sorted mail. my $mgr = Mail::Box::Manager->new; my $folder = $mgr->open( folder => "$inbox", extract => 'LAZY', # Don't deal with body unless needed access => 'r', create => '0', ); # Get our list of messages. If we call ->messages in scalar context, # we get the number of messages in the folder. It seems, messages # start at 0, not 1. my $N_messages = $folder->messages; foreach my $i (0..$N_messages) { my $message = $folder->message($i); next unless( $message ); my $Date; # Our message may have a Date field, so we can get that. # Our message may have multiple received header fields, the earliest # and/or latest Date in there might be useful. if( exists( $arg{d} ) ) { unless( $Date = $message->get('Date') ) { # Set time to undef, deal with later. $Date = undef; } else { $Date = &ParseDate( $Date ); } # We need to use the message->head->get() form. If we use message->get(), # it only gets a single Received header, and it doesn't get the complete # header (it stops after a semicolon, leaving the date behind. } elsif( exists( $arg{r} ) ) { my @received = $message->head->get('Received'); $Date = &get_date( -1, @received ); } elsif( exists( $arg{R} ) ) { my @received = $message->head->get('Received'); $Date = &get_date( 1, @received ); } else { # Never happens print STDERR "No DATE type specified\n"; &usage(); } # We should have the proper $Date for each message now. Save it. if( $Date ) { $Msg{$i} = $Date; } else { $Msg{$i} = undef; } } # Deal with 'undef' Dates now. Assume Mail folder has local chronological # order, and that dates have been lost/hidden. So, order in file is # at least partially correct. Undef at beginning gets set to some seconds # before the first known Date. Undef at end to some seconds after last # known Date, and undef in middle to some interpolation. &handle_missing( $N_messages ); # We need to sort the values, or rather we want the keys in the order # of increasing value of the key. Should use Date_Cmp($Msg{$a}, $Msg{$b}) # which will handle time zones properly, in the future (ignored at present). my @list = sort { $Msg{$a} cmp $Msg{$b} } keys %Msg; my $begin = $Msg{$list[0]}; my $end; if( $cut == 0 ) { $end = $Msg{$list[-1]}; } elsif( $cut == 1 ) { $end = &end_year( $begin ); } elsif( $cut == 2 ) { $end = &end_month( $begin ); } do { my( @this, $name, $delta); # Push elements onto @this list, until we get a time past end. Unshift # the elements from @list as we do this. %Msg has no order to speak of, # but @list is ordered on the Dates of %Msg. So, once we find an element # of list which isn't less than end, we stop the push/shifting of elements. while( @list && &Date_Cmp( $Msg{$list[0]}, $end ) <= 0 ) { push( @this, shift( @list ) ); } # We now have complete list, "publish it". $end contains a Date:Manip # version of the end of the time period. Extract end_year and end_month # strings by reg_exp. $end =~ /^(\d\d\d\d)(\d\d)(\d\d)(\d\d):(\d\d):(\d\d)$/; ($name, $delta) = &make_name( $cut, $inbox, $1, $2 ); &publish( $name, @this ); $end = &DateCalc( $end, $delta ); } while( @list ); # ============= Subroutines follow ================= sub usage { print STDERR "Usage: $prog_name [-(y|m)] [-(d|r|R)] [-h] mail_folder\n"; print STDERRR "\twhere:"; print STDERRR "\t -y Split mail by years"; print STDERRR "\t -m Split mail by months and years"; print STDERRR "\t -d Split mail by Date field (default)"; print STDERRR "\t -r Split mail by earliest Received field"; print STDERRR "\t -R Split mail by latest Received field"; print STDERRR "\t -h Generate this usage message"; exit 0; } sub get_date { my $direction = shift; my @received = @_; my( @dates, $date ); foreach my $string (@received) { # ParseDate finds the first "date" in string. Are there ever multiple # "dates" in a string? Any correlation in time? The string has the # form: # Received: ...; Date_Time next unless( $string ); if( $string =~ /^.*;([^;]+)$/i ) { $string = $1; } else { next; } my $ret = &ParseDate( $string ); push( @dates, $ret ) if( $string && $ret ); } return( undef ) unless( @dates ); my @sorted = sort { $a cmp $b } @dates; if( $direction == 1 ) { $date = $sorted[-1]; } elsif( $direction == -1 ) { $date = $sorted[0]; } else { die "Bad format\n"; } return( $date ); } sub make_name { my $cut = shift; my $base = shift; my $end_y = ($cut > 0) ? shift : undef; my $end_m = ($cut > 1) ? shift : undef; my( $name, $delta ); if( $cut == 0 ) { $name = "$base-sorted"; $delta = "+0d"; } elsif( $cut == 1 ) { $name = "$base-sorted-$end_y"; $delta = "+1year"; } elsif( $cut == 2 ) { $name = "$base-sorted-$end_y-$end_m"; $delta = "+1month"; } else { die "huh?\n"; } return( $name, $delta ); } sub publish { my $name = shift; my @list = @_; # Open a folder to hold our sorted data with $name. Copy the data # from $folder, to our $sorted folder. When done, close the $sorted # folder to finish things up. my $sorted = $mgr->open( folder => "$name", extract => 'LAZY', access => 'rw', create => '1', ); foreach my $msg (@list) { my $message = $folder->message($msg); $message->copyTo( $sorted ); } $sorted->close; } sub handle_missing { my $N = shift; # Look for missing data at beginning. foreach my $i (0..$N) { if( exists( $Msg{$i} ) ) { # Message number $i existed, was there data? if( $Msg{$i} ) { # Yes. Is there earlier missing data? my $j = $i - 1; my $count = 1; do { if( exists( $Msg{$j} ) ) { # Yes. # Data to patch. $i points to last seen data. my $delta = sprintf("-%ds", $count); $Msg{$j} = &DateCalc( $Msg{$i}, $delta ); $count++; } $j = $j - 1; } while( $j >= 0 ); last; # Done patching } } } # Look for missing data at end. foreach my $i ($N..0) { if( exists( $Msg{$i} ) ) { # Message number $i existed, was there data? if( $Msg{$i} ) { # Yes. Is there earlier missing data? my $j = $i + 1; my $count = 1; do { if( exists( $Msg{$j} ) ) { # Yes. # Data to patch. $i points to last seen data. my $delta = sprintf("+%ds", $count); $Msg{$j} = &DateCalc( $Msg{$i}, $delta ); $count++; } $j = $j + 1; } while( $j <= $N ); last; # Done patching } } } # Now look for missing data in the center. Step across until we # a hash element with an undef value. foreach my $i (0..$N) { if( $Msg{$i} ) { next; } last if( $i >= $N ); # Okay, we found the beginning of undef data. Where's the end? my $j; foreach my $i2 ($i..$N) { if( $Msg{$i2} ) { $j = $i2-1; last; } } if( ! $j ) { die "Strange, no end of undef-Date\n"; } # $i..{$j-1} is all undef. Fill in the gap from the ends. my $n = ($j - $i) / 2; # int( ($j - $i + 0.5) / 2 ); ?? # printf "Replacing %d undef-Dates\n", $j-$i+1; # if( $n > 2 ) { # print "Hello\n"; # } $begin = $Msg{$i-1}; $end = $Msg{$j+1}; foreach my $d (0..$n) { $begin = &DateCalc( $begin, "+1s" ); $end = &DateCalc( $end, "-1s" ); $Msg{$j-$d} = $end; $Msg{$i+$d} = $begin; } # Finished that gap, continue scanning } # Contrary to promises, I haven't yet written this code. One, I # don't think it happens much. Two, I haven't quite figured out # how I want to treat it. I am inclined to "Date" into the void # at one second intervals from either end symmetrically. If by # some accident, the time difference across the interval is negative, # this will partition about half of the data to one part of the # file, and about half to the other side. The real solution, is # to not have to do this by algorithm. } sub end_year { my $day = shift; my $end; # Find the date/time corresponding to the end of the year in which # this $day occurs. In the case of a year, we know that the end is # Dec 31 at 23:59:59. However, months have varying ends, while # the beginning is constant. So, calculate the beginning of the # next year, and then subtract 1 second to be consistent in algorithm # between these 2 functions. my $date = &ParseDate( $day ); return( undef ) unless( $date ); if( $date =~ /(\d\d\d\d)(\d\d)(\d\d)(\d\d):(\d\d):(\d\d)/ ) { my $year = $1 + 1; $end = sprintf("%04d%02d%02d%02d:%02d:%02d", $year, 1, 1, 0, 0, 0 # Jan 1 at 00:00:00 ); $end = &DateCalc( $end, "-1s" ); return( $end ); } else { return( undef ); } } sub end_month { my $day = shift; my $end; # Find the date/time corresponding to the end of the month in which # this $day occurs. In the case of a year, we know that the end is # Dec 31 at 23:59:59. However, months have varying ends, while # the beginning is constant. So, calculate the beginning of the # next year, and then subtract 1 second to be consistent in algorithm # between these 2 functions. my $date = &ParseDate( $day ); return( undef ) unless( $date ); if( $date =~ /(\d\d\d\d)(\d\d)(\d\d)(\d\d):(\d\d):(\d\d)/ ) { $end = sprintf("%04d%02d%02d%02d:%02d:%02d", $1, $2, 1, 0, 0, 0 # Jan 1 at 00:00:00 ); $end = &DateCalc( $end, "+1 month" ); # This wraps at Dec properly. $end = &DateCalc( $end, "-1s" ); return( $end ); } else { return( undef ); } }