#!/usr/bin/perl -w
use strict;
use warnings;

# clean_mbox
#
# The mbox format for folders of email is not a good format for
# storing email in.  And, the family of perl modules of Mail::Folder
# are more or less obsolete.  That notwithstanding, this is a
# perl script which does a rough job of cleaning a mbox file
# using Mail::Folder and Mail::Internet.
#
# Usage: clean_mbox MBOX_FILE [MBOX_FILE ...]
#
# For every file named on the command line the script renames FILE to
# FILE.bak, reads the messages from FILE.bak, tidies each one, and writes
# the result to a freshly created FILE.  A file whose FILE.bak already
# exists is skipped.
#
# The applicable license for distributing this module, is that
# which is common across much of the perl world, that being you get
# to use your choice of either The Artistic License of perl,
# or the GPL.  You can find copies of the Artistic License at
# perl.com (and other places), and you can find copies of the GNU
# Public License at Gnu.org (and other places).
#
# Gordon Haverland, Matter Realisations, perl@materialisations.com
# 2003/05/11

use Mail::Folder::Mbox;
use Mail::Internet;

# Get mbox file names off the command line.  The defined() test keeps a
# file that happens to be named "0" from ending the loop early.
while ( defined( my $fname = shift @ARGV ) ) {
    my $backup = "$fname.bak";

    # Never overwrite an existing backup.
    if ( -e $backup ) {
        print "Backup of $fname ($fname.bak) exists, skipping\n";
        next;
    }

    # Move the original out of the way before any mail is transferred.
    if ( !rename $fname, $backup ) {
        warn "Cannot rename $fname to $backup: $!, skipping\n";
        next;
    }

    # Open the backup as the source folder.
    my $old_folder =
        Mail::Folder->new( 'AUTODETECT', $backup, MailFrom => 'KEEP' );
    if ( !$old_folder ) {
        warn "Cannot open $backup as a mail folder, skipping\n";

        # Nothing has been written yet, so put the file back where it was;
        # otherwise the leftover backup would make later runs skip it.
        rename $backup, $fname
            or warn "Cannot restore $fname from $backup: $!\n";
        next;
    }

    # Create the destination folder under the original name.
    my $new_folder = Mail::Folder->new(
        'mbox', $fname,
        Create   => 1,
        MailFrom => 'KEEP',
    );
    if ( !$new_folder ) {
        warn "Cannot create $fname; the original mail is still in $backup\n";
        $old_folder->close;
        next;
    }

    # Get a list of the existing messages and loop over them in
    # numeric order.
    my @msg_list = sort { $a <=> $b } $old_folder->message_list;

    foreach my $msg (@msg_list) {

        # Get a copy of the message.  Without a message object the method
        # calls below would die, so skip the message instead.
        my $email = $old_folder->get_message($msg);
        if ( !$email ) {
            warn "Cannot read message $msg from $backup, skipping it\n";
            next;
        }

        # Get the header and clean it up.  There isn't much we can do.
        # We can attempt to better refold the header lines, but unfold()
        # followed by fold(78) doesn't guarantee 78 characters per line
        # maximum.  And the built in cleanup() doesn't do much either.
        # Sorry.
        my $hdr = $email->head();
        $hdr->unfold();
        $hdr->cleanup();
        $hdr->fold(78);

        # Unless we actually know what is in the body, there isn't much
        # we can do to clean it up either.  Getting rid of extraneous
        # leading/trailing space in front of/behind body is about it.
        $email->tidy_body();
        my $body = $email->body();

        # This doesn't really work as well as I would like, but I have
        # seen mbox with a variety of end-of-line behavior.  Hopefully
        # this makes things common.
        #
        # $line aliases the element, so the substitution edits the body
        # in place.  It removes trailing whitespace and replaces any run
        # of CR, LF, RS (\x1E) or NEL (\x85) at the end of the line with a
        # single CRLF, which is the end-of-line RFC-2822 specifies.  A
        # line with no terminator at all is left untouched.
        # NOTE(review): \x85 is treated as NEL here; in some 8-bit code
        # pages it is a printable character -- confirm this is acceptable
        # for the mail being processed.
        foreach my $line ( @{$body} ) {
            $line =~ s/\s*[\n\r\x1E\x85]+\z/\r\n/;
        }

        # That's all the generic cleaning I know of.  Reconstruct the
        # message and put it in the new mbox.
        # NOTE(review): the leading 'nothing' argument is passed through
        # unchanged from the original script; confirm the installed
        # Mail::Internet accepts it.
        my $new_msg = Mail::Internet->new(
            'nothing',
            Header   => $hdr,
            Body     => $body,
            MailFrom => 'KEEP',
        );
        $new_folder->append_message($new_msg);
    }

    # Sync and close the new folder, just close the old one.
    $new_folder->sync;
    $new_folder->close;
    $old_folder->close;
}