#!/usr/bin/perl
#
# $Id: sf-digest.pl,v 1.11 2011/06/03 15:35:00 cherry Exp $
#
# Copyright Tim Rayner, 2002-2009 (tfrayner@gmail.com)
# This code may be modified and distributed on the same terms as the Perl source code.
#
# Script to periodically query a web page, extract a table and identify new rows in that table.
# New results are cached in $savefile; if a second file ($archivefile) has not been modified
# within the time $cacheperiod, the new table rows (i.e. those in $savefile but not in
# $archivefile) are mailed to the $mailto email address. The results are then written out to
# $archivefile to be omitted from future emails.
#
# The reason for this convoluted approach is that we aim to capture table rows which may not
# be present on the web page for very long (in some cases, only a matter of hours). However,
# we want to avoid sending emails more than once a day (change $cacheperiod to alter this
# behaviour). We also want to maintain a persistent cache of results to overcome difficulties
# connecting to the web page (originally sourceforge.net, hardly a paragon of reliability).
#
# Use the command 'sf-digest.pl now' (as opposed to simply 'sf-digest.pl') to override
# the result cache mechanism and mail all the current results now.
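#
# This script is typically driven by cron. As an illustrative example (the
# install path is hypothetical), an hourly crontab entry such as:
#
#   0 * * * * /path/to/sf-digest.pl
#
# lets the script retry unreliable pages frequently, while $cacheperiod
# below still limits the digest email to roughly one per day.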
#

use strict;
use warnings;
use LWP::UserAgent;
use HTTP::Request;
use HTML::TableExtract;
use MIME::Lite;

#############################################
############ User Config Section ############
#############################################

#
# Address to send mail to. You will want to change this.
#
my $mailto='go-consortium@lists.stanford.edu,go-curator-tracker@lists.stanford.edu';

#
# Address to send mail from. Ensure that your SMTP server or local MTA will accept this value.
#
my $mailfrom='cherry@stanford.edu';

#
# Address of the email maintainer. Emails will be BCCed to this address
#
my $maintainer = '';

#
# Subject line of the sent email:
#
my $mailsubject = 'SourceForge Update';

#
# Table columns to extract (in order that they will appear in the email).
# N.B. Keep 'ID' as column 1; the script keys its results on that column,
# so changing it will break things.
#
my $tableheaders = [ 'ID', 'Opened', 'Summary' ];

#
# Lines at start and end of the email body text:
#
my $mailbodyhead =
    "<html>Here are today's new SourceForge requests:<br><br>"
  . "Follow the link to see the full description or to add comments to an item. <br>"
  . "You can use the monitor option to receive any comments added to an item.<br>";
my $mailbodytail =
    "<br>You can also go to the SourceForge GO Curator Requests Tracker<br>"
  . " to see the complete list of submissions:<br>"
  . qq!<a href="http://sourceforge.net/tracker/?atid=440764&group_id=36855&func=browse">http://sourceforge.net/tracker/?atid=440764&group_id=36855&func=browse</a><br><br>!
  . "Signed,<br><br>the sf-digest daemon.</html>";

#
# Save files used to store the last set of data downloaded.
# These default to dotfiles under $ENV{HOME}; change them if you would
# rather keep the cache in another location.
#
my $savefile    = $ENV{HOME} . '/.sf-digest-latest.txt';
my $archivefile = $ENV{HOME} . '/.sf-digest-archive.txt';

#
# SMTP server via which to send mail. If left empty, the local sendmail command is used.
#
my $smtp_server = '';
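#
# (For example: my $smtp_server = 'smtp.example.com'; the hostname is
# purely illustrative.)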

#
# Period for which results are cached prior to emailing them (in seconds). Initially
# set to 1 day minus 15 minutes (85500 seconds)
#
my $cacheperiod = ( 1 * 24 * 60 * 60 ) - ( 15 * 60 );
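#
# (For a weekly digest instead, this could be set to 7 * 24 * 60 * 60.)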

##############################################################################
### URL vars - You shouldn't need to touch these, unless SourceForge does. ###
##############################################################################

#
# Offset between pages
#
my $delta = 50;

#
# Web page to check (a numeric offset, incremented in steps of $delta, is
# appended later; here $delta also sets the number of requests listed per query).
#
my $url =
"http://sourceforge.net/tracker/index.php?func=browse&group_id=36855&atid=440764&limit=${delta}&offset=";

#
# Total limit on number of request IDs to download from the web page. This is a safety feature, and
# as such should not need changing. Change this if the project ever balloons out of control :-)
#
my $limit = 2000;

#
# HTML link embedded in the email, assembled as:
#   $idurl = $idurl_start . '<request ID>' . $idurl_end . '<request ID>' . '</a>';
# (see gettable() below).
#
# - part one of requestID URL:
my $idurl_start =
  '<a href="http://sourceforge.net/tracker/index.php?func=detail&aid=';

#
# - second part of requestID URL:
my $idurl_end = '&group_id=36855&atid=440764">';
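#
# For example, an illustrative request ID of 123456 would produce this link:
#   <a href="http://sourceforge.net/tracker/index.php?func=detail&aid=123456&group_id=36855&atid=440764">123456</a>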

#############################################
########## End User Config Section ##########
#############################################

sub strip_whitespace {

    my ($str) = @_;

    return q{} unless defined $str;

    # Trim leading/trailing whitespace, then remove any embedded tabs,
    # newlines or carriage returns (rows are tab-joined later, so stray
    # tabs would corrupt the save file format).
    $str =~ s/^\s*(.*?)\s*$/$1/s;
    $str =~ s/[\t\n\r]//g;

    return $str;
}

sub gettable {

    # Download the table data, return a hashref with column 1 as key
    # and the other columns as values, joined in a tab-delimited
    # string.
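    #
    # e.g. (illustrative values) $results{123456} would hold the HTML link
    # for ID 123456, then the 'Opened' date and 'Summary' text, tab-joined.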

    my ( $url, $delta, $limit, $tableheaders ) = @_;

    my %results;

    # Cap the total number of rows fetched at $limit to avoid looping forever.
    OFFSET:
    for ( my $offset = 0 ; $offset < $limit ; $offset += $delta ) {

        my $pageurl  = $url . $offset;
        my $ua       = LWP::UserAgent->new( timeout => 10 );
        my $request  = HTTP::Request->new( 'GET', $pageurl );
        my $response = $ua->request($request);

        if ( $response->is_success ) {
            my $te = HTML::TableExtract->new( headers => $tableheaders );
            $te->parse( $response->content );

            last OFFSET unless $te->table_states;    # No more table to parse

            foreach my $ts ( $te->table_states ) {
                foreach my $rowref ( $ts->rows ) {

                    my @row =
                      map { strip_whitespace($_) }
                      @{$rowref};

                    # Skip header/placeholder rows (SourceForge-specific).
                    next if ( $row[0] =~ /^ID$/
                        || $row[0] =~ /^\s*Assignee:/
                        || $row[0] =~ /^\s*No results/ );

                    # Format data and push into %results
                    my $idurl =
                        $idurl_start
                      . $row[0]
                      . $idurl_end
                      . $row[0] . "</a>";
                    $results{ $row[0] } =
                      join( "\t", $idurl, @row[ 1 .. $#row ] );
                }
            }
        }
        else {
            warn( "Error: " . $response->status_line . "\n" );
            last OFFSET;
        }
    }
    return \%results;
}

sub readfile {

    # Read in the old results file, return old results hashref.
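    # Each line is expected to be "<request ID>\t<tab-joined row data>",
    # as written by writefile() below.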

    my ($file) = @_;
    my %oldresults;

    open( my $savefh, '<', $file ) or do {
        warn("No save file found; one will be created at '$file'.\n");
        return undef;
    };

    while ( my $line = <$savefh> ) {
        chomp $line;

        # Skip any line not in the expected "<ID>\t<data>" format.
        next unless $line =~ /^(\w+)\t(.*)/;
        $oldresults{$1} = $2;
    }
    close $savefh;
    return \%oldresults;
}

sub writefile {

    # Write new results to save file.

    my ( $file, $results ) = @_;

    open( my $savefh, '>', $file )
      or die("Could not open save file for writing: $!\n");

    foreach my $key ( sort keys %{$results} ) {
        print $savefh "$key\t$results->{$key}\n";
    }

    close $savefh
      or die("Could not close save file: $!\n");
}

############
### Main ###
############

# Set the cache period to zero if we're called with the 'now'
# directive (i.e. 'sf-digest.pl now').
if ( $ARGV[0] && ( $ARGV[0] eq 'now' ) ) { $cacheperiod = 0; }

# Get old and new table data; overwrite old save file with new data.
my $resref     = readfile($savefile);
my %allresults = $resref ? %{$resref} : ();
my %newresults = %{ gettable( $url, $delta, $limit, $tableheaders ) };

# Merge the hashes to prevent false positives when SourceForge times out,
# then write everything out to the save file.
@allresults{ keys %newresults } = values %newresults;
writefile( $savefile, \%allresults );

# We can either quit now or send the new message.
# If the archive file is older than $cacheperiod (by default, one day minus
# fifteen minutes), or if it does not exist (i.e. first run), we send the message.
if (   ( !-f $archivefile )
    || ( ( ( stat($archivefile) )[9] ) <= ( time - $cacheperiod ) ) )
{

    # Read in the archive file.
    my $archiveref = readfile($archivefile);
    my %archiveresults = $archiveref ? %{$archiveref} : ();

    # Construct the main mail body text; omit entries already in the archive.
    my @mailbody;
    foreach my $id ( sort keys %allresults ) {

        # Strip out non-ASCII characters (some mail readers prefer plain ASCII).
        $allresults{$id} =~ s/[^[:ascii:]]//g;
        push( @mailbody, "$allresults{$id}\n" ) unless $archiveresults{$id};
    }

    # Construct the rest of the mail and send it.
    if (@mailbody) {    # Don't send if there are no changes.

        # Finish off the mail.
        unshift( @mailbody, $mailbodyhead );
        push( @mailbody, $mailbodytail );

        my $body = join( "<br>", @mailbody );

        # Construct the MIME::Lite object.
        my $mail = MIME::Lite->new(
            From     => $mailfrom,
            To       => $mailto,
            Bcc      => $maintainer,
            Subject  => $mailsubject,
            Encoding => 'quoted-printable',
            Data     => $body,
            Type     => 'text/html',
        );

        # Append an empty plain-text part so that anything added after
        # sending (e.g. a mailing-list footer) does not end up inside the
        # HTML part. Comment this out if it is unnecessary.
        $mail->attach(
            Data => ' ',
            Type => 'text/plain',
        );

        # Finally, send the mail.
        if ($smtp_server) {
            $mail->send( 'smtp', $smtp_server );
        }
        else {
            $mail->send();
        }
    }

    # Merge all data and spew it into the archive.
    @archiveresults{ keys %allresults } = values %allresults;
    writefile( $archivefile, \%archiveresults );
}
