#!/usr/bin/perl
#
# $Id: sf-digest.pl,v 1.11 2011/06/03 15:35:00 cherry Exp $
#
# Copyright Tim Rayner, 2002-2009 (tfrayner@gmail.com)
# This code may be modified and distributed on the same terms as the Perl source code.
#
# Script to periodically query a web page, extract a table and identify new rows in that table.
# New results are cached in $savefile; if a second file ($archivefile) has not been modified
# within the time $cacheperiod, the new table rows (i.e. those in $savefile but not in
# $archivefile) are mailed to the $mailto email address. The results are then written out to
# $archivefile to be omitted from future emails.
#
# The reason for this convoluted approach is that we aim to capture table rows which may not
# be present on the web page for very long (in some cases, only a matter of hours). However,
# we want to avoid sending emails more than once a day (change $cacheperiod to alter this
# behaviour). We also want to maintain a persistent cache of results to overcome difficulties
# connecting to the web page (originally sourceforge.net, hardly a paragon of reliability).
#
# Use the command 'sf-digest.pl now' (as opposed to simply 'sf-digest.pl') to override
# the result cache mechanism and mail all the current results now.
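#
# This script is typically driven by cron. As an illustrative example (the
# install path is hypothetical), an hourly crontab entry such as:
#
#   0 * * * * /path/to/sf-digest.pl
#
# lets the script retry unreliable pages frequently, while $cacheperiod
# below still limits the digest email to roughly one per day.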
#

use strict;
use warnings;
use LWP::UserAgent;
use HTTP::Request;
use HTML::TableExtract;
use MIME::Lite;

#############################################
############ User Config Section ############
#############################################

#
# Address to send mail to. You will want to change this.
#
my $mailto='go-consortium@lists.stanford.edu,go-curator-tracker@lists.stanford.edu';

#
# Address to send mail from. Ensure that your SMTP server or local MTA will accept this value.
#
my $mailfrom='cherry@stanford.edu';

#
# Address of the email maintainer. Emails will be BCCed to this address
#
my $maintainer = '';

#
# Subject line of the sent email:
#
my $mailsubject = 'SourceForge Update';

#
# Table columns to extract (in order that they will appear in the email).
# N.B. Keep 'ID' as column 1; the script keys its results on that column,
# so changing it will break things.
#
my $tableheaders = [ 'ID', 'Opened', 'Summary' ];

#
# Lines at start and end of the email body text:
#
my $mailbodyhead =
    "<html>Here are today's new SourceForge requests:<br><br>"
  . "Follow the link to see the full description or to add comments to an item. <br>"
  . "You can use the monitor option to receive any comments added to an item.<br>";
my $mailbodytail =
    "<br>You can also go to the SourceForge GO Curator Requests Tracker<br>"
  . " to see the complete list of submissions:<br>"
  . qq!<a href="http://sourceforge.net/tracker/?atid=440764&group_id=36855&func=browse">http://sourceforge.net/tracker/?atid=440764&group_id=36855&func=browse</a><br><br>!
  . "Signed,<br><br>the sf-digest daemon.</html>";

#
# Save files used to store the last set of data downloaded.
# These default to dotfiles under $ENV{HOME}; change them if you would
# rather keep the cache in another location.
#
my $savefile    = $ENV{HOME} . '/.sf-digest-latest.txt';
my $archivefile = $ENV{HOME} . '/.sf-digest-archive.txt';

#
# SMTP server via which to send mail. If left empty, the local sendmail command is used.
#
my $smtp_server = '';
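#
# (For example: my $smtp_server = 'smtp.example.com'; the hostname is
# purely illustrative.)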

#
# Period for which results are cached prior to emailing them (in seconds). Initially
# set to 1 day minus 15 minutes (85500 seconds)
#
my $cacheperiod = ( 1 * 24 * 60 * 60 ) - ( 15 * 60 );
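#
# (For a weekly digest instead, this could be set to 7 * 24 * 60 * 60.)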

##############################################################################
### URL vars - You shouldn't need to touch these, unless SourceForge does. ###
##############################################################################

#
# Offset between pages
#
my $delta = 50;

#
# Web page to check (a numeric offset, incremented in steps of $delta, is
# appended later; here $delta also sets the number of requests listed per query).
#
my $url =
"http://sourceforge.net/tracker/index.php?func=browse&group_id=36855&atid=440764&limit=${delta}&offset=";

#
# Total limit on number of request IDs to download from the web page. This is a safety feature, and
# as such should not need changing. Change this if the project ever balloons out of control :-)
#
my $limit = 2000;

#
# HTML link embedded in the email, assembled as:
#   $idurl = $idurl_start . '<request ID>' . $idurl_end . '<request ID>' . '</a>';
# (see gettable() below).
#
# - part one of requestID URL:
my $idurl_start =
  '<a href="http://sourceforge.net/tracker/index.php?func=detail&aid=';

#
# - second part of requestID URL:
my $idurl_end = '&group_id=36855&atid=440764">';
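#
# For example, an illustrative request ID of 123456 would produce this link:
#   <a href="http://sourceforge.net/tracker/index.php?func=detail&aid=123456&group_id=36855&atid=440764">123456</a>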

#############################################
########## End User Config Section ##########
#############################################

sub strip_whitespace {

    my ($str) = @_;

    return q{} unless defined $str;

    # Trim leading/trailing whitespace, then remove any embedded tabs,
    # newlines or carriage returns (rows are tab-joined later, so stray
    # tabs would corrupt the save file format).
    $str =~ s/^\s*(.*?)\s*$/$1/s;
    $str =~ s/[\t\n\r]//g;

    return $str;
}

sub gettable {

    # Download the table data, return a hashref with column 1 as key
    # and the other columns as values, joined in a tab-delimited
    # string.
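    #
    # e.g. (illustrative values) $results{123456} would hold the HTML link
    # for ID 123456, then the 'Opened' date and 'Summary' text, tab-joined.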

    my ( $url, $delta, $limit, $tableheaders ) = @_;

    my %results;

    # Cap the total number of rows fetched at $limit to avoid looping forever.
    OFFSET:
    for ( my $offset = 0 ; $offset < $limit ; $offset += $delta ) {

        my $pageurl  = $url . $offset;
        my $ua       = LWP::UserAgent->new( timeout => 10 );
        my $request  = HTTP::Request->new( 'GET', $pageurl );
        my $response = $ua->request($request);

        if ( $response->is_success ) {
            my $te = HTML::TableExtract->new( headers => $tableheaders );
            $te->parse( $response->content );

            last OFFSET unless $te->table_states;    # No more table to parse

            foreach my $ts ( $te->table_states ) {
                foreach my $rowref ( $ts->rows ) {

                    my @row =
                      map { strip_whitespace($_) }
                      @{$rowref};

                    # Skip header/placeholder rows (SourceForge-specific).
                    next if ( $row[0] =~ /^ID$/
                        || $row[0] =~ /^\s*Assignee:/
                        || $row[0] =~ /^\s*No results/ );

                    # Format data and push into %results
                    my $idurl =
                        $idurl_start
                      . $row[0]
                      . $idurl_end
                      . $row[0] . "</a>";
                    $results{ $row[0] } =
                      join( "\t", $idurl, @row[ 1 .. $#row ] );
                }
            }
        }
        else {
            warn( "Error: " . $response->status_line . "\n" );
            last OFFSET;
        }
    }
    return \%results;
}

sub readfile {

    # Read in the old results file, return old results hashref.
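    # Each line is expected to be "<request ID>\t<tab-joined row data>",
    # as written by writefile() below.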

    my ($file) = @_;
    my %oldresults;

    open( my $savefh, '<', $file ) or do {
        warn("No save file found; one will be created at '$file'.\n");
        return undef;
    };

    while ( my $line = <$savefh> ) {
        chomp $line;

        # Skip any line not in the expected "<ID>\t<data>" format.
        next unless $line =~ /^(\w+)\t(.*)/;
        $oldresults{$1} = $2;
    }
    close $savefh;
    return \%oldresults;
}

sub writefile {

    # Write new results to save file.

    my ( $file, $results ) = @_;

    open( my $savefh, '>', $file )
      or die("Could not open save file for writing: $!\n");

    foreach my $key ( sort keys %{$results} ) {
        print $savefh "$key\t$results->{$key}\n";
    }

    close $savefh
      or die("Could not close save file: $!\n");
}

############
### Main ###
############

# Set the cache period to zero if we're called with the 'now'
# directive (i.e. 'sf-digest.pl now').
if ( $ARGV[0] && ( $ARGV[0] eq 'now' ) ) { $cacheperiod = 0; }

# Get old and new table data; overwrite old save file with new data.
my $resref     = readfile($savefile);
my %allresults = $resref ? %{$resref} : ();
my %newresults = %{ gettable( $url, $delta, $limit, $tableheaders ) };

# Merge the hashes to prevent false positives when SourceForge times out,
# then write everything out to the save file.
@allresults{ keys %newresults } = values %newresults;
writefile( $savefile, \%allresults );

# We can either quit now or send the new message.
# If the archive file is older than $cacheperiod (by default, one day minus
# fifteen minutes), or if it does not exist (i.e. first run), we send the message.
if (   ( !-f $archivefile )
    || ( ( ( stat($archivefile) )[9] ) <= ( time - $cacheperiod ) ) )
{

    # Read in the archive file.
    my $archiveref = readfile($archivefile);
    my %archiveresults = $archiveref ? %{$archiveref} : ();

    # Construct the main mail body text; omit entries already in the archive.
    my @mailbody;
    foreach my $id ( sort keys %allresults ) {

        # Strip out non-ASCII characters (some mail readers prefer plain ASCII).
        $allresults{$id} =~ s/[^[:ascii:]]//g;
        push( @mailbody, "$allresults{$id}\n" ) unless $archiveresults{$id};
    }

    # Construct the rest of the mail and send it.
    if (@mailbody) {    # Don't send if there are no changes.

        # Finish off the mail.
        unshift( @mailbody, $mailbodyhead );
        push( @mailbody, $mailbodytail );

        my $body = join( "<br>", @mailbody );

        # Construct the MIME::Lite object.
        my $mail = MIME::Lite->new(
            From     => $mailfrom,
            To       => $mailto,
            Bcc      => $maintainer,
            Subject  => $mailsubject,
            Encoding => 'quoted-printable',
            Data     => $body,
            Type     => 'text/html',
        );

        # Append an empty plain-text part so that anything added after
        # sending (e.g. a mailing-list footer) does not end up inside the
        # HTML part. Comment this out if it is unnecessary.
        $mail->attach(
            Data => ' ',
            Type => 'text/plain',
        );

        # Finally, send the mail.
        if ($smtp_server) {
            $mail->send( 'smtp', $smtp_server );
        }
        else {
            $mail->send();
        }
    }

    # Merge all data and spew it into the archive.
    @archiveresults{ keys %allresults } = values %allresults;
    writefile( $archivefile, \%archiveresults );
}
