#!/usr/bin/perl

=head1 NAME

bibAddPubMed - Script to add URLs and abstracts from PubMed to entries in a
given BibTeX file.

=head1 SYNOPSIS

bibAddPubMed [-a] input.bib output.bib

=head1 DESCRIPTION

This script takes a BibTeX file as input, searches PubMed for each entry in it, 
and outputs another BibTeX file after adding URLs for the entries it found on 
PubMed.  The URL is added as the last field of the entry.  If the -a option is 
given, the abstracts are also fetched from PubMed and added to the entries.  
The new fields are added to an entry only if it did not have them previously.  
The PubMed match is determined by comparing the title, first author last name 
and publication year. The PubMed database is queried using the eUtils interface 
(see reference below).

A 3-second delay is imposed between sending eUtils queries to comply with their 
user requirements; so this script could take some time to finish executing.  
They also ask that we "run retrieval scripts on weekends or between 9 pm and 5 
am Eastern Time weekdays for any series of more than 100 requests".

A log file named input.bib.log keeps track of the entries in the bib file that 
have been processed.  If the script is aborted and later restarted, processing 
continues at the point at which it was aborted after reading the information 
for the previous entries from the log file.  This means that if you want to 
start over, you have to remove the log file.  In the log file, only those lines 
of URL containing "[match]" were successfully matched with the title of the 
entry.  The output.bib file is created only after processing all entries in 
input.bib.

 Assumptions about the format of the input bib file:
@EntryType{Key,  % Entry begins with '@' on column 1; key on the same line.
  ...            % No empty lines before the closing brace that ends entry.
}                % Closing brace that ends entry is on column 1.

=head1 PREREQUISITES

 The script requires the following modules:
XML::Twig <http://www.xmltwig.com/>
Text::BibTeX <http://www.gerg.ca/software/btOOL/>
LWP::Simple <http://search.cpan.org/dist/libwww-perl/lib/LWP/Simple.pm>
URI::Escape <http://search.cpan.org/search?module=URI::Escape>
Getopt::Std <http://search.cpan.org/search?module=Getopt::Std>

=head1 SEE ALSO

 References for eUtils, PubMed and linking to the Entrez databases: 
http://eutils.ncbi.nlm.nih.gov/entrez/query/static/eutils_help.html
http://eutils.ncbi.nlm.nih.gov/entrez/query/static/help/pmhelp.html
http://www.ncbi.nlm.nih.gov/entrez/query/static/linking.html

=head1 AUTHOR

Vinod Valsalam <vkv@cs.utexas.edu>

=head1 COPYRIGHT AND LICENSE

Copyright 2005-2006, Vinod Valsalam

This program is free software; you may redistribute it and/or modify it under
the same terms as Perl itself.

=head1 SCRIPT CATEGORIES

Educational
Search

=head1 README

This script takes a BibTeX file as input, searches PubMed for each entry in it,
and outputs another BibTeX file after adding URLs and/or abstracts for the 
entries it found on PubMed.

=cut

#######################################################################################


my $version = "0.2";

# The path where your modules are installed.
use lib '/u/nn/bin/modules';  # Add this directory to the @INC search directory list

# Some of these modules are installed in /u/nn/bin/modules/; the rest should
# be installed by default on most systems already.  On Debian or Ubuntu
# systems, you may need to install libxml-perl and libwww-perl.
use XML::Twig;      # http://www.xmltwig.com/
use Text::BibTeX;   # http://www.gerg.ca/software/btOOL/
use LWP::Simple;    # http://search.cpan.org/dist/libwww-perl/lib/LWP/Simple.pm
use URI::Escape;    # http://search.cpan.org/search?module=URI::Escape
use Getopt::Std;    # http://search.cpan.org/search?module=Getopt::Std


#-------------------------------------------------------------------------------------
# Process command line options
#-------------------------------------------------------------------------------------

sub main::HELP_MESSAGE {
    print << 'EndHelp';
Usage: bibAddPubMed [-a] input.bib output.bib

    -a: add PubMed abstracts to entries in addition to URLs
EndHelp
}

sub main::VERSION_MESSAGE {
    print "bibAddPubMed version $version\n";
}

$Getopt::Std::STANDARD_HELP_VERSION = 1;  # Use standard-conforming behavior

my %opts = ();
getopts('a', \%opts);

# Make sure input and output file names are supplied.
if (@ARGV != 2) {
    main::HELP_MESSAGE();
    exit 1;
}

my $input = shift;
my $output = shift;

# Make sure input file exists.
-e $input || die "Input file does not exist";


#-------------------------------------------------------------------------------------
# Process the log file if it already exists, and open it for appending.
#-------------------------------------------------------------------------------------

my $logfile = $input . ".log";
my @log = ();
if (-e $logfile) {
    open(IL, $logfile) || die "Cannot open log file for reading";
    @log = <IL>;
    close IL;
}

my %urlhash = (); # Hash for storing URLs of entries.
my %abshash = (); # Hash for storing abstracts of entries.
my $logline = 0;  # Keeps track of the current line number in the log file.
while ($logline < @log) {
    # Parse the hits line and read URL into %urlhash if there is a match.
    if (my ($loghits, $logkey) = $log[$logline] =~ /^(\d+) hits returned for (.+)$/) {
        $logline++;

        # If there is a matching URL, extract it into the URL hash.
        for (my $i=0; $i<$loghits; $i++) {
            my ($logurl) = $log[$logline] =~ /^(http.+) \[match\]$/;
            $urlhash{$logkey} = $logurl if (($logurl ne "") && (! exists $urlhash{$logkey}));
            $logline++;
        }

        # If no matching URL was found, indicate that by the empty string.
        $urlhash{$logkey} = "" unless exists $urlhash{$logkey};
    }

    # If the log contains the abstract for an entry, read it into %abshash
    if (my ($logkey) = $log[$logline] =~ /^Abstract for (.+)$/) {
        $logline++;

        # Copy the abstract into the hash after removing the new line character.
        chomp $log[$logline];
        $abshash{$logkey} = $log[$logline];
        $logline++;
    }
}

open(OL, ">>$logfile") || die "Cannot open log file for appending";


#-------------------------------------------------------------------------------------
# eUtils stuff.
#-------------------------------------------------------------------------------------

my $db = "pubmed";  # Database to query
my $retmax = "5";  # Maximum number of results to return

# Parameters that NCBI would like us to include in our queries so that they can track
# our usage and notify us of any problems.
my $email = "vkv\@cs.utexas.edu";
my $tool = "bibAddPubMed";

my $eutils = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils";  # eUtils base URL
my $esearch = "$eutils/esearch.fcgi?db=$db&tool=$tool&email=$email&retmax=$retmax";
my $esummary = "$eutils/esummary.fcgi?db=$db&tool=$tool&email=$email&retmax=$retmax";
my $efetch = "$eutils/efetch.fcgi?db=$db&tool=$tool&email=$email&retmax=$retmax";

my $publink = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?";  # Publication base URL
my $pubabst = $publink . "cmd=retrieve&db=pubmed&dopt=abstract&list_uids=";


#-------------------------------------------------------------------------------------
# Find URLs for bib entries and store them in a hash indexed by the bib entry keys.
#-------------------------------------------------------------------------------------

# This bib file input object is used for constructing queries and requires macros to be 
# processed.  So preserve_values must be 0 (default).
my $bibfile0 = new Text::BibTeX::File $input;

while (my $entry0 = new Text::BibTeX::Entry $bibfile0) {
    if (! $entry0->parse_ok()) {
        $entry0->warn("parse error");
        next;
    }

    # Proceed only if regular bibtex entry.
    next if ($entry0->metatype != BTE_REGULAR);

    my $key = $entry0->key();  # bib key of the entry

    # Check to see if we have already processed this entry.
    next if (exists $urlhash{$key});

    # Get the title field of the entry and prepare it for use in the query.
    my $title = $entry0->get('title');
    $title =~ s/\\emph//;
    $title =~ s/\\em / /;
    $title =~ s/\\rm / /;
    $title = Text::BibTeX::purify_string($title);

    my $year = Text::BibTeX::purify_string($entry0->get('year'));

    # Parsing author names seems to be pretty good, although I am not sure how reliable
    # it is if there are unconventional name formats.  We include first author name in
    # the search query to narrow the search because the title specification doesn't seem 
    # to be working as expected (see comment below).
    my @names = $entry0->names('author');
    my @last = $names[0]->part('last') if (@names != 0);  # Last name of first author
    my $author = Text::BibTeX::purify_string($last[0]);

    # Construct the query and retrieve the results.
    # The [ti] modifier is supposed to restrict query to Title, but it doesn't seem to be 
    # working (entry is not found if this modifier is used).
    my $query = uri_escape($title);
    $query .= "+AND+" . $year . "[dp]" if ($year =~ /\d+/);
    $query .= "+AND+" . uri_escape($author) . "[1au]" if ($author ne "");
    my @ids = getuids($query);

    # Accumulate text to be output to log file for this entry and later write to the log
    # file in one shot, to prevent ending up with partial log for an entry in the event
    # of an error.
    my $logtext = scalar(@ids) . " hits returned for " . $key . "\n";

    # Check all ids returned for a direct match with the title.
    for (my $i=0; $i<@ids; $i++) {
        my $puburl = $pubabst . $ids[$i]->text;
        $logtext .= $puburl;

        # If the title corresponding to the id matches, and it is the first match,
        # add the URL to the hash.
        if (compactlc($title) eq compactlc(gettitle($ids[$i]->text))) {
            $urlhash{$key} = $puburl unless exists $urlhash{$key};
            $logtext .= " [match]";
        }

        $logtext .= "\n";
    }

    print OL $logtext;
    flush OL;

    # If no matching URL was found, indicate that by the empty string.
    $urlhash{$key} = "" unless exists $urlhash{$key};
}


# ------------------------------------------------------------------------------------
# If the -a option is given, fetch the abstracts also.
# ------------------------------------------------------------------------------------

if ($opts{'a'}) {
    for my $key (sort keys %urlhash) {
        # Skip if the URL is empty
        next if $urlhash{$key} eq "";

        # Check to see if we have already fetched this abstract.
        next if exists $abshash{$key};

        # Extract PubMed id of the article from the URL and fetch abstract.
        my ($pmid) = $urlhash{$key} =~ /list_uids=(.+)$/;
        $abshash{$key} = getabstract($pmid);

        my $logtext = "Abstract for $key\n";
        $logtext .= $abshash{$key} . "\n";
        print OL $logtext;
        flush OL;
    }
}

close OL;  # Close log file.


# ------------------------------------------------------------------------------------
# Create the output file by copying the input file to it and adding the URL fields.
# ------------------------------------------------------------------------------------

open(IB, $input) || die "Cannot open input file";
open(OB, ">$output") || die "Cannot open output file";

# We use a bib file input object to easily check if a URL or abstract field exists.
my $bibfile1 = new Text::BibTeX::File "$input";
my $entry1 = "";

# Lines are copied from input file to output file with a delay of one line to make
# sure that when a URL field is added, the previous line is terminated with a comma.
my $inentry = 0;
my $key = "";
my $prev = "";
while (<IB>) {
    if (/^@.+\{\s*(\S+)\s*,\s*$/) {  # Beginning of an entry
        $inentry == 0 || die "Unexpected beginning of entry";
        $inentry = 1;
        $key = $1;
        
        # Read the same entry from the bib file object.
        while ($entry1 = new Text::BibTeX::Entry $bibfile1) {
            my $bibkey = $entry1->key();
            last if $key eq $bibkey;
        }
    }
    elsif (/^\}/) {  # End of an entry
        $inentry == 1 || die "Unexpected end of entry";
        $inentry = 0;

        # Before printing the previous line, make sure it ends with a comma.
        if ($prev =~ /^\s*$/) {
            die "Remove empty lines before the closing brace in entry $key";
        }
        else {
            $prev =~ /,\s*$/ || $prev =~ s/\s*$/,\n/;
        }

        # If we found the URL for this entry in PubMed, and if it already doesn't 
        # exist in the bib file, then add it as the last field of the entry.
        if (exists $urlhash{$key} && $urlhash{$key} ne "" && ! $entry1->exists('url')) {
            print OB $prev;
            $prev = "  url          = \"$urlhash{$key}\",\n";
        }

        # If the -a option is given, and if we found the abstract in PubMed, and it 
        # doesn't exist already in the bib file, then add the abstract also.
        if ($opts{'a'} && exists $abshash{$key} && $abshash{$key} ne "" && 
            ! $entry1->exists('abstract')) {
            print OB $prev;
            my $abstract = $abshash{$key};
            $abstract =~ s/%/\\%/g;   # Escape any % characters.
            $abstract =~ s/\"/\'/g;   # Replace " with '
            $abstract = fillstring($abstract);
            $prev = "  abstract     = \"$abstract\",\n";
        }

        $key = "";
    }

    print OB $prev;  # Print previous line of the input file to output file.
    
    $prev = $_;
}
print OB $prev;  # Print last line of the input file to output file.

close IB;
close OB;


# ------------------------------------------------------------------------------------
# This section shows an alternative way of constructing the output file using the
# Text::BibTeX functionality.  However, this method breaks the formatting, so it is
# not used here.
# ------------------------------------------------------------------------------------
#
# A second input object from the same input file is used for constructing the output 
# file without macro substitutions.  So set preserve_values to 1.
#my $bibfile1 = new Text::BibTeX::File "$input";
#$bibfile1->preserve_values(1);
#
#my $newfile = new Text::BibTeX::File ">$output";
#
# Now add newly found URLs to entries and construct the output file.
#while (my $entry1 = new Text::BibTeX::Entry $bibfile1) {
#    my $key = $entry1->key();
#    $entry1->set('url', $urlhash{$key}) if (exists $urlhash{$key} && $urlhash{$key} ne "");
#    $entry1->write($newfile);
#}
#


#-------------------------------------------------------------------------------------
# Query the database and return an array of matching UIDs
#-------------------------------------------------------------------------------------

sub getuids {
    my $query = shift(@_);
    my $url = $esearch . "&term=$query";

    my $xml = get($url) || die "ESearch failed: $url";

    # Sleep for 3 seconds to comply with eutils etiquette.
    sleep 3;
    
    # We are only interested in the IdList returned in the XML document.  These are the
    # unique Ids of the documents matching the query.
    my $twig = new XML::Twig(TwigRoots => {'IdList/Id' => 1});
    $twig->parse($xml);
    my @ids = $twig->root->children;

    return @ids;
}


#-------------------------------------------------------------------------------------
# Query the database and return the title of the specified UID
#-------------------------------------------------------------------------------------

sub gettitle {
    my $uid = shift(@_);
    my $url = $esummary . "&id=$uid";

    my $xml = get($url) || die "ESummary failed: $url";

    # Sleep for 3 seconds to comply with eutils etiquette.
    sleep 3;
    
    # Collect all the items in a twig object.
    my $twig = new XML::Twig(TwigRoots => {'DocSum/Item' => 1});
    $twig->parse($xml);
    my @items = $twig->root->children;

    # We are only interested in the publication title.
    my $title = "";
    foreach my $item (@items) {
        if ($item->{'att'}->{'Name'} eq "Title") {
            $title = $item->text;
            chop $title;  # Remove the full-stop at the end of the title.
            last;
        }
    }

    # Use the same purify function we used for the title from the bib file.
    return Text::BibTeX::purify_string($title);
}


#-------------------------------------------------------------------------------------
# Query the database and return the abstract of the specified UID
#-------------------------------------------------------------------------------------

sub getabstract {
    my $uid = shift(@_);
    my $url = $efetch . "&rettype=abstract&retmode=xml&id=$uid";

    my $xml = get($url) || die "EFetch failed: $url";

    # Sleep for 3 seconds to comply with eutils etiquette.
    sleep 3;

    # We are only interested in the AbstractText returned in the XML document.
    my $twig = new XML::Twig(TwigRoots => {'Abstract/AbstractText' => 1});
    $twig->parse($xml);
    my @abstract = $twig->root->children;

    my $abstxt = "";
    $abstxt = $abstract[0]->text if @abstract == 1;

    return $abstxt;
}


#-------------------------------------------------------------------------------------
# Compacts a string by removing non-essential things and converts letters to lowercase.
#-------------------------------------------------------------------------------------

sub compactlc {
    my $string = shift(@_);
    $string = lc $string;  # Convert to lower case
    $string =~ s/the //g;  # Remove article "the"
    $string =~ s/\s+//g;   # Remove all white space
    return $string;
}


#-------------------------------------------------------------------------------------
# Breaks up a string into lines by replacing white space with new line when the line
# exceeds the specified number of characters.
#-------------------------------------------------------------------------------------

sub fillstring {
    my $string = shift(@_);
    my $strlen = length $string;
    my $width = 53;  # Maximum number of characters (including new line) in a line.

    # Look for the last white space in stretchs of $width characters of the string,
    # and replace them with new line characters.
    my $pos = $width;
    while ($pos < $strlen) {
        my $index = rindex($string, " ", $pos);
        
        # If we couldn't find a space in the current range of characters, relax the
        # max characters in line requirement and look ahead to try to find one.
        if ($index == -1 || $index < $pos-$width) {
            $index = index($string, " ", $pos);
            # If there is no space till end of string, exit loop.
            if ($index == -1) { 
                last;
            }
        }

        # Replace the space with a new line character.
        substr($string, $index, 1, "\n");
        $pos = $index + $width;
    }

    # Insert white space indendation after each new line.
    $string =~ s/\n/\n                  /g;

    return $string;
}


#-------------------------------------------------------------------------------------