Contribute: Plugins/ImportPluginsAWS
Contents
Import Plugin Tutorial 2: Amazon Web Services
In the last tutorial we created an import plugin that took data which needed very little modification to import into the repository. The column names in the CSV file matched the names of metadata fields present in the repository. In this tutorial we'll look at importing data that needs some modification before it can be imported, needs more error checking, and is obtained in a different way.
We will be using Amazon's E-Commerce Web Service to import books from their website into our repository, given a list of ASINs (Amazon Standard Identification Numbers).
We will be accessing the service using a REST approach, communicating with the server using URL parameters and retrieving an XML document in response to our request. It is also possible to access their services using SOAP, but that will not be discussed here.
Before You Start
Amazon Web Services
To use Amazon's web services you must first sign up for an account here. Their site has extensive documentation on the services that they offer, as well as example programs, including some written in Perl.
Required Modules
To prepare for this tutorial you should make sure the LWP::UserAgent module is installed. Running the following command as root, or using sudo, should work.
cpan LWP::UserAgent
AWS.pm
package EPrints::Plugin::Import::MyPlugins::AWS;

use strict;
use warnings;

use EPrints::Plugin::Import::TextFile;
use URI::Escape;

our @ISA = ('EPrints::Plugin::Import::TextFile');

# Parameters shared by every web service request built in convert_input.

# Server that receives the request (the UK endpoint).
my $endpoint = "http://ecs.amazonaws.co.uk/onca/xml";

# Access key obtained when signing up for Amazon Web Services.
# Replace the placeholder with your own (non-secret) access key.
my $accesskey = '<YOURAMAZONWSKEY>';

# Service, operation and API version named in the request.
my $service = "AWSECommerceService";
my $operation = "ItemLookup";
my $version = "2007-07-16";

# Comma-separated response groups; controls how much data is returned.
my $responsegroup = "Large,EditorialReview";
# Constructor.
#
# Builds the plugin through the parent class constructor, then records the
# plugin name, its visibility and the kinds of object it can produce.
# If LWP::UserAgent cannot be loaded, the visible setting is cleared and an
# error message is stored on the plugin instead.
#
# Parameters:
#   $class  - class name the constructor was invoked on.
#   %params - passed through unchanged to the parent constructor.
#
# Returns the new plugin object.
sub new
{
        my( $class, %params ) = @_;

        my $self = $class->SUPER::new( %params );

        @{$self}{qw( name visible )} = ( 'AWS', 'all' );
        $self->{produce} = [ 'list/eprint', 'dataobj/eprint' ];

        # LWP::UserAgent is needed to talk to the web service.
        if( !EPrints::Utils::require_if_exists('LWP::UserAgent') )
        {
                $self->{visible} = '';
                $self->{error} = 'Module LWP::UserAgent not found.';
        }

        return $self;
}
# Import every ASIN listed on the supplied file handle.
#
# Parameters (passed as named options):
#   fh      - file handle to read from, one ASIN per line.
#   dataset - dataset the imported objects are created in.
#
# Each non-blank line is handed to convert_input; when that returns data,
# the data is turned into an object with epdata_to_dataobj and the id of
# the object is remembered.
#
# Returns an EPrints::List holding the ids of the objects that were created.
sub input_fh
{
        my( $plugin, %opts ) = @_;

        my @ids;
        my $fh = $opts{fh};

        # Read one line at a time rather than loading the whole file.
        while( defined( my $input_data = <$fh> ) )
        {
                # A blank line cannot contain an ASIN; skip it instead of
                # making a pointless web service request.
                next unless $input_data =~ m/\S/;

                my $epdata = $plugin->convert_input($input_data);
                next unless defined $epdata;

                my $dataobj = $plugin->epdata_to_dataobj($opts{dataset},$epdata);
                if( defined $dataobj )
                {
                        push @ids, $dataobj->get_id;
                }
        }

        return EPrints::List->new(
                        dataset => $opts{dataset},
                        session => $plugin->{session},
                        ids=>\@ids );
}
# Look up one ASIN with the web service and convert the response into a
# hash of eprint field values.
#
# Parameters:
#   $plugin - the import plugin object; used to report errors.
#   $input  - one line of input text containing an ASIN.
#
# Returns a reference to a hash containing type, refereed and ispublished
# and, when present in the response, title, official_url, isbn, pages,
# publisher and date. Returns undef, after reporting the problem through
# $plugin->error, when no ASIN is found, the request fails, the response
# cannot be parsed or is missing required elements, or the product is not
# a book.
sub convert_input
{
        my ($plugin, $input) = @_;
        my %output = ();

        # The match must be checked before $1 is used: after a failed
        # match $1 does not hold a value from this line.
        # NOTE(review): only the first run of digits is kept, so an
        # identifier ending in a letter would be truncated - confirm
        # whether such identifiers need to be supported.
        unless (defined $input && $input =~ m/([0-9]+)/)
        {
                $plugin->error("No ASIN found in input");
                return undef;
        }
        my $asin = $1;

        # Build the request from the file-level settings and the ASIN.
        my $request =
                "$endpoint?".
                "Service=$service&".
                "AWSAccessKeyId=$accesskey&".
                "Operation=$operation&".
                "ItemId=$asin&".
                "Version=$version&".
                "ResponseGroup=$responsegroup";

        # Send the request.
        my $ua = LWP::UserAgent->new;
        $ua->timeout(30);
        my $response = $ua->get($request);
        unless ($response->is_success)
        {
                $plugin->error("AWS request failed: ".$response->status_line);
                return undef;
        }

        # Create the DOM tree; a parse failure is reported, not fatal.
        my $dom = eval { EPrints::XML::parse_xml_string($response->content) };
        unless (defined $dom)
        {
                $plugin->error("Could not parse AWS response");
                return undef;
        }

        # Get and check Amazon's verdict on the request.
        my $items = $dom->getElementsByTagName("Items")->item(0);
        unless (defined $items)
        {
                $plugin->error("No Items element found");
                return undef;
        }
        my $rep = $items->getElementsByTagName("Request")->item(0);
        my $valid = defined $rep
                ? $rep->getElementsByTagName("IsValid")->item(0)
                : undef;
        my $reptext = defined $valid
                ? EPrints::Utils::tree_to_utf8($valid)
                : '';
        unless ($reptext eq 'True')
        {
                $plugin->error("Invalid AWS Request");
                return undef;
        }

        # Get the item.
        my $item = $items->getElementsByTagName("Item")->item(0);
        unless (defined $item)
        {
                $plugin->error("No Item element found");
                return undef;
        }

        # Get the item's attributes.
        my $attr = $item->getElementsByTagName("ItemAttributes")->item(0);
        unless (defined $attr)
        {
                $plugin->error("No ItemAttributes element found");
                return undef;
        }

        # Check this is a book.
        my $pgnode = $attr->getElementsByTagName("ProductGroup")->item(0);
        my $pg = defined $pgnode
                ? EPrints::Utils::tree_to_utf8($pgnode)
                : '';
        unless ($pg eq 'Book')
        {
                $plugin->error("Product is not a book.");
                return undef;
        }

        # Values that are the same for every imported book.
        $output{type} = "book";
        $output{refereed} = "FALSE";
        $output{ispublished} = "pub";

        # Title.
        my $title = $attr->getElementsByTagName("Title")->item(0);
        if (defined $title)
        {
                $output{title} = EPrints::Utils::tree_to_utf8($title);
        }

        # URL of the product page; it is a child of the item itself,
        # not of the attributes.
        my $url = $item->getElementsByTagName("DetailPageURL")->item(0);
        if (defined $url)
        {
                $output{official_url} =
                        uri_unescape(EPrints::Utils::tree_to_utf8($url));
        }

        # Optional attributes: eprint field name => response element name.
        my @optional = (
                [ isbn      => "ISBN" ],
                [ pages     => "NumberOfPages" ],
                [ publisher => "Publisher" ],
                [ date      => "PublicationDate" ],
        );
        foreach my $pair (@optional)
        {
                my( $field, $tag ) = @$pair;
                my $node = $attr->getElementsByTagName($tag)->item(0);
                next unless defined $node;
                $output{$field} = EPrints::Utils::tree_to_utf8($node);
        }

        return \%output;
}
1;
In More Detail
We will use the URI::Escape module in this plugin. As it is included with EPrints we don't need to check if it exists first.
use URI::Escape;
Here we set up a number of values for parameters that will be part of our web service requests. The endpoint variable determines which server will be sent the request. Here we have used the UK server, but by changing the TLD we can use the US, Canadian, German or French servers.
The accesskey stores the access key you will have gained from signing up to Amazon earlier. You should use the normal access key and not the secret one.
Here we use the ItemLookup operation of the AWSECommerceService with the 2007-07-16 version of the service API. Other operations allow searching for items, but here we want to look up specific products. Finally, the variable responsegroup determines the amount and nature of the information returned; we request the "Large" and "EditorialReview" groups in this case, giving a lot of information about the item.
# Request parameters, as declared at the top of AWS.pm.
my $endpoint = "http://ecs.amazonaws.co.uk/onca/xml";
my $accesskey = '<YOURAMAZONWSKEY>';
my $service = "AWSECommerceService";
my $operation = "ItemLookup";
my $version = "2007-07-16";
my $responsegroup = "Large,EditorialReview";
Constructor
The constructor is similar to the one used for the CSV plugin, except this one will import individual eprints, given an ASIN.
        $self->{produce} = [ 'list/eprint' , 'dataobj/eprint'];
Just as we imported Text::CSV in the last tutorial, here we import LWP::UserAgent, which will be used for making requests to the web service.
        my $rc = EPrints::Utils::require_if_exists('LWP::UserAgent');
        unless ($rc)
        {
                $self->{visible} = '';
                $self->{error} = 'Module LWP::UserAgent not found.';
        }
Input
input_fh
This method is similar to the one used in the CSV plugin, but doesn't have to do quite so much work.
First we create the array to hold our imported eprint ids.
my @ids;
Next we read all the lines in the supplied file handle into our records array.
        my $fh = $opts{fh};
        my @records = <$fh>;
Then we iterate over each record, running convert_input on it, importing it into our repository and adding the id to our array.
        foreach my $input_data (@records)
        {
                my $epdata = $plugin->convert_input($input_data);
                next unless defined $epdata;
                my $dataobj = $plugin->epdata_to_dataobj($opts{dataset},$epdata);
                if( defined $dataobj )
                {
                        push @ids, $dataobj->get_id;
                }
        }
Then we return a List object of the items imported.
        return EPrints::List->new(
                        dataset => $opts{dataset},
                        session => $plugin->{session},
                        ids=>\@ids );
convert_input
An ASIN identifies a product. The ASINs handled here are strings of decimal digits, which may have leading zeroes. We take the first run of digits on the line as the ASIN, discarding any non-numeric characters that surround it.
        $input =~ m/([0-9]+)/;
        $input = $1;
We form the request from the variables we created earlier and the ASIN we have just obtained.
        my $request =
                "$endpoint?".
                "Service=$service&".
                "AWSAccessKeyId=$accesskey&".
                "Operation=$operation&".
                "ItemId=$input&".
                "Version=$version&".
                "ResponseGroup=$responsegroup";
        #Send the request
        my $ua = LWP::UserAgent->new;
        $ua->timeout(30);
        my $response = $ua->get($request);
        #Create domtree
        my $dom = EPrints::XML::parse_xml_string($response->content);
        #Get and check Amazon Response
        my $rep =
                $dom->getElementsByTagName("Items")->item(0)->
                getElementsByTagName("Request")->item(0);
        my $reptext =
                EPrints::Utils::tree_to_utf8($rep->getElementsByTagName("IsValid")->item(0));
        unless ($reptext eq 'True') 
        {
                $plugin->error("Invalid AWS Request");
                return undef;
        }
        #Get Item Object
        my $item =
                $dom->getElementsByTagName("Items")->item(0)->
                getElementsByTagName("Item")->item(0);
        unless (defined $item) 
        {
                $plugin->error("No Item element found");
                return undef;
        }
        #Get Attribute Object
        my $attr = $item->getElementsByTagName("ItemAttributes")->item(0);
        #Check this is a book
        my $pg = EPrints::Utils::tree_to_utf8($attr->getElementsByTagName("ProductGroup")->item(0));
        unless ($pg eq 'Book') 
        {
                $plugin->error("Product is not a book.");
                return undef;
        }
        #Populate Hash
        $output{type} = "book";
        $output{refereed} = "FALSE";
        $output{ispublished} = "pub";
        #Add Title
        my $title = $attr->getElementsByTagName("Title")->item(0);
        $output{title} = EPrints::Utils::tree_to_utf8($title);
        #Add URL
        my $url = $item->getElementsByTagName("DetailPageURL")->item(0);
        $output{official_url} = uri_unescape(EPrints::Utils::tree_to_utf8($url));
        #Add ISBN
        my $isbn = $attr->getElementsByTagName("ISBN")->item(0);
        if (defined $isbn)
        {
                $output{isbn} = EPrints::Utils::tree_to_utf8($isbn);
        }
        #Add Number of Pages
        my $pages = $attr->getElementsByTagName("NumberOfPages")->item(0);
        if (defined $pages)
        {
                $output{pages} = EPrints::Utils::tree_to_utf8($pages);
        }
        #Add Publisher/Publication Date
        my $publisher = $attr->getElementsByTagName("Publisher")->item(0);
        if (defined $publisher)
        {
                $output{publisher} = EPrints::Utils::tree_to_utf8($publisher);
        }
        my $pubdate = $attr->getElementsByTagName("PublicationDate")->item(0);
        if (defined $pubdate)
        {
                $output{date} = EPrints::Utils::tree_to_utf8($pubdate);
        }
        return \%output;
