Contribute: Plugins/ImportPluginsAWS
Contents
Import Plugin Tutorial 2: Amazon Web Services
In the last tutorial we created an import plugin that took data which needed very little modification to import into the repository. The column names in the CSV file matched the names of metadata fields present in the repository. In this tutorial we'll look at importing data that needs some modification before it can be imported, needs more error checking, and is obtained in a different way.
We'll be using Amazon's E-Commerce Web Service to import books from their website into our repository, given a list of ASINs (Amazon Standard Identification Numbers).
Before You Start
Amazon Web Services
To use Amazon's web services you must first sign up for an account here. Their site has extensive documentation on the services that they offer, as well as example programs, including some written in Perl.
Required Modules
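To prepare for this tutorial you should make sure the LWP::UserAgent module is installed. Running the following command as root, or with sudo, should install it. The plugin also uses URI::Escape, which is part of the URI distribution that LWP depends on.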
cpan LWP::UserAgent
AWS.pm
package EPrints::Plugin::Import::MyPlugins::AWS;

# Import plugin that reads a list of ASINs (one per line), looks each one up
# with Amazon's E-Commerce Service "ItemLookup" operation and turns every
# result whose product group is "Book" into an eprint.

use EPrints::Plugin::Import::TextFile;
use strict;
use warnings;

use URI::Escape;

our @ISA = ('EPrints::Plugin::Import::TextFile');

# Fixed parameters sent with every ItemLookup request.
# NOTE(review): later versions of Amazon's product API are understood to
# require signed requests -- confirm this unsigned call is still accepted.
my $endpoint  = "http://ecs.amazonaws.co.uk/onca/xml";
my $accesskey = '<YOURAMAZONWSKEY>';
my $service   = "AWSECommerceService";
my $operation = "ItemLookup";
my $version   = "2007-07-16";

# Optional ItemAttributes elements and the eprint field each one is copied to.
my @optional_fields = (
    [ ISBN            => 'isbn' ],
    [ NumberOfPages   => 'pages' ],
    [ Publisher       => 'publisher' ],
    [ PublicationDate => 'date' ],
);

# Constructor. Registers the plugin as 'AWS', producing eprint lists and
# objects. If LWP::UserAgent cannot be loaded the plugin is hidden and an
# error message is recorded instead.
sub new
{
    my( $class, %params ) = @_;

    my $self = $class->SUPER::new( %params );

    $self->{name}    = 'AWS';
    $self->{visible} = 'all';
    $self->{produce} = [ 'list/eprint', 'dataobj/eprint' ];

    my $rc = EPrints::Utils::require_if_exists('LWP::UserAgent');
    unless( $rc )
    {
        $self->{visible} = '';
        $self->{error}   = 'Module LWP::UserAgent not found.';
    }

    return $self;
}

# Reads $opts{fh} line by line, converts each line with convert_input and
# creates a data object in $opts{dataset} for every line that converts.
# Returns an EPrints::List holding the ids of the objects created.
sub input_fh
{
    my( $plugin, %opts ) = @_;

    my @ids;
    my $fh = $opts{fh};

    # Read one line at a time rather than slurping the whole file.
    while( defined( my $input_data = <$fh> ) )
    {
        # Blank lines carry no ASIN, so do not spend a web request on them.
        next unless $input_data =~ m/\S/;

        my $epdata = $plugin->convert_input( $input_data );
        next unless defined $epdata;

        my $dataobj = $plugin->epdata_to_dataobj( $opts{dataset}, $epdata );
        push @ids, $dataobj->get_id if defined $dataobj;
    }

    return EPrints::List->new(
        dataset => $opts{dataset},
        session => $plugin->{session},
        ids     => \@ids );
}

# Looks up the ASIN found in $input and returns a reference to a hash of
# eprint field values. Returns nothing (undef in scalar context) after
# reporting the problem through $plugin->error when: no ASIN is found, the
# HTTP request fails, the response cannot be parsed, the response is missing
# expected elements or is flagged invalid, or the product is not a book.
sub convert_input
{
    my( $plugin, $input ) = @_;

    # Extract the identifier. The match result must be checked: after a
    # failed match $1 does not hold anything from this line. Nine digits
    # followed by X is accepted so that an ISBN-10 style ASIN keeps its
    # check digit; otherwise the first run of digits is used.
    my $asin;
    if( defined $input && $input =~ m/([0-9]{9}[Xx]|[0-9]+)/ )
    {
        $asin = uc $1;
    }
    else
    {
        $plugin->error("No ASIN found in input");
        return;
    }

    # Build the request URL. The access key is escaped because it is the
    # only value here that may contain characters unsafe in a query string.
    my $request_url = "$endpoint?" . join( '&',
        "Service=$service",
        "AWSAccessKeyId=" . uri_escape( $accesskey ),
        "Operation=$operation",
        "ItemId=$asin",
        "Version=$version",
        "ResponseGroup=Large,EditorialReview",
    );

    # Send the request
    my $ua = LWP::UserAgent->new;
    $ua->timeout(30);
    my $response = $ua->get( $request_url );
    unless( $response->is_success )
    {
        $plugin->error( "AWS request failed: " . $response->status_line );
        return;
    }

    # Create domtree. A malformed body should fail this record only,
    # not abort the whole import.
    my $dom = eval { EPrints::XML::parse_xml_string( $response->content ) };
    unless( defined $dom )
    {
        $plugin->error("Could not parse AWS response");
        return;
    }

    # Get and check Amazon Response
    my $items = _first_element( $dom, "Items" );
    unless( defined $items )
    {
        $plugin->error("No Items element found");
        return;
    }

    my $is_valid = _element_text( _first_element( $items, "Request" ), "IsValid" );
    unless( defined $is_valid && $is_valid eq 'True' )
    {
        $plugin->error("Invalid AWS Request");
        return;
    }

    # Get Item Object
    my $item = _first_element( $items, "Item" );
    unless( defined $item )
    {
        $plugin->error("No Item element found");
        return;
    }

    # Get Attribute Object
    my $attr = _first_element( $item, "ItemAttributes" );
    unless( defined $attr )
    {
        $plugin->error("No ItemAttributes element found");
        return;
    }

    # Check this is a book
    my $pg = _element_text( $attr, "ProductGroup" );
    unless( defined $pg && $pg eq 'Book' )
    {
        $plugin->error("Product is not a book.");
        return;
    }

    # Populate Hash
    my %output = (
        type        => "book",
        refereed    => "FALSE",
        ispublished => "pub",
    );

    # Add Title
    my $title = _element_text( $attr, "Title" );
    $output{title} = $title if defined $title;

    # Add URL
    my $url = _element_text( $item, "DetailPageURL" );
    $output{official_url} = uri_unescape( $url ) if defined $url;

    # Add ISBN, Number of Pages, Publisher and Publication Date when present
    foreach my $pair ( @optional_fields )
    {
        my( $tag, $field ) = @{$pair};
        my $value = _element_text( $attr, $tag );
        $output{$field} = $value if defined $value;
    }

    return \%output;
}

# Returns the first descendant element of $node named $tag, or undef when
# $node is undefined or has no such element.
sub _first_element
{
    my( $node, $tag ) = @_;

    return undef unless defined $node;

    my $first = $node->getElementsByTagName( $tag )->item(0);
    return $first;
}

# Returns the UTF-8 text of the first descendant element of $node named
# $tag, or undef when there is no such element.
sub _element_text
{
    my( $node, $tag ) = @_;

    my $element = _first_element( $node, $tag );
    return undef unless defined $element;

    my $text = EPrints::Utils::tree_to_utf8( $element );
    return $text;
}

1;
In More Detail
use URI::Escape;
my $endpoint = "http://ecs.amazonaws.co.uk/onca/xml"; my $accesskey = '<YOURAMAZONWSKEY>'; my $service = "AWSECommerceService"; my $operation = "ItemLookup"; my $version = "2007-07-16";
Constructor
$self->{produce} = [ 'list/eprint' , 'dataobj/eprint'];
my $rc = EPrints::Utils::require_if_exists('LWP::UserAgent'); unless ($rc) { $self->{visible} = ''; $self->{error} = 'Module LWP::UserAgent not found.'; }
Input
input_fh
my @ids;
my $fh = $opts{fh}; my @records = <$fh>;
foreach my $input_data (@records) { my $epdata = $plugin->convert_input($input_data); next unless defined $epdata; my $dataobj = $plugin->epdata_to_dataobj($opts{dataset},$epdata); if( defined $dataobj ) { push @ids, $dataobj->get_id; } }
return EPrints::List->new( dataset => $opts{dataset}, session => $plugin->{session}, ids=>\@ids );
convert_input
$input =~ m/([0-9]+)/; $input = $1;
#Perform the request my $request = "$endpoint?". "Service=$service&". "AWSAccessKeyId=$accesskey&". "Operation=$operation&". "ItemId=$input&". "Version=$version&". "ResponseGroup=Large,EditorialReview&";
#Send the request my $ua = LWP::UserAgent->new; $ua->timeout(30); my $response = $ua->get($request);
#Create domtree my $dom = EPrints::XML::parse_xml_string($response->content);
#Get and check Amazon Response my $rep = $dom->getElementsByTagName("Items")->item(0)-> getElementsByTagName("Request")->item(0); my $reptext = EPrints::Utils::tree_to_utf8($rep->getElementsByTagName("IsValid")->item(0)); unless ($reptext eq 'True') { $plugin->error("Invalid AWS Request"); return undef; }
#Get Item Object my $item = $dom->getElementsByTagName("Items")->item(0)-> getElementsByTagName("Item")->item(0); unless (defined $item) { $plugin->error("No Item element found"); return undef; }
#Get Attribute Object my $attr = $item->getElementsByTagName("ItemAttributes")->item(0);
#Check this is a book my $pg = EPrints::Utils::tree_to_utf8($attr->getElementsByTagName("ProductGroup")->item(0)); unless ($pg eq 'Book') { $plugin->error("Product is not a book."); return undef; }
#Populate Hash $output{type} = "book"; $output{refereed} = "FALSE"; $output{ispublished} = "pub";
#Add Title my $title = $attr->getElementsByTagName("Title")->item(0); $output{title} = EPrints::Utils::tree_to_utf8($title);
#Add URL my $url = $item->getElementsByTagName("DetailPageURL")->item(0); $output{official_url} = uri_unescape(EPrints::Utils::tree_to_utf8($url));
#Add ISBN my $isbn = $attr->getElementsByTagName("ISBN")->item(0); if (defined $isbn) { $output{isbn} = EPrints::Utils::tree_to_utf8($isbn); }
#Add Number of Pages my $pages = $attr->getElementsByTagName("NumberOfPages")->item(0); if (defined $pages) { $output{pages} = EPrints::Utils::tree_to_utf8($pages); }
#Add Publisher/Publication Date my $publisher = $attr->getElementsByTagName("Publisher")->item(0); if (defined $publisher) { $output{publisher} = EPrints::Utils::tree_to_utf8($publisher); }
my $pubdate = $attr->getElementsByTagName("PublicationDate")->item(0); if (defined $pubdate) { $output{date} = EPrints::Utils::tree_to_utf8($pubdate); } return \%output;