batch_ingest.pl
Date HTML Created
27 Apr 2010
#!/usr/bin/perl
#
# Script name: batch_ingest.pl
# Purpose:
# This script will take an excel file that describes the metadata for a batch ingest
# along with a directory containing the assets. Each row of the excel file will
# be converted into xml files of the form required by the DSPACE batch ingest system.
# The script will confirm that there were no errors found by the java code. If there
# are any errors the script terminates with no files being ingested.
#
# Arguments:
# $dir_name_input - name of the directory that contains the excel file and
# the directory containing the assets.
#
# Directory Structure
# $top_dir
# |
# batch_upload
# |
# | - logs (contains datestamped files with records of runs)
# |
# | - dir_name_input (user input containing excel and assets)
# | |
# | | - IngestMetaData.xls (contains the metadata name is fixed)
# | | - Assets_to_Ingest (directory contains the assets name is fixed)
# | - temp_output_dir_for_batch_aaa (directory generated by DCBatch, xml files
# that were created from the excel file are here)
#
# J. Silvis
# 2010 04 06
###########################################################################################
use XML::LibXML;
use DBI;
use strict;
use warnings;

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Name of the upload directory given on the command line; it is handed to
# DCBatch unchanged.
my $dir_name_input = $ARGV[0];
die "usage: $0 dir_name_input\n"
    unless defined $dir_name_input && length $dir_name_input;

# Top-level directory of the DSpace installation.
my $top_dir       = '/dspace/dspace-ir/';
my $xml_dir       = $top_dir . 'batch_upload/temp_output_dir_for_batch_aaa/IngestMetaData/';
my $batch_log_dir = $top_dir . 'batch_upload/logs';

# Database connection settings.
# NOTE(review): the original comment said these are for "strip1" while the
# host below is strip3 -- confirm which machine is intended.
# FIXME: credentials are hard-coded in the script; consider reading them from
# a protected configuration file or the environment.
my $user     = 'dspace_user';
my $pw       = 'dspace_pswd';
my $database = "dspace_ir";
my $host     = "strip3.oit.umn.edu";

# ---------------------------------------------------------------------------
# Log file name: batch_upload_YYYY_MM_DD_HH:MM.log
# ---------------------------------------------------------------------------

# localtime returns the year as an offset from 1900 and the month as 0..11,
# so both are adjusted before formatting.
my ($min, $hour, $mday, $mon, $year) = (localtime time)[1 .. 5];
my $date_stamp = sprintf '%04d_%02d_%02d_%02d:%02d',
    $year + 1900, $mon + 1, $mday, $hour, $min;
my $log_file = $batch_log_dir . "/batch_upload_" . $date_stamp . ".log";

# ---------------------------------------------------------------------------
# Step 1: convert the excel rows to xml
# ---------------------------------------------------------------------------

# The java code in DCBatch will convert rows of the excel file into xml files
# for the DSPACE code ItemImport. Its output starts the log file for this run.
# Every interpolated value is shell-quoted so that spaces or shell
# metacharacters in the directory name cannot alter the command.
my $dcbatch_cmd = './dsrun edu/umn/dspace/batch_upload/DCBatch '
    . _shell_quote($dir_name_input) . ' > ' . _shell_quote($log_file);
my $dcbatch_failure = _command_failure(system($dcbatch_cmd));

# DCBatch reports problems with the spreadsheet as lines containing "ERROR".
my $error_count = -e $log_file ? _count_error_lines($log_file) : 0;

if ($error_count > 0) {
    # NOTE(review): the script has always finished with exit status 0 on this
    # path; that is kept here, but a non-zero status may be preferable.
    _log_message($log_file, "ERROR in excel file -- no files ingested");
}
elsif (defined $dcbatch_failure) {
    # DCBatch did not run to a clean finish, so the generated xml cannot be
    # trusted even though no ERROR line was written.
    _log_message($log_file, "ERROR: DCBatch $dcbatch_failure -- no files ingested");
    exit 1;
}
else {
    # -----------------------------------------------------------------------
    # Step 2: ingest the generated xml with ItemImport
    # -----------------------------------------------------------------------
    # Get command line parameters for the java program ItemImport from
    # config.xml. The two command line parameters that will be obtained are:
    #   1) eperson_id    - the id of the user that will ingest the files.
    #   2) collection_id - the id of the collection where the items will
    #                      reside in DSPACE
    my $parser  = XML::LibXML->new();
    my $doc     = $parser->parse_file($xml_dir . '/config.xml');
    my $dsn     = "dbi:Pg:dbname=$database;host=$host";
    my $mapfile = $xml_dir . '/mapfile.txt';

    foreach my $config_para ($doc->findnodes('/config_parameters')) {
        # config.xml holds the eperson's netid; the numeric id comes from
        # the database.
        my $eperson    = $config_para->findvalue('./eperson');
        my $eperson_id = _lookup_eperson_id($dsn, $user, $pw, $eperson);
        if (!defined $eperson_id || !length $eperson_id) {
            _log_message($log_file,
                "ERROR: no eperson found for netid '$eperson' -- no files ingested");
            exit 1;
        }

        # The collection_id is taken directly from the config.xml file.
        my $collection_id = $config_para->findvalue('./collection_id');
        if (!length $collection_id) {
            _log_message($log_file,
                "ERROR: no collection_id in config.xml -- no files ingested");
            exit 1;
        }

        # Remove any map file left by an earlier run before ItemImport is
        # asked to write a new one; a missing file is not an error.
        unlink $mapfile;

        my $cmd = join ' ',
            './dsrun org.dspace.app.itemimport.ItemImport -a',
            '-c', _shell_quote($collection_id),
            '-e', _shell_quote($eperson_id),
            '-s', _shell_quote($xml_dir),
            '-m', _shell_quote($mapfile),
            '>>', _shell_quote($log_file);
        print $cmd . "\n";
        _log_message($log_file, "\n\n running the command $cmd \n");

        my $import_failure = _command_failure(system($cmd));
        if (defined $import_failure) {
            _log_message($log_file, "ERROR: ItemImport $import_failure");
            exit 1;
        }
    }
} #endif

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

# Wrap a string in single quotes so that /bin/sh treats it as one literal word.
sub _shell_quote {
    my ($text) = @_;
    $text =~ s/'/'\\''/g;
    return "'" . $text . "'";
}

# Append a message to the log file and echo it on STDOUT.
sub _log_message {
    my ($path, $message) = @_;
    open my $log_fh, '>>', $path
        or die "cannot open logfile $path for append: $!";
    print {$log_fh} $message, "\n";
    print $message, "\n";
    close $log_fh or die "cannot close logfile $path: $!";
    return;
}

# Count the lines of a file that contain the text "ERROR".
sub _count_error_lines {
    my ($path) = @_;
    open my $fh, '<', $path
        or die "cannot open logfile $path for reading: $!";
    my $count = 0;
    while (my $line = <$fh>) {
        $count++ if $line =~ /ERROR/;
    }
    close $fh;
    return $count;
}

# Turn the return value of system() into a description of what went wrong.
# Returns undef when the command exited with status 0.
sub _command_failure {
    my ($status) = @_;
    return undef if $status == 0;
    return "could not be started: $!" if $status == -1;
    return 'was killed by signal ' . ($status & 127) if $status & 127;
    return 'exited with status ' . ($status >> 8);
}

# Look up the eperson_id for a netid. Returns '' when no row matches.
# A placeholder is used so the netid can never be interpreted as SQL.
sub _lookup_eperson_id {
    my ($dsn, $db_user, $db_password, $netid) = @_;
    my $dbh = DBI->connect($dsn, $db_user, $db_password)
        or die "Error: Could not connect to DB\n";
    $dbh->{RaiseError} = 1;    # let prepare/execute/fetch failures die

    my $sth = $dbh->prepare('SELECT eperson_id FROM eperson WHERE netid = ?');
    $sth->execute($netid);

    # If several rows match, the last one fetched wins.
    my $eperson_id = '';
    while (my $row = $sth->fetchrow_hashref('NAME_lc')) {
        $eperson_id = $row->{'eperson_id'};
    }
    $dbh->disconnect;
    return $eperson_id;
}