batch_ingest.pl

Date HTML Created

27 Apr 2010

 
#!/usr/bin/perl
#
#  Script name: batch_ingest.pl
#  Purpose:
#   This script will take an excel file that describes the metadata for a batch ingest
#   along with a directory containing the assets.  Each row of the excel file will
#   be converted into xml files of the form required by the DSPACE batch ingest system.
#   The script will confirm that there were no errors found by the java code.  If there
#   are any errors the script terminates with no files being ingested.
#
#  Arguments:
#       $dir_name_input - name of the directory that contains the excel file and
#            the directory containing the assets.
#  
#       Directory Structure
#               $top_dir
#                  |
#                  batch_upload
#                        |
#                        | - logs  (contains datestamped files with records of runs)
#                        |
#                        | - dir_name_input (user input containing excel and assets)
#                        |      | 
#                        |      | - IngestMetaData.xls (contains the metadata name is fixed)
#                        |      | - Assets_to_Ingest  (directory contains the assets name is fixed)
#                        | -  temp_output_dir_for_batch_aaa (directory generated by DCBatch, xml files 
#                                 that were created from the excel file are here)
#
#   J. Silvis
#   2010 04 06
###########################################################################################
use warnings;
use XML::LibXML;
  use DBI;

  # The only command-line argument is the name of the batch directory that is
  # handed to DCBatch.  Stop right away when it is missing instead of launching
  # the Java converter with nothing to process.
  $dir_name_input = $ARGV[0];
  if (!defined $dir_name_input || $dir_name_input eq '') {
    die "Usage: $0 dir_name_input\n";
  }
#  top dir on Odin

#  Top dir on strip1
  $top_dir = '/dspace/dspace-ir/';
  # Directory holding the xml files (and config.xml) generated from the excel file.
  $xml_dir=$top_dir . 'batch_upload/temp_output_dir_for_batch_aaa/IngestMetaData/';
  # Directory that receives one datestamped log file per run.
  $batch_log_dir =$top_dir . 'batch_upload/logs';

#  The DB config information for strip1
#  NOTE(review): the host below is strip3 although this comment says strip1 --
#  confirm which machine is intended.
#  NOTE(review): the database credentials are hard-coded in the script; consider
#  moving them to a protected configuration file.
  $user='dspace_user';
  $pw='dspace_pswd';
  $database = "dspace_ir";
  $host = "strip3.oit.umn.edu";



  # Build the YYYY_MM_DD_HH:MM stamp that names this run's log file.
  # localtime returns the month as 0..11 and the year as an offset from 1900,
  # so both are adjusted before formatting; sprintf supplies the zero padding.
  my ($min, $hour, $mday, $mon, $year) = (localtime time)[1 .. 5];
  $date_stamp = sprintf '%04d_%02d_%02d_%02d:%02d',
    $year + 1900, $mon + 1, $mday, $hour, $min;

$log_file = $batch_log_dir . "/batch_upload_" . $date_stamp . ".log";
 #
 # The java code in DCBatch will convert rows of the excel file into xml files for 
 # the DSPACE code ItemImport.
  # Wrap a value in single quotes so /bin/sh passes it through as one literal
  # word, whatever characters it contains.
  sub _shell_quote {
    my ($value) = @_;
    $value = '' unless defined $value;
    $value =~ s/'/'\\''/g;
    return "'" . $value . "'";
  }

  # Append one message (plus a newline) to this run's log file.
  sub _append_to_log {
    my ($text) = @_;
    open my $log_fh, '>>', $log_file
      or die "cannot open logfile $log_file for append: $!";
    print {$log_fh} $text, "\n";
    close $log_fh or die "cannot close logfile $log_file: $!";
    return;
  }

  # Record a fatal problem in the log (best effort) and stop the script.
  sub _abort_run {
    my ($reason) = @_;
    eval { _append_to_log($reason); 1 }
      or warn "could not record failure in log: $@";
    die $reason . "\n";
  }

  # Step 1: run DCBatch.  Its output becomes the start of this run's log file.
  # The arguments are quoted so that a directory name containing spaces, or one
  # made only of digits (which the shell would otherwise read as a file
  # descriptor in front of '>'), is passed through unchanged.
  my $dcbatch_status = system('./dsrun edu/umn/dspace/batch_upload/DCBatch '
    . _shell_quote($dir_name_input) . ' > ' . _shell_quote($log_file));

  # Step 2: count the log lines that mention ERROR.
  my $error_count = 0;
  open my $scan_fh, '<', $log_file
    or die "cannot open logfile $log_file for reading: $!";
  while (my $log_line = <$scan_fh>) {
    $error_count++ if $log_line =~ /ERROR/;
  }
  close $scan_fh;

  if ($error_count > 0) {
    my $message = "ERROR in excel file -- no files ingested";
    _append_to_log($message);
    print $message, "\n";
  }
  elsif ($dcbatch_status != 0) {
    # DCBatch could not be started or ended abnormally without logging an
    # ERROR line; whatever is in $xml_dir may be left over from an earlier
    # run, so nothing must be ingested.
    # NOTE(review): assumes DCBatch exits with status 0 on success -- confirm.
    _abort_run("ERROR: DCBatch exited abnormally (wait status $dcbatch_status) -- no files ingested");
  }
  else {
    #  Get command line parameters for the java program ItemImport from config.xml.
    #  The two command line parameters that will be obtained are:
    #  1) eperson_id - the id of the user that will ingest the files.
    #  2) collection_id - the id of the collection where the items will reside in DSPACE
    my $parser = XML::LibXML->new();
    my $doc    = $parser->parse_file($xml_dir . '/config.xml');

    my @config_nodes = $doc->findnodes('/config_parameters');
    if (!@config_nodes) {
      _abort_run("ERROR: no config_parameters element in config.xml -- no files ingested");
    }

    foreach my $config_para (@config_nodes) {
      # Force plain strings; to_literal returns an object that stringifies.
      my $eperson       = '' . $config_para->findnodes('./eperson')->to_literal;
      my $collection_id = '' . $config_para->findnodes('./collection_id')->to_literal;

      #  Look up the eperson_id that belongs to the eperson netid.  The netid
      #  comes from config.xml, so it is bound as a placeholder rather than
      #  pasted into the SQL text.
      my $dsn = "dbi:Pg:dbname=$database;host=$host";
      my $dbh = DBI->connect($dsn, $user, $pw)
        or die "Error: Could not connect to DB\n";
      my $sth = $dbh->prepare('SELECT eperson_id FROM eperson WHERE netid = ?')
        or die "Error: Could not prepare eperson lookup: " . $dbh->errstr . "\n";
      $sth->execute($eperson)
        or die "Error: Could not run eperson lookup: " . $sth->errstr . "\n";

      my $eperson_id = "";
      # 'NAME_lc' asks DBI for lower-cased column names as hash keys.
      while (my $row = $sth->fetchrow_hashref('NAME_lc')) {
        $eperson_id = $row->{'eperson_id'};
      }
      $sth->finish;
      $dbh->disconnect;

      # Without both ids the ItemImport command line would be malformed.
      if (!defined $eperson_id || $eperson_id eq '') {
        _abort_run("ERROR: no eperson found for netid '$eperson' -- no files ingested");
      }
      if ($collection_id eq '') {
        _abort_run("ERROR: no collection_id in config.xml -- no files ingested");
      }

      # Remove the map file of a previous run before ItemImport writes a new one.
      unlink "$xml_dir/mapfile.txt";

      my $cmd = './dsrun org.dspace.app.itemimport.ItemImport -a -c '
        . _shell_quote($collection_id)
        . ' -e ' . _shell_quote($eperson_id)
        . ' -s ' . _shell_quote($xml_dir)
        . ' -m ' . _shell_quote($xml_dir . '/mapfile.txt')
        . ' >> ' . _shell_quote($log_file);
      print $cmd . "\n";

      # The log handle is closed again before ItemImport appends to the same file.
      my $message = "\n\n running the command $cmd \n";
      _append_to_log($message);
      print $message, "\n";

      my $import_status = system($cmd);
      if ($import_status != 0) {
        _abort_run("ERROR: ItemImport exited abnormally (wait status $import_status)");
      }
    }
  } #endif