Location: PHPKode > projects > VuFind > vufind-1.0.1/harvest/harvest_oai.php
<?php
/**
  *
  * Copyright (c) Demian Katz 2010.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2,
  * as published by the Free Software Foundation.
  *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  *
  */
require_once '../util/util.inc.php';        // set up util environment
require_once 'sys/Proxy_Request.php';

// Load the main VuFind configuration plus the OAI-PMH source definitions.  A
// missing or empty oai.ini is fatal, so parse warnings for it are suppressed in
// favor of the friendlier message below.
$configArray = parse_ini_file('../web/conf/config.ini', true);
$oaiSettings = @parse_ini_file('oai.ini', true);
if (empty($oaiSettings)) {
    die("Please add OAI-PMH settings to oai.ini.\n");
}

// An optional first command line argument restricts the run to the single
// oai.ini section of that name; an unknown name aborts the run.
if (isset($argv[1])) {
    if (!isset($oaiSettings[$argv[1]])) {
        die("Could not load settings for {$argv[1]}.\n");
    }
    $oaiSettings = array($argv[1] => $oaiSettings[$argv[1]]);
}

// Harvest every configured source in turn, skipping blank sections, and keep
// a tally for the final report.
$processed = 0;
foreach ($oaiSettings as $target => $settings) {
    if (empty($target) || empty($settings)) {
        continue;
    }
    echo "Processing {$target}...\n";
    $harvest = new HarvestOAI($target, $settings);
    $harvest->launch();
    $processed++;
}

// Report the outcome and stop here so nothing after this point is executed.
die("Completed without errors -- {$processed} source(s) processed.\n");

/**
 * HarvestOAI Class
 *
 * This class harvests records via OAI-PMH using settings from oai.ini.  Each
 * harvested record is written to its own file below
 * $VUFIND_HOME/harvest/<target>/: live records as "<time>_<id>.xml" and deleted
 * records as "<time>_<id>.delete".  The newest datestamp seen is stored in
 * last_harvest.txt so the next run can resume from that point.
 *
 * All fatal problems terminate the script via die(), matching the behavior of
 * the command line tool this class belongs to.
 *
 * @author      Demian Katz
 * @access      public
 */
class HarvestOAI
{
    private $baseURL;               // URL to harvest from
    private $set = null;            // Target set to harvest (null for all records)
    private $metadata = 'oai_dc';   // Metadata type to harvest
    private $idPrefix = '';         // OAI prefix to strip from ID values
    private $basePath;              // Directory for storing harvested files
    private $lastHarvestFile;       // File for tracking last harvest date
    private $startDate = null;      // Harvest start date (null for all records)
    private $granularity = 'auto';  // Date granularity

    // As we harvest records, we want to track the most recent date encountered
    // so we can set a start point for the next harvest.
    private $endDate = 0;

    /**
     * Constructor.
     *
     * Creates the harvest directory if necessary, picks up the start date left
     * behind by a previous run, and reads the connection settings.  When the
     * date granularity is 'auto' (the default) it is looked up with an Identify
     * request, so constructing the object may perform network access.
     *
     * @access  public
     * @param   string  $target         Target directory for harvest.
     * @param   array   $settings       OAI-PMH settings from oai.ini (keys: url,
     *                                  set, metadataPrefix, idPrefix,
     *                                  dateGranularity; only url is required).
     */
    public function __construct($target, $settings)
    {
        // Don't time out during harvest!!
        set_time_limit(0);

        // Set up base directory for harvested files:
        $this->setBasePath($target);

        // Check if there is a file containing a start date:
        $this->lastHarvestFile = $this->basePath . 'last_harvest.txt';
        $this->loadLastHarvestedDate();

        // Set up base URL:
        if (empty($settings['url'])) {
            die("Missing base URL for {$target}.\n");
        }
        $this->baseURL = $settings['url'];
        if (isset($settings['set'])) {
            $this->set = $settings['set'];
        }
        if (isset($settings['metadataPrefix'])) {
            $this->metadata = $settings['metadataPrefix'];
        }
        if (isset($settings['idPrefix'])) {
            $this->idPrefix = $settings['idPrefix'];
        }
        if (isset($settings['dateGranularity'])) {
            $this->granularity = $settings['dateGranularity'];
        }
        if ($this->granularity == 'auto') {
            $this->loadGranularity();
        }
    }

    /**
     * Set a start date for the harvest (only harvest records AFTER this date).
     *
     * @access  public
     * @param   string      $date       Start date (YYYY-MM-DD format).
     */
    public function setStartDate($date)
    {
        $this->startDate = $date;
    }

    /**
     * Harvest all available documents.
     *
     * The first request is built from the start date and set; every following
     * request uses the resumption token returned by the previous one.  Paging is
     * done in a loop (rather than by recursion) so that very large harvests do
     * not keep one stack frame and one parsed response alive per page.
     *
     * @access  public
     */
    public function launch()
    {
        $token = $this->getRecordsByDate($this->startDate, $this->set);
        while ($token !== null) {
            $token = $this->getRecordsByToken($token);
        }
    }

    /**
     * Set up directory structure for harvesting (support method for constructor).
     * Dies if VUFIND_HOME is not set or the directory cannot be created.
     *
     * @access  private
     * @param   string  $target         The OAI-PMH target directory to create.
     */
    private function setBasePath($target)
    {
        // Get the base VuFind path:
        $home = getenv('VUFIND_HOME');
        if (empty($home)) {
            die("Please set the VUFIND_HOME environment variable.\n");
        }

        // Build the full harvest path:
        $this->basePath = $home . '/harvest/' . $target . '/';

        // Create the directory if it does not already exist:
        if (!is_dir($this->basePath)) {
            if (!mkdir($this->basePath)) {
                die("Problem creating directory {$this->basePath}.\n");
            }
        }
    }

    /**
     * Retrieve the date from the "last harvested" file and use it as our start
     * date if it is available.  A missing, unreadable or empty file leaves the
     * start date untouched.
     *
     * @access  private
     */
    private function loadLastHarvestedDate()
    {
        if (!file_exists($this->lastHarvestFile)) {
            return;
        }

        // file() yields an empty array for an empty file and false on failure,
        // so make sure there really is a first line before reading it.
        $lines = file($this->lastHarvestFile);
        if (!is_array($lines) || !isset($lines[0])) {
            return;
        }

        $date = trim($lines[0]);
        if (!empty($date)) {
            $this->setStartDate($date);
        }
    }

    /**
     * Normalize a date to a Unix timestamp.
     *
     * @param   string  $date           Date (ISO-8601 or YYYY-MM-DD HH:MM:SS);
     *                                  SimpleXML elements are accepted and
     *                                  converted to their string value.
     * @return  integer                 Unix timestamp (or false if $date invalid)
     * @access  protected
     */
    protected function normalizeDate($date)
    {
        // Remove timezone markers -- we don't want PHP to outsmart us by adjusting
        // the time zone!
        $date = str_replace(array('T', 'Z'), array(' ', ''), (string)$date);

        // Translate to a timestamp:
        return strtotime($date);
    }

    /**
     * Write data to a file, dying with a message if the write fails so that a
     * full disk or a permission problem does not silently lose records.
     *
     * @access  private
     * @param   string      $filename   Full path of the file to write.
     * @param   string      $data       Contents to store.
     */
    private function writeFile($filename, $data)
    {
        if (file_put_contents($filename, $data) === false) {
            die("Problem writing file {$filename}.\n");
        }
    }

    /**
     * Save a date to the "last harvested" file.
     *
     * @access  private
     * @param   string      $date       Date to save.
     */
    private function saveLastHarvestedDate($date)
    {
        $this->writeFile($this->lastHarvestFile, $date);
    }

    /**
     * Make an OAI-PMH request.  Die if there is an error; return a SimpleXML object
     * on success.
     *
     * @access  private
     * @param   string      $verb       OAI-PMH verb to execute.
     * @param   array       $params     GET parameters for ListRecords method.
     * @return  object                  SimpleXML-formatted response.
     */
    private function sendRequest($verb, $params = array())
    {
        // Set up the request:
        $request = new Proxy_Request();
        $request->setMethod(HTTP_REQUEST_METHOD_GET);
        $request->setURL($this->baseURL);

        // Load request parameters:
        $request->addQueryString('verb', $verb);
        foreach ($params as $key => $value) {
            $request->addQueryString($key, $value);
        }

        // Perform request and die on error:
        $result = $request->sendRequest();
        if (PEAR::isError($result)) {
            die($result->getMessage() . "\n");
        }

        // If we got this far, there was no error -- send back response.
        $response = $request->getResponseBody();
        return $this->processResponse($response);
    }

    /**
     * Process an OAI-PMH response into a SimpleXML object.  Die if an error is
     * detected.
     *
     * @access  private
     * @param   string      $xml        OAI-PMH response XML.
     * @return  object                  SimpleXML-formatted response.
     */
    private function processResponse($xml)
    {
        // Parse the XML; only an explicit false signals a parse failure:
        $result = simplexml_load_string($xml);
        if ($result === false) {
            die("Problem loading XML: {$xml}\n");
        }

        // Detect errors and die if one is found:
        if (isset($result->error)) {
            $attribs = $result->error->attributes();
            die("OAI-PMH error -- code: {$attribs['code']}, value: {$result->error}\n");
        }

        // If we got this far, we have a valid response:
        return $result;
    }

    /**
     * Get the filename for a specific record ID.
     *
     * OAI identifiers frequently contain characters such as "/" or ":" that are
     * path separators or otherwise unsafe in file names, so everything except
     * letters, digits, underscores, dots and hyphens is replaced with "_".
     *
     * @access  private
     * @param   string      $id         ID of record to save.
     * @param   string      $ext        File extension to use.
     * @return  string                  Full path + filename.
     */
    private function getFilename($id, $ext)
    {
        $safeId = preg_replace('/[^\w.\-]/', '_', (string)$id);
        return $this->basePath . time() . '_' . $safeId . '.' . $ext;
    }

    /**
     * Create a tracking file to record the deletion of a record.  The file
     * contains the unmodified record ID.
     *
     * @access  private
     * @param   string      $id         ID of deleted record.
     */
    private function saveDeletedRecord($id)
    {
        $this->writeFile($this->getFilename($id, 'delete'), $id);
    }

    /**
     * Save a record to disk.
     *
     * @access  private
     * @param   string      $id         ID of record to save.
     * @param   object      $metadata   Metadata to save (in SimpleXML format).
     */
    private function saveRecord($id, $metadata)
    {
        // Extract the actual metadata from inside the <metadata></metadata> tags;
        // there is probably a cleaner way to do this, but this simple method avoids
        // the complexity of dealing with namespaces in SimpleXML.  The pattern is
        // anchored to the start and end of the whole (trimmed) string so that only
        // the outer wrapper is removed, never a tag inside the record itself.
        $xml = trim($metadata->asXML());
        $xml = preg_replace('/(^<metadata>)|(<\/metadata>$)/', '', $xml);
        $this->writeFile($this->getFilename($id, 'xml'), trim($xml));
    }

    /**
     * Load date granularity from the server.
     *
     * @access  private
     */
    private function loadGranularity()
    {
        echo "Autodetecting date granularity... ";
        $response = $this->sendRequest('Identify');
        $this->granularity = (string)$response->Identify->granularity;
        echo "found {$this->granularity}.\n";
    }

    /**
     * Extract the ID from a record object (support method for processRecords()).
     *
     * @access  private
     * @param   object      $record     SimpleXML record.
     * @return  string                  The ID value with the configured prefix
     *                                  removed (if it was present).
     */
    private function extractID($record)
    {
        // Normalize to string:
        $id = (string)$record->header->identifier;

        // Strip prefix if found.  strncmp() compares byte for byte; a loose "=="
        // would treat numeric-looking strings such as '1e0' and '1.0' as equal.
        $prefixLength = strlen($this->idPrefix);
        if ($prefixLength > 0
            && strncmp($id, $this->idPrefix, $prefixLength) === 0
        ) {
            $id = (string)substr($id, $prefixLength);
        }

        // Return final value:
        return $id;
    }

    /**
     * Extract the resumption token from a ListRecords response.
     *
     * OAI-PMH marks the last page of a multi-page list with an EMPTY
     * resumptionToken element (often carrying completeListSize/cursor
     * attributes).  Such an element still exists in SimpleXML, so the token's
     * string value has to be inspected rather than the element's truthiness.
     *
     * @access  private
     * @param   object      $response   SimpleXML-formatted response.
     * @return  string                  Token for the next request, or null when
     *                                  the list is complete.
     */
    private function extractResumptionToken($response)
    {
        if (!isset($response->ListRecords->resumptionToken)) {
            return null;
        }
        $token = (string)$response->ListRecords->resumptionToken;
        return (trim($token) === '') ? null : $token;
    }

    /**
     * Save harvested records to disk and track the end date.
     *
     * @access  private
     * @param   object      $records    SimpleXML records.
     */
    private function processRecords($records)
    {
        echo 'Processing ' . count($records) . " records...\n";

        // Loop through the records:
        foreach ($records as $record) {
            // Die if the record is missing its header:
            if (empty($record->header)) {
                die("Unexpected missing record header.\n");
            }

            // Get the ID of the current record:
            $id = $this->extractID($record);

            // Save the current record, either as a deleted or as a regular file:
            $attribs = $record->header->attributes();
            if (strtolower((string)$attribs['status']) == 'deleted') {
                $this->saveDeletedRecord($id);
            } else if (!isset($record->metadata)) {
                die("Unexpected missing record metadata.\n");
            } else {
                $this->saveRecord($id, $record->metadata);
            }

            // If the current record's date is newer than the previous end date,
            // remember it for future reference:
            $date = $this->normalizeDate((string)$record->header->datestamp);
            if ($date && $date > $this->endDate) {
                $this->endDate = $date;
            }
        }
    }

    /**
     * Harvest one page of records using OAI-PMH.
     *
     * Once the final page has been processed, the newest datestamp seen during
     * the harvest is written to the "last harvested" file, formatted according
     * to the date granularity in use.
     *
     * @access  private
     * @param   array       $params     GET parameters for ListRecords method.
     * @return  string                  Resumption token for the next page, or
     *                                  null when the harvest is complete.
     */
    private function getRecords($params)
    {
        // Make the OAI-PMH request:
        $response = $this->sendRequest('ListRecords', $params);

        // Save the records from the response:
        if ($response->ListRecords->record) {
            $this->processRecords($response->ListRecords->record);
        }

        // If we have a resumption token, hand it back so the caller can keep
        // going; otherwise, we're done -- save the end date.
        $token = $this->extractResumptionToken($response);
        if ($token === null && $this->endDate > 0) {
            $dateFormat = ($this->granularity == 'YYYY-MM-DD') ?
                'Y-m-d' : 'Y-m-d\TH:i:s\Z';
            $this->saveLastHarvestedDate(date($dateFormat, $this->endDate));
        }
        return $token;
    }

    /**
     * Harvest the first page of records via OAI-PMH using date and set.
     *
     * @access  private
     * @param   string      $date       Harvest start date (null for all records).
     * @param   string      $set        Set to harvest (null for all records).
     * @return  string                  Resumption token for the next page, or
     *                                  null when the harvest is complete.
     */
    private function getRecordsByDate($date = null, $set = null)
    {
        $params = array('metadataPrefix' => $this->metadata);
        if (!empty($date)) {
            $params['from'] = $date;
        }
        if (!empty($set)) {
            $params['set'] = $set;
        }
        return $this->getRecords($params);
    }

    /**
     * Harvest a follow-up page of records via OAI-PMH using resumption token.
     *
     * @access  private
     * @param   string      $token      Resumption token.
     * @return  string                  Resumption token for the next page, or
     *                                  null when the harvest is complete.
     */
    private function getRecordsByToken($token)
    {
        return $this->getRecords(array('resumptionToken' => (string)$token));
    }
}


?>
Return current item: VuFind