4 # Provides a client for pulling data from OAI-PMH providers
5 # For protocol documentation, see:
6 # http://www.openarchives.org/OAI/openarchivesprotocol.html
9 # OAIClient(ServerUrl, Cache)
12 # - Change the base url of the remote repository
13 # MetadataPrefix($pfx)
14 # - Set the schema we will request from remote
16 # - Restrict queries to a single set
18 # http://www.openarchives.org/OAI/openarchivesprotocol.html#Set
20 # - Fetch identifying information about the remote repository
22 # - Fetch information about what schemas remote can serve
23 # GetRecords($start,$end)
24 # - Pull records in batches, optionally with date restrictions
26 # - Pull a single record using a unique identifier
27 # MoreRecordsAvailable()
28 # - Determine if a batch pull is complete or not
29 # ResetRecordPointer()
30 # - Restart a batch pull from the beginning
32 # - Determine verbosity
34 # Copyright 2014 Edward Almasy and Internet Scout
35 # http://scout.wisc.edu
40 # ---- PUBLIC INTERFACE --------------------------------------------------
50 # set default debug level
51 $this->DebugLevel = 0;
56 # set default metadata prefix
59 # set default set specification for queries
62 $this->CacheSequenceNumber = 0;
65 $this->Cache = $Cache;
81 if ($NewValue != NULL)
85 return $this->ServerUrl;
96 if ($NewValue != NULL)
100 return $this->MetadataPrefix;
109 function SetSpec($NewValue =
"X-NOSETSPECVALUE-X")
111 if ($NewValue !=
"X-NOSETSPECVALUE-X")
115 return $this->SetSpec;
127 # query server for XML text
128 $XmlText = $this->PerformQuery(
"Identify");
129 $this->DebugOutVar(8, __METHOD__,
"XmlText", htmlspecialchars($XmlText));
131 # convert XML text into object
132 $Xml = simplexml_load_string($XmlText);
133 $this->DebugOutVar(9, __METHOD__,
"Xml", $Xml);
135 # if identification info was found
137 if (isset($Xml->Identify))
140 $Ident = $Xml->Identify;
141 $this->GetValFromXml($Ident,
"repositoryName",
"Name", $Info);
142 $this->GetValFromXml($Ident,
"adminEmail",
"Email", $Info);
143 $this->GetValFromXml($Ident,
"baseURL",
"URL", $Info);
146 # return info to caller
157 # query server for XML text
158 $XmlText = $this->PerformQuery(
"ListMetadataFormats");
159 $this->DebugOutVar(8, __METHOD__,
"XmlText", htmlspecialchars($XmlText));
161 # convert XML text into object
162 $Xml = simplexml_load_string($XmlText);
163 $this->DebugOutVar(9, __METHOD__,
"Xml", $Xml);
165 # if format info was found
167 if (isset($Xml->ListMetadataFormats->metadataFormat))
171 foreach ($Xml->ListMetadataFormats->metadataFormat as $Format)
173 $this->GetValFromXml(
174 $Format,
"metadataPrefix",
"Name", $Formats[$Index]);
175 $this->GetValFromXml(
176 $Format,
"schema",
"Schema", $Formats[$Index]);
177 $this->GetValFromXml(
178 $Format,
"metadataNamespace",
"Namespace",
184 # return info to caller
197 # if we're using a cache directory, figure out which file
198 # should contain this set of records
199 if ($this->Cache !== NULL)
201 $cache_fname = sprintf(
"%s/%010x",
203 $this->CacheSequenceNumber);
204 $this->CacheSequenceNumber++;
207 # when we're not using a cache or don't have a cached copy of
208 # this set of records, query the OAI provider to get it
209 if ($this->Cache === NULL || !file_exists($cache_fname) )
211 # if we have resumption token from prior query
212 if (isset($this->ResumptionToken))
214 # use resumption token as sole argument
215 $Args[
"resumptionToken"] = $this->ResumptionToken;
219 # set up arguments for query
220 $Args[
"metadataPrefix"] = $this->MetadataPrefix;
221 if ($StartDate) { $Args[
"from"] = $StartDate; }
222 if ($EndDate) { $Args[
"until"] = $EndDate; }
223 if ($this->
SetSpec) { $Args[
"set"] = $this->SetSpec; }
226 # query server for XML text
227 $XmlText = $this->PerformQuery(
"ListRecords", $Args);
229 # if a cache is in use, save this chunk of XML into it
230 if ($this->Cache !== NULL)
232 file_put_contents($cache_fname, $XmlText);
237 # get XML text from the cache
238 $XmlText = file_get_contents($cache_fname);
241 $this->DebugOutVar(8, __METHOD__,
"XmlText", htmlspecialchars($XmlText));
243 return $this->GetRecordsFromXML($XmlText,
"ListRecords");
262 $Args[
"metadataPrefix"] = $this->MetadataPrefix;
263 $Args[
"identifier"] = $Id;
265 # query server for XML text
266 $XmlText = $this->PerformQuery(
"GetRecord", $Args);
267 $this->DebugOutVar(8, __METHOD__,
"XmlText", htmlspecialchars($XmlText));
269 return $this->GetRecordsFromXML($XmlText,
"GetRecord");
279 return isset($this->ResumptionToken) ? TRUE : FALSE;
287 unset($this->ResumptionToken);
288 $this->CacheSequenceNumber = 0;
298 $this->DebugLevel = $NewLevel;
302 # ---- PRIVATE INTERFACE -------------------------------------------------
305 private $MetadataPrefix;
308 private $ResumptionToken;
310 private $CacheSequenceNumber;
312 # perform OAI query and return resulting data to caller
313 private function PerformQuery($QueryVerb, $Args = NULL)
315 # open stream to OAI server
317 if (strpos($this->
ServerUrl,
"?") === FALSE)
319 $QueryUrl = $this->
ServerUrl.
"?verb=".$QueryVerb;
323 $QueryUrl = $this->
ServerUrl.
"&verb=".$QueryVerb;
328 foreach ($Args as $ArgName => $ArgValue)
330 $QueryUrl .=
"&".urlencode($ArgName).
"=".urlencode($ArgValue);
333 $FHndl = fopen($QueryUrl,
"r");
335 # if stream was successfully opened
337 if ($FHndl !== FALSE)
339 # while lines left in response
340 while (!feof($FHndl))
342 # read line from server and add it to text to be parsed
343 $Text .= fread($FHndl, 10000000);
347 # close OAI server stream
350 # return query result data to caller
354 # set array value if available in simplexml object
private function GetValFromXml($Xml, $SrcName, $DstName, &$Results)
{
    # Copy the trimmed text of one child element into a results array.
    #   $Xml      - object whose child element named $SrcName is read
    #   $SrcName  - name of the child element to look for on $Xml
    #   $DstName  - key under which the value is stored in $Results
    #   $Results  - array (passed by reference) that receives the value;
    #               it is left untouched when the element is not set
    if (isset($Xml->$SrcName))
    {
        # cast explicitly so trim() is always handed a string rather than
        # relying on implicit object-to-string conversion (matches the
        # (string) casts used elsewhere in this class)
        $Results[$DstName] = trim((string) $Xml->$SrcName);
    }
}
363 # print variable contents if debug is above specified level
364 private function DebugOutVar($Level, $MethodName, $VarName, $VarValue)
366 if ($this->DebugLevel >= $Level)
368 print(
"\n<pre>".$MethodName.
"() ".$VarName.
" = \n");
436 private function ExtractDataFromXml(&$Records, $Index, DOMNode $dom, $Section, $ParentTagName=NULL)
438 foreach ($dom->childNodes as $node)
440 # for DOM children that are elements (rather than comments, text, or something else)
441 if ($node->nodeType == XML_ELEMENT_NODE)
443 # compute a tag name to use
445 (($ParentTagName!==NULL) ? $ParentTagName.
"/" :
"")
448 # Glue together the contents of the 'text' children of this node
450 foreach ($node->childNodes as $child)
452 if ($child->nodeType == XML_TEXT_NODE)
454 $Value .= $child->nodeValue;
458 # if we had a non-empty value, add it to the results
459 if (strlen(trim($Value))>0)
461 $Records[$Index][$Section][$StorageTagName] []= $Value;
464 # and process our children
465 $this->ExtractDataFromXml($Records, $Index, $node, $Section, $StorageTagName);
476 private function GetFirstElement(DOMNode $dom)
478 foreach ($dom->childNodes as $child)
480 if ($child->nodeType == XML_ELEMENT_NODE)
505 private function GetRecordsFromXML($XmlText, $ParseTo)
507 # create XML parser and pass it text
508 $Xml = simplexml_load_string($XmlText);
510 # if text could not be parsed, return NULL
511 if (! $Xml instanceof SimpleXmlElement )
514 # set up vars to hold our results
518 # we'll want to find our records with XPath, so we need to
519 # register a prefix for the oai elements
520 $Xml->registerXPathNamespace(
'oai',
"http://www.openarchives.org/OAI/2.0/");
522 # extract records, iterate over them
523 $RecordXML = $Xml->xpath(
"oai:".$ParseTo.
"//oai:record");
524 foreach ($RecordXML as $Record)
526 # pull relevant information out of the header
528 # Note that SimpleXMLElement objects map elements onto PHP
529 # object properties, and will return a SimpleXMLElement w/o
530 # any associated XML for non-existent elements. So,
531 # nothing explodes when we ask the Record for an element it does not have.
534 # However, SimpleXMLElements w/o associated XML return
535 # 'NULL' for all properties. Therefore, if we tried to
536 # look at the grandchild of a non-existent element it would
537 # be problematic. In the cases below, we get empty
538 # strings when the children of 'header' &c are empty, which
539 # is what we want anyway.
541 $Records[$Index][
"identifier"] = (string) $Record->header->identifier;
542 $Records[$Index][
"datestamp"] = (
string) $Record->header->datestamp;
544 # grab associated metadata (if there is any)
545 if ($Record->metadata->count() > 0)
547 # to avoid frustrations with namespaces and SimpleXML, use
548 # DOMDocument to parse the record data
549 $doc = dom_import_simplexml( $Record->metadata );
551 # get the 'record' element
552 $doc = $this->GetFirstElement( $doc );
554 # record the format used for this record
555 $Records[$Index][
"format"] = $doc->nodeName;
557 # extract data for this record
558 $this->ExtractDataFromXml( $Records, $Index, $doc,
"metadata" );
561 # if there is additional information available, snag that too
562 if ($Record->about->count() > 0)
564 $doc = dom_import_simplexml( $Record->about );
565 $this->ExtractDataFromXml($Records, $Index, $doc,
"about");
568 # move along to the next record
572 # look for resumption token and save if found (as above, we'll
573 # get an empty string if either ListRecords or resumptionToken is absent)
575 $Token = (string) $Xml->ListRecords->resumptionToken;
577 if (strlen($Token)>0)
579 $this->ResumptionToken = $Token;
583 unset($this->ResumptionToken);
586 # return records to caller
ResetRecordPointer()
Clear any additional records available after last GetRecords().
ServerUrl($NewValue=NULL)
Get or set URL of target OAI repository server.
OAIClient($ServerUrl, $Cache=NULL)
Class constructor.
GetRecord($Id)
Get a single record from a repository server.
MoreRecordsAvailable()
Check whether more records are available after last GetRecords().
GetRecords($StartDate=NULL, $EndDate=NULL)
Retrieve records from repository server.
MetadataPrefix($NewValue=NULL)
Get or set metadata schema for records being retrieved.
SetSpec($NewValue="X-NOSETSPECVALUE-X")
Get or set specification of subset of records to be retrieved.
GetIdentification()
Retrieve identification information from repository server.
SetDebugLevel($NewLevel)
Set current debug output level.
GetFormats()
Retrieve list of available metadata formats from repository server.