Files
aqueduct/includes/AqueductUtility.php
2011-06-13 19:42:11 -04:00

336 lines
11 KiB
PHP

<?php
/*
Aqueduct: A linked data semantic web extension for MediaWiki
Copyright (C) 2010 The Johns Hopkins University/Applied Physics Laboratory
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
if ( !defined( 'MEDIAWIKI' ) )
{
die();
}
abstract class AqueductUtility
{
//STATIC UTILITY FUNCTIONS
//Take a wiki title as input, and see if it is associated with Aqueduct namespaces (as defined on the special page.)
//If it is associated with a basic namespace, return the row in the basicrow parameter
//If it is associated with an advanced namespace, return the rows in the basicrow and advancedrow parameters
public static function getRowsForAqueductTitle($title, &$basicrow, &$advancedrow)
{
$transtable = aqDbGetTransTable();
$basicrow = NULL;
$advancedrow = NULL;
$ns = $title->getNamespace();
//See if this is a basic query
foreach ($transtable as $row)
{
if ($row['aq_wiki_namespace_id'] == $ns)
{
$basicrow = $row;
return;
}
}
//See if this was an advanced query
$querytable = aqDbTableGetAll('query');
foreach ($querytable as $advrow)
{
if ($advrow['aq_wiki_namespace_id'] == $ns)
{
$advancedrow = $advrow;
$basicrow = AqueductUtility::getBasicRowFromAdvancedRow($advrow);
}
}
return;
}
//Look up the basic row that is "chained" to the given advanced row
public static function getBasicRowFromAdvancedRow($advancedrow)
{
$transtable = aqDbGetTransTable();
foreach ($transtable as $brow)
{
if ($brow['aq_wiki_namespace'] == $advancedrow['aq_wiki_parent_namespace'])
{
return $brow;
}
}
if (!$foundrow)
{
throw new Exception('The parent namespace for an advanced query was not found.');
}
}
//Return an instance of AqueductInterface that can be used to do queries for the given row
public static function getInterfaceForRow($basicrow)
{
switch ($basicrow['aq_source_type'])
{
case 'BB':
$interface = new AqueductInterfaceBlackbook28($basicrow);
break;
case 'Embedded':
$interface = new AqueductInterfaceEmbedded($basicrow);
break;
case 'BB28':
$interface = new AqueductInterfaceBlackbook28($basicrow);
break;
case 'BB30':
$interface = new AqueductInterfaceBlackbook30($basicrow);
break;
case 'Test':
$interface = new AqueductInterfaceTest($basicrow);
break;
case 'Memcached':
$interface = new AqueductInterfaceMemcached($basicrow);
break;
case 'Arc2':
$interface = new AqueductInterfaceArc2($basicrow);
break;
default:
throw new Exception('The datasource type for this namespace is invalid or unsupported.');
}
return $interface;
}
//Convenience function that finds the Aqueduct configuration row associated with a title
//for the sole purpose of converting the title to a URI.
//To avoid wasted effort, this function should not be called if any interface functions will
//ever be called on the title.
//This convenience function takes the title as a string (unlike the other ones) and it can
//throw an exception if the namespace is not under Aqueduct's control.
public static function titleToURIStatic($titlestring)
{
$t = Title::newFromText($titlestring);
$basicrow = NULL;
$advancedrow = NULL;
AqueductUtility::getRowsForAqueductTitle($t,$basicrow,$advancedrow);
return AqueductUtility::getInterfaceForRow($basicrow)->titleToURI($t);
}
//Return an array where reified RDF statements are seperated by subject, and all statements for a particular subject
//are put under the appropriate key of the returned array.
//This is non-trivial because complex structures involving bnodes make the subject that a bnode is ultimately associated
//with non-obvious.
//Input: An ARC2 SimpleIndex structure
//Output: An array where the keys are the subject URIs and the values are arrays of reified RDF statements
//(each statement being represented as an ARC2 SimpleIndex)
public static function seperateReifiedRdfEntities($index)
{
$bnodeMappings = array();
$output = array();
//Look for reified RDF statements about the results
foreach ($index as $nodeid=>$nodecontents)
{
if (isset($nodecontents['http://www.w3.org/1999/02/22-rdf-syntax-ns#subject']))
{
//This is a reified RDF statement
$s = $nodecontents['http://www.w3.org/1999/02/22-rdf-syntax-ns#subject'][0]['value'];
$t = $nodecontents['http://www.w3.org/1999/02/22-rdf-syntax-ns#subject'][0]['type'];
$o = $nodecontents['http://www.w3.org/1999/02/22-rdf-syntax-ns#object'][0]['value'];
$ot = $nodecontents['http://www.w3.org/1999/02/22-rdf-syntax-ns#object'][0]['type'];
//See if we know what the bnode is connected to
if ($t == 'bnode' && isset($bnodeMappings[$s]))
{
$s = $bnodeMappings[$s];
}
//Put the node under the proper key of the output array
if (!isset($output[$s]))
{
$output[$s] = array();
}
$output[$s][$nodeid] = $nodecontents;
//See if this statement will allow us to disambiguate the ultimate subject of another statement
if ($ot == 'bnode')
{
//The ultimate subject of the connected bnode is this node's subject
if (isset($bnodeMappings[$o]))
{
throw new Exception('Rdf bnode structure is not a tree');
}
$bnodeMappings[$o] = $s;
//Transitively remap the subject of any child bnodes that were known
foreach ($bnodeMappings as $mapFrom=>$mapTo)
{
if ($mapTo == $o)
{
$bnodeMappings[$mapFrom] = $s;
}
}
if (isset($output[$o]))
{
$output[$s] = array_merge($output[$s],$output[$o]);
unset($output[$o]);
}
}
//Finished processing this statement
}
}
//Statements are now organized by their ultimate subject. Do sanity checks: exactly one URI subject should be represented
//in each key of the output array. Also convert URIs to wiki titles.
$outputWithTitles = array();
foreach ($output as $subject=>$statements)
{
$subjectfound = FALSE;
foreach ($statements as $statement)
{
$s = $statement['http://www.w3.org/1999/02/22-rdf-syntax-ns#subject'][0]['value'];
$t = $statement['http://www.w3.org/1999/02/22-rdf-syntax-ns#subject'][0]['type'];
if ($t=='uri')
{
if ($s==$subject)
{
$subjectfound = TRUE;
}
else
{
throw new Exception('RDF processing error: Two subjects per entity');
}
}
}
if (!$subjectfound)
{
throw new Exception('RDF processing error: Bnode not connected to subject');
}
$outputWithTitles[AqueductInterface::uriToTitle($subject)] = $statements;
}
return $outputWithTitles;
}
//Take a reified RDF ARC2 index, and create an un-reified index in its place.
//Crashes if the original index contained any tree-like structures (reified statements about bnodes) because
//this logic has not been coded yet
public static function unReifyRDF($index)
{
//Look for reified RDF statements about the results
$triples = array();
foreach ($index as $nodeid=>$nodecontents)
{
if (isset($nodecontents['http://www.w3.org/1999/02/22-rdf-syntax-ns#subject']))
{
//This is a reified RDF statement
$s = $nodecontents['http://www.w3.org/1999/02/22-rdf-syntax-ns#subject'][0]['value'];
$t = $nodecontents['http://www.w3.org/1999/02/22-rdf-syntax-ns#subject'][0]['type'];
$p = $nodecontents['http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate'][0]['value'];
$o = $nodecontents['http://www.w3.org/1999/02/22-rdf-syntax-ns#object'][0]['value'];
$ot = $nodecontents['http://www.w3.org/1999/02/22-rdf-syntax-ns#object'][0]['type'];
//Do not allow bnodes
if ($t == 'bnode')
{
throw new Exception('RDF processing error: Blank nodes were found in the source data, and are not supported for this RDF operation.');
}
$triples []= array('s'=>$s,'p'=>$p,'o'=>$o,'s_type'=>'uri','o_type'=>$ot);
}
}
aqProfile("arcUnReify");
$retvalue = ARC2::getSimpleIndex($triples, false);
aqProfile("aq");
return $retvalue;
}
/**
* Determine the Unicode codepoint of a single-character UTF-8 sequence.
* Does not check for invalid input data.
*
* @param $char String
* @return Integer
* @public
*/
public static function utf8ToCodepoint( $char )
{
# Find the length
$z = ord( $char{0} );
if ( $z & 0x80 ) {
$length = 0;
while ( $z & 0x80 ) {
$length++;
$z <<= 1;
}
} else {
$length = 1;
}
if ( $length != strlen( $char ) ) {
return false;
}
if ( $length == 1 ) {
return ord( $char );
}
# Mask off the length-determining bits and shift back to the original location
$z &= 0xff;
$z >>= $length;
# Add in the free bits from subsequent bytes
for ( $i=1; $i<$length; $i++ ) {
$z <<= 6;
$z |= ord( $char{$i} ) & 0x3f;
}
return $z;
}
public static function escapeSPARQLParameter($text)
{
//Insert the page name as a literal in the query where a double hash mark is seen
//This is hard because the page name may contain characters that must be escaped
//in a SPARQL literal.
$n = str_replace('\\','\\\\',$text);
$n = str_replace(array('"',"'"),array('\\"',"\\'"),$n);
//Convert UTF-8 encoded codepoints into the appropriate SPARQL escape sequence
$replace = '';
$accum = '';
for ($x=0;$x<strlen($n);$x++)
{
$c = $n[$x];
if (ord($c)<128)
{
//Not a "unicode" character, can output literally
$accum = '';
$replace .= $c;
}
else
{
//Try to decode UTF-8
$accum .= $c;
$codepoint = AqueductUtility::utf8ToCodepoint($accum);
if ($codepoint!==false)
{
//Complete UTF-8 sequence found
$seq = dechex($codepoint);
while (strlen($seq)!=4 && strlen($seq)<8)
{
$seq = '0' . $seq;
}
if (strlen($seq)==4)
{
$seq = '\\u' . $seq;
}
else
{
$seq = '\\U' . $seq;
}
$replace .= $seq;
$accum = '';
}
}
}
return $replace;
}
}