Files
aqueduct/includes/AqueductInterface.php
2011-06-13 19:42:11 -04:00

329 lines
9.6 KiB
PHP

<?php
/*
Aqueduct: A linked data semantic web extension for MediaWiki
Copyright (C) 2010 The Johns Hopkins University/Applied Physics Laboratory
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
if ( !defined( 'MEDIAWIKI' ) )
{
die();
}
abstract class AqueductInterface
{
protected $mRow;
public function __construct($row)
{
$this->mRow = $row;
}
public abstract function materialize($title);
public abstract function advancedOperation($title,$advancedrow);
//Sets one triple, doesn't disturb any triples with the same subject
public abstract function hardSet($title, $predicateURI, $object, $objecttype);
public function titleToURI($title)
{
//1. Get the title text
$t = $title->getDBkey();
//Flip the title case if needed
if ($this->mRow['aq_initial_lowercase'])
{
$t = strtolower(substr($t,0,1)) . substr($t,1);
}
//2. If the URI begins with a backslash \, remove it. If the URI ends with a caret ^ , remove it.
if ($t[0] == '\\')
{
$t = substr($t,1);
}
if ($t[strlen($t)-1] == '^')
{
$t = substr($t,0,strlen($t)-1);
}
//3. Process caret ^ escape sequences. A caret-period ^. is converted to a period. A caret-a ^a is converted to an ampersand & . A caret-underscore ^_ is converted to an underscore. A caret-tilde ^~ is converted to a tilde. Other such sequences are illegal.
//4. Convert backticks ` to colons :
//5. Convert backslashes \ to hash marks #
$t = str_replace(array('^.','^a','^_','^~','`','\\'),array('.','&','_','~',':','#'),$t);
if (strpos($t,'^') !== FALSE)
{
throw new Exception('Title has an illegal caret escape sequence');
}
//6. Convert characters that are illegal in URIs to UTF-8 octet sequences (with the percent sign). Convert one character at a time. If a percent-dash octet sequence is encountered (as described above), keep it as a single-percent octet sequence.
$legalcharpattern = '|[-[:alnum:];/?:@&=+$_.!~*(),\'#]|S';
$result = '';
$pos = 0;
$length = strlen($t);
while ($pos < $length)
{
if ($t[$pos] == '%')
{
//Percent dash encoded octet?
if ($pos+4 > $length || $t[$pos+1] != '-')
{
throw new Exception('Unescaped title illegally has a percent-encoded octet sequence. This usually happens because the title was escaped multiple times.');
}
else
{
//Convert percent-dash encoded octet to a normally encoded octet.
$result .= $t[$pos] . $t[$pos+2] . $t[$pos+3];
}
$pos += 4;
}
else
{
if (preg_match($legalcharpattern,$t[$pos])>0)
{
//Do not escape legal URI character.
$result .= $t[$pos];
}
else
{
//Escape illegal URI character
//The title object is already a UTF-8 encoded mbstring, so this will handle Unicode characters properly
$result .= '%' . strtoupper(bin2hex($t[$pos]));
}
$pos ++;
}
}
//Add URI prefix
$result = $this->mRow['aq_source_uri'] . $result;
return $result;
}
public static function uriToTitle($uri)
{
$transtable = aqDbGetTransTable();
$legalchars = '[-[:alnum:];/?:@&=+$_.!~*(),\'#]';
$legalcharpattern = '|' . $legalchars . '|S';
$legalstringpattern = '|^' . $legalchars . '+$|SD';
//1. Check for illegal characters
if (preg_match($legalstringpattern,$uri)==0)
{
throw new Exception("Illegal URI: $uri");
}
//2. Detect which URI prefix is being used to select the configuration row, and remove the URI prefix
$matchingrow = NULL;
foreach ($transtable as $row)
{
//Use a configuration row if the URI prefix matches and another configuration row with a better (longer) prefix was not found
if (strpos($uri,$row['aq_source_uri']) === 0)
{
if (!$matchingrow || strlen($matchingrow['aq_source_uri']) < strlen($row['aq_source_uri']))
{
$matchingrow = $row;
}
}
}
if ($matchingrow)
{
$uri = substr($uri,strlen($matchingrow['aq_source_uri']));
}
//3. Convert the sequences of octets into unicode characters.
$decodeduri = '';
$currentchar = 0;
while ($currentchar < strlen($uri))
{
$c = $uri[$currentchar];
if ($c == '%')
{
if ($currentchar + 3 > strlen($uri))
{
throw new Exception('Malformed escape sequence in URI: $uri');
}
$currentseqhex = $uri[$currentchar+1] . $uri[$currentchar+2];
$currentseqdec = hexdec($currentseqhex);
$currentseqbin = pack('H*', $currentseqhex);
if ($currentseqdec < 128)
{
//This is a "non-encoded" character (code point <128)
$encodeme = FALSE;
if (preg_match($legalcharpattern,$currentseqbin)>0)
{
//URL-legal character was unnecessarily encoded. Output this in encoded form to preserve canonicalization
$encodeme = TRUE;
}
else if ($currentseqdec < 32)
{
//Keep control characters encoded
$encodeme = TRUE;
}
else if (strpos('<>[]|{}\`^% ',$currentseqbin) !== FALSE)
{
//Keep a character that we will not want to use in a wiki title encoded
$encodeme = TRUE;
}
if ($encodeme)
{
$decodeduri = $decodeduri . '%-' . $currentseqhex;
}
else
{
$decodeduri = $decodeduri . $currentseqbin;
}
}
else
{
//This is part of a UTF-8 encoded sequence
//Just output the byte to the wiki title, because we have nothing else to do with these characters
$decodeduri = $decodeduri . $currentseqbin;
}
$currentchar+=3;
}
else
{
$decodeduri = $decodeduri . $c;
$currentchar++;
}
}
//4. The hash mark # could have been present in an unescaped form the URI, which would cause it to remain in the title at this point.
//This character is illegal in the wiki. Convert it to a backslash \. If the hash mark was at the beginning of the title, convert it to a double backslash instead
//5. Colons can be confused for namespace prefixes, so convert them to a backtick `
$decodeduri = str_replace(array('#',':'),array('\\','`'),$decodeduri);
if ($decodeduri[0] == '\\')
{
$decodeduri = '\\' . $decodeduri;
}
//6. Some characters will be conditionally escaped if they will cause problems.
//Any period not adjacent with a character other than another period or the forward slash is escaped to ^
//In a sequence of underscores, insert a ^ between all underscores, because Mediawiki will collapse sequences of underscores. Example: ___ turns into _caret_caret_ (caret means ^ )
//Do the same in a sequence of tildes.
$currentchar = 0;
$printedunderscore = FALSE;
$printedtilde = FALSE;
$periodokay = FALSE;
$escapeduri = '';
for ($currentchar=0; $currentchar<strlen($decodeduri); $currentchar++)
{
$c = $decodeduri[$currentchar];
if ($c == '_')
{
if ($printedunderscore)
{
$escapeduri .= '^';
}
$escapeduri .= '_';
$printedunderscore = TRUE;
$printedtilde = FALSE;
$periodokay = TRUE;
}
else if ($c == '~')
{
if ($printedtilde)
{
$escapeduri .= '^';
}
$escapeduri .= '~';
$printedunderscore = FALSE;
$printedtilde = TRUE;
$periodokay = TRUE;
}
else if ($c == '.')
{
if ($periodokay)
{
//The period is "armored" by the character on the left, so just print it
$escapeduri .= '.';
}
else if ($currentchar+1<strlen($decodeduri))
{
$n = $decodeduri[$currentchar+1];
if ($n!='.' && $n!='/')
{
//Period armored by the character on the right
$escapeduri .= '.';
}
//Unarmored period
$escapeduri .= '^.';
}
else
{
//Unarmored period at the end of the string
$escapeduri .= '^.';
}
$printedunderscore = FALSE;
$printedtilde = FALSE;
$periodokay = FALSE;
}
else
{
$printedunderscore = FALSE;
$printedtilde = FALSE;
//Not an underscore or period or tilde, so we don't handle it in this loop.
$escapeduri .= $c;
if ($c != '/')
{
$periodokay = TRUE;
}
}
}
//If the URI contains any semicolons ;, escape all ampersands & to ^a
if (strpos($escapeduri, ';') !== FALSE)
{
$escapeduri = str_replace('&','^a',$escapeduri);
}
//If the string ends with an underscore at this point, put a caret at the end so Mediawiki does not get rid of the underscore.
if ($escapeduri[strlen($escapeduri)-1] == '_')
{
$escapeduri .= '^';
}
//7. Logic for characters that Mediawiki will modify only at the beginning of a string
$prependbackslash = FALSE;
if (strtoupper($escapeduri[0]) == $escapeduri[0])
{
if ($matchingrow && $matchingrow['aq_initial_lowercase'])
{
$prependbackslash = TRUE;
}
}
else if (strtolower($escapeduri[0]) == $escapeduri[0])
{
if (!$matchingrow || !$matchingrow['aq_initial_lowercase'])
{
$prependbackslash = TRUE;
}
}
else if ($escapeduri[0] == '_')
{
$prependbackslash = TRUE;
}
$escapeduri[0] = strtoupper($escapeduri[0]);
if ($prependbackslash)
{
$escapeduri = '\\' . $escapeduri;
}
//8. Prepend the namespace
if ($matchingrow)
{
if ($matchingrow['aq_wiki_namespace_id'] > 0)
{
$escapeduri = $matchingrow['aq_wiki_namespace'] . ':' . $escapeduri;
}
}
else
{
$escapeduri = 'Unknown:' . $escapeduri;
}
return $escapeduri;
}
}