mirror of
https://github.com/JHUAPL/aqueduct.git
synced 2026-01-09 15:58:04 -05:00
329 lines
9.6 KiB
PHP
329 lines
9.6 KiB
PHP
<?php
|
|
/*
|
|
Aqueduct: A linked data semantic web extension for MediaWiki
|
|
Copyright (C) 2010 The Johns Hopkins University/Applied Physics Laboratory
|
|
|
|
This program is free software; you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation; either version 2 of the License, or
|
|
(at your option) any later version.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License along
|
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
*/
|
|
//This file is only meant to be loaded by MediaWiki: stop immediately when the
//MEDIAWIKI constant has not been defined (i.e. the file was requested directly).
if ( defined( 'MEDIAWIKI' ) === false )
{
    exit();
}
|
|
|
|
|
|
/**
 * Base class for Aqueduct datasource interfaces.
 *
 * Stores the configuration row an interface was created from and implements the
 * two-way mapping between wiki page titles and datasource URIs:
 * titleToURI() turns a title into a URI, uriToTitle() turns a URI into a title.
 * The escaping rules of the two methods mirror each other, so a change to one
 * normally requires the matching change in the other.
 */
abstract class AqueductInterface
{
    /**
     * Configuration row of this interface. titleToURI() reads its
     * 'aq_initial_lowercase' and 'aq_source_uri' entries.
     */
    protected $mRow;

    /**
     * @param array $row Configuration row; stored unchanged in $mRow.
     */
    public function __construct($row)
    {
        $this->mRow = $row;
    }

    //Implemented by subclasses. NOTE(review): the contract is not visible in this file.
    public abstract function materialize($title);

    //Implemented by subclasses. NOTE(review): the contract is not visible in this file.
    public abstract function advancedOperation($title,$advancedrow);

    //Sets one triple, doesn't disturb any triples with the same subject
    public abstract function hardSet($title, $predicateURI, $object, $objecttype);

    /**
     * Converts a wiki title into the datasource URI it stands for.
     *
     * @param object $title Object exposing getDBkey() (presumably a MediaWiki Title - confirm against callers).
     * @return string The 'aq_source_uri' prefix of this interface followed by the unescaped title.
     * @throws Exception If the title contains an unknown caret escape, or a percent sign
     *                   that does not start a four-character percent-dash sequence.
     */
    public function titleToURI($title)
    {
        //1. Get the title text
        $t = $title->getDBkey();
        //Flip the title case if needed
        if ($this->mRow['aq_initial_lowercase'])
        {
            $t = strtolower(substr($t,0,1)) . substr($t,1);
        }

        //2. If the title begins with a backslash \, remove it. If the title ends with a caret ^ , remove it.
        //The emptiness checks keep us from reading a string offset that does not exist
        //(for example when the title consists of nothing but a backslash).
        if ($t !== '' && $t[0] == '\\')
        {
            $t = (string)substr($t,1);
        }
        if ($t !== '' && $t[strlen($t)-1] == '^')
        {
            $t = (string)substr($t,0,-1);
        }

        //3. Process caret ^ escape sequences. A caret-period ^. is converted to a period. A caret-a ^a is converted to an ampersand & . A caret-underscore ^_ is converted to an underscore. A caret-tilde ^~ is converted to a tilde. Other such sequences are illegal.
        //4. Convert backticks ` to colons :
        //5. Convert backslashes \ to hash marks #
        $t = str_replace(array('^.','^a','^_','^~','`','\\'),array('.','&','_','~',':','#'),$t);
        //Any caret that survived the replacements was not part of a known escape sequence
        if (strpos($t,'^') !== FALSE)
        {
            throw new Exception('Title has an illegal caret escape sequence');
        }

        //6. Convert characters that are illegal in URIs to UTF-8 octet sequences (with the percent sign). Convert one byte at a time. If a percent-dash octet sequence is encountered (as produced by uriToTitle), keep it as a single-percent octet sequence.
        $legalcharpattern = '|[-[:alnum:];/?:@&=+$_.!~*(),\'#]|S';
        $result = '';
        $length = strlen($t);
        $pos = 0;
        while ($pos < $length)
        {
            $c = $t[$pos];
            if ($c == '%')
            {
                //Percent dash encoded octet?
                if ($pos+4 > $length || $t[$pos+1] != '-')
                {
                    throw new Exception('Unescaped title illegally has a percent-encoded octet sequence. This usually happens because the title was escaped multiple times.');
                }
                //Convert percent-dash encoded octet to a normally encoded octet.
                $result .= '%' . $t[$pos+2] . $t[$pos+3];
                $pos += 4;
            }
            else if (preg_match($legalcharpattern,$c) > 0)
            {
                //Do not escape legal URI character.
                $result .= $c;
                $pos++;
            }
            else
            {
                //Escape illegal URI character.
                //The string is processed byte by byte, so every byte of a multi-byte UTF-8 character gets its own %XX octet.
                $result .= '%' . strtoupper(bin2hex($c));
                $pos++;
            }
        }

        //Add URI prefix
        return $this->mRow['aq_source_uri'] . $result;
    }

    /**
     * Converts a datasource URI into the wiki title that represents it.
     *
     * The configuration rows come from aqDbGetTransTable(); the row with the longest
     * 'aq_source_uri' that is a prefix of the URI is used. Its 'aq_initial_lowercase',
     * 'aq_wiki_namespace_id' and 'aq_wiki_namespace' entries control the case handling
     * and the namespace. When no row matches, the title is put under 'Unknown:'.
     *
     * @param string $uri URI to convert.
     * @return string Title text, including the namespace prefix where one applies.
     * @throws Exception If the URI contains a character outside the accepted set or a
     *                   malformed percent escape.
     */
    public static function uriToTitle($uri)
    {
        $transtable = aqDbGetTransTable();
        $legalchars = '[-[:alnum:];/?:@&=+$_.!~*(),\'#]';
        $legalcharpattern = '|' . $legalchars . '|S';
        //The whole-URI check additionally accepts the percent sign. Without it every URI that
        //carries an encoded octet is rejected here and step 3 can never decode anything.
        $legalstringpattern = '|^[-[:alnum:];/?:@&=+$_.!~*(),\'#%]+$|SD';
        //Remember the unmodified URI for error messages; $uri loses its prefix in step 2
        $fulluri = $uri;

        //1. Check for illegal characters
        if (preg_match($legalstringpattern,$uri) !== 1)
        {
            throw new Exception("Illegal URI: $uri");
        }

        //2. Detect which URI prefix is being used to select the configuration row, and remove the URI prefix
        $matchingrow = NULL;
        foreach ($transtable as $row)
        {
            //Use a configuration row if the URI prefix matches and another configuration row with a better (longer) prefix was not found
            if (strpos($uri,$row['aq_source_uri']) === 0)
            {
                if (!$matchingrow || strlen($matchingrow['aq_source_uri']) < strlen($row['aq_source_uri']))
                {
                    $matchingrow = $row;
                }
            }
        }
        if ($matchingrow)
        {
            $uri = (string)substr($uri,strlen($matchingrow['aq_source_uri']));
        }

        //3. Convert the sequences of octets into unicode characters.
        $decodeduri = '';
        $urilength = strlen($uri);
        $currentchar = 0;
        while ($currentchar < $urilength)
        {
            $c = $uri[$currentchar];
            if ($c != '%')
            {
                $decodeduri .= $c;
                $currentchar++;
                continue;
            }

            if ($currentchar + 3 > $urilength)
            {
                throw new Exception("Malformed escape sequence in URI: $fulluri");
            }
            $currentseqhex = $uri[$currentchar+1] . $uri[$currentchar+2];
            //Both characters after the percent sign must be hex digits, otherwise hexdec()/pack() would work on garbage
            if (strspn($currentseqhex,'0123456789abcdefABCDEF') !== 2)
            {
                throw new Exception("Malformed escape sequence in URI: $fulluri");
            }
            $currentseqdec = hexdec($currentseqhex);
            $currentseqbin = pack('H*', $currentseqhex);

            //Bytes >= 128 are part of a UTF-8 encoded sequence and are always written to the title as-is.
            //Bytes < 128 stay encoded (in percent-dash form) in the three cases below.
            $encodeme = FALSE;
            if ($currentseqdec < 128)
            {
                if (preg_match($legalcharpattern,$currentseqbin) > 0)
                {
                    //URL-legal character was unnecessarily encoded. Output this in encoded form to preserve canonicalization
                    $encodeme = TRUE;
                }
                else if ($currentseqdec < 32)
                {
                    //Keep control characters encoded
                    $encodeme = TRUE;
                }
                else if (strpos('<>[]|{}\`^% ',$currentseqbin) !== FALSE)
                {
                    //Keep a character that we will not want to use in a wiki title encoded
                    $encodeme = TRUE;
                }
            }

            if ($encodeme)
            {
                $decodeduri .= '%-' . $currentseqhex;
            }
            else
            {
                $decodeduri .= $currentseqbin;
            }
            $currentchar += 3;
        }

        //4. The hash mark # could have been present in an unescaped form in the URI, which would cause it to remain in the title at this point.
        //This character is illegal in the wiki. Convert it to a backslash \. If the hash mark was at the beginning of the title, convert it to a double backslash instead
        //5. Colons can be confused for namespace prefixes, so convert them to a backtick `
        $decodeduri = str_replace(array('#',':'),array('\\','`'),$decodeduri);
        if ($decodeduri !== '' && $decodeduri[0] == '\\')
        {
            $decodeduri = '\\' . $decodeduri;
        }

        //6. Some characters will be conditionally escaped if they will cause problems.
        //Any period that has no neighbour other than another period or the forward slash is escaped to ^.
        //In a sequence of underscores, insert a ^ between all underscores, because Mediawiki will collapse sequences of underscores. Example: ___ turns into _caret_caret_ (caret means ^ )
        //Do the same in a sequence of tildes.
        $printedunderscore = FALSE;
        $printedtilde = FALSE;
        //TRUE while the previously written character "armors" a following period
        $periodokay = FALSE;
        $escapeduri = '';
        $decodedlength = strlen($decodeduri);
        for ($currentchar = 0; $currentchar < $decodedlength; $currentchar++)
        {
            $c = $decodeduri[$currentchar];
            if ($c == '_')
            {
                if ($printedunderscore)
                {
                    $escapeduri .= '^';
                }
                $escapeduri .= '_';
                $printedunderscore = TRUE;
                $printedtilde = FALSE;
                $periodokay = TRUE;
            }
            else if ($c == '~')
            {
                if ($printedtilde)
                {
                    $escapeduri .= '^';
                }
                $escapeduri .= '~';
                $printedunderscore = FALSE;
                $printedtilde = TRUE;
                $periodokay = TRUE;
            }
            else if ($c == '.')
            {
                if ($periodokay)
                {
                    //The period is "armored" by the character on the left, so just print it
                    $escapeduri .= '.';
                }
                else if ($currentchar+1 < $decodedlength)
                {
                    $n = $decodeduri[$currentchar+1];
                    if ($n != '.' && $n != '/')
                    {
                        //Period armored by the character on the right
                        $escapeduri .= '.';
                    }
                    else
                    {
                        //Unarmored period. This must be an alternative to the branch above:
                        //printing both would turn one period into two once the title is converted back.
                        $escapeduri .= '^.';
                    }
                }
                else
                {
                    //Unarmored period at the end of the string
                    $escapeduri .= '^.';
                }
                $printedunderscore = FALSE;
                $printedtilde = FALSE;
                $periodokay = FALSE;
            }
            else
            {
                //Not an underscore or period or tilde, so we don't handle it in this loop.
                $printedunderscore = FALSE;
                $printedtilde = FALSE;
                $escapeduri .= $c;
                //A forward slash does not armor a following period, so it must also clear armor
                //left over from the character before it (otherwise "a/./b" would pass unescaped).
                $periodokay = ($c != '/');
            }
        }

        //If the URI contains any semicolons ;, escape all ampersands & to ^a
        if (strpos($escapeduri, ';') !== FALSE)
        {
            $escapeduri = str_replace('&','^a',$escapeduri);
        }

        //If the string ends with an underscore at this point, put a caret at the end so Mediawiki does not get rid of the underscore.
        if (substr($escapeduri,-1) === '_')
        {
            $escapeduri .= '^';
        }

        //7. Logic for characters that Mediawiki will modify only at the beginning of a string
        //Skipped for an empty remainder (URI equal to its prefix): there is no first character to inspect or rewrite.
        $prependbackslash = FALSE;
        if ($escapeduri !== '')
        {
            $first = $escapeduri[0];
            $initiallowercase = ($matchingrow && $matchingrow['aq_initial_lowercase']);
            if ($first == '_')
            {
                //A leading underscore always needs the protecting backslash.
                //This has to be tested first: strtoupper('_') == '_', so the next test would swallow it.
                $prependbackslash = TRUE;
            }
            else if (strtoupper($first) == $first)
            {
                //Upper case (or caseless) initial: only remarkable when the datasource defaults to lower case
                $prependbackslash = $initiallowercase;
            }
            else
            {
                //Lower case initial: only remarkable when the datasource does not default to lower case
                $prependbackslash = !$initiallowercase;
            }
            //NOTE(review): the initial is upper-cased even when a backslash is prepended, so for a
            //row without aq_initial_lowercase the URI "foo" becomes "\Foo", which titleToURI() maps
            //back to "Foo". Kept as-is because existing titles depend on it - confirm the intent.
            $escapeduri[0] = strtoupper($first);
        }
        if ($prependbackslash)
        {
            $escapeduri = '\\' . $escapeduri;
        }

        //8. Prepend the namespace
        if ($matchingrow)
        {
            if ($matchingrow['aq_wiki_namespace_id'] > 0)
            {
                $escapeduri = $matchingrow['aq_wiki_namespace'] . ':' . $escapeduri;
            }
        }
        else
        {
            $escapeduri = 'Unknown:' . $escapeduri;
        }
        return $escapeduri;
    }
}
|