blob: a2b3abb2e67a8c151251172d15b8cffab5f4d5e0 [file] [log] [blame]
<?php
if ( !defined( 'MEDIAWIKI' ) ) {
die( 'This file is a MediaWiki extension, not a valid entry point.' );
}
/**
* This provides a {{#transliterate:map|word}} parser function that
*
* 1. Finds out which transliteration maps exist
* Transliteration maps are pages at [[MediaWiki:Transliterator:map]]
* This query is cached.
* 2. Loads the map if it exists, and parses it into an array suitable for use with strtr()
* This array is cached.
* 3. Applies the transliteration map to the word, case-insensitively, but respecting either NFD
* or NFC and combining characters, and word start-and-end markers.
*
* It also provides syntax checking for the transliteration pages, both on save and preview.
* Perhaps in the future it will provide an API interface to assist javascript transliteration.
*
* More detailed user-documentation is at http://mediawiki.org/wiki/Extension:Transliterator
*
* Design decisions:
* As there are an unlimited number of transliteration schemes, and which to use depends mainly
* on personal preference, it is too inflexible to provide the schemes along with the extension,
* though it may be nice to provide some default standardised ones in the future. Perhaps this
* would also be a way to support some more languages, but most languages that can be transliterated
* automatically can be done using this scheme.
*
* The maps are discovered in one query to deal with the expected use-case on en.wiktionary which
* is the translation template, i.e. {{#transliterate}} will be called with an invalid map name
* much more often than not.
*
* The need to handle NFD is illustrated best by Korean which has a tractable map in NFD but
* would require thousands of NFC rules. Word start and end markers are required by Greek and other
* languages that treat initial and final letters separately. Code-points are combined because of the
* mess that letting stray combining characters through on their own can cause, and the confusion
* that this causes.
*
* Most methods are static, with the exception of those methods that must interact (at some level)
* with the runtime cache of maps, everything else is stateless.
*/
use UtfNormal\Validator;
class ExtTransliterator {
// Marker characters that forTransliteration() inserts into words and rule left-hand-sides.
// These characters have been chosen because they are forbidden by MediaWiki, have no special
// regex meaning, are not unicode letters, and take up only one byte.
const WORD_START = "\x1F"; // A character that is inserted before a letter where ^ should match (start of a word)
const WORD_END = "\x1E"; // A character that is inserted after a letter where $ should match (end of a word)
const LETTER_END = "\x1D"; // A character appended after each character (or grapheme cluster) as a separator
// The prefix to use for cache items (the number should be incremented when the map format changes)
const CACHE_PREFIX = "extTransliterator.4";
// flags for preprocessor (passed to forTransliteration() and stored alongside each parsed map)
const DECOMPOSE = 1;
const IGNORE_ENDINGS = 2;
// attribute flags for postprocessor (values of the $attrs array built by addToRules())
const PREFIXED = 1;
const UPCASED = 2;
// Array of page_title (including the map page prefix) => page_id for every existing map page;
// null until getExistingMapNames() has been called on this instance.
public $mPages = null;
// Array of map page title (including the map page prefix) => the result of getMap() for it:
// false (no map), a String (parse error), or the parsed map.
public $mMaps = array();
/**
 * Parser-function callback for {{#transliterate: $mapname | $word | $format? | $answer? }}.
 *
 * Also registers the map page as a template dependency of the page being parsed,
 * so that the page is re-rendered when the map changes.
 *
 * @param $parser Parser
 * @param $mapname String name of the transliteration map to look up
 * @param $word String the text to transliterate (only used when the map exists)
 * @param $format String containing $1, which is replaced by the result
 * @param $answer String a user-specified transliteration that overrides the automatic one
 * @return String the formatted result, an error span, or '' when the map does not exist
 */
function render( $parser, $mapname = '', $word = '', $format = '$1', $answer = '' ) {
	// {{#transliterate:<>|<>||<>}} leaves the format blank: fall back to the default.
	if ( trim( $format ) === '' ) {
		$format = '$1';
	}

	// A supplied answer short-circuits everything else.
	if ( trim( $answer ) !== '' ) {
		return str_replace( '$1', $answer, $format );
	}

	// An invalid title means we are inside a template ({{#transliterate:{{{1}}}...}}),
	// so echo the call back using the first synonym of the magic word.
	$title = Title::newFromText( self::getMapPagePrefix() . $mapname, NS_MEDIAWIKI );
	if ( !$title ) {
		$function = MagicWord::get( 'transliterate' )->getSynonym( 0 );
		$echoed = "{{#$function:$mapname|$word}}";
		return str_replace( '$1', $echoed, $format );
	}

	$mappage = $title->getDBkey();
	$map = $this->getMap( $mappage );

	if ( !$map ) {
		// No such map.
		$output = '';
	} elseif ( is_string( $map ) ) {
		// A parse error (::validate should have caught it, but you can't be too careful).
		$output = self::wrapError( $map );
	} else {
		// Success: transliterate and slot the result into the format.
		$transliterated = self::transliterate( $word, $map );
		$output = str_replace( '$1', $transliterated, $format );
	}

	// Populate the dependency table so that we get re-rendered if the map changes.
	// TODO: It would be nice if we could make this invisible to the user.
	$pageId = isset( $this->mPages[$mappage] )
		? $this->mPages[$mappage]
		: $title->getArticleID();
	$parser->mOutput->addTemplate( $title, $pageId, null );

	return $output;
}
/**
 * Get all the existing maps in one query.
 *
 * The result is memoised on this instance ($this->mPages) and stored in $wgMemc
 * under the "__map_names__" key until purgeTitle() deletes that key.
 *
 * @return Array( page_title => page_id ) for every page in the MediaWiki namespace
 *         whose title starts with the map page prefix
 */
function getExistingMapNames() {
	global $wgMemc;

	// Has it been used on this page already?
	if ( $this->mPages !== null ) {
		return $this->mPages;
	}

	// Has it been used since it was last updated?
	// An empty array is a perfectly valid cached answer (no maps exist), so test the
	// type rather than truthiness; otherwise wikis without maps hit the database
	// on every request.
	$cacheKey = wfMemcKey( self::CACHE_PREFIX, "__map_names__" );
	$cached = $wgMemc->get( $cacheKey );
	if ( is_array( $cached ) ) {
		return $this->mPages = $cached;
	}

	$dbr = wfGetDB( DB_SLAVE );
	// TODO: This could potentially cause problems if someone creates a few thousand
	// pages with the prefix. The prefix is guaranteed to be a few letters, so this
	// won't get the whole MediaWiki namespace.
	// The result of this query is memcached until someone edits a map.
	$res = $dbr->select( 'page',
		array( 'page_title', 'page_id' ),
		array(
			'page_namespace' => NS_MEDIAWIKI,
			'page_title ' . $dbr->buildLike( self::getMapPagePrefix(), $dbr->anyString() )
		),
		__METHOD__
	);

	$this->mPages = array();
	foreach ( $res as $row ) {
		$this->mPages[$row->page_title] = $row->page_id;
	}

	$wgMemc->set( $cacheKey, $this->mPages );
	return $this->mPages;
}
/**
 * Fetch the parsed map for a map page.
 *
 * Lookup order: this instance's $mMaps, then $wgMemc, then the page text
 * (via wfMessage) parsed with readMap(). Pages that are not in the list
 * returned by getExistingMapNames() are never loaded.
 *
 * @param $mappage String including MapPagePrefix
 * @return false (no map) || String (parse error) || Map (success)
 */
function getMap( $mappage ) {
	global $wgMemc;

	// Already resolved during this request (a stored false also counts as set).
	if ( isset( $this->mMaps[$mappage] ) ) {
		return $this->mMaps[$mappage];
	}

	$knownPages = $this->getExistingMapNames();
	if ( !isset( $knownPages[$mappage] ) ) {
		return $this->mMaps[$mappage] = false;
	}

	$map = $wgMemc->get( wfMemcKey( self::CACHE_PREFIX, $mappage ) );
	if ( !$map ) {
		$mapMessage = wfMessage( $mappage );
		if ( !$mapMessage->isDisabled() ) {
			$map = self::readMap( $mapMessage->text(), $mappage );
		}
		// Only truthy results (a parsed map or an error string) are worth caching.
		if ( $map ) {
			$wgMemc->set( wfMemcKey( self::CACHE_PREFIX, $mappage ), $map );
		}
	}

	$this->mMaps[$mappage] = $map;
	return $map;
}
/**
 * Normalise the text so it can be used with strtr() safely
 *
 * 1. decodeCharReferences
 * 2. split into NFD codepoints or NFC fully combined, appending LETTER_END to each
 * 3. add bookends (WORD_START / WORD_END) on word boundaries
 *
 * @param $word String from user input
 * @param $flags may include self::DECOMPOSE, self::IGNORE_ENDINGS
 * @return String the word with LETTER_END, WORD_START and WORD_END markers inserted
 */
static function forTransliteration( $word, $flags ) {
	static $regexes = null;

	// NOTE: this is very slightly inconsistent with MediaWiki if an NFD code-point
	// has been HTML escaped it will be converted to NFC if it passes through
	// transliteration unchanged, I think that's a WONTFIX though.
	$word = Sanitizer::decodeCharReferences( $word );

	if ( $flags & self::DECOMPOSE ) {
		$word = Validator::toNFD( $word );
		// The "s" modifier lets "." match newlines as well. Without it a newline got no
		// LETTER_END, so (unlike the \X branch below) a word following a newline was
		// never given a WORD_START marker by the 'start' regex.
		// NOTE(review): parsed maps cached by the previous code that contain an encoded
		// newline in a rule would no longer match; purge them if any exist.
		$word = preg_replace( '/./us', '$0' . self::LETTER_END, $word );
	} else {
		$word = preg_replace( '/\X/u', '$0' . self::LETTER_END, $word );
	}

	if ( !$regexes ) {
		// A "letter" is a unicode letter followed by some combining characters
		// A "non-letter" is any other character followed by some combining characters
		// "end" is done first so it watches out for word-endings in "start"
		// If it should treat endings then the start and end of the string are non-letters
		// Otherwise it does not touch the start or end of the string, only internal transitions
		$combining = '(?:[\pM]*' . self::LETTER_END . ')';
		$nonletter = '[^\pL' . self::LETTER_END . self::WORD_END . '\pM]';
		$regexes = array (
			'endings' => array (
				'start' => "/(^$combining?|$nonletter$combining)([\pL])/u",
				'end' => "/([\pL]$combining)([^\pL]|$)/u",
			),
			'ignore-endings' => array (
				'start' => "/($nonletter$combining)([\pL])/u",
				'end' => "/([\pL]$combining)([^\pL])/u",
			),
		);
	}

	$regex = $regexes[$flags & self::IGNORE_ENDINGS ? 'ignore-endings' : 'endings'];
	$word = preg_replace( $regex['end'], '$1' . self::WORD_END . '$2', $word );
	$word = preg_replace( $regex['start'], '$1' . self::WORD_START . '$2', $word );
	return $word;
}
/**
 * Update the current rule-set from a rule
 *
 * Adds the rule itself and, when upper-casing the first letter of $from changes it
 * and no such rule exists yet, an automatically generated upper-case variant.
 *
 * @param $from String, the left-hand-side of a rule (must not be empty)
 * @param $to String, the right-hand-side of a rule
 * @param $flags Flags, flags from the top of the map
 * @param $rules Array, ^from$ -> to (for strtr())
 * @param $attrs Array, from$ -> Flags (for post processing)
 * @return Bool true on success, false if an ambiguous rule
 */
static function addToRules( $from, $to, $flags, &$rules, &$attrs ) {
	global $wgLang;

	$prefix = $suffix = '';

	// forTransliteration() may decode a deliberately escaped ^ or $.
	// in order to find accurate word boundaries. So we check here if
	// this occurs, and work around it.
	$noprefix = $nosuffix = false;
	if ( $from[0] === '&' || $from[strlen( $from ) - 1] === ';' ) {
		$decoded = Sanitizer::decodeCharReferences( $from );
		$noprefix = $decoded[0] === '^';
		$nosuffix = $decoded[strlen( $decoded ) - 1] === '$';
	}

	$from = self::forTransliteration( $from, $flags | self::IGNORE_ENDINGS );

	// Strip a literal leading ^ / trailing $ (with the markers added around them)
	// and remember them as a word-start / word-end anchor instead.
	if ( !$noprefix ) {
		$from = preg_replace( '/^[\^][' . self::LETTER_END . '][' . self::WORD_START . ']/u', '', $from, 1, $count );
		if ( $count ) {
			$prefix = self::WORD_START;
		}
	}
	if ( !$nosuffix ) {
		$from = preg_replace( '/[' . self::WORD_END . '][$][' . self::LETTER_END . ']$/u', '', $from, 1, $count );
		if ( $count ) {
			$suffix = self::WORD_END;
		}
	}

	$ruleKey = $prefix . $from . $suffix;
	// Attributes are keyed WITHOUT the prefix but WITH the suffix (see the writes below),
	// so the ambiguity check has to read the same key. Reading $attrs[$from] here
	// looked at the wrong (or an undefined) entry for every rule ending in $, which
	// rejected explicit rules that should simply replace an auto-upcased one.
	$attrKey = $from . $suffix;
	$existingAttr = isset( $attrs[$attrKey] ) ? $attrs[$attrKey] : 0;

	// Check that this rule isn't ambiguous
	if ( isset( $rules[$ruleKey] ) && !( $existingAttr & self::UPCASED ) ) {
		return false;
	}

	$rules[$ruleKey] = $to;
	$attrs[$attrKey] = $prefix ? self::PREFIXED : 0;

	// Now case-insensitivity
	$casefrom = $wgLang->ucfirst( $from );
	if ( $from !== $casefrom && !isset( $rules[$prefix . $casefrom . $suffix] ) ) {
		$rules[$prefix . $casefrom . $suffix] = $wgLang->ucfirst( $to );
		$attrs[$casefrom . $suffix] = ( $prefix ? self::PREFIXED : 0 ) | self::UPCASED;
	}

	return true;
}
/**
 * Filter callback for map pages: keep a line unless it is empty or a comment.
 *
 * @param $line String one line of a map page
 * @return Boolean false for an empty line or one whose first character is #, true otherwise
 */
static function isUsefulLine( $line ) {
	if ( $line == '' ) {
		return false;
	}
	return $line[0] != '#';
}
/**
 * Parse the text of a map page into a map.
 *
 * The input is a set of lines; whitespace around each line and around => is dropped.
 * Empty lines and lines starting with # are ignored. If the first remaining line is
 * the tr_decompose magic word the DECOMPOSE flag is set. Every other line must be
 * of the form "from => to".
 * HTML entities are decoded (essential for sanity when trying to add rules for combining codepoints)
 *
 * The number of rules and the byte length of each left-hand-side are limited by
 * $wgTransliteratorRuleCount and $wgTransliteratorRuleSize.
 *
 * @param $input String - the contents of the map page
 * @param $mappage String - the title of the map page (without MediaWiki:)
 * @return false (empty, or no useful content, treat as no map) ||
 *         String (error message, syntax error while parsing) ||
 *         Array( 'rules' => ReplacementArray, 'flags' => Flags )
 */
static function readMap( $input, $mappage ) {
	global $wgTransliteratorRuleCount, $wgTransliteratorRuleSize;

	$rules = array(); // The actual rules to go into strtr()
	$attrs = array(); // Attributes of those rules, used by postProcessMap()
	$flags = 0;       // Flags that apply to the whole map

	// Split into lines, then drop blank lines and comments and re-index.
	$lines = array_values( array_filter(
		preg_split( "/\s*\n\s*/u", trim( $input ) ),
		array( 'ExtTransliterator', 'isUsefulLine' )
	) );

	// Nothing left?
	if ( count( $lines ) === 0 ) {
		return false;
	}

	// Check for __DECOMPOSE__ on the first useful line.
	if ( MagicWord::get( 'tr_decompose' )->matchVariableStartToEnd( $lines[0] ) ) {
		$flags |= self::DECOMPOSE;
		array_shift( $lines );
	}

	// Check for DoS
	if ( count( $lines ) > $wgTransliteratorRuleCount ) {
		return wfMessage( 'transliterator-error-rulecount', $wgTransliteratorRuleCount, $mappage )->escaped();
	}

	foreach ( $lines as $line ) {
		$pair = preg_split( '/\s*=>\s*/u', $line );

		$malformed = count( $pair ) != 2 || $pair[0] === '';
		if ( $malformed ) {
			return wfMessage( 'transliterator-error-syntax', $line, $mappage )->escaped();
		}

		if ( strlen( $pair[0] ) > $wgTransliteratorRuleSize ) {
			return wfMessage( 'transliterator-error-rulesize', $line, $mappage, $wgTransliteratorRuleSize )->escaped();
		}

		$added = self::addToRules( $pair[0], $pair[1], $flags, $rules, $attrs );
		if ( !$added ) {
			return wfMessage( 'transliterator-error-ambiguous', $line, $mappage )->escaped();
		}
	}

	return self::postProcessMap( $rules, $attrs, $flags );
}
/**
 * Fix problems created by readMap.
 *
 * 1. Long auto-generated case rules override case-specific rules.
 * Delete the auto-generated rules.
 *
 * 2. The ^ operator overrides length ($ is ok though).
 * Insert extra ^ rules with each needed length.
 *
 * 3. All the bookends we have inserted are still there
 * Add rules to remove them.
 *
 * The loop below walks $attrs in sorted key order and keeps two pieces of state
 * between iterations (the last explicitly-cased key and the last ^-prefixed key),
 * so the order of the statements inside it matters.
 *
 * @param $rules Array( from => to )
 * @param $attrs Array( from => self::PREFIXED | self::UPCASED )
 * @param $flags May contain self::DECOMPOSE
 * @return Array( "rules" => ReplacementArray built from $rules, "flags" => $flags )
 */
static function postProcessMap( $rules, $attrs, $flags ) {
// $attrs is sorted into binary order, so start-based substrings of longer rules
// immediately precede them. Don't need to know anything else about the order.
ksort( $attrs );
// Whether a ^-prefixed rule is currently being tracked, and its key.
$wasPrefixed = false;
// Whether an explicitly-cased (not auto-upcased) rule is currently being tracked, and its key.
$naturalCased = false;
$naturalCasedFrom = '';
$prefixedFrom = '';
foreach ( $attrs as $from => $attr ) {
// If the current rule has been auto-upcased, but a prefix of this rule wasn't,
// remove the auto-upcased rule as the specified upper-case takes priority.
if ( $attr & self::UPCASED ) {
if ( $naturalCased ) {
if ( strpos( $from, $naturalCasedFrom ) === 0 ) {
// Drop both the plain and the ^-prefixed form of the auto-generated rule,
// and skip the ^ handling below for it.
unset( $rules[$from] );
unset( $rules[self::WORD_START . $from] );
continue;
} else {
// This key no longer starts with the tracked key: stop tracking it.
$naturalCased = false;
}
}
} elseif ( !$naturalCased ) {
// Start tracking this explicitly-cased rule.
$naturalCased = true;
$naturalCasedFrom = $from;
}
// When it finds a ^ed rule, it duplicates all rules that start with that rule
// which has the effect of promoting length to override ^.
// If the length is actually the same, ^x needs to maintain priority over x$
if ( $attr & self::PREFIXED ) {
// Track this ^ rule unless it merely extends the one already being tracked.
if ( !$wasPrefixed || strpos( $from, $prefixedFrom ) !== 0 ) {
$wasPrefixed = true;
$prefixedFrom = $from;
}
} elseif ( $wasPrefixed ) {
if ( strpos( $from, $prefixedFrom ) === 0 ) {
// An un-prefixed rule that extends the tracked ^ rule: add a ^ copy of it,
// except for the same-length "x$" case mentioned above.
if ( $from !== $prefixedFrom . self::WORD_END ) {
$rules[self::WORD_START . $from] = $rules[$from];
}
} else {
// Moved past the rules that start with the tracked ^ rule.
$wasPrefixed = false;
}
}
}
// Finally add rules that map each marker character to nothing, so that any
// markers not consumed by another rule are removed from the output.
$rules[self::LETTER_END] = '';
$rules[self::WORD_END] = '';
$rules[self::WORD_START] = '';
// transliterate() calls ->replace() on this object.
$rules = new ReplacementArray( $rules );
return array( 'rules' => $rules, 'flags' => $flags );
}
/**
 * Apply a parsed map to a word.
 *
 * The word is first marked up by forTransliteration() using the map's flags,
 * then run through the map's rules, and finally normalised to NFC
 * (the MediaWiki invariant).
 *
 * @param $word raw user input
 * @param $map as returned by getMap()
 * @return String ready to output
 */
static function transliterate( $word, $map ) {
	$marked = self::forTransliteration( $word, $map['flags'] );
	$replaced = $map['rules']->replace( $marked );
	return Validator::toNFC( $replaced );
}
/**
 * Wrap a message in the span used for Transliterator errors.
 *
 * The message is inserted as-is, so it must already be safe HTML.
 *
 * @param $msg String (HTML)
 * @return String (HTML)
 */
static function wrapError( $msg ) {
	return '<span class="transliterator error">' . $msg . '</span>';
}
/**
 * Get the prefix to use for map pages in the MediaWiki namespace.
 *
 * The prefix is the first synonym of the tr_prefix magic word, falling back to
 * "Transliterator:" when that is shorter than three bytes or not a valid title.
 * The result is computed once and then reused.
 *
 * @return String
 */
static function getMapPagePrefix () {
	static $prefix = null;

	if ( $prefix ) {
		return $prefix;
	}

	$prefix = MagicWord::get( 'tr_prefix' )->getSynonym( 0 );

	// If the prefix is too short then parts of the MediaWiki namespace are
	// rendered un-editable. If it is not a valid title, then it is very broken.
	$usable = strlen( $prefix ) >= 3 && Title::newFromText( $prefix, NS_MEDIAWIKI );
	if ( !$usable ) {
		wfDebug( "Invalid Transliterator prefix, must be a valid title longer than three characters, falling back to Transliterator:" );
		$prefix = "Transliterator:";
	}

	return $prefix;
}
/**
 * Decide whether the title represents a Transliterator map, i.e. it is in the
 * MediaWiki namespace and its text starts with the map page prefix.
 *
 * @param $title Title
 * @return Boolean
 */
static function isMapPage( &$title ) {
	return $title->getNamespace() == NS_MEDIAWIKI
		&& strpos( $title->getText(), self::getMapPagePrefix() ) === 0;
}
/**
 * Hook handler (ArticlePurge, ArticleDeleteComplete): drop the page's title
 * from the Transliterator caches.
 *
 * @param WikiPage $wikiPage
 * @return Boolean whatever purgeTitle() returns
 */
static function purgeArticle( WikiPage &$wikiPage ) {
	// purgeTitle() takes its argument by reference, so it needs a real variable.
	$pageTitle = $wikiPage->getTitle();
	return self::purgeTitle( $pageTitle );
}
/**
 * Hook handler (NewRevisionFromEditComplete): drop the page's title
 * from the Transliterator caches.
 *
 * @param WikiPage $wikiPage
 * @return Boolean whatever purgeTitle() returns
 */
static function purgeArticleNewRevision( WikiPage $wikiPage ) {
	// purgeTitle() takes its argument by reference, so it needs a real variable.
	$pageTitle = $wikiPage->getTitle();
	return self::purgeTitle( $pageTitle );
}
/**
 * Remove both titles involved in a page move from the Transliterator caches.
 * (TitleMoveComplete hook)
 *
 * The old title is purged as well as the new one: when a map page is moved to a
 * title that is not a map page, purging only the new title would leave the old
 * parsed map and the cached list of map names in place.
 *
 * @param $title Title the title the page was moved from
 * @param $newtitle Title the title the page was moved to
 * @return Boolean whatever purgeTitle() returns (so hook processing continues)
 */
static function purgeNewTitle( &$title, &$newtitle ) {
	self::purgeTitle( $title );
	return self::purgeTitle( $newtitle );
}
/**
 * Remove the title from the Transliterator caches.
 * (ArticleUndelete hook)
 *
 * For a map page this deletes both the cached parsed map for that page and the
 * cached list of map names; other titles are left alone.
 *
 * @param $title Title
 * @return Boolean always true
 */
static function purgeTitle( &$title ) {
	global $wgMemc;

	if ( !self::isMapPage( $title ) ) {
		return true;
	}

	$wgMemc->delete( wfMemcKey( self::CACHE_PREFIX, $title->getDBkey() ) );
	$wgMemc->delete( wfMemcKey( self::CACHE_PREFIX, '__map_names__' ) );
	return true;
}
/**
 * Report the error, if any, that parsing this text as a map would produce.
 *
 * Only acts when the page being edited is a map page. Does not follow redirects.
 *
 * (EditFilter hook)
 *
 * @param $editPage EditPage
 * @param $text String the text being saved
 * @param $section unused
 * @param $hookError String set to the wrapped error message when the text fails to parse
 * @return Boolean always true
 */
static function validate( $editPage, $text, $section, &$hookError ) {
	// FIXME: Should not access private variables
	$title = $editPage->mTitle;

	if ( !self::isMapPage( $title ) ) {
		return true;
	}

	// readMap() returns a string only for a parse error.
	$parsed = self::readMap( $text, $title->getDBkey() );
	if ( is_string( $parsed ) ) {
		$hookError = self::wrapError( $parsed );
	}

	return true;
}
/**
 * Prepend any error message caused by parsing the text for preview.
 * (EditPageGetPreviewText hook)
 *
 * @param $editPage EditPage
 * @param $content Content replaced by "error, horizontal rule, original text" when
 *        validate() reports an error; otherwise left untouched
 * @return bool always true
 */
static function preview( $editPage, &$content ) {
	$hookError = null;
	$text = ContentHandler::getContentText( $content );

	self::validate( $editPage, $text, null, $hookError );

	if ( $hookError ) {
		// ContentHandler::makeContent() needs either a Title or a content model ID;
		// calling it with the text alone throws, so pass the title being edited.
		$content = ContentHandler::makeContent(
			$hookError . "\n----\n" . $text,
			$editPage->getTitle()
		);
	}

	return true;
}
/**
 * Create the ExtTransliterator instance and register its render() method as the
 * handler for the 'transliterate' parser function.
 * (ParserFirstCallInit hook)
 *
 * @param $parser Parser
 * @return Boolean always true
 */
static function setup( &$parser ) {
	$instance = new ExtTransliterator;
	$callback = array( $instance, 'render' );
	$parser->setFunctionHook( 'transliterate', $callback );
	return true;
}
}