Transliterator_body.php - mediawiki/extensions/Transliterator - Gitiles

 <?php

 if ( !defined( 'MEDIAWIKI' ) ) {
 	die( 'This file is a MediaWiki extension, not a valid entry point.' );
 }

 /**
  * This provides a {{#transliterate:map|word}} parser function that
  *
  * 1. Finds out which transliteration maps exist
  *    Transliteration maps are pages at [[MediaWiki:Transliterator:map]]
  *    This query is cached.
  * 2. Loads the map if it exists, and parses it into an array suitable for use with strtr()
  *    This array is cached.
  * 3. Applies the transliteration map to the word, case-insensitively, but respecting either NFD
  *    or NFC and combining characters, and word start-and-end markers.
  *
  * It also provides syntax checking for the transliteration pages, both on save and preview.
  * Perhaps in the future it will provide an API interface to assist javascript transliteration.
  *
  * More detailed user-documentation is at http://mediawiki.org/wiki/Extension:Transliterator
  *
  * Design decisions:
  * As there are an unlimited number of transliteration schemes, and which to use depends mainly
  * on personal preference, it is too inflexible to provide the schemes along with the extension,
  * though it may be nice to provide some default standardised ones in the future. Perhaps this
  * would also be a way to support some more languages, but most languages that can be transliterated
  * automatically can be done using this scheme.
  *
  * The maps are discovered in one query to deal with the expected use-case on en.wiktionary which
  * is the translation template, i.e. {{#transliterate}} will be called with an invalid map name
  * much more often than not.
  *
  * The need to handle NFD is illustrated best by Korean which has a tractable map in NFD but
  * would require thousands of NFC rules.  Word start and end markers are required by Greek and other
  * languages that treat initial and final letters separately. Code-points are combined because of the
  * mess that letting stray combining characters through on their own can cause, and the confusion
  * that this causes.
  *
  * Most methods are static, with the exception of those methods that must interact (at some level)
  * with the runtime cache of maps, everything else is stateless.
  */

 use UtfNormal\Validator;

 class ExtTransliterator  {
 	// These characters have been chosen because they are forbidden by MediaWiki, have no special
 	// regex meaning, are not unicode letters, and take up only one byte.
 	const WORD_START = "\x1F";  // Marker inserted in front of the first letter of a word, so a rule's ^ can match it
 	const WORD_END = "\x1E";    // Marker inserted behind the last letter of a word, so a rule's $ can match it
 	const LETTER_END = "\x1D";  // A character added after each character as a separator

 	// The prefix to use for cache items (the number should be incremented when the map format changes)
 	const CACHE_PREFIX = "extTransliterator.4";

 	// flags for preprocessor (bit flags, combined with |)
 	const DECOMPOSE = 1;
 	const IGNORE_ENDINGS = 2;

 	// attribute flags for postprocessor (bit flags, combined with |)
 	const PREFIXED = 1;
 	const UPCASED = 2;

 	public $mPages = null;   // null until loaded, then an Array of map page DB key (prefix included) => page_id
 	public $mMaps = array(); // An Array of map page DB key => result of getMap() for that page (false, error String or map)


 	/**
 	 * Parser-function callback for {{#transliterate: $mapname | $word | $format? | $answer? }}.
 	 *
 	 * @param $parser Parser whose output records the dependency on the map page
 	 * @param $mapname String name of the transliteration map (without the map page prefix)
 	 * @param $word String text to transliterate
 	 * @param $format String pattern in which $1 is replaced by the result; blank is treated as '$1'
 	 * @param $answer String optional hand-written transliteration that overrides the automatic one
 	 * @return String the formatted result, an error span, or '' when there is no such map
 	 */
 	function render( $parser, $mapname = '', $word = '', $format = '$1', $answer = '' ) {
 		// {{#transliterate:<>|<>||<>}} hands us an empty format; fall back to the default.
 		$pattern = trim( $format ) === '' ? '$1' : $format;

 		// A supplied answer wins without looking at any map.
 		if ( trim( $answer ) !== '' ) {
 			return str_replace( '$1', $answer, $pattern );
 		}

 		// An invalid title means we are inside a template, e.g. {{#transliterate:{{{1}}}...}},
 		// so echo the call back as wikitext.
 		$mapTitle = Title::newFromText( self::getMapPagePrefix() . $mapname, NS_MEDIAWIKI );
 		if ( !$mapTitle ) {
 			$function = MagicWord::get( 'transliterate' )->getSynonym( 0 );
 			return str_replace( '$1', "{{#$function:$mapname|$word}}", $pattern );
 		}

 		$mappage = $mapTitle->getDBkey();
 		$map = $this->getMap( $mappage );

 		if ( !$map ) {
 			// No map found: produce nothing.
 			$output = '';
 		} elseif ( is_string( $map ) ) {
 			// A parse error message (::validate should have caught it on save).
 			$output = self::wrapError( $map );
 		} else {
 			$output = str_replace( '$1', self::transliterate( $word, $map ), $pattern );
 		}

 		// Register the map page as a template dependency so that we get re-rendered
 		// if the map changes. Prefer the page id we already loaded, if any.
 		$pageId = isset( $this->mPages[$mappage] )
 			? $this->mPages[$mappage]
 			: $mapTitle->getArticleID();
 		$parser->mOutput->addTemplate( $mapTitle, $pageId, null );

 		return $output;
 	}

 	/**
 	 * Get all the existing maps in one query.
 	 *
 	 * Looks in the per-instance cache first, then in memcached, and only then
 	 * queries the page table for titles in the MediaWiki namespace that start
 	 * with the map page prefix. The result is stored in both caches.
 	 *
 	 * @return Array( title => id );
 	 */
 	function getExistingMapNames() {
 		global $wgMemc;

 		// Has it been used on this page already?
 		if ( $this->mPages !== null ) {
 			return $this->mPages;
 		}

 		$cacheKey = wfMemcKey( self::CACHE_PREFIX, "__map_names__" );

 		// Has it been used since it was last updated?
 		// An empty array is a perfectly valid cached answer ("no maps exist"), so test
 		// the type instead of truthiness; otherwise a wiki without maps hits the
 		// database on every parse.
 		$cached = $wgMemc->get( $cacheKey );
 		if ( is_array( $cached ) ) {
 			return $this->mPages = $cached;
 		}

 		$dbr = wfGetDB( DB_SLAVE );
 		// TODO: This could potentially cause problems if someone creates a few thousand
 		// pages with the prefix. The prefix is guaranteed to be a few letters, so this
 		// won't get the whole MediaWiki namespace.
 		// The result of this query is memcached until someone edits a map.
 		$res = $dbr->select( 'page',
 			array( 'page_title', 'page_id' ),
 			array(
 				'page_namespace' => NS_MEDIAWIKI,
 				'page_title ' . $dbr->buildLike( self::getMapPagePrefix(), $dbr->anyString() )
 			),
 			__METHOD__
 		);

 		$this->mPages = array();

 		while ( $r = $res->fetchObject() ) {
 			$this->mPages[$r->page_title] = $r->page_id;
 		}

 		$wgMemc->set( $cacheKey, $this->mPages );
 		return $this->mPages;
 	}

 	/**
 	 * Fetch the parsed form of a map page.
 	 *
 	 * Lookup order: per-instance cache, list of existing map pages, memcached,
 	 * and finally the page text itself (parsed with readMap()). Whatever is found,
 	 * including false, is remembered in $this->mMaps.
 	 *
 	 * @param $mappage  String including MapPagePrefix
 	 * @return false (no map) || String (parse error) || Map (success)
 	 */
 	function getMap( $mappage ) {
 		global $wgMemc;

 		if ( isset( $this->mMaps[$mappage] ) ) {
 			return $this->mMaps[$mappage];
 		}

 		// Pages that do not exist never reach memcached or the message cache.
 		$known = $this->getExistingMapNames();
 		if ( !isset( $known[$mappage] ) ) {
 			return $this->mMaps[$mappage] = false;
 		}

 		$cacheKey = wfMemcKey( self::CACHE_PREFIX, $mappage );
 		$parsed = $wgMemc->get( $cacheKey );
 		if ( $parsed ) {
 			return $this->mMaps[$mappage] = $parsed;
 		}

 		$message = wfMessage( $mappage );
 		if ( !$message->isDisabled() ) {
 			$parsed = self::readMap( $message->text(), $mappage );
 		}

 		// Only truthy results (a map or an error string) are worth sharing.
 		if ( $parsed ) {
 			$wgMemc->set( $cacheKey, $parsed );
 		}

 		return $this->mMaps[$mappage] = $parsed;
 	}

 	/**
 	 * Prepare text so that the rule table can be applied to it.
 	 *
 	 * 1. Character references are decoded.
 	 * 2. Every unit gets a LETTER_END after it: each code point of the NFD form when
 	 *    DECOMPOSE is set, otherwise each extended grapheme (\X).
 	 * 3. WORD_END and WORD_START markers are inserted at letter/non-letter transitions.
 	 *
 	 * @param $word  String from user input
 	 * @param $flags  may include self::DECOMPOSE, self::IGNORE_ENDINGS
 	 * @return String
 	 */
 	static function forTransliteration( $word, $flags ) {
 		static $regexes = null;

 		// NOTE: this is very slightly inconsistent with MediaWiki: an HTML-escaped NFD
 		// code-point that passes through unchanged comes out as NFC. Considered WONTFIX.
 		$text = Sanitizer::decodeCharReferences( $word );

 		// Choose the unit that receives a separator.
 		$unit = '/\X/u';
 		if ( $flags & self::DECOMPOSE ) {
 			$text = Validator::toNFD( $text );
 			$unit = '/./u';
 		}
 		$text = preg_replace( $unit, '$0' . self::LETTER_END, $text );

 		if ( !$regexes ) {
 			// A "letter" is a unicode letter followed by some combining characters
 			// A "non-letter" is any other character followed by some combining characters
 			// With 'endings' the start and end of the string count as non-letters;
 			// with 'ignore-endings' only transitions inside the string are marked.
 			$combining = '(?:[\pM]*' . self::LETTER_END . ')';
 			$nonletter = '[^\pL' . self::LETTER_END . self::WORD_END . '\pM]';
 			$regexes = array (
 				'endings' => array (
 					'start' => "/(^$combining?|$nonletter$combining)([\pL])/u",
 					'end' => "/([\pL]$combining)([^\pL]|$)/u",
 				),
 				'ignore-endings' => array (
 					'start' => "/($nonletter$combining)([\pL])/u",
 					'end' => "/([\pL]$combining)([^\pL])/u",
 				),
 			);
 		}

 		$mode = ( $flags & self::IGNORE_ENDINGS ) ? 'ignore-endings' : 'endings';
 		$boundary = $regexes[$mode];

 		// "end" runs first so that "start" can take the inserted WORD_END into account.
 		$text = preg_replace( $boundary['end'], '$1' . self::WORD_END . '$2', $text );
 		return preg_replace( $boundary['start'], '$1' . self::WORD_START . '$2', $text );
 	}

 	/**
 	 * Update the current rule-set from a rule
 	 *
 	 * The left-hand side is normalised with forTransliteration(), a leading ^ or
 	 * trailing $ is converted into a WORD_START prefix / WORD_END suffix, and an
 	 * automatically upper-cased variant is added unless one is already present.
 	 *
 	 * @param $from String, the left-hand-side of a rule (must not be empty)
 	 * @param $to String, the right-hand-side of a rule
 	 * @param $flags Flags, flags from the top of the map
 	 * @param $rules Array, ^from$ -> to (for strtr())
 	 * @param $attrs Array, from$ -> Flags (for post processing)
 	 * @return Bool true on success, false if an ambiguous rule
 	 */
 	static function addToRules( $from, $to, $flags, &$rules, &$attrs ) {
 		global $wgLang;

 		$prefix = $suffix = '';

 		// forTransliteration() may decode a deliberately escaped ^ or $.
 		// in order to find accurate word boundaries. So we check here if
 		// this occurs, and work around it.
 		$noprefix = $nosuffix = false;
 		if ( $from[0] === '&' || $from[strlen( $from ) - 1] === ';' ) {
 			$decoded = Sanitizer::decodeCharReferences( $from );
 			$noprefix = $decoded[0] === '^';
 			$nosuffix = $decoded[strlen( $decoded ) - 1] === '$';
 		}

 		$from = self::forTransliteration( $from, $flags | self::IGNORE_ENDINGS );

 		if ( !$noprefix ) {
 			$from = preg_replace( '/^[\^][' . self::LETTER_END . '][' . self::WORD_START . ']/u', '', $from, 1, $count );
 			if ( $count ) {
 				$prefix = self::WORD_START;
 			}
 		}
 		if ( !$nosuffix ) {
 			$from = preg_replace( '/[' . self::WORD_END . '][$][' . self::LETTER_END . ']$/u', '', $from, 1, $count );
 			if ( $count ) {
 				$suffix = self::WORD_END;
 			}
 		}

 		$ruleKey = $prefix . $from . $suffix;
 		// Attributes are stored per "from$" (suffix included, prefix excluded), so the
 		// lookup must use the same key; looking up the bare $from consulted the wrong
 		// entry (or none at all) for every $-suffixed rule.
 		$attrKey = $from . $suffix;

 		// Check that this rule isn't ambiguous: an existing rule may only be replaced
 		// when it was generated automatically by the upper-casing below.
 		$autoUpcased = isset( $attrs[$attrKey] ) && ( $attrs[$attrKey] & self::UPCASED );
 		if ( isset( $rules[$ruleKey] ) && !$autoUpcased ) {
 			return false;
 		}

 		$rules[$ruleKey] = $to;
 		$attrs[$attrKey] = $prefix ? self::PREFIXED : 0;

 		// Now case-insensitivity
 		$casefrom = $wgLang->ucfirst( $from );
 		if ( $from !== $casefrom && !isset( $rules[$prefix . $casefrom . $suffix] ) ) {
 			$rules[$prefix . $casefrom . $suffix] = $wgLang->ucfirst( $to );
 			$attrs[$casefrom . $suffix] = ( $prefix ? self::PREFIXED : 0 ) | self::UPCASED;
 		}

 		return true;
 	}

 	/**
 	 * Filter callback for map lines: keeps everything except empty lines
 	 * and lines whose first character is '#'.
 	 *
 	 * @param $line String one line of a map page
 	 * @return Boolean true if the line should be parsed as a rule
 	 */
 	static function isUsefulLine( $line ) {
 		if ( $line == '' ) {
 			return false;
 		}

 		return $line[0] != '#';
 	}

 	/**
 	 * Turn the text of a map page into a map.
 	 *
 	 * The text is split into lines (whitespace around line breaks is dropped),
 	 * empty lines and lines starting with # are discarded, an optional leading
 	 * tr_decompose magic word switches on DECOMPOSE, and every remaining line
 	 * must have the form "from => to".
 	 *
 	 * @param $input String - the contents of the map page
 	 * @param $mappage String - the title of the map page (without MediaWiki:)
 	 * @return false (empty, or no useful content, treat as no map) ||
 	 *         String (error message, syntax error while parsing) ||
 	 *         Array as returned by postProcessMap()
 	 */
 	static function readMap( $input, $mappage ) {
 		global $wgTransliteratorRuleCount, $wgTransliteratorRuleSize;

 		$rules = array(); // The actual rules to go into strtr()
 		$attrs = array(); // Per-rule attribute flags, consumed by postProcessMap()
 		$flags = 0;       // Flags that apply to the whole map

 		// Split lines, drop blanks and comments, and reindex from zero.
 		$lines = array_values( array_filter(
 			preg_split( "/\s*\n\s*/u", trim( $input ) ),
 			array( 'ExtTransliterator', 'isUsefulLine' )
 		) );

 		if ( count( $lines ) == 0 ) {
 			return false;
 		}

 		// Check for __DECOMPOSE__ on the first useful line.
 		if ( MagicWord::get( 'tr_decompose' )->matchVariableStartToEnd( $lines[0] ) ) {
 			$flags |= self::DECOMPOSE;
 			array_shift( $lines );
 		}

 		// Check for DoS
 		if ( count( $lines ) > $wgTransliteratorRuleCount ) {
 			return wfMessage( 'transliterator-error-rulecount', $wgTransliteratorRuleCount, $mappage )->escaped();
 		}

 		foreach ( $lines as $line ) {
 			$sides = preg_split( '/\s*=>\s*/u', $line );

 			$wellFormed = count( $sides ) == 2 && $sides[0] !== '';
 			if ( !$wellFormed ) {
 				return wfMessage( 'transliterator-error-syntax', $line, $mappage )->escaped();
 			}

 			if ( strlen( $sides[0] ) > $wgTransliteratorRuleSize ) {
 				return wfMessage( 'transliterator-error-rulesize', $line, $mappage,  $wgTransliteratorRuleSize )->escaped();
 			}

 			if ( !self::addToRules( $sides[0], $sides[1], $flags, $rules, $attrs ) ) {
 				return wfMessage( 'transliterator-error-ambiguous', $line, $mappage )->escaped();
 			}
 		}

 		return self::postProcessMap( $rules, $attrs, $flags );
 	}

 	/**
 	 * Fix problems created by readMap.
 	 *
 	 * 1. Long auto-generated case rules override case-specific rules.
 	 *   Delete the auto-generated rules.
 	 *
 	 * 2. The ^ operator overrides length ($ is ok though).
 	 *   Insert extra ^ rules with each needed length.
 	 *
 	 * 3. All the bookends we have inserted are still there
 	 *   Add rules to remove them.
 	 *
 	 * The rule table is finally wrapped in a ReplacementArray.
 	 *
 	 * @param $rules  Array( from => to )
 	 * @param $attrs  Array( from => self::PREFIXED | self::UPCASED )
 	 * @param $flags  May contain self::DECOMPOSE
 	 * @return Array( "rules" => ReplacementArray built from $rules, "flags" => $flags )
 	 */
 	static function postProcessMap( $rules, $attrs, $flags ) {
 		// $attrs is sorted into binary order, so start-based substrings of longer rules
 		// immediately precede them. Don't need to know anything else about the order.
 		ksort( $attrs );
 		// State carried from one sorted key to the next:
 		$wasPrefixed = false;    // are we inside a run of keys that start with a ^ed rule?
 		$naturalCased = false;   // are we inside a run of keys that start with a non-auto-upcased rule?
 		$naturalCasedFrom = '';  // the key that opened the current natural-cased run
 		$prefixedFrom = '';      // the key that opened the current ^ed run
 		foreach ( $attrs as $from => $attr ) {

 			// If the current rule has been auto-upcased, but a prefix of this rule wasn't,
 			// remove the auto-upcased rule as the specified upper-case takes priority.
 			if ( $attr & self::UPCASED ) {

 				if ( $naturalCased ) {
 					if ( strpos( $from, $naturalCasedFrom ) === 0 ) {

 						// Drop both the plain and the ^ed form, then skip the ^ handling below.
 						unset( $rules[$from] );
 						unset( $rules[self::WORD_START . $from] );
 						continue;

 					} else {
 						// This key no longer starts with the remembered rule: the run is over.
 						$naturalCased = false;
 					}
 				}

 			} elseif ( !$naturalCased ) {

 				// First non-auto-upcased rule outside a run: it opens a new run.
 				$naturalCased = true;
 				$naturalCasedFrom = $from;

 			}

 			// When it finds a ^ed rule, it duplicates all rules that start with that rule
 			// which has the effect of promoting length to override ^.
 			// If the length is actually the same, ^x needs to maintain priority over x$
 			if ( $attr & self::PREFIXED ) {

 				// Open a new ^ed run unless this key merely extends the current one.
 				if ( !$wasPrefixed || strpos( $from, $prefixedFrom ) !== 0 ) {
 					$wasPrefixed = true;
 					$prefixedFrom = $from;
 				}

 			} elseif ( $wasPrefixed ) {

 				if ( strpos( $from, $prefixedFrom ) === 0 ) {
 					// Skip the "x$" sibling of "^x" so that ^x keeps priority over it.
 					if ( $from !== $prefixedFrom . self::WORD_END ) {
 						$rules[self::WORD_START . $from] = $rules[$from];
 					}
 				} else {
 					$wasPrefixed = false;
 				}

 			}
 		}

 		// Any marker that survives the real rules is replaced by nothing.
 		$rules[self::LETTER_END] = '';
 		$rules[self::WORD_END] = '';
 		$rules[self::WORD_START] = '';

 		$rules = new ReplacementArray( $rules );

 		return array( 'rules' => $rules, 'flags' => $flags );
 	}

 	/**
 	 * Apply a parsed map to a word.
 	 *
 	 * @param $word  raw user input
 	 * @param $map  as returned by getMap(); its 'flags' and 'rules' entries are used
 	 * @return String the transliteration, normalised to NFC
 	 */
 	static function transliterate( $word, $map ) {
 		// Insert the letter separators and word-boundary markers the rules rely on.
 		$marked = self::forTransliteration( $word, $map['flags'] );

 		// Run the replacement, then restore MediaWiki's NFC invariant.
 		return Validator::toNFC( $map['rules']->replace( $marked ) );
 	}

 	/**
 	 * Wrap a message in the span used for Transliterator errors.
 	 * The message is inserted as-is, without any escaping.
 	 *
 	 * @param $msg String (HTML)
 	 * @return String (HTML)
 	 */
 	static function wrapError( $msg ) {
 		return '<span class="transliterator error">' . $msg . '</span>';
 	}

 	/**
 	 * Get the prefix that map pages carry in the MediaWiki namespace.
 	 *
 	 * The value is the first synonym of the tr_prefix magic word; when that is
 	 * shorter than three bytes or not a valid title, "Transliterator:" is used.
 	 * The result is remembered for the rest of the request.
 	 *
 	 * @return String
 	 */
 	static function getMapPagePrefix () {
 		static $prefix = null;

 		if ( $prefix ) {
 			return $prefix;
 		}

 		$candidate = MagicWord::get( 'tr_prefix' )->getSynonym( 0 );

 		// If the prefix is too short then parts of the MediaWiki namespace are
 		// rendered un-editable. If it is not a valid title, then it is very broken.
 		$usable = strlen( $candidate ) >= 3 && Title::newFromText( $candidate, NS_MEDIAWIKI );
 		if ( !$usable ) {
 			wfDebug( "Invalid Transliterator prefix, must be a valid title longer than three characters, falling back to Transliterator:" );
 			$candidate = "Transliterator:";
 		}

 		return $prefix = $candidate;
 	}

 	/**
 	 * Tell whether a title is a Transliterator map: it must be in the MediaWiki
 	 * namespace and its text must begin with the map page prefix.
 	 *
 	 * @param $title Title
 	 * @return Boolean
 	 */
 	static function isMapPage( &$title ) {
 		return $title->getNamespace() == NS_MEDIAWIKI
 			&& strpos( $title->getText(), self::getMapPagePrefix() ) === 0;
 	}

 	/**
 	 * Hook handler (ArticlePurge, ArticleDeleteComplete): drop the page's
 	 * entries from the Transliterator caches.
 	 *
 	 * @param WikiPage $wikiPage the page being purged or deleted
 	 * @return bool whatever purgeTitle() returns
 	 */
 	static function purgeArticle( WikiPage &$wikiPage ) {
 		// purgeTitle() takes its argument by reference, so it needs a variable.
 		$pageTitle = $wikiPage->getTitle();

 		return self::purgeTitle( $pageTitle );
 	}

 	/**
 	 * Hook handler (NewRevisionFromEditComplete): drop the edited page's
 	 * entries from the Transliterator caches.
 	 *
 	 * @param WikiPage $wikiPage the page that received a new revision
 	 * @return bool whatever purgeTitle() returns
 	 */
 	static function purgeArticleNewRevision( WikiPage $wikiPage ) {
 		// purgeTitle() takes its argument by reference, so it needs a variable.
 		$pageTitle = $wikiPage->getTitle();

 		return self::purgeTitle( $pageTitle );
 	}

 	/**
 	 * Remove both ends of a page move from the Transliterator caches.
 	 * (TitleMoveComplete hook)
 	 *
 	 * The old title has to be purged as well: when a map page is moved away
 	 * from the map prefix, the new title is not a map page, and without this
 	 * the parsed map and the list of map names would stay cached for the old title.
 	 *
 	 * @param $title Title the page was moved from
 	 * @param $newtitle Title the page was moved to
 	 * @return bool whatever purgeTitle() returns for the new title
 	 */
 	static function purgeNewTitle( &$title, &$newtitle ) {
 		self::purgeTitle( $title );
 		return self::purgeTitle( $newtitle );
 	}

 	/**
 	 * Forget everything cached for a map page: its parsed map and the shared
 	 * list of map names. Titles that are not map pages are left alone.
 	 * (ArticleUndelete hook)
 	 *
 	 * @param $title Title
 	 * @return bool always true
 	 */
 	static function purgeTitle( &$title ) {
 		global $wgMemc;

 		if ( !self::isMapPage( $title ) ) {
 			return true;
 		}

 		// The page's own entry first, then the list of existing map names.
 		foreach ( array( $title->getDBkey(), '__map_names__' ) as $item ) {
 			$wgMemc->delete( wfMemcKey( self::CACHE_PREFIX, $item ) );
 		}

 		return true;
 	}

 	/**
 	 * Show any errors that would be caused by trying to use this map.
 	 *
 	 * Does not follow redirects.
 	 *
 	 * (EditFilter hook)
 	 *
 	 * @param $editPage EditPage
 	 * @param $text String the text about to be saved
 	 * @param $section unused
 	 * @param $hookError String set to an HTML error span when the map does not parse
 	 * @return bool always true
 	 */
 	static function validate( $editPage, $text, $section, &$hookError ) {
 		// Use the public accessor rather than reaching into EditPage's mTitle property.
 		$title = $editPage->getTitle();
 		if ( self::isMapPage( $title ) ) {
 			$map = self::readMap( $text, $title->getDBkey() );
 			// readMap() returns a string only for parse errors.
 			if ( is_string( $map ) ) {
 				$hookError = self::wrapError( $map );
 			}
 		}
 		return true;
 	}

 	/**
 	 * Prepend any error message caused by parsing the text for preview.
 	 * (EditPageGetPreviewText hook)
 	 *
 	 * @param $editPage EditPage
 	 * @param $content Content replaced by "error, ----, original text" when the map is invalid
 	 * @return bool always true
 	 */
 	static function preview( $editPage, &$content ) {
 		$hookError = '';
 		$text = ContentHandler::getContentText( $content );

 		self::validate( $editPage, $text, null, $hookError );

 		if ( $hookError ) {
 			// makeContent() needs either a Title or a content model ID to pick the
 			// content handler; calling it with the text alone raises an exception.
 			$content = ContentHandler::makeContent(
 				$hookError . "\n----\n" . $text,
 				$editPage->getTitle()
 			);
 		}
 		return true;
 	}

 	/**
 	 * Register the 'transliterate' parser function, backed by a fresh
 	 * ExtTransliterator instance.
 	 * (ParserFirstCallInit hook)
 	 *
 	 * @param $parser Parser
 	 * @return bool always true
 	 */
 	static function setup( &$parser ) {
 		$parser->setFunctionHook( 'transliterate', array( new ExtTransliterator, 'render' ) );

 		return true;
 	}
 }
	<?php

	if ( !defined( 'MEDIAWIKI' ) ) {
	die( 'This file is a MediaWiki extension, not a valid entry point.' );
	}

	/**
	* This provides a {{#transliterate:map\|word}} parser function that
	*
	* 1. Finds out which transliteration maps exist
	* Transliteration maps are pages at [[MediaWiki:Transliterator:map]]
	* This query is cached.
	* 2. Loads the map if it exists, and parses it into an array suitable for use with strtr()
	* This array is cached.
	* 3. Applies the transliteration map to the word, case-insensitively, but respecting either NFD
	* or NFC and combining characters, and word start-and-end markers.
	*
	* It also provides syntax checking for the transliteration pages, both on save and preview.
	* Perhaps in the future it will provide an API interface to assist javascript transliteration.
	*
	* More detailed user-documentation is at http://mediawiki.org/wiki/Extension:Transliterator
	*
	* Design decisions:
	* As there are an unlimited number of transliteration schemes, and which to use depends mainly
	* on personal preference, it is too inflexible to provide the schemes along with the extension,
	* though it may be nice to provide some default standardised ones in the future. Perhaps this
	* would also be a way to support some more languages, but most languages that can be transliterated
	* automatically can be done using this scheme.
	*
	* The maps are discovered in one query to deal with the expected use-case on en.wiktionary which
	* is the translation template, i.e. {{#transliterate}} will be called with an invalid map name
	* much more often than not.
	*
	* The need to handle NFD is illustrated best by Korean which has a tractable map in NFD but
	* would require thousands of NFC rules. Word start and end markers are required by Greek and other
	* languages that treat initial and final letters separately. Code-points are combined because of the
	* mess that letting stray combining characters through on their own can cause, and the confusion
	* that this causes.
	*
	* Most methods are static, with the exception of those methods that must interact (at some level)
	* with the runtime cache of maps, everything else is stateless.
	*/

	use UtfNormal\Validator;

	class ExtTransliterator {
	// These characters have been chosen because they are forbidden by MediaWiki, have no special
	// regex meaning, are not unicode letters, and take up only one byte.
	const WORD_START = "\x1F"; // Marker inserted in front of the first letter of a word, so a rule's ^ can match it
	const WORD_END = "\x1E"; // Marker inserted behind the last letter of a word, so a rule's $ can match it
	const LETTER_END = "\x1D"; // A character added after each character as a separator

	// The prefix to use for cache items (the number should be incremented when the map format changes)
	const CACHE_PREFIX = "extTransliterator.4";

	// flags for preprocessor (bit flags, combined with |)
	const DECOMPOSE = 1;
	const IGNORE_ENDINGS = 2;

	// attribute flags for postprocessor (bit flags, combined with |)
	const PREFIXED = 1;
	const UPCASED = 2;

	public $mPages = null; // null until loaded, then an Array of map page DB key (prefix included) => page_id
	public $mMaps = array(); // An Array of map page DB key => result of getMap() for that page (false, error String or map)


	/**
	 * Parser-function callback for {{#transliterate: $mapname | $word | $format? | $answer? }}.
	 *
	 * @param $parser Parser whose output records the dependency on the map page
	 * @param $mapname String name of the transliteration map (without the map page prefix)
	 * @param $word String text to transliterate
	 * @param $format String pattern in which $1 is replaced by the result; blank is treated as '$1'
	 * @param $answer String optional hand-written transliteration that overrides the automatic one
	 * @return String the formatted result, an error span, or '' when there is no such map
	 */
	function render( $parser, $mapname = '', $word = '', $format = '$1', $answer = '' ) {
		// Handle the case when people use {{#transliterate:<>|<>||<>}}
		if ( trim( $format ) === '' ) {
			$format = '$1';
		}

		// If we have been given the answer, return it straight-away
		if ( trim( $answer ) !== '' ) {
			return str_replace( '$1', $answer, $format );
		}

		// Check that the map is a valid title, if not we're in a template {{#transliterate:{{{1}}}...}}
		$title = Title::newFromText( self::getMapPagePrefix() . $mapname, NS_MEDIAWIKI );
		if ( !$title ) {
			$function = MagicWord::get( 'transliterate' )->getSynonym( 0 );
			// The argument separator is a bare pipe; an escaped "\|" would put a
			// literal backslash into the echoed wikitext.
			return str_replace( '$1', "{{#$function:$mapname|$word}}", $format );
		}

		$mappage = $title->getDBkey();
		$map = $this->getMap( $mappage );

		if ( !$map ) {
			// False if map was not found
			$output = '';
		} elseif ( is_string( $map ) ) {
			// An error message (these should have been caught by ::validate)
			$output = self::wrapError( $map );
		} else {
			// Success!, now do the transliteration
			$output = str_replace( '$1', self::transliterate( $word, $map ), $format );
		}

		// Populate the dependency table so that we get re-rendered if the map changes.
		// TODO: It would be nice if we could make this invisible to the user.
		if ( isset( $this->mPages[$mappage] ) ) {
			$parser->mOutput->addTemplate( $title, $this->mPages[$mappage], null );
		} else {
			$parser->mOutput->addTemplate( $title, $title->getArticleID(), null );
		}

		return $output;
	}

	/**
	 * Get all the existing maps in one query.
	 *
	 * Looks in the per-instance cache first, then in memcached, and only then
	 * queries the page table for titles in the MediaWiki namespace that start
	 * with the map page prefix. The result is stored in both caches.
	 *
	 * @return Array( title => id );
	 */
	function getExistingMapNames() {
		global $wgMemc;

		// Has it been used on this page already?
		if ( $this->mPages !== null ) {
			return $this->mPages;
		}

		$cacheKey = wfMemcKey( self::CACHE_PREFIX, "__map_names__" );

		// Has it been used since it was last updated?
		// An empty array is a perfectly valid cached answer ("no maps exist"), so test
		// the type instead of truthiness; otherwise a wiki without maps hits the
		// database on every parse.
		$cached = $wgMemc->get( $cacheKey );
		if ( is_array( $cached ) ) {
			return $this->mPages = $cached;
		}

		$dbr = wfGetDB( DB_SLAVE );
		// TODO: This could potentially cause problems if someone creates a few thousand
		// pages with the prefix. The prefix is guaranteed to be a few letters, so this
		// won't get the whole MediaWiki namespace.
		// The result of this query is memcached until someone edits a map.
		$res = $dbr->select( 'page',
			array( 'page_title', 'page_id' ),
			array(
				'page_namespace' => NS_MEDIAWIKI,
				'page_title ' . $dbr->buildLike( self::getMapPagePrefix(), $dbr->anyString() )
			),
			__METHOD__
		);

		$this->mPages = array();

		while ( $r = $res->fetchObject() ) {
			$this->mPages[$r->page_title] = $r->page_id;
		}

		$wgMemc->set( $cacheKey, $this->mPages );
		return $this->mPages;
	}

	/**
	 * Fetch the parsed form of a map page.
	 *
	 * Lookup order: per-instance cache, list of existing map pages, memcached,
	 * and finally the page text itself (parsed with readMap()). Whatever is found,
	 * including false, is remembered in $this->mMaps.
	 *
	 * @param $mappage String including MapPagePrefix
	 * @return false (no map) || String (parse error) || Map (success)
	 */
	function getMap( $mappage ) {
		global $wgMemc;

		if ( isset( $this->mMaps[$mappage] ) ) {
			return $this->mMaps[$mappage];
		}

		// Pages that do not exist never reach memcached or the message cache.
		$known = $this->getExistingMapNames();
		if ( !isset( $known[$mappage] ) ) {
			return $this->mMaps[$mappage] = false;
		}

		$cacheKey = wfMemcKey( self::CACHE_PREFIX, $mappage );
		$parsed = $wgMemc->get( $cacheKey );
		if ( $parsed ) {
			return $this->mMaps[$mappage] = $parsed;
		}

		$message = wfMessage( $mappage );
		if ( !$message->isDisabled() ) {
			$parsed = self::readMap( $message->text(), $mappage );
		}

		// Only truthy results (a map or an error string) are worth sharing.
		if ( $parsed ) {
			$wgMemc->set( $cacheKey, $parsed );
		}

		return $this->mMaps[$mappage] = $parsed;
	}

	/**
	 * Normalise the text so it can be used with strtr() safely
	 *
	 * 1. decodeCharReferences
	 * 2. split into NFD codepoints or NFC fully combined, each followed by LETTER_END
	 * 3. add bookends (WORD_END, WORD_START) on word boundaries
	 *
	 * @param $word String from user input
	 * @param $flags may include self::DECOMPOSE, self::IGNORE_ENDINGS
	 * @return String
	 */
	static function forTransliteration( $word, $flags ) {
		static $regexes = null;

		// NOTE: this is very slightly inconsistent with MediaWiki if an NFD code-point
		// has been HTML escaped it will be converted to NFC if it passes through
		// transliteration unchanged, I think that's a WONTFIX though.
		$word = Sanitizer::decodeCharReferences( $word );

		if ( $flags & self::DECOMPOSE ) {
			$word = Validator::toNFD( $word );
			$word = preg_replace( '/./u', '$0' . self::LETTER_END, $word );
		} else {
			$word = preg_replace( '/\X/u', '$0' . self::LETTER_END, $word );
		}

		if ( !$regexes ) {
			// A "letter" is a unicode letter followed by some combining characters
			// A "non-letter" is any other character followed by some combining characters
			// "end" is done first so it watches out for word-endings in "start"
			// If it should treat endings then the start and end of the string are non-letters
			// Otherwise it does not touch the start or end of the string, only internal transitions
			$combining = '(?:[\pM]*' . self::LETTER_END . ')';
			$nonletter = '[^\pL' . self::LETTER_END . self::WORD_END . '\pM]';
			// The "|" below is regex alternation and must stay unescaped: written as "\|"
			// it would match a literal pipe and the string start/end would never count
			// as a word boundary.
			$regexes = array (
				'endings' => array (
					'start' => "/(^$combining?|$nonletter$combining)([\pL])/u",
					'end' => "/([\pL]$combining)([^\pL]|$)/u",
				),
				'ignore-endings' => array (
					'start' => "/($nonletter$combining)([\pL])/u",
					'end' => "/([\pL]$combining)([^\pL])/u",
				),
			);
		}

		$regex = $regexes[$flags & self::IGNORE_ENDINGS ? 'ignore-endings' : 'endings'];
		$word = preg_replace( $regex['end'], '$1' . self::WORD_END . '$2', $word );
		$word = preg_replace( $regex['start'], '$1' . self::WORD_START . '$2', $word );

		return $word;
	}

	/**
	 * Update the current rule-set from a rule
	 *
	 * The left-hand side is normalised with forTransliteration(), a leading ^ or
	 * trailing $ is converted into a WORD_START prefix / WORD_END suffix, and an
	 * automatically upper-cased variant is added unless one is already present.
	 *
	 * @param $from String, the left-hand-side of a rule (must not be empty)
	 * @param $to String, the right-hand-side of a rule
	 * @param $flags Flags, flags from the top of the map
	 * @param $rules Array, ^from$ -> to (for strtr())
	 * @param $attrs Array, from$ -> Flags (for post processing)
	 * @return Bool true on success, false if an ambiguous rule
	 */
	static function addToRules( $from, $to, $flags, &$rules, &$attrs ) {
		global $wgLang;

		$prefix = $suffix = '';

		// forTransliteration() may decode a deliberately escaped ^ or $.
		// in order to find accurate word boundaries. So we check here if
		// this occurs, and work around it.
		$noprefix = $nosuffix = false;
		if ( $from[0] === '&' || $from[strlen( $from ) - 1] === ';' ) {
			$decoded = Sanitizer::decodeCharReferences( $from );
			$noprefix = $decoded[0] === '^';
			$nosuffix = $decoded[strlen( $decoded ) - 1] === '$';
		}

		$from = self::forTransliteration( $from, $flags | self::IGNORE_ENDINGS );

		if ( !$noprefix ) {
			$from = preg_replace( '/^[\^][' . self::LETTER_END . '][' . self::WORD_START . ']/u', '', $from, 1, $count );
			if ( $count ) {
				$prefix = self::WORD_START;
			}
		}
		if ( !$nosuffix ) {
			$from = preg_replace( '/[' . self::WORD_END . '][$][' . self::LETTER_END . ']$/u', '', $from, 1, $count );
			if ( $count ) {
				$suffix = self::WORD_END;
			}
		}

		$ruleKey = $prefix . $from . $suffix;
		// Attributes are stored per "from$" (suffix included, prefix excluded), so the
		// lookup must use the same key; looking up the bare $from consulted the wrong
		// entry (or none at all) for every $-suffixed rule.
		$attrKey = $from . $suffix;

		// Check that this rule isn't ambiguous: an existing rule may only be replaced
		// when it was generated automatically by the upper-casing below.
		$autoUpcased = isset( $attrs[$attrKey] ) && ( $attrs[$attrKey] & self::UPCASED );
		if ( isset( $rules[$ruleKey] ) && !$autoUpcased ) {
			return false;
		}

		$rules[$ruleKey] = $to;
		$attrs[$attrKey] = $prefix ? self::PREFIXED : 0;

		// Now case-insensitivity
		$casefrom = $wgLang->ucfirst( $from );
		if ( $from !== $casefrom && !isset( $rules[$prefix . $casefrom . $suffix] ) ) {
			$rules[$prefix . $casefrom . $suffix] = $wgLang->ucfirst( $to );
			$attrs[$casefrom . $suffix] = ( $prefix ? self::PREFIXED : 0 ) | self::UPCASED;
		}

		return true;
	}

	/**
	 * Tell whether a line of a map page is worth parsing.
	 *
	 * Empty lines and lines whose first character is # (comments) are not.
	 *
	 * @param $line String, one line of the map page
	 * @return Boolean
	 */
	static function isUsefulLine( $line ) {
		if ( $line == '' ) {
			return false;
		}

		return $line[0] != '#';
	}

	/**
	 * Parse a map input syntax into a map.
	 *
	 * Input syntax is a set of lines.
	 * Whitespace around line breaks and around the => separator is ignored.
	 * Empty lines and lines starting with # are ignored, remaining lines are split by =>
	 * An optional first line matching the tr_decompose magic word sets the DECOMPOSE flag.
	 *
	 * @param $input String - the contents of the map page
	 * @param $mappage String - the title of the map page (without MediaWiki:)
	 * @return false (empty, or no useful content, treat as no map) ||
	 *         String (escaped error message, syntax error while parsing) ||
	 *         Array( 'rules' => ..., 'flags' => Flags ) as built by postProcessMap()
	 */
	static function readMap( $input, $mappage ) {
		global $wgTransliteratorRuleCount, $wgTransliteratorRuleSize;

		$rules = array(); // The actual rules to go into strtr()
		$attrs = array(); // A map of those rules that were automatically added
		$flags = 0;       // Flags associated with those rules

		// Split lines, remove blank lines and comments.
		// \s* (not a bare \s) so that lines split whether or not there is
		// whitespace next to the line break.
		$input = trim( $input );
		$lines = preg_split( '/\s*\n\s*/u', $input );
		$lines = array_filter( $lines, array( 'ExtTransliterator', 'isUsefulLine' ) );
		$lines = array_values( $lines );

		// Nothing left?
		if ( count( $lines ) == 0 ) {
			return false;
		}

		// Check for __DECOMPOSE__
		$decompose = MagicWord::get( 'tr_decompose' );
		if ( $decompose->matchVariableStartToEnd( $lines[0] ) ) {
			$flags = $flags | self::DECOMPOSE;
			array_shift( $lines );
		}

		// Check for DoS
		if ( count( $lines ) > $wgTransliteratorRuleCount ) {
			return wfMessage( 'transliterator-error-rulecount', $wgTransliteratorRuleCount, $mappage )->escaped();
		}

		foreach ( $lines as $line ) {
			// Whitespace around the separator is optional: "a=>b" and "a => b" are the same rule
			$pair = preg_split( '/\s*=>\s*/u', $line );

			// Exactly one separator and a non-empty left-hand side are required
			if ( count( $pair ) != 2 || $pair[0] === '' ) {
				return wfMessage( 'transliterator-error-syntax', $line, $mappage )->escaped();
			}

			// Limit on the byte length of the left-hand side
			if ( strlen( $pair[0] ) > $wgTransliteratorRuleSize ) {
				return wfMessage( 'transliterator-error-rulesize', $line, $mappage, $wgTransliteratorRuleSize )->escaped();
			}

			if ( !self::addToRules( $pair[0], $pair[1], $flags, $rules, $attrs ) ) {
				return wfMessage( 'transliterator-error-ambiguous', $line, $mappage )->escaped();
			}
		}

		return self::postProcessMap( $rules, $attrs, $flags );
	}

	/**
	 * Fix problems created by readMap.
	 *
	 * 1. Long auto-generated case rules override case-specific rules.
	 *    Delete the auto-generated rules.
	 *
	 * 2. The ^ operator overrides length ($ is ok though).
	 *    Insert extra ^ rules with each needed length.
	 *
	 * 3. All the bookends we have inserted are still there.
	 *    Add rules to remove them.
	 *
	 * @param $rules Array( from => to )
	 * @param $attrs Array( from => self::PREFIXED | self::UPCASED )
	 * @param $flags May contain self::DECOMPOSE
	 * @return Array( "rules" => ReplacementArray, "flags" => $flags )
	 */
	static function postProcessMap( $rules, $attrs, $flags ) {
		// $attrs is sorted into binary order, so start-based substrings of longer rules
		// immediately precede them. Don't need to know anything else about the order.
		// SORT_STRING is required for that: the default SORT_REGULAR compares
		// numeric-looking keys as numbers ("1", "2", "10" instead of "1", "10", "2").
		ksort( $attrs, SORT_STRING );
		$wasPrefixed = false;
		$naturalCased = false;
		$naturalCasedFrom = '';
		$prefixedFrom = '';
		foreach ( $attrs as $from => $attr ) {
			// PHP stores numeric-looking string keys as integers; the prefix
			// comparisons below need a real string.
			$from = (string)$from;

			// If the current rule has been auto-upcased, but a prefix of this rule wasn't,
			// remove the auto-upcased rule as the specified upper-case takes priority.
			if ( $attr & self::UPCASED ) {

				if ( $naturalCased ) {
					if ( strpos( $from, $naturalCasedFrom ) === 0 ) {

						unset( $rules[$from] );
						unset( $rules[self::WORD_START . $from] );
						continue;

					} else {
						$naturalCased = false;
					}
				}

			} elseif ( !$naturalCased || strpos( $from, $naturalCasedFrom ) !== 0 ) {

				// Start tracking this explicitly-cased rule. Also re-anchor when the
				// tracked rule is not a prefix of this one: otherwise an unrelated
				// earlier rule (e.g. "A") would hide this one (e.g. "B") from the
				// auto-upcased rules that follow it (e.g. "Bc").
				$naturalCased = true;
				$naturalCasedFrom = $from;

			}

			// When it finds a ^ed rule, it duplicates all rules that start with that rule
			// which has the effect of promoting length to override ^.
			// If the length is actually the same, ^x needs to maintain priority over x$
			if ( $attr & self::PREFIXED ) {

				if ( !$wasPrefixed || strpos( $from, $prefixedFrom ) !== 0 ) {
					$wasPrefixed = true;
					$prefixedFrom = $from;
				}

			} elseif ( $wasPrefixed ) {

				if ( strpos( $from, $prefixedFrom ) === 0 ) {
					if ( $from !== $prefixedFrom . self::WORD_END ) {
						$rules[self::WORD_START . $from] = $rules[$from];
					}
				} else {
					$wasPrefixed = false;
				}

			}
		}

		// Rules that strip whatever markers are left over after all other replacements
		$rules[self::LETTER_END] = '';
		$rules[self::WORD_END] = '';
		$rules[self::WORD_START] = '';

		$rules = new ReplacementArray( $rules );

		return array( 'rules' => $rules, 'flags' => $flags );
	}

	/**
	 * Apply a map's rules to a word.
	 *
	 * @param $word raw user input
	 * @param $map as returned by getMap(); its 'flags' and 'rules' entries are used
	 * @return String ready to output
	 */
	static function transliterate( $word, $map ) {
		// Prepare the word with the markers the rules expect
		$marked = self::forTransliteration( $word, $map['flags'] );

		// Run the replacement rules over the prepared word
		$replaced = $map['rules']->replace( $marked );

		// Hand the result back in NFC, as MediaWiki expects
		return Validator::toNFC( $replaced );
	}

	/**
	 * Wrap a message in a span carrying the "transliterator error" classes.
	 *
	 * The message is inserted as-is, so it must already be safe HTML.
	 *
	 * @param $msg String (HTML)
	 * @return String (HTML)
	 */
	static function wrapError( $msg ) {
		return '<span class="transliterator error">' . $msg . '</span>';
	}

	/**
	 * Get the prefix to use for map pages in the MediaWiki namespace.
	 *
	 * The prefix is the first synonym of the tr_prefix magic word. It is computed
	 * once per request and then served from a static variable. If it is shorter
	 * than three bytes or does not form a valid title, "Transliterator:" is used.
	 *
	 * @return String
	 */
	static function getMapPagePrefix () {
		static $prefix = null;
		if ( $prefix === null ) {
			$prefix = MagicWord::get( 'tr_prefix' )->getSynonym( 0 );
			// If the prefix is too short then parts of the MediaWiki namespace are
			// rendered un-editable. If it is not a valid title, then it is very broken.
			if ( strlen( $prefix ) < 3 || !Title::newFromText( $prefix, NS_MEDIAWIKI ) ) {
				wfDebug( "Invalid Transliterator prefix, must be a valid title of at least three characters, falling back to Transliterator:" );
				$prefix = "Transliterator:";
			}
		}
		return $prefix;
	}

	/**
	 * Check whether a title denotes a Transliterator map page, i.e. it lives in
	 * the MediaWiki namespace and its text starts with the map page prefix.
	 *
	 * @param $title Title
	 * @return Boolean
	 */
	static function isMapPage( &$title ) {
		return $title->getNamespace() == NS_MEDIAWIKI
			&& strpos( $title->getText(), self::getMapPagePrefix() ) === 0;
	}

	/**
	 * Drop the page's entries from the Transliterator caches.
	 * (ArticlePurge, ArticleDeleteComplete)
	 *
	 * @param WikiPage $wikiPage
	 * @return bool result of purgeTitle()
	 */
	static function purgeArticle( WikiPage &$wikiPage ) {
		// purgeTitle() takes its argument by reference, so a variable is needed
		$pageTitle = $wikiPage->getTitle();

		return self::purgeTitle( $pageTitle );
	}

	/**
	 * Drop the page's entries from the Transliterator caches.
	 * (NewRevisionFromEditComplete)
	 *
	 * @param WikiPage $wikiPage
	 * @return bool result of purgeTitle()
	 */
	static function purgeArticleNewRevision( WikiPage $wikiPage ) {
		// purgeTitle() takes its argument by reference, so a variable is needed
		$pageTitle = $wikiPage->getTitle();

		return self::purgeTitle( $pageTitle );
	}

	/**
	 * Drop the move target's entries from the Transliterator caches.
	 * (TitleMoveComplete hook)
	 *
	 * @param $title Title, not used here
	 * @param $newtitle Title, the title that is purged
	 * @return bool result of purgeTitle()
	 */
	static function purgeNewTitle( &$title, &$newtitle ) {
		$purged = self::purgeTitle( $newtitle );

		return $purged;
	}

	/**
	 * Drop the title's entries from the Transliterator caches.
	 * (ArticleUndelete hook)
	 *
	 * For map pages, both the entry keyed by the title's DB key and the
	 * '__map_names__' entry are deleted. Other titles are left alone.
	 *
	 * @param $title Title
	 * @return bool always true
	 */
	static function purgeTitle( &$title ) {
		global $wgMemc;

		if ( !self::isMapPage( $title ) ) {
			return true;
		}

		$cacheNames = array( $title->getDBkey(), '__map_names__' );
		foreach ( $cacheNames as $cacheName ) {
			$wgMemc->delete( wfMemcKey( self::CACHE_PREFIX, $cacheName ) );
		}

		return true;
	}

	/**
	 * Show any errors that would be caused by trying to use this map.
	 *
	 * Does not follow redirects.
	 *
	 * (EditFilter hook)
	 *
	 * @param $editPage EditPage
	 * @param $text String, the text to check
	 * @param $section not used here
	 * @param $hookError String (HTML), set to a wrapped error message when the
	 *        page is a map page and readMap() reports an error; untouched otherwise
	 * @return bool always true
	 */
	static function validate( $editPage, $text, $section, &$hookError ) {
		// Use the public accessor rather than reaching into EditPage::$mTitle.
		// isMapPage() takes its argument by reference, so keep it in a variable.
		$title = $editPage->getTitle();
		if ( self::isMapPage( $title ) ) {
			$map = self::readMap( $text, $title->getDBkey() );
			// readMap() returns a string only to report an error
			if ( is_string( $map ) ) {
				$hookError = self::wrapError( $map );
			}
		}
		return true;
	}

	/**
	 * Prepend any error message caused by parsing the text for preview.
	 * (EditPageGetPreviewText hook)
	 *
	 * @param $editPage EditPage
	 * @param $content Content, replaced by "error, ----, original text" when
	 *        validate() reports an error
	 * @return bool always true
	 */
	static function preview( $editPage, &$content ) {
		$text = ContentHandler::getContentText( $content );
		$hookError = null;
		self::validate( $editPage, $text, null, $hookError );

		if ( $hookError ) {
			// makeContent() needs a Title (or a model ID) to pick the content
			// model, so pass the title of the page being edited.
			$content = ContentHandler::makeContent(
				$hookError . "\n----\n" . $text,
				$editPage->getTitle()
			);
		}
		return true;
	}

	/**
	 * Register the 'transliterate' parser function, handled by the render()
	 * method of a new ExtTransliterator instance.
	 * (ParserFirstCallInit hook)
	 *
	 * @param $parser Parser
	 * @return bool always true
	 */
	static function setup( &$parser ) {
		$instance = new ExtTransliterator;
		$callback = array( $instance, 'render' );
		$parser->setFunctionHook( 'transliterate', $callback );

		return true;
	}
	}