Location: PHPKode > projects > HuMo-gen > admin/include/ansel2unicode/ansel2unicode.php
<?php

/**
* A class that converts strings in ANSEL character set into Unicode (UTF-8).
*
* ANSEL is known as the American Library Association character set, and is until now
* (2009) still the main character set used in GEDCOM files, the standard format for
* exchanging genealogical data.
*
* The conversion is based on the work of Heiner Eichmann documented at his web page:
*      http://www.heiner-eichmann.de/gedcom/charintr.htm
* The conversion uses the mapping file that can be downloaded from here:
*      http://www.heiner-eichmann.de/gedcom/ans2uni.con.zip
*
* You are free to use this code and edit it to suit your needs, but let my parts of the
* commenting be intact. It is not legal to sell this piece of software.
* This class was published on my blog at http://www.gammelsaeter.com/
*
* @author Pål Gjerde Gammelsæter
* @license Free to use, illegal to sell.
* @version 2nd October 2009
*/
class Ansel2Unicode {

		/**
		* ANSEL to Unicode lookup tables, grouped by the number of ANSEL bytes in the key.
		*
		* $_mapping[1] holds single byte keys ("a1"), $_mapping[2] two byte keys ("e1+61")
		* and $_mapping[3] three byte keys. Keys are lower case hexadecimal byte values
		* joined with '+', values are passed to hexdec() when a key matches.
		* The three groups always exist, so lookups never touch an undefined index.
		*/
		private $_mapping = array(1 => array(), 2 => array(), 3 => array());

		/**
		* Constructor of the object.
		* Generates the table for ANSEL to Unicode mapping.
		*
		* The table is read from the ini file 'mappings.ini' (path relative to the current
		* working directory). The original mapping file made by Heiner Eichmann cannot be
		* parsed by parse_ini_file(), so an ini version of it has to be present already.
		*
		* @param String $conversionFile  The name of the mapping file made by Heiner Eichmann.
		*                                 The file can be downloaded from this URL:
		*                                 http://www.heiner-eichmann.de/gedcom/ans2uni.con.zip
		*                                 Kept for backward compatibility only: this
		*                                 constructor does not read it.
		*/
		public function __construct($conversionFile = 'include/ansel2unicode/ans2uni.con') {
				$temp_ini_file = 'mappings.ini';    // Name of the ini file holding the mappings

				// parse_ini_file() returns false on failure; never feed that into foreach.
				$map = is_readable($temp_ini_file) ? parse_ini_file($temp_ini_file) : false;
				if (!is_array($map)) {
						trigger_error(
								'Ansel2Unicode: mapping file "'.$temp_ini_file.'" could not be read, '.
								'ANSEL characters will not be mapped.',
								E_USER_WARNING
						);
						return;
				}

				// Sort every mapping into the group that matches the number of bytes in its key,
				// so that "e1+61" ends up in $this->_mapping[2] etc.
				foreach ($map as $key => $value) {
						$key = strtolower((string) $key);
						$num_chars = count(explode('+', $key));       // Bytes are separated by '+'
						$this->_mapping[$num_chars][$key] = $value;
				}
		}


		/**
		* Converts the given string to Unicode/UTF-8.
		*
		* If the string contains at least one byte in the range chr(224) till chr(255)
		* (the range this class treats as ANSEL diacritics) the string is walked from left
		* to right and at every position the longest matching key (three, two, then one
		* byte) is replaced by the character from the mapping table. Bytes without a
		* mapping are kept as the character with the same code point.
		*
		* If the string contains no such byte the mapping table is not used at all and
		* every byte is converted to UTF-8 as if it was an ISO-8859-1 character.
		*
		* @param String $string    String to convert to Unicode (UTF-8)
		* @return String
		*/
		public function convert($string) {
				$string = (string) $string;

				// A diacritic on the very first byte counts too, so test the whole string at
				// once instead of comparing strpos() results loosely against false.
				if (!preg_match('/[\xE0-\xFF]/', $string)) {
						return $this->latin1ToUtf8($string);    // No diacritic, no ANSEL conversion
				}

				$output = '';                                   // String to output
				$length = strlen($string);
				$i      = 0;                                    // Current byte position

				while ($i < $length) {
						$matched = false;

						// Try the longest key first: three bytes, then two, then one
						// (as far as the number of remaining bytes allows it).
						for ($n = min(3, $length - $i); $n >= 1; $n--) {
								$key = $this->getKeyMap($string, $i, $n);
								if (isset($this->_mapping[$n]) && array_key_exists($key, $this->_mapping[$n])) {
										$code_point = (int) hexdec(trim((string) $this->_mapping[$n][$key]));
										$output .= $this->codePointToUtf8($code_point);
										$i += $n;   // We mapped $n bytes into one char, jump $n forward
										$matched = true;
										break;
								}
						}

						// No mapping found, just keep the character we have
						if (!$matched) {
								$output .= $this->codePointToUtf8(ord($string[$i]));
								$i++;
						}
				}

				return $output; // Return the string with replacements
		}


		/**
		* Format a string to same format as the keys in mapping table. The characters are
		* formatted as hexadecimal and separated with a plus sign. Example:
		* "Gon" -> "47+6f+6e"
		*
		* @param String  $string  The string which contains the characters to convert
		* @param int     $start   Start position for the characters to convert
		* @param int     $length  Number of characters to convert
		* @return String
		*/
		private function getKeyMap($string, $start, $length = 1) {
				$array = str_split(substr($string, $start, $length));
				foreach ($array as $key => $value) { $array[$key] = dechex(ord($value)); }
				return implode('+', $array);
		}


		/**
		* Encodes a single Unicode code point as a UTF-8 byte sequence.
		*
		* chr() alone only keeps the lowest eight bits of its argument, which is why code
		* points above U+00FF have to be encoded by hand here.
		*
		* @param int $code_point  The Unicode code point
		* @return String  One to four bytes; U+FFFD (replacement character) for values
		*                 that are not valid code points
		*/
		private function codePointToUtf8($code_point) {
				if ($code_point < 0 || $code_point > 0x10FFFF
						|| ($code_point >= 0xD800 && $code_point <= 0xDFFF)) {
						return "\xEF\xBF\xBD";
				}
				if ($code_point < 0x80) {
						return chr($code_point);
				}
				if ($code_point < 0x800) {
						return chr(0xC0 | ($code_point >> 6)).
								chr(0x80 | ($code_point & 0x3F));
				}
				if ($code_point < 0x10000) {
						return chr(0xE0 | ($code_point >> 12)).
								chr(0x80 | (($code_point >> 6) & 0x3F)).
								chr(0x80 | ($code_point & 0x3F));
				}
				return chr(0xF0 | ($code_point >> 18)).
						chr(0x80 | (($code_point >> 12) & 0x3F)).
						chr(0x80 | (($code_point >> 6) & 0x3F)).
						chr(0x80 | ($code_point & 0x3F));
		}


		/**
		* Converts a string of single byte characters to UTF-8, reading every byte as the
		* code point with the same value (ISO-8859-1). Replaces utf8_encode(), which is
		* deprecated as of PHP 8.2.
		*
		* @param String $string  The byte string to convert
		* @return String
		*/
		private function latin1ToUtf8($string) {
				// Pure 7 bit strings are already valid UTF-8
				if (!preg_match('/[\x80-\xFF]/', $string)) { return $string; }

				$output = '';
				$length = strlen($string);
				for ($i = 0; $i < $length; $i++) {
						$output .= $this->codePointToUtf8(ord($string[$i]));
				}
				return $output;
		}


		/**
		* Strips comments off the end of each line in given string. A line that starts
		* with the comment character becomes an empty line.
		*
		* @param String    $string         String where comments are being removed
		* @param String    $comment_char   The character indicating start of a comment
		* @return String   The given string with the comments removed
		*/
		private function stripComments($string, $comment_char = '#') {
				$return = array();
				foreach (explode("\n", $string) as $i => $line) {           // Handle line by line
						$pos = strpos($line, $comment_char);                   // Position of first '#'
						// Position 0 is a valid match, so compare strictly against false
						$return[$i] = ($pos === false) ? $line : substr($line, 0, $pos);
				}
				return implode("\n", $return);                               // Merge together as a string
		}
}

?>
Return current item: HuMo-gen