Location: PHPKode > projects > Sphider Plus > sphider-plus_v.2.9/include/stemming/ru_stem.php
<?php

/* o------------------------------------------------------------------------------o
 *
 *  Written on a cold winter evening close to the end of 2005 by Dennis Kreminsky
 *
 *    PHP5 implementation of Martin Porter's stemming algorithm for the Russian language.
 *   Additional stemming supplied and adapted for Sphider-plus
 *   by Rolf Kellner [Tec] Feb. 2010
 *
 * o------------------------------------------------------------------------------o */

// Byte length of one Russian (Cyrillic) letter when encoded as UTF-8.
// NOTE: the value is the string '2', not the integer 2, so any arithmetic or
// substr() offset/length that uses it relies on PHP's numeric-string conversion.
define ('CHAR_LENGTH', '2'); // all Russian characters take 2 bytes in UTF-8

/**
 * Stemmer for Russian words, following Martin Porter's (Snowball) algorithm,
 * preceded by one extra, project-specific suffix removal (see re()).
 *
 * Input is expected to be a lower-case, UTF-8 encoded word. All suffix
 * comparisons are done on bytes, which is safe for valid UTF-8 because a
 * complete character sequence can only match on a character boundary. The
 * word regions (RV, R1, R2) are located with UTF-8 aware regular expressions,
 * so the class does not rely on a fixed per-character byte width.
 */
class ru_Stemmer {

	/** The nine Russian vowels used to locate RV, R1 and R2. */
	const VOWELS = 'аеиоуыэюя';

	/**
	 * Return the stem of a single word.
	 *
	 * @param string $word lower-case UTF-8 word
	 * @return string the stemmed word; a word without any vowel is returned
	 *                unchanged (apart from the removal done by re())
	 */
	public function Stem($word){
		$word = self::re($word);
		list($start, $rv) = self::rv($word);

		// R2 is defined on the whole word and fixed before any ending is
		// removed. Express its start as a byte offset relative to $rv,
		// because steps 1-4 only ever see the RV part.
		$r2Start = self::regionStart($word, self::regionStart($word, 0)) - strlen($start);

		$rv = self::step1($rv);
		$rv = self::step2($rv);
		$rv = self::step3($rv, $r2Start);
		$rv = self::step4($rv);
		return $start.$rv;
	}

	/**
	 * Pre-processing that is not part of Porter's algorithm:
	 * remove a trailing "ский" or "у" from the word.
	 *
	 * @param string $word
	 * @return string
	 */
	private static function re($word) {
		return preg_replace("/(ский|у)$/", '', $word);
	}

	/**
	 * Split the word at the end of its first vowel.
	 *
	 * @param string $word
	 * @return array array($start, $rv): $start runs up to and including the
	 *               first vowel, $rv (the RV region) is everything after it.
	 *               Without a vowel (or if the text cannot be read as UTF-8)
	 *               $start is the whole word and $rv is empty.
	 */
	private static function rv($word){
		if (preg_match('/^(.*?['.self::VOWELS.'])(.*)$/us', $word, $parts))
			return array($parts[1], $parts[2]);
		return array($word, '');
	}

	/**
	 * Byte offset just behind the first non-vowel that follows a vowel,
	 * searching from byte offset $from. Called with 0 it yields the start of
	 * R1; called with the start of R1 it yields the start of R2.
	 *
	 * @param string $word
	 * @param int    $from byte offset (on a character boundary) to search from
	 * @return int   start of the region, or strlen($word) if there is none
	 */
	private static function regionStart($word, $from){
		$pattern = '/['.self::VOWELS.'][^'.self::VOWELS.']/u';
		if (preg_match($pattern, $word, $hit, PREG_OFFSET_CAPTURE, $from))
			return $hit[0][1] + strlen($hit[0][0]);
		return strlen($word);
	}

	/**
	 * Tell whether $word ends with $suffix (byte-wise comparison).
	 *
	 * @param string $word
	 * @param string $suffix
	 * @return bool
	 */
	private static function endsWith($word, $suffix){
		$length = strlen($suffix);
		return strlen($word) >= $length && substr($word, -$length) === $suffix;
	}

	/**
	 * Remove the first ending of $endings that $word ends with.
	 *
	 * Every list passed in is ordered so that a longer ending is tried before
	 * any shorter ending that forms its tail; the first hit is therefore also
	 * the longest one.
	 *
	 * @param string $word
	 * @param array  $endings  candidate endings
	 * @param bool   $afterAYa when true an ending only counts if the letter in
	 *                         front of it is "а" or "я"
	 * @return string|false the shortened word, or false if no ending applies
	 */
	private static function stripEnding($word, $endings, $afterAYa = false){
		foreach ($endings as $ending) {
			if (!self::endsWith($word, $ending))
				continue;
			$stem = (string) substr($word, 0, strlen($word) - strlen($ending));
			if ($afterAYa && !self::endsWith($stem, 'а') && !self::endsWith($stem, 'я'))
				continue;
			return $stem;
		}
		return false;
	}

	/**
	 * Step 1: remove a PERFECTIVE GERUND ending; if there is none, remove a
	 * REFLEXIVE ending and then the first that applies of an ADJECTIVAL,
	 * VERB or NOUN ending.
	 *
	 * @param string $word the RV part of the word
	 * @return string
	 */
	private static function step1($word){
		// PERFECTIVE GERUND: group 1 must follow а/я, group 2 is unconditional.
		$stem = self::stripEnding($word, array('вшись','вши','в'), true);
		if ($stem === false)
			$stem = self::stripEnding($word, array('ившись','ывшись','ивши','ывши','ив','ыв'));
		if ($stem !== false)
			return $stem;

		// REFLEXIVE
		$stem = self::stripEnding($word, array('ся','сь'));
		if ($stem !== false)
			$word = $stem;

		// ADJECTIVAL: an ADJECTIVE ending, optionally preceded by one
		// PARTICIPLE ending.
		$adjective = array('ее','ие','ые','ое','ими','ыми','ей','ий','ый','ой','ем','им','ым','ом','его','ого','ему','ому','их','ых','ую','юю','ая','яя','ою','ею');
		$stem = self::stripEnding($word, $adjective);
		if ($stem !== false) {
			// The unconditional (longer) participle endings are tried first;
			// the remaining ones only count behind а/я.
			$base = self::stripEnding($stem, array('ивш','ывш','ующ'));
			if ($base === false)
				$base = self::stripEnding($stem, array('ем','нн','вш','ющ','щ'), true);
			return $base === false ? $stem : $base;
		}

		// VERB: group 1 must follow а/я, group 2 is unconditional.
		$verb1 = array('ла','на','ете','йте','ли','й','л','ем','н','ло','нно','но','ет','ют','ны','ть','ешь');
		$stem = self::stripEnding($word, $verb1, true);
		if ($stem !== false)
			return $stem;

		$verb2 = array('ила','ыла','ена','ейте','уйте','ите','или','ыли','ей','уй','ил','ыл','им','ым','ен','ило','ыло','ено','ят','ует','уют','ит','ыт','ены','ить','ыть','ишь','ую','ю');
		$stem = self::stripEnding($word, $verb2);
		if ($stem !== false)
			return $stem;

		// NOUN
		$noun = array('а','ев','ов','ие','ье','е','иями','ями','ами','еи','ии','и','ией','ей','ой','ий','й','иям','ям','ием','ем','ам','ом','о','у','ах','иях','ях','ы','ь','ию','ью','ю','ия','ья','я');
		$stem = self::stripEnding($word, $noun);
		return $stem === false ? $word : $stem;
	}

	/**
	 * Step 2: remove a trailing "и".
	 *
	 * @param string $word the RV part of the word
	 * @return string
	 */
	private static function step2($word){
		$stem = self::stripEnding($word, array('и'));
		return $stem === false ? $word : $stem;
	}

	/**
	 * Step 3: remove a DERIVATIONAL ending ("ость" or "ост") provided the
	 * ending lies completely inside R2.
	 *
	 * @param string $word    the RV part of the word
	 * @param int    $r2Start byte offset inside $word at which R2 begins
	 * @return string
	 */
	private static function step3($word, $r2Start){
		foreach (array('ость','ост') as $ending) {
			$cut = strlen($word) - strlen($ending);
			if ($cut >= $r2Start && self::endsWith($word, $ending))
				return (string) substr($word, 0, $cut);
		}
		return $word;
	}

	/**
	 * Step 4: undouble a trailing "нн"; otherwise remove a SUPERLATIVE ending
	 * ("ейш", "ейше") and undouble "нн" behind it. Finally remove a trailing
	 * soft sign "ь".
	 *
	 * The algorithm describes the soft sign as an "otherwise" case; it is
	 * tested unconditionally here because a word cannot end in "нн" or a
	 * superlative ending and in "ь" at the same time after the steps above.
	 *
	 * @param string $word the RV part of the word
	 * @return string
	 */
	private static function step4($word){
		$oneN = strlen('н');
		if (self::endsWith($word, 'нн')) {
			$word = substr($word, 0, strlen($word) - $oneN);
		} else {
			$stem = self::stripEnding($word, array('ейше','ейш'));
			if ($stem !== false)
				$word = $stem;
			if (self::endsWith($word, 'нн'))
				$word = substr($word, 0, strlen($word) - $oneN);
		}

		$stem = self::stripEnding($word, array('ь'));
		return $stem === false ? $word : $stem;
	}
}

?>
Return current item: Sphider Plus