<?php
/* o------------------------------------------------------------------------------o
*
* Written on a cold winter evening close to the end of 2005 by Dennis Kreminsky
*
* PHP5 implementation of Martin Porter's stemming algorithm for Russian language.
* Additional stemming supplied and adapted for Sphider-plus
* by Rolf Kellner [Tec] Feb. 2010
*
* o------------------------------------------------------------------------------o */
// Byte width assumed for one Cyrillic letter in UTF-8.
// Note that the value is the string '2', not an integer: any arithmetic on this
// constant relies on PHP's numeric-string coercion.
define ('CHAR_LENGTH', '2'); // all Russian characters take 2 bytes in UTF-8
/**
 * Suffix-stripping stemmer for Russian words, organised as a pre-pass plus
 * four steps (step1..step4) applied to the part of the word that follows its
 * first vowel.
 *
 * All scanning is byte-wise in fixed chunks of CHAR_BYTES bytes, so input made
 * of characters with any other byte width (ASCII letters, digits, ...) will be
 * split mid-character.
 *
 * NOTE(review): many suffix literals below contain 'Ñ' and even embedded line
 * breaks; they look like damaged (mis-encoded) Cyrillic. They are reproduced
 * verbatim here because the intended letters cannot be recovered reliably from
 * this file alone -- restore them from a clean copy and re-test.
 *
 * Internal calls deliberately go through self:: (not $this->) so that legacy
 * code invoking the stemmer statically keeps working.
 */
class ru_Stemmer {
    /** Number of bytes consumed per character by every scan in this class. */
    const CHAR_BYTES = 2;

    /**
     * Stems a single word.
     *
     * @param string $word Word to stem.
     * @return string The untouched head of the word (up to and including its
     *                first vowel) followed by the stemmed remainder.
     */
    public function Stem($word) {
        $word = self::re($word);
        list($start, $rv) = self::rv($word);
        $rv = self::step1($rv);
        $rv = self::step2($rv);
        $rv = self::step3($rv);
        $rv = self::step4($rv);
        return $start . $rv;
    }

    /**
     * Pre-pass: removes a trailing match of either alternative in the pattern.
     * NOTE(review): the pattern literal looks mis-encoded; kept verbatim.
     *
     * @param string $word
     * @return string|null Result of preg_replace().
     */
    private function re($word) {
        return preg_replace("/(Ñкий|Ñ)$/", '', $word);
    }

    /**
     * Splits a word right after its first vowel.
     *
     * @param string $word
     * @return array Two elements: the head (everything up to and including the
     *               first vowel, or the whole word when it has no vowel) and
     *               the remainder (possibly empty).
     */
    private function rv($word) {
        $length = strlen($word);
        for ($i = 0; $i < $length; $i += self::CHAR_BYTES) {
            if (self::isVowel(substr($word, $i, self::CHAR_BYTES))) {
                $cut = $i + self::CHAR_BYTES;
                // The (string) casts normalise substr()'s legacy "false" result.
                return array((string) substr($word, 0, $cut), (string) substr($word, $cut));
            }
        }
        return array((string) $word, '');
    }

    /**
     * Step 1: removes one class of ending. Classes are tried in this order:
     * perfective gerund, reflexive (+ adjective/participle, verb, noun).
     * Except for the reflexive ending, the first class that matches ends the step.
     *
     * @param string $word Remainder produced by rv().
     * @return string
     */
    private function step1($word) {
        // Perfective gerund, group 1: only stripped when preceded by a marker letter.
        $stem = self::removeFirstSuffix($word, array('в', 'вÑи', 'вÑиÑÑ'), true);
        if ($stem !== null) {
            return $stem;
        }

        // Perfective gerund, group 2: stripped unconditionally.
        $stem = self::removeFirstSuffix($word, array('ив','ивÑи','ивÑиÑÑ','ÑвÑи','ÑвÑиÑÑ'), false);
        if ($stem !== null) {
            return $stem;
        }

        // Reflexive endings are stripped without ending the step.
        $word = self::removeEverySuffix($word, array('ÑÑ', 'ÑÑ'), false);

        // Do NOT re-indent or re-wrap the next statement: some of its literals
        // contain a line break, so the continuation lines must stay at column 0.
        $adjective=array('ее','ие','Ñе','ое','ими','Ñми','ей','ий','Ñй','ой','ей','ий','Ñй','ой','ом','его','ого','емÑ','омÑ','иÑ
','ÑÑ
','емÑ','омÑ','иÑ
','ÑÑ
','ÑÑ','ÑÑ','аÑ','ÑÑ','оÑ','еÑ');
        $participle2 = array('ем','нн','вÑ','ÑÑ','Ñ');
        $participle1 = array('ивÑ','ÑвÑ','ÑÑÑ');
        foreach ($adjective as $suffix) {
            if (self::endsWith($word, $suffix)) {
                // An adjectival ending may itself follow a participle ending.
                $word = self::dropSuffix($word, $suffix);
                $word = self::removeEverySuffix($word, $participle1, true);
                return self::removeEverySuffix($word, $participle2, false);
            }
        }

        // Verb endings, group 1: only stripped when preceded by a marker letter.
        $verb1 = array('ла','на','еÑе','йÑе','ли','й','л','ем','н','ло','но','еÑ','ÑÑ','нÑ','ÑÑ','еÑÑ','нно');
        $stem = self::removeFirstSuffix($word, $verb1, true);
        if ($stem !== null) {
            return $stem;
        }

        // Verb endings, group 2: stripped unconditionally.
        $verb2 = array('ила','Ñла','ена','ейÑе','ÑйÑе','иÑе','или','Ñли','ей','Ñй','ил','Ñл','им','Ñм','ен','ило','Ñло','ено','ÑÑ','ÑеÑ','ÑÑÑ','иÑ','ÑÑ','енÑ','иÑÑ','ÑÑÑ','иÑÑ','ÑÑ','Ñ');
        $stem = self::removeFirstSuffix($word, $verb2, false);
        if ($stem !== null) {
            return $stem;
        }

        // Noun endings. As above, keep the continuation lines at column 0.
        $noun=array('а','ев','ов','ие','Ñе','е','иÑми','Ñми','ами','еи','ии','и','ией','ей','ой','ий','й','иÑм','Ñм','ием','ем','ам','ом','о','Ñ','аÑ
','иÑÑ
','ÑÑ
','Ñ','Ñ','иÑ','ÑÑ','Ñ','иÑ','ÑÑ','Ñ');
        $stem = self::removeFirstSuffix($word, $noun, false);
        if ($stem !== null) {
            return $stem;
        }

        return $word;
    }

    /**
     * Step 2: removes a trailing 'и'.
     *
     * The previous code compared against 'ø', which can never be the last
     * letter of a Russian word, so this step never fired.
     *
     * @param string $word
     * @return string
     */
    private function step2($word) {
        if (self::endsWith($word, 'и')) {
            $word = self::dropSuffix($word, 'и');
        }
        return $word;
    }

    /**
     * Step 3: removes a derivational ending when it lies inside R2.
     *
     * R1 is the region after the first non-vowel that follows a vowel; R2 is
     * the same region computed inside R1. Both are computed on the string
     * passed in (the remainder from rv()), as the previous code did.
     *
     * Fixes over the previous code: the state test used "=" instead of "==",
     * later vowels reset the scan state, and the stem was cut using the length
     * of R2 instead of the length of the word.
     *
     * @param string $word
     * @return string
     */
    private function step3($word) {
        $r2 = self::regionAfterVowelConsonant(self::regionAfterVowelConsonant($word));
        foreach (array('оÑÑ','оÑÑÑ') as $suffix) {
            if (self::endsWith($r2, $suffix)) {
                // R2 is a tail of $word, so $word ends with the same suffix.
                $word = self::dropSuffix($word, $suffix);
            }
        }
        return $word;
    }

    /**
     * Step 4: undoubles a trailing 'нн'; otherwise removes superlative endings
     * and then undoubles 'нн'. Finally removes the trailing letter tested in
     * the last check.
     *
     * The algorithm describes that final removal as an "otherwise" case; no
     * guard flag is used here, so it is attempted unconditionally.
     *
     * @param string $word
     * @return string
     */
    private function step4($word) {
        if (self::endsWith($word, 'нн')) {
            $word = self::dropLastChar($word);
        } else {
            $word = self::removeEverySuffix($word, array('ейÑ', 'ейÑе'), false);
            if (self::endsWith($word, 'нн')) {
                $word = self::dropLastChar($word);
            }
        }
        if (self::endsWith($word, 'Ñ')) {
            $word = self::dropLastChar($word);
        }
        return $word;
    }

    /** Tells whether a CHAR_BYTES-sized chunk is one of the listed vowels. */
    private static function isVowel($char) {
        return in_array($char, array('а','е','и','о','Ñ','Ñ','Ñ','Ñ','Ñ'), true);
    }

    /** Tells whether $word ends with $suffix (byte-wise). */
    private static function endsWith($word, $suffix) {
        return substr($word, -strlen($suffix)) === $suffix;
    }

    /**
     * Tells whether the chunk immediately before $suffix is one of the two
     * marker letters that license removal of "group 1" endings.
     */
    private static function followsMarker($word, $suffix) {
        $previous = substr($word, -strlen($suffix) - self::CHAR_BYTES, self::CHAR_BYTES);
        return $previous === 'а' || $previous === 'Ñ';
    }

    /** Returns $word without its last strlen($suffix) bytes. */
    private static function dropSuffix($word, $suffix) {
        return substr($word, 0, strlen($word) - strlen($suffix));
    }

    /** Returns $word without its last character (CHAR_BYTES bytes). */
    private static function dropLastChar($word) {
        return substr($word, 0, strlen($word) - self::CHAR_BYTES);
    }

    /**
     * Strips the first suffix in $suffixes that $word ends with.
     *
     * @param string $word
     * @param array  $suffixes    Candidates, tried in order.
     * @param bool   $needsMarker When true a suffix only counts if followsMarker() holds.
     * @return string|null The shortened word, or null when nothing matched.
     */
    private static function removeFirstSuffix($word, $suffixes, $needsMarker) {
        foreach ($suffixes as $suffix) {
            if (self::endsWith($word, $suffix) && (!$needsMarker || self::followsMarker($word, $suffix))) {
                return self::dropSuffix($word, $suffix);
            }
        }
        return null;
    }

    /**
     * Like removeFirstSuffix(), but keeps going through the list after a match,
     * testing each remaining suffix against the already shortened word.
     *
     * @return string The (possibly unchanged) word.
     */
    private static function removeEverySuffix($word, $suffixes, $needsMarker) {
        foreach ($suffixes as $suffix) {
            if (self::endsWith($word, $suffix) && (!$needsMarker || self::followsMarker($word, $suffix))) {
                $word = self::dropSuffix($word, $suffix);
            }
        }
        return $word;
    }

    /**
     * Returns everything after the first non-vowel that follows a vowel, or ''
     * when there is no such non-vowel.
     */
    private static function regionAfterVowelConsonant($text) {
        $seenVowel = false;
        $length = strlen($text);
        for ($i = 0; $i < $length; $i += self::CHAR_BYTES) {
            if (self::isVowel(substr($text, $i, self::CHAR_BYTES))) {
                $seenVowel = true;
            } elseif ($seenVowel) {
                return (string) substr($text, $i + self::CHAR_BYTES);
            }
        }
        return '';
    }
}
?>