Clair::Utils

porter


Package variablesGeneral documentationMethods

Package variablesTop
No package variables defined.

SynopsisTop
No synopsis!
DescriptionTop
No description!
MethodsTop
porterNo descriptionCode

Methods description


None available.

Methods code


porterdescriptionprevnextTop
sub porter {
  # First and only argument. The word to be stemmed. should be lower-case
# an "i" could be apended to the regular expresions to make them case
# insensitive.
my $word=shift; if(length($word)>2){ # This is a consonant. Not "aiueo" and "y" only if preceded by a vowel
my $c='(?:[^aiueoy]|(?:(?<=[aiueo])y)|\by)'; #reconoce una consonante
# This is a vowel. "aiueo" and "y" if preceded by a consonant
my $v='(?:[aiueo]|(?:(?<![aiueo])y))'; #reconoce una vocal
my $extra=0; # The re "/^(?:$c+)?(?:$v+$c+){m}(?:$v+)?$/" is [C](VC)**m[V] in perl
# Matches if (m > 0)
my $m_gt_0 = "^(?:$c+)?(?:$v+$c+){1,}(?:$v+)?\$"; # Matches if (m > 1)
my $m_gt_1 = "^(?:$c+)?(?:$v+$c+){2,}(?:$v+)?\$"; # Matches if (m = 1)
my $m_eq_1="^(?:$c+)?(?:$v+$c+){1}(?:$v+)?\$"; # Matches *o
my $o="$c$v(?:[^aiueowxy])\$"; # Matches *d
my $d="($c)\\1\$"; #STEP 1a
if($word =~ /(.+)sses$/){ $word=$1."ss"; } elsif($word =~ /(.+)ies$/){ $word=$1."i"; } elsif($word =~ /(.+[^s])s$/){ # engloba 2 ultimas reglas de 1a
$word=$1; # Same as last 2 rules of 1a
} #STEP 1b
if($word =~ /(.+)eed$/) { if (($w=$1) =~ /$m_gt_0/o){ $word=$w."ee"; } } elsif($word =~ /(.+)ed$/) { if (($w=$1) =~ /$v/o) { $word=$w; $extra=1; } } elsif($word =~ /(.+)ing$/){ if(($w=$1) =~ /$v/o) { $word=$w; $extra=1; } } # If 2nd or 3rd of the previous rules was successful try the extra rules...
#Si aplicaron alguna de las dos ultimas reglas de "1b" hacemos las siguientes
if($extra){ if($word =~ /(.+)at$/){ $word=$1."ate"; } elsif($word =~ /(.+)bl$/){ $word=$1."ble"; } elsif($word =~ /(.+)iz$/){ $word=$1."ize"; } # (*d and not (*L or *S or *Z)) --> single letter
elsif(($word =~ /$d/o) and ($word !~ /[lsz]$/)){ $word=substr($word,0,-1); } # (m=1 and *o) --> E
elsif(($word =~ /$m_eq_1/o) and ($word =~ /$o/o)){ $word.='e'; } } # STEP 1c
if($word =~ /(.+)y$/){ if (($w=$1) =~ /$v/o){ $word=$w."i"; } } #STEP 2 con crazy performance hack descrito en paper
# To speed up the algorithm we do a switch on the penultimate letter of the
# word being tested. Same for steps 3 and 4.
my $letter=substr($word,-2,1); if($letter eq "a"){ if($word =~ /(.+)ational$/){ if(($w=$1) =~ /$m_gt_0/o){ $word=$w."ate"; } } elsif($word =~ /(.+)tional$/){ if(($w=$1) =~ /$m_gt_0/o){ $word=$w."tion"; } } } elsif($letter eq "c"){ if($word =~ /(.+)enci$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w."ence"; } } elsif($word =~ /(.+)anci$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w."ance"; } } } elsif($letter eq "e"){ if($word =~ /(.+)izer$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w."ize"; } } } #NEW RULE SEE "http://www.muscat.com/~martin/stem.html"
elsif($letter eq "g"){ if($word =~ /(.+)logi$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w."log"; } } } elsif($letter eq "l"){ #RULE CHANGED SEE "http://www.muscat.com/~martin/stem.html"
if($word =~ /(.+)bli$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w."ble"; } } elsif($word =~ /(.+)alli$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w."al"; } } elsif($word =~ /(.+)entli$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w."ent"; } } elsif($word =~ /(.+)eli$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w."e"; } } elsif($word =~ /(.+)ousli$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w."ous"; } } } elsif($letter eq "o"){ if($word =~ /(.+)ization$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w."ize"; } } elsif($word =~ /(.+)ation$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w."ate"; } } elsif($word =~ /(.+)ator$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w."ate"; } } } elsif($letter eq "s"){ if($word =~ /(.+)alism$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w."al"; } } elsif($word =~ /(.+)iveness$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w."ive"; } } elsif($word =~ /(.+)fulness$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w."ful"; } } elsif($word =~ /(.+)ousness$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w."ous"; } } } elsif($letter eq "t"){ if($word =~ /(.+)aliti$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w."al"; } } elsif($word =~ /(.+)iviti$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w."ive"; } } elsif($word =~ /(.+)biliti$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w."ble"; } } } #STEP 3
$letter=substr($word,-1,1); if($letter eq "e"){ if($word =~ /(.+)icate$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w."ic"; } } elsif($word =~ /(.+)ative$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w; } } elsif($word =~ /(.+)alize$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w."al"; } } } elsif($letter eq "i"){ if($word =~ /(.+)iciti$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w."ic"; } } } elsif($letter eq "l"){ if($word =~ /(.+)ical$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w."ic"; } } elsif($word =~ /(.+)ful$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w; } } } elsif($letter eq "s"){ if($word =~ /(.+)ness$/) { if(($w=$1) =~ /$m_gt_0/o){ $word=$w; } } } #STEP 4
$letter=substr($word,-2,1); if($letter eq "a"){ if($word =~ /(.+)al$/) { if(($w=$1) =~ /$m_gt_1/o){ $word=$w; } } } elsif($letter eq "c"){ if($word =~ /(.+)ance$/) { if(($w=$1) =~ /$m_gt_1/o){ $word=$w; } } elsif($word =~ /(.+)ence$/) { if(($w=$1) =~ /$m_gt_1/o){ $word=$w; } } } elsif($letter eq "e"){ if($word =~ /(.+)er$/) { if(($w=$1) =~ /$m_gt_1/o){ $word=$w; } } } elsif($letter eq "i"){ if($word =~ /(.+)ic$/) { if(($w=$1) =~ /$m_gt_1/o){ $word=$w; } } } elsif($letter eq "l"){ if($word =~ /(.+)able$/) { if(($w=$1) =~ /$m_gt_1/o){ $word=$w; } } elsif($word =~ /(.+)ible$/) { if(($w=$1) =~ /$m_gt_1/o){ $word=$w; } } } elsif($letter eq "n"){ if($word =~ /(.+)ant$/) { if(($w=$1) =~ /$m_gt_1/o){ $word=$w; } } elsif($word =~ /(.+)ement$/) { if(($w=$1) =~ /$m_gt_1/o){ $word=$w; } } elsif($word =~ /(.+)ment$/) { if(($w=$1) =~ /$m_gt_1/o){ $word=$w; } } elsif($word =~ /(.+)ent$/) { if(($w=$1) =~ /$m_gt_1/o){ $word=$w; } } } elsif($letter eq "o"){ if($word =~ /(.+)ion$/) { if((($w=$1) =~ /[st]$/) and ($w =~ /$m_gt_1/o)){ $word=$w; } } elsif($word =~ /(.+)ou$/) { if(($w=$1) =~ /$m_gt_1/o){ $word=$w; } } } elsif($letter eq "s"){ if($word =~ /(.+)ism$/) { if(($w=$1) =~ /$m_gt_1/o){ $word=$w; } } } elsif($letter eq "t"){ if($word =~ /(.+)ate$/) { if(($w=$1) =~ /$m_gt_1/o){ $word=$w; } } elsif($word =~ /(.+)iti$/) { if(($w=$1) =~ /$m_gt_1/o){ $word=$w; } } } elsif($letter eq "u"){ if($word =~ /(.+)ous$/) { if(($w=$1) =~ /$m_gt_1/o){ $word=$w; } } } elsif($letter eq "v"){ if($word =~ /(.+)ive$/) { if(($w=$1) =~ /$m_gt_1/o){ $word=$w; } } } elsif($letter eq "z"){ if($word =~ /(.+)ize$/) { if(($w=$1) =~ /$m_gt_1/o){ $word=$w; } } } #STEP 5a
if($word =~ /(.+)e$/) { if((($w=$1) =~ /$m_gt_1/o) or (($w =~ /$m_eq_1/o) and ($w !~ /$o/o))){ $word=$w; } } #STEP 5b
#(m>1 and *d and *L) -->
if($word =~ /ll$/) { if($word =~ /$m_gt_1/o){ $word=substr($word,0,length($word)-1); } } # It's stemmed so I guess we can give it back :-)
} return $word;
}

General documentation


No general documentation available.