Clair

Document


Package variables | General documentation | Methods

Package variablesTop
Privates (from "my" definitions)
$sid;
@sentences;
$latest_tag;
$r_text;

Included modulesTop
Carp
Clair::Config
Clair::SentenceSegmenter::MxTerminator
Clair::SentenceSegmenter::SentenceSegmenter
Clair::SentenceSegmenter::Text
Clair::Utils::MxTerminator
Clair::Utils::TFIDFUtils qw ( )
IO::File
Lingua::Stem
Lingua::Stem::En
Scalar::Util qw ( looks_like_number )
XML::Parser

SynopsisTop
No synopsis!
DescriptionTop
No description!
MethodsTop
new | No description | Code

Methods description


None available.

Methods code


new | description | prev | next | Top
# Construct a document from either a file or an in-memory string.
#
# Named parameters:
#   type     - 'text' (default), 'html', 'stem' or 'xml'; also the hash key
#              under which the body is stored on the object
#   file     - path whose contents become the body
#   string   - the body itself (mutually exclusive with file)
#   id       - stored as the document id
#   language - stored as given
#   class    - stored as the class label
#
# Dies when type is illegal, when both or neither of file/string are given,
# or when the file cannot be opened.
sub new {
    my $class = shift;

    Clair::Utils::MxTerminator::init();

    my %parameters = @_;

    my $type = $parameters{type};
    $type = "text" unless defined $type;

    my %is_legal_type = map { $_ => 1 } qw(html text stem xml);
    unless ($is_legal_type{$type}) {
        die('Document::new - Illegal value of type parameter.');
    }

    my $file     = $parameters{file};
    my $string   = $parameters{string};
    my $id       = $parameters{id};
    my $language = $parameters{language};
    my $label    = $parameters{class};

    if (defined $file && defined $string) {
        die('Document::new - Both file and string defined.');
    }

    my $body;

    if (defined $file) {
        # Three-argument open with a lexical handle: the file name can no
        # longer be interpreted as a mode, and the handle is closed again
        # instead of leaking through a shared bareword FILE.
        open(my $fh, '<', $file)
            or die("Document::new - Could not open file: $file");
        $body = do { local $/; <$fh> };
        $body = q{} unless defined $body;
        close $fh;
    }
    elsif (defined $string) {
        $body = $string;
    }
    else {
        die('Document::new - Neither file nor string defined.');
    }

    my $self = bless {
        $type    => $body,
        id       => $id,
        language => $language,
        class    => $label,
    }, $class;

    return $self;
}

# Return the number of whitespace-separated words in the text body.
# Leading whitespace no longer produces a phantom empty "word", and a
# document without a text body counts as zero words.
sub count_words {
    my $self = shift;

    my $body = $self->{text};
    return 0 unless defined $body;

    my @words = split(' ', $body);
    return scalar(@words);
}

# Split one representation of the document into words.
#
# Named parameters:
#   type - 'text' (default), 'html' or 'stem'
#   punc - passed through to Clair::Utils::TFIDFUtils::split_words
#
# Dies for any other type.
sub split_into_words {
    my $self = shift;
    my %parameters = @_;

    my $type = exists $parameters{type} ? $parameters{type} : 'text';
    my $punc = $parameters{punc};

    my $body;
    if ($type eq "text") {
        $body = $self->get_text();
    }
    elsif ($type eq "html") {
        $body = $self->get_html();
    }
    elsif ($type eq "stem") {
        $body = $self->get_stem();
    }
    else {
        die "type must be html, text, or stem";
    }

    return Clair::Utils::TFIDFUtils::split_words($body, $punc);
}

# Return the distinct words of the document (order unspecified).
# The optional named parameter type defaults to "stem".
sub get_unique_words {
    my $self = shift;
    my %params = @_;

    my $type = $params{type} || "stem";
    my @words = $self->split_into_words( type => $type );

    # A hash slice de-duplicates without running map for its side effects.
    my %seen;
    @seen{@words} = ();

    my @unique_words = keys %seen;
    return @unique_words;
}

# Print one representation of the document to the selected output handle.
# With type => "sent" each cached sentence is printed on its own line;
# otherwise the value stored under that type is printed as-is.
sub print {
    my $self = shift;
    my %parameters = @_;

    my $type = $parameters{type};
    my $body = $self->{$type};

    if ($type eq "sent") {
        # Work on a copy of the reference so an absent sentence list is not
        # autovivified on the object.
        my $sents = $self->{sent};
        foreach my $s (@{$sents}) {
            print "$s\n";
        }
    }
    else {
        print $body;
    }
}

# Write one representation of the document to a file.
#
# Named parameters:
#   file - path to write (truncated if it exists)
#   type - key of the representation to write
#
# Croaks when the file cannot be opened or the write cannot be flushed.
sub save {
    my $self = shift;
    my %parameters = @_;

    my $file = $parameters{file};
    my $type = $parameters{type};
    my $body = $self->{$type};

    open(my $fh, '>', $file)
        or croak('Document::save - Could not open file for writing.');
    print {$fh} $body;

    # Buffered write errors only surface at close time.
    close $fh
        or croak('Document::save - Could not close file after writing.');

    return 1;
}

# Remove tags from the html body, store the result as the text body and
# return it. Without an html body the existing text is returned unchanged.
sub strip_html {
    my $self = shift;

    return $self->{text} unless defined $self->{html};

    my $text = $self->{html};
    $text =~ s/<.+?>//g;
    $self->{text} = $text;

    return $text;
}

# Return the html body.
sub get_html {
    my $self = shift;
    return $self->{html};
}

# Return the xml body.
sub get_xml {
    my $self = shift;
    return $self->{xml};
}

# Return the text body.
sub get_text {
    my $self = shift;
    return $self->{text};
}

# Return the stemmed body, computing it with stem() on first use.
sub get_stem {
    my $self = shift;

    return $self->{stem} if defined $self->{stem};
    return $self->stem();
}

# Deprecated alias of get_sentences.
sub get_sent {
    my $self = shift;
    return $self->get_sentences();
}

# Return the sentences of the document, splitting the text on first use.
sub get_sentences {
    my $self = shift;

    if (defined $self->{sent}) {
        return @{$self->{sent}};
    }
    else {
        return $self->split_into_sentences();
    }
}

# Return the document id.
sub get_id {
    my $self = shift;
    return $self->{id};
}

# Return the class label.
sub get_class {
    my $self = shift;
    return $self->{class};
}

# Set the document id from the named parameter id; dies when it is missing.
sub set_id {
    my $self = shift;
    my %parameters = @_;

    my $id = $parameters{id};
    if (!defined $id) {
        die('Document::set_id - id parameter not defined.');
    }

    $self->{id} = $id;
}

# Set the class label (positional argument).
sub set_class {
    my $self = shift;
    my $label = shift;

    $self->{class} = $label;
}

# Remember the document this one was taken from.
sub set_parent_document {
    my $self = shift;
    my $doc = shift;

    $self->{parent_document} = $doc;
}

# Return the parent document; dies when none has been set.
sub get_parent_document {
    my $self = shift;

    if (not exists $self->{parent_document}) {
        die "Parent document has not been set.\n";
    }

    return $self->{parent_document};
}

# Return a hash mapping each term to its frequency in the document.
#
# Named parameters:
#   type - representation to split, defaults to "stem"
#   punc - passed through to split_into_words, defaults to 0
sub tf {
    my $self = shift;
    my %params = @_;

    my $type = $params{type} || "stem";
    my $punc = $params{punc} || 0;

    my %tf;
    foreach my $word ($self->split_into_words( type => $type, punc => $punc )) {
        $tf{$word}++;
    }

    return %tf;
}

# Build a new Clair::Document from the sentences that pass a filter.
#
# Named parameters (one is required):
#   matches - regular expression a sentence must match
#   test    - code reference called with the sentence; keep it when true
#
# The new document is of type "text" and inherits this document's id and
# class. Without a filter a warning is issued and $self is returned.
sub filter_sents {
    my $self = shift;
    my %params = @_;

    unless ($params{matches} || $params{test}) {
        warn "No argument passed to Document::filter_sents";
        return $self;
    }

    my @sents = $self->split_into_sentences();

    my $test = $params{test};
    if ($params{matches}) {
        # Compile the pattern once instead of once per sentence.
        my $pattern = qr/$params{matches}/;
        $test = sub { $_[0] =~ $pattern };
    }

    my @filtered = grep { $test->($_) } @sents;
    my $text = join "", @filtered;

    my $id = $self->get_id();
    my $class = $self->get_class();
    my $result = Clair::Document->new(
        string => $text,
        type   => "text",
        id     => $id,
        class  => $class
    );

    return $result;
}

# ------------------------------------------------------------
# {tag,text} are auxiliary routines for parse_html()
# ------------------------------------------------------------
# Stem every whitespace-separated token of the text body with an EN-US
# Lingua::Stem stemmer, join the stems with single spaces, cache the result
# under $self->{stem} and return it.
sub stem {
    my $self = shift;

    my $stemmer = Lingua::Stem->new(-locale => 'EN-US');
    $stemmer->stem_caching({ -level => 2 });

    my @tokens      = split(/\s+/, $self->{text});
    my $stemmed_ref = $stemmer->stem(@tokens);

    $self->{stem} = join(" ", @{$stemmed_ref});
    return $self->{stem};
}

# Added by Mark Hodges because calculating the idf requires
# the newlines remain in place
# Note: this adds a newline to the end of the document
sub stem_keep_newlines {
    my $self = shift;

    my $stemmer = Lingua::Stem->new(-locale => 'EN-US');
    $stemmer->stem_caching({ -level => 2 });

    # Stem line by line; every line, including the last, gets a "\n".
    my $stem = "";
    foreach my $line (split(/\n/, $self->{text})) {
        my @words = split(/\s+/, $line);
        $stem .= join(" ", @{ $stemmer->stem(@words) }) . "\n";
    }

    $self->{stem} = $stem;
    return $stem;
}

# Return the text body split at newlines.
sub split_into_lines {
    my $self = shift;

    my @lines = split(/\n/, $self->{text});
    return @lines;
}

# Parse the xml body and collect the character data seen while the most
# recently opened element is <p>. Each such chunk becomes one sentence; the
# chunks joined with newlines become the text body. Returns the sentences.
#
# The parser callbacks communicate through the file-scoped variables $sid,
# @sentences, $r_text and $latest_tag, so this method is not reentrant.
sub xml_to_text {
    my $self = shift;

    my $xml_parser = XML::Parser->new(
        Handlers => {
            Start => \&read_start,
            Char  => \&read_char,
        }
    );

    $sid       = 1;
    @sentences = ();
    $r_text    = "";

    $xml_parser->parse($self->{xml});

    # Store a private copy: a reference to the shared @sentences would be
    # emptied and refilled by the next xml_to_text call on any document.
    my @parsed = @sentences;
    $self->{sent} = \@parsed;
    $self->{text} = $r_text;

    return @parsed;
}

# XML::Parser Start handler: remember the name of the element just opened.
sub read_start {
    my (undef, $element_name) = @_;

    $latest_tag = $element_name;
}

# XML::Parser Char handler: keep non-blank character data seen while the
# most recently opened element is <p>; everything else is ignored.
sub read_char {
    my (undef, $text) = @_;

    return unless $text =~ /\S/;
    return unless $latest_tag eq "p";

    # Former record format: "$sid\t$latest_tag\t$text"
    push(@sentences, "$text");
    $r_text .= "$text\n";
    $sid++;
}

# Split the text body into sentences, cache them under $self->{sent},
# reset the per-sentence feature table to empty hashes and return the
# sentences.
sub split_into_sentences {
    my $self = shift;
    my $text = $self->{text};

    # Old MxTerminator style:
    # MxTerminator::init;
    # my @sentences = MxTerminator::do_document($text);
    # Old Text::Sentence style:
    # Setting the locale first may be worthwhile here. TODO
    # my @sentences = split_sentences( $text );
    # MxTerminator keeps a single whitespace character at the end of each sentence.
    # Text::Sentence does not, leading to undesired behavior.
    # The following loop makes this function conform to its original authors'
    # expectations.
    # for my $i (0..$#sentences) {
    # $sentences[$i] = "$sentences[$i] ";
    # }
    # New SentenceSegmenter style:
    my $segmenter;
    if ($SENTENCE_SEGMENTER_TYPE eq "Text") {
        $segmenter = Clair::SentenceSegmenter::Text->new();
    }
    elsif ($SENTENCE_SEGMENTER_TYPE eq "MxTerminator") {
        $segmenter = Clair::SentenceSegmenter::MxTerminator->new();
    }
    else {
        $segmenter = Clair::SentenceSegmenter::Text->new();
        warn "Configuration variable \$SENTENCE_SEGMENTER_TYPE is not set to an eligible value.";
    }

    my @sentences = $segmenter->split_sentences($text);
    my @sent_feats = map { {} } @sentences;

    $self->{sent}       = \@sentences;
    $self->{sent_feats} = \@sent_feats;

    return @sentences;
}

# Return the number of sentences in the document.
sub sentence_count {
    my $self = shift;
    return scalar $self->get_sentences();
}

# Return true when $index is a valid zero-based sentence index.
sub sentence_index_in_range {
    my $self  = shift;
    my $index = shift;

    my $total_sents = $self->sentence_count();
    return ($index >= 0 and $index < $total_sents);
}

# Set features (name => value pairs) on sentence $index. Pairs with an
# undefined value are skipped. Returns the number of features stored, or
# undef when the index is out of range or no pairs were given.
sub set_sentence_feature {
    my $self     = shift;
    my $index    = shift;
    my %features = @_;

    return undef unless ($self->sentence_index_in_range($index));
    return undef unless (keys %features > 0);

    my $added = 0;
    foreach my $feature_name (keys %features) {
        next unless defined $features{$feature_name};
        $self->{sent_feats}->[$index]->{$feature_name} = $features{$feature_name};
        $added++;
    }

    return $added;
}

# Return the feature hash of sentence $index, or undef when the index is
# out of range. A sentence without any features yields an empty hash.
sub get_sentence_features {
    my $self  = shift;
    my $index = shift;

    return undef unless $self->sentence_index_in_range($index);

    # Sentences cached by xml_to_text have no feature table yet; fall back
    # to an empty hash instead of dereferencing undef.
    my $feat_ref = $self->{sent_feats}->[$index] || {};
    return %$feat_ref;
}

# Return the value of feature $name on sentence $index, or undef when the
# index is out of range or the feature is not set.
sub get_sentence_feature {
    my $self  = shift;
    my $index = shift;
    my $name  = shift;

    return undef unless ($self->sentence_index_in_range($index));
    return $self->{sent_feats}->[$index]->{$name};
}

# Remove every feature from every sentence.
sub remove_sentence_features {
    my $self = shift;

    my @sents = $self->get_sentences();
    foreach my $i (0 .. $#sents) {
        my %features = $self->get_sentence_features($i);
        foreach my $name (keys %features) {
            $self->remove_sentence_feature($i, $name);
        }
    }
}

# Remove feature $name from sentence $index. Returns 1 when something was
# removed, undef otherwise.
sub remove_sentence_feature {
    my $self  = shift;
    my $index = shift;
    my $name  = shift;

    return undef unless ($self->sentence_index_in_range($index));
    return undef unless exists $self->{sent_feats}->[$index]->{$name};

    delete $self->{sent_feats}->[$index]->{$name};
    return 1;
}

# Compute several sentence features at once; takes name => code-ref pairs
# and calls compute_sentence_feature for each.
sub compute_sentence_features {
    my $self     = shift;
    my %features = @_;

    foreach my $name (keys %features) {
        $self->compute_sentence_feature(
            name    => $name,
            feature => $features{$name}
        );
    }
}

# Compute one feature for every sentence.
#
# Named parameters:
#   name      - feature name (required)
#   feature   - code reference (required); called with document, sentence,
#               sentence_index and a state hash shared between calls
#   normalize - when true, rescale the feature to [0, 1] afterwards
#
# A feature sub that dies or returns undef produces a warning and leaves
# that sentence's feature unset. Returns undef when name or feature is
# missing, otherwise 1 (or the result of normalize_sentence_feature).
sub compute_sentence_feature {
    my $self   = shift;
    my %params = @_;

    my ($name, $sub) = ($params{name}, $params{feature});
    return undef unless (defined $name and defined $sub);

    my $norm  = $params{normalize};
    my @sents = $self->get_sentences();
    my $state = {};

    foreach my $i (0 .. $#sents) {
        my %call_args = (
            document       => $self,
            sentence       => $sents[$i],
            sentence_index => $i,
            state          => $state
        );

        my $value;
        eval { $value = $sub->(%call_args); };
        my $error = $@;    # capture before any other call can clobber it

        my $did = $self->get_id() || "no id";
        if ($error) {
            warn "Feature $name died processing sent $i in document $did: $error";
        }
        elsif (not defined $value) {
            warn "Feature $name returned undef for sent $i in document $did";
        }
        else {
            $self->set_sentence_feature($i, $name => $value);
        }
    }

    if ($norm) {
        return $self->normalize_sentence_feature($name);
    }

    return 1;
}

# Normalize each of the named features; returns undef when no names given.
sub normalize_sentence_features {
    my $self  = shift;
    my @names = @_;

    return undef unless (scalar @names > 0);

    foreach my $name (@names) {
        $self->normalize_sentence_feature($name);
    }

    return 1;
}

# Rescale feature $name across all sentences so the minimum becomes 0 and
# the maximum 1 (all values become 1 when they are equal). Warns and
# returns undef, changing nothing, when any value is non-numeric. Returns
# undef without a name or without sentences, and 1 on success.
sub normalize_sentence_feature {
    my $self = shift;
    my $name = shift;

    return undef unless (defined $name);

    my @sents = $self->get_sentences(0);
    return undef unless (@sents > 0);

    my $min = $self->get_sentence_feature(0, $name);
    my $max = $min;

    unless (looks_like_number($max) and looks_like_number($min)) {
        warn "Can't normalize feature $name: non-numeric";
        return undef;
    }

    # First pass: validate every value and find the range.
    foreach my $i (0 .. $#sents) {
        my $value = $self->get_sentence_feature($i, $name);
        unless (looks_like_number($value)) {
            warn "Can't normalize feature $name: non-numeric";
            return undef;
        }
        $max = $value if $value > $max;
        $min = $value if $value < $min;
    }

    # Second pass: rescale.
    foreach my $i (0 .. $#sents) {
        my $value = $self->get_sentence_feature($i, $name);
        my $new_value = 1;
        unless ($max == $min) {
            $new_value = ($value - $min) / ($max - $min);
        }
        $self->set_sentence_feature($i, $name => $new_value);
    }

    return 1;
}

# Return the score of sentence $index, or undef when the index is out of
# range or no scores exist yet.
sub get_sentence_score {
    my $self  = shift;
    my $index = shift;

    my $scores = $self->{sent_scores};
    unless ($self->sentence_index_in_range($index) and defined $scores) {
        return undef;
    }

    return $scores->[$index];
}

# Return the list of sentence scores, or undef when none have been set.
# In list context "no scores" is therefore the one-element list (undef);
# use sentence_scores_computed to test for existence.
sub get_sentence_scores {
    my $self = shift;

    my $scores = $self->{sent_scores};
    if (defined $scores) {
        return @$scores;
    }
    else {
        return undef;
    }
}

# Set the score of sentence $index. The score table is created on first
# use with a 0 for every sentence. Returns 1 on success, undef when the
# index is out of range or the score is undefined.
sub set_sentence_score {
    my $self  = shift;
    my $index = shift;
    my $score = shift;

    my $scores = $self->{sent_scores};
    unless (defined $scores) {
        my @sents = $self->get_sentences();
        $scores = [ (0) x scalar @sents ];
        $self->{sent_scores} = $scores;
    }

    unless ($self->sentence_index_in_range($index) and defined $score) {
        return undef;
    }

    $scores->[$index] = $score;
    return 1;
}

# Rescale the sentence scores to [0, 1] (all 1 when they are equal).
# Returns 1 on success, undef when no scores have been set.
sub normalize_sentence_scores {
    my $self = shift;

    # get_sentence_scores yields (undef) for an unscored document, which
    # would otherwise look like one score and fabricate scores of 1.
    return undef unless defined $self->{sent_scores};

    my $total  = $self->sentence_count();
    my @scores = $self->get_sentence_scores();
    return undef unless @scores;

    my ($max, $min) = ($scores[0], $scores[0]);
    foreach my $i (0 .. $total - 1) {
        my $score = $scores[$i];
        $max = $score if $score > $max;
        $min = $score if $score < $min;
    }

    my @new_scores;
    if ($max == $min) {
        @new_scores = (1) x $total;
    }
    else {
        @new_scores = map { ($_ - $min) / ($max - $min) } @scores;
    }

    foreach my $i (0 .. $total - 1) {
        $self->set_sentence_score($i, $new_scores[$i]);
    }

    return 1;
}

# Combine each sentence's features into a score.
#
# Named parameters:
#   combiner  - code reference called with the sentence's feature hash
#   weights   - hash reference name => weight; when given, a linear
#               combination of the features replaces any combiner
#   normalize - rescale the scores afterwards; defaults to true
#
# Returns the scores, or undef (with a warning) when the combiner dies or
# returns something that is not a number, or when neither combiner nor
# weights is given.
sub score_sentences {
    my $self   = shift;
    my %params = @_;

    my $combiner  = $params{combiner};
    my $weights   = $params{weights};
    my $normalize = $params{normalize};
    $normalize = 1 unless (defined $normalize);

    return undef unless (defined $combiner or defined $weights);

    # Use a regular linear combiner if weights are specified.
    if (defined $weights) {
        $combiner = sub {
            my %features = @_;
            my $score = 0;
            foreach my $name (keys %$weights) {
                if ($features{$name}) {
                    $score += $weights->{$name} * $features{$name};
                }
            }
            return $score;
        };
    }

    my @sents = $self->get_sentences();
    foreach my $i (0 .. $#sents) {
        my %features = $self->get_sentence_features($i);

        my $score;
        eval { $score = $combiner->(%features); };

        if ($@) {
            warn "Could not combine scores: $@";
            return undef;
        }
        elsif (not defined $score) {
            warn "Could not combine scores, combiner returned undef for sent$i";
            return undef;
        }
        elsif (not looks_like_number($score)) {
            warn "Could not combine scores, combiner returned non number $score"
                . " for sent $i";
            return undef;
        }
        else {
            $self->set_sentence_score($i, $score);
        }
    }

    $self->normalize_sentence_scores() if $normalize;
    return $self->get_sentence_scores();
}

# Return the sentence scores when a score table exists (a true value in
# boolean context as long as there is at least one sentence), and an empty
# list / undef when no scores have been set.
sub sentence_scores_computed {
    my $self = shift;

    # Test the table itself: get_sentence_scores signals "unset" with a
    # one-element list, which would always be true here.
    return unless defined $self->{sent_scores};

    my @scores = @{ $self->{sent_scores} };
    return @scores;
}

# Set document-level features (name => value pairs); pairs with an
# undefined value are skipped. Returns the number stored, or undef when no
# pairs were given.
sub set_document_feature {
    my $self     = shift;
    my %features = @_;

    return undef unless (keys %features > 0);

    my $added = 0;
    foreach my $feature_name (keys %features) {
        next unless defined $features{$feature_name};
        $self->{doc_feats}->{$feature_name} = $features{$feature_name};
        $added++;
    }

    return $added;
}

# Return the document-level feature hash (empty when none are set).
sub get_document_features {
    my $self = shift;

    my $feat_ref = $self->{doc_feats} || {};
    return %$feat_ref;
}

# Return the value of document feature $name, or undef when it is not set.
sub get_document_feature {
    my $self = shift;
    my $name = shift;

    return $self->{doc_feats}->{$name};
}

# Remove every document-level feature.
sub remove_document_features {
    my $self = shift;

    my %features = $self->get_document_features();
    foreach my $name (keys %features) {
        $self->remove_document_feature($name);
    }
}

# Remove document feature $name. Returns 1 when removed, undef otherwise.
sub remove_document_feature {
    my $self = shift;
    my $name = shift;

    return undef unless exists $self->{doc_feats}->{$name};

    delete $self->{doc_feats}->{$name};
    return 1;
}

# Compute several document features at once; takes name => code-ref pairs.
sub compute_document_features {
    my $self     = shift;
    my %features = @_;

    foreach my $name (keys %features) {
        $self->compute_document_feature(
            name    => $name,
            feature => $features{$name}
        );
    }
}

# Compute one document-level feature. The code reference given as feature
# is called with document => $self. A sub that dies or returns undef
# produces a warning and leaves the feature unset. Returns undef when name
# or feature is missing, otherwise 1.
sub compute_document_feature {
    my $self   = shift;
    my %params = @_;

    my ($name, $sub) = ($params{name}, $params{feature});
    return undef unless (defined $name and defined $sub);

    my %sub_params = ( document => $self, );

    my $value;
    eval { $value = $sub->(%sub_params); };
    my $error = $@;    # capture before any other call can clobber it

    my $did = $self->get_id() || "no id";
    if ($error) {
        warn "Feature $name died processing document $did: $error";
    }
    elsif (not defined $value) {
        warn "Feature $name returned undef document $did";
    }
    else {
        $self->set_document_feature($name => $value);
    }

    return 1;
}

# Return the highest-scoring sentences as a list of hash references with
# the keys index, text, features and score.
#
# Named parameters:
#   size           - maximum number of sentences; defaults to all of them
#   preserve_order - sentences come back in document order unless this is
#                    given with a false value, in which case they come back
#                    by descending score
#
# Warns and returns undef when no scores have been computed.
sub get_summary {
    my $self   = shift;
    my %params = @_;

    unless ($self->sentence_scores_computed()) {
        warn "get_summary called on document where scores not defined";
        return undef;
    }

    my @scores = $self->get_sentence_scores();

    my $size = $params{size};
    $size = scalar @scores unless (defined $size and $size > 0);

    # Get the top scoring sentences, will sort them later. Ties are broken
    # by position so the selection is deterministic.
    my @sents = $self->get_sentences();
    my @by_score =
        sort { $scores[$b] <=> $scores[$a] or $a <=> $b } 0 .. $#scores;

    my @summary;
    foreach my $i (@by_score) {
        last if (scalar @summary == $size);
        my %feats = $self->get_sentence_features($i);
        my $sent = {
            'index'    => $i,
            'text'     => $sents[$i],
            'features' => \%feats,
            'score'    => $self->get_sentence_score($i)
        };
        push @summary, $sent;
    }

    # Return the sentences according to their original position in the
    # document (unless the programmer explicitly says not to preserve
    # the order).
    if (defined $params{preserve_order} and not $params{preserve_order}) {
        return @summary;
    }

    return sort { $a->{'index'} <=> $b->{'index'} } @summary;
}

# Return the number of sentences when feature $name is defined and numeric
# on every sentence, 0 otherwise (also 0 for a document without sentences).
sub is_numeric_feature {
    my $self = shift;
    my $name = shift;

    my @sents = $self->get_sentences();
    foreach my $i (0 .. $#sents) {
        my $value = $self->get_sentence_feature($i, $name);
        unless (defined $value and looks_like_number($value)) {
            return 0;
        }
    }

    return scalar @sents;
}

# ***DEPRECATED (J. DePeri): use either Clair::Document::split_into_words, or Clair::Utils::TFIDFUtils::split_words
# Plain-function wrapper (not a method): forwards the text and the
# punctuation flag to Clair::Utils::TFIDFUtils::split_words and returns
# whatever that returns.
sub split_words {
    my ($text, $punc) = @_;

    return Clair::Utils::TFIDFUtils::split_words($text, $punc);
}

### The following old subroutine has been superseded by the above (J. DePeri)
=pod

sub split_words_deprecated {
    my $text = shift;
    my @words = split /\s|\,|\-|\(|\)|¡@|¡]|¡\^|¡A|¡B|¡C|¡u|¡m|¡n|¡F|¡þ|¡v|¡G|¡H|¡S|¡T|¡I|\?|\!|¡§|¡¨|¡y|¡z|\./, $text;
    my @ret = ();
    foreach my $w (@words) {
        next if $w =~ /^$/;
        push @ret, $w;
    }
    return @ret;
}

=cut

### The following two functions seem to not be used by anything. Warning.
# Plain function: return a copy of the string without leading and trailing
# whitespace.
sub trim {
    my ($text) = @_;

    for ($text) {
        s/^\s+//;
        s/\s+$//;
    }

    return $text;
}

# Plain function: return a copy of the string with all whitespace removed.
sub remove_whitespace {
    my ($text) = @_;

    $text =~ s/\s+//g;
    return $text;
}

=head1 NAME

Clair::Document - Document Class for the CLAIR Library

=head1 VERSION

Version 1.02

=cut

our $VERSION = '1.02';

=head1 SYNOPSIS

This module is one of the core modules for the CLAIR library. The Document
holds all of the text of a file. Different operations such as stemming,
stripping html, word counting, among others can be performed on a Document.

=head1 METHODS

=cut

=head2 new

    $docref = new Clair::Document(string => 'Document text',
                                  type => 'text',
                                  id => 'doc',
                                  class => 'label');

Creates a new document from a filename or string and assigns it the specified
class label. First argument is either "string" or "file" to identify which
method will be used. If "string" is used then the second argument should be a
string representing the full text or html content of the file. If "file" is
used then the second argument should be the filename to be used for input.
The filename for a "string" input is undefined. Use 'set_filename()' to
define this parameter.

=cut

=head2 count_words

Counts the number of words contained within the text of the document. In
order to use properly first instantiate a Document object with a file or a
string, then call this method on it.

=cut

=head2 print

    print

Prints the contents of Document to standard output.

=cut

=head2 save

    save(file => 'out.txt', type => 'text')

Saves the contents of Document to a file.

=cut

=head2 strip_html

    strip_html

Removes all tags from html of Document. Resulting string is saved as the
text of Document, then returned.

=cut

=head2 get_sent

    get_sent

Deprecated. Use get_sentences instead. Returns sentences of the document.

=cut

=head2 get_sentences

Returns the sentences of the document.

=cut

=head2 get_id

    get_id

Returns the id of Document.

=cut

=head2 get_class

    $class = $docref->get_class()

Returns the class label of Document (for use in text classification).

=cut

=head2 get_parent_document

    get_parent_document

Returns the parent document of the document. Used if the document is a
sentence or line taken from another document to allow backtracking to the
original document.

=cut

=head2 set_parent_document

    set_parent_document

Sets the parent document of the document. Used if the document is a sentence
or line taken from another document to allow backtracking to the original
document.

=cut

=head2 split_into_words

Returns the list of words in the document. Defaults to the text of the
document but can be set to stem or html by passing an optional type argument:

    split_into_words(type => 'stem')

=cut

=head2 get_unique_words(type => 'stem')

Returns a list of unique words in the document. Defaults to extracting these
words from the stemmed version of the document, but can be set to text or
html by passing an optional type argument:

    get_unique_words(type => 'stem')

=cut

=head2 set_id

    set_id(id => 'new_id')

Sets the id of Document.

=cut

=head2 set_class

    $docref->set_class('label')

Sets the class label of Document.

=cut

=head2 get_xml

Returns the xml value of a document.

=cut

=head2 get_text

Returns the text value of a document.

=cut

=head2 get_html

Returns the html value of a document.

=cut

=head2 get_stem

    get_stem()

Returns the stemmed version of the Document. If the text has not already
been stemmed, it will first call stem() and then save and return the results.

=cut

=head2 stem

Stems the Document text.

=cut

=head2 stem_keep_newlines

    stem_keep_newlines

Stems the document, but without removing the newlines. This is needed by
some methods to track where a word came from or to treat lines individually.
Saves the result as the stemmed version of the document, then returns it.

=cut

=head2 split_into_lines

    split_into_lines()

Splits the document into an array at newlines.

=cut

=head2 split_into_sentences

    split_into_sentences()

Splits the document into an array of sentences. The segmenter is selected by
the $SENTENCE_SEGMENTER_TYPE configuration variable: "Text" uses
Clair::SentenceSegmenter::Text and "MxTerminator" uses
Clair::SentenceSegmenter::MxTerminator. Any other value falls back to the
Text segmenter and issues a warning.

=cut

=head2 filter_sents

filter_sents( matches => "regex" )
filter_sents( test => $sub )

Applies a filter to the sentences in this document and returns a new
Clair::Document containing the sentences that passed the filter. The filter
can either be a regular expression (with the matches parameter) or a
subroutine reference (with the test parameter). The id of the new document
will be the same as the original document (if the original id is defined).

=cut

=head2 xml_to_text

xml_to_text

Converts an XML document to text.

=head2 tf

tf( type => "stem" )

Splits the document into terms of the given type, then returns a hash
containing the term frequencies.

=cut

=head2 sentence_count

Returns the total number of sentences in this document.

=cut

=head2 sentence_index_in_range($i)

Returns true if there is a sentence with index $i, false otherwise. Sentence
indexing starts at 0.

=cut

=head2 set_sentence_feature($i, %features)

Sets the given features for sentence with index $i. Returns undef if $i isn't
in the sentence range or if no features are given. Otherwise returns the
number of features added to the given sentence. %features should be a hash
mapping names to values. For example,

    set_sentence_feature(1, f1 => 1, f2 => 0.5, f3 => "red")

sets those features to the second sentence.

=cut

=head2 get_sentence_features($i)

Returns a hash mapping the feature names to values of the given sentence.
Returns undef if the sentence index is out of range.

=cut

=head2 get_sentence_feature($i, $name)

Returns the value of the given feature for the given sentence. Returns undef
if the index is out of range or if the feature isn't defined for the sentence.

=cut

=head2 remove_sentence_features()

Removes all features from every sentence.

=cut

=head2 remove_sentence_feature($i, $name)

Removes the given feature from the given sentence. Returns true if successfully
removed, returns undef otherwise.

=cut

=head2 compute_sentence_features( f1 => $subref1, f2 => $subref2, ... )

Computes the specified features for each sentence in the document by calling
$self->compute_sentence_feature(fN => $subrefN) for each feature.

=cut

=head2 compute_sentence_feature( name => $name, feature => $subref, normalize => 1 )

Computes the given feature for each sentence in the document. The feature
parameter should be a reference to a subroutine. The subroutine will be
called with the following parameters defined:

=over 4

=item * document - A reference to the document object

=item * sentence - The sentence text

=item * sentence_index - The index of the sentence

=item * state - A hash reference that is kept in memory between calls to the subroutine. This lets $subref save precomputed values or keep track of inter-sentence relationships.

=back

A feature subroutine should return a value. Any exceptions thrown by the
feature subroutine will be caught and a warning will be shown. If a feature
subroutine returns an undefined value, the feature will not be set and a
warning will be shown. This method returns undef if either name or feature
are not defined.

The normalize parameter, if set to a true value, will scale the values of this
feature so that the minimum value is 0 and the maximum value is 1. Nothing
will happen if any of the feature values are non-numeric.

=cut

=head2 normalize_sentence_features(@names)

Scales the given features so that the minimum value is 0 and the maximum
value is 1 for each feature.

=cut

=head2 normalize_sentence_feature($name)

Scales the values of the given feature so that the minimum value is 0 and
the maximum value is 1. Nothing will happen if any of the feature values are
non-numeric.

=cut

=head2 compute_sentence_features( %features );

Computes a set of features on each sentence. %features should be a hash
mapping names to sub references. See compute_sentence_feature for more
information.

=cut

=head2 get_sentence_score($i)

Returns the score of the sentence with index $i. Returns undef if $i is out
of range or if the score has not been defined yet.

=cut

=head2 get_sentence_scores()

Returns an array of the sentence scores. If the scores haven't been set,
returns undef.

=cut

=head2 set_sentence_score