| Package variables | General documentation | Methods |
| Package variables | Top |
| No package variables defined. |
| Included modules | Top |
| Clair::Config |
| Clair::Document |
| Clair::Network |
| Clair::Network::Centrality::LexRank |
| MEAD::SimRoutines |
| Scalar::Util qw ( looks_like_number ) |
| lib "$MEAD_HOME/lib" |
| Synopsis | Top |
| Description | Top |
| Methods | Top |
| build_idf | No description | Code |
| classes | No description | Code |
| compute_binary_cosine | No description | Code |
| compute_cosine_matrix | No description | Code |
| compute_genprob_matrix | No description | Code |
| compute_lexrank | No description | Code |
| compute_sentence_feature | No description | Code |
| compute_sentence_features | No description | Code |
| count_elements | No description | Code |
| create_genprob_network | No description | Code |
| create_hyperlink_network_from_array | No description | Code |
| create_hyperlink_network_from_file | No description | Code |
| create_lexical_network | No description | Code |
| create_network | No description | Code |
| create_sentence_based_cluster | No description | Code |
| create_sentence_based_network | No description | Code |
| docterm_matrix | No description | Code |
| documents | No description | Code |
| documents_by_class | No description | Code |
| get | No description | Code |
| get_class | No description | Code |
| get_id | No description | Code |
| get_largest_cosine | No description | Code |
| get_sentence_feature | No description | Code |
| get_sentence_features | No description | Code |
| get_unique_words | No description | Code |
| has_document | No description | Code |
| insert | No description | Code |
| load_documents | No description | Code |
| load_file_list_array | No description | Code |
| load_file_list_from_file | No description | Code |
| load_lines_from_file | No description | Code |
| new | No description | Code |
| normalize_sentence_feature | No description | Code |
| normalize_sentence_features | No description | Code |
| normalize_sentence_scores | No description | Code |
| remove_sentence_features | No description | Code |
| save_documents_to_directory | No description | Code |
| save_documents_to_file | No description | Code |
| score_sentences | No description | Code |
| set_class | No description | Code |
| set_id | No description | Code |
| set_sentence_feature | No description | Code |
| stem_all_documents | No description | Code |
| strip_all_documents | No description | Code |
| tf | No description | Code |
| write_cos | No description | Code |
| build_idf | description | prev | next | Top |
# Build an IDF (inverse document frequency) table for the cluster's documents
# and store it in a DBM file.
#
# Arguments:
#   $dbm_file - path handed to dbmopen(); existing contents are wiped.
#   type      - (named, optional) 'html', 'text' or 'stem'; defaults to 'text'.
#               Any other value makes the method die.
#
# NOTE: this listing is truncated - the closing braces of the final loop and of
# the sub, plus whatever follows (e.g. closing the DBM file), are not visible.
sub build_idf
{ my $self = shift;
my $dbm_file = shift;
my %parameters = @_;
my $type = 'text';
if (exists $parameters{type}) {
$type = $parameters{type};
}
if ($type ne 'html' and $type ne 'text' and $type ne 'stem') {
die "Type must be 'html, 'text', or 'stem'.";
}
my %token_hash;
# Bind %token_hash to the DBM file, then empty it so we start from scratch.
# NOTE(review): the return value of dbmopen is not checked here.
dbmopen(%token_hash, $dbm_file, 0666);
%token_hash = ();
my $count = 0;
my %documents = %{ $self->{documents} };
foreach my $doc (values %documents) {
$count++;
print "Looking at document $count\n";
my @words = $doc->split_into_words(type => $type);
# Words already counted for this document; each word counts once per document.
my %looked = ();
foreach my $w (@words) {
# Strip a leading backslash followed by digits, then leading and trailing
# punctuation, before counting the word.
$w =~ s/^\\[0-9]+//;
$w =~ s/^[\.\"\-\_\+\\\`\~\!\&\(\)\[\]\{\}\'\;\:\&\*\?\,]+//;
$w =~ s/[\.\"\-\_\+\\\`\~\!\&\(\)\[\]\{\}\'\;\:\&\*\?\,]+$//;
if ($w =~ /^\s*$/ || exists $looked{$w}) { next; }
if ($token_hash{$w} and $token_hash{$w} > 0) {
$token_hash{$w}++;
} else {
$token_hash{$w} = 1;
}
$looked{$w}++;
}
}
# Replace each document count df with log((N + 1) / (0.5 + df)), where N is
# the number of documents visited above.
foreach my $w (keys %token_hash) {
if (0.5+$token_hash{$w} != 0) {
$token_hash{$w} = log(($count+1)/(0.5+$token_hash{$w}));} |
| classes | description | prev | next | Top |
# Tally the class labels of the documents in this cluster.
#
# Returns a hash (as a flat list) mapping class label => number of documents
# carrying that label. Documents whose get_class() is undef are not counted.
sub classes {
    my ($self) = @_;

    my $docs = $self->documents();
    my %count_for;
    for my $doc_id (keys %$docs) {
        my $label = $docs->{$doc_id}->get_class();
        next unless defined $label;
        $count_for{$label} += 1;
    }

    return %count_for;
}
| compute_binary_cosine | description | prev | next | Top |
# Apply a threshold to the cluster's cosine matrix.
#
# Arguments:
#   $threshold - minimum cosine value an entry needs in order to be kept.
#
# Returns a hash (as a flat list) of the form doc1 => { doc2 => value }.
# Entries whose cosine is >= $threshold keep their original cosine value
# (they are not forced to 1, despite the method name); all others become 0.
#
# The matrix stored in $self->{cosine_matrix} is used when it is set;
# otherwise compute_cosine_matrix() is called with its defaults.
sub compute_binary_cosine {
    my ($self, $threshold) = @_;

    my %cos_matrix = $self->{cosine_matrix}
        ? %{ $self->{cosine_matrix} }
        : $self->compute_cosine_matrix();

    my %thresholded;
    for my $doc1_key (keys %cos_matrix) {
        my $row = $cos_matrix{$doc1_key} || {};

        # Every document gets a real (possibly empty) row hash. The previous
        # "$retHash{$key} = ()" stored undef for documents without entries.
        my %kept;
        for my $doc2_key (keys %$row) {
            my $cosine = $row->{$doc2_key};
            $kept{$doc2_key} = $cosine >= $threshold ? $cosine : 0;
        }
        $thresholded{$doc1_key} = \%kept;
    }

    return %thresholded;
}
| compute_cosine_matrix | description | prev | next | Top |
# Compute the pairwise cosine matrix for the documents in this cluster.
#
# Named arguments:
#   text_type - which text representation to use; defaults to "stem".
#
# NOTE: this listing is truncated shortly after the outer loop starts, so the
# actual similarity computation and the return value are not visible here.
sub compute_cosine_matrix
{ my $self = shift;
my %parameters = @_;
my $text_type = "stem";
if (exists $parameters{text_type}) {
$text_type = $parameters{text_type};
}
my %documents = %{ $self->{documents} };
my $i = 0;
my $j = 0;
my $counter = 0;
my %cos_hash = ();
# Create one entry per document up front.
# NOTE(review): assigning () to a hash element stores undef, not an empty
# hash reference - confirm whether {} was intended.
foreach my $doc_key (keys %documents) {
$cos_hash{$doc_key} = ();
}
my $size = scalar(keys %documents);
foreach my $doc1_key (keys %documents) {
$i = 0;
$j++;
# my %doc1_hash = ();} |
| compute_genprob_matrix | description | prev | next | Top |
# Compute a generation-probability matrix for the cluster.
#
# Named arguments:
#   genprob - falls back to $GENPROB when missing or false.
#             NOTE(review): $GENPROB is not defined in this listing;
#             presumably it comes from Clair::Config - confirm.
#
# NOTE: this listing is truncated right after the setup code; the part that
# writes the term-frequency file and produces the result is not visible.
sub compute_genprob_matrix
{
my $self = shift;
my %params = @_;
$params{genprob} = $GENPROB unless $params{genprob};
# Give every unique word in the cluster a sequential integer index.
my %word_map;
my $i = 0;
foreach my $word ($self->get_unique_words()) {
$word_map{$word} = $i++;
}
my %docmap;
my %total_freq;
my $docsref = $self->{documents};
# Write the term frequency file} |
| compute_lexrank | description | prev | next | Top |
# Compute LexRank over the cluster's cosine matrix.
#
# Named arguments:
#   cutoff - cosine cutoff; defaults to 0.15. A false value (including 0)
#            leaves the default in place.
#   type   - forwarded to compute_cosine_matrix() when no matrix is cached.
#
# NOTE: this listing is truncated inside the "unless" block; the rest of the
# computation and the return value are not visible.
sub compute_lexrank
{
my $self = shift;
my %params = @_;
my $cutoff = 0.15;
$cutoff = $params{cutoff} if $params{cutoff};
my $matrix = $self->{cosine_matrix};
my $cmatrix = {};
# No cached matrix: build one now.
# NOTE(review): this passes "type", but the visible part of
# compute_cosine_matrix() only reads "text_type" - confirm the key name.
unless ($matrix) {
my %m = $self->compute_cosine_matrix( type => $params{type} );
$matrix =\% m;} |
| compute_sentence_feature | description | prev | next | Top |
# Compute one named feature for every sentence of every document in the
# cluster and store it on the owning document.
#
# Named arguments:
#   name      - feature name (required).
#   feature   - code reference (required). It is called once per sentence with
#               the named arguments document, sentence, sentence_index,
#               cluster and state, where state is a hash reference shared by
#               all calls made during this run.
#   normalize - when true, the result of normalize_sentence_feature($name) is
#               returned instead of 1.
#
# Returns undef when name or feature is missing, otherwise 1 (or the
# normalization result). A feature that dies or returns undef for a sentence
# only triggers a warning; that sentence is skipped and processing continues.
sub compute_sentence_feature {
    my ($self, %params) = @_;

    my ($name, $sub) = @params{qw(name feature)};
    my $norm = $params{normalize};

    # Kept as "return undef" (not a bare return) so list-context callers see
    # exactly what they saw before.
    return undef unless defined $name and defined $sub;

    my $docs  = $self->documents();
    my $state = {};

    foreach my $did (keys %$docs) {
        my $doc   = $docs->{$did};
        my @sents = $doc->get_sentences();

        foreach my $i (0 .. $#sents) {
            my %call_args = (
                document       => $doc,
                sentence       => $sents[$i],
                sentence_index => $i,
                cluster        => $self,
                state          => $state,
            );

            # Capture $@ immediately: any later call could overwrite it.
            my $value;
            my $ok    = eval { $value = $sub->(%call_args); 1 };
            my $error = $@;

            # $did is the id the document is stored under in this cluster.
            # The old code shadowed it with the cluster's own id, so warnings
            # named the wrong thing.
            if (!$ok) {
                warn "Feature $name died processing $i in document $did: $error";
            }
            elsif (not defined $value) {
                warn "Feature $name returned undef for sent $i in doc $did";
            }
            else {
                $doc->set_sentence_feature($i, $name => $value);
            }
        }
    }

    if ($norm) {
        return $self->normalize_sentence_feature($name);
    }
    return 1;
}
| compute_sentence_features | description | prev | next | Top |
# Compute several sentence features in one call.
#
# Arguments: a list of (feature name => code reference) pairs. Each pair is
# handed to compute_sentence_feature() as its name/feature arguments; no
# normalization is requested.
sub compute_sentence_features {
    my ($self, %features) = @_;

    for my $feature_name (keys %features) {
        my $code = $features{$feature_name};
        $self->compute_sentence_feature(
            name    => $feature_name,
            feature => $code,
        );
    }
}
| count_elements | description | prev | next | Top |
# Return the number of documents currently stored in the cluster.
sub count_elements {
    my ($self) = @_;

    my $docs  = $self->{documents};
    my $total = keys %$docs;    # scalar assignment yields the key count

    return $total;
}
| create_genprob_network | description | prev | next | Top |
# Create a network from the generation-probability matrix.
# Takes named arguments (collected in %params).
#
# NOTE: this listing is truncated after the argument unpacking; according to
# the remaining comment, the body builds a regular cosine network using the
# genprob matrix, but that code is not visible here.
sub create_genprob_network
{my $self = shift; my %params = @_; # Just create a regular cosine network using the genprob matrix} |
| create_hyperlink_network_from_array | description | prev | next | Top |
# Build a Clair::Network from a list of hyperlinks between cluster documents.
#
# Arguments:
#   $hyperlinks_ref - array reference of [ $from_id, $to_id ] pairs.
#   property        - (named, optional) attribute name that is set to 1 on
#                     every link; defaults to 'pagerank_transition'.
#
# For a link between two different ids, both nodes are added if missing (with
# the cluster document attached as "document"), then an edge carrying the
# property is added. For a self link the node is added and the property is
# set on the vertex instead.
#
# Returns the new Clair::Network object.
sub create_hyperlink_network_from_array {
    my ($self, $hyperlinks_ref, %parameters) = @_;
    my @hyperlinks = @$hyperlinks_ref;

    my $property = exists $parameters{property}
        ? $parameters{property}
        : 'pagerank_transition';

    # Direct method call: the indirect form "new Clair::Network" is parsed
    # heuristically and is fragile in a package that has its own new().
    my $network = Clair::Network->new();

    foreach my $link (@hyperlinks) {
        my ($u_id, $v_id) = @$link;
        my $u = $self->get($u_id);
        my $v = $self->get($v_id);

        if (not $network->has_node($u_id)) {
            $network->add_node($u_id, document => $u);
        }

        if ($u_id ne $v_id) {
            if (not $network->has_node($v_id)) {
                $network->add_node($v_id, document => $v);
            }
            $network->add_edge($u_id, $v_id);
            $network->set_edge_attribute($u_id, $v_id, $property, 1);
        }
        else {
            # Self link: recorded as a vertex attribute, not as an edge.
            $network->add_node($u_id);
            $network->set_vertex_attribute($u_id, $property, 1);
        }
    }

    return $network;
}
| create_hyperlink_network_from_file | description | prev | next | Top |
# Build a hyperlink network from a file of links.
#
# Arguments:
#   $filename - file with one link per line, "<from> <to>". The split happens
#               at the last single space on the line; lines that do not
#               match are skipped.
#   remaining named arguments are passed through unchanged to
#   create_hyperlink_network_from_array().
#
# Dies when the file cannot be opened. Returns whatever
# create_hyperlink_network_from_array() returns.
sub create_hyperlink_network_from_file {
    my ($self, $filename, %parameters) = @_;

    # Three-argument open with a lexical handle: the file name is never
    # interpreted as a mode, and no global FILE handle is left behind.
    open(my $links_fh, '<', $filename) or die "Coudln't open $filename: $!";

    my @hyperlink_array;
    # Read into a lexical so the caller's $_ is left untouched.
    while (my $line = <$links_fh>) {
        next unless $line =~ m/(.+) (.+)/;
        push @hyperlink_array, [ $1, $2 ];
    }
    close($links_fh);

    return $self->create_hyperlink_network_from_array(\@hyperlink_array, %parameters);
}
| create_lexical_network | description | prev | next | Top |
# Build a lexical (word-level) network from the sentences of all documents in
# the cluster. Takes named arguments (collected in %params).
#
# NOTE: this listing is truncated inside the sentence loop; how the words are
# connected and what is returned is not visible here.
sub create_lexical_network
{ my $self = shift;
my %params = @_;
my $docs = $self->documents();
my %word_hash = ();
foreach my $did (keys %$docs) {
my $doc = $docs->{$did};
my @sents = $doc->get_sentences();
foreach my $sent (@sents) {
# @sent_list collects the distinct lowercased words of this sentence in
# order of first appearance; %seen is used to skip repeats.
my @sent_list = ();
my %seen = ();
chomp $sent;
my @words = split(/\s+/, $sent);
foreach my $word (@words) {
$word = lc $word;
if (not defined $seen{$word}) {
push(@sent_list, $word);
$seen{$word} = 1;
}
}
# We now have a hash of words in the sentence} |
| create_network | description | prev | next | Top |
# Create a Clair::Network from a cosine matrix.
#
# Named arguments:
#   cosine_matrix - hash reference doc1 => { doc2 => cosine }. When absent,
#                   the matrix stored in $self->{cosine_matrix} is used; the
#                   method dies when neither is available.
#   include_zeros - set to 1 to request zero entries as well; defaults to 0.
#   property      - attribute name used for the links; defaults to
#                   'lexrank_transition'.
#
# NOTE: this listing is truncated right after the network object is created;
# the code that adds the edges and returns the network is not visible here.
sub create_network
{ my $self = shift;
my %parameters = @_;
my %cos_matrix = ();
if (exists $parameters{cosine_matrix}) {
%cos_matrix = %{ $parameters{cosine_matrix} };
} elsif (exists $self->{cosine_matrix}) {
# Dereference the stored matrix. Assigning the reference itself would give
# a hash whose single key is the stringified reference.
%cos_matrix = %{ $self->{cosine_matrix} };
} else {
die "Must specify cosine matrix.";
}
my $include_zeros = 0;
if (exists $parameters{include_zeros} && $parameters{include_zeros} == 1) {
$include_zeros = 1;
}
my $property = 'lexrank_transition';
if (exists $parameters{property}) {
$property = $parameters{property};
}
my $network = Clair::Network->new();
# Add the edges to the graph} |
| create_sentence_based_cluster | description | prev | next | Top |
# Create a new cluster in which every sentence of every document of this
# cluster is a document of its own.
#
# Each sentence document is a 'text' Clair::Document whose id is the parent
# document's id followed by the 1-based sentence number, and whose parent
# document is set to the document it came from.
#
# Returns the new Clair::Cluster.
sub create_sentence_based_cluster {
    my ($self) = @_;

    my %documents        = %{ $self->{documents} };
    my $sentence_cluster = Clair::Cluster::->new();

    for my $parent (values %documents) {
        my @sentences = $parent->split_into_sentences;
        my $parent_id = $parent->get_id;

        for my $idx (0 .. $#sentences) {
            my $sentence_id  = $parent_id . ($idx + 1);
            my $sentence_doc = Clair::Document::->new(
                type   => 'text',
                string => "$sentences[$idx]",
                id     => "$sentence_id",
            );
            $sentence_doc->set_parent_document($parent);
            $sentence_cluster->insert($sentence_id, $sentence_doc);
        }
    }

    return $sentence_cluster;
}
| create_sentence_based_network | description | prev | next | Top |
# Build a network whose nodes are the individual sentences of the cluster's
# documents.
#
# Named arguments:
#   threshold     - when present and non-zero, the matrix is replaced by the
#                   result of compute_binary_cosine($threshold) on the
#                   sentence cluster.
#   include_zeros - set to 1 to pass include_zeros => 1 to create_network();
#                   defaults to 0.
#
# Returns the network produced by create_network() on the sentence cluster.
#
# NOTE(review): the matrix is first computed with text_type => 'text', but
# compute_binary_cosine() uses the cluster's stored matrix or recomputes one
# with default settings - confirm both paths use the same text type.
sub create_sentence_based_network {
    my ($self, %params) = @_;

    # The sentence cluster is built from $self->{documents} by the helper; the
    # local copy of the whole documents hash made here before was never used.
    my $c = $self->create_sentence_based_cluster();

    my %cos_hash = $c->compute_cosine_matrix(text_type => 'text');
    if (exists $params{threshold} and $params{threshold} != 0) {
        %cos_hash = $c->compute_binary_cosine($params{threshold});
    }

    my $include_zeros = 0;
    if (exists $params{include_zeros} and $params{include_zeros} == 1) {
        $include_zeros = 1;
    }

    return $c->create_network(
        cosine_matrix => \%cos_hash,
        include_zeros => $include_zeros,
    );
}
| docterm_matrix | description | prev | next | Top |
# Build a document-by-term frequency matrix for the cluster.
#
# Named arguments:
#   type - text type passed to get_unique_words() and to each document's
#          tf(); defaults to "stem" when missing or false.
#
# Returns a list of three array references:
#   1. the matrix: one row per document (in sorted id order), one column per
#      word (in sorted word order); words absent from a document count as 0
#   2. the sorted document ids
#   3. the sorted unique words
sub docterm_matrix {
    my ($self, %params) = @_;
    my $type = $params{type} || "stem";

    my $docs  = $self->documents();
    my @terms = $self->get_unique_words(type => $type);
    @terms = sort @terms;
    my @doc_ids = sort keys %$docs;

    my @rows;
    for my $doc_id (@doc_ids) {
        my %freq_of = $docs->{$doc_id}->tf(type => $type);
        push @rows, [ map { $freq_of{$_} || 0 } @terms ];
    }

    return (\@rows, \@doc_ids, \@terms);
}
| documents | description | prev | next | Top |
# Accessor: return the hash reference (document id => document) that holds
# the cluster's documents. The reference itself is returned, not a copy.
sub documents {
    my ($self) = @_;
    my $docs = $self->{documents};
    return $docs;
}
| documents_by_class | description | prev | next | Top |
# Group the cluster's document ids by class label.
#
# Returns a hash (as a flat list) of the form class label => { id => 1, ... }.
# Documents whose get_class() is undef are left out.
sub documents_by_class {
    my ($self) = @_;

    my $docs = $self->documents();
    my %ids_for;
    for my $doc_id (keys %$docs) {
        my $label = $docs->{$doc_id}->get_class();
        next if !defined $label;
        $ids_for{$label}{$doc_id} = 1;
    }

    return %ids_for;
}
| get | description | prev | next | Top |
# Look up a document by id.
#
# Returns the document stored under $id, or undef when there is none.
sub get {
    my ($self, $id) = @_;
    my $store = $self->{documents};
    return $store->{$id};
}
| get_class | description | prev | next | Top |
# Return the class label of the document stored under $id.
# The document is fetched with get() and asked for its get_class().
sub get_class {
    my ($self, $id) = @_;
    my $doc = $self->get($id);
    return $doc->get_class();
}
| get_id | description | prev | next | Top |
# Accessor: return the cluster's own id (undef when none has been set).
sub get_id {
    my ($self) = @_;
    my $id = $self->{id};
    return $id;
}
| get_largest_cosine | description | prev | next | Top |
# Find the largest entry of a cosine matrix.
#
# Named arguments:
#   cosine_matrix - hash reference doc1 => { doc2 => cosine }. When absent,
#                   the matrix stored in $self->{cosine_matrix} is used; the
#                   method dies when neither is available.
#
# Returns a hash (as a flat list) with the keys 'value', 'key1' and 'key2':
# the largest cosine and the two document keys it was found under. For an
# empty matrix the result is value => -1 with two empty-string keys.
sub get_largest_cosine {
    my ($self, %parameters) = @_;

    my %cos_matrix;
    if (exists $parameters{cosine_matrix}) {
        %cos_matrix = %{ $parameters{cosine_matrix} };
    }
    elsif (exists $self->{cosine_matrix}) {
        %cos_matrix = %{ $self->{cosine_matrix} };
    }
    else {
        die "Must specify cosine matrix.";
    }

    my %best = (value => -1, key1 => '', key2 => '');
    for my $row_key (keys %cos_matrix) {
        my $row = $cos_matrix{$row_key};
        for my $col_key (keys %$row) {
            # Only a strictly larger value replaces the current best.
            next unless $best{value} < $row->{$col_key};
            @best{qw(value key1 key2)} = ($row->{$col_key}, $row_key, $col_key);
        }
    }

    return %best;
}
| get_sentence_feature | description | prev | next | Top |
# Fetch one named feature of one sentence of a document in the cluster.
#
# Arguments:
#   $did  - document id
#   $sno  - sentence number, passed through to the document
#   $name - feature name
#
# Returns whatever the document's get_sentence_feature($sno, $name) returns,
# or undef when has_document($did) is false.
sub get_sentence_feature {
    my ($self, $did, $sno, $name) = @_;
    my $docs = $self->documents();

    return undef unless $self->has_document($did);

    return $docs->{$did}->get_sentence_feature($sno, $name);
}
| get_sentence_features | description | prev | next | Top |
# Fetch all features of one sentence of a document in the cluster.
#
# Arguments:
#   $did - document id
#   $sno - sentence number, passed through to the document
#
# Returns whatever the document's get_sentence_features($sno) returns, or
# undef when has_document($did) is false.
sub get_sentence_features {
    my ($self, $did, $sno) = @_;
    my $docs = $self->documents();

    return undef unless $self->has_document($did);

    return $docs->{$did}->get_sentence_features($sno);
}
| get_unique_words | description | prev | next | Top |
# Collect the distinct words used across all documents of the cluster.
#
# All named arguments are passed through unchanged to each document's own
# get_unique_words().
#
# Returns the list of distinct words (in no particular order); in scalar
# context this is the number of distinct words.
sub get_unique_words {
    my ($self, %params) = @_;

    my %seen;
    my $docs = $self->{documents};
    for my $doc_id (keys %$docs) {
        my @doc_words = $docs->{$doc_id}->get_unique_words(%params);
        @seen{@doc_words} = (1) x @doc_words;
    }

    return keys %seen;
}
| has_document | description | prev | next | Top |
# Check whether a document is stored under $id.
#
# Returns the stored value itself (so a true value when a document is
# present), or undef when nothing is stored under that id.
sub has_document {
    my ($self, $id) = @_;
    my $found = $self->{documents}->{$id};
    return $found;
}
| insert | description | prev | next | Top |
# Store a document in the cluster under the given id, replacing any document
# already stored under that id.
#
# Arguments:
#   $id       - key to store the document under
#   $document - the document to store
#
# Returns the stored document (the value of the assignment).
sub insert {
    my ($self, $id, $document) = @_;
    my $store = $self->{documents};
    return $store->{$id} = $document;
}
| load_documents | description | prev | next | Top |
# Load every file matched by a shell file expression into the cluster.
#
# Arguments:
#   $document_expr - expression handed to "ls -1" (for example "docs/*.txt");
#                    every line ls prints is loaded as one document.
#   type           - (named) 'text', 'html' or 'stem'; defaults to 'text'.
#                    Any other value makes the method die.
#   filename_id    - (named) pass 0 to use running numbers as ids.
#   count_id       - (named) pass 1 to use running numbers as ids.
#   start_count    - (named) first running number; defaults to 0.
#
# By default each document's id is the file name printed by ls.
#
# Returns the counter after loading, i.e. start_count plus the number of
# documents loaded. Dies when ls cannot be started.
#
# SECURITY NOTE: $document_expr is expanded by the shell on purpose (that is
# what makes wildcards work), so it must never come from untrusted input.
sub load_documents {
    my ($self, $document_expr, %parameters) = @_;

    my $doc_type = 'text';
    if (exists $parameters{type}) {
        $doc_type = $parameters{type};
        if ($doc_type ne 'text' and $doc_type ne 'html' and $doc_type ne 'stem') {
            die "Document type must be\' html\',\' text\', or\' stem\'.";
        }
    }

    # File names are the ids unless the caller asks for running numbers.
    my $filename_id = 1;
    if (   (exists $parameters{filename_id} and $parameters{filename_id} == 0)
        or (exists $parameters{count_id} and $parameters{count_id} == 1))
    {
        $filename_id = 0;
    }

    my $count = exists $parameters{start_count} ? $parameters{start_count} : 0;

    # Lexical handle instead of the global LS: it cannot collide with other
    # code and is closed automatically if creating a document dies.
    open(my $ls_fh, '-|', "ls -1 $document_expr") or die "Could not run ls: $!";

    # Read into a lexical so the caller's $_ is left untouched.
    while (my $file = <$ls_fh>) {
        chomp $file;
        my $id  = $filename_id == 1 ? $file : $count;
        my $doc = Clair::Document->new(type => $doc_type, file => $file, id => $id);
        $self->insert($id, $doc);
        $count++;
    }
    close $ls_fh;

    return $count;
}
| load_file_list_array | description | prev | next | Top |
sub load_file_list_array
{ my $self = shift;
my $filelist_ref = shift;
my @filelist = @$filelist_ref;
my %parameters = @_;
my $doc_type = 'text';
if (exists $parameters{type}) {
$doc_type = $parameters{type};
if ($doc_type ne 'text' and $doc_type ne 'html' and $doc_type ne 'stem') {
die "Document type must be\' html\',\' text\', or\' stem.\'";
}
}
my $filename_id = 1;
if ( (exists $parameters{filename_id} and $parameters{filename_id} == 0) or
(exists $parameters{count_id} and $parameters{count_id} == 1) ) {
$filename_id = 0;
}
my $count = 0;
if (exists $parameters{start_count} ) {
$count = $parameters{start_count};
}
foreach my $file (@filelist) {
my $id;
if ($filename_id == 1) {
$id = $file;
} else {
$id = $count;
}
my $doc = new Clair::Document(type => $doc_type, file => $file, id => $id);
$self->insert($id, $doc);
$count++;
} |