Clair

Cluster


Package variablesGeneral documentationMethods

Package variablesTop
No package variables defined.

Included modulesTop
Clair::Config
Clair::Document
Clair::Network
Clair::Network::Centrality::LexRank
MEAD::SimRoutines
Scalar::Util qw ( looks_like_number )
lib " $MEAD_HOME /lib "

SynopsisTop
No synopsis!
DescriptionTop
No description!
MethodsTop
build_idfNo descriptionCode
classesNo descriptionCode
compute_binary_cosineNo descriptionCode
compute_cosine_matrixNo descriptionCode
compute_genprob_matrixNo descriptionCode
compute_lexrankNo descriptionCode
compute_sentence_featureNo descriptionCode
compute_sentence_featuresNo descriptionCode
count_elementsNo descriptionCode
create_genprob_networkNo descriptionCode
create_hyperlink_network_from_arrayNo descriptionCode
create_hyperlink_network_from_fileNo descriptionCode
create_lexical_networkNo descriptionCode
create_networkNo descriptionCode
create_sentence_based_clusterNo descriptionCode
create_sentence_based_networkNo descriptionCode
docterm_matrixNo descriptionCode
documentsNo descriptionCode
documents_by_classNo descriptionCode
getNo descriptionCode
get_classNo descriptionCode
get_idNo descriptionCode
get_largest_cosineNo descriptionCode
get_sentence_featureNo descriptionCode
get_sentence_featuresNo descriptionCode
get_unique_wordsNo descriptionCode
has_documentNo descriptionCode
insertNo descriptionCode
load_documentsNo descriptionCode
load_file_list_arrayNo descriptionCode
load_file_list_from_fileNo descriptionCode
load_lines_from_fileNo descriptionCode
newNo descriptionCode
normalize_sentence_featureNo descriptionCode
normalize_sentence_featuresNo descriptionCode
normalize_sentence_scoresNo descriptionCode
remove_sentence_featuresNo descriptionCode
save_documents_to_directoryNo descriptionCode
save_documents_to_fileNo descriptionCode
score_sentencesNo descriptionCode
set_classNo descriptionCode
set_idNo descriptionCode
set_sentence_featureNo descriptionCode
stem_all_documentsNo descriptionCode
strip_all_documentsNo descriptionCode
tfNo descriptionCode
write_cosNo descriptionCode

Methods description


None available.

Methods code


build_idfdescriptionprevnextTop
sub build_idf {
	my $self = shift;
	my $dbm_file = shift;

	my %parameters = @_;

	my $type = 'text';
	if (exists $parameters{type}) {
		$type = $parameters{type};
	}

	if ($type ne 'html' and $type ne 'text' and $type ne 'stem') {
		die "Type must be 'html, 'text', or 'stem'.";
	}

	my %token_hash;
	dbmopen(%token_hash, $dbm_file, 0666);
	%token_hash = ();

	my $count = 0;

	my %documents = %{ $self->{documents} };

	foreach my $doc (values %documents) {
		$count++;
		print "Looking at document $count\n";

		my @words = $doc->split_into_words(type => $type);
		my %looked = ();

		foreach my $w (@words) {
			$w =~ s/^\\[0-9]+//;
			$w =~ s/^[\.\"\-\_\+\\\`\~\!\&\(\)\[\]\{\}\'\;\:\&\*\?\,]+//;
			$w =~ s/[\.\"\-\_\+\\\`\~\!\&\(\)\[\]\{\}\'\;\:\&\*\?\,]+$//;

			if ($w =~ /^\s*$/ || exists $looked{$w}) { next; }
			if ($token_hash{$w} and $token_hash{$w} > 0) {
				$token_hash{$w}++;
			} else {
				$token_hash{$w} = 1;
			}

			$looked{$w}++;
		}
	}

	foreach my $w (keys %token_hash) {
		if (0.5+$token_hash{$w} != 0) {
			$token_hash{$w} = log(($count+1)/(0.5+$token_hash{$w}));
} } return %token_hash;
}

classesdescriptionprevnextTop
sub classes {
	my $self = shift;

	my $docsref = $self->documents();
	my %classes;
	foreach my $id (keys %$docsref) {
		my $class = $docsref->{$id}->get_class();
		$classes{$class}++ if (defined $class);
	}

	return %classes;
}

compute_binary_cosinedescriptionprevnextTop
sub compute_binary_cosine {
	my $self = shift;
	my $threshold = shift;

	my %cos_matrix;
	if ($self->{cosine_matrix}) {
		%cos_matrix = %{ $self->{cosine_matrix} };
	} else {
		%cos_matrix = $self->compute_cosine_matrix();
	}

	my %retHash = ();

	foreach my $doc_key (keys %cos_matrix) {
		$retHash{$doc_key} = ();
	}

	foreach my $doc1_key (keys %cos_matrix)
	{
		foreach my $doc2_key (keys %{ $cos_matrix{$doc1_key} })
		{
			if ($cos_matrix{$doc1_key}{$doc2_key} >= $threshold)
			{
				$retHash{$doc1_key}{$doc2_key} = $cos_matrix{$doc1_key}{$doc2_key};
			}
			else
			{
				$retHash{$doc1_key}{$doc2_key} = 0;
			}
		}
	}

	return %retHash;
}

compute_cosine_matrixdescriptionprevnextTop
sub compute_cosine_matrix {
	my $self = shift;

	my %parameters = @_;

	my $text_type = "stem";
	if (exists $parameters{text_type}) {
		$text_type = $parameters{text_type};
	}

  my %documents = %{ $self->{documents} };

  my $i = 0;
  my $j = 0;
  my $counter = 0;
  my %cos_hash = ();

  foreach my $doc_key (keys %documents) {
    $cos_hash{$doc_key} = ();
  }

  my $size = scalar(keys %documents);
  foreach my $doc1_key (keys %documents) {
	  $i = 0;
	  $j++;

# my %doc1_hash = ();
# $cos_hash{$doc1_key} = \%doc1_hash;
my $document1 = $documents{$doc1_key}; foreach my $doc2_key (keys %documents) { $i++; my $document2 = $documents{$doc2_key}; if ($i < $j) { my $text1 = ""; my $text2 = ""; if ($text_type eq "stem") { $text1 = $document1->get_stem; $text2 = $document2->get_stem; } elsif ($text_type eq "text") { $text1 = $document1->{text}; $text2 = $document2->{text}; } my $cos = GetLexSim($text1, $text2); $cos_hash{$doc1_key}{$doc2_key} = $cos; $cos_hash{$doc2_key}{$doc1_key} = $cos; $counter++; last if ($counter == $size*($size-1)); } } last if ($counter == $size*($size-1)); } $self->{cosine_matrix} =\% cos_hash; return %cos_hash;
}

compute_genprob_matrixdescriptionprevnextTop
sub compute_genprob_matrix {
	my $self = shift;
	my %params = @_;
	$params{genprob} = $GENPROB unless $params{genprob};

	my %word_map;
	my $i = 0;
	foreach my $word ($self->get_unique_words()) {
		$word_map{$word} = $i++;
	}

	my %docmap;
	my %total_freq;
	my $docsref = $self->{documents};

# Write the term frequency file
open TF, "> tf.temp" or die "Couldn't open file: $!"; $i = 0; foreach my $id (keys %$docsref) { my $doc = $docsref->{$id}; my %tf = $doc->tf(); $docmap{$i} = $id; #my $numwords = scalar keys %tf;
my $numwords = $doc->split_into_words( type => "stem" ); foreach my $word (keys %tf) { my $tf = $tf{$word} * $numwords; $total_freq{$word} += $tf; print TF "$i\t$word_map{$word}\t$tf\n"; } $i++; } close TF; # Write the MLE file
open MLE, "> mle.temp" or die "Couldn't open file: $!"; my $total_words; map { $total_words += $total_freq{$_} } keys %total_freq; foreach my $word (keys %total_freq) { my $val = $total_freq{$word} / $total_words;
print MLE "$word_map{$word}\t$val\n"; } close MLE; # Run the command
my $total_docs = scalar keys %docmap; my @lines = `$params{genprob} tf.temp mle.temp 1000 $total_docs $total_words`; unless (@lines) { warn "Bad genprob output"; return undef; } # remove the temp files
unlink("tf.temp") or warn "Couldn't unlink tf.temp: $!"; unlink("mle.temp") or warn "Couldn't unlink mle.temp: $!"; # Save to a matrix
my %matrix; foreach my $line (@lines) { chomp $line; my ($from, $to, $val) = split / /, $line; my $id1 = $docmap{$from}; my $id2 = $docmap{$to}; if (defined $id1 && defined $id2 && defined $val) { unless ($matrix{$id1}) { $matrix{$id1} = {}; } $matrix{$id1}->{$id2} = $val; } else { warn "Bad genprob output"; return undef; } } # Make sure matrix has zero diagonal
foreach my $id (keys %matrix) { $matrix{$id}->{$id} = 0; } return %matrix;
}

compute_lexrankdescriptionprevnextTop
sub compute_lexrank {
	my $self = shift;
	my %params = @_;

	my $cutoff = 0.15;
	$cutoff = $params{cutoff} if $params{cutoff};

	my $matrix = $self->{cosine_matrix};
	my $cmatrix = {};
	unless ($matrix) {
		my %m = $self->compute_cosine_matrix( type => $params{type} );
		$matrix =\% m;
} foreach my $k1 (keys %$matrix) { $cmatrix->{$k1} = {} unless $cmatrix->{$k1}; foreach my $k2 (keys %{$matrix->{$k1}}) { if ($matrix->{$k1}->{$k2} >= $cutoff) { $cmatrix->{$k1}->{$k2} = $matrix->{$k1}->{$k2}; } else { $cmatrix->{$k1}->{$k2} = 0; } } } my $n = $self->create_network( cosine_matrix => $cmatrix, include_zeros => 1 ); my $cent = Clair::Network::Centrality::LexRank->new($n); $cent->centrality(%params); my %scores; my @verts = $n->get_vertices(); foreach my $v (@verts) { $scores{$v} = $n->get_vertex_attribute($v, "lexrank_value"); } return %scores;
}

compute_sentence_featuredescriptionprevnextTop
sub compute_sentence_feature {
	my $self = shift;
	my %params = @_;
	my ($name, $sub) = ($params{name}, $params{feature});
	my $norm = $params{normalize};

	return undef unless defined $name and defined $sub;
	my $docs = $self->documents();

	my $state = {};

	foreach my $did (keys %$docs) {

		my $doc = $docs->{$did};
		my @sents = $doc->get_sentences();

		foreach my $i ( 0 .. $#sents ) {

			my %params = (
					document => $doc,
					sentence => $sents[$i],
					sentence_index => $i,
					cluster => $self,
					state => $state
				     );

			my $value;
			eval {
				$value = &$sub(%params);
			};

			my $did = $self->get_id() || "no id";
			if ($@) {
				warn "Feature $name died processing $i in document $did: $@";
			} elsif (not defined $value) {
				warn "Feature $name returned undef for sent $i in doc $did";
			} else {
				$doc->set_sentence_feature($i, $name => $value);
			}

		}

	}

	if ($norm) {
		return $self->normalize_sentence_feature($name);
	}

	return 1;
}

compute_sentence_featuresdescriptionprevnextTop
sub compute_sentence_features {
	my $self = shift;
	my %features = @_;

	foreach my $name (keys %features) {
		$self->compute_sentence_feature( name => $name,
				feature => $features{$name} );
	}
}

count_elementsdescriptionprevnextTop
sub count_elements {
	my $self = shift;

	my $documents_ref = $self->{documents};

	return scalar keys %$documents_ref;
}

create_genprob_networkdescriptionprevnextTop
sub create_genprob_network {
	my $self = shift;
	my %params = @_;

# Just create a regular cosine network using the genprob matrix
$params{cosine_matrix} = $params{genprob_matrix}; my $network = $self->create_network(%params); # ... but make sure to reset the diagonal to 0
foreach my $v ($network->get_vertices) { $network->set_vertex_attribute($v, "lexrank_transition", 0); } return $network;
}

create_hyperlink_network_from_arraydescriptionprevnextTop
sub create_hyperlink_network_from_array {
	my $self = shift;

	my $hyperlinks_ref = shift;
	my @hyperlinks = @$hyperlinks_ref;

	my %parameters = @_;

	my $property = 'pagerank_transition';
	if (exists $parameters{property}) {
		$property = $parameters{property};
	}

	my $network = new Clair::Network;

	foreach my $h (@hyperlinks) {
		my ($u_id, $v_id) = @$h;

		my $u = $self->get($u_id);
		my $v = $self->get($v_id);
		my $add_u = $u_id;
		my $add_v = $v_id;

		if (not $network->has_node($add_u)) {
			$network->add_node($add_u, document => $u);
		}

		if ($u_id ne $v_id) {
			if (not $network->has_node($add_v)) {
				$network->add_node($add_v, document => $v);
			}

			$network->add_edge($add_u, $add_v);
			$network->set_edge_attribute($add_u, $add_v, $property, 1);
		} else {
			$network->add_node($add_u);
			$network->set_vertex_attribute($add_u, $property, 1);
		}
	}

	return $network;
}

create_hyperlink_network_from_filedescriptionprevnextTop
sub create_hyperlink_network_from_file {
	my $self = shift;

	my $filename = shift;

	my %parameters = @_;
	my @hyperlink_array;

	open(FILE, "< $filename") or die "Coudln't open $filename: $!";

	while (<FILE>) {
		next unless m/(.+) (.+)/;

		my $u = $1;
		my $v = $2;

		my @link = ($u, $v);
		push(@hyperlink_array,\@ link);
	}

	close(FILE);

	return $self->create_hyperlink_network_from_array(\@hyperlink_array, %parameters);
}

create_lexical_networkdescriptionprevnextTop
sub create_lexical_network {
  my $self = shift;

  my %params = @_;

  my $docs = $self->documents();

  my %word_hash = ();
  foreach my $did (keys %$docs) {
    my $doc = $docs->{$did};
    my @sents = $doc->get_sentences();

    foreach my $sent (@sents) {
      my @sent_list = ();
      my %seen = ();
      chomp $sent;
      my @words = split(/\s+/, $sent);
      foreach my $word (@words) {
	$word = lc $word;
	if (not defined $seen{$word}) {
	  push(@sent_list, $word);
	  $seen{$word} = 1;
	}
      }

      # We now have a hash of words in the sentence
# Update the global counts
my $cnt = scalar @sent_list; # Loop through all the unique words in the sentence
for(my $i = 0; $i < $cnt; $i++) { my $word1 = $sent_list[$i]; if (not defined $word_hash{$word1}) { $word_hash{$word1} = (); } # Loop through all the unique words after this word
for(my $j = $i + 1; $j < $cnt; $j++) { my $word2 = $sent_list[$j]; if ($word1 ne $word2) { if (defined $word_hash{$word1}{$word2}) { $word_hash{$word1}{$word2}++; } elsif (defined $word_hash{$word2}{$word1}) { $word_hash{$word2}{$word1}++; } else { $word_hash{$word1}{$word2} = 1; } } } } } } # Convert to network
my $network = Clair::Network->new(); foreach my $word1 (keys %word_hash) { if (not $network->has_node($word1)) { $network->add_node($word1, $word1); } foreach my $word2 (keys %{$word_hash{$word1}}) { if (not $network->has_node($word2)) { $network->add_node($word2, $word2); } $network->add_edge($word1, $word2); $network->set_edge_weight($word1, $word2, $word_hash{$word1}{$word2}); } } return $network;
}

create_networkdescriptionprevnextTop
sub create_network {
	my $self = shift;

	my %parameters = @_;

	my %cos_matrix = ();
	if (exists $parameters{cosine_matrix}) {
		%cos_matrix = %{ $parameters{cosine_matrix} };
	} elsif (exists $self->{cosine_matrix}) {
		%cos_matrix = $self->{cosine_matrix};
	} else {
		die "Must specify cosine matrix.";
	}

	my $include_zeros = 0;
	if (exists $parameters{include_zeros} && $parameters{include_zeros} == 1) {
		$include_zeros = 1;
	}

	my $property = 'lexrank_transition';
	if (exists $parameters{property}) {
		$property = $parameters{property};
	}

	my $network = Clair::Network->new();

# Add the edges to the graph
# (Vertices will be added automatically)
foreach my $doc1 (keys %cos_matrix) { foreach my $doc2 (keys %{ $cos_matrix{$doc1} }) { if ($cos_matrix{$doc1}{$doc2} != 0 || $include_zeros) { if (not $network->has_node($doc1)) { $network->add_node($doc1, document => $self->get($doc1)); } if ($doc1 ne $doc2) { if (not $network->has_node($doc2)) { $network->add_node($doc2, document => $self->get($doc2)); } $network->add_edge($doc1, $doc2); $network->set_edge_attribute($doc1, $doc2, $property, $cos_matrix{$doc1}{$doc2}); } } } } # Set the cos value to 1 on the diagonal
foreach my $v ($network->get_vertices) { $network->set_vertex_attribute($v, $property, 1); } return $network;
}

create_sentence_based_clusterdescriptionprevnextTop
sub create_sentence_based_cluster {
	my $self = shift;

	my %documents = %{ $self->{documents} };

	my $c = Clair::Cluster::->new();

	foreach my $doc (values %documents) {
		my @sentences = $doc->split_into_sentences;
		my $doc_id = $doc->get_id;

		my $count = 0;

		foreach my $sent (@sentences) {
			++$count;
			my $sent_id = $doc_id . $count;
			my $new_doc = Clair::Document::->new(type => 'text', string => "$sent", id => "$sent_id");
			$new_doc->set_parent_document($doc);
			$c->insert($sent_id, $new_doc);
		}
	}

	return $c;
}

create_sentence_based_networkdescriptionprevnextTop
sub create_sentence_based_network {
	my $self = shift;

	my %documents = %{ $self->{documents} };

	my %params = @_;

	my $c = $self->create_sentence_based_cluster();

	my %cos_hash = $c->compute_cosine_matrix(text_type => 'text');

	if (exists $params{threshold} and $params{threshold} != 0) {
		my $threshold = $params{threshold};
		%cos_hash = $c->compute_binary_cosine($threshold);
	}

	my $include_zeros = 0;
	if (exists $params{include_zeros} and $params{include_zeros} == 1) {
		$include_zeros = 1;
	}

	return $c->create_network(cosine_matrix =>\% cos_hash, include_zeros => $include_zeros);
}

docterm_matrixdescriptionprevnextTop
sub docterm_matrix {
	my $self = shift;
	my %params = @_;
	my $type = $params{type} || "stem";

	my @matrix;
	my $docsref    = $self->documents();
	my @uniq_words = sort $self->get_unique_words(type => $type);
	my @docids     = sort keys %$docsref;
	foreach my $id (@docids) {
		my %doc_tf = $docsref->{$id}->tf(type => $type);
		my @vector;
		foreach my $word (@uniq_words) {
			push @vector, $doc_tf{$word} || 0;
		}
		push @matrix,\@ vector;
	}

	return (\@matrix,\@ docids,\@ uniq_words);
}

documentsdescriptionprevnextTop
sub documents {
	my $self = shift;

	return $self->{documents};
}

documents_by_classdescriptionprevnextTop
sub documents_by_class {
	my $self = shift;

	my $docsref = $self->documents();
	my %docs_by_class;
	foreach my $id (keys %$docsref) {
		my $class = $docsref->{$id}->get_class();
		if (defined $class) {
			$docs_by_class{$class}->{$id} = 1;
		}
	}

	return %docs_by_class;
}

getdescriptionprevnextTop
sub get {
	my $self = shift;
	my $id = shift;

	my $documents_ref = $self->{documents};

	return $documents_ref->{$id};
}

get_classdescriptionprevnextTop
sub get_class {
	my $self = shift;
	my $id = shift;

	return $self->get($id)->get_class();
}

get_iddescriptionprevnextTop
sub get_id {
    my $self = shift;
    return $self->{id};
}

get_largest_cosinedescriptionprevnextTop
sub get_largest_cosine {
	my $self = shift;
	my %parameters = @_;

	my %cos_matrix = ();
	if (exists $parameters{cosine_matrix}) {
		%cos_matrix = %{ $parameters{cosine_matrix} };
	}
	elsif (exists $self->{cosine_matrix}) {
		%cos_matrix = %{ $self->{cosine_matrix} };
	}
	else {
		die "Must specify cosine matrix.";
	}

	my $largest_cosine = -1;
	my $largest_key1 = '';
	my $largest_key2 = '';

	foreach my $doc1_key (keys %cos_matrix)
	{
		foreach my $doc2_key (keys %{ $cos_matrix{$doc1_key} })
		{
			if ($largest_cosine < $cos_matrix{$doc1_key}{$doc2_key})
			{
				$largest_cosine = $cos_matrix{$doc1_key}{$doc2_key};
				$largest_key1 = $doc1_key;
				$largest_key2 = $doc2_key;
			}
		}
	}

	my %retHash = ();
	$retHash{'value'} = $largest_cosine;
	$retHash{'key1'} = $largest_key1;
	$retHash{'key2'} = $largest_key2;

	return %retHash;
}

get_sentence_featuredescriptionprevnextTop
sub get_sentence_feature {
	my $self = shift;
	my $docs = $self->documents();
	my $did = shift;
	my $sno = shift;
	my $name = shift;

	if ($self->has_document($did)) {
		my $doc = $docs->{$did};
		return $doc->get_sentence_feature($sno, $name);
	} else {
		return undef;
	}
}

get_sentence_featuresdescriptionprevnextTop
sub get_sentence_features {
	my $self = shift;
	my $docs = $self->documents();
	my $did = shift;
	my $sno = shift;

	if ($self->has_document($did)) {
		my $doc = $docs->{$did};
		return $doc->get_sentence_features($sno);
	} else {
		return undef;
	}
}

get_unique_wordsdescriptionprevnextTop
sub get_unique_words {
	my $self = shift;
	my %params = @_;

	my %words;
	my $docsref = $self->{documents};
	foreach my $id (keys %$docsref) {
		my $doc = $docsref->{$id};
		map { $words{$_} = 1 } $doc->get_unique_words(%params);
	}
	return keys %words;
}

has_documentdescriptionprevnextTop
sub has_document {
    my $self = shift;
    my $id = shift;

    return $self->{documents}->{$id};
}

insertdescriptionprevnextTop
sub insert {
	my $self = shift;

	my $id = shift;
	my $document = shift;

	my $documents_ref = $self->{documents};

	$documents_ref->{$id} = $document;
}

load_documentsdescriptionprevnextTop
sub load_documents {
	my $self = shift;
	my $document_expr = shift;
	my %parameters = @_;

	my $doc_type = 'text';
	if (exists $parameters{type}) {
		$doc_type = $parameters{type};

		if ($doc_type ne 'text' and $doc_type ne 'html' and $doc_type ne 'stem') {
			die "Document type must be\' html\',\' text\', or\' stem\'.";
		}
	}

	my $filename_id = 1;
	if ( (exists $parameters{filename_id} and $parameters{filename_id} == 0) or
	     (exists $parameters{count_id} and $parameters{count_id} == 1) ) {
		$filename_id = 0;
	}

	my $count = 0;
	if (exists $parameters{start_count} ) {
		$count = $parameters{start_count};
	}

	open (LS, "ls -1 $document_expr |") or die "Could not run ls: $!";
	while ( <LS> ) {
		chomp;
		my $file = $_;

		my $id;
		if ($filename_id == 1) {
			$id = $file;
		} else {
			$id = $count;
		}

		my $doc = new Clair::Document(type => $doc_type, file => $file, id => $id);
		$self->insert($id, $doc);

		$count++;
	}

	close LS;

	return $count;
}

load_file_list_arraydescriptionprevnextTop
sub load_file_list_array {
	my $self = shift;
	my $filelist_ref = shift;
	my @filelist = @$filelist_ref;

	my %parameters = @_;

	my $doc_type = 'text';
	if (exists $parameters{type}) {
		$doc_type = $parameters{type};

		if ($doc_type ne 'text' and $doc_type ne 'html' and $doc_type ne 'stem') {
			die "Document type must be\' html\',\' text\', or\' stem.\'";
		}
	}

	my $filename_id = 1;
	if ( (exists $parameters{filename_id} and $parameters{filename_id} == 0) or
	     (exists $parameters{count_id} and $parameters{count_id} == 1) ) {
		$filename_id = 0;
	}

	my $count = 0;
	if (exists $parameters{start_count} ) {
		$count = $parameters{start_count};
	}

	foreach my $file (@filelist) {
		my $id;
		if ($filename_id == 1) {
			$id = $file;
		} else {
			$id = $count;
		}

		my $doc = new Clair::Document(type => $doc_type, file => $file, id => $id);
		$self->insert($id, $doc);

		$count++;
	}