# cluster($cluster)
#
# Single-pass clustering of the documents held by $cluster.
#
# Each document is sketched with _sketch_document() and compared against the
# centroid of every cluster built so far using _sim_centroid_doc(). If the
# best similarity is >= $self->{_params}->{sim_threshold}, the document joins
# that cluster and the centroid is refreshed from
# _insert_into_sketch_cluster(); otherwise the document seeds a new cluster.
#
# Parameters:
#   $cluster - object providing count_elements() and documents(); the latter
#              must return a hash reference whose values are document
#              objects providing get_id().
#
# Returns:
#   A list of hash references, one per resulting cluster, each with the keys
#     cluster  => Clair::Cluster object holding the member documents
#     centroid => hash reference for that cluster's centroid
#   Returns an empty list (undef in scalar context) when $cluster holds
#   fewer than two documents.
#
# Notes:
#   * Documents are visited in the order produced by `values` on the
#     documents hash, so the visiting order is not defined by this routine.
#   * Progress messages go to STDERR when $self->{_params}->{verbose} is true.
sub cluster
{
    my $self    = shift;
    my $cluster = shift;

    my $verbose       = $self->{_params}->{verbose};
    my $sim_threshold = $self->{_params}->{sim_threshold};

    # Cluster size must be > 1
    if ($cluster->count_elements() <= 1) {
        print STDERR "Cluster must be >= 2 docs\n" if $verbose;

        # Bare return: callers in list context get an empty list rather
        # than a one-element list containing undef.
        return;
    }

    my $docsref = $cluster->documents();
    my @docs    = values %$docsref;

    # Keep track of three things: clusters, clusters of sketches, and
    # centroids. The three arrays are parallel: index N in each one refers
    # to the same logical cluster.
    my @cluster_sketches;
    my @clusters;
    my @centroids;

    for my $doc (@docs) {
        my $did = $doc->get_id();
        print STDERR "Sketching document $did\n" if $verbose;
        my $sketch = $self->_sketch_document($doc);

        # If this is the first document, make a new cluster
        if (!@clusters) {
            print STDERR "First document, making new cluster\n" if $verbose;

            my $sclust   = Clair::Cluster->new();
            my %centroid = $self->_insert_into_sketch_cluster($sketch, $sclust);
            my $clust    = Clair::Cluster->new();
            $clust->insert($did, $doc);

            push @cluster_sketches, $sclust;
            push @clusters,         $clust;
            push @centroids,        \%centroid;
            next;
        }

        # Otherwise, find the cluster with the maximum similarity to the doc.
        # Ties keep the earliest cluster because the comparison is strict.
        my $max_sim   = 0;
        my $max_index = 0;
        for my $j (0 .. $#centroids) {
            my $sim = $self->_sim_centroid_doc($centroids[$j], $sketch);
            print STDERR "Document $did vs. cluster $j: $sim\n" if $verbose;
            if ($sim > $max_sim) {
                $max_sim   = $sim;
                $max_index = $j;
            }
        }

        # If it is >= sim_threshold, add the doc to the cluster.
        if ($max_sim >= $sim_threshold) {

            # Insert the document sketch into the cluster and update
            # the centroid.
            my $max_scluster = $cluster_sketches[$max_index];
            my $max_cluster  = $clusters[$max_index];
            my %centroid =
                $self->_insert_into_sketch_cluster($sketch, $max_scluster);
            $centroids[$max_index] = \%centroid;
            $max_cluster->insert($did, $doc);
            print STDERR "Added document $did to $max_index\n" if $verbose;
        }
        else {

            # Create a new cluster and get its centroid.
            my $scluster = Clair::Cluster->new();
            my %centroid =
                $self->_insert_into_sketch_cluster($sketch, $scluster);

            # Named $new_cluster so the $cluster parameter is not shadowed.
            my $new_cluster = Clair::Cluster->new();
            $new_cluster->insert($did, $doc);

            push @clusters,         $new_cluster;
            push @cluster_sketches, $scluster;
            push @centroids,        \%centroid;
            print STDERR "Created new cluster for doc $did\n" if $verbose;
        }
    }

    # Return a list of hashrefs
    my @results;
    for my $k (0 .. $#clusters) {
        push @results, {
            cluster  => $clusters[$k],
            centroid => $centroids[$k],
        };
    }
    return @results;
}