# Cluster the documents of $self->{raw_cluster} by running the external
# cidr.pl script, then read back the clusters it produced.
#
# Fields read from $self:
#   cidr_home       directory that contains cidr.pl
#   dest            working directory (defaults to "temp.cidr"); created
#                   when it does not exist yet
#   raw_cluster     object providing documents(), has_document() and get()
#   sim_threshold, word_decay, keep_threshold, centroid_size
#                   passed positionally to cidr.pl after a leading "0"
#
# Also relies on the file-level $DBM_HOME (location of the enidf* data) and
# on _copy_centroid(), both defined outside this sub.
#
# Returns a list of hash references, one per sub-directory found in the
# working directory after cidr.pl has run:
#   { cluster => <Clair::Cluster>, centroid => <result of _copy_centroid> }
#
# Dies when the working directory cannot be created or entered, when a
# document or the ALL list cannot be written, or when the enidf DBM cannot
# be opened. Failures of the external commands and problems with a single
# cluster directory only produce warnings.
#
# Side effects: calls strip_html() on every document of raw_cluster, writes
# files into the working directory, and deletes the centroid* files in each
# cluster directory. The process's working directory is changed while the
# sub runs and restored before it returns normally.
sub run_cidr
{
    my $self = shift;

    my $cidr_home   = $self->{cidr_home};
    my $cidr_script = "$cidr_home/cidr.pl";
    my $temp_dir    = $self->{dest} || "temp.cidr";

    unless (-d $temp_dir) {
        mkdir($temp_dir) or die "Could not create temporary dir: $!";
    }

    # Remember the starting directory as an absolute path: a plain
    # chdir("..") only gets back here when $temp_dir is a single relative
    # path component.
    require Cwd;
    my $start_dir = Cwd::getcwd();
    defined $start_dir
        or die "Could not determine current directory: $!";

    # Save the files to disk
    my @file_list;
    my %dochash = %{ $self->{raw_cluster}->documents() };
    foreach my $id (keys %dochash) {
        my $doc = $dochash{$id};
        $doc->strip_html();

        # The file name is the last path component of the document id.
        # Capture it directly so that a failed match can never pick up a
        # stale $1 from a previous iteration.
        my ($filename) = $id =~ m{([^/]+)$};
        unless (defined $filename) {
            warn "Could not derive a file name from document id: $id";
            next;
        }

        my $outfile = "$temp_dir/$filename";
        open my $doc_fh, '>', $outfile
            or die "Could not open file: $outfile: $!";
        print {$doc_fh} $doc->get_text();
        close $doc_fh
            or die "Could not close file: $outfile: $!";

        push @file_list, $filename;
    }

    # Make the list of files
    my $allfile = "$temp_dir/ALL";
    open my $all_fh, '>', $allfile
        or die "Could not open file: $!";
    print {$all_fh} "$_\n" for @file_list;
    close $all_fh
        or die "Could not close file: $allfile: $!";

    # Copy the idf data to the directory, ignoring enidf.txt
    my @idf_files = grep { !/enidf\.txt$/ } glob("$DBM_HOME/enidf*");
    for my $idf_path (@idf_files) {
        if ($idf_path =~ m{([^/]+)$}) {
            my $file = $1;

            # cidr.pl needs nidf, mead has enidf
            my $noefile = $file;
            $noefile =~ s/^e//;

            # List form of system() bypasses the shell, so paths that
            # contain spaces or shell metacharacters are copied correctly.
            system("cp", "$DBM_HOME/$file", "$temp_dir/$noefile") == 0
                or warn "Could not copy $DBM_HOME/$file to "
                      . "$temp_dir/$noefile (exit status $?)";
        } else {
            warn "Unexpected enidf file: $idf_path";
        }
    }

    my $cidr_command = "$cidr_script 0 $self->{sim_threshold} "
                     . "$self->{word_decay} "
                     . "$self->{keep_threshold} "
                     . "$self->{centroid_size}";

    # Run CIDR from inside the working directory; it reads the list of
    # file names from ALL on its standard input.
    chdir($temp_dir)
        or die "Could not change to directory $temp_dir: $!";
    my $work_dir = Cwd::getcwd();
    defined $work_dir
        or die "Could not determine current directory: $!";

    system("cat ALL | $cidr_command > /dev/null") == 0
        or warn "cidr command failed (exit status $?): $cidr_command";

    # Every sub-directory now present in the working directory is treated
    # as one cluster.
    opendir(my $dir_handle, ".")
        or die "Could not read directory $temp_dir: $!";
    my @dirnames =
        grep { $_ ne "." and $_ ne ".." and -d $_ } readdir $dir_handle;
    closedir $dir_handle;

    my @clusters;
    my %idf;
    dbmopen(%idf, "$DBM_HOME/enidf", 0666)
        or die "Couldn't open $DBM_HOME/enidf: $!";

    my $raw_cluster = $self->{raw_cluster};
    foreach my $dirname (@dirnames) {
        # Skip the directory when it cannot be entered: carrying on would
        # make the unlink below act on the wrong directory.
        unless (chdir($dirname)) {
            warn "Could not change to cluster directory $dirname: $!";
            next;
        }

        my %centroid;
        dbmopen(%centroid, "centroid", 0666)
            or warn "Couldn't open centroid in $dirname: $!";
        delete $centroid{numberofarticles};
        my $centroid_copy = _copy_centroid(\%centroid, \%idf);
        dbmclose(%centroid);

        # Remove the centroid DBM files so that only document files remain
        # in the cluster directory.
        unlink glob("centroid*");

        my $cluster = Clair::Cluster->new();
        foreach my $filename (glob("*")) {
            if ($raw_cluster->has_document($filename)) {
                $cluster->insert($filename, $raw_cluster->get($filename));
            } else {
                my $docs = $raw_cluster->documents();
                my $ids  = join ", ", keys %$docs;
                warn "Couldn't find $filename in cluster (in clust: $ids)";
            }
        }

        push @clusters, { cluster => $cluster, centroid => $centroid_copy };

        chdir($work_dir)
            or die "Could not change back to directory $work_dir: $!";
    }

    dbmclose(%idf);

    chdir($start_dir)
        or die "Could not change back to directory $start_dir: $!";

    return @clusters;
}