# Build a synthetic, hyperlinked corpus from the base collection.
#
# Named parameters (passed as a flat key => value list):
#   corpus_name       - used for the per-corpus directories, the generated
#                       file names and the base URL www.<corpus_name>.com
#   sigmoid_steepness - forwarded to logistic() as its second argument
#   sigmoid_threshold - forwarded to logistic() as its third argument
#
# Reads from $self: base_collection->{docs_dir}, base_collection->{cosine_file},
# download_base, corpus_data and base_dir.
#
# Steps:
#   1. Calls prepare_directories() for the corpus.
#   2. Reads "src tgt cos" triples from the cosine file and, for each one,
#      writes the link in both directions to <corpus_dir>/<corpus_name>.links
#      when logistic($cos, ...) >= random_uniform().
#   3. Calls create_html_no_anchors() to generate the HTML documents and
#      obtain the url2file map.
#   4. Writes each url2file entry, one per line, to
#      <base_dir>/<corpus_name>.download.uniq.
#
# Croaks if any of the files cannot be opened, or if an output file cannot be
# closed cleanly.  Returns the (true) result of the final close on success.
sub create_corpus {
    my ($self, %params) = @_;

    my $corpus_name  = $params{corpus_name};
    my $doc_dir      = $self->{base_collection}->{docs_dir};
    my $cosine_file  = $self->{base_collection}->{cosine_file};
    my $download_dir = $self->{download_base} . "/" . $corpus_name;
    my $corpus_dir   = $self->{corpus_data} . "/" . $corpus_name;
    my $links_file   = "$corpus_dir/$corpus_name.links";

    $self->prepare_directories($corpus_name);

    # Generate our link specification
    open(my $links_fh, '>', $links_file)
        or croak "Could not create file $corpus_dir/$params{corpus_name}.links\n";
    open(my $cos_fh, '<', $cosine_file)
        or croak "Could not open file $self->{base_collection}->{cosine_file}\n";

    my ($steepness, $thresh) = ($params{sigmoid_steepness},
                                $params{sigmoid_threshold});

    while (my $line = <$cos_fh>) {
        chomp $line;
        my ($src, $tgt, $cos) = split ' ', $line;

        # Skip blank or truncated lines rather than emitting empty links.
        next unless defined $cos;

        # Keep the pair when the logistic value beats a uniform random draw;
        # links are written in both directions.
        if (logistic($cos, $steepness, $thresh) >= random_uniform()) {
            print {$links_fh} "$src $tgt\n";
            print {$links_fh} "$tgt $src\n";
        }
    }

    # The links file is read back below, so make sure it was fully written.
    close($links_fh)
        or croak "Could not close file $links_file: $!\n";
    close($cos_fh);

    # Now, generate the html docs and the url2file map, which is
    # needed for indexing the corpus.
    my $url2file = $self->create_html_no_anchors(
        src_doc_dir => $doc_dir,
        html_dir    => "$download_dir/www.$corpus_name.com",
        links_file  => $links_file,
        base_url    => "www.$corpus_name.com",
    );

    my $uniq_file = $self->{base_dir} . "/$corpus_name.download.uniq";
    open(my $uniq_fh, '>', $uniq_file)
        or croak "Cannot create file $uniq_file\n";

    # NOTE(review): assumes create_html_no_anchors returns an array reference
    # (the original indexed it as one) -- confirm against its definition.
    for my $entry (@$url2file) {
        print {$uniq_fh} $entry . "\n";
    }

    close($uniq_fh)
        or croak "Could not close file $uniq_file: $!\n";
}