sub create_corpus
{ my $self = shift;
my %params = @_;
my $tf_dir = "$self->{base_collection}->{collection_base}/tf_docs";
my $weight_file = $params{term_weights};
my $reserve = $params{prob_reserve};
my $thresh = $params{sigmoid_threshold};
my $steepness = $params{sigmoid_steepness};
my $download_dir = $self->{download_base} . "/" . $params{corpus_name};
my $corpus_dir = $self->{corpus_data} . "/" . $params{corpus_name};
$self->prepare_directories($params{corpus_name});
# Create our links specification open (LINKS, ">$corpus_dir/$params{corpus_name}.links") ||
croak "Could not create file $corpus_dir/$params{corpus_name}.links\n";
# Read-in weight model my $weight_model = read_model_from_file ($weight_file);
# Load filenames into memory unless (-d $tf_dir) { die "$tf_dir is not a directory\n" }
my $src_file;
my $term;
my $tfw_model;
my @tf_files;
my %tfw_models;
my $terms_not_in_model = 0;
my $terms_in_model = 0;
opendir (TFDIR, $tf_dir) || die "Cant open $tf_dir\n";
while (defined ($src_file = readdir (TFDIR))) {
next if $src_file =~ /^\.+/; # Strip out dotfiles push (@tf_files, $src_file); # Store filename
# Read TF model and weight by term weight model $tfw_model = read_model_from_file ("$tf_dir/$src_file");
foreach $term (keys %$tfw_model) {
# Do we have a weight for this term? if (exists $weight_model->{$term}) {
# Weight exists - modify model $tfw_model->{$term} = $weight_model->{$term} * $tfw_model->{$term};
$terms_in_model++;
# print STDERR $tfw_model->{$term} . "\n";
} elsif (exists $weight_model->{Clair::Utils::porter(lc $term)}) {
# Weight exists for stemmed term- modify model $tfw_model->{$term} =
$weight_model->{Clair::Utils::porter(lc $term)} * $tfw_model->{$term};
$terms_in_model++;
# print STDERR $tfw_model->{$term} . "\n"; } else {
# Weight does not exist - remove term from model delete $tfw_model->{$term};
$terms_not_in_model++;
}
}
# Save weighted model. $tfw_models{$src_file} = $tfw_model;
}
closedir (TFDIR);
# Now determine links based on probabilities my $rP = $params{mix_probability};
my $tgt_file;
my $link_prob;
my $cur_reserve;
my $rank_itor;
my %indegree = ();
my $timestep = 1;
foreach $tgt_file (@tf_files) {
foreach $src_file (@tf_files) {
next if $src_file eq $tgt_file; # Skip self-links
# Decide whether to use R-based or preferential attatchment links if (random_uniform() < $rP) {
# Use R-based # Refill our reserve for this pair of docs $cur_reserve = $reserve;
$rank_itor = 0;
# Iterate over terms from greatest to least weights in $tgt_file TERMS: foreach $term (sort
{$tfw_models{$tgt_file}->{$b} <=> $tfw_models{$tgt_file}->{$a}}
keys %{$tfw_models{$tgt_file}}) {
$rank_itor++;
# Proceed iff we have reserve to spare last TERMS if ($cur_reserve <= 0);
# See if this term exists in src-doc if (exists $tfw_models{$src_file}->{$term}) {
# It exists. Compute link probability $link_prob = logistic ($tfw_models{$tgt_file}->{$term} +
$tfw_models{$tgt_file}->{$term},
$steepness, $thresh);
# Given the probability, do we output a link? if ($link_prob > random_uniform (1, 0, 1)) {
print LINKS "$src_file $tgt_file $term\n";
$indegree{$tgt_file}++;
#print "$src_file $tgt_file $term $link_prob RANK:$rank_itor\n"; }
# Draw from our reserve $cur_reserve -= $link_prob;
} # If term is in both documents } # foreach $term } else {
# Use preferential attatchment links # make sure we have an in-degree for the target node (fix this) unless (exists $indegree{$tgt_file}) {
$indegree{$tgt_file} = 1;
}
# Compute probability of a link. if (random_uniform() < $indegree{$tgt_file}/$timestep) { # Grab a high TF*W term $term = (sort {$tfw_models{$tgt_file}->{$b} <=> $tfw_models{$tgt_file}->{$a}} keys %{$tfw_models{$tgt_file}})[0]; print LINKS "$src_file $tgt_file $term\n";
$indegree{$tgt_file}++;
}
} # PA or Radev? $timestep++;
} # foreach source doc } # foreach target doc
} |