sub create_corpus
{ my $self = shift;
my %params = @_;
my $cosP = $params{mix_probability};
my $N = $self->{base_collection}->{size};
my $doc_dir = $self->{base_collection}->{docs_dir};
my $download_dir = $self->{download_base} . "/" . $params{corpus_name};
my $corpus_dir = $self->{corpus_data} . "/" . $params{corpus_name};
$self->prepare_directories($params{corpus_name});
# Generate our link specification open (LINKS, ">$corpus_dir/$params{corpus_name}.links") ||
croak "Could not create file $corpus_dir/$params{corpus_name}.links\n";
open (COS, $self->{base_collection}->{cosine_file}) ||
croak "Could not open file $self->{base_collection}->{cosine_file}\n";
# my ($steepness, $thresh) = ($params{sigmoid_steepness}, # $params{sigmoid_threshold});
my ($tgt, $src, $cos);
my %degree = ();
my $desired_links = $params{desired_links};
print "Desired Edges: $desired_links\n";
# Create bags of nodes my %nodes = ();
my @input_nodes = ();
my @added_nodes = ();
my %cosines = ();
my $cos_sum = 0;
while (<COS>) {
chomp;
($tgt, $src, $cos) = split;
$nodes{$tgt}++;
$nodes{$src}++;
$cosines{"$src $tgt"} = $cos;
$cos_sum += $cos;
}
close (COS);
# Randomize input node traversal order @input_nodes = keys %nodes;
fisher_yates_shuffle (\@input_nodes);
# Compute the probability of linking for PA as a function of the # rough number of links we want in the final graph my $num_links = round ($desired_links / scalar (@input_nodes)) * 2; #my $num_links = 5;
my $prob; # Temp var for storing probabilities my $total_degree; # Store total degree for graph as we add edges my $node_itor; # Index iterator over newly added nodes
my $coslink_count = 0;
my $palink_count = 0;
my $initlink_count = 0;
# Iterate over the randomized input nodes for ($node_itor = 0; $node_itor < @input_nodes; $node_itor++) {
$src = $input_nodes[$node_itor];
# store link data as a hash of node indices (indices into input_nodes) my %link_hash = ();
# Do we need to initialize the graph? if ($node_itor < $num_links) {
# Initialize graph with fully-connected component for (my $init_itor = 0; $init_itor < $node_itor; $init_itor++) {
print LINKS "$src $input_nodes[$init_itor]\n";
#print "$src $input_nodes[$init_itor]\n"; $link_hash{$init_itor} = 1;
$initlink_count++;
}
} else {
# Graph initialized....Select Cosine or PA if (random_uniform() <= $cosP) {
# Using cosine for this node # Examine all cosines betwen $src and nodes in the graph foreach $tgt (0 .. $node_itor - 1) {
if (exists $cosines{"$src $input_nodes[$tgt]"}) {
$prob = $cosines{"$src $input_nodes[$tgt]"};
} elsif (exists $cosines{"$input_nodes[$tgt] $src"}) {
$prob = $cosines{"$input_nodes[$tgt] $src"};
} else {
croak "Cosine between $src and $input_nodes[$tgt] not available!\n";
}
# Do we create our link? if (random_uniform() <= $prob / $cos_sum * $desired_links) { print LINKS "$src $input_nodes[$tgt]\n"; $link_hash{$tgt} = 1;
$coslink_count++;
}
}
} else {
# Use Pref attach for this node foreach (1 .. $num_links) {
foreach $tgt (0 .. $node_itor - 1) {
$prob = $degree{$tgt}/$total_degree;
# Do we create a link? if (random_uniform() <= $prob) {
print LINKS "$src $input_nodes[$tgt]\n";
$palink_count++;
}
}
}
}
# Update Degree counter for nodes we've linked to foreach my $linked_to_node (keys %link_hash) {
$degree{$linked_to_node}++;
}
}
# We've created (scalar keys %link_hash) links to our new node $degree{$node_itor} = scalar keys %link_hash;
# Total added degree is 2 X the number of edges added $total_degree += $degree{$node_itor} * 2;} |