Clair::LinkPolicy

RadevPAMixed


SummaryIncluded librariesPackage variablesSynopsisDescriptionGeneral documentationMethods

SummaryTop
Clair::LinkPolicy::RadevPAMixed - Class implementing the RadevPAMixed link model

Package variablesTop
Privates (from "my" definitions)
$url2file = $self->create_html_with_anchors (src_doc_dir => $self->{base_collection}->{docs_dir}, html_dir => "$download_dir/www.$params{corpus_name}.com", links_file => "$corpus_dir/$params{corpus_name}.links", base_url => "www.$params{corpus_name}.com")
$e = 2.718281828
$uniq_file = $self->{base_dir} . "/$params{corpus_name}.download.uniq"

Included modulesTop
Carp
Clair::LinkPolicy::LinkPolicyBase
Clair::Utils::porter
Math::Random
strict

InheritTop
Clair::LinkPolicy::LinkPolicyBase

SynopsisTop

DescriptionTop
INHERITS FROM:
LinkPolicy
METHODS IMPLEMENTED BY THIS CLASS:
new Object Constructor
create_corpus Creates a corpus using this link policy. Generic object constructor for all link policies. Should
only be called by subclass constructors.
  base_collection	=> $collection_object
base_dir => $collection_directory
type => $name_of_this_linker_type

MethodsTop
create_corpusDescriptionCode
logisticDescriptionCode
newNo descriptionCode
read_model_from_fileDescriptionCode

Methods description


create_corpus code    nextTop
Generates a corpus using the RadevPAMixed model.
REQUIRED ARGUMENTS:
corpus_name
term_weights
sigmoid_steepness
sigmoid_threshold
prob_reserve

logisticcodeprevnextTop
Takes a TF*W value "x", a threshold steepness "s", and a threshold
center "c"

read_model_from_filecodeprevnextTop
Takes a filename, returns a model as a hash ref.

Methods code


create_corpusdescriptionprevnextTop
sub create_corpus {
  my $self = shift;
  my %params = @_;

  my $tf_dir = "$self->{base_collection}->{collection_base}/tf_docs";
  my $weight_file = $params{term_weights};
  my $reserve = $params{prob_reserve};
  my $thresh = $params{sigmoid_threshold};
  my $steepness = $params{sigmoid_steepness};

  my $download_dir = $self->{download_base} . "/" . $params{corpus_name};
  my $corpus_dir = $self->{corpus_data} . "/" . $params{corpus_name};

  $self->prepare_directories($params{corpus_name});

  # Create our links specification
open (LINKS, ">$corpus_dir/$params{corpus_name}.links") || croak "Could not create file $corpus_dir/$params{corpus_name}.links\n"; # Read-in weight model
my $weight_model = read_model_from_file ($weight_file); # Load filenames into memory
unless (-d $tf_dir) { die "$tf_dir is not a directory\n" } my $src_file; my $term; my $tfw_model; my @tf_files; my %tfw_models; my $terms_not_in_model = 0; my $terms_in_model = 0; opendir (TFDIR, $tf_dir) || die "Cant open $tf_dir\n"; while (defined ($src_file = readdir (TFDIR))) { next if $src_file =~ /^\.+/; # Strip out dotfiles
push (@tf_files, $src_file); # Store filename
# Read TF model and weight by term weight model
$tfw_model = read_model_from_file ("$tf_dir/$src_file"); foreach $term (keys %$tfw_model) { # Do we have a weight for this term?
if (exists $weight_model->{$term}) { # Weight exists - modify model
$tfw_model->{$term} = $weight_model->{$term} * $tfw_model->{$term}; $terms_in_model++; # print STDERR $tfw_model->{$term} . "\n";
} elsif (exists $weight_model->{Clair::Utils::porter(lc $term)}) { # Weight exists for stemmed term- modify model
$tfw_model->{$term} = $weight_model->{Clair::Utils::porter(lc $term)} * $tfw_model->{$term}; $terms_in_model++; # print STDERR $tfw_model->{$term} . "\n";
} else { # Weight does not exist - remove term from model
delete $tfw_model->{$term}; $terms_not_in_model++; } } # Save weighted model.
$tfw_models{$src_file} = $tfw_model; } closedir (TFDIR); # Now determine links based on probabilities
my $rP = $params{mix_probability}; my $tgt_file; my $link_prob; my $cur_reserve; my $rank_itor; my %indegree = (); my $timestep = 1; foreach $tgt_file (@tf_files) { foreach $src_file (@tf_files) { next if $src_file eq $tgt_file; # Skip self-links
# Decide whether to use R-based or preferential attatchment links
if (random_uniform() < $rP) { # Use R-based
# Refill our reserve for this pair of docs
$cur_reserve = $reserve; $rank_itor = 0; # Iterate over terms from greatest to least weights in $tgt_file
TERMS: foreach $term (sort {$tfw_models{$tgt_file}->{$b} <=> $tfw_models{$tgt_file}->{$a}} keys %{$tfw_models{$tgt_file}}) { $rank_itor++; # Proceed iff we have reserve to spare
last TERMS if ($cur_reserve <= 0); # See if this term exists in src-doc
if (exists $tfw_models{$src_file}->{$term}) { # It exists. Compute link probability
$link_prob = logistic ($tfw_models{$tgt_file}->{$term} + $tfw_models{$tgt_file}->{$term}, $steepness, $thresh); # Given the probability, do we output a link?
if ($link_prob > random_uniform (1, 0, 1)) { print LINKS "$src_file $tgt_file $term\n"; $indegree{$tgt_file}++; #print "$src_file $tgt_file $term $link_prob RANK:$rank_itor\n";
} # Draw from our reserve
$cur_reserve -= $link_prob; } # If term is in both documents
} # foreach $term
} else { # Use preferential attatchment links
# make sure we have an in-degree for the target node (fix this)
unless (exists $indegree{$tgt_file}) { $indegree{$tgt_file} = 1; } # Compute probability of a link.
if (random_uniform() < $indegree{$tgt_file}/$timestep) {
# Grab a high TF*W term
$term = (sort {$tfw_models{$tgt_file}->{$b} <=>
$tfw_models{$tgt_file}->{$a}}
keys %{$tfw_models{$tgt_file}})[0];
print LINKS "$src_file $tgt_file $term\n"; $indegree{$tgt_file}++; } } # PA or Radev?
$timestep++; } # foreach source doc
} # foreach target doc
}

logisticdescriptionprevnextTop
sub logistic {
($$$) {  my ($x, $s, $c) = @_;
  $x += (0.5 - $c);
  return 1 / (1 + ($e ** ((-$s * $x) + ($s / 2))));
}

newdescriptionprevnextTop
sub new {
  my $class = shift;

  my %params = @_;
  # Instantiate our base class/create representation
$params{model_name} = "MenczerMacro"; my $self = $class->new_linker(%params); return $self;
}

read_model_from_filedescriptionprevnextTop
sub read_model_from_file {
($) {  my $file = shift;
  my %model;
  my ($term, $weight);
  open (MF, $file) || die "Cant open $file\n";
  while (<MF>) {
    chomp;
    ($term, $weight) = split;
    $model{$term} = $weight;
  }
  close (MF);
  return\% model;
}

General documentation


No general documentation available.