| Summary | Package variables | Synopsis | Description | General documentation | Methods |
| Summary | Top |
| Clair::Corpus - Class for dealing with TREC corpus format data |
| Package variables | Top |
| No package variables defined. |
| Included modules | Top |
| DB_File |
| Synopsis | Top |
| Clair::Corpus |
| Description | Top |
| This module loads and stores TREC-format corpora. It also contains functions for indexing the corpus. |
| Methods | Top |
| get_directory | Description | Code |
| get_name | Description | Code |
| get_term_counts | Description | Code |
| new | Description | Code |
| get_directory | code | next | Top |
| Returns the base directory the corpus is in |
| get_name | code | prev | next | Top |
| Returns the name of the corpus |
| get_term_counts | code | prev | next | Top |
| Returns a hash table of terms and term counts (frequencies) |
| new | code | prev | next | Top |
| $cref = Clair::Corpus->new(rootdir => "/path/to/project", corpusname => "uiuc"); rootdir (optional): The path to the directory where the corpus and associated TFIDF will be built and stored. Default is "/data0/projects/tfidf". This path should be an absolute path, not a relative one. corpusname (required): The name of the corpus that will be built. The corpus will consist of all documents with URLs in the array @urls that can be located at build time. The top level of the corpus will be named $rootdir/$corpusname. |
| get_directory | description | prev | next | Top |
# Accessor: returns the base (root) directory stored on the corpus object
# under the 'rootdir' key.
sub get_directory {
    my ($corpus_obj) = @_;
    my $base_directory = $corpus_obj->{rootdir};
    return $base_directory;
}
| get_name | description | prev | next | Top |
# Accessor: returns the corpus name stored on the object under the
# 'corpus' key.
sub get_name {
    my ($corpus_obj) = @_;
    my $corpus_name = $corpus_obj->{corpus};
    return $corpus_name;
}
| get_term_counts | description | prev | next | Top |
# Returns a hash (as a flat key/value list) mapping each term to its count,
# read from the corpus's term-count DBM file.
#
# Named arguments:
#   stemmed => VALUE   (optional) When defined, it is stored on the object
#                      and therefore also affects later calls. A true value
#                      selects the "<corpus>-tc-s" database, a false value
#                      selects "<corpus>-tc".
#
# The database is looked up under "<rootdir>/corpus-data/<corpus>/".
# Dies with "Couldn't open ..." if dbmopen fails.
sub get_term_counts {
    my ($self, %args) = @_;

    # An explicit 'stemmed' argument overrides (and replaces) the stored flag.
    $self->{stemmed} = $args{stemmed} if defined $args{stemmed};

    my $corpus   = $self->{corpus};
    my $base_dir = "$self->{rootdir}/corpus-data/$corpus";
    my $tc_fname = $self->{stemmed}
                 ? "$base_dir/$corpus-tc-s"
                 : "$base_dir/$corpus-tc";

    # NOTE(review): with a non-zero mode, dbmopen creates the database when
    # it does not exist; the mode is kept as-is so callers that rely on an
    # empty result for a missing database keep working.
    my %tf;
    dbmopen(%tf, $tc_fname, 0666) or die "Couldn't open $tc_fname: $!\n";

    # Copy out of the tied hash so the result stays valid after dbmclose.
    my %freq = %tf;
    dbmclose(%tf);

    return %freq;
}
| new | description | prev | next | Top |
# Constructor.
#
# Named arguments:
#   corpusname => NAME   Name of the corpus; stored under the 'corpus' key.
#   rootdir    => PATH   (optional) Base directory of the corpus; defaults
#                        to "/data0/projects/tfidf".
#   stemmed    => VALUE  (optional) Stored on the object, defaulting to 0.
#                        get_term_counts reads this flag to choose which
#                        term-count database to open.
#
# Returns the new object, blessed into $class.
sub new {
    my ($class, %args) = @_;

    my $stemmed = defined $args{stemmed} ? $args{stemmed} : 0;
    my $rootdir = defined $args{rootdir} ? $args{rootdir}
                                         : "/data0/projects/tfidf";

    my $self = {
        rootdir => $rootdir,
        corpus  => $args{corpusname},
        # Previously parsed but dropped, so a constructor-supplied flag
        # never reached get_term_counts.
        stemmed => $stemmed,
    };

    return bless $self, $class;
}