| Summary | Package variables | Synopsis | Description | General documentation | Methods |
| Summary | Top |
| CorpusDownload |
| Package variables | Top |
| |
| %batch = () |
| Included modules | Top |
| Clair::Config |
| Clair::Utils::TFIDFUtils |
| File::Copy qw ( copy ) |
| File::Find |
| File::Path(1) |
| File::Path(2) qw ( mkpath ) |
| FindBin |
| HTML::LinkExtractor |
| Lingua::Stem |
| POSIX |
| strict |
| Synopsis | Top |
| CorpusDownload |
| Description | Top |
| This module supplies the functionality of a subset of the perltree routines. Specifically, it downloads the documents requested, stores them in the TREC corpus format used by perltree and builds the TF/IDF databases. |
| Methods | Top |
| add_file | No description | Code |
| add_tf_entries | No description | Code |
| addlinks | No description | Code |
| buildCorpus | No description | Code |
| buildCorpusFromFiles | No description | Code |
| buildIdf | Description | Code |
| buildTf | Description | Code |
| build_corpus_from_directory | No description | Code |
| build_doc_len | No description | Code |
| build_docno_dbm | Description | Code |
| build_term_counts | Description | Code |
| cleanup | Description | Code |
| copyFiles | No description | Code |
| deleteCorpus() | No description | Code |
| download_urls | No description | Code |
| getUniqList | Description | Code |
| get_doc_len_dist() | No description | Code |
| get_term_counts() | No description | Code |
| list_dir | No description | Code |
| local_normalize_url | No description | Code |
| makeDirs | Description | Code |
| new | Description | Code |
| poach | No description | Code |
| process_batch | No description | Code |
| process_document | No description | Code |
| process_document_text | Description | Code |
| queue_tf_entry | No description | Code |
| readUrlsFile | No description | Code |
| urlsToCorpus | Description | Code |
| verifyUrl | No description | Code |
| wgetall2 | Description | Code |
| write_links | No description | Code |
| buildIdf | code | next | Top |
| $cref->buildIdf(stemmed => 0, punc => 0); Builds the IDF. The IDF entries are stemmed or not depending on the parameter passed to the constructor. Punctuation is included depending on the punc argument. stemmed (optional) Set to 1 if the IDF elements should be stemmed, and 0 otherwise. Default is 0 (not stemmed). punc (optional) Set to 1 to include punctuation. Default is 0. |
| buildTf | code | prev | next | Top |
| $cref->buildTf(stemmed => 1); Builds the TF. The TF entries are stemmed or not depending on the parameter passed to the constructor. Note that build_docno_dbm must have been called before this. stemmed (optional) Set to 1 if the TF elements should be stemmed, and 0 otherwise. Default is 0 (not stemmed). NOTE If both stemmed and unstemmed TFs are desired, there is no need to rebuild the docno dbm prior to building the second TF. Use of the TF and IDF once they are built is described in Tf.pm and Idf.pm. |
| build_docno_dbm | code | prev | next | Top |
| $cref->build_docno_dbm(); Builds the DOCNO-to-URL and URL-to-DOCNO database. The details of this will be explained in the .pdf file that will be available soon from the CLAIR website. Meanwhile, all the user needs to know is this method must be called before either TF is built. |
| build_term_counts | code | prev | next | Top |
| token_counts Returns an array of term counts in the corpus. This is used by the synth corpus tools. |
| cleanup | code | prev | next | Top |
| (private) remove metafiles |
| getUniqList | code | prev | next | Top |
| (private) compile list of unique downloads |
| makeDirs | code | prev | next | Top |
| (private) makes directory tree for corpus |
| new | code | prev | next | Top |
| $cref = Clair::Utils::CorpusDownload::new(rootdir => "/path/to/project", corpusname => "uiuc"); rootdir (optional) The path to the directory where the corpus and associated TFIDF will be built and stored. Default is "/data0/projects/tfidf". This path should be an absolute path, not a relative one. corpusname (required) The name of the corpus that will be built. The corpus will consist of all documents with URLs in the array @urls that can be located at build time. The top level of the corpus will be named $rootdir/$corpusname. buildCorpus $cref->buildCorpus(urlsref => \@urls, cleanup => 0); Builds a new corpus consisting of all documents at the URLs passed in parameter @urls. urlsref (required) A reference to an array of URLs from which the corpus should be built. cleanup (optional) Remove (default) or retain (parameter 0 passed) metafiles produced during corpus build. (Note: Retaining the metafiles can produce undesirable side-effects during a rebuild.) |
| process_document_text | code | prev | next | Top |
| (private) split text into words and store words in hash |
| urlsToCorpus | code | prev | next | Top |
| (private) build corpus in TREC format from downloaded documents |
| wgetall2 | code | prev | next | Top |
| (private) downloads documents from URLS in {urlsref}, using GNU wget |
| add_file | description | prev | next | Top |
sub add_file
{ my $files = shift;
my $root = shift;
return sub {
next if -d $_;
my $file = $File::Find::name;
# print $root . " " . $file, "\n";} |
| add_tf_entries | description | prev | next | Top |
sub add_tf_entries
{ my $self = shift();
my $rootdir = $self->{rootdir};
my $corpus = $self->{corpus};
my $stemmed = $self->{stemmed};
my $word = shift();
my $countpos_ref = shift();
my $dir;
# this is a workaround for the fact that as_text() doesn't insert} |
| addlinks | description | prev | next | Top |
sub addlinks
{# my($url,$contents, $utoid, $indexer) = @_;} |
| buildCorpus | description | prev | next | Top |
# buildCorpus(urlsref => \@urls, cleanup => 0|1)
#
# Downloads the documents named by a list of URLs and turns them into a
# corpus, by running makeDirs, wgetall2, verifyUrl, getUniqList and
# urlsToCorpus in that order.
#
# Named arguments:
#   urlsref - value handed to wgetall2 as the URL list.  The spelling
#             "urlref" is accepted as an alias and is used only when
#             "urlsref" is not supplied.
#   cleanup - when 1 (the default) cleanup() is called once the corpus has
#             been built; any other defined value skips it.
#
# Dies if the working directory cannot be changed to $self->{rootdir}: the
# download would otherwise run in, and cleanup() would delete files from,
# whatever directory the process happened to be in.
sub buildCorpus
{
    my $self = shift;
    my %args = @_;

    my $rootdir = $self->{rootdir};
    my $corpus  = $self->{corpus};

    # Accept both spellings so that callers using either "urlsref" or
    # "urlref" actually get their URL list passed on.
    my $urlsref = defined $args{urlsref} ? $args{urlsref} : $args{urlref};
    my $cleanup = defined $args{cleanup} ? $args{cleanup} : 1;

    makeDirs($rootdir, $corpus);

    chdir($rootdir)
        or die "buildCorpus: cannot chdir to '$rootdir': $!\n";

    wgetall2($corpus, $urlsref);
    verifyUrl($corpus);
    getUniqList($corpus);
    urlsToCorpus($rootdir, $corpus);

    # NOTE(review): presumably the helpers above leave the process two
    # directory levels below rootdir, so this steps back up to rootdir,
    # which is where cleanup() resolves its relative paths -- confirm
    # against wgetall2/urlsToCorpus.
    chdir("../..");

    if ( $cleanup == 1 ) {
        cleanup($rootdir, $corpus);
    }
}
| buildCorpusFromFiles | description | prev | next | Top |
sub buildCorpusFromFiles
{
my $self = shift;
my %args = @_;
my $rootdir = $self->{rootdir};
my $corpus = $self->{corpus};
my $filesref = $args{filesref};
my $cleanup = ( defined $args{cleanup} ? $args{cleanup} : 1 );
my $safe = ( defined $args{safe} ? $args{safe} : 0 );
my $skipCopy = (defined $args{skipCopy} ? $args{skipCopy} : 0);
makeDirs($rootdir, $corpus, safe => $safe);
my $dir = `pwd`;
chomp $dir;
my $dest = "$rootdir/download/$corpus";
if ( $skipCopy ){
opendir DIR, $dest;
if ( scalar(grep( !/^\.\.?$/, readdir(DIR)) == 0)) {
print STDERR "Tried to skip file copy when $dest files don't exist!\n";
return;
}
}
if ( ! $skipCopy ) {
copyFiles($filesref, $corpus, "dest" => $dest, "root" => $rootdir,
"pwd" => $dir);
}
# getUniqList($corpus);} |
| buildIdf | description | prev | next | Top |
sub buildIdf
{
my $self = shift();
my %args = @_;
if ( defined $args{stemmed} ) {
$self->{stemmed} = $args{stemmed};
}
if ( defined $args{punc} ) {
$self->{punc} = $args{punc};
}
my $punc = $self->{punc};
my $rootdir = $self->{rootdir};
my $corpus = $self->{corpus};
my $stemmed = $self->{stemmed};
my $orig_dir = `pwd`;
chomp $orig_dir;
chdir("$rootdir");
# -------------------------------------------------------} |
| buildTf | description | prev | next | Top |
sub buildTf
{
my $self = shift();
my %args = @_;
if ( defined $args{stemmed} ) {
$self->{stemmed} = $args{stemmed};
}
if ( defined $args{punc} ) {
$self->{punc} = $args{punc};
}
my $punc = $self->{punc};
my $rootdir = $self->{rootdir};
my $corpus = $self->{corpus};
my $stemmed = $self->{stemmed};
my $orig_dir = `pwd`;
chomp($orig_dir);
chdir("$rootdir");
print "Building TF...\n";
# -------------------------------------------------------} |
| build_corpus_from_directory | description | prev | next | Top |
# build_corpus_from_directory(dir => $path, cleanup => 0|1, safe => 0|1,
#                             skipCopy => 0|1)
#
# Collects the file list that list_dir() returns for the "dir" argument and
# hands it to buildCorpusFromFiles() as "filesref".  The cleanup, safe and
# skipCopy options are forwarded unchanged; each one falls back to 0 when
# the caller leaves it out or passes undef.
#
# Returns whatever buildCorpusFromFiles() returns.
sub build_corpus_from_directory
{
    my ($self, %args) = @_;

    # Start from the defaults and overlay only the options actually given.
    my %opt = ( cleanup => 0, safe => 0, skipCopy => 0 );
    foreach my $key ( keys %opt ) {
        $opt{$key} = $args{$key} if defined $args{$key};
    }

    my @found = list_dir( $args{dir} );

    return $self->buildCorpusFromFiles(
        filesref => \@found,
        cleanup  => $opt{cleanup},
        safe     => $opt{safe},
        skipCopy => $opt{skipCopy},
    );
}
| build_doc_len | description | prev | next | Top |
sub build_doc_len
{ my $self = shift;
my %args = @_;
if ( defined $args{stemmed} ) {
$self->{stemmed} = $args{stemmed};
}
my $rootdir = $self->{rootdir};
my $corpus = $self->{corpus};
my $stemmed = $self->{stemmed};
my $orig_dir = `pwd`;
chomp $orig_dir;
chdir("$rootdir");
# -------------------------------------------------------} |
| build_docno_dbm | description | prev | next | Top |
sub build_docno_dbm
{
my $self = shift();
my $rootdir = $self->{rootdir};
my $corpus = $self->{corpus};
print "Building docno-to-URL database...\n";
# -------------------------------------------------------} |
| build_term_counts | description | prev | next | Top |
sub build_term_counts
{ my $self = shift;
my %args = @_;
if ( defined $args{stemmed} ) {
$self->{stemmed} = $args{stemmed};
}
my $rootdir = $self->{rootdir};
my $corpus = $self->{corpus};
my $stemmed = $self->{stemmed};
my $orig_dir = `pwd`;
chomp $orig_dir;
chdir("$rootdir");
# -------------------------------------------------------} |
| cleanup | description | prev | next | Top |
# cleanup($rootdir, $corpus)
#
# (private) Removes the metafiles of a corpus build:
#   $corpus.list, $corpus.download, $corpus.download.duplicates,
#   $corpus.download.uniq and the files empty, duplicates and notdownloaded
#   under download/$corpus/.
#
# All paths are relative to the current working directory; $rootdir is
# accepted for interface compatibility but is not used to build them.
#
# The files are removed with unlink() rather than through a shell, so corpus
# names containing spaces or shell metacharacters are handled literally.
# Files that do not exist are skipped silently; any other failure is
# reported with warn().  Nothing is done when $corpus is undefined or empty.
#
# Returns the number of files actually removed.
sub cleanup
{
    my $rootdir = shift();    # unused: every path below is cwd-relative
    my $corpus  = shift();

    return 0 unless defined $corpus && length $corpus;

    my @metafiles = (
        "$corpus.list",
        "$corpus.download",
        "$corpus.download.duplicates",
        "$corpus.download.uniq",
        "download/$corpus/empty",
        "download/$corpus/duplicates",
        "download/$corpus/notdownloaded",
    );

    my $removed = 0;
    foreach my $file (@metafiles) {
        if ( unlink $file ) {
            $removed++;
            next;
        }
        # Capture the error before the file tests below can overwrite $!.
        my $err = "$!";
        next unless -e $file || -l $file;
        warn "cleanup: cannot remove '$file': $err\n";
    }

    return $removed;
}
| copyFiles | description | prev | next | Top |
sub copyFiles
{ my $filesref = shift;
my @files = @$filesref;
my $corpus = shift;
my %args = @_;
my $dest = ( defined $args{dest} ? $args{dest} : "" );
my $root = ( defined $args{root} ? $args{root} : "" );
# Write the .uniq file list} |
| deleteCorpus() | description | prev | next | Top |
# $cref->deleteCorpus()
#
# Deletes everything stored on disk for this corpus:
#   $rootdir/download/$corpus
#   $rootdir/corpora/$corpus
#   $rootdir/corpus-data/$corpus
#   $rootdir/corpus-data/$corpus-tf
#   $rootdir/corpus-data/$corpus-tf-s
#
# The trees are removed with File::Path::rmtree instead of a shell "rm -rf",
# so paths containing spaces or shell metacharacters are treated literally.
#
# Refuses to do anything (with a warning) when rootdir or corpus is
# undefined or empty: with an empty corpus name the paths above collapse to
# the shared download/, corpora/ and corpus-data/ directories themselves.
#
# Returns nothing.
sub deleteCorpus
{
    my $self = shift;
    my $rootdir = $self->{rootdir};
    my $corpus  = $self->{corpus};

    unless (    defined $rootdir && length $rootdir
             && defined $corpus  && length $corpus ) {
        warn "deleteCorpus: rootdir and corpus must both be set; "
           . "nothing deleted\n";
        return;
    }

    my @targets = (
        "$rootdir/download/$corpus",
        "$rootdir/corpora/$corpus",
        "$rootdir/corpus-data/$corpus",
        "$rootdir/corpus-data/$corpus-tf",
        "$rootdir/corpus-data/$corpus-tf-s",
    );

    foreach my $path (@targets) {
        # Like "rm -rf", a target that is already absent is not an error.
        next unless -e $path || -l $path;
        File::Path::rmtree($path);
    }

    return;
}
| download_urls | description | prev | next | Top |
# download_urls(urlsref => \@urls, cleanup => 0|1)
#
# Runs the download part of a corpus build: makeDirs, wgetall2, verifyUrl
# and getUniqList, in that order.  Prints $self->{rootdir} on STDOUT before
# starting.
#
# Named arguments:
#   urlsref - value handed to wgetall2 as the URL list.  The spelling
#             "urlref" is accepted as an alias and is used only when
#             "urlsref" is not supplied.
#   cleanup - when 1 (the default) cleanup() is called at the end; any other
#             defined value skips it.
#
# Dies if the working directory cannot be changed to $self->{rootdir}: the
# download would otherwise run in, and cleanup() would delete files from,
# whatever directory the process happened to be in.
sub download_urls
{
    my $self = shift;
    my %args = @_;

    my $rootdir = $self->{rootdir};
    print $rootdir, "\n";
    my $corpus = $self->{corpus};

    # Accept both spellings of the URL-list key, matching buildCorpus.
    my $urlsref = defined $args{urlsref} ? $args{urlsref} : $args{urlref};
    my $cleanup = defined $args{cleanup} ? $args{cleanup} : 1;

    makeDirs($rootdir, $corpus);

    chdir($rootdir)
        or die "download_urls: cannot chdir to '$rootdir': $!\n";

    wgetall2($corpus, $urlsref);
    verifyUrl($corpus);
    getUniqList($corpus);

    # NOTE(review): presumably the helpers above leave the process two
    # directory levels below rootdir, so this steps back up to rootdir,
    # which is where cleanup() resolves its relative paths -- confirm
    # against wgetall2.
    chdir("../..");

    if ( $cleanup == 1 ) {
        cleanup($rootdir, $corpus);
    }
}
| getUniqList | description | prev | next | Top |
sub getUniqList
{
my $corpus = shift();
my $infile = "../../$corpus.download";
my $outfile = "../../$corpus.download.uniq";
my $errfile = "../../$corpus.download.duplicates";
print "Computing unique URL set...\n";
my %urls = ();
my %locs = ();
open (IN, "<$infile");
open (OUT, ">$outfile");
open (ERR, ">$errfile");
foreach my $line (<IN>) {
if ($line =~ m/\"(.+)\" \"(.+)\"/) {
unless (exists $urls{$1} || exists $locs{$2}) {
print OUT "\"$1\"\" $2\"\n";
}
if (exists $urls{$1} || exists $locs{$2}) {
|