Clair::Utils

CorpusDownload


SummaryPackage variablesSynopsisDescriptionGeneral documentationMethods

SummaryTop
CorpusDownload

Package variablesTop
Privates (from "my" definitions)
%batch = ()

Included modulesTop
Clair::Config
Clair::Utils::TFIDFUtils
File::Copy qw ( copy )
File::Find
File::Path(1)
File::Path(2) qw ( mkpath )
FindBin
HTML::LinkExtractor
Lingua::Stem
POSIX
strict

SynopsisTop
CorpusDownload

DescriptionTop
This module supplies the functionality of a subset of the
perltree routines. Specifically, it downloads the documents
requested, stores them in the TREC corpus format used by
perltree and builds the TF/IDF databases.

MethodsTop
add_fileNo descriptionCode
add_tf_entriesNo descriptionCode
addlinksNo descriptionCode
buildCorpusNo descriptionCode
buildCorpusFromFilesNo descriptionCode
buildIdfDescriptionCode
buildTfDescriptionCode
build_corpus_from_directoryNo descriptionCode
build_doc_lenNo descriptionCode
build_docno_dbmDescriptionCode
build_term_countsDescriptionCode
cleanupDescriptionCode
copyFilesNo descriptionCode
deleteCorpus()No descriptionCode
download_urlsNo descriptionCode
getUniqListDescriptionCode
get_doc_len_dist()No descriptionCode
get_term_counts()No descriptionCode
list_dirNo descriptionCode
local_normalize_urlNo descriptionCode
makeDirsDescriptionCode
newDescriptionCode
poachNo descriptionCode
process_batchNo descriptionCode
process_documentNo descriptionCode
process_document_textDescriptionCode
queue_tf_entryNo descriptionCode
readUrlsFileNo descriptionCode
urlsToCorpusDescriptionCode
verifyUrlNo descriptionCode
wgetall2DescriptionCode
write_linksNo descriptionCode

Methods description


buildIdfcode    nextTop
$cref->buildIdf(stemmed => 0, punc => 0);
Builds the IDF. The IDF entries are stemmed or not depending on the
parameter passed to the constructor. Punctuation is included depending on
the punc argument.
stemmed
(optional) Set to 1 if the IDF elements should be stemmed,
and 0 otherwise. Default is 0 (not stemmed).
punc
(optional) Set to 1 to include punctuation. Default is 0.

buildTfcodeprevnextTop
$cref->buildTf(stemmed => 1);
Builds the TF. The TF entries are stemmed or not depending on
the parameter passed to the constructor.
Note that build_docno_dbm must have been called before this.
stemmed
(optional) Set to 1 if the TF elements should be stemmed,
and 0 otherwise. Default is 0 (not stemmed).
NOTE
If both stemmed and unstemmed TFs are desired, there is no need
to rebuild the docno dbm prior to building the second TF.
Use of the TF and IDF once they are built is described in Tf.pm
and Idf.pm.

build_docno_dbmcodeprevnextTop
$cref->build_docno_dbm();
Builds the DOCNO-to-URL and URL-to-DOCNO database. The details
of this will be explained in the .pdf file that will be available
soon from the CLAIR website. Meanwhile, all the user needs to
know is this method must be called before either TF is built.

build_term_countscodeprevnextTop
token_counts
Returns an array of term counts in the corpus
This is used by the synth corpus tools

cleanupcodeprevnextTop
(private) remove metafiles

getUniqListcodeprevnextTop
(private) compile list of unique downloads

makeDirscodeprevnextTop
(private) makes directory tree for corpus

newcodeprevnextTop
$cref = Clair::Utils::CorpusDownload::new(rootdir => "/path/to/project",
corpusname => "uiuc");
rootdir
(optional) The path to the directory where the corpus
and associated TFIDF will be built and stored. Default is
"/data0/projects/tfidf". This path should be an absolute
path, not a relative one.
corpusname
(required) The name of the corpus that will be built.
The corpus will consist of all documents with URLs in
the array @urls that can be located at build time.
The top level of the corpus will be named
$rootdir/$corpusname.
buildCorpus
$cref->buildCorpus(urlsref => \@urls, cleanup => 0);
Builds a new corpus consisting of all documents at the URLs
passed in parameter @urls.
urlsref
(required) A reference to an array of URLs from which the
corpus should be built.
cleanup
(optional) Remove (default) or retain (parameter 0 passed) metafiles
produced during corpus build. (Note: Retaining the metafiles can produce
undesirable side-effects during a rebuild.)

process_document_textcodeprevnextTop
(private) split text into words and store words in hash

urlsToCorpuscodeprevnextTop
(private) build corpus in TREC format from downloaded documents

wgetall2codeprevnextTop
(private) downloads documents from URLS in {urlsref}, using
GNU wget

Methods code


add_filedescriptionprevnextTop
# Build a File::Find "wanted" callback that collects file paths.
#
# Arguments:
#   $files - reference to the array that receives every non-directory path
#   $root  - accepted for call compatibility; not used by the callback
#
# Returns a code reference. When invoked it looks at $_ (the current entry)
# and, unless that entry is a directory, appends $File::Find::name to @$files.
sub add_file {
  my $files = shift;
  my $root = shift;

  return sub {
    # Leave the callback with return, not next: "next" inside a sub only
    # works by unwinding into the caller's loop (with an "Exiting subroutine
    # via next" warning) and dies when there is no enclosing loop.
    return if -d $_;
    push @{$files}, $File::Find::name;
  };
}

add_tf_entriesdescriptionprevnextTop
# Append the queued term-frequency entries for one word to its .tf file.
#
# Arguments (after $self):
#   $word         - the term whose entries are written
#   $countpos_ref - hash ref: docno => { count => N, positions => [ ... ] }
#
# The file lives under ./corpus-data/<corpus>-tf (or -tf-s when
# $self->{stemmed} is true), in a directory derived from the first one or two
# characters of the word. Paths are relative, so the caller must already be
# in the corpus root directory. One line is appended per document:
#   <compressed docid> <count> <base-36 positions...>
sub add_tf_entries {
    my $self = shift();
    my $corpus = $self->{corpus};
    my $stemmed = $self->{stemmed};

    my $word = shift();
    my $countpos_ref = shift();

    # this is a workaround for the fact that as_text() doesn't insert
    # spaces where stuff is.
    if (length($word) > 50) {
        print STDERR "\nOverlong word '$word' encountered. Skipping...\n";
        return;
    }

    my $tf_base = $stemmed ? "./corpus-data/$corpus-tf-s"
                           : "./corpus-data/$corpus-tf";
    my $dir;
    if (length($word) == 1) {
        $dir = "$tf_base/$word";
    }
    else {
        my $dir1 = substr $word, 0, 1;
        my $dir2 = substr $word, 0, 2;
        $dir = "$tf_base/$dir1/$dir2";
    }

    # make sure the directory exists.
    mkpath $dir unless -d $dir;

    my $file = "$dir/$word.tf";
    open(my $tf_fh, '>>', $file)
        or die "Unable to open file '$file': $!";

    # PHRASE INDEXING!!!
    foreach my $docno (sort keys %$countpos_ref) {
        my $entry = $countpos_ref->{$docno};
        my $id = compress_docid($docno, $corpus);

        # Positions are numbers: sort them numerically so that, e.g., 10
        # does not end up before 2 in the written list.
        my @positions = map { base10_to_base36($_) }
                        sort { $a <=> $b } @{ $entry->{positions} };

        print {$tf_fh} "$id $entry->{count} " . join(" ", @positions) . "\n";
    }

    # Buffered write errors only surface at close time.
    close($tf_fh) or die "Unable to close file '$file': $!";
}

addlinksdescriptionprevnextTop
# Write one line to the LINKS filehandle for every <a> link in a document.
#
# Arguments:
#   $url      - URL of the document (normalized with local_normalize_url)
#   $contents - HTML text of the document
#   $utoid    - hash ref mapping normalized URLs to document ids
#
# Each output line is "<source id> <target id>"; an endpoint that has no
# entry in $utoid is written as "EX". LINKS must already be open for writing;
# it is not opened or closed here.
sub addlinks {
  my ($url, $contents, $utoid) = @_;

  $url = local_normalize_url($url);

  my $lx = HTML::LinkExtractor->new(undef, $url)
    or die "Error creating LinkExtractor: $!\n";
  $lx->strip(1);
  $lx->parse($contents);

  # The lookup table is used through its reference instead of being copied
  # into a fresh hash on every call, and the source id is resolved once
  # because it is the same for every link of this document.
  my $from_id = defined $utoid->{$url} ? $utoid->{$url} : 'EX';

  foreach my $link (@{ $lx->links() }) {
    # Only anchor tags count as links.
    next unless $link->{tag} eq 'a';

    my $href = local_normalize_url($link->{href});
    my $to_id = defined $utoid->{$href} ? $utoid->{$href} : 'EX';

    print LINKS "$from_id $to_id\n";
  }
}

buildCorpusdescriptionprevnextTop
# Download a set of URLs and build the corpus from them.
#
# Usage: $cref->buildCorpus(urlsref => \@urls, cleanup => 0);
#
# Named arguments:
#   urlsref - reference to the array of URLs to download
#   cleanup - 1 (default) removes the metafiles afterwards via cleanup();
#             any other defined value keeps them
#
# Dies if the corpus root directory cannot be entered. The working directory
# is changed and is not restored to what it was on entry.
sub buildCorpus {
    my $self = shift;
    my %args = @_;

    my $rootdir = $self->{rootdir};
    my $corpus = $self->{corpus};

    my $urlsref = $args{urlsref};
    my $cleanup = ( defined $args{cleanup} ? $args{cleanup} : 1 );

    makeDirs($rootdir, $corpus);

    # Every later step (including the file removals in cleanup) uses paths
    # relative to the current directory, so do not carry on from the wrong
    # place if the root cannot be entered.
    chdir($rootdir)
        or die "Unable to change to directory '$rootdir': $!\n";

    wgetall2($corpus, $urlsref);
    verifyUrl($corpus);
    getUniqList($corpus);
    urlsToCorpus($rootdir, $corpus);

    # NOTE(review): getUniqList reads "../../$corpus.download", so the steps
    # above presumably run two levels below $rootdir; this climbs back up.
    chdir("../..");
    if ( $cleanup == 1 ) {
        cleanup($rootdir, $corpus);
    }
}

buildCorpusFromFilesdescriptionprevnextTop
# Build the corpus from files on the local disk instead of downloading.
#
# Usage: $cref->buildCorpusFromFiles(filesref => \@files, cleanup => 0);
#
# Named arguments:
#   filesref - reference to the array of file paths to copy into the corpus
#   cleanup  - 1 (default) removes the metafiles afterwards via cleanup()
#   safe     - passed through to makeDirs (default 0)
#   skipCopy - true to reuse the files already in <rootdir>/download/<corpus>
#              instead of copying (default 0); if that directory has no
#              entries, a message is printed to STDERR and nothing is built
#
# Dies if the download directory cannot be entered. The working directory on
# entry is restored before returning normally.
sub buildCorpusFromFiles {
    my $self = shift;
    my %args = @_;

    my $rootdir = $self->{rootdir};
    my $corpus = $self->{corpus};

    my $filesref = $args{filesref};
    my $cleanup = ( defined $args{cleanup} ? $args{cleanup} : 1 );
    my $safe = ( defined $args{safe} ? $args{safe} : 0 );
    my $skipCopy = ( defined $args{skipCopy} ? $args{skipCopy} : 0 );

    makeDirs($rootdir, $corpus, safe => $safe);
    my $dir = `pwd`;
    chomp $dir;
    my $dest = "$rootdir/download/$corpus";

    if ( $skipCopy ) {
        # Count the real entries in the download directory. A lexical handle
        # is used and closed so no directory handle is left open; a directory
        # that cannot be opened counts as empty.
        my @entries;
        if ( opendir(my $dh, $dest) ) {
            @entries = grep { !/^\.\.?$/ } readdir($dh);
            closedir($dh);
        }
        if ( @entries == 0 ) {
            print STDERR "Tried to skip file copy when $dest files don't exist!\n";
            return;
        }
    }
    else {
        copyFiles($filesref, $corpus, "dest" => $dest, "root" => $rootdir,
                  "pwd" => $dir);
    }

    # getUniqList($corpus);

    # urlsToCorpus and cleanup work with paths relative to the current
    # directory; refuse to continue from the wrong place.
    chdir($dest)
        or die "Unable to change to directory '$dest': $!\n";
    urlsToCorpus($rootdir, $corpus);

    # Back up from <rootdir>/download/<corpus> to <rootdir>.
    chdir("../..");
    if ( $cleanup == 1 ) {
        cleanup($rootdir, $corpus);
    }
    chdir($dir);
}

buildIdfdescriptionprevnextTop
# Build the IDF database for the corpus.
#
# Usage: $cref->buildIdf(stemmed => 0, punc => 0);
#
# Named arguments (both optional; when given they are stored on the object):
#   stemmed - true to write the "-idf-s" database, false for "-idf"
#   punc    - passed through to process_document_text
#
# Reads every TREC-format file under ./corpora/<corpus> (relative to the
# corpus root), counts document frequencies through process_document_text,
# and writes idf = -log2(1 - exp(-df / N)) for each word, where N is the
# number of distinct normalized URLs seen. Progress is logged to
# ./corpus-data/<corpus>/<corpus>.build-idf.log. The working directory on
# entry is restored before returning normally.
sub buildIdf {
   my $self = shift();

   my %args = @_;
   if ( defined $args{stemmed} ) {
      $self->{stemmed} = $args{stemmed};
   }
   if ( defined $args{punc} ) {
      $self->{punc} = $args{punc};
   }
   my $punc = $self->{punc};
   my $rootdir = $self->{rootdir};
   my $corpus = $self->{corpus};
   my $stemmed = $self->{stemmed};

   my $orig_dir = `pwd`;
   chomp $orig_dir;

   # All paths below are relative to the root; the mkdir further down must
   # not run in some other directory.
   chdir("$rootdir")
      or die "Unable to change to directory '$rootdir': $!\n";

   # -------------------------------------------------------
   # TFIDFUtils needs this set
   # -------------------------------------------------------
   $ENV{TFIDF_DIR} = "./corpus-data/$corpus";

   my $BASE_DIR = "./corpora/$corpus";
   my $tfidf_dir = "./corpus-data/$corpus";
   my %url_list;
   my $NAME = "$corpus";
   my $IDF_DBM_NAME = ( $stemmed ? "$tfidf_dir/$NAME-idf-s"
                                 : "$tfidf_dir/$NAME-idf" );

   mkdir $BASE_DIR;
   opendir(my $base_dh, $BASE_DIR)
      or die "Unable to open directory $BASE_DIR\n";
   my @subdirs = sort { $a cmp $b } grep { /\d+/ } readdir($base_dh);
   closedir($base_dh);

   my %df = ();

   # The log is best effort; OUT stays a package handle as in the rest of
   # this file.
   open(OUT, ">$tfidf_dir/$corpus.build-idf.log");

   foreach my $subdir (@subdirs) {
      my $dir = "$BASE_DIR/$subdir";
      next unless (-d $dir);
      print OUT "Processing $dir\n";

      opendir(my $dh, $dir) or die "Cannot open directory $dir\n";
      my @files = sort { $a cmp $b } grep { /\d+/ } readdir($dh);
      closedir($dh);

      foreach my $file (@files) {
         $file = "$dir/$file";
         print OUT "\t$file ";

         my $doc = "";
         open(my $in, '<', $file)
            or die "Unable to open file '$file': $!";
         while (my $line = <$in>) {
            chomp $line;
            $doc .= $line . " ";
            next unless $line =~ m|</DOC>|;

            # A record is complete. Reset the accumulator now so that a
            # skipped record (malformed or duplicate URL) is not glued onto
            # the front of the following one.
            my $record = $doc;
            $doc = "";

            $record =~ m|<DOC>.*?<DOCHDR>.*</DOCHDR>.*</DOC>|
               or warn "Improperly formatted document in $file\n";

            # Accept https as well as http, as build_docno_dbm does;
            # otherwise https documents are silently left out of the IDF.
            next unless $record =~ m|<DOC>.*?<DOCHDR>.*(https?:[^ ]*) .*</DOCHDR>(.*)</DOC>|;
            my ($url, $html) = ($1, $2);
            chomp $url;
            print OUT "On $url\n";

            # Count each normalized URL only once.
            $url = local_normalize_url($url);
            next if (defined $url_list{$url});
            $url_list{$url} = 1;

            my $text = extract_text_from_html($html);
            $text =~ s/&.*?;//g;    # drop HTML entities
            $self->process_document_text($text, \%df, $punc);
         }
         close($in);
         print OUT "\n";
      }
      print OUT "\n";
   }

   print OUT "Building IDF DBM: $IDF_DBM_NAME\n";
   print OUT "Stemmed = $stemmed\n";

   my %idf;
   # "or", not "||": with "||" the die was attached to the constant 0666
   # and could never run.
   dbmopen(%idf, $IDF_DBM_NAME, 0666)
      or die "Unable to open database $IDF_DBM_NAME: $!";
   %idf = ();

   my $LOG_2 = log(2);
   my $num_words = 0;

   # -------------------------------------------------------
   # Create the idf hash
   # -------------------------------------------------------
   # The document count does not change inside the loop; compute it once.
   my $number_normalized_urls = scalar(keys %url_list);
   while ( my ($word, $df) = each %df ) {
      $idf{$word} = -log(1 - exp(-$df/$number_normalized_urls)) / $LOG_2;
      $num_words++;
      print OUT "." unless $num_words % 1000;
   }
   dbmclose %idf;

   print OUT "\n\n";
   print OUT "$num_words words\n";
   close(OUT);

   chdir($orig_dir);
}

buildTfdescriptionprevnextTop
# Build the TF (term frequency) files for the corpus.
#
# Usage: $cref->buildTf(stemmed => 1);
#
# Named arguments (both optional; when given they are stored on the object):
#   stemmed - true to build under "<corpus>-tf-s", false for "<corpus>-tf"
#   punc    - passed through to process_document
#
# Any existing TF directory for the chosen variant is removed first so that
# repeated builds do not produce duplicate entries. Every TREC-format file
# under ./corpora/<corpus> is parsed; each document is handed to
# process_document, and process_batch is called once per file. Progress is
# logged to ./corpus-data/<corpus>/<corpus>.build-tf.log. Dies on a document
# that does not match the expected layout. The working directory on entry is
# restored before returning normally.
sub buildTf {
   my $self = shift();

   my %args = @_;
   if ( defined $args{stemmed} ) {
      $self->{stemmed} = $args{stemmed};
   }
   if ( defined $args{punc} ) {
      $self->{punc} = $args{punc};
   }
   my $punc = $self->{punc};
   my $rootdir = $self->{rootdir};
   my $corpus = $self->{corpus};
   my $stemmed = $self->{stemmed};

   my $orig_dir = `pwd`;
   chomp($orig_dir);

   # The recursive delete below uses a relative path; never run it from an
   # unexpected directory.
   chdir("$rootdir")
      or die "Unable to change to directory '$rootdir': $!\n";

   print "Building TF...\n";

   # -------------------------------------------------------
   # TFIDFUtils needs this set
   # -------------------------------------------------------
   $ENV{TFIDF_DIR} = "./corpus-data/$corpus";

   my $BASE_DIR = "./corpora/$corpus";
   my $TF_BASE_DIR = ( $stemmed ? "./corpus-data/$corpus-tf-s"
                                : "./corpus-data/$corpus-tf" );

   # -------------------------------------------------------
   # Note: this is to prevent multiple entries from
   # multiple builds.
   # -------------------------------------------------------
   if (-d $TF_BASE_DIR) {
      # Remove the tree without a shell so corpus names containing spaces
      # or shell metacharacters are handled literally.
      File::Path::rmtree($TF_BASE_DIR);
   }
   mkdir $TF_BASE_DIR;

   opendir(my $base_dh, $BASE_DIR)
      or die "Unable to open directory $BASE_DIR\n";
   my @subdirs = sort { $a cmp $b } grep { /\d+/ } readdir($base_dh);
   closedir($base_dh);

   # The log is best effort; OUT stays a package handle as in the rest of
   # this file.
   open(OUT, ">./corpus-data/$corpus/$corpus.build-tf.log");

   foreach my $subdir (@subdirs) {
      my $dir = "$BASE_DIR/$subdir";
      print OUT "Processing $dir\n";

      opendir(my $dh, $dir)
         or die "Unable to open directory $dir: $!\n";
      my @files = sort { $a cmp $b } grep { /\d+/ } readdir($dh);
      closedir($dh);

      foreach my $file (@files) {
         $file = "$dir/$file";
         print OUT "\t$file ";

         my $doc = "";
         open(my $in, '<', $file)
            or die "Unable to open file '$file': $!";
         if ($verbose) { print STDERR "Parsing $file\n"; }

         while (my $line = <$in>) {
            chomp $line;
            $doc .= $line . " ";
            next unless $line =~ m|</DOC>|;

            # A full <DOC>...</DOC> record has been collected.
            $doc =~ m|<DOC> <DOCNO>(.*?)</DOCNO>.*?</DOCHDR>(.*)</DOC>|
               or die "Document does not match expected pattern:\n $doc\n\n";
            my ($docno, $html) = ($1, $2);

            my $text = extract_text_from_html($html);
            $self->process_document($docno, $text, $punc);
            $doc = "";
         }

         if ($verbose) { print STDERR "Done parsing $file\n"; }
         close($in);
         print OUT "\n";

         if ($verbose) { print STDERR "Batch processing\n"; }
         $self->process_batch();
         print OUT "\n";
      }
   }

   # Flush and release the log.
   close(OUT);

   chdir($orig_dir);
}

build_corpus_from_directorydescriptionprevnextTop
# Build the corpus from every file that list_dir() reports for a directory.
#
# Named arguments:
#   dir      - directory handed to list_dir()
#   cleanup  - forwarded to buildCorpusFromFiles (default 0)
#   safe     - forwarded to buildCorpusFromFiles (default 0)
#   skipCopy - forwarded to buildCorpusFromFiles (default 0)
#
# Returns whatever buildCorpusFromFiles returns.
sub build_corpus_from_directory {
  my ($self, %args) = @_;

  # Each optional flag falls back to 0 when absent or undefined.
  my %flag = map { $_ => ( defined $args{$_} ? $args{$_} : 0 ) }
             qw(cleanup safe skipCopy);

  my @found = list_dir($args{dir});

  $self->buildCorpusFromFiles(
    filesref => \@found,
    cleanup  => $flag{cleanup},
    safe     => $flag{safe},
    skipCopy => $flag{skipCopy},
  );
}

build_doc_lendescriptionprevnextTop
# Build the document-length database for the corpus.
#
# Named arguments:
#   stemmed - optional; when given it is stored on the object. True selects
#             the "-doclen-s" database, false "-doclen".
#
# Reads every TREC-format file under ./corpora/<corpus> (relative to the
# corpus root) and stores, for each distinct DOCNO, the value returned by
# process_document_text for that document's text. Progress is logged to
# ./corpus-data/<corpus>/<corpus>.build-doclen.log. The working directory on
# entry is restored before returning normally.
sub build_doc_len {
  my $self = shift;

  my %args = @_;
  if ( defined $args{stemmed} ) {
    $self->{stemmed} = $args{stemmed};
  }

  my $rootdir = $self->{rootdir};
  my $corpus = $self->{corpus};
  my $stemmed = $self->{stemmed};

  my $orig_dir = `pwd`;
  chomp $orig_dir;

  # All paths below are relative to the root; the mkdir further down must
  # not run in some other directory.
  chdir("$rootdir")
    or die "Unable to change to directory '$rootdir': $!\n";

  # -------------------------------------------------------
  # TFIDFUtils needs this set
  # -------------------------------------------------------
  $ENV{TFIDF_DIR} = "./corpus-data/$corpus";

  my $BASE_DIR = "./corpora/$corpus";
  my $tfidf_dir = "./corpus-data/$corpus";
  my %url_list;    # docids already recorded
  my $NAME = "$corpus";
  my $DOCLEN_DBM_NAME = ( $stemmed ? "$tfidf_dir/$NAME-doclen-s"
                                   : "$tfidf_dir/$NAME-doclen" );

  mkdir $BASE_DIR;
  opendir(my $base_dh, $BASE_DIR)
    or die "Unable to open directory $BASE_DIR\n";
  my @subdirs = sort { $a cmp $b } grep { /\d+/ } readdir($base_dh);
  closedir($base_dh);

  my %df = ();

  # The log is best effort; OUT stays a package handle as in the rest of
  # this file.
  open(OUT, ">$tfidf_dir/$corpus.build-doclen.log");
  print OUT "Building DOCLEN DBM: $DOCLEN_DBM_NAME\n";
  print OUT "Stemmed = $stemmed\n";

  my %doclen;
  # "or", not "||": with "||" the die was attached to the constant 0666
  # and could never run.
  dbmopen(%doclen, $DOCLEN_DBM_NAME, 0666)
    or die "Unable to open database $DOCLEN_DBM_NAME: $!";
  %doclen = ();

  foreach my $subdir (@subdirs) {
    my $dir = "$BASE_DIR/$subdir";
    next unless (-d $dir);
    print OUT "Processing $dir\n";

    opendir(my $dh, $dir) or die "Cannot open directory $dir\n";
    my @files = sort { $a cmp $b } grep { /\d+/ } readdir($dh);
    closedir($dh);

    foreach my $file (@files) {
      $file = "$dir/$file";
      print OUT "\t$file ";

      my $doc = "";
      open(my $in, '<', $file)
        or die "Unable to open file '$file': $!";
      while (my $line = <$in>) {
        chomp $line;
        $doc .= $line . " ";
        next unless $line =~ m|</DOC>|;

        # A record is complete. Reset the accumulator now: if a skipped
        # record stayed in the buffer, the greedy DOCNO capture below would
        # span both records and produce a bogus docid for the next one.
        my $record = $doc;
        $doc = "";

        $record =~ m|<DOC>.*?<DOCNO>.*</DOCNO>.*<DOCHDR>.*</DOCHDR>.*</DOC>|
          or warn "Improperly formatted document in $file\n";

        # Accept https as well as http, as build_docno_dbm does.
        next unless $record =~ m|<DOC>.*?<DOCNO>(.*)</DOCNO>.*<DOCHDR>.*(https?:[^ ]*) .*</DOCHDR>(.*)</DOC>|;
        my ($docid, $html) = ($1, $3);
        chomp $docid;
        print OUT "On $docid\n";

        # $docid=ale_normalize_docid($url);
        next if (defined $url_list{$docid});
        $url_list{$docid} = 1;

        my $text = extract_text_from_html($html);
        $text =~ s/&.*?;//g;    # drop HTML entities
        my $count = $self->process_document_text($text, \%df);
        $doclen{$docid} = $count;
      }
      close($in);
      print OUT "\n";
    }
    print OUT "\n";
  }

  dbmclose %doclen;
  close(OUT);

  chdir($orig_dir);
}

build_docno_dbmdescriptionprevnextTop
# Build the four docid databases for the corpus:
#   <corpus>-compress-docid  docid          -> base-36 sequence number
#   <corpus>-expand-docid    base-36 number -> docid
#   <corpus>-docid-to-url    docid          -> URL
#   <corpus>-url-to-docid    URL            -> docid
#
# Usage: $cref->build_docno_dbm();
#
# Scans every file under ./corpora/<corpus> (relative to the corpus root),
# collecting each <DOCNO> value and the URL on the line that follows each
# <DOCHDR> line. Dies if that line does not start with an http(s) URL or if
# the numbers of docids and URLs differ. Progress is printed to STDOUT. The
# working directory on entry is restored before returning normally.
sub build_docno_dbm {
   my $self = shift();
   my $rootdir = $self->{rootdir};
   my $corpus = $self->{corpus};
   print "Building docno-to-URL database...\n";

   # -------------------------------------------------------
   # alecanonurl needs this set
   # -------------------------------------------------------
   my $currdir = `pwd`;
   chomp($currdir);
   $ENV{ALECACHE} = "$currdir";

   # -------------------------------------------------------
   # TFIDFUtils need this set
   # -------------------------------------------------------
   $ENV{TFIDF_DIR} = "./corpus-data/$corpus";

   chdir("$rootdir");

   my $BASE_DIR = "./corpora/$corpus";
   my $data_prefix = "./corpus-data/$corpus/$corpus";
   my $COMPRESS_DBM_NAME = "$data_prefix-compress-docid";
   my $EXPAND_DBM_NAME = "$data_prefix-expand-docid";
   my $TO_URL_DBM_NAME = "$data_prefix-docid-to-url";
   my $FROM_URL_DBM_NAME = "$data_prefix-url-to-docid";

   opendir DIR, $BASE_DIR or die "Unable to open directory $BASE_DIR\n";
   my @subdirs = grep { /\d+/ } readdir DIR;
   closedir DIR;
   @subdirs = sort @subdirs;

   my @docids = ();
   my @urls = ();

   foreach my $subdir (@subdirs) {
      my $dir = "$BASE_DIR/$subdir";

      opendir DIR, $dir or die;
      my @names = grep { /\d+/ } readdir DIR;
      closedir DIR;

      foreach my $name (sort @names) {
         my $path = "$dir/$name";
         print STDERR "Processing $path\n" if $verbose;

         # Set when the previous line was <DOCHDR>: the URL is expected on
         # the very next line.
         my $next_has_url = 0;

         open FILE, $path or die;
         while (<FILE>) {
            chomp;
            if (m|<DOCNO>(.*?)</DOCNO>|) {
               push @docids, $1;
               next;
            }
            if (m|<DOCHDR>|) {
               $next_has_url = 1;
               next;
            }
            next unless $next_has_url;

            # m|^(http\://[^\s]+)| or die "$_\n";
            m|^(http[s]?\://[^\s]+)| or die "$_\n";
            my $url = $1;
            my $save_url = $url;

            # NEW! convert the url to wget canonical form.
            # $url = alecanonurl($url);
            # alecanonurl complains if there are illegal characters,
            # so if it does (and this is only about 10 times total for
            # wt2g), just use the original (non-canonicalized) url.
            unless (defined $url) {
               print "\n\tBAD: $save_url";
               $url = $save_url;
            }

            push @urls, $url;
            $next_has_url = 0;
         }
         close FILE;
         print "\n";
      }
      print "\n";
   }

   print scalar(@docids), " docids\n";
   print scalar(@urls), " urls ";
   print "(", scalar(uniq(@urls)), " unique)\n";

   die "Unequal numbers of docids and urls"
      unless (scalar @urls) == (scalar @docids);

   # -------------------------------------------------------------------
   # NOTE: I removed the expand/compress stuff because I didn't need to rebuild
   # them. Just uncomment all commented lines below to rebuild them too.
   # -------------------------------------------------------------------
   my %compress = ();
   dbmopen %compress, $COMPRESS_DBM_NAME, 0666
      or die "Can't open '$COMPRESS_DBM_NAME'";
   %compress = ();

   my %expand = ();
   dbmopen %expand, $EXPAND_DBM_NAME, 0666
      or die "Can't open '$EXPAND_DBM_NAME'";
   %expand = ();

   my %to_urls = ();
   dbmopen %to_urls, $TO_URL_DBM_NAME, 0666
      or die "Can't open '$TO_URL_DBM_NAME'";
   %to_urls = ();

   my %from_urls = ();
   dbmopen %from_urls, $FROM_URL_DBM_NAME, 0666
      or die "Can't open '$FROM_URL_DBM_NAME'";
   %from_urls = ();

   # Documents are numbered from 1 in file order; the number is stored in
   # base 36 as the compressed id.
   foreach my $index (0 .. $#docids) {
      my $docid = $docids[$index];
      my $url = $urls[$index];
      my $comp = base10_to_base36($index + 1);

      $compress{$docid} = $comp;
      $expand{$comp} = $docid;

      # NEW! print an error message if we have a collision.
      if ($from_urls{$url}) {
         print "\nCOLLISION: '$from_urls{$url}' and '$docid' both point to:\n";
         print "\t$url\n";
      }

      $to_urls{$docid} = $url;
      $from_urls{$url} = $docid;

      if ($verbose) {
         print STDERR "." unless $index % 100;
      }
   }
   my $count = scalar @docids;

   print "\n";
   print "$count documents\n";
   print scalar(keys(%compress)), " keys in the docid compression dbm\n";
   print scalar(keys(%expand)), " keys in the docid expansion dbm\n";
   print scalar(keys(%to_urls)), " keys in the docid to url dbm\n";
   print scalar(keys(%from_urls)), " keys in the url to docid dbm\n";
   print "docno length: ", length($docids[0]), "\n";
   print "max id length: ", length(base10_to_base36($count)), "\n";

   #warn unless scalar keys %compress == $count;
   #warn unless scalar keys %expand == $count;
   #warn unless scalar keys %to_urls == $count;
   #warn unless scalar keys %from_urls == $count;
   dbmclose %compress;
   dbmclose %expand;
   dbmclose %to_urls;
   dbmclose %from_urls;

   chdir($currdir);
}

build_term_countsdescriptionprevnextTop
# Build the term-count database for the corpus.
#
# Named arguments:
#   stemmed - optional; when given it is stored on the object. True selects
#             the "-tc-s" database, false "-tc".
#
# Reads every TREC-format file under ./corpora/<corpus> (relative to the
# corpus root), lets process_document_text accumulate per-word counts, and
# stores each word's count in the database. Progress is logged to
# ./corpus-data/<corpus>/<corpus>.build-tc.log. The working directory on
# entry is restored before returning normally.
sub build_term_counts {
  my $self = shift;

  my %args = @_;
  if ( defined $args{stemmed} ) {
    $self->{stemmed} = $args{stemmed};
  }

  my $rootdir = $self->{rootdir};
  my $corpus = $self->{corpus};
  my $stemmed = $self->{stemmed};

  my $orig_dir = `pwd`;
  chomp $orig_dir;

  # All paths below are relative to the root; the mkdir further down must
  # not run in some other directory.
  chdir("$rootdir")
    or die "Unable to change to directory '$rootdir': $!\n";

  # -------------------------------------------------------
  # TFIDFUtils needs this set
  # -------------------------------------------------------
  $ENV{TFIDF_DIR} = "./corpus-data/$corpus";

  my $BASE_DIR = "./corpora/$corpus";
  my $tfidf_dir = "./corpus-data/$corpus";
  my %url_list;
  my $NAME = "$corpus";
  # Named after the IDF build this was derived from; it holds the path of
  # the term-count database.
  my $IDF_DBM_NAME = ( $stemmed ? "$tfidf_dir/$NAME-tc-s"
                                : "$tfidf_dir/$NAME-tc" );

  mkdir $BASE_DIR;
  opendir(my $base_dh, $BASE_DIR)
    or die "Unable to open directory $BASE_DIR\n";
  my @subdirs = sort { $a cmp $b } grep { /\d+/ } readdir($base_dh);
  closedir($base_dh);

  my %df = ();

  # The log is best effort; OUT stays a package handle as in the rest of
  # this file.
  open(OUT, ">$tfidf_dir/$corpus.build-tc.log");

  foreach my $subdir (@subdirs) {
    my $dir = "$BASE_DIR/$subdir";
    next unless (-d $dir);
    print OUT "Processing $dir\n";

    opendir(my $dh, $dir) or die "Cannot open directory $dir\n";
    my @files = sort { $a cmp $b } grep { /\d+/ } readdir($dh);
    closedir($dh);

    foreach my $file (@files) {
      $file = "$dir/$file";
      print OUT "\t$file ";

      my $doc = "";
      open(my $in, '<', $file)
        or die "Unable to open file '$file': $!";
      while (my $line = <$in>) {
        chomp $line;
        $doc .= $line . " ";
        next unless $line =~ m|</DOC>|;

        # A record is complete. Reset the accumulator now so that a skipped
        # record (malformed or duplicate URL) is not glued onto the front
        # of the following one.
        my $record = $doc;
        $doc = "";

        $record =~ m|<DOC>.*?<DOCHDR>.*</DOCHDR>.*</DOC>|
          or warn "Improperly formatted document in $file\n";

        # Accept https as well as http, as build_docno_dbm does.
        next unless $record =~ m|<DOC>.*?<DOCHDR>.*(https?:[^ ]*) .*</DOCHDR>(.*)</DOC>|;
        my ($url, $html) = ($1, $2);
        chomp $url;
        print OUT "On $url\n";

        # Count each normalized URL only once.
        $url = local_normalize_url($url);
        next if (defined $url_list{$url});
        $url_list{$url} = 1;

        my $text = extract_text_from_html($html);
        $text =~ s/&.*?;//g;    # drop HTML entities
        $self->process_document_text($text, \%df);
      }
      close($in);
      print OUT "\n";
    }
    print OUT "\n";
  }

  print OUT "Building IDF DBM: $IDF_DBM_NAME\n";
  print OUT "Stemmed = $stemmed\n";

  my %idf;
  # "or", not "||": with "||" the die was attached to the constant 0666
  # and could never run.
  dbmopen(%idf, $IDF_DBM_NAME, 0666)
    or die "Unable to open database $IDF_DBM_NAME: $!";
  %idf = ();

  my $num_words = 0;

  # -------------------------------------------------------
  # Store the raw count for every word
  # -------------------------------------------------------
  my $number_normalized_urls = scalar(keys %url_list);
  while ( my ($word, $df) = each %df ) {
    $idf{$word} = $df;
    $num_words++;
    print OUT "." unless $num_words % 1000;
  }
  dbmclose %idf;

  print OUT "\n\n";
  print OUT "$num_words words\n";
  print OUT "$number_normalized_urls number_normalized_urls\n";
  close(OUT);

  chdir($orig_dir);
}

cleanupdescriptionprevnextTop
# (private) Remove the metafiles produced during a corpus build.
#
# Arguments:
#   $rootdir - accepted for call compatibility; not used
#   $corpus  - corpus name used to form the metafile names
#
# All paths are relative to the current working directory. Files that do
# not exist are skipped silently.
sub cleanup {
   my $rootdir = shift();
   my $corpus = shift();

   # unlink takes the names literally, so corpus names containing spaces or
   # shell metacharacters cannot be split or expanded the way they were when
   # they were passed to "rm" through a shell.
   unlink(
      "$corpus.list",
      "$corpus.download",
      "$corpus.download.duplicates",
      "$corpus.download.uniq",
      "download/$corpus/empty",
      "download/$corpus/duplicates",
      "download/$corpus/notdownloaded",
   );

   return;
}

copyFilesdescriptionprevnextTop
# Copy local files into the corpus download area and write the list of
# copied files.
#
# Arguments:
#   $filesref - reference to the array of source file paths
#   $corpus   - corpus name (used for the name of the list file)
#   named: dest => directory to copy into (default "")
#          root => directory that receives "<corpus>.download.uniq"
#                  (default "")
#
# Each file is copied to <dest>/<its own path without the leading slash>.
# For every file copied, one line of the form
#   "http://<copied path>"" <original path without the leading slash>"
# is written to the list file. Missing files, and paths without a directory
# part, are reported on STDERR and skipped. Dies if the list file cannot be
# opened or closed, or if a copy fails.
sub copyFiles {
  my $filesref = shift;
  my @files = @$filesref;

  my $corpus = shift;
  my %args = @_;

  my $dest = ( defined $args{dest} ? $args{dest} : "" );
  my $root = ( defined $args{root} ? $args{root} : "" );

  # Write the .uniq file list. Without this file the copied documents would
  # be lost to the later build steps, so failing to open it is fatal.
  my $list_path = "$root/$corpus.download.uniq";
  open(my $list_fh, '>', $list_path)
    or die "Unable to open file '$list_path': $!";

  # Copy each file
  foreach my $file (@files) {
    # skip the file if it doesn't exist
    if (not -f $file) {
      print STDERR "$file does not exist.\n";
      next;
    }

    # Get the name of the directory (remove the leading '/' and the filename
    if ($file =~ m#^\/?((.*)\/[^\/]*)$#) {
      # Name the captures right away instead of relying on $1 surviving
      # the calls below.
      my ($rel_path, $rel_dir) = ($1, $2);
      my $copy_to = $dest . "/" . $rel_path;
      my $directory = $dest . "/" . $rel_dir;

      if (not -d $directory) {
        mkpath($directory);
      }

      # Copy the file
      copy($file, $copy_to) or die "Failed to copy file: $!";

      if (not -e $copy_to) {
        print STDERR "Error copying $copy_to\n";
      }
      else {
        print {$list_fh} "\"http://$copy_to\"\" $rel_path\"\n";
      }
    }
    else {
      print STDERR "Unable to read line: $file\n";
    }
  }

  # Buffered write errors only surface at close time.
  close($list_fh) or die "Unable to close file '$list_path': $!";
}

deleteCorpus()descriptionprevnextTop
# Remove everything stored for this corpus under the root directory:
#   download/<corpus>, corpora/<corpus>, corpus-data/<corpus>,
#   corpus-data/<corpus>-tf and corpus-data/<corpus>-tf-s.
#
# Usage: $cref->deleteCorpus();
#
# Directories that do not exist are skipped. Dies if the object has no root
# directory or corpus name, because the paths would then point at
# directories shared by every corpus.
sub deleteCorpus {
	my $self = shift;
	my $rootdir = $self->{rootdir};
	my $corpus = $self->{corpus};

	die "deleteCorpus: rootdir and corpus name must both be set\n"
		unless defined $rootdir && length $rootdir
			&& defined $corpus && length $corpus;

	# Remove the trees without a shell so names containing spaces or shell
	# metacharacters are taken literally.
	File::Path::rmtree([
		"$rootdir/download/$corpus",
		"$rootdir/corpora/$corpus",
		"$rootdir/corpus-data/$corpus",
		"$rootdir/corpus-data/$corpus-tf",
		"$rootdir/corpus-data/$corpus-tf-s",
	]);

	return;
}

download_urlsdescriptionprevnextTop
# Download a set of URLs into the corpus download area without building the
# corpus itself.
#
# Named arguments:
#   urlsref - reference to the array of URLs to download
#   cleanup - 1 (default) removes the metafiles afterwards via cleanup();
#             any other defined value keeps them
#
# Prints the root directory to STDOUT. Dies if the corpus root directory
# cannot be entered. The working directory is changed and is not restored to
# what it was on entry.
sub download_urls {
    my $self = shift;
    my %args = @_;

    my $rootdir = $self->{rootdir};
    print $rootdir, "\n";
    my $corpus = $self->{corpus};

    my $urlsref = $args{urlsref};
    my $cleanup = ( defined $args{cleanup} ? $args{cleanup} : 1 );

    makeDirs($rootdir, $corpus);

    # Every later step (including the file removals in cleanup) uses paths
    # relative to the current directory, so do not carry on from the wrong
    # place if the root cannot be entered.
    chdir($rootdir)
        or die "Unable to change to directory '$rootdir': $!\n";

    wgetall2($corpus, $urlsref);
    verifyUrl($corpus);
    getUniqList($corpus);

    # NOTE(review): getUniqList reads "../../$corpus.download", so the steps
    # above presumably run two levels below $rootdir; this climbs back up.
    chdir("../..");
    if ( $cleanup == 1 ) {
        cleanup($rootdir, $corpus);
    }
}

getUniqListdescriptionprevnextTop
sub getUniqList {
    my $corpus = shift();
    my $infile = "../../$corpus.download";
    my $outfile = "../../$corpus.download.uniq";
    my $errfile = "../../$corpus.download.duplicates";
    print "Computing unique URL set...\n";

    my %urls = ();
    my %locs = ();

    open (IN, "<$infile");
    open (OUT, ">$outfile");
    open (ERR, ">$errfile");

    foreach my $line (<IN>)  {

        if ($line =~ m/\"(.+)\" \"(.+)\"/)  {

            unless (exists $urls{$1} || exists $locs{$2})  {
                print OUT "\"$1\"\" $2\"\n";
            }
            if (exists $urls{$1} || exists $locs{$2})  {