| Summary | Package variables | Synopsis | Description | General documentation | Methods |
| Summary | Top |
| Clair::ALE::Extract - Extract links and add them to the database |
| Package variables | Top |
| No package variables defined. |
| Included modules | Top |
| Clair::ALE::Stemmer qw ( ale_stemsome ) |
| Clair::ALE::Wget qw ( alefile2url ) |
| Clair::ALE::_SQL |
| Clair::Utils::ALE(1) |
| Clair::Utils::ALE(2) qw ( %ALE_ENV ) |
| File::Find |
| FileHandle |
| HTML::LinkExtractor |
| Synopsis | Top |
my $ex = Clair::ALE::Extract->new(); |
| Description | Top |
| Adds the links found in a given list of files to the ALE database. This is a Perl module version of the original alext script. This module depends on modules that use the ALE environment variables, so those variables must be set before using it. Clair::ALE::Extract->new: $ex = Clair::ALE::Extract->new( verbose => 1 ); Constructs a new Extract object. Set verbose to 1 to print progress information to STDOUT. $ex->extract: Extracts the links from the given list of files (the 'files' parameter). 'drop_tables' is an optional parameter that, when set to true, completely resets the current ALESPACE tables before adding the links. Instead of a list of files, you can optionally specify a CorpusDownload 'rootdir' and 'corpusname' to have ALE index the files downloaded by CorpusDownload (that is, to run ALE on a corpus from CorpusDownload). See CorpusDownload. |
| Methods | Top |
| _create_nonexistant_tables | No description | Code |
| _delete_links | No description | Code |
| _encode | No description | Code |
| _extract_links | No description | Code |
| _get_urlid | No description | Code |
| _insert_link | No description | Code |
| _set_last_updated | No description | Code |
| drop_tables | No description | Code |
| extract | No description | Code |
| new | No description | Code |
| _create_nonexistant_tables | description | prev | next | Top |
# Create the links, urls and words tables if they do not exist yet.
#
# Table names come from the Clair::ALE::_SQL handle stored in
# $self->{alesql}.  Each CREATE TABLE statement is issued through that
# handle's do() method; when do() reports failure the handle's errdie()
# method is called with a message naming the table.
sub _create_nonexistant_tables
{
    my ($self) = @_;
    my $db = $self->{alesql};
    my ($links_table, $urls_table, $words_table) =
        map { $db->$_ } qw(links_table urls_table words_table);

    # Run one DDL statement, reporting failures against the given table.
    my $create = sub {
        my ($ddl, $table) = @_;
        $db->do($ddl) or $db->errdie("Error creating table $table");
    };

    # One row per hyperlink: source page, target page, position on the
    # source page, and the anchor text.
    my @link_columns = (
        "linkid bigint NOT NULL auto_increment",
        "link_from int NOT NULL",
        "link_to int NOT NULL",
        "link_num int NOT NULL",
        "link_text char(255) NOT NULL",
        "PRIMARY KEY(linkid)",
        "UNIQUE KEY(link_from,link_num)",
        "KEY (link_from)",
        "KEY (link_to)",
    );
    $create->(
        "CREATE TABLE IF NOT EXISTS $links_table ("
            . join(", ", @link_columns) . " );",
        $links_table
    );

    # One row per known URL.
    my @url_columns = (
        "urlid int NOT NULL auto_increment",
        "url char(255) NOT NULL",
        "last_updated DATETIME",
        "PRIMARY KEY (urlid)",
        "UNIQUE KEY (url)",
    );
    $create->(
        "CREATE TABLE IF NOT EXISTS $urls_table ( "
            . join(", ", @url_columns) . " )",
        $urls_table
    );

    # One row per word attached to a link.
    my @word_columns = (
        "linkid bigint NOT NULL",
        "word char(25) NOT NULL",
        "KEY (linkid)",
        "KEY (word)",
    );
    $create->(
        "CREATE TABLE IF NOT EXISTS $words_table ( "
            . join(", ", @word_columns) . " )",
        $words_table
    );
}
| _delete_links | description | prev | next | Top |
# Start the removal of previously stored links for $url.
#
# Parameters:
#   $url - URL of the page whose links are to be removed.
#
# Returns undef (after a warning) when no url id can be obtained for $url.
#
# NOTE(review): the body shown here is incomplete.  Everything after the
# url id check is missing from this rendering; the only remaining line is
# a commented-out beginning of a DELETE statement, and that comment line
# also contains the sub's closing brace.  Recover the full body from the
# original module before relying on this block.
sub _delete_links
{
my $self = shift;
my $url = shift;
my $alesql = $self->{alesql};
# Table names are taken from the SQL handle.
my ($links_table, $urls_table, $words_table) =
($alesql->links_table, $alesql->urls_table, $alesql->words_table);
# Reset the per-page link counter; _insert_link stores it as link_num
# and increments it after each inserted link.
$self->{link_count} = 0;
my $urlid = $self->_get_urlid($url);
# Nothing can be deleted without a url id.
unless ($urlid) {
warn "Couldn't get urlid for $url";
return undef;
}
#my $sql = "DELETE $links_table, $words_table FROM $links_table AS links "} |
| _encode | description | prev | next | Top |
# Percent-encode tab, carriage return and newline characters in a string.
#
# Parameters:
#   $str - the string to encode; may be undef.
#
# Returns the string with every tab replaced by "%09", every carriage
# return by "%0D" and every newline by "%0A".  Returns the empty string
# when $str is undef or empty.
sub _encode
{
    my $self = shift;
    my $str  = shift;

    # Test definedness and length rather than truth: the string "0" is
    # false in Perl but is still a value that must be passed through.
    return "" unless defined $str && length $str;

    $str =~ s/\t/%09/g;
    $str =~ s/\r/%0D/g;
    $str =~ s/\n/%0A/g;
    return $str;
}
| _extract_links | description | prev | next | Top |
# Parse one downloaded file and record every <a> link found in it.
#
# Parameters:
#   $url - URL the file was downloaded from; passed to HTML::LinkExtractor
#          as the base URL and recorded as the source of each link.
#   $fn  - path of the local file to parse.
#
# Returns 1 on success, or undef (after a warning) when the file cannot be
# opened.  Dies if closing the file fails.
sub _extract_links
{
    my $self = shift;
    my ($url, $fn) = @_;

    # Pass the mode as a separate argument.  With the mode embedded in the
    # file name string ("< $fn"), open() strips surrounding whitespace from
    # the name and interprets mode/pipe characters contained in it.
    my $f = FileHandle->new($fn, "r");
    unless ($f) {
        warn "Couldn't open $fn: $!";
        return undef;
    }

    my $lx = HTML::LinkExtractor->new(undef, $url);
    $lx->strip(1);
    $lx->parse($f);

    foreach my $link (@{ $lx->links() }) {
        # Only anchor tags are recorded.
        next unless $link->{tag} eq 'a';

        my $type = $self->_encode($link->{tag});
        my $text = $self->_encode($link->{_TEXT});
        my $href = $self->_encode($link->{href});

        # Normalise the target: drop a trailing "/index.html", then a
        # trailing slash.
        $href =~ s/\/index\.html$//;
        $href =~ s/\/$//;

        # Skip targets that still contain control or non-ASCII bytes.
        if ($href =~ /[\x00-\x1f\x7f-\xff]/) {
            warn "Bad link\# $self->{link_count} from page $url";
            next;
        }

        $self->_insert_link($url, $type, $text, $href);
    }

    close($f) or die "Error closing '$fn': $!";
    return 1;
}
| _get_urlid | description | prev | next | Top |
# Obtain the url id for $url.
#
# Parameters:
#   $url - the URL to look up; values longer than 255 characters are
#          truncated to their first 255 characters before the lookup.
#
# NOTE(review): the body shown here is incomplete.  It stops at the
# "First try to get it from the cache" comment, and that comment line also
# contains the sub's closing brace, so the lookup logic and the return
# value are not visible in this rendering.  Recover the full body from the
# original module before relying on this block.
sub _get_urlid
{
my $self = shift;
my $url = shift;
my $alesql;
# This sub uses its own Clair::ALE::_SQL handle, created on first use and
# then kept in $self->{get_urlid_sql} for later calls.
if ($self->{get_urlid_sql}) {
$alesql = $self->{get_urlid_sql};
} else {
$alesql = Clair::ALE::_SQL->new();
}
$self->{get_urlid_sql} = $alesql;
my ($links_table, $urls_table, $words_table) =
($alesql->links_table, $alesql->urls_table, $alesql->words_table);
# NOTE(review): this copies the cache hash rather than aliasing it, so
# entries added to %get_urlid_cache do not reach $self->{get_urlid_cache}
# unless the missing part of the body writes them back - confirm.
my %get_urlid_cache = %{$self->{get_urlid_cache}};
# The url column is declared char(255) in _create_nonexistant_tables, so
# longer URLs are cut to 255 characters.
if (length($url) > 255) {
$url = substr($url, 0, 255);
}
# First try to get it from the cache} |
| _insert_link | description | prev | next | Top |
# Store one link, plus the stemmed words of its anchor text.
#
# Parameters:
#   $url  - URL of the page the link appears on.
#   $type - tag type of the link (only used in the warning message).
#   $text - anchor text of the link.
#   $href - target URL of the link.
#
# Returns 1 on success.  Returns undef when either end of the link is
# missing (after a warning) or when a url id cannot be obtained for either
# end.  Dies when no link id is reported after the insert; SQL failures
# are reported through the handle's errdie() method.
sub _insert_link
{
    my ($self, $url, $type, $text, $href) = @_;
    my $db = $self->{alesql};
    my ($links_table, $urls_table, $words_table) =
        map { $db->$_ } qw(links_table urls_table words_table);

    # A link needs both a source and a target.
    if (!$url || !$href) {
        warn "Link to/from nothing ($url,$type,$text,$href)\n";
        return undef;
    }

    my $from_id = $self->_get_urlid($url);
    return undef unless $from_id;
    my $to_id = $self->_get_urlid($href);
    return undef unless $to_id;

    # link_num is the running per-page counter kept on the object.
    my @row = ($from_id, $to_id, $self->{link_count}, $text);
    my $values = join(", ", map { $db->quote($_) } @row);
    my $insert_link =
        "INSERT INTO $links_table (link_from, link_to, link_num, link_text)"
        . " VALUES ($values)";
    $db->do($insert_link) or $db->errdie("Error inserting new link");

    my $linkid = $db->insertid() or die "Didn't get a link ID!";

    # Split the anchor text on anything that is not an ASCII letter,
    # lower-case the pieces, stem them, and keep the non-empty results.
    my @tokens = map { lc } split(/[^a-zA-Z]+/, $text);
    my @words  = grep { /./ } ale_stemsome(@tokens);

    if (@words) {
        # All words of the link go into a single multi-row INSERT.
        my @tuples = map {
            my $quoted_id   = $db->quote($linkid);
            my $quoted_word = $db->quote($_);
            "($quoted_id, $quoted_word)"
        } @words;
        my $insert_words = "INSERT INTO $words_table (linkid, word) VALUES "
            . join(", ", @tuples);
        $db->do($insert_words) or $db->errdie("Error inserting new words");
    }

    $self->{link_count}++;
    return 1;
}
| _set_last_updated | description | prev | next | Top |
# Set the last_updated timestamp of a URL to the database's current time.
#
# Parameters:
#   $url - the URL whose row in the urls table is to be stamped.
#
# Returns 1 on success.  Returns undef (after a warning) when no url id can
# be obtained for $url.  SQL failures are reported through the handle's
# errdie() method.
sub _set_last_updated
{
    my $self = shift;
    my $url  = shift;

    # Without a url id the UPDATE would compare against NULL, match no
    # row, and still be reported as a success.  Report the problem the
    # same way _delete_links does.
    my $urlid = $self->_get_urlid($url);
    unless ($urlid) {
        warn "Couldn't get urlid for $url";
        return undef;
    }

    my $alesql     = $self->{alesql};
    my $urls_table = $alesql->urls_table;

    my $sql = "UPDATE $urls_table SET last_updated = NOW() WHERE "
        . "urlid=" . $alesql->quote($urlid);
    $alesql->do($sql) or $alesql->errdie("Error setting last_updated");
    return 1;
}
| drop_tables | description | prev | next | Top |
# Drop the links, urls and words tables if they exist.
#
# Table names come from the Clair::ALE::_SQL handle stored in
# $self->{alesql}.  When the DROP statement fails, the handle's errdie()
# method is called with a message listing the three tables.
sub drop_tables
{
    my ($self) = @_;
    my $db = $self->{alesql};
    my ($links_table, $urls_table, $words_table) =
        map { $db->$_ } qw(links_table urls_table words_table);

    # The same comma-separated list appears in the statement and in the
    # error message.
    my $table_list = "$links_table, $urls_table, $words_table";

    $db->do("DROP TABLE IF EXISTS $table_list")
        or $db->errdie("Couldn't drop tables $table_list");
}
| extract | description | prev | next | Top |
# Extract the links of a set of files and store them in the ALE database.
#
# Named parameters:
#   files       - reference to an array of file names to process, OR
#   rootdir and corpusname
#               - index every plain file found below
#                 "<rootdir>/download/<corpusname>"; for the duration of
#                 the call ALESPACE is set to the corpus name and ALECACHE
#                 to that directory.
#   drop_tables - optional; when true the tables are dropped before the
#                 run.
#
# Dies when neither 'files' nor both 'rootdir' and 'corpusname' are given.
# Progress messages are printed to STDOUT when the object is verbose.
sub extract
{
    my $self = shift;
    my %args = @_;
    my $files;

    # Localise the two settings this method may override.  Unlike a manual
    # save/restore at the end of the sub, local() also restores them when
    # something below dies part-way through the run.
    local $ALE_ENV{ALESPACE} = $ALE_ENV{ALESPACE};
    local $ALE_ENV{ALECACHE} = $ALE_ENV{ALECACHE};

    if ($args{files}) {
        $files = $args{files};
    } elsif ($args{rootdir} && $args{corpusname}) {
        $ALE_ENV{ALESPACE} = $args{corpusname};
        $ALE_ENV{ALECACHE} = "$args{rootdir}/download/$args{corpusname}";
        my @found;
        find(sub { push @found, $File::Find::name if (-f $_); },
             $ALE_ENV{ALECACHE});
        $files = \@found;
    } else {
        die "Must specify either 'files' or 'rootdir' and 'corpusname'";
    }

    if ($args{drop_tables}) {
        print "Dropping tables\n" if $self->{verbose};
        $self->drop_tables();
    }

    print "Creating tables\n" if $self->{verbose};
    $self->_create_nonexistant_tables();

    my $total_added = 0;
    foreach my $file (@$files) {
        unless (-f $file) {
            warn "Skipping $file, not a file";
            next;
        }

        print "Converting $file to url\n" if $self->{verbose};
        my $url = alefile2url($file);
        unless ($url) {
            warn "Couldn't get url for $file";
            next;
        }

        print "Running on $url\n" if $self->{verbose};
        print "Deleting links\n" if $self->{verbose};
        $self->_delete_links($url);

        print "Extracting links\n" if $self->{verbose};
        # _extract_links has already warned when it returns false.  Such a
        # page is neither timestamped nor counted as added.
        $self->_extract_links($url, $file) or next;

        print "Updating timestamp\n" if $self->{verbose};
        $self->_set_last_updated($url);
        $total_added++;
    }

    print "Added $total_added pages\n" if $self->{verbose};
}
| new | description | prev | next | Top |
# Construct a new extractor object.
#
# Named parameters:
#   verbose - stored on the object; the extract method prints progress
#             messages when it is true.
#
# The object starts with a fresh Clair::ALE::_SQL handle, an empty url id
# cache and a link counter of 0.
sub new
{
    my ($class, %args) = @_;

    my $sql_handle = Clair::ALE::_SQL->new();

    return bless {
        alesql          => $sql_handle,
        get_urlid_cache => {},
        link_count      => 0,
        verbose         => $args{verbose},
    }, $class;
}
| SEE ALSO | Top |
| Clair::Utils::ALE, Clair::ALE::Conn, Clair::ALE::Link, Clair::ALE::URL. |
| AUTHOR | Top |
| Tony Fader (afader@umich.edu) |