Clair::Nutch

Search


SummaryPackage variablesSynopsisGeneral documentationMethods

SummaryTop
Clair::Nutch::Search - A class for performing simple Nutch searches.

Package variablesTop
Privates (from "my" definitions)
$SEARCH_CLASS = "edu.umich.si.clair.nutch.SimpleSearch"

Included modulesTop
Clair::Cluster
Clair::Document

SynopsisTop
    #!/usr/bin/perl -w
    use strict;
    use Clair::Nutch::Search;
    my $search = Clair::Nutch::Search->new(
        nutch_home => "/path/to/nutch",
        index_path => "/path/to/index"
    );
    # Returns a list of hits, where each hit is a hashref
    my @results = $search->query("cat rabies", 20);
    foreach my $hit (@results) {
        foreach my $key (keys %$hit) {
            print "$key => $hit->{$key}\n";
        }
    }

DescriptionTop
No description!
MethodsTop
newDescriptionCode
queryDescriptionCode
query_clusterDescriptionCode

Methods description


newcode    nextTop
Takes two required parameters: "nutch_home" (the path to nutch) and
"index_path" (the path to a Nutch index directory [it will contain db and
segments]).

querycodeprevnextTop
    $search->query($query, $numhits)
Queries Nutch with the given query (required) and returns a list of at most
$numhits hits (optional, defaults to 10). Each hit is a hashref.

query_clustercodeprevnextTop
    $search->query_cluster($query, $numhits)
Queries Nutch with the given query (required) and returns at most $numhits
hits (optional, defaults to 10) in a Clair::Cluster. Hits without a "content"
field are left out of the cluster. The id of each document is set to the
query followed by the index of the hit.

Methods code


newdescriptionprevnextTop
sub new {
    my $class = shift;
    my %params = @_;

    die "'nutch_home' is a required field" 
        unless (defined $params{nutch_home});

    die "'index_path' is a required field"
        unless (defined $params{index_path});

    my $self = bless\% params, $class;

    return $self;
}

querydescriptionprevnextTop
sub query {
    my $self = shift;

    my $query = shift;
    my $hits = shift;
    
    unless ($hits) {
        $hits = 10;
    }

    my $script = "$self->{nutch_home}/bin/nutch";
    my $command = "$script $SEARCH_CLASS $self->{index_path} '$query' $hits";

    unless (defined $self->{verbose}) {
        $command .= " 2>/dev/null";
    }

    my @lines = `$command`;
    my @result;
    foreach my $line (@lines) {
        my @pairs = split /\t/, $line;
        my %hit = @pairs;
        for (keys %hit) {
            if ($_ =~ /^\s*$/) {
                delete $hit{$_};
            }
        }
        push @result,\% hit;
    }
    return @result;
}

query_clusterdescriptionprevnextTop
sub query_cluster {
    my $self = shift;
    my $query = shift;
    my $hits = shift;

    my @hits = $self->query($query, $hits);
    my $cluster = Clair::Cluster->new();
    my $i = 1;

    my $clean_query = $query;
    $clean_query =~ s/\s/_/;
    foreach my $hit (@hits) {
        if (defined $hit->{content}) {
            my $text = $hit->{content};
            my $id = "$clean_query$i";
            my $doc = new Clair::Document(
                string => $text,
                type => "text",
                id => $id
            );
            $cluster->insert($id, $doc);
        }
        $i++;
    }

    return $cluster;
}

General documentation


VERSIONTop
Version 0.01