Clair::MEAD

DocsentConverter


SummaryPackage variablesSynopsisGeneral documentationMethods

SummaryTop
Clair::MEAD::DocsentConverter - Document => Mead Cluster converter
=head1 VERSION
Version 0.01

Package variablesTop
No package variables defined.

Included modulesTop
Clair::Config
Clair::Document
MEAD::MEAD_ADDONS_UTIL
lib " $MEAD_HOME /lib "

SynopsisTop
This module is used to take documents from a directory, Clair::Document
objects, or Clair::Cluster objects and create a Mead-style cluster directory.

use Clair::MEAD::DocsentConverter;
use Clair::Document;
use Clair::Cluster;
my $doc = Clair::Document->new( ... );
my $cluster = Clair::Cluster->new( ... );
my $dir = "some/path";
my $file = "file.txt";
...
my $c = Clair::MEAD::DocsentConverter->new(
dest => "outputdir",
name => "mycluster" );
$c->add_document($doc);
$c->add_cluster($cluster);
$c->add_directory($dir);
$c->add_file($file);
$c->convert();

DescriptionTop
No description!
MethodsTop
_clean_idsNo descriptionCode
add_clusterDescriptionCode
add_directoryDescriptionCode
add_documentDescriptionCode
add_fileDescriptionCode
convertDescriptionCode
newDescriptionCode

Methods description


add_clustercode    nextTop
Adds all of the documents in the given Clair::Cluster to the cluster.

add_directorycodeprevnextTop
Adds all of the files in the given directory to the cluster.

add_documentcodeprevnextTop
Adds the given Clair::Document to the cluster.

add_filecodeprevnextTop
Adds the given file to the cluster.

convertcodeprevnextTop
Converts the previously added files to a Mead cluster in the directory
set in the constructor.

newcodeprevnextTop
    $c = Clair::MEAD::DocsentConverter->new(
dest => "output/path",
lang => "ENG",
name => "mycluster"
);
Creates a new converter object. "dest" is required; "lang" defaults to "ENG"
and "name" defaults to the value of "dest".

Methods code


_clean_idsdescriptionprevnextTop
# _clean_ids()
#
# Private helper: rewrites the id of every document currently queued in
# $self->{docs}. For each document the id is read with get_id(), everything
# up to and including the last "/" is removed, every occurrence of
# ".docsent" is removed, and the result is stored back with
# set_id(id => ...).
sub _clean_ids {
    my ($self) = @_;

    # Work on a snapshot of the list; the document objects themselves are
    # shared, so the id changes are visible through $self->{docs}.
    my @queued = @{ $self->{docs} };

    for my $document (@queued) {
        # Drop any leading path components from the id.
        (my $bare_id = $document->get_id()) =~ s/.*?\///g;

        # Drop the ".docsent" marker wherever it appears.
        $bare_id =~ s/\.docsent//g;

        $document->set_id(id => $bare_id);
    }
}

add_clusterdescriptionprevnextTop
# add_cluster($cluster)
#
# Appends every document held by $cluster to the converter's queue.
# $cluster->documents() must return a hash reference; only its values are
# used, so the order in which the new documents are appended follows Perl's
# hash ordering and is not defined.
#
# The queue is replaced by a new array reference rather than extended in
# place.
sub add_cluster {
    my ($self, $cluster) = @_;

    my @queued = @{ $self->{docs} };
    my $by_key = $cluster->documents();

    $self->{docs} = [ @queued, values %$by_key ];
}

add_directorydescriptionprevnextTop
# add_directory($dir)
#
# Creates a Clair::Document for every plain file directly inside $dir and
# appends the documents to the converter's queue. Sub-directories and other
# non-file entries are skipped; the directory is not searched recursively.
#
# Each document is constructed with:
#   file => "$dir/<entry name>"
#   type => "html" when the entry name ends in ".html", otherwise "text"
#   id   => the entry name
#
# Entries are processed in sorted name order so that repeated runs over the
# same directory queue the documents in the same order.
#
# Dies with "Expected directory: ..." when $dir is not a directory and with
# "Could not read directory: ..." when it cannot be opened.
sub add_directory {
    my $self = shift;
    my $dir  = shift;

    my @docs = @{ $self->{docs} };

    die "Expected directory: $!" unless (-d $dir);

    # Lexical handle, released with closedir(): close() does not close a
    # directory handle, so the original handle was never released.
    opendir(my $dh, $dir) or die "Could not read directory: $!";
    my @entries = sort readdir($dh);
    closedir($dh);

    foreach my $entry (@entries) {
        my $path = "$dir/$entry";
        next unless -f $path;

        my $type = ($entry =~ /\.html$/) ? "html" : "text";

        # readdir() returns bare entry names without any "/", so the entry
        # name itself is the id; no capture variable is needed.
        push @docs, Clair::Document->new(
            file => $path,
            type => $type,
            id   => $entry,
        );
    }

    $self->{docs} = \@docs;
}

add_documentdescriptionprevnextTop
# add_document(@documents)
#
# Appends the given document object(s) to the converter's queue, keeping
# the order in which they were passed. The queue is replaced by a new
# array reference rather than extended in place.
sub add_document {
    my ($self, @incoming) = @_;

    my @queued = @{ $self->{docs} };

    $self->{docs} = [ @queued, @incoming ];
}

add_filedescriptionprevnextTop
# add_file(@files)
#
# Creates a Clair::Document for each given file path and appends the
# documents to the converter's queue, in the order given.
#
# Each document is constructed with:
#   file => the path as passed in
#   type => "html" when the path ends in ".html", otherwise "text"
#   id   => the last path component (the text after the final "/")
#
# When a path has no final component (for example an empty string or a path
# ending in "/"), the path itself is used as the id. The previous code read
# $1 without checking the match, so such a path picked up a stale or
# undefined capture.
sub add_file {
    my $self  = shift;
    my @files = @_;

    my @docs = @{ $self->{docs} };

    foreach my $file (@files) {
        my $type = ($file =~ /\.html$/) ? "html" : "text";

        # Only trust the capture when the match actually succeeded.
        my $id = ($file =~ /([^\/]+)$/) ? $1 : $file;

        push @docs, Clair::Document->new(
            file => $file,
            type => $type,
            id   => $id,
        );
    }

    $self->{docs} = \@docs;
}

convertdescriptionprevnextTop
# convert()
#
# Writes the queued documents out as a Mead cluster below $self->{dest}:
#
#   <dest>/docsent/<id>.docsent  one file per document: the docsent header,
#                                one <S> element per sentence, the docsent tail
#   <dest>/orig/<id>             the document text as returned by get_text()
#   <dest>/<name>.cluster        the cluster header, one <D DID="..."/> line
#                                per document, the cluster tail
#
# Document ids are normalised with _clean_ids() first, and strip_html() is
# called on each document before it is split into sentences. Missing output
# directories are created. <name> is $self->{name} with any leading path
# components removed.
#
# The header/tail helpers and sanitize() are not defined in this file;
# presumably they are imported from MEAD::MEAD_ADDONS_UTIL - confirm.
#
# Dies when a directory cannot be created or when any output file cannot be
# opened or completely written. Returns 1 on success.
sub convert {
    my $self = shift;
    $self->_clean_ids();
    my @docs = @{ $self->{docs} };

    my $docsent_dir = "$self->{dest}/docsent";
    my $orig_dir    = "$self->{dest}/orig";

    # The parent must be created before its two sub-directories.
    foreach my $dir ($self->{dest}, $docsent_dir, $orig_dir) {
        next if -d $dir;
        mkdir($dir) or die $!;
    }

    # Create the docsent files and the originals
    foreach my $doc (@docs) {
        my $id       = $doc->get_id();
        my $out_file = "$docsent_dir/$id.docsent";

        $doc->strip_html();

        open(my $docsent_fh, '>', $out_file)
            or die "Could not write to $out_file: $!";
        print {$docsent_fh} get_docsent_header($id);

        # Sentences are numbered from 1; PAR and RSNT are always written as 1.
        my $i = 1;
        foreach my $sentence ($doc->split_into_sentences()) {
            $sentence = sanitize($sentence);
            print {$docsent_fh}
                "<S PAR=\"1\" RSNT=\"1\" SNO=\"$i\">$sentence</S>\n";
            $i++;
        }
        print {$docsent_fh} get_docsent_tail();

        # Buffered write errors only surface at close, so check it.
        close($docsent_fh) or die "Could not write to $out_file: $!";

        my $orig_file = "$orig_dir/$id";
        open(my $orig_fh, '>', $orig_file)
            or die "Could not write to $orig_file: $!";
        print {$orig_fh} $doc->get_text();
        close($orig_fh) or die "Could not write to $orig_file: $!";
    }

    # Create the cluster file
    my $outname = $self->{name};
    $outname =~ s/.*?\///g;
    my $cluster_file = "$self->{dest}/$outname.cluster";

    open(my $cluster_fh, '>', $cluster_file)
        or die "Could not write to $cluster_file: $!";
    print {$cluster_fh} get_cluster_header($self->{lang});
    foreach my $doc (@docs) {
        my $id = $doc->get_id();
        $id =~ s/.*\///g;
        print {$cluster_fh} "<D DID=\"$id\"/>\n";
    }
    print {$cluster_fh} get_cluster_tail();
    close($cluster_fh) or die "Could not write to $cluster_file: $!";

    return 1;
}

newdescriptionprevnextTop
# new(%parameters)
#
# Constructor. Recognised parameters:
#   dest - output directory; required, dies with "Expected directory" when
#          it is not defined
#   lang - language code; set to "ENG" when not defined
#   name - cluster name; set to the value of dest when not defined
#
# Any other key/value pairs are stored on the object unchanged. The
# document queue (docs) always starts out empty, even if a "docs" key was
# passed in.
sub new {
    my ($class, %args) = @_;

    die "Expected directory" unless defined $args{dest};

    $args{lang} = "ENG"       unless defined $args{lang};
    $args{name} = $args{dest} unless defined $args{name};

    # Every converter starts with its own empty queue.
    $args{docs} = [];

    return bless \%args, $class;
}

General documentation


No general documentation available.