| Summary | Package variables | Synopsis | General documentation | Methods |
| Summary | Top |
| Clair::Polisci::AustralianParser - A class for parsing Australian hansard html. |
| Package variables | Top |
| No package variables defined. |
| Included modules | Top |
| HTML::TokeParser |
| XML::Writer |
| Synopsis | Top |
|
| Description | Top |
| Methods | Top |
| clean_header | No description | Code |
| convert_tags | No description | Code |
| double_pop_token | No description | Code |
| get_header | Description | Code |
| get_speeches | Description | Code |
| new | Description | Code |
| push_onto_speeches | No description | Code |
| set_out | Description | Code |
| write_xml | Description | Code |
| get_header | code | next | Top |
Returns a hashref containing header key/value pairs. Usage: my $header = $p->get_header(); |
| get_speeches | code | prev | next | Top |
Returns an arrayref containing hashrefs to speech info. Usage: my $speeches = $p->get_speeches(); |
| new | code | prev | next | Top |
my $out = \*OUT; |
| set_out | code | prev | next | Top |
my $out = \*OUT; |
| write_xml | code | prev | next | Top |
| Converts the data from $header and $speeches into XML and prints it to "out". |
| clean_header | description | prev | next | Top |
# clean_header($header)
#
# Normalises the "Date", "Time" and "Type" entries of the header hashref
# in place:
#   Date  "5 March, 2007"  ->  "20070305"   (YYYYMMDD)
#   Time  "2.30 p.m."      ->  "1430"       (24-hour HHMM)
#   Type  left alone when the key exists, otherwise set to ""
#
# A missing or undefined Date/Time becomes "".  A value that is present
# but does not match the expected pattern (or names an unknown month) is
# left exactly as it was, rather than being rebuilt from stale regex
# captures.
#
# Parameters: $header - hashref of header fields; modified in place.
# Returns:    nothing useful.
sub clean_header {
    my $header = shift;

    my %month_map = (
        "January"   => "01",
        "February"  => "02",
        "March"     => "03",
        "April"     => "04",
        "May"       => "05",
        "June"      => "06",
        "July"      => "07",
        "August"    => "08",
        "September" => "09",
        "October"   => "10",
        "November"  => "11",
        "December"  => "12",
    );

    if (defined $header->{"Date"}) {
        # Only use the captures when this very match succeeded and the
        # month name is one we can translate.
        if ($header->{"Date"} =~ /(\d\d?) (\w+), (\d\d\d\d)/
            and exists $month_map{$2})
        {
            my ($d, $m, $y) = ($1, $2, $3);
            $header->{"Date"} = sprintf("%s%s%02d", $y, $month_map{$m}, $d);
        }
    } else {
        $header->{"Date"} = "";
    }

    if (defined $header->{"Time"}) {
        if ($header->{"Time"} =~ /(\d\d?)\.(\d\d) ([pa])\.m\./) {
            my ($h, $m, $x) = ($1, $2, $3);
            # 12-hour clock: 12 a.m. is hour 0 and 12 p.m. is hour 12, so
            # fold 12 down to 0 before adding the afternoon offset.
            $h %= 12;
            $h += 12 if $x eq "p";
            $header->{"Time"} = sprintf("%02d%s", $h, $m);
        }
    } else {
        $header->{"Time"} = "";
    }

    if (!exists $header->{"Type"}) {
        $header->{"Type"} = "";
    }

    return;
}
| convert_tags | description | prev | next | Top |
# convert_tags($writer, $body)
#
# Replays $body through $writer, turning simple inline markup back into
# real elements: text between tags goes to $writer->characters, a tag of
# the form <name> becomes $writer->startTag("name") and </name> becomes
# $writer->endTag("name").  Any other tag-like span (for example one
# carrying attributes) is dropped without output, as before.
#
# The body is scanned once from left to right with /s, so tags that
# follow an embedded newline are converted too and a trailing newline is
# kept in the final text chunk.
#
# Parameters: $writer - object providing characters/startTag/endTag
#             $body   - string to convert (undef is treated as "")
# Returns:    whatever the final $writer->characters call returns.
sub convert_tags {
    my $writer = shift;
    my $body   = shift;
    $body = "" unless defined $body;

    # Offset just past the last tag handled; everything after it is the
    # trailing text chunk.
    my $consumed = 0;

    while ($body =~ /\G(.*?)(<\/?[^>]+>)/gs) {
        # Copy the captures before the inner matches below overwrite them.
        my ($text, $tag) = ($1, $2);
        $consumed = pos($body);

        $writer->characters($text);
        if ($tag =~ /<(\w+)>/) {
            $writer->startTag($1);
        } elsif ($tag =~ /<\/(\w+)>/) {
            $writer->endTag($1);
        }
    }

    return $writer->characters(substr($body, $consumed));
}
| double_pop_token | description | prev | next | Top |
# double_pop_token($stream)
#
# Reads two tokens from $stream via get_token, throws the first away and
# hands back the second.
sub double_pop_token {
    my ($stream) = @_;

    $stream->get_token();    # skipped on purpose

    return $stream->get_token;
}
| get_header | description | prev | next | Top |
# get_header
#
# Scans the HTML file named by $self->{file} for <span> tags and collects
# header fields into %header.
#
# NOTE(review): this listing stops at the trailing comment below; the
# remainder of the routine (time lookup, any clean-up and the return of
# the header) is not visible here.
sub get_header
{
my $self = shift;
my $filename = $self->{file};
my $stream = HTML::TokeParser->new($filename);
my ($key, $val);
my %header;
# Visit every <span> start tag; $token->[1] is used as its attribute hash.
while (my $token = $stream->get_tag("span")) {
# Spans whose id ends in "Label" plus one digit carry key/value pairs.
if ($token->[1]{id} && $token->[1]{id} =~ /Label(\d)$/) {
if ($1 eq "2") {
# "Label2": skip one token and remember element [1] of the next as the key
# (presumably the text inside a nested tag - confirm against the markup).
$token = double_pop_token($stream);
$key = $token->[1];
} elsif ($1 eq "3") {
# "Label3": element [1] of the very next token is the value; it is stored
# under the most recently seen key, if there is one.
$token = $stream->get_token;
$val = $token->[1];
if ($key) {
$header{$key} = $val;
}
}
} elsif ($token->[1]{id} && $token->[1]{id} eq "txtTitle") {
# The span with id "txtTitle" supplies the "Title" entry.
$token = double_pop_token($stream);
$header{"Title"} = $token->[1];
}
}
# Get the time by simply searching line by line} |
| get_speeches | description | prev | next | Top |
# get_speeches
#
# Opens a token stream over the HTML file named by $self->{file} and sets
# up the state used while collecting speeches.
#
# NOTE(review): only the set-up is visible in this listing; it stops at
# the trailing comment below, so how the counters and flags are used
# cannot be confirmed from here.
sub get_speeches
{
my $self = shift;
my $filename = $self->{file};
my $stream = HTML::TokeParser->new($filename);
# Accumulator for the speeches found.
my @speeches;
my $token;
# Counters and flags, all starting at 0 except $divcount, which starts at 1.
my $keepcount = 0;
my $divcount = 1;
my $slurp_body = 0;
my $quote_level = 0;
my $found_quote = 0;
my $motion_level = 0;
my $found_motion = 0;
# Fields of the speech currently being assembled, all initially empty.
my ($speech_body, $speaker_id, $speech_type) = ("", "", "");
# Jump to speech} |
| new | description | prev | next | Top |
# new(%parameters)
#
# Constructor.  Recognised parameters:
#   file - stored as-is under the "file" key (undef when not supplied)
#   out  - stored as-is under the "out" key; a reference to the STDOUT
#          glob is used when the parameter is not supplied
# Returns the new object, a hashref blessed into $class.
sub new {
    my ($class, %parameters) = @_;

    # Defaults first, then let any supplied parameter override them.
    my %self = (
        out  => \*STDOUT,
        file => undef,
    );
    for my $field (qw(out file)) {
        $self{$field} = $parameters{$field} if exists $parameters{$field};
    }

    return bless \%self, $class;
}
| push_onto_speeches | description | prev | next | Top |
# push_onto_speeches($speeches, $type, $speaker, $body)
#
# Appends a { type, speaker, body } record to the @$speeches array, but
# only when $body is not the empty string.  When a speaker is given, the
# start of the body up to and including the first dash character is
# removed before the record is stored.
sub push_onto_speeches {
    my ($speeches, $type, $speaker, $body) = @_;

    if ($body ne "") {
        if ($speaker) {
            $body =~ s/^.*?\p{Pd}//g;
        }
        my %record = (
            type    => $type,
            speaker => $speaker,
            body    => $body,
        );
        push @{$speeches}, \%record;
    }
}
| set_out | description | prev | next | Top |
# set_out($out)
#
# Replaces the value stored under the object's "out" key with $out.
sub set_out {
    my ($self, $out) = @_;
    $self->{out} = $out;
}
| write_xml | description | prev | next | Top |
# write_xml
#
# Writes the header and speeches as XML to $self->{out} through an
# XML::Writer.
#
# NOTE(review): this listing stops inside the speech loop (see the last
# line), so the handling of speeches with a speaker and the closing of
# the open elements are not visible here.
sub write_xml
{
my $self = shift;
my ($header, $speeches);
# Use the header stored on the object when that key exists, otherwise parse it.
if (exists $self->{header}) {
$header = $self->{header};
} else {
$header = $self->get_header();
}
# Same for the speeches.
if (exists $self->{speeches}) {
$speeches = $self->{speeches};
} else {
$speeches = $self->get_speeches();
}
my $writer = new XML::Writer(OUTPUT => $self->{out}, NEWLINES => 1);
# Prolog: XML declaration and a doctype for "record" naming record_aus.dtd.
$writer->xmlDecl("UTF-8");
$writer->doctype("record", undef, "record_aus.dtd");
$writer->startTag("record");
$writer->startTag("header");
# date, source, type and title are always written.
$writer->startTag("date");
$writer->characters($header->{"Date"});
$writer->endTag("date");
$writer->startTag("source");
$writer->characters($header->{"Source"});
$writer->endTag("source");
$writer->startTag("type");
$writer->characters($header->{"Type"});
$writer->endTag("type");
$writer->startTag("title");
$writer->characters($header->{"Title"});
$writer->endTag("title");
# The remaining header elements are written only when the header has the key.
unless (!exists $header->{"Main Committee"}) {
$writer->startTag("main-committee");
$writer->characters($header->{"Main Committee"});
$writer->endTag("main-committee");
}
unless (!exists $header->{"Proof"}) {
$writer->startTag("proof");
$writer->characters($header->{"Proof"});
$writer->endTag("proof");
}
unless (!exists $header->{"Stage"}) {
$writer->startTag("stage");
$writer->characters($header->{"Stage"});
$writer->endTag("stage");
}
unless (!exists $header->{"Context"}) {
$writer->startTag("context");
$writer->characters($header->{"Context"});
$writer->endTag("context");
}
unless (!exists $header->{"Time"}) {
$writer->startTag("time");
$writer->characters($header->{"Time"});
$writer->endTag("time");
}
$writer->endTag("header");
$writer->startTag("body");
foreach my $speech (@$speeches) {
# An entry with no speaker (false or empty) opens a <nonspeech> element; its
# body goes through convert_tags instead of being written as plain characters.
if (!$speech->{"speaker"} or $speech->{"speaker"} eq "") {
$writer->startTag("nonspeech");
convert_tags($writer, $speech->{"body"});
#$writer->characters($speech->{"body"});} |
| VERSION | Top |
| Version 0.01 |
| FUNCTIONS | Top |