#!/usr/bin/perl

# to see the documentation, type:
# perldoc clean_pages_from_url_list.pl

use strict;
use lib('/corpora/tools');
use warnings;
#use Text::PotaModule; once it's on CPAN
use PotaModule;

use Getopt::Long;

#autoflush STDOUT 1;

# Short usage text, printed when the command line cannot be parsed.
my $usage = <<"_USAGE_";

This script takes html docs from the input file list and formats their contents as text, applying
various filters.

To see documentation and usage, type:

perldoc $0

Copyright 2005-2006, Marco Baroni, Serge Sharoff

This program is free software. You may copy or redistribute it under
the same terms as Perl itself.

_USAGE_

# type/token thresholds for words from the bad word list
# initialized to default values (overridden by -f)
my $maxbadtypes = 3;
my $maxbadtokens = 10;

# type/token minima and minimum token ratio for words from the good word list
# initialized to default values (overridden by -k, zeroed when no list is given)
my $mingoodtypes = 10;
my $mingoodtokens = 30;
my $mingoodratio = 0.25;

# minimum/maximum size of doc before processing
# defaults to 1048/204801
# (bytes, or equivalent char numbers in "character semantics")
my $minbytesize = 1048;
my $maxbytesize = 204801;

my $inlanguage='en';

my $external_tokeniser_call;
my ($logfile);

# raw option values: bad word file, bad thresholds, good word file,
# good thresholds, min/max size
my ($b,$f,$g,$k,$m);

# --help is provided by Getopt::Long itself (auto_help).
Getopt::Long::Configure('auto_help');

# Every option takes a string value. In particular --tokeniser expects the
# command to run, so it must be declared with "=s"; as a bare flag it would
# be set to 1 and the command would be left in @ARGV as an input file.
GetOptions(
    'bad=s'       => \$b,
    'f=s'         => \$f,
    'good=s'      => \$g,
    'k=s'         => \$k,
    'minmax=s'    => \$m,
    'language=s'  => \$inlanguage,
    'tokeniser=s' => \$external_tokeniser_call,
    'journal=s'   => \$logfile,
) or die $usage;

# if user specified type/token maxima but no bad word list,
# something is wrong
if ($f){
    die "if you specify bad word thresholds, you should also have a bad word list" unless $b;
    ($maxbadtypes,$maxbadtokens) = split "/",$f;
    # both values are required: a missing one would otherwise silently
    # replace the default with undef
    die "bad word thresholds must be given as type_N/token_N, got '$f'\n"
        if grep { !defined($_) || !length($_) } $maxbadtypes, $maxbadtokens;
}

# set of "bad" words, one per line in the file given with -b
my %bad = ();

if ($b) {
    # three-argument open: the file name can never be mistaken for a mode
    open(my $badwords_fh, '<', $b)
        or die "could not open bad word file $b: $!";
    while (my $word = <$badwords_fh>) {
        utf8::decode($word);
        chomp $word;
        $bad{$word} = 1;
    }
    close $badwords_fh;
}


# if user specified type/token minima but no good word list,
# something is wrong

if ($k){
    die "if you specify good word thresholds, you should also have a good word list" unless $g;
    ($mingoodtypes,$mingoodtokens,$mingoodratio) = split "/",$k;
    # all three values are required: a missing one would otherwise silently
    # replace the default with undef
    die "good word thresholds must be given as type_N/token_N/ratio, got '$k'\n"
        if grep { !defined($_) || !length($_) }
            $mingoodtypes, $mingoodtokens, $mingoodratio;
}

# set of "good" words, one per line in the file given with -g
my %good = ();

if ($g) {
    # three-argument open: the file name can never be mistaken for a mode
    open(my $goodwords_fh, '<', $g)
        or die "could not open good word file $g: $!";
    while (my $word = <$goodwords_fh>) {
        utf8::decode($word);
        chomp $word;
        $good{$word} = 1;
    }
    close $goodwords_fh;
} else { # required minima should be set to 0;
    $mingoodtypes = 0;
    $mingoodtokens = 0;
    $mingoodratio = 0;
}

# -m min_size/max_size replaces both default size limits at once
($minbytesize, $maxbytesize) = split("/", $m) if $m;

# Log messages go to the --journal file when one is given, otherwise to
# STDERR (STDLOG then becomes an alias of STDERR).
if ($logfile) {
    # three-argument open: the file name is never parsed for a mode
    open(STDLOG, '>', $logfile) or die "Cannot create the logfile '$logfile': $!\n";
} else {
    *STDLOG=*STDERR;
}
binmode(STDLOG, ":utf8");
binmode(STDOUT, ":utf8");
# NOTE(review): this layer applies to the STDIN handle only; files named on
# the command line are read through ARGV and are not affected -- confirm
# that this is intended.
binmode(STDIN, ":utf8");

# One input record is one document: everything up to and including its
# closing </text> line.
$/ = "</text>\n";
while (<>) {
    # (explicit utf8::decode of the record is disabled)

    # Strip the opening <text ...> line and keep its attributes so they can
    # be echoed on output; documents without one get a placeholder header.
    my $header;
    if (s/<text (.+)>\n//) {
        $header = $1;
    }
    else {
        $header = 'id ="x"';
        print STDLOG "no header in text $. \n" if /\S/;
    }

    # Clean and filter the document (dig_content_uni presumably comes from
    # PotaModule). The first element of the returned array reference is
    # used as the cleaned text.
    my $cleaned = dig_content_uni(
        $_,
        badref           => \%bad,
        maxbadtypes      => $maxbadtypes,
        maxbadtokens     => $maxbadtokens,
        goodref          => \%good,
        mingoodtypes     => $mingoodtypes,
        mingoodtokens    => $mingoodtokens,
        mingoodratio     => $mingoodratio,
        mincharactersize => $minbytesize,
        maxcharactersize => $maxbytesize,
        inlanguage       => $inlanguage,
        tokeniser        => $external_tokeniser_call
    );

    # Nothing (or a false value) came back: log the document as skipped.
    my $result = $cleaned->[0];
    unless ($result) {
        print STDLOG "---skipped $header\n";
        next;
    }

    # Otherwise print the cleaned text wrapped in its original header.
    print "<text $header>\n$result\n</text>\n";
}

=head1 NAME

I<clean_pages_from_url_list.pl>: a simple script that
extracts the text of html pages collected in a single file.

=head1 SYNOPSIS

clean_pages_from_url_list.pl -l en -b en_stop_words -g en_keep_words 
                                          file_list > corpus.txt

clean_pages_from_url_list.pl -l ru -b ru_stop_words -f 3/10 
                                  -g ru_keep_words -k "10/30/0.25" 
                                                -m  5119/204801 
                                          file_list > corpus.txt

=head1 DESCRIPTION

This script takes html pages collected to a single file, and extracts their
contents as text, applying a heuristic method to look for the
"content-rich" section of a page, and removing the rest. Moreover,
various filters can be applied, and the pages that do not satisfy them
are not printed to output.

For details about the boilerplate stripping heuristic, take a look at
the documentation of the PotaModule (which you must have installed in
order to use this script):

perldoc PotaModule

=head2 ARGUMENT AND OPTIONS

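The script reads the collected pages from the file(s) named on the
command line (or from standard input if no file is given). Each page is
expected to start with a C<< <text ...> >> header line and to end with a
closing C<< </text> >> line.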

The options are:

B<-h>: Prints short info about the script and quits.

B<-b filename>: A list of "bad" words (e.g., pornographic terms). If a
document contains more than a certain number of types or tokens from
this list, it will not be printed (see option B<-f> for default
thresholds and how to change them).

B<-f type_N/token_N>: Two slash-separated values that specify,
respectively, the number of types and tokens from the "bad" word list
sufficient to cause a document to be discarded. Default thresholds are
3 types and 10 tokens. Of course, this option can only be specified if
a bad word list is also passed.

B<-g filename>: A list of "good" words (e.g., function words). A
document is printed only if it contains a certain number of types and
tokens from this list, and if the ratio of tokens from the list to
total token is above a certain threshold (see option B<-k> for default
thresholds and how to change them).

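B<--journal filename>: The file to which log messages (skipped documents,
documents without a header) are written. If this option is not given,
the log goes to standard error.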

B<-k type_N/token_N/ratio>: Three slash-separated values that specify,
respectively, the minimum number of types and tokens from the "good"
word list that a document must contain to be printed, and the minimum
ratio of tokens from the list to total tokens in the document that a
document must have to be printed. Default thresholds are 10 types, 30
tokens and a ratio of 0.25 (or 0, 0 and 0 if no "good" word list is
specified). Of course, the option can only be used if a good word list
is also passed.

B<-m min_size/max_size>: Two slash-separated values specifying the
minimum and maximum allowed sizes (in characters) for documents to be
printed. Default values are 1048 and 204801, respectively, which, in
single-byte encodings, means about 1KB and 200KB, respectively.

B<-l language --language language>: the language can be specified to
guide the tokeniser (for CJK languages).

B<-t tokeniser --tokeniser tokeniser>: The command to run a
language-dependent tokenizer, which works as a filter: it takes plain
UTF8-encoded text on STDIN and writes to STDOUT the string of characters
separated with blanks (anything matching \s).


=head1 DEPENDENCIES

You need to have the following modules installed:

PotaModule

=head1 AUTHOR

Marco Baroni, baroni AT sslmit.unibo.it
Serge Sharoff, University of Leeds

=head1 ACKNOWLEDGMENTS

Thanks to Eros Zanchetta for help, advice and testing and to Tom
Emerson for the regular expression to filter out non-html.

=head1 BUGS

Probably many: if you find one, please let me know: baroni AT sslmit
unibo it

=head1 COPYRIGHT

Copyright 2005-2007, Marco Baroni, Serge Sharoff

This program is free software. You may copy or redistribute it under
the same terms as Perl itself.

=head1 SEE ALSO

The PotaModule and other software from the WaCky project:

http://sslmitdev-online.sslmit.unibo.it/wac/wac.php

=cut
    

