# for documentation, type:
# perldoc PotaModule.pm
# Marco Baroni, Serge Sharoff

package PotaModule;
use strict;
use vars qw(@ISA @EXPORT);
use Exporter;
use Unicode::String;

@ISA = qw(Exporter);

@EXPORT = qw(&get_htmlfile &dig_content &dig_content_uni &convert_to_utf8 &remove_crap &identify_tags &character_tokeniser &glueprint &punctuation_tokeniser &convert_latin1 &convert_enca &convert_cjk &translate_html_entities);


use strict;
use File::Temp qw/ tempfile tempdir /;
use File::Path;
use Encode::Guess;

my $headertoken="XHEADERX";  # token to delimit the html file header
my $wspacetoken="_G_"; # token to preserve whitespace around non-CJK chars
my $tagtoken="XTAGX"; #token to indicate tags

my $defaulsegmentlength=2; #the default segment length for character tokeniser

my $legacy_converter_ref; # converter to UTF from source encodings
my $inlanguage;  # the input language (can be used for making decisions in the legacy converter)
my $tokeniser_ref; # the tokenisation procedure, the default is to use punctuation marks
my $tokeniser_command; #the external tokenisation command
my $default_encoding;

sub get_htmlfile { # simple processing that takes care of a variety of formats for storing HTML pages
    my ($htmlheader,$header,$htmltext,$url,$docid,$title,$author);
    $htmlheader=join('',@_[0..50]); #for Paraget to save us from looking through the complete file
    if (($header)=$_[0]=~/^<text (.+)>$/) { # this is the output from the BootCat print_html_pages tool
	$_[0]="$header $headertoken ";
	$_[-1]=~s%</text>%%;
	$htmltext=join('',@_);
    } elsif ((($docid)=$htmlheader=~/DocID: (.+)\n/) and (($url)=$htmlheader=~/\nURI: (.+)\n/)) {
	# this is the output from Niels' paraget
	$header=qq{id="$url"};
	if (($title)=$htmlheader=~/\nTitle: (.+)\n/) {
	    $header.=qq{ title="$title"};
	};
	if (($author)=$htmlheader=~/Author: (.+)\n/) {
	    $header.=qq{ author="$author"};
	};
	my $i=0;
	while ( $_[$i]=~/^(?:[\w-]+: .+\n|\n)$/ ) { # either empty or a header 
	    $_[$i++]='';
	};
	$htmltext="$header $headertoken ".join('',@_);
    } else { 
	$htmltext=join('',@_);
    };
    return($htmltext);
}

sub dig_content_uni {
    my $text=shift;  
    my %params=@_;
    my $badref;    # reference to hash of stop words, mapping them to some non 0 value

# read type/token maxima of acceptable bad words
# initialized to default values
    my $maxbadtypes = 3;
    my $maxbadtokens = 10;

    my $goodref; # reference to hash of common words, mapping them to some non 0 value

# read type/token minima of required good words
# initialized to default values

    my $mingoodtypes = 10;
    my $mingoodtokens = 30;
    my $mingoodratio = 0.25;

# maximum/minimum size of doc before processing
# defaults to 3Kb/200Kb
# (or equivalent char numbers in "character semantics")
#SS: lower value discards empty documents, but penalizes short documents without much formatting
    my $mincharactersize = 3072;
    my $maxcharactersize = 204801;



    foreach (keys %params) {
	if ($_ eq 'badref') {
	    $badref = $params{$_};
	} elsif ($_ eq 'maxbadtypes') {
	    $maxbadtypes = $params{$_};
	} elsif ($_ eq 'maxbadtokens') {
	    $maxbadtokens = $params{$_};
	} elsif ($_ eq 'goodref') {
	    $goodref = $params{$_};
	} elsif ($_ eq 'mingoodtypes') {
	    $mingoodtypes = $params{$_};
	} elsif ($_ eq 'mingoodtokens') {
	    $mingoodtokens = $params{$_};
	} elsif ($_ eq 'mingoodratio') {
	    $mingoodratio = $params{$_};
	} elsif ($_ eq 'mincharactersize') {
	    $mincharactersize = $params{$_};
	} elsif ($_ eq 'maxcharactersize') {
	    $maxcharactersize = $params{$_};
	} elsif ($_ eq 'inlanguage') {
	    $inlanguage = $params{$_};
	} elsif ($_ eq 'encoding') {
	    $default_encoding = $params{$_};
	} elsif ($_ eq 'tokeniser_ref') {
	    $tokeniser_ref = $params{$_};
	} elsif ($_ eq 'tokeniser') {
	    $tokeniser_command = $params{$_};
	}
    };

    if (defined $tokeniser_command) {
	$tokeniser_ref=\&external_tokeniser;
    } elsif ((not defined $tokeniser_ref) or (ref $tokeniser_ref eq 'CODE')) {
	 if (defined $inlanguage) {
	     $tokeniser_ref=($inlanguage=~/^(?:chinese|zh|japanese|jp|korean|kr|zh_cn|zh_hk|zh_tw)$/i) ? 
		 \&character_tokeniser :
		 \&punctuation_tokeniser;
	 } else {
	     $tokeniser_ref=\&punctuation_tokeniser;
	 };
     };
    return process_text_bte($text, $badref, $maxbadtypes, $maxbadtokens,  $goodref, $mingoodtypes, $mingoodtokens, $mingoodratio, $mincharactersize, $maxcharactersize);
}

# $array_ref = dig_content($html_text,\%bad_words,
#                            $max_bad_types, $max_bad_tokens,
#                            \%good_words, $min_good_types,
#                            $min_good_tokens, $min_good_ratio,
#                            $min_size, $max_size);
# the function assumes source texts in latin1 encoding or utf8.  Retained for compatibility
sub dig_content {
    # a string with the contents of the document
    # could be null string
    my $text = shift;
    # reference to hash of stop words, mapping them to some
    # non 0 value
    # could be empty
    my $badref = shift;
    # max thresholds for stop words
    my $maxbadtypes = shift;
    my $maxbadtokens = shift;
    # reference to hash of common words, mapping them to some
    # non 0 value
    # could be empty
    my $goodref = shift;
    # min thresdholds for common words
    my $mingoodtypes = shift;
    my $mingoodtokens = shift;
    my $mingoodratio = shift;
    # min and max number of bytes, or rather caracters, of doc before
    # processing it
    my $mincharactersize = shift;
    my $maxcharactersize = shift;

    $tokeniser_ref=\&punctuation_tokeniser;

    return process_text_bte($text, $badref, $maxbadtypes, $maxbadtokens,  $goodref, $mingoodtypes, $mingoodtokens, $mingoodratio, $mincharactersize, $maxcharactersize);

}

sub process_text_bte {
    my ($text, $badref, $maxbadtypes, $maxbadtokens,  $goodref, $mingoodtypes, $mingoodtokens, $mingoodratio, $mincharactersize, $maxcharactersize)=@_;

    if (!($text)) {
	return ["",0];
    }

    # if too long or too short, return empty string
    if ((length($text)<$mincharactersize)||(length($text)>$maxcharactersize)) {
	return ["",0];
    }

# remove junk (from BTE)
    $text = remove_crap($text);

# replace tags with XTAGX
    $text = identify_tags($text);
# tokenize
    my $tokensref = &{$tokeniser_ref}($text);
# look for longest low tag density stretch
    my $maxboundariesref = look_for_text_stretch($tokensref);
# if what is left in doc satisfies min/max good/bad word constraints, return
# string corresponding to max stretch, and info about score of corresponding
# stretch
    my @cleanedstretch=@{$tokensref}[${$maxboundariesref}[0]..${$maxboundariesref}[1]];


    if (check_page(\@cleanedstretch,$badref,$maxbadtypes,$maxbadtokens,$goodref,$mingoodtypes,$mingoodtokens,$mingoodratio)) {
	my $joiner=($tokeniser_ref == \&character_tokeniser) ? '' : ' ';
	my $cleaned_string = join ($joiner, (grep {$_ ne $tagtoken} @cleanedstretch));

	#replace preserved whitespaces
	if ($tokeniser_ref == \&character_tokeniser) {
	    $cleaned_string=glueprint($cleaned_string);
	};
	return [$cleaned_string,${$maxboundariesref}[2]];
    }
    # else, return an empty string and 0 as the score
    return ["",0];
}

# from BTE
sub remove_crap{
    my $data = shift;
#enough to spot PDF, PS and MS docs that passed through our filters
    return '' if (/^\%PDF-\d/) or (/^\%!PS-/) or (/^\xD0\xCF/);

# we are not interested in anything up to <body
    $data =~ s/.*<body/<body/is;

#############################
# get rid of comments       #
#############################
    $data =~ s{ <!                   # comments begin with a `<!'
		    # followed by 0 or more comments;
		    
    (.*?)		# this is actually to eat up comments in non 
			# random places

     (                  # not suppose to have any white space here

                        # just a quick start; 
      --                # each comment starts with a `--'
        .*?             # and includes all text up to and including
      --                # the *next* occurrence of `--'
        \s*             # and may have trailing while space
                        #   (albeit not leading white space XXX)
     )+                 # repetire ad libitum  XXX should be * not +
    (.*?)		# trailing non comment text
   >                    # up to a `>'
}{
    if ($1 || $3) {	# this silliness for embedded comments in tags
	"<!$1 $3>";
    } 
}gesx;                 # mutate into nada, nothing, and niente

# .. and scripts
    $data =~ s#<script.*?/script>#<script></script>#gsi;
# .. and styles
    $data =~ s#<style.*?/style>#<style></stylex>#gsi;


    # replace sequences of any kind of space symbol
    # with a single space
    $data =~ s/\s+/ /gs;
    # just in case, this includes also entity #10;
#    $data =~ s/\&\#0?10;/ /gs;

#from Niels&Ramon filters: removal of ascii furniture
#??do they really occur in real texts
	my $count =0;
	my $deco = " $tagtoken ";
	
	$count += ($data =~ s/(~~~~@)+/$deco/go);
	$count += ($data =~ s/~~~+/$deco/go);
	$count += ($data =~ s/---+(?:\s+---+)*/$deco/go);
	$count += ($data =~ s/---+/$deco/go);
	$count += ($data =~ s/===+/$deco/go);
	$count += ($data =~ s/\s\*\*\*+\s/$deco/go);
	$count += ($data =~ s/\s___+\s/$deco/go);
	$count += ($data =~ s/\s(?:\*\s\*\s\*)(?:\s\*)*\s/$deco/go);
	$count += ($data =~ s/(?:=\s=\s=)(?:\s=)*/$deco/go);

    return($data);
}

sub glueprint {
    $_[0]=~s/$wspacetoken/ /g;
    return $_[0];
}


# from BTE
sub identify_tags{
###############################
# identify the remaining tags #
###############################
    my $data = shift;


# we brutally add period before unambiguous block elements
#    $data =~ s/([^\!\_\-\:\;\,\.\?])\s*<(ADDRESS|BLOCKQUOTE|BR|CENTER|DIR|DIV|DL|FIELDSET|FORM|H1|H2|H3|H4|H5|H6|HR|ISINDEX|MENU|NOFRAMES|NOSCRIPT|OL|P|PRE|TABLE|UL|DD|DT|FRAMESET|LI|TBODY|TD|TFOOT|TH|THEAD|TR)( |>)/$1. <$2$3/gis;
    

    # following expression is greatly simplified and it will probably
    # miss something, but hopefully it will
    # protect us from the segmentation faults caused by original expression
#it's better to retain paragraph tags
    $data =~ s/<([^>]+)>/ if ($1 ne 'p')  {" $tagtoken "} /egs;

#    $data =~ s{ <                 # opening angle bracket
#
#		    (?:           # Non-backreffing grouping paren
#		     [^>\'\"] *   # 0 or more things that are neither
#		                  # > nor ' nor "
#		     |            #    or else
#		     \".*?\"      # a section between double quotes
#		                  # (stingy match)
#		     |            #    or else
#		     \'.*?\'      # a section between single quotes 
#		                  # (stingy match)
#		     ) +          # repetire ad libitum
#		                  #  hm.... are null tags <> legal? XXX
#		     >            # closing angle bracket
#		 }{ $tagtoken }gsx;     # mutate into nada, nothing, and niente

    return $data;

}

sub character_tokeniser {
    my ($data,$segmentlength) = @_;
    unless ($segmentlength) {
	$segmentlength=$defaulsegmentlength;
    }

    chomp $data;
    my @tokens;
    foreach (split "[ ]+",$data) {
	if (/^[A-Za-z0-9]/) { # if a string starts with Latin chars do not tokenise; 
                              # ideally we should provide the full range of non-CJK UTF8 chars
	    if ($_ eq $tagtoken) {
		push @tokens, $_;
	    } else {
		push @tokens, $wspacetoken, $_, $wspacetoken; #preserve whitespaces
	    }
	} elsif (/\S/) { #otherwise split nonempty lines into characters
	    for (my $i=0; $i < length($_); $i+=$segmentlength) {
		push @tokens,substr($_,$i,$segmentlength);
	    }
	};
    }
    return \@tokens;
}

sub external_tokeniser{
    my $data = shift;
    my ($tempfh, $tmpfile) = tempfile();
    print $tempfh $data;
    close($tempfh);
    $data=`$tokeniser_command <$tmpfile`;
    unlink($tmpfile);
    my @tokens = split /\s+/, $data;
    return \@tokens;
}

sub punctuation_tokeniser{

    my $data = shift;

    chomp $data;
    	

    # sequences of punctuation marks immediately preceding spaces
    # (or end of line) are likely to be true punctuation marks, split them
    # from token that precedes
    # XXX [this should actually be language specific!] XXX
 #   $data =~ s/([\.\,\!\)\]\}\:\;\?\>\'\"]+)($|[\s]+)/ $1 /gs;
    
    # and the same goes for some word-initial non-alphanumeric symbols
 #   $data =~ s/(^|[\s]+)([\[\(\<\{\'\"\xbf\xa1]+)/ $1 /g;
    
    # also, if there is more than one - it's probably some kind of
    # boundary marker
 #   $data =~ s/\-\-+/ - /g;
    
    # ok, now besides dashes, apostrophes and slashes we should
    # have only internal non-alhpanumericals left 
    # let's tokenize

    my @tokens = grep {/[\S]/} (split "[ ]+",$data);
    return \@tokens;

}

sub look_for_text_stretch{

    my $tokensref =shift;

    my $tok_count = scalar(@{$tokensref});

    my $i = 0;
    my $prev_t = 0;
    my $tag;
    my @chunk_values = ();
    my @chunk_indices = ();
    my $prev_i;
    my $curr_value;

    while ($i < $tok_count) {
	if (${$tokensref}[$i] eq $tagtoken) {
	    $tag = 1-2;
	}
	else {
	    $tag = 1;
	}

	if ($tag != $prev_t) {
	    
	    if ($prev_t) {
		push @chunk_values,$curr_value;
		push @chunk_indices,$prev_i;
	    }
	    $prev_i = $i;
	    $curr_value = 0;
	    $prev_t = $tag;
	}

	$curr_value += $tag;

	$i++;
    }

    push @chunk_values,$curr_value;
    push @chunk_indices,$prev_i;
    
    my $chunk_count = scalar(@chunk_indices);

    my $max_score = 0;
    my $max_begin = 0;
    my $max_end = 0;


    $i = 0;
    my $score;
    while ($i<($chunk_count-1)) {
	
	$score = $chunk_values[$i];
	# if score negative, there is no point in beginning from here
	if ($score < 0) {
	    $i++;
	    next;
	}

	# else, check that this is not already a maximum 
	if ($score>$max_score){
	    $max_score = $score;
	    $max_begin = $i;
	    $max_end = $i;
	}


	my $j= $i+1;
	while ($j<$chunk_count) {
	    
	    $score += $chunk_values[$j];

	    if ($score>$max_score){

#		# debug
#		print STDERR "this was a maximum\n";

		$max_score = $score;
		$max_begin = $i;
		$max_end = $j;
	    }

	    $j++;
	}

	$i++;
    }

    my $begin = $chunk_indices[$max_begin];
    # notice: final chunk _must_ be positive
    my $end = $chunk_indices[$max_end] + $chunk_values[$max_end] - 1;

    return [$begin,$end,$max_score];
}

# check that a page has the required min/max numbers of good/bad
# tokens

sub check_page {
    my $tokensref = shift;
    # hash with bad words
    my $badref = shift;
    # max thresholds for bad words
    my $maxbadtypes = shift;
    my $maxbadtokens = shift;
    # reference to hash of common words
    my $goodref = shift;
    # min thresdholds for common words
    my $mingoodtypes = shift;
    my $mingoodtokens = shift;
    my $mingoodratio = shift;

    my @tokens = @{$tokensref};
    my @badtypes = ();
    my $badtokens = 0;
    my @goodtypes = ();
    my $goodtokens = 0;
    my %seentypes = ();

    # if the page has 5 tokens or less, there is no point in keeping it
    return 0 if (scalar(@tokens) <= 5);
    return 1 unless ($badref) or ($goodref); #nothing to count

    foreach my $token (@tokens) {
	$token =~ s/[\.\,\!\)\]\}\:\;\?\>\'\"]+$//;
	$token =~ s/^[\[\(\<\{\'\"\xbf\xa1]+//;
	if(${$badref}{lc($token)}) {
	    $badtokens++;
	    push @badtypes,$token
		unless $seentypes{$token}++;

	    if (($badtokens >= $maxbadtokens)&&(scalar(@badtypes) >= $maxbadtypes)) {
		return 0;
	    }
	}
	
	elsif(${$goodref}{lc($token)}) {
	    $goodtokens++;
	    push @goodtypes,$token
		unless $seentypes{$token}++;
	}
    }

    if (($goodtokens >= $mingoodtokens)&&(scalar(@goodtypes) >= $mingoodtypes) && (($goodtokens/scalar(@tokens))>=$mingoodratio)) {
	return 1;
    }
    return 0;
}

# The following set is based on a perl module from: Jean-Michel Hiver <jhiver@mkdoc.com>.
# but extended to process &amp; &gt; &lt; &quot; &apos; as well as &#039; used as the apostrophe.
my %ENTITY_2_CHAR = (
 amp    => '&',  # ampersand 
 'gt'    => '>',  # greater than
 'lt'    => '<',  # less than
 quot   => '"',  # double quote
 apos   => "'",  # single quote
 '#039' => "'",
 nbsp   => " ", # non breaking space

 # PUBLIC ISO 8879-1986//ENTITIES Added Latin 1//EN//HTML
 AElig	=> 'Æ',  # capital AE diphthong (ligature)
 Aacute	=> 'Á',  # capital A, acute accent
 Acirc	=> 'Â',  # capital A, circumflex accent
 Agrave	=> 'À',  # capital A, grave accent
 Aring	=> 'Å',  # capital A, ring
 Atilde	=> 'Ã',  # capital A, tilde
 Auml	=> 'Ä',  # capital A, dieresis or umlaut mark
 Ccedil	=> 'Ç',  # capital C, cedilla
 ETH	=> 'Ð',  # capital Eth, Icelandic
 Eacute	=> 'É',  # capital E, acute accent
 Ecirc	=> 'Ê',  # capital E, circumflex accent
 Egrave	=> 'È',  # capital E, grave accent
 Euml	=> 'Ë',  # capital E, dieresis or umlaut mark
 Iacute	=> 'Í',  # capital I, acute accent
 Icirc	=> 'Î',  # capital I, circumflex accent
 Igrave	=> 'Ì',  # capital I, grave accent
 Iuml	=> 'Ï',  # capital I, dieresis or umlaut mark
 Ntilde	=> 'Ñ',  # capital N, tilde
 Oacute	=> 'Ó',  # capital O, acute accent
 Ocirc	=> 'Ô',  # capital O, circumflex accent
 Ograve	=> 'Ò',  # capital O, grave accent
 Oslash	=> 'Ø',  # capital O, slash
 Otilde	=> 'Õ',  # capital O, tilde
 Ouml	=> 'Ö',  # capital O, dieresis or umlaut mark
 THORN	=> 'Þ',  # capital THORN, Icelandic
 Uacute	=> 'Ú',  # capital U, acute accent
 Ucirc	=> 'Û',  # capital U, circumflex accent
 Ugrave	=> 'Ù',  # capital U, grave accent
 Uuml	=> 'Ü',  # capital U, dieresis or umlaut mark
 Yacute	=> 'Ý',  # capital Y, acute accent
 aacute	=> 'á',  # small a, acute accent
 acirc	=> 'â',  # small a, circumflex accent
 aelig	=> 'æ',  # small ae diphthong (ligature)
 agrave	=> 'à',  # small a, grave accent
 aring	=> 'å',  # small a, ring
 atilde	=> 'ã',  # small a, tilde
 auml	=> 'ä',  # small a, dieresis or umlaut mark
 ccedil	=> 'ç',  # small c, cedilla
 eacute	=> 'é',  # small e, acute accent
 ecirc	=> 'ê',  # small e, circumflex accent
 egrave	=> 'è',  # small e, grave accent
 eth	=> 'ð',  # small eth, Icelandic
 euml	=> 'ë',  # small e, dieresis or umlaut mark
 iacute	=> 'í',  # small i, acute accent
 icirc	=> 'î',  # small i, circumflex accent
 igrave	=> 'ì',  # small i, grave accent
 iuml	=> 'ï',  # small i, dieresis or umlaut mark
 ntilde	=> 'ñ',  # small n, tilde
 oacute	=> 'ó',  # small o, acute accent
 ocirc	=> 'ô',  # small o, circumflex accent
 ograve	=> 'ò',  # small o, grave accent
 oslash	=> 'ø',  # small o, slash
 otilde	=> 'õ',  # small o, tilde
 ouml	=> 'ö',  # small o, dieresis or umlaut mark
 szlig	=> 'ß',  # small sharp s, German (sz ligature)
 thorn	=> 'þ',  # small thorn, Icelandic
 uacute	=> 'ú',  # small u, acute accent
 ucirc	=> 'û',  # small u, circumflex accent
 ugrave	=> 'ù',  # small u, grave accent
 uuml	=> 'ü',  # small u, dieresis or umlaut mark
 yacute	=> 'ý',  # small y, acute accent
 yuml	=> 'ÿ',  # small y, dieresis or umlaut mark

 # Some extra Latin 1 chars that are listed in the HTML3.2 draft (21-May-96)
 copy   => '©',  # copyright sign
 reg    => '®',  # registered sign

 # Additional ISO-8859/1 entities listed in rfc1866 (section 14)
 iexcl  => '¡',
 cent   => '¢',
 pound  => '£',
 curren => '¤',
 yen    => '¥',
 brvbar => '¦',
 sect   => '§',
 uml    => '¨',
 ordf   => 'ª',
 laquo  => '«',
'not'   => '¬',    # not is a keyword in perl
 shy    => '­',
 macr   => '¯',
 deg    => '°',
 plusmn => '±',
 sup1   => '¹',
 sup2   => '²',
 sup3   => '³',
 acute  => '´',
 micro  => 'µ',
 para   => '¶',
 middot => '·',
 cedil  => '¸',
 ordm   => 'º',
 raquo  => '»',
 frac14 => '¼',
 frac12 => '½',
 frac34 => '¾',
 iquest => '¿',
'times' => '×',    # times is a keyword in perl
 divide => '÷',

   OElig    => chr(338),
   oelig    => chr(339),
   Scaron   => chr(352),
   scaron   => chr(353),
   Yuml     => chr(376),
   fnof     => chr(402),
   circ     => chr(710),
   tilde    => chr(732),
   Alpha    => chr(913),
   Beta     => chr(914),
   Gamma    => chr(915),
   Delta    => chr(916),
   Epsilon  => chr(917),
   Zeta     => chr(918),
   Eta      => chr(919),
   Theta    => chr(920),
   Iota     => chr(921),
   Kappa    => chr(922),
   Lambda   => chr(923),
   Mu       => chr(924),
   Nu       => chr(925),
   Xi       => chr(926),
   Omicron  => chr(927),
   Pi       => chr(928),
   Rho      => chr(929),
   Sigma    => chr(931),
   Tau      => chr(932),
   Upsilon  => chr(933),
   Phi      => chr(934),
   Chi      => chr(935),
   Psi      => chr(936),
   Omega    => chr(937),
   alpha    => chr(945),
   beta     => chr(946),
   gamma    => chr(947),
   delta    => chr(948),
   epsilon  => chr(949),
   zeta     => chr(950),
   eta      => chr(951),
   theta    => chr(952),
   iota     => chr(953),
   kappa    => chr(954),
   lambda   => chr(955),
   mu       => chr(956),
   nu       => chr(957),
   xi       => chr(958),
   omicron  => chr(959),
   pi       => chr(960),
   rho      => chr(961),
   sigmaf   => chr(962),
   sigma    => chr(963),
   tau      => chr(964),
   upsilon  => chr(965),
   phi      => chr(966),
   chi      => chr(967),
   psi      => chr(968),
   omega    => chr(969),
   thetasym => chr(977),
   upsih    => chr(978),
   piv      => chr(982),
   ensp     => chr(8194),
   emsp     => chr(8195),
   thinsp   => chr(8201),
   zwnj     => chr(8204),
   zwj      => chr(8205),
   lrm      => chr(8206),
   rlm      => chr(8207),
   ndash    => chr(8211),
   mdash    => chr(8212),
   lsquo    => chr(8216),
   rsquo    => chr(8217),
   sbquo    => chr(8218),
   ldquo    => chr(8220),
   rdquo    => chr(8221),
   bdquo    => chr(8222),
   dagger   => chr(8224),
   Dagger   => chr(8225),
   bull     => chr(8226),
   hellip   => chr(8230),
   permil   => chr(8240),
   prime    => chr(8242),
   Prime    => chr(8243),
   lsaquo   => chr(8249),
   rsaquo   => chr(8250),
   oline    => chr(8254),
   frasl    => chr(8260),
   euro     => chr(8364),
   image    => chr(8465),
   weierp   => chr(8472),
   real     => chr(8476),
   trade    => chr(8482),
   alefsym  => chr(8501),
   larr     => chr(8592),
   uarr     => chr(8593),
   rarr     => chr(8594),
   darr     => chr(8595),
   harr     => chr(8596),
   crarr    => chr(8629),
   lArr     => chr(8656),
   uArr     => chr(8657),
   rArr     => chr(8658),
   dArr     => chr(8659),
   hArr     => chr(8660),
   forall   => chr(8704),
   part     => chr(8706),
   exist    => chr(8707),
   empty    => chr(8709),
   nabla    => chr(8711),
   isin     => chr(8712),
   notin    => chr(8713),
   ni       => chr(8715),
   prod     => chr(8719),
   sum      => chr(8721),
   minus    => chr(8722),
   lowast   => chr(8727),
   radic    => chr(8730),
   prop     => chr(8733),
   infin    => chr(8734),
   ang      => chr(8736),
  'and'     => chr(8743),
  'or'      => chr(8744),
   cap      => chr(8745),
   cup      => chr(8746),
  'int'     => chr(8747),
   there4   => chr(8756),
   sim      => chr(8764),
   cong     => chr(8773),
   asymp    => chr(8776),
  'ne'      => chr(8800),
   equiv    => chr(8801),
  'le'      => chr(8804),
  'ge'      => chr(8805),
  'sub'     => chr(8834),
   sup      => chr(8835),
   nsub     => chr(8836),
   sube     => chr(8838),
   supe     => chr(8839),
   oplus    => chr(8853),
   otimes   => chr(8855),
   perp     => chr(8869),
   sdot     => chr(8901),
   lceil    => chr(8968),
   rceil    => chr(8969),
   lfloor   => chr(8970),
   rfloor   => chr(8971),
   lang     => chr(9001),
   rang     => chr(9002),
   loz      => chr(9674),
   spades   => chr(9824),
   clubs    => chr(9827),
   hearts   => chr(9829),
   diams    => chr(9830),
);

sub translate_html_entities{
    my $data = shift;
    my $c = 0;
#    utf8::encode($data);
#    $data =~ s/(&\#(\d+);?)/$2 > 255 ? chr($2) : $1/eg;
#    $data =~ s/(&\#[xX]([0-9a-fA-F]+);?)/$c = hex($2); $c > 255 ? chr($c) : $1/eg;
    $data =~ s/(&(\w+);?)/$ENTITY_2_CHAR{$2} || $1/eg;
    utf8::decode($data);
    return $data;
}

sub convert_to_utf8 {
    my ($data, $inlanguage,$encoding)=@_;
    if ($encoding) {
	$default_encoding=$encoding;
	$legacy_converter_ref=\&convert_encoding;
    } elsif (defined $inlanguage) {
	$legacy_converter_ref=($inlanguage=~/^(?:belarussian|by|bulgarian|bg|czech|cz|estonian|es|croatian|hr|hungarian|hu|lithuanian|lt|latvian|lv|polish|pl|russian|ru|slovak|sk|slovene|sl|ukrainian|ua|chinese)$/i) ? \&convert_enca :
	    ($inlanguage=~/^(?:chinese|zh|japanese|jp|korean|kr|zh_cn|zh_hk|zh_tw)$/i) ? \&convert_cjk :
	    \&convert_latin1;
    } else {
	$legacy_converter_ref=\&convert_latin1;
    }

    $data=&{$legacy_converter_ref}($data,$inlanguage);
#    return $data;
# resolve entities (they will be in utf8)
    return translate_html_entities($data);
}

#we can distinguish between latin1 and utf8
sub convert_latin1 {
    my $data=shift;
#	utf8::encode($data);  #to be sure that we have an octet string
    if ($data=~/(\xC3[\x80-\xBF])/) {
	#Marco used a longer range: [\xC0-\xDF][\x80-\xBF],
	# but all Latin1 chars start with C3 in UTF8
	utf8::decode($data);
    } else {
# Unicode::String->stringify_as( 'utf8' ); # utf8 already is the default
	$data = Unicode::String::latin1( $data );
	utf8::upgrade($data); #it's always done
    };
    return($data);
}

#this is for a forced UTF8 conversion from a list of suggested encodings
sub convert_encoding {
    my $data=shift;
    my $encoding= shift or $default_encoding;
    my @encodinglist=split(/\s+/, $encoding);
    my $decoder = guess_encoding($data, @encodinglist);
    my ($guess,$utf8);
    if (ref($decoder)) {
	$guess=$decoder->name;
    } else { #if it thinks it can be utf8, it IS utf8, otherwise the first suggestion
	$guess= ($decoder=~/utf8/) ? 'utf8' : $encodinglist[0];
    };
    if ($guess eq 'utf8') {
	utf8::decode($data); #in case the utf8 bit hasn't been set
	return($data);
    } else {
	eval {
	    $utf8 = Encode::decode($guess,$data,Encode::FB_CROAK);
	  };
	if ( $@ ne "" ) {
	   print STDLOG "---Failed Encode::decode($guess): $@";
	   return 0;
	} else {
	    return $utf8;
	}
    };
}

#this function uses enca to convert to any specified encoding known by enca
#Encode::Guess doesn't distinguish between single-byte encodings, eg. 1251 and koi8
#Run enca --list languages 
#to get the list
sub convert_enca { 

    my ($data, $inlanguage)=@_;
    unless (defined $inlanguage) {
	$inlanguage='ru'
    }
    my $outencoding='UTF8';
    my $encacall="enca -L $inlanguage -x $outencoding";
    my ($tempfh, $tmpfile) = tempfile();
    print $tempfh $data;
    close($tempfh);
    $data=`$encacall <$tmpfile`;
    unlink($tmpfile);
    utf8::decode($data);
    return($data);
}

## a list of encodings associated with CJK languages
##in some cases proper utf can't be detected, so it's better to put it second after
##the most frequent legacy encoding
my %get_encoding=(
jp => 'cp932 utf8 7bit-jis euc-jp iso-2022-jp-1',
kr => 'cp949 utf8 euc-kr iso-2022-kr',
zh_cn => 'cp936 utf8',
zh_hk => 'big5-hk utf8',
zh_tw => 'big5-eten utf8 euc-tw'
);

sub convert_cjk {
    my ($data, $inlanguage)=@_;
    my $encoding=$get_encoding{$inlanguage} || $get_encoding{'zh_cn'}; # the list of known encodings
    return convert_encoding($data,$encoding);
}

1;

=head1 NAME

I<PotaModule.pm>: module to perform various forms of cleaning and
filtering on an HTML file in order to use it as corpus fodder.

=head1 SYNOPSIS

Older syntax for Latin 1 only
$array_ref = dig_content($html_text, \%bad_words, $max_bad_types, $max_bad_tokens, \%good_words, $min_good_types, $min_good_tokens, $min_good_ratio, $min_size, $max_size);

The syntax for the UTF8 version
$array_ref = dig_content_uni(html_text,
    bad_words => \%bad_words,
    max_bad_types => $max_bad_types,
    max_bad_tokens => $max_bad_tokens,
    good_words => \%good_words,
    min_good_types => $min_good_types,
    min_good_tokens => $min_good_tokens,
    min_good_ratio => $min_good_ratio,
    min_size => $min_size,
    max_size => $max_size,
    inlanguage => $inlanguage,
    legacy_encoding_ref => \&legacy_encoding_ref,
    tokeniser_ref => \&tokeniser_ref,
    tokeniser => $external_tokeniser_call

    );

=head1 DESCRIPTION

This module exports two functions, I<dig_content> and
I<dig_content_uni>, which are identical in their functionality and can
be considered as a tool to extract the fragment that contains the most
connected text and the least "boilerplate" from an html
document. Moreover, the module applies various filters to the
document, and if the document violates any of them (in particular, it
is too large or too small, contains too many words from a stop word
list, or not enough from a list of words that should cue connected
text), the function filters out the document (in the sense that it
returns an empty string).

The function takes as its input an html document, and possibly empty
stop and keep lists and various parameters, and it returns an array
reference, where the first element of the array is a string containing
the cleaned text extracted from the analyzed document (possibly, an
empty string), and the second element is the score of the returned
fragment for the measure of tag density described below. This score
can be useful for debugging purposes, but it can typically be ignored.

The second function (I<dig_content_uni>) can take several extra
parameters specifying the functions for detecting legacy encodings and
converting them from to utf8, and custom tokenisation.

The module provides defaults for detecting the following encodings
=item Latin1 (ISO, CP1252), this 
=item Latin2 and Cyrillic encodings supproted by Enca, http:/trific.ath.cx/software/enca/
=item CJK languages (Chinese, Japanese, Korean), using Encode::Guess module

=head2 ARGUMENTS

The function takes two arguments:

B<$html_text>: The contents of an html document. If empty, an empty
string will be returned.

B<%parameters>: A hash list of optional parameters including:

B<bad_words => \%bad_words>: Reference to a hash containing a list of stop words
(e.g., pornographic terms) as keys, associated with non-zero values
(so that it's easy to check if a word is in list). Passed hash can be
empty.

B<max_bad_types => $max_bad_types>: Documents with at least as many types in stop word
list are filtered out (i.e., null string is returned). This should
always be higher than 0.

B<max_bad_tokens => $max_bad_tokens>: Documents with at least as many tokens in stop
word list are filtered out (i.e., null string is returned). This
should always be higher than 0.

B<good_words => \%good_words>: Reference to a hash containing a list of words (e.g.,
function words) that are expected to occur with a certain frequency in
all pages containing connected text in the target language. All words
should be associated to non-zero values (so that it's easy to check if
a word is in list). Passed hash can be empty.

B<min_good_types => $min_good_types>: Documents that do not contain at least as many
types from "good word" list are filtered out (i.e., null string is
returned). If list of good words is empty, this should be set to 0.

B<min_good_tokens => $min_good_tokens>: Documents that do not contain at least as many
tokens from "good word" list are filtered out (i.e., null string is
returned). If list of good words is empty, this should be set to 0.

B<min_good_ratio => $min_good_ratio>: At least this ratio of the tokens in a document
should come from good word list -- because of the Zipfian properties
of language, connected texts are expected to have a rather high
proportion of tokens from a small set of function words (empirically,
I found 0.25 to be a reasonable threshold). If list of good words is
empty, this should be set to 0.

B<min_size => $min_size>: Documents that have less than this number of characters
(as assessed by Perl length function) will be filtered out.

B<max_size => $max_size>: Documents that have more than this number of characters
(as assessed by perl length function) will be filtered out.

B<legacy_encoding_ref => \&legacy_encoding_ref>: A reference to
a function that converts from your legacy encodings to utf8.

B<tokeniser_ref => \&tokeniser_ref>: A reference to
a Perl procedure implementing a language-dependent tokenizer

B<tokeniser => $tokeniser_ref>: The command to run a
language-dependent tokenizer, which works as a filter taking a plain
text at STDIN and writing the string of characters separated with
blanks (anything matching to \s) to STDOUT.

B<inlanguage => $inlanguage>: The language of the input document to
choose the default converter.  The list of supproted names:
belarussian, bulgarian, czech, estonian, croatian, hungarian,
lithuanian, latvian, polish, russian, slovak, slovene, ukrainian
(requires Enca), chinese, japanese, korean with the following script
variations for Chinese: zh_cn, zh_hk and zh_hk (requires
Encode::Guess).  If the language is not in the list, the choice of the
converter depends on the value of legacy_encoding_ref.  If it is not
set, latin1 is used.

=head2 PROCEDURE AND NOTES

First, the function exported by this model filters out documents that
are below/above the maximum/minimum size thresholds.

Then it strips off javascript and comments, and it performs a rough
tokenization of the text and the html tags (that are mapped to "tag"
tokens). It then looks for the stretch of text that maximizes the
quantity:

N(textual tokens) - N(tag tokens)

This "low tag density" fragment is likely to contain the (or at least
"a") content rich section of the document. High tag density sections
of a page are more likely to contain boilerplate (navigation
information, links, etc.)

If the low tag density fragment satisfies the constraints on
minimum/maximum quantity of "good"/"bad" words, it is returned
(without the tags, of course) as the output string.

The module, for now, assumes that the input is in either latin1 or
utf8 encoding a latin1 language (utf8 is converted to latin1), and
that the language of the document is one where a simple, white-space
tokenization scheme will do the trick.

The method to find the content-rich section of a page is taken from
Aidan Finn's BTE module (a part of the Hyppia project: see URL below),
and the module contains generous portions of code from BTE.

However, this module, in my experiments, is a lot more efficient than
the original BTE module, probably because, instead of computing the
quantity above for each possible stretch, it avoids doing so for
stretches that could not possibly have the highest score (such as
sequences beginning with textual tokens preceded by other textual
tokens, or ending with textual tokens followed by other textual
tokens, or beginning or ending with tag tokens).

=head1 DEPENDENCIES

None I'm aware of.

=head1 AUTHOR

Marco Baroni, baroni AT sslmit.unibo.it
Serge Sharoff, University of Leeds

=head1 ACKNOWLEDGMENTS

Thanks to Aidan Finn for the BTE module and to Eros Zanchetta for
help, advice and testing.

=head1 BUGS

Probably many: if you find one, please let me know: baroni AT sslmit
unibo it


=head1 COPYRIGHT

Copyright 2005, Marco Baroni, Serge Sharoff

This program is free software. You may copy or redistribute it under
the same terms as Perl itself.

=head1 SEE ALSO

The BTE module of the Hyppia project:
http://www.smi.ucd.ie/hyppia/

A collection of corpora collected from the Internet and processed using this mechanism:
http://corpus.leeds.ac.uk/internet.html

=cut

