#!/usr/bin/perl

# to see the documentation, type:
# perldoc flist2utf8.pl

use strict;
use lib('/corpora/tools');
use warnings;
use IO::Handle;
use Encode;
#use Text::PotaModule; once it's on CPAN
use PotaModule;

use Getopt::Long;

#autoflush STDOUT 1;

# Short usage text.  It is printed (via die) when the command-line options
# cannot be parsed; full documentation lives in the POD at the end of the file.
my $usage = <<"_USAGE_";

This script takes html docs from a file list and converts them all to utf8

To see documentation and usage, type:

perldoc $0

Copyright 2007, Serge Sharoff

This program is free software. You may copy or redistribute it under
the same terms as Perl itself.

_USAGE_

# Option defaults: the language guides encoding detection, while an explicit
# encoding (if given) is passed to convert_to_utf8 together with it.
my $language = 'en';
my ($encoding, $logfile);

# 'auto_help' makes Getopt::Long handle --help itself.
Getopt::Long::Configure('auto_help');

# GetOptions returns false when the command line contains errors; stop with
# the usage text instead of silently running with default settings.
GetOptions(
    'language=s' => \$language,
    'encoding=s' => \$encoding,
    'journal=s'  => \$logfile,
) or die $usage;

# A false value tells convert_to_utf8 to ignore the encoding argument.
$encoding ||= 0;

# STDLOG receives the journal: either the requested logfile or STDERR.
# It stays a package-level handle because the rest of the script prints to it.
if (defined $logfile && length $logfile) {
    # Three-argument open: the file name is never parsed for mode characters.
    open(STDLOG, '>', $logfile)
        or die "Cannot create the logfile '$logfile': $!\n";
}
else {
    *STDLOG = *STDERR;
}

# Both the journal and the corpus output are written as UTF-8.
binmode(STDLOG, ":utf8");
binmode(STDOUT, ":utf8");

# Main loop: every input line names one html file.  Each readable file is
# converted to UTF-8 and printed to STDOUT wrapped in a <text ...> element;
# progress and problems are reported on STDLOG.
FILE:
while (my $fname = <>) {
    chomp $fname;
    next FILE if -d $fname;    # directories in the list are skipped silently
    print STDLOG "$fname\n";

    # Three-argument open with a lexical handle: the name comes from the input
    # list, so it must never be interpreted as a mode or a pipe (a leading '>'
    # or trailing '|' would be with the two-argument form).
    my $html_fh;
    unless (open($html_fh, '<', $fname)) {
        print STDLOG "Cannot read from $fname:$!\n";
        next FILE;
    }
    my @htmltext = <$html_fh>;
    close($html_fh);

    # get_htmlfile and convert_to_utf8 come from PotaModule; presumably the
    # first separates an existing header from the page body -- confirm there.
    my ($htmlheader, $htmltext) = get_htmlfile(@htmltext);

    # Nothing to output for pages without any non-blank content.
    next FILE unless defined $htmltext && $htmltext =~ /\S/;

    ($htmlheader, $htmltext)
        = convert_to_utf8($htmlheader, $htmltext, $language, $encoding);

    unless (defined $htmlheader && $htmlheader =~ /\S/) {
        # at least the filename can be used for id purposes
        $htmlheader = qq{id="$fname"};
    }

    # Only pages that are flagged as UTF-8 and well-formed are emitted.
    unless (Encode::is_utf8($htmltext, 1)) {
        print STDLOG "---utf8 encoding problem in $htmlheader\n";
        next FILE;
    }

    # just in case the closing tag exists in the html somehow
    $htmltext =~ s%</text>\n%%g;

    my $outstr = "<text $htmlheader>\n$htmltext\n</text>\n";

    # NOTE(review): kept from the original.  $htmltext has already passed the
    # is_utf8 check above, so this call looks redundant for the page text and
    # may only matter for the file name used as id -- confirm before removing.
    utf8::decode($outstr);
    print $outstr;
}

=head1 NAME

I<flist2utf8.pl>: a simple script that
takes html pages from a file list and converts them to utf8 to produce a single file.

=head1 SYNOPSIS

flist2utf8.pl -e CP936 -j zh.log <file_list > corpus.txt (if all pages are in CP936)

flist2utf8.pl -l Russian -j ru.log <file_list > corpus.txt (if Russian pages are in a variety of encodings)

=head1 DESCRIPTION

The input list must have one file name per line. The script reads
the corresponding html page, checks existence of a header at the
top and outputs a page in UTF8 encoding.

The script can take either an explicit encoding parameter or a language; in the latter case the source encoding will be guessed.

=head2 ARGUMENT AND OPTIONS

The only argument to the script is a file with a list of html file names, one per line (it can also come from STDIN).

The options are:

B<-h>: Prints short info about the script and quits.

B<--journal filename>: a log file.  If a page cannot be converted, a record is stored.

B<--language language>: the language can be specified to guide the
encoding detection mechanism (by default we use enca for Eastern
European languages, e.g. bg, cz, pl, ru, and the Encode::Guess
module for CJK).  For other languages, the encoding is assumed to be
either latin1 or utf8.

B<-e encoding --encoding encoding>: the encoding used in all files in
the corpus.  This switch overrides the language setting.  The source
encoding should be known to iconv.

You can use B<encoding-sort.pl> to sort your html files according to
the charset they B<say> they use (though this does not mean they
really use it).

=head1 DEPENDENCIES

You need to have the following modules installed:

Encode::Guess

PotaModule

To install Encode::Guess run (as the superuser):

    perl -MCPAN -e shell
    install Encode::Guess

For successful decoding of Eastern European scripts you need Enca,
which is available from: 

http://trific.ath.cx/software/enca/install/


=head1 AUTHOR

Serge Sharoff, University of Leeds

=head1 BUGS

Probably many: if you find one, please let me know.

=head1 COPYRIGHT

Copyright 2008, Serge Sharoff

This program is free software. You may copy or redistribute it under
the same terms as Perl itself.

=head1 SEE ALSO

The PotaModule and other software from the WaCky project:

http://sslmitdev-online.sslmit.unibo.it/wac/wac.php

=cut
    

