#!/usr/bin/perl -w
#Creates a model in Weka's ARFF format out of a list of features (in the frequency list comparison format) and a corpus subset
use strict;
use CL;
use Getopt::Long;

my $sentend='SENT';
my ($corpusname,$fqname1,$n,$subname);
my $attrname='lemma';
my $mindoc=500;
my $maxdoc=100000;
my $description;
my $punctuation;
my %punclass = (
'?' => 'quest',
'？' => 'quest',
'!' => 'excl',
'！' => 'excl',
'。' => 'istop',
'.' => 'fstop',
'《' => 'quote',
'》' => 'quote',
'"' => 'quote',
'”' => 'quote',
'’' => 'quote',
'“' => 'quote',
'‘' => 'quote',
'' => 'other',
		 );

undef my %punclasstypes;
@punclasstypes{values %punclass}=();
my @punclass=sort keys %punclasstypes;
my $texttyp;
my $wekatest;

GetOptions (
	    "1=s"  => \$fqname1,
	    "attrname=s"  => \$attrname,
	    "break=s" => \$sentend,
	    "corpusname=s" => \$corpusname,
#	    "description=s" => \$descpattern,
	    "description=s" => \$description,
	    "grams=i" => \$n,
	    "mindoc=s" => \$mindoc,
	    "xdoc=s" => \$maxdoc,
	    "punctuation" => \$punctuation,
	    "subcorpus=s" => \$subname,
	    "texttypes=s" => \$texttyp,
	    "weka" => \$wekatest,
	    );
my @flist=@ARGV;
die "Usage: $0 FilesWithFeatures
feature lists are in the one-per line format, extra information about a feature can be recorded after a tab
-1 --fname Frequency list from 
-a --attrname attrname  (default 'lemma')
-b --break token for Ngrams (default 'SENT')
-c --corpusname corpusname (no default value, give as in the CWB registry)
-d --description descriptive feature for a subcorpus
-m --mindoc  N Ignore documents shorter than N words (default 500)
-g --grams  N (features are Ngrams instead of single words/tags)
-p --punctuation Add punctuation count to features 
-s --subcorpus fname with the list of subcorpus ids
-t --texttypelist (the list of all possible features for filling in the ARFF header)
-w --weka for adding docts to an existing Weka file, do not output the header
-c, -d, -t and at least one feature list are required
\n" unless $corpusname or $flist[0] or $description or $texttyp;

my %wl;
my ($refcount,$refwl);
if ($fqname1) {
    ($refcount,$refwl)=create_fq_list($fqname1);
    %wl=%{$refwl};
}

undef my %keywords;
foreach my $fname (@flist) {
    open(IN,$fname) or die "Cannot open $fname: $!\n";
    while (<IN>) {
	chomp;
	next if /_UNDEF_/;
	s/\#.+//;
	my ($word)=split /\t/,$_;
	$keywords{$word}=1 if ($word) and ($word=~/\S/);
    }
    close(IN);
};
my @keywords=sort keys %keywords;

my $C = new CL::Corpus $corpusname;
die "Cannot open corpus '$corpusname'\n" unless (defined $C);

my $pattr = $C->attribute($attrname, 'p');		# 'p' = p-attribute
die "Cannot open attribute '$attrname' in corpus '$corpusname'\n" unless (defined $pattr);

my $wattr = ($attrname eq 'word') ? $pattr : $C->attribute('word', 'p');
die "Cannot open attribute 'word' in corpus '$corpusname'\n" unless (defined $wattr);

my $tattr = $C->attribute('text', 's');		# 's' = s-attribute
die "Cannot open s-attribute 'text' in corpus '$corpusname'\n" unless (defined $tattr);

my @ids;
my (%cpos,%ids);
if (defined $subname) {
    open(my $fh,$subname) or die "Cannot open $subname: $!\n";
    local $/;
    my $ids=<$fh>;
    close($fh);
    @ids{split /\s+/,$ids}=();
    @ids=sort keys %ids;
} else {
    $subname=0;
}

my $nexttempid;
my $totaltexts=$tattr->max_struc;
#if ($totaltexts>10000) {
#    $totaltexts=10000
#};
foreach my $textnum (0..$totaltexts-1) {
    my ($spos,$epos)=$tattr->struc2cpos($textnum);
    next if ($mindoc) and ($epos-$spos<$mindoc);
    next if ($maxdoc) and ($epos-$spos>$maxdoc);
    my $titlestr=$tattr->struc2str($textnum);
    utf8::upgrade($titlestr);
    my ($textid,$textrest)=$titlestr=~/id="(.+?)"(.+)/;
    $textid=$nexttempid++ unless defined $textid;
    my $textdescclass=$description; #originally I wanted to extract it from textrest, but this is not practicable because of hundreds of conventions

    if ($subname) { # we have a sublist
	next unless exists $ids{$textid};
    } else { #we create a list of all ids
	push @ids,$textid;
    };

    @{$cpos{$textid}}=($spos,$epos,$textdescclass);
}

print "\% The model for attribute '$attrname' in $subname (",scalar(@ids)," texts)\n";
unless ($wekatest) {
    print "\@RELATION textcat\n";
    print "\% Attributes taken from @flist\n"; 

    my $i=1;
    foreach (@keywords) {
	print "%% $_\n";
	print "\@ATTRIBUTE w",$i++," real\n";
    };
    if ($punctuation) {
	foreach (@punclass) {
	    print "%% $_\n";
	    print "\@ATTRIBUTE p",$i++," real\n";
	};
    }; 

    print "\@ATTRIBUTE textclass {$texttyp}\n\n\@DATA\n";
};

$|=1;
my $corpus_size=0;
my $textclassnum=1;

foreach my $textid (@ids) {
    if (exists $cpos{$textid}) { #just in case to ensure that this id indeed exists
	my ($spos,$epos,$textdescclass)=@{$cpos{$textid}};
	my $curdoclength=$epos-$spos;
	$corpus_size+=$curdoclength;

	undef my %words;
	undef my @words;
	if ($n) {
	    undef my @nwin;
	    my @singlewords=$pattr->cpos2str($spos..$epos);
	    foreach my $s (@singlewords) {
		push @nwin,$s;
		if (scalar(@nwin)>=$n) {
		    push @words, "@nwin";
		    shift @nwin;
		};
		undef @nwin if $s eq $sentend;
	    }
	} else {
	    my @cpos=($spos..$epos);
	    @words=$pattr->cpos2str(@cpos);
	};
	foreach my $word (@words) {
	    if (exists $keywords{$word}) {
		++$words{$word};
	    };
	};
	next if scalar(@words)<3; # even if we reject short docts, some stray docts can show little variation in pos tags
	print "% $textid\n";
	my $factor=1000000/$curdoclength;
	foreach (@keywords) {
	    if (exists $words{$_}) {
		printf "%.0f,", ($fqname1) ? 
		    llscore($words{$_},$wl{$_},$curdoclength,$refcount) : ($words{$_}*$factor);
	    } else {
		print "0,";
	    };
	};
	undef @words;
	undef %words;
	undef my %puncount;
	if ($punctuation) {
	    my @tokens=$wattr->cpos2str($spos..$epos);
	    foreach (@tokens) {
		if (my $punclass=$punclass{$_}) {
		    ++$puncount{$punclass};
		} elsif (/^[,·，;；:：．（）()*—…%％\/\[\]－-]$/) {
		    ++$puncount{'other'};
		}
	    };
	    foreach (@punclass) {
		if (exists $puncount{$_}) {
		    printf "%.0f,",$puncount{$_}*$factor;
		} else {
		    print "0,";
		};
	    };
	};
	print "$textdescclass\n";
    }
}

sub create_fq_list {
    open(IN,$_[0]) or die "Cannot open $_[0]: $!\n";
    undef my %wl;
    my $totalfq=0;
    my ($fq,$lemma);
    while (<IN>) {
	if (($fq,$lemma)=/^\d+ ([\d.]+) (.+)/) {
	    $wl{$lemma}+=$fq;
	    $totalfq+=$fq;
	} elsif ((($fq,$lemma)=/^\s*(\d+)\s(.+)/) and ($lemma=~/\w/)) { # for plain frq lists from uniq -c
	    $wl{$lemma}+=$fq;
	    $totalfq+=$fq;
	}
    }
    close(IN);
    return($totalfq,\%wl);
}

sub ln {
    return(($_[0]>0) ? log($_[0]) : 0)
}
sub llscore {
    my ($a, $b,$c,$d)=@_;
    return 0 unless $a and $b and $c and $d;
    my $cd=$c+$d;
    my $E1 = $c*($a+$b)/($cd);
    my $E2 = $d*($a+$b)/($cd);
    my $g2= ($a/$c>$b/$d) ?  
	 #if the feature overused in the test corpus
	(2*(($a*ln($a/$E1))+ ($b*ln($b/$E2)))) : 0;
    return $g2
};

