#!/usr/bin/perl -w
#Serge Sharoff, University of Leeds, 2006
#This script collects the most salient words from each document in a corpus
use strict;
use CWB::CL;
use Getopt::Long;
my $verylargecorpussize=50000000;

binmode(STDIN,":utf8");
binmode(STDOUT,":utf8");

printf STDERR "\n";
my $metrics='ll';
my $cutoff=5; #no more than $cutoff keywords per 1000 words in a doct
my $threshold=2; #only consider words with at least $threshold occurrences in the doct 
my $fqname1; #the reference frequency list
my $mindoclength=500; #the minimal length of document to find topic-specific words
my $corpusname;
my $attrname='lemma'; # the default frequency attribute 
my $sentend='SENT';
my $functionwords=50; #the number of words to be ignored in the frqc list
my $minkeywordlength=4; #shorter single keywords are not considered

my $subname; #not used at the moment
my $outname;
my $n;

my $doccount=0;
my $corpus_size=0;
my %wl;
my %frq;
my @ids;
my %ids;
my %cpos;
my @nwin;
my $posfilter;

GetOptions (
	    "1=s" => \$fqname1,
	    "attrname=s"  => \$attrname,
	    "break=s" => \$sentend,
	    "corpusname=s" => \$corpusname,
	    "doclength=i"  => \$mindoclength,
	    "functionwords=i" => \$functionwords,
	    "grams=i" => \$n,
	    "keywords=i"  => \$cutoff,
	 "metrics=s"  => \$metrics,
	    "outname=s"   => \$outname,
	    "posfilter=s"   => \$posfilter,
	    "threshold=i" => \$threshold,
	    );
die "Usage: $0 
-a --attrname attrname  (default 'lemma')
-b --break token for Ngrams (default 'SENT')
-c --corpusname corpusname (no default, value as in the CWB registry)
-g --grams  N (for getting Ngrams instead of single words/tags)
-o --outname FName (or STDIN)
-t --threshold N  the number of words to output
\n" unless $corpusname;

if ($outname) {
    open(OUT,">$outname") or die "Cannot create '$outname': $!\n";
} else {
    *OUT=*STDOUT;
};
my $C = new CWB::CL::Corpus $corpusname;
die "Cannot open corpus '$corpusname'\n" unless (defined $C);

my $pattr = $C->attribute($attrname, 'p');		# 'p' = p-attribute,
die "Cannot open attribute '$attrname' in corpus '$corpusname'\n" unless (defined $pattr);

my $posattr = $C->attribute('pos','p') if $posfilter;

my $tattr = $C->attribute('text', 's');		# 's' = s-attribute,
die "Cannot open s-attribute 'text' in corpus '$corpusname'\n" unless (defined $tattr);

my $totaltexts=$tattr->max_struc;
my $unknownid=1;
foreach my $textnum (0..$totaltexts-1) {
    my ($spos,$epos)=$tattr->struc2cpos($textnum);
    my $titlestr=$tattr->struc2str($textnum);
    utf8::decode($titlestr);
    my ($textid,$textrest)=$titlestr=~/id="(.+?)"(.*)/;
    $textid=$unknownid++ unless $textid;
    if ($subname) { # we have a sublist
	next unless exists $ids{$textid};
    } else { #we create a list of all ids
	push @ids,$textid;
    };
    @{$cpos{$textid}}=($spos,$epos);
}
$|=1;
my $refcount=create_fq_list($fqname1,$functionwords) if $fqname1;
my @attrs;
foreach my $textid (@ids) {
    undef @nwin;
    undef %frq;
    if (exists $cpos{$textid}) {
	my $spos=$cpos{$textid}->[0];
	my $epos=$cpos{$textid}->[1];
	my $doccount=$epos-$spos;
	$corpus_size+=$doccount;
	print STDERR "." if ($corpus_size % 10000000);
	foreach my $cpos ($spos..$epos) {
	    my $id=$pattr->cpos2id($cpos);
	    my $s=$pattr->id2str($id);
	    if ($n) {
		push @nwin,$id;
		if (scalar(@nwin)>=$n) {
		    my $packedstr=pack("w*", @nwin);
		    $frq{$packedstr}++;
		    shift @nwin;
		};
		undef @nwin if $s eq $sentend;
	    } else {
		next if length($s)<$minkeywordlength;
		if ($posfilter) {
		    my $posstr=$posattr->cpos2str($cpos);
		    next unless ($posstr) and ($posstr=~/$posfilter/);
		}
		$frq{$id}++;
	    }
	};
	storefrqdb($textid,$doccount);
	$doccount=0;
    };
}

sub storefrqdb {
    my ($curtextid,$doccount)=@_;
    return if ($mindoclength) and ($doccount<$mindoclength);
    my %score;
    foreach my $id (keys %frq) {
	next if $frq{$id}<$threshold;
	if ($n) {
	    @nwin=unpack("w*",$id);
	    my @lemmas=$pattr->id2str(@nwin);
	    $_=join ' ',@lemmas;
	} else {
	    $_=$pattr->id2str($id);
	    utf8::decode($_);
	};
	next unless exists $wl{$_};
	next if ($frq{$id}/$doccount<$wl{$_}/$refcount);
	$score{$_}=($metrics eq 'll') ? &llscore($frq{$id},$wl{$_},$doccount,$refcount) :
 	    ($metrics eq 'logodds') ? logodds($frq{$id},$wl{$_},$doccount,$refcount) :
 	    ($metrics eq 'pmi') ? pmi($frq{$id},$wl{$_},$doccount,$refcount) :
 	    ($metrics eq 'tfidf') ? tfidf($frq{$id},$wl{$_},$doccount,$refcount) : 
	    $frq{$id};
    };
    print qq{<text id="$curtextid" doccount="$doccount">\n};
    my $i=0;
    my $realcutoff=$cutoff*$doccount/1000;
    foreach (sort {$score{$b} <=> $score{$a}} keys %score) {
	last if ($cutoff) and $i++>$realcutoff;
	print "$score{$_}\t$_\n";
    }
    print "</text>\n";
}

sub create_fq_list {
    open(IN,$_[0]) or die "Cannot open $_[0]: $!\n";
    binmode(IN,":utf8");
    my $functionleft=$_[1];
    undef %wl;
    my $totalfq=0;
    my ($fq,$lemma);
    while (<IN>) {
	if ((($fq,$lemma)=/^\d+\s([\d.]+)\s(.+)/) or
	    ((($lemma,$fq)=/^(\S+.+)\t(\d+)/) and ($lemma=~/\w/))) { # for plain frq lists from uniq -c 
	    next if ($_[1]) and (--$functionleft>=0);
	    $wl{$lemma}+=$fq;
	    $totalfq+=$fq;
	}
    }
    close(IN);
    return($totalfq);
}

sub pmi {
    my ($a, $b,$c,$d)=@_;
    my $ab=$a*$b;
    my $cd=$c*$d;
    return(ln($a*$d) - ln($b*$c));
}

sub logodds {
    my ($a, $b,$c,$d)=@_;
    return(ln($a) + ln($d-$b) - ln($b) - ln($d-$b));
}

sub llscore {
    my ($a, $b,$c,$d)=@_;
    my $cd=$c+$d;
    unless ($b) {
	$b=0.1;
    }
    my $E1 = $c*($a+$b)/($cd);
    my $E2 = $d*($a+$b)/($cd);
    return (2*(($a*ln($a/$E1))+ ($b*ln($b/$E2)))) if ($E1>0) and ($E2>0);
};

sub ln {
    return(($_[0]>0) ? log($_[0]) : 0)
}

