#!/usr/bin/perl -w
#Serge Sharoff, University of Leeds, August, 2006
#A script for getting frequency lists for subcorpora encoded in CWB

use utf8;
use CL;
use Getopt::Long;
my $verylargecorpussize=50000000;
$attrname='lemma'; # the default frequency attribute 
$sentend='SENT';
GetOptions (
	    "attrname=s"  => \$attrname,
	    "break=s" => \$sentend,
	    "corpusname=s" => \$corpusname,
	    "grams=i" => \$n,
	    "naturalfrq"   => \$naturalfrq,
	    "outname=s"   => \$outname,
	    "subname=s" => \$subname,
	    "threshold=i" => \$threshold,
	    "verylargecorpussize=i" => \$verylargecorpussize,
	    );
die "Usage: $0 
-a --attrname attrname  (default 'lemma')
-b --break token for Ngrams (default 'SENT')
-c --corpusname corpusname (no default, value as in the CWB registry)
-g --grams  N (for getting Ngrams instead of single words/tags)
-n --naturalfrq (the actual number of occurrences, otherwise the default ipm, instances per million words)
-o --outname FName (or STDIN)
-s --subcorpus fname with the list of subcorpus ids
-t --threshold N  the number of words to output
\n" unless $corpusname;

if ($outname) {
    open(OUT,">$outname") or die "Cannot create '$outname': $!\n";
} else {
    *OUT=*STDOUT;
};
$C = new CL::Corpus $corpusname;
die "Cannot open corpus '$corpusname'\n" unless (defined $C);

$pattr = $C->attribute($attrname, 'p');		# 'p' = p-attribute,
die "Cannot open attribute '$attrname' in corpus '$corpusname'\n" unless (defined $pattr);
$tattr = $C->attribute('text', 's');		# 's' = p-attribute,
die "Cannot open s-attribute 'text' in corpus '$corpusname'\n" unless (defined $tattr);
if (defined $subname) {
    open(my $fh,$subname) or die "Cannot open $subname: $!\n";
    local $/;
    $ids=<$fh>;
    close($fh);
    @ids=split /\s+/,$ids;
    @ids{@ids}=();
    @ids=sort keys %ids;
} else {
    $subname=0;
};
my $totaltexts=$tattr->max_struc;
my $unknownid=1;
foreach my $textnum (0..$totaltexts-1) {
    my ($spos,$epos)=$tattr->struc2cpos($textnum);
    my $titlestr=$tattr->struc2str($textnum);
    utf8::upgrade($titlestr);
    my ($textid,$textrest)=$titlestr=~/id="(.+?)"(.+)/;
    $textid=$unknownid++ unless $textid;
    if ($subname) { # we have a sublist
	next unless exists $ids{$textid};
    } else { #we create a list of all ids
	push @ids,$textid;
    };
    @{$cpos{$textid}}=($spos,$epos);
}

#binmode(OUT, ":utf8");
$|=1;
print OUT "The frequency distribution for attribute '$attrname'";
print STDERR "The frequency distribution for attribute '$attrname'";
print OUT "in $subname (@ids) \n" if $subname;
print STDERR " in $subname (",scalar(@ids)," texts)\n"  if $subname;

my $tmpdisplay=0;
foreach my $textid (@ids) {
    undef @nwin;
    if (exists $cpos{$textid}) {
	my $spos=$cpos{$textid}->[0];
	my $epos=$cpos{$textid}->[1];
	$corpus_size+=$epos-$spos;
	$tmpdisplay+=$epos-$spos;
	if ($tmpdisplay>10000000) {
	    print STDERR ".";
	    $tmpdisplay=0;
	}
	foreach $cpos ($spos..$epos) {
	    $id=$pattr->cpos2id($cpos);
	    $s=$pattr->id2str($id);
	    if ($n) {
		push @nwin,$id;
		if (scalar(@nwin)>=$n) {
		    my $packedstr=pack("w*", @nwin);
		    $freq{$packedstr}++ if (exists $freq{$packedstr}) or 
			($corpus_size<$verylargecorpussize); # stop counting singletons once we reached this size
		    shift @nwin;
		};
		undef @nwin if $s eq $sentend;
	    } else {
		$freq{$id}++;
	    }
	};
    };
}

#total_corpus_size = $pattr->max_cpos;
print STDERR "\n - subcorpus size: $corpus_size tokens\n";
print OUT "\n - subcorpus size: $corpus_size tokens\n";

$lex_size = scalar(keys %freq);
$threshold=$lex_size unless $threshold;
print STDERR " - lexicon size: $lex_size types\n";
print OUT " - lexicon size: $lex_size types\n";

$i=0;
$factor=$corpus_size/1000000;
foreach $id (sort { $freq{$b} <=> $freq{$a} } keys %freq) {
    if ($n) {
	@nwin=unpack("w*",$id);
	my @lemmas=$pattr->id2str(@nwin);
	$lemma=join ' ',@lemmas;
    } else {
	$lemma=$pattr->id2str($id);
    };
    utf8::upgrade($lemma);
    next if ($lemma eq '__UNDEF__') or ($lemma eq '@card@') or ($lemma eq '@ord@');
    next if ($lemma=~/[\+=(),;:.@&%<>\[\]\'\`\"\/?!§\x{00B7}\x{00BB}\x{00AB}\x{3001}\x{3002}\x{201C}\x{201D}]/) and ($attrname ne 'pos');
    next unless $lemma=~/[^*=\d-]+/; # only dashes and/or digits
    ++$i;
    if ($naturalfrq) {
	$countstr = $freq{$id};
    } else {
	$curfrq=$freq{$id} / $factor;
	$countstr = sprintf("%.2f", $curfrq);
    }
    print OUT "$i $countstr $lemma\n";
    last if $i>=$threshold;
};
    
close(OUT);


