#!/usr/bin/perl -w
use utf8;
use CL;
use Getopt::Long;

%corpuslang=(
'INTERNET-DE' => 'de',
'FICTIONER-RU' => 'ru',
'FICTIONRE-RU' => 'ru',
'GLOBAL-RU' => 'ru',
'IZVESTIA' => 'ru',
'RRC' =>       'ru',
'RS' =>       'ru'
);
$attrname='lemma'; # the default frequency attribute 
GetOptions ("corpusname=s" => \$corpusname,
	    "naturalfrq"   => \$naturalfrq,
	    "outname=s"   => \$outname,
	    "threshold=i" => \$threshold,
	    "attrname=s"  => \$attrname);
die "Usage: $0 -corpusname -naturalfrq -outname -threshold (the number of words to output) -attrname\n" unless ($corpusname) and ($outname);
open(OUT,">$outname") or die "Cannot create '$outname': $!\n";
$corpusname=lc($corpusname);
$C = new CL::Corpus $corpusname;
die "Cannot open corpus '$corpusname'\n" unless (defined $C);
$curlang=$corpuslang{$corpusname} || 'en';
binmode(OUT, ":utf8"); # if $curlang eq 'ru';

$lemmaattr = $C->attribute($attrname, 'p');		# 'p' = p-attribute, $lemmaattr is left for historic reasons
die "Cannot open attribute '$attrname' in corpus '$corpusname'\n" unless (defined $lemmaattr);

print OUT "The frequency distribution for attribute '$attrname' in corpus '$corpusname'\nFor more information visit http://corpus.leeds.ac.uk/list.html\n";
$corpus_size = $lemmaattr->max_cpos;
print " - corpus size: $corpus_size tokens\n";
print OUT " - corpus size: $corpus_size tokens\n";

$lex_size = $lemmaattr->max_id;
$threshold=$lex_size+1 unless $threshold;
print " - lexicon size: $lex_size types\n";
print OUT " - lexicon size: $lex_size types\n";
foreach $id (0..$lex_size) {
    $freq{$id}+=$lemmaattr->id2freq($id);
#$i++;
#last if $i>=100000;
#    $str=$lemmaattr->id2str($id);
}
$i=0;
$factor=$corpus_size/1000000;
foreach $id (sort { $freq{$b} <=> $freq{$a} } keys %freq) {
    $lemma=$lemmaattr->id2str($id);
    utf8::decode($lemma);
    next if ($lemma eq '__UNDEF__') or ($lemma eq '@card@') or ($lemma eq '@ord@');
    next if ($lemma=~/[\+=(),;:.@&%<>\[\]\'\`\"\/?!§\x{00B7}\x{00BB}\x{00AB}\x{3001}\x{3002}\x{201C}\x{201D}]/) and ($attrname ne 'pos');
    next unless $lemma=~/[^*\d-]+/; # only dashes and/or digits
    ++$i;
    if ($naturalfrq) {
	$countstr = $freq{$id};
    } else {
	$curfrq=$freq{$id} / $factor;
	$countstr = sprintf("%.2f", $curfrq);
    }
    print OUT "$i $countstr $lemma\n";
    last if $i>=$threshold;
};
    
close(OUT);

