#!/usr/bin/perl -w
use utf8;
use open ':utf8';

use strict;
use CL;
use Getopt::Long;

my $attrname='lemma';
my $corpusname;
my $mindoc=200;
my $maxdoc=5000;
my $keywords;
my $subname;

GetOptions (
	    "attrname=s"  => \$attrname,
	    "corpusname=s" => \$corpusname,
	    "mindoc=s" => \$mindoc,
	    "Maxdoc=s" => \$maxdoc,
	    "keywords=s" => \$keywords,
	    "subcorpus=s" => \$subname,
	    );

binmode(STDOUT,":utf8");
my %keywords=%{getwordlist($keywords)};
my @keywords=sort keys %keywords;

my $C = new CL::Corpus $corpusname;
die "Cannot open corpus '$corpusname'\n" unless (defined $C);
my $pattr = $C->attribute($attrname, 'p');		# 'p' = p-attribute
die "Cannot open attribute '$attrname' in corpus '$corpusname'\n" unless (defined $pattr);
my $tattr = $C->attribute('text', 's');		# 's' = s-attribute
die "Cannot open s-attribute 'text' in corpus '$corpusname'\n" unless (defined $tattr);

my @ids;
my (%cpos,%ids);
if (defined $subname) {
    open(my $fh,$subname) or die "Cannot open $subname: $!\n";
    local $/;
    my $ids=<$fh>;
    close($fh);
    @ids{split /\s+/,$ids}=();
    @ids=sort keys %ids;
} else {
    $subname=0;
}

my $nexttempid;
undef my %textclass;
my $totaltexts=$tattr->max_struc;
foreach my $textnum (0..$totaltexts-1) {
    my ($spos,$epos)=$tattr->struc2cpos($textnum);
    next if $epos-$spos<$mindoc;
    my $titlestr=$tattr->struc2str($textnum);
    utf8::upgrade($titlestr);
    my ($textid,$textrest)=$titlestr=~/id="(.+?)"(.+)/;
    $textid=$nexttempid++ unless defined $textid;

    if ($subname) { # we have a sublist
	next unless exists $ids{$textid};
    } else { #we create a list of all ids
	push @ids,$textid;
    };
    @{$cpos{$textid}}=($spos,$epos);
}


foreach my $textid (@ids) {
    if (exists $cpos{$textid}) { #just in case to ensure that the id mentioned in the subcorpus file indeed exists
	my ($spos,$epos)=@{$cpos{$textid}};
	my $text_size=$epos-$spos;
	my @cpos=($spos..$epos);
	my @words=$pattr->cpos2str(@cpos);
	my $goodwords=0;
	foreach my $word (@words) {
	    if (exists $keywords{$word}) {
		++$goodwords;
	    };
	};
	printf "%s\t%d\t%d\t%0.4f\n", $textid,$text_size,$goodwords,100*$goodwords/$text_size;
    };
}

sub getfilechars {
    my $fname=shift;
    open(IN, $fname) or die "Cannot open $fname: $!\n";
    undef my %c;
    while (<IN>) {
	s/\#.*//;
	s/^\s+//;
	s/\s+$//;
	next unless /\S/;
	getchars($_,\%c);
    };
    return %c;
}

sub getchars {
    my ($source,$cref)=@_;
	foreach (split //,$_) {
	    unless (/[\d\s.,·‘“’”\"_\'\$、，;；:：．。！？?《》（）()*—…%％\`\/-]/) {
		${$cref}{$_}++;
	    };
	}
}
sub getwordlist {
    my $fname=shift;
    my %keywords;
    open(IN,$fname) or die "Cannot open $fname: $!\n";
    while (<IN>) {
	chomp;
	s/\#.+//;
	my ($word)=split /\t/,$_;
	$keywords{$word}=1 if ($word=~/\S/);
    }
    close(IN);
    return \%keywords;
}

