#!/usr/bin/perl -w
use utf8;
use open ':utf8';

# Driver: the first command-line argument names the file listing the known
# characters; the corpus is then read from the remaining arguments or from
# standard input (via <>).
#
# Corpus lines are dispatched on their first characters:
#   '<text'    starts a text; its id is taken from the id="..." attribute,
#              falling back to the current input line number
#   '</text>'  ends a text; one tab-separated report line is printed
#   other '<'  any other markup line is ignored
#   otherwise  the line's characters are added to the current text's counts
#
# Report columns: text id, occurrences of known characters, occurrences of
# unknown characters, number of distinct unknown characters, coverage, and
# the unknown characters themselves separated by spaces.
my $knowncharsfname=shift;
die "Usage: $0 knownchars-file [corpus-file ...]\n"
    unless defined $knowncharsfname;
binmode(STDIN,":utf8");
binmode(STDOUT,":utf8");
my %knownchars=getfilechars($knowncharsfname);
my %c;       # character => occurrence count for the text being read
my $textid;  # identifier of the text being read
while (<>) {
    if (substr($_,0,5) eq '<text') {
        ($textid)=/<text id="(.+?)"/;
        # Test definedness rather than truth so that an id of "0" is kept.
        $textid=$. unless defined $textid;
#last if $.>10000    # debugging aid: stop after the first 10000 input lines
    } elsif (substr($_,0,7) eq '</text>') {
        my ($fknownchars, $unknownchars, $unknowncharscount, $coverage, $unknowncharsref)=ranktext(\%c);
        print "$textid\t$fknownchars\t$unknownchars\t$unknowncharscount\t$coverage\t@{$unknowncharsref}\n";
        %c=();  # start the next text with empty counts
    } elsif (substr($_,0,1) ne '<') {
        # The line is deliberately left in $_ for the call.
        getchars($_,\%c);
    }
}


# ranktext(\%counts)
#
# Compare the character counts of one text against the file-level
# %knownchars hash.
#
# Argument:
#   \%counts  reference to a hash mapping character => number of occurrences
#
# Returns a five-element list:
#   1. total occurrences of characters that are keys of %knownchars
#   2. total occurrences of characters that are not
#   3. number of distinct unknown characters
#   4. coverage = known occurrences / all occurrences; 0 when the hash
#      holds no occurrences at all, so an empty text does not abort the
#      run with a division by zero
#   5. reference to an array of the distinct unknown characters, in hash
#      iteration order
sub ranktext {
    my ($cref)=@_;
    my @unknownchars;
    my $fknownchars=0;
    my $unknownchars=0; # the cumulative number of unknown chars
    foreach my $char (keys %{$cref}) {
        if (exists $knownchars{$char}) {
            $fknownchars+=$cref->{$char};
        } else {
            $unknownchars+=$cref->{$char};
            push @unknownchars,$char;
        }
    }
    my $unknowncharscount=scalar @unknownchars; # the number of unique unknown chars
    my $total=$fknownchars+$unknownchars;
    my $coverage=$total ? $fknownchars/$total : 0;
    return $fknownchars, $unknownchars, $unknowncharscount, $coverage, \@unknownchars;
}

# getfilechars($fname)
#
# Read the file $fname and collect the characters it contains.
#
# Everything from a '#' to the end of a line is discarded as a comment,
# surrounding whitespace is trimmed, and lines that are then empty are
# skipped.  The remaining text is handed to getchars(), which does the
# counting.
#
# Returns the resulting hash (character => count) as a flat list.
# Dies with "Cannot open ..." if the file cannot be opened.
sub getfilechars {
    my $fname=shift;
    # Three-argument open: a name starting with '>' or ending in '|' is
    # treated as a plain file name, never as a mode or a command.  No layer
    # is given here, so a default input layer set by a file-level
    # "use open" pragma still applies.
    open(my $in, '<', $fname) or die "Cannot open $fname: $!\n";
    my %c;
    local $_;  # do not clobber the caller's $_
    while (<$in>) {
        s/\#.*//;
        s/^\s+//;
        s/\s+$//;
        next unless /\S/;
        # The cleaned line is deliberately left in $_ for the call.
        getchars($_,\%c);
    }
    close($in);
    return %c;
}

# getchars($source, \%counts)
#
# Add the characters of $source to the counts in %counts.
#
# Characters matched by the exclusion class below are not counted: ASCII
# letters, digits, whitespace and the listed ASCII and CJK punctuation
# marks.  Every other character increments $counts{$char}.
#
# Arguments:
#   $source   the string to examine; an undefined value counts nothing
#   \%counts  reference to the hash to update in place
#
# Returns nothing; the result is the side effect on %counts.
sub getchars {
    my ($source,$cref)=@_;
    return unless defined $source;
    # Iterate over the argument itself, not over whatever the global $_
    # happens to hold at the time of the call.
    foreach my $char (split //,$source) {
        next if $char =~ /[A-Za-z\d\s.,·‘“’”\"_\'\$、，;；:：．。！？?《》（）()*—…%％\`\/-]/;
        ${$cref}{$char}++;
    }
    return;
}

