#!/usr/bin/perl -w
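# A filter that converts CEDICT files to the dictd format.
# Invocation: the first argument is a word-frequency list; CEDICT data is read
# from the remaining arguments, or from standard input when there are none.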
use strict;
# Main program.
#
# Reads a frequency list (first command-line argument), then converts every
# CEDICT line read through <> (remaining arguments or standard input) into a
# dictd source entry:
#
#     HEADWORD\n\tTRANSLATION\n\n
#
# HEADWORD is "simplified%%%pinyin" when the traditional and simplified forms
# are identical, otherwise "simplified%%%traditional%%%pinyin".  When the
# simplified form occurs in the frequency list, its frequency followed by
# "/ " is put in front of the translation.

# Index tables keyed by pinyin, simplified form and traditional form.
# Nothing fills them at present (the assignments in the loop are disabled),
# so the printlist calls at the end of the program emit nothing.
my (%p,%s,%t);

# Fail with a usage message instead of handing an undefined name to open.
my $frqfile=shift;
defined $frqfile
    or die "Usage: $0 FREQUENCY_FILE [CEDICT_FILE ...]\n";

# $num is the total frequency returned by create_fq_list; this script does
# not use it.  The frequency table is consulted through its reference, so no
# second copy of the hash is made.
my ($num,$frqref)=create_fq_list($frqfile);

# dictd header entries.
print "\# CEDICT converted to the dictd format; the frequency data are from the Chinese Internet corpus\n\n";
print "00-database-short/00databaseshort\n\tCEDICT converted to the dictd format\n\n";
print "00-database-url/00databaseurl\n\thttp://www.mandarintools.com\n\n";

# A lexical line variable is used so that the global $_ is left untouched.
while (my $line=<>) {
    # Expected shape: TRADITIONAL SIMPLIFIED [PINYIN] /TRANSLATION.../
    # Lines that do not match are skipped silently.
    my ($trad,$simp,$pinyin,$gloss)=$line=~/(.+?) (.+?) \[(.+?)\] \/(.+)/;
    next unless defined $gloss;

    $pinyin=lc($pinyin);

    # The traditional form is only included when it differs from the
    # simplified one.
    my $head=($simp eq $trad)
        ? join("%%%",$simp,$pinyin)
        : join("%%%",$simp,$trad,$pinyin);

    # Disabled index population, kept for reference:
    #   identical forms: $p{$pinyin}=join("\n\t",$simp,$gloss);
    #                    $s{$simp}=join("\n\t",$pinyin,$gloss);
    #   differing forms: $p{$pinyin}=join("\n\t",$simp,$gloss,$trad);
    #                    $s{$simp}=join("\n\t",$pinyin,$gloss,$trad);
    #                    $t{$trad}=join("\n\t",$simp,$pinyin,$gloss);

    # Frequencies are looked up by the simplified form only.
    if (exists $frqref->{$simp}) {
        $gloss="$frqref->{$simp}/ $gloss";
    }

    print "$head\n\t$gloss\n\n";
}


printlist(\%s);
printlist(\%t);
printlist(\%p);

# printlist(\%table)
#
# Prints every key of the referenced hash in sorted (string) order to the
# currently selected output handle.  Each entry is written as the key, a
# newline and a tab, the value, and then a blank line.
sub printlist {
    my ($table)=@_;
    my %entry_for=%{$table};    # work on a private copy of the table
    for my $headword (sort keys %entry_for) {
        print $headword . "\n\t" . $entry_for{$headword} . "\n\n";
    }
}
# create_fq_list($file [, $functionwords])
#
# Reads a word-frequency list from $file and returns a two-element list:
# the sum of all accepted frequencies, and a reference to a hash mapping each
# lemma to its accumulated frequency.
#
# Two line formats are recognised:
#   "<digits> <frequency> <lemma>"  - the frequency may contain dots; its
#                                     integer part is added to the lemma's
#                                     count, the unrounded value to the total;
#   "<count> <lemma>"               - optional leading whitespace, as written
#                                     by "uniq -c"; accepted only when the
#                                     lemma contains a word character.
# Lines in neither format are ignored.
#
# $functionwords, when given, is the number of leading accepted lines to
# skip; it defaults to 0.
#
# Dies when no file name is given or the file cannot be opened.
sub create_fq_list {
    my ($file,$functionwords)=@_;
    defined $file
        or die "Cannot open frequency list: no file name given\n";

    # Three-argument open with a lexical handle: the name is taken literally
    # (no mode characters, pipes or whitespace stripping) and no global
    # bareword handle is created.
    open(my $in,'<',$file) or die "Cannot open $file: $!\n";

    my $functionleft=defined $functionwords ? $functionwords : 0;
    my %wl;
    my $totalfq=0;

    # A lexical line variable keeps the caller's $_ intact.
    while (my $line=<$in>) {
        my ($fq,$lemma);
        if (($fq,$lemma)=$line=~/^\d+ ([\d.]+) (.+)/) {
            next if --$functionleft>=0;
            $wl{$lemma}+=int($fq);
            $totalfq+=$fq;
        } elsif ((($fq,$lemma)=$line=~/^\s*(\d+)\s(.+)/) and ($lemma=~/\w/)) { # for plain frq lists from uniq -c
            next if --$functionleft>=0;
            $wl{$lemma}+=$fq;
            $totalfq+=$fq;
        }
    }
    close($in);
    return($totalfq,\%wl);
}

