#!/usr/bin/perl -w
#Serge Sharoff, University of Leeds
#the script converts text descriptions from Weka to CW text connection graphs
# ---------------------------------------------------------------------------
# Main script.
#
# Command line:  <arff-file  fqname  outfile  [threshold]
#   STDIN      Weka ARFF text; everything up to the literal marker '@DATA'
#              (matched case-sensitively) is skipped as the header.
#   fqname     frequency list, loaded through create_fq_list().
#   outfile    prefix for the two output files:
#                <outfile>-nodes.txt   "<number>\t<text id>" per document
#                <outfile>-edges.txt   "<from>\t<to>\t<score>" per edge,
#                                      written once in each direction
#   threshold  maximum number of partners written per document (default 100;
#              0 or an empty value also selects the default).
# ---------------------------------------------------------------------------
my $fqname    = shift;
my $outfile   = shift;
my $threshold = shift || 100;
die "Usage: $0 <arff-file fqname outfile [threshold]\n fqname sets items that are too frequent and do not distinguish between documents" unless $outfile;

# The frequency list is loaded here, but the code below does not consult it.
my %f = %{ create_fq_list($fqname) };

my $nodes_path = "${outfile}-nodes.txt";
my $edges_path = "${outfile}-edges.txt";
open(my $nodes_fh, '>', $nodes_path)
    or die "Cannot open $nodes_path for writing: $!\n";
open(my $edges_fh, '>', $edges_path)
    or die "Cannot open $edges_path for writing: $!\n";

{
    # Swallow the ARFF header in a single read by making '@DATA' the
    # input record separator for the duration of this block only.
    local $/ = '@DATA';
    my $header = <STDIN>;    # the header text itself is not used
}

my $textid;                  # id taken from the most recent "% <id>" line
my $textnum = 0;             # 1-based node number written to the nodes file
my (@textids, @textvector);
while (my $line = <STDIN>) {
    chomp $line;

    # A "% <id>" marker names the document whose data follows; a line
    # carrying such a marker is never treated as a data row.
    if ($line =~ s/\% (.+)//) {
        $textid = $1;
        next;
    }

    my @attrs = split /,/, $line;
    next unless defined $attrs[1];    # need at least two elements

    $textnum++;
    push @textids, $textid;
    # @attrs is a fresh lexical on every iteration, so each stored
    # reference points at its own row.  (@textvector is collected but not
    # read by the code below.)
    push @textvector, \@attrs;
    print {$nodes_fh} "$textnum\t$textid\n";
}
close($nodes_fh) or die "Cannot close $nodes_path: $!\n";

$| = 1;    # autoflush the currently selected output handle
print STDERR "Total: ", scalar(@textids), " files\n";

# NOTE(review): %keys (text id -> lemma -> weight) and @keysi (the lemmas to
# compare) are read by the scoring loop but nothing in the visible script
# ever fills them.  As it stands every score is 0 and the edges file stays
# empty.  The code that populates them appears to be missing -- confirm the
# intended source before relying on the edges output.
our (%keys, @keysi);

for my $i (0 .. $#textids) {
    print STDERR "$i " if $i % 100 == 0;    # progress indicator
    my $id_i = $textids[$i];

    # Score document $i against every later document; only non-zero
    # scores are kept.
    my %score_of;
    for my $j ($i + 1 .. $#textids) {
        my $id_j  = $textids[$j];
        my $score = 0;
        for my $lemma (@keysi) {
            $score += $keys{$id_i}{$lemma} + $keys{$id_j}{$lemma}
                if exists $keys{$id_j}{$lemma};
        }
        $score_of{$j} = $score if $score;
    }

    # Write the strongest links first, in both directions, and stop once
    # $threshold partners have been written for this document.
    my $count = $threshold;
    for my $j (sort { $score_of{$b} <=> $score_of{$a} } keys %score_of) {
        printf {$edges_fh} "%d\t%d\t%d\n", $i + 1, $j + 1, $score_of{$j};
        printf {$edges_fh} "%d\t%d\t%d\n", $j + 1, $i + 1, $score_of{$j};
        last unless --$count;
    }
}
print STDERR "\n";
close($edges_fh) or die "Cannot close $edges_path: $!\n";

# Print the arguments as an HTML <h2> heading (preceded by the closing
# </head> and opening <body> tags) to the currently selected output handle,
# then abort with die.
#
# Arguments: the message parts; they are joined with the list separator
#            (a single space by default) by string interpolation.
# Never returns.
sub report_error {
    # The heading is closed with </h2>; the text is emitted as given,
    # without HTML escaping.
    print "</head><body><h2>@_</h2>\n";
    die;
}


# Map a corpus name to a two-letter language code.
#
# Argument: the corpus name; it is upper-cased before matching, so the
#           lookup is case-insensitive.
# Returns:  'ru' for names ending in RU and for the corpus RRC,
#           'de' for INTERNET-DE,
#           'zh' for names ending in ZH,
#           'en' for everything else.
sub setlang {
    my ($corpusname) = uc(shift);

    return 'ru' if $corpusname =~ /RU$/ or $corpusname eq 'RRC';
    return 'de' if $corpusname eq 'INTERNET-DE';
    # The ZH suffix yields 'zh' rather than repeating the 'de' of the
    # INTERNET-DE rule just above.
    return 'zh' if $corpusname =~ /ZH$/;
    return 'en';
}

# Read a frequency list and return a reference to a hash mapping each item
# to its summed frequency.
#
# Argument: the path of the list file.  It is opened strictly for reading
#           (three-argument open), so a name such as "cmd |" is treated as
#           a file name and never as a pipe or mode specifier.
# Accepted line formats (lines matching neither are ignored):
#   rank frq item   e.g. "12 345.6 word"
#   frq item        e.g. "   345 word", as produced by sort | uniq -c
# Returns:  a reference to the package hash %wl.  %wl is reset on every
#           call, so copy the contents if they must survive another call.
# Side effects: adds every frequency read to the package scalar $totalfq,
#           which is not reset between calls.
# Dies if the file cannot be opened.
sub create_fq_list {
    my ($path) = @_;
    open(my $in, '<', $path) or die "Cannot open $path: $!\n";

    undef %wl;
    while (my $line = <$in>) {
        my ($fq, $lemma);
        # A list assignment in boolean context is true only when the
        # match succeeded, so the second pattern is tried only if the
        # first one failed.
        if ((($fq, $lemma) = $line =~ /^\d+\s([\d.]+)\s(.+)/) or # rank frq item
            (($fq, $lemma) = $line =~ /^ *(\d+)\s(.+)/)) {       # frq item
            $wl{$lemma} += $fq;
            $totalfq    += $fq;
        }
    }
    close($in);
    return \%wl;
}

