#!/usr/bin/perl -w
# Converts a Weka ARFF file into a sparse matrix file, as used by Cluto and SVDLIB.
use strict;
use warnings;
use Scalar::Util qw(looks_like_number);

# Usage: <this script> OUTMATRIX < input.arff
#
# Reads ARFF-like text on STDIN and writes:
#   OUTMATRIX         header line "rows cols nonzeros", then one line per data
#                     row made of "column value " pairs (columns are 1-based)
#   OUTMATRIX.clabel  text of "% label" / "%% label" lines seen before @DATA
#   OUTMATRIX.rlabel  text of "% label" / "%% label" lines seen after @DATA
#   OUTMATRIX.rclass  last comma-separated field of every data row
# OUTMATRIX-tmp is used as scratch space for the rows (their count is only
# known at end of input) and is removed once OUTMATRIX has been written.

my $outmatrix = shift;
die "Usage: $0 OUTMATRIX < input.arff\n"
    unless (defined $outmatrix and length $outmatrix);

my $tmpfile = "$outmatrix-tmp";

# Three-argument open so that odd characters in the name can never be
# interpreted as an open mode.
open(my $tmp_out, '>', $tmpfile)
    or die "Cannot create $outmatrix-tmp: $!\n";
open(my $outr, '>', "$outmatrix.rlabel")
    or die "Cannot create $outmatrix.rlabel: $!\n";
open(my $outc, '>', "$outmatrix.clabel")
    or die "Cannot create $outmatrix.clabel: $!\n";
open(my $outclass, '>', "$outmatrix.rclass")
    or die "Cannot create $outmatrix.rclass: $!\n";

my $outdesc    = $outc;    # label comments go to .clabel until @DATA is seen
my $in_data    = 0;        # true once the @DATA marker has been read
my $nonzeronum = 0;        # number of "column value" pairs written
my $m          = 0;        # field count of the most recent data row
my $n          = 0;        # number of data rows written

LINE:
while (my $line = <STDIN>) {
    # Split the raw line (newline still attached) so that the rule for
    # recognising a data row stays the same: more than three fields and a
    # word character in the last one.
    my @data = split /,/, $line;

    if ($line =~ /\%\%? (.+)/) {
        # "% label" or "%% label": a column label before @DATA, a row
        # label after it.
        my $label = $1;
        $label =~ s/\r\z//;    # do not leak the CR of CRLF input
        print {$outdesc} "$label\n";
    }
    elsif ($line =~ /\@DATA/i) {
        # ARFF declarations are case-insensitive. Switch only once so that a
        # repeated marker does not try to close .clabel a second time.
        unless ($in_data) {
            $in_data = 1;
            $outdesc = $outr;
            close($outc) or die "Cannot write $outmatrix.clabel: $!\n";
        }
    }
    elsif ($line =~ /\@ATTRIBUTE/i or $line =~ /\%.+/) {
        # Attribute declarations and other comments carry no matrix data.
        next LINE;
    }
    elsif (@data > 3 and $data[-1] =~ /\w/) {
        # NOTE(review): $m counts every field, including the class field
        # popped below, so the header reports one column more than the
        # highest column index ever written. Kept as is; confirm that this
        # is what the consumers of the matrix expect.
        $m = scalar @data;

        # The last field is the class; it still carries the line ending.
        # Normalise it so a final line without a newline (or CRLF input)
        # still produces exactly one clean line in .rclass.
        my $class = pop @data;
        $class =~ s/\r?\n\z//;
        print {$outclass} "$class\n";

        my @entries;
        for my $i (0 .. $#data) {
            my $value = $data[$i];
            next if $value eq '';
            # Plain truthiness would keep "0.0" or " 0" as non-zero entries,
            # so numeric fields are compared numerically instead.
            next if looks_like_number($value) and $value == 0;
            push @entries, ($i + 1) . " $value ";
        }
        $nonzeronum += @entries;

        # join() yields an empty string for an all-zero row, which avoids
        # printing an undefined value.
        print {$tmp_out} join('', @entries), "\n";
        $n++;
    }
}

# Buffered write errors only surface at close, so check every handle.
close($tmp_out)  or die "Cannot write $outmatrix-tmp: $!\n";
close($outr)     or die "Cannot write $outmatrix.rlabel: $!\n";
close($outclass) or die "Cannot write $outmatrix.rclass: $!\n";
unless ($in_data) {
    close($outc) or die "Cannot write $outmatrix.clabel: $!\n";
}

# Assemble the final file: header first, then the rows collected in the
# scratch file. The copy is done in Perl rather than through a shell `cat`,
# so file names with spaces or shell metacharacters are safe and failures
# are reported.
open(my $out, '>', $outmatrix) or die "Cannot create $outmatrix: $!\n";
print {$out} "$n $m $nonzeronum\n";

open(my $tmp_in, '<', $tmpfile) or die "Cannot read $outmatrix-tmp: $!\n";
while (my $row = <$tmp_in>) {
    print {$out} $row;
}
close($tmp_in);
close($out) or die "Cannot write $outmatrix: $!\n";

unlink($tmpfile) or warn "Cannot remove $outmatrix-tmp: $!\n";

