#!/usr/bin/perl -w
#The script takes a corpus stored in the CWB format and outputs the most salient multiword expressions
#use GDBM_File;
use Getopt::Long;
use utf8;
use CQP;
use CL;


$starttime=time();

$collocspan=1;
$contextsize=1;
$contexttype='w';
$curlang='en';
$cutoff=100; # the number of collocation pairs to output
$encoding="utf-8";
$idiosyncthreshold=20; # for collocations: at least X instances to consider
$metrics='MLT';
$sort2option='right';
$transliterateout=0;
$log2=log(2);

$w='[\w,;:()\'.*!?-]';
$nw='[^\w,;:()\'.*!?|-]';

$params="@ARGV";
GetOptions ('name=s' => \$corpusname, 'cutoff=s' => \$cutoff, 'input=s' => \$input, 'language=s' => \$curlang, 'output=s' => \$output, 'pattern=s' => \$nonpattern, 'stop=s' => \$stoplist, 'threshold=i' => \$idiosyncthreshold, 'window=s' => \$collocspan, 'help' => \$help);

die "Usage: $0 -name corpusname -cutoff cutoff -input input -language language -metrics metrics -pattern pattern -output output -stop stoplist -threshold threshold -window collocspan\nFor more information run 'perldoc $0'\n" if (! $corpusname) or (! $input) or $help;

$mistat=1 if $metrics=~/M/;
$llstat=1 if $metrics=~/L/;
$tstat=1 if $metrics=~/T/;
if (lc(substr($collocspan,0,1)) eq 'l') {
    $sort2option='left' ;
} elsif (! $collocspan=~/\D/) { # it contains only digits
    $sort2option='span' ;
    $contextsize=$collocspan;
}
open(IN,$input) or die "Cannot open $input: $!\n";
if ($output) {
    open(OUT,">$output") or die "Cannot create $output: $!\n";
} else {
    *OUT=\*STDOUT;
}
readstoplist($stoplist);
opencorpus($corpusname);
print OUT qq{<html><head><meta http-equiv="Content-Type" content="text/html; charset=$encoding"><title>Patterns from $input</title>\n};
print OUT "</head>\n<body>";
print OUT "<pre>$0 $params</pre><p>\n";
      
$cqpprocesstime=0;
while ($querystring=<IN>) {
    chomp($querystring);
    print STDERR "Processing query string '$querystring'\n";
    undef %pairs;
    undef %loglikescore;
    undef %miscore;
    undef %tscore;
    undef %results;
    undef %bestresults;
    processpattern($querystring);
    selectcollocates();
    showcollocates();
};
$processtime=time()-$starttime;
print STDERR "Total process time: $processtime sec; CQP process time: $cqpprocesstime sec.\n";

# strange: I was unable to find the standard functions for min and max in perl5
sub min{return (($_[0] < $_[1]) ? $_[0] : $_[1])}
sub max{return (($_[0] > $_[1]) ? $_[0] : $_[1])}

sub minstring{return (($_[0] lt $_[1]) ? $_[0] : $_[1])}
sub maxstring{return (($_[0] gt $_[1]) ? $_[0] : $_[1])}

sub collocateheader {
    print OUT qq{</table><p><a name="$_[0]"/>$_[0]<br><table><tbody>};
    printf OUT "<tr><td align=center>%s<td align=center>%s<td align=center>%s<td align=center>%s<td align=center>%s</tr>", " Word pair ", " Joint frq ", " Freqc 1 ", " Freqc 2 ", " $_[0] ";
}

sub selectcollocates {
    my $i=0;
    if ($llstat) {
	$i=0;
	foreach $key (sort { $loglikescore{$b} <=> $loglikescore{$a} } keys %loglikescore) {
	    &parsestring;
	    sprintstring($loglikescore{$key},'LL');
	    $i++;
	    last if $i>$cutoff or $loglikescore{$key}<0;
	};
    }
    if ($mistat) {
	$i=0;
	foreach $key (sort { $miscore{$b} <=> $miscore{$a} } keys %miscore) {
	    &parsestring;
	    sprintstring($miscore{$key},'MI');
	    $i++;
	    last if $i>$cutoff or $miscore{$key}<0;
	};
    }
    if ($tstat) {
	$i=0;
	foreach $key (sort { $tscore{$b} <=> $tscore{$a} } keys %tscore) {
	    &parsestring;
	    sprintstring($tscore{$key},'T');
	    $i++;
	    last if $i>$cutoff or $tscore{$key}<0;
	};
    }
}

sub showcollocates {
    my @candidates=sort {$bestresults{$b} <=> $bestresults{$a}} keys %bestresults;
    print OUT "<h2>Pattern: $querystring total candidates: ",scalar(@candidates),"</h2>\n<table><tbody>";
    printf OUT "<tr><td align=center>%s<td align=center>%s<td align=center>%s<td align=center>%s<td align=center>%s</tr>\n", " Word pair ", " Joint frq ", " Freqc 1 ", " Freqc 2 ", " Score";
    $oldbestresults=0;
    foreach (@candidates) {
	if ($bestresults{$_} != $oldbestresults) {
	    print OUT "<tr><td colspan=5>--------------------------------------</tr>\n";
	    $oldbestresults=$bestresults{$_};
	};
	print OUT $results{$_}{'LL'} if $llstat and exists $results{$_}{'LL'};
	print OUT $results{$_}{'T'} if $tstat and exists $results{$_}{'T'};
	print OUT $results{$_}{'MI'} if $mistat and exists $results{$_}{'MI'};
    }
print OUT "\n</tbody></table>\n";
}
sub sprintstring {
    $result=sprintf "<tr><td><b>$_[1]</b>: %s <td align=right>%s <td align=right>%s <td align=right>%s <td align=right>%3.2f <td>\n", $pair, $pairs{$key}, $frqc1, $frqc2, $_[0];
    $results{$key}{$_[1]}=$result;
    ++$bestresults{$key};
}

sub parsestring {
    if ($key=~/^(.+)~~(.+)$/) {
	$frqc1=$frqc{$1};
	$frqc2=$frqc{$2};
	if ($sort2option eq 'left') {$pair="$2 $1"; ($frqc1,$frqc2)=($frqc2,$frqc1) 
	} elsif ($sort2option eq 'right') {$pair="$1 $2"
	} else {$pair="$1/$2"};
	if (($curlang eq 'ru') and (! $transliterateout)) {
	    $pair=&lat2cyr($pair);
	};
    } else { #?? any real cases
	$frqc1=0;
	$frqc2=0;
    };
}



sub opencorpus {
    my ($sourcecorpus)=@_;
    $cqpquery = new CQP;
    $cqpquery->set_error_handler(\&cqperror);
    $cqpquery->exec($sourcecorpus);
    $cqpquery->exec("set context $contextsize $contexttype");
    $cqpquery->exec("set autoshow on");
    $cqpquery->exec("show +lemma");

    $corpus = new CL::Corpus $sourcecorpus; # open the CL interface
    $word = $corpus->attribute("word", 'p');
    $lemma = $corpus->attribute("lemma", 'p');
    $corpussize = $word->max_cpos;
}

sub cqperror {
    grep {print "<strong>$_</strong><p>\n"} @_; 
    die "query: $querystring\n";
}

sub readstoplist {
    open(LIST,$_[0]);
    @stemlines=<LIST>;
    $stemlines=join ' ',@stemlines;
    $stemlines=~s/$nw/ /;
    @stemlines=split ' ',$stemlines;
    @stoplist{@stemlines}=();
    delete($stoplist{''});
}

sub loglike {
 my ($k,$n,$x)=@_;
 my $res=1000;
 if (($x>0) and ((1-$x)>0)) {
     $res = $k*log($x) + ($n-$k)*log(1-$x);
 } else { $res=0;
# the case of complete dependence: one does not occur without the other, but log(0)
 }
 return($res);
}

sub processpattern {
    my ($searchstring)=@_;
    $cqpstart=time();
    @matches = $cqpquery->query($searchstring);
    $cqpprocesstime+=time()-$cqpstart;
    if ($cqpquery->ok) {
	foreach $m (@matches) {
	    if (processmatch($m)) { # otherwise we're dealing with a parallel text or an error
		$matchlemma=getlemmas($match);
		if ($sort2option eq 'left') {
		    $nlemma=getlemma($leftoutput);
		    $nword=getwords($leftoutput);
		} else {
		    $nlemma=getlemma(split(' ',$rightoutput,1));
		    $nword=getwords(split(' ',$rightoutput,1));
		};
		next if (exists $stoplist{$nlemma}) or ($nlemma=~/[^a-z-]/i);  # exclude stopwords or nonwords
		if ($nonpattern) {
		    next if $nword=~/$nonpattern/;
		};
		++$pairs{"$matchlemma~~$nlemma"};
		++$totalpairs;
	    }
	};
    } else {cqperror("Error in querying $searchstring")};

    $numwords=$corpussize;
    foreach $key (keys %pairs) {
	next if ($pairs{$key}<$idiosyncthreshold);
	unless (($matchlemma,$nlemma)=$key=~/^(.+)~~(.+)$/) {
	    cqperror("Internal error in splitting the collocation pair $_");
	};
	$onetwofrqc=$pairs{$key};
	$onefrqc   = scalar(@matches);
	$frqc{$matchlemma}=$onefrqc unless exists $frqc{$matchlemma};
	$id  = $lemma->str2id($nlemma); 
	if ($id) {
	    $twofrqc   = $lemma->id2freq($id);
	    $frqc{$nlemma}=$twofrqc unless exists $frqc{$nlemma};
	    $miscore{$key} = log ($numwords * $onetwofrqc  / 
				  ($onefrqc * $twofrqc))/$log2; # log is the natural logarithm
	    $tscore{$key} = ($onetwofrqc  - ($onefrqc * $twofrqc/$numwords)) / 
		sqrt ($onetwofrqc); # * $numwords

	    #the log like score according to Manning and Schuetze
	    $p=$twofrqc/$numwords;
	    $p1=$onetwofrqc /$onefrqc;
	    $p2=($twofrqc-$onetwofrqc)/($numwords-$onefrqc);
	    $loglikescore{$key}=-0.5*(&loglike($onetwofrqc, $onefrqc, $p) + 
				      &loglike($twofrqc-$onetwofrqc, $numwords-$onefrqc, $p)-
				      &loglike($onetwofrqc, $onefrqc, $p1) - 
				      &loglike($twofrqc-$onetwofrqc, $numwords-$onefrqc, $p2));
	};
    }
}

sub processmatch {
    my $m=$_[0];
    $m=~s/<\/?(s)>\/__UNDEF__/.&lt;$1&gt;/g; # we can't process HTML tags
    if (($curpos,$leftoutput,$match,$rightoutput)=$m=~/(\d+):\s+(.+?)<(.+?)>(.+)/) {
	return 1
    } else {cqperror("Internal error in processing $m\n")}
}

sub getwords {
    my $out='';
    foreach (@_) {
	if (/(.+?)\//) {
	    $out.="$1 ";
	}
    }
    return $out;
}
sub getlemmas {
    my $out='';
    my @lemmas=split ' ',$_[0];
    foreach (@lemmas) {
	$out.=getlemma($_).' ';
    };
    return substr($out,0,length($out)-1); # minus the last space.
}

sub getlemma {
	if ($_[0]=~/\/(\S+)/) {
	    return $1;
	} else {return "xxx"}
}

__END__


=head1 NAME

mwedetect.pl - A tool for detecting the most salient multiword expressions

=head1 SYNOPSIS

 perl mwedetect.pl -n BNC -s stoplist.txt -m MTL -i inputpatterns -o output.html

=head1 DESCRIPTION

The script takes a corpus stored in the CWB format and outputs the
most salient multiword expressions according to patterns specified in
the input file.  Three metrics are used B<Mutual Information>
(MI-score), B<Loglikelihood> (LL-score) and B<Student's t-test>
(T-score).  The MI score tends to select those lexical items that are
rarely used without the second expression.  In contrast, the T-score
is conservative: it tends to select the most frequent collocations,
weighting them according to the difference between the observed and
expected frequencies.  Three CWB attributes are required in a corpus:
word, lemma and pos.

=head2 Parameters

    -name Corpus name (as registered in CWB, required)

    -cutoff Maximal number of collocates (default 100)

    -input File with input patterns (one pattern per line)

    -language Language (as for now, only 'ru' is used for transliteration of input patterns)

    -metrics Metrics used for calculation.  Three metrics can be used (and combined): mutual information (M), t-test (T) and loglikelihood (L), for instance, -m MT for MI and T scores

    -output Output file (in HTML)

    -pattern Pattern for words NOT to be considered, e.g. for capitalised words ^[A-Z]

    -stop File with the list of stop words

    -threshold Minimal number of occurrennces for each pattern to be considered in the output (default 20)

    -window Position of the collocate with respect to the pattern: left, right (default) or window size

=head2 Format of patterns

Patterns are expressed in the same format as normal queries in CQP.  Check the CQP manual at http://www.ims.uni-stuttgart.de/projekte/CorpusWorkbench/CQPUserManual/HTML/

=head1 COPYRIGHT

Copyright 2004, Serge Sharoff (Centre for Translation Studies, University of Leeds, UK).

You may use, redistribute and modify this tool under
the same license as Perl itself.


