#!/usr/bin/perl -w
#Serge Sharoff, University of Leeds
#The script reads documents from STDIN (or from files named on the command line) and outputs
#only those whose proportion of already-seen sentences does not exceed the threshold.
#It uses a modified Infogistics algorithm designed to work as a one-pass filter: it does not
#know the lengths of all documents in advance, so it can miss a short document that is
#a fragment included in a larger one.
use Digest::MD5 'md5';
use Encode qw(encode_utf8);
use Getopt::Long;
use IO::Handle;

# Flush every kept document to STDOUT as soon as it is printed.
STDOUT->autoflush(1);

# Defaults; each of them can be overridden from the command line.
my $recordseparator='</text>';      # string that terminates one input record
my $iddetector='<text id="(.+?)"';  # regex whose first capture is the document id
my $threshold=0.8;                  # largest tolerated fraction of already-seen sentences
my $minsentlength=25;               # sentences must be longer than this to be counted
my ($logfile,$skipupto);

Getopt::Long::Configure('auto_help');
GetOptions(
    "iddetector=s"         => \$iddetector,
    "journalfile=s"        => \$logfile,
    "minsentlength=i"      => \$minsentlength,
    "recordseparator=s"    => \$recordseparator,
    "skipupto=s"           => \$skipupto,
    # The option used to be available only under the misspelt name
    # "treshold"; that spelling stays as an alias so existing command
    # lines keep working.
    "threshold|treshold=f" => \$threshold,
) or die "Usage: $0 [options] [FILE...]\n"
       . "  --iddetector=REGEX  --journalfile=FILE  --minsentlength=N\n"
       . "  --recordseparator=STRING  --skipupto=ID  --threshold=FRACTION\n";
# GetOptions has already reported the offending option on STDERR; stopping
# here avoids silently filtering with default values.

# Journal: ids of kept documents plus "--" diagnostics. It goes to the
# requested file, or to STDERR when no journal file was given.
if ($logfile) {
    open(LOG, '>', $logfile) or die "Cannot create logfile $logfile: $!\n";
    LOG->autoflush(1);
} else {
    *LOG=*STDERR;
};

# The main loop reads with <>, which opens files named on the command line
# on the ARGV handle. The binmode on STDIN below does not reach those files,
# so make UTF-8 the default layer for every input handle opened from this
# point on in the file.
use open IN => ':utf8';

my %dochash;      # only referenced by the disabled duplicate-id check in the main loop
my %sentences;    # digest of a normalised sentence => number of times it has been seen

# Read the input one record (document) at a time.
$/=$recordseparator;

binmode(STDIN,":utf8");
binmode(STDOUT,":utf8");
binmode(LOG,":utf8");

# success count
my $count_success = 0;
# failure count
my $count_failure = 0;
# Running record number; used as the id of records without a detectable id.
my $nextnumericid=0;
my ($docid,$text);
# Sentence boundary: '.', '?' or '!' followed by whitespace, or the
# ideographic full stop U+3002. Compiled once, since it is the same for
# every document.
my $sentence_break = qr/(?:[.?!]\s|\x{3002})/;

# Main filter loop. Each record is one document. A document is printed to
# STDOUT unchanged when the fraction of its sentences already seen in earlier
# documents does not exceed $threshold; otherwise it is dropped and the drop
# is noted in the journal.
while (my $fulldoc=<>) {
    $nextnumericid++;

    # Whatever follows the last separator (usually a lone newline) is not a
    # document; do not report it as one.
    next unless $fulldoc =~ /\S/;

    my $has_real_id = (($docid) = $fulldoc =~ /$iddetector/s) && defined $docid;
    $docid = $nextnumericid unless $has_real_id;

    # --skipupto: drop everything up to and including the named document,
    # whether or not the record carries a real id.
    # NOTE(review): skipped documents do not contribute to %sentences, so
    # overlap with anything before the resume point goes undetected --
    # confirm that this is acceptable.
    if (defined $skipupto && length $skipupto) {
        undef $skipupto if $skipupto eq $docid; #this is the last time we skip
        next;
    }

    if ($has_real_id) {
        # Remove the header line before splitting into sentences. Records
        # after the first begin with the newline that followed the previous
        # separator, so leading whitespace is skipped first; otherwise the
        # header (with its unique id) would become part of the first
        # sentence and that sentence could never count as a duplicate.
        if ($fulldoc =~ /\A\s*\S[^\n]*\n/) {
            $text = substr($fulldoc, $+[0]);
        } else {
            $text = $fulldoc;   # single-line record: nothing to strip
        }
    } else {
        $text = $fulldoc;
        print LOG "--no real docid for $docid\n";
    }
#    next if exists $dochash{$docid};
#    $dochash{$docid} = length($text);

    # counters we need
    my $sentence_count = 0;
    my $duplicate_sentence_count = 0;

    # digests of the sentences of this document not yet in the sentence db
    my @sentence_buffer;

    # break body into sentences
    foreach my $sentence (split $sentence_break, $text) {
        next unless length($sentence) > $minsentlength;

        # Normalise: collapse every run of non-word characters to a single
        # space and ignore case, so that punctuation and layout differences
        # do not hide a repeated sentence.
        (my $normalized = $sentence) =~ s/\W+/ /g; #it should always work in UTF8, as Perl knows which alphanum chars are permissible
        my $sen = md5(encode_utf8(lc($normalized)));

        if (exists $sentences{$sen}) {
            # already seen in an earlier document
            $duplicate_sentence_count++;
            $sentences{$sen}++;
        } else {
            # new so far; it is added to the db only after the whole
            # document has been scored
            push @sentence_buffer, $sen;
        }
        # increment sentence count in any case
        $sentence_count++;
    }

    # documents without ANY countable sentence are never printed
    if ($sentence_count == 0) {
        $count_failure++;
        next;
    }

    # calculate the fraction of already-seen sentences
    my $fraction = $duplicate_sentence_count / $sentence_count;
    if ($fraction > $threshold) {
        # beyond the limit: rule the document out
        printf LOG "-- skipped %s (%1.2f overlap)\n",$docid,$fraction;
        $count_failure++;
    } else {
        # include otherwise
        print $fulldoc;
        print LOG "$docid\n";
        $count_success++;
    }

    # put the sentence buffer in the sentence db, whether or not the
    # document was kept
    $sentences{$_} = 1 foreach @sentence_buffer;
}

