#!/usr/bin/perl -w
#Serge Sharoff, University of Leeds, 2005
#The script takes a corpus collected from the Internet and applies some simple filtering,
#e.g. removal of navigation bars, tables, ERROR 404 pages, etc.
#It also writes out the list of URLs of the documents that passed the filter.
use strict;
use Getopt::Long;
use utf8;
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Phrases that, combined with an "error 404" / "fehler 404" marker, identify
# a document as a "page not found" response.
my $err404messages='not found|not exist|nicht gefunden|nicht vorhanden';
my $minstrlength=10; # NOTE(review): not referenced anywhere in the visible code
my $barlength=30; # the definition of what is a short line
my $seqlength=6; # the definition of what is a long sequence of short lines
my $maxtableindicators=10; # this includes digits, | > etc
# Lines of the document currently being processed.  File-scoped on purpose:
# removenavigation() edits this array in place.
my @lines;
my $outname='final_url_list';
my $falselinks='false-links';
my %falselinks;

# ---------------------------------------------------------------------------
# Output files
# ---------------------------------------------------------------------------
# $outname receives the URLs of accepted documents; ERR404 (append mode)
# collects the documents recognised as "404" pages.
open(my $out_fh, '>', $outname) or die "Cannot create $outname: $!\n";
open(my $err404_fh, '>>', 'ERR404') or die "Cannot write to ERR404: $!\n";

# ---------------------------------------------------------------------------
# Optional blacklist of URLs
# ---------------------------------------------------------------------------
# We can filter out occasional wrong pages, especially if suggested from
# mirror sites (such as a Gutenberg novel returned in response to an art
# query).  The file is optional: if it cannot be opened, no URL is blacklisted.
if (open(my $false_fh, '<', $falselinks)) {
    while (my $link = <$false_fh>) {
        # Strip the line terminator (and stray trailing blanks); otherwise the
        # stored key would never equal a URL extracted from a document.
        $link =~ s/\s+\z//;
        $falselinks{$link}=1 if $link =~ /^https?:/;
    }
    close($false_fh);
}

# ---------------------------------------------------------------------------
# Main loop: one record per document, each terminated by "</text>\n"
# ---------------------------------------------------------------------------
# The separator is changed only after the blacklist has been read line by line.
$/="</text>\n";
while (my $record = <>) {
    # The URL is taken from the id attribute of the <text id="..."> (or
    # <text_id="...">) tag; records without one are ignored.
    my ($url) = $record =~ /<text(?:_| )id="(.+?)"/;
    next unless $url;
    next if $falselinks{$url};
    # enough to identify a PDF
    next if ($record =~ /\%PDF-\d/) and ($record =~ / endobj /) and ($record =~ / obj /);
    print STDERR "Processing $url\n";
    if (($record =~ /(?:error|fehler) 404\b/i) and ($record =~ /\b(?:$err404messages)\b/i)) {
        # "Page not found" documents are diverted, unmodified, to ERR404.
        print {$err404_fh} $record;
    } else {
        @lines=split /\n/, $record;
        removenavigation(); # blanks navigation-like runs in @lines
        $record=join("\n", @lines);
        # Only documents that are still longer than 1024 characters after
        # cleaning are kept: text to STDOUT, URL to $outname.
        if (length($record)>1024) {
            print {$out_fh} "$url\n";
            print "$record\n";
        } else {
            print STDERR "Skipping $url as it is too short: ",length($record), " bytes\n";
        }
    }
}
# Buffered write errors only surface at close time, so check them.
close($out_fh) or die "Cannot close $outname: $!\n";
close($err404_fh) or die "Cannot close ERR404: $!\n";

sub removenavigation{
# Blanks out navigation bars / table-like fragments in the file-scoped @lines.
#
# A line is "navigation-like" if at least one of the following holds:
#   - it is no longer than $barlength characters (empty lines included);
#   - it starts with optional whitespace, then + or chr(149), then whitespace;
#   - it starts with one of a fixed set of menu words (Home, Click, ...);
#   - it contains at least $maxtableindicators table indicator characters
#     (digits and | = . > -), not counting one " hh:mm AM/PM" clock time.
# Lines starting with < (tags) are not counted: they neither extend nor break
# a run, and they are never blanked.
# As soon as a run of navigation-like lines reaches $seqlength lines, every
# line of the run is replaced by an empty string, and so is each further line
# that extends the run.  The number of elements in @lines does not change.
# Takes no arguments; the return value is not meaningful.
#
# NOTE(review): an earlier version of this comment said "starts with * or
# chr(149)", but the pattern tests for + ; confirm which one is intended.
# NOTE(review): chr(149) is presumably the Windows-1252 bullet byte; it will
# not match a UTF-8 encoded bullet - confirm the expected input encoding.
#À Á Â Ã Ä Å Æ Ç È Ê Ë Ì Í Î Ï Ð Ñ Ò Ó Ô Õ Ö × Ø Ù Ý Þ ß
    my $stat=0;    # length of the current run of navigation-like lines
    my @pending;   # indices of the run's lines seen before the threshold is hit
    my $dot=chr(149);
    for my $i (0..$#lines) {
        # Work on a lexical copy: the checks below must not alter the stored
        # line, and the caller's $_ must not be clobbered.
        my $line=$lines[$i];
        next if $line =~ /^</; # tags are not counted
        # Drop one clock time so its digits do not count as table indicators.
        $line =~ s/ \d\d:\d\d [AP]M/ /;
        # tr/// with an empty replacement list only counts the characters.
        my $tableindicators=($line =~ tr/0-9|=.>\-//);
        #if the line *starts* with any of the following or is too short or contains too much table indicators
        if ((length($line)<=$barlength) or ($line =~ /^\s*(?:\+|$dot)\s+/) or ($line =~ /^(?:Home|Advanced|Click|Posted|Choose|Subscribe|Contact|Email|Site|Address|Name|Telephone|Logo)/) or ($tableindicators>=$maxtableindicators)) {
            ++$stat;
            if ($stat<$seqlength) {
                # Too early to decide; remember the line in case the run
                # turns out to be long enough.
                push @pending, $i;
            } else {
                # Threshold reached (or already passed): blank this line and
                # all remembered lines of the same run.  Using the recorded
                # indices keeps interleaved tag lines intact and makes sure
                # the first line of the run is removed too.
                @lines[@pending, $i]=('') x (@pending+1);
                @pending=();
            }
        } else {
            # An ordinary line ends the run.
            $stat=0;
            @pending=();
        }
    }
    return;
}

