#!/usr/bin/perl
#	Relevance benchmark of Sherlock search server
#	(c) 2003, Robert Spalek <robert@ucw.cz>

use strict;
use warnings;

use lib 'lib/perl5';
use Sherlock::Query();

my $search_svr = 'localhost:8192';
my $query_prefix = 'show 1..20 db "main" filetype = {"text", "html", "pdf"} context 0 titlelen 0 urls 256 morph 1 spell 0 syn 0 accents 0 sitemax 2';
my $debug_level = 0;

sub debug($$$);
sub pos2weight($);
sub normalize_url($);
sub format_url($$$);

# Parse the input into one array
$/ = "";
my @tests = <>;
$/ = "\n";

# Parse each record into QUERY and hash of URL -> BONUS
my (%expurls, %fndurls);
foreach (@tests)
{
	my ($query, $urls) = /^([^\n]*)\n(.*)\n$/s;
	my @urls = map {
		my ($weight, $url) = split /\s/;
		(normalize_url($url), $weight);
	} split /\n/, $urls;
	$expurls{$query} = { @urls };
	$_ = $query;
}

# Ask the search server every QUERY and parse the responses
foreach my $query (@tests)
{
	my $q = new Sherlock::Query($search_svr);
	my $res = $q->query("$query_prefix $query");
	debug($q, $res, "Response on query $query");
	my @queryurls;
	if ($res =~ /^\+/)
	{
		debug($q, $q->{HEADER}, "Header");
		my $i = 0;
		foreach my $c (@{$q->{CARDS}})
		{
			$i++;
			my $urlrecU = [ $i, $c->{"Q"}, 'U' ];
			my $urlrecy = [ $i, $c->{"Q"}, 'y' ];
			my $urlrecb = [ $i, $c->{"Q"}, 'b' ];
			foreach my $url (@{$c->{"(U"}})
			{
				# concatenate the URL's of the card to the big list
				my @cardurls = map { (normalize_url($_), $urlrecy); } (@{$url->{"y"}});
				@queryurls = (@queryurls, (normalize_url($url->{U}), $urlrecU), @cardurls);
				@queryurls = (@queryurls, (normalize_url($url->{b}), $urlrecb)) if $url->{b};
			}
			debug($q, $c, "Card $i");
		}
		debug($q, $q->{FOOTER}, "Footer");
	}
	$fndurls{$query} = { @queryurls };
	debug($q, $fndurls{$query}, "Found URL's");
	debug($q, $expurls{$query}, "Expected URL's");
}

# Compare these two datasets
my %grades;
my $total_grade = 0;
foreach my $query (@tests)
{
	print "Query:\t\t$query\n";
	my $fnd = $fndurls{$query};
	my $exp = $expurls{$query};
	my @fnd = sort { $fnd->{$a}->[0] <=> $fnd->{$b}->[0] } keys %$fnd;
	my @exp = sort { $exp->{$b} <=> $exp->{$a} } keys %$exp;
	my $grade = 0;
	my $max_grade = 0;
	my $i = 1;
	foreach my $url (@exp)
	{
		$grade += $exp->{$url} * pos2weight($fnd->{$url});
		$max_grade += $exp->{$url} * pos2weight([$i++]) if $exp->{$url} > 0;
	}
	my $rel_grade = $grade / $max_grade;
	print "Total grade:\t$rel_grade = $grade / $max_grade\n";
	print "Found URL's:\n";
	foreach my $url (@fnd)
	{
		print format_url($fnd, $exp, $url);
	}
	print "Expected URL's that have not been found:\n";
	foreach my $url (@exp)
	{
		print format_url($fnd, $exp, $url) if !$fnd->{$url};
	}
	print "\n";
	$grades{$query} = $rel_grade;
	$total_grade += $rel_grade;
}

# Print total results
foreach my $query (@tests)
{
	printf("%8.5f : %s\n", $grades{$query}, $query);
}
printf("%8.5f = Total\n", $total_grade);
printf("%8.5f = Average (%d tests)\n", $total_grade / @tests, 0 + @tests);

### Subroutines:

# Verbose debug dumps
sub debug($$$)
{
	my ($q, $what, $title) = @_;
	if ($debug_level) {
		print "$title: \n" if $title;
		$q->format(sub { print $_[0]; }, $what);
	}
}

# Computing the weight of a given position at the output
sub pos2weight($)
{
	my ($urlrec) = @_;
	return 0 if !$urlrec;
	my $pos = $urlrec->[0];
	return 105 - 5*$pos;
}

# Cut www. prefix
sub normalize_url($)
{
	my ($url) = @_;
	$url =~ s|//www\.|//|;
	return $url;
}

# Formats the output line about one URL
sub format_url($$$)
{
	my ($fnd, $exp, $url) = @_;
	my $wt = $exp->{$url};
	$wt = 0 if !$wt;
	my $urlrec = $fnd->{$url};
	my $poswt = pos2weight($urlrec);
	$urlrec = [ 0, 0, '0' ] if !$urlrec;
	return "$wt\t$poswt\t$urlrec->[2] #$urlrec->[0] Q$urlrec->[1]\t$url\n";
}
