#!/usr/bin/perl -w

# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at:
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>

use strict;
use warnings;

use vars qw($opt_a $opt_t);
use Getopt::Long qw(:config auto_help bundling);

GetOptions("a|all" => \$opt_a,
	   "t|ignore" => \$opt_t);

=head1 NAME

overlap - Tool to help determine which tests overlap significantly

=head1 SYNOPSIS

overlap [options] <log file>

 Options:
   -a,--all          Show all entries (including reverses of pairs)
   -t,--ignore       Ignore T_ tests (rules under testing)

=head1 DESCRIPTION

B<overlap> will read the mass-check results log specified and output
pairs of tests and how frequently they occur together in absolute
terms, and relative to their individual hit rates.

The output is of the form:

 COUNT   PAIR/A  PAIR/B  A,B

where C<COUNT> is the number of times the tests hit on the same
message, C<PAIR/A> is the ratio of times that both test hit to the
number of times test A hits, C<PAIR/B> is the ratio of pair hits to B
hits, and the C<A,B> column shows the names of the two tests.

Do not abuse this tool.  Just because a test highly correlates with
another test does not mean you can simply remove one or merge them
without further consideration.  You need to also look at hit rates,
false positives, false negatives, and actually compare the tests.
Some overlap is often good, especially if the tests have different
characteristics.

=cut


my $prog = $0;
$prog =~ s@.*/@@;

if ($#ARGV < 0) {
    push(@ARGV, "-");
}

my %solo;
my %pair;

foreach my $file (@ARGV) {
    read_file($file);
}

print "COUNT\tPAIR/A\tPAIR/B\tA,B\n";

foreach my $k (sort { $pair{$b} <=> $pair{$a} } keys %pair) {
    my ($a, $b) = split(/ /, $k);
    my $a_pct = $pair{$k} / $solo{$a};
    my $b_pct = $pair{$k} / $solo{$b};
    if ($opt_a) {
	printf "%d\t%.3f\t%.3f\t%s,%s\n", $pair{$k},$a_pct,$b_pct,$a,$b;
	printf "%d\t%.3f\t%.3f\t%s,%s\n", $pair{$k},$b_pct,$a_pct,$b,$a;
    }
    else {
	if (($a_pct > $b_pct) || ($a_pct == $b_pct && $a lt $b)) {
	    printf "%d\t%.3f\t%.3f\t%s,%s\n", $pair{$k},$a_pct,$b_pct,$a,$b;
	}
	else {
	    printf "%d\t%.3f\t%.3f\t%s,%s\n", $pair{$k},$b_pct,$a_pct,$b,$a;
	}
    }
}

sub read_file {
    my ($input) = @_;

    open(FILE, $input) || die "open failed: $input";
    my $line = 0;
    while(<FILE>) {
	next if /^#/;
	if (/^[Y.]\s+-?\d+\s+\S+\s+(\S+)/) {
	    my @tests = split(/,/, $1);
	    @tests = grep { !/^T_/ } @tests if $opt_t;
	    my $i = 0;
	    for my $a (@tests) {
		$solo{$a}++;
		$pair{"$a $_"}++ for @tests[(++$i) .. $#tests];
	    }
	}
	else {
	    die "$prog: error in input format in $input\n";
	}
    }
    close(FILE);
}