#!/usr/bin/perl -w # <@LICENSE> # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to you under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at: # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # use strict; use warnings; use vars qw($opt_a $opt_t); use Getopt::Long qw(:config auto_help bundling); GetOptions("a|all" => \$opt_a, "t|ignore" => \$opt_t); =head1 NAME overlap - Tool to help determine which tests overlap significantly =head1 SYNOPSIS overlap [options] Options: -a,--all Show all entries (including reverses of pairs) -t,--ignore Ignore T_ tests (rules under testing) =head1 DESCRIPTION B will read the mass-check results log specified and output pairs of tests and how frequently they occur together in absolute terms, and relative to their individual hit rates. The output is of the form: COUNT PAIR/A PAIR/B A,B where C is the number of times the tests hit on the same message, C is the ratio of times that both test hit to the number of times test A hits, C is the ratio of pair hits to B hits, and the C column shows the names of the two tests. Do not abuse this tool. Just because a test highly correlates with another test does not mean you can simply remove one or merge them without further consideration. You need to also look at hit rates, false positives, false negatives, and actually compare the tests. Some overlap is often good, especially if the tests have different characteristics. =cut my $prog = $0; $prog =~ s@.*/@@; if ($#ARGV < 0) { push(@ARGV, "-"); } my %solo; my %pair; foreach my $file (@ARGV) { read_file($file); } print "COUNT\tPAIR/A\tPAIR/B\tA,B\n"; foreach my $k (sort { $pair{$b} <=> $pair{$a} } keys %pair) { my ($a, $b) = split(/ /, $k); my $a_pct = $pair{$k} / $solo{$a}; my $b_pct = $pair{$k} / $solo{$b}; if ($opt_a) { printf "%d\t%.3f\t%.3f\t%s,%s\n", $pair{$k},$a_pct,$b_pct,$a,$b; printf "%d\t%.3f\t%.3f\t%s,%s\n", $pair{$k},$b_pct,$a_pct,$b,$a; } else { if (($a_pct > $b_pct) || ($a_pct == $b_pct && $a lt $b)) { printf "%d\t%.3f\t%.3f\t%s,%s\n", $pair{$k},$a_pct,$b_pct,$a,$b; } else { printf "%d\t%.3f\t%.3f\t%s,%s\n", $pair{$k},$b_pct,$a_pct,$b,$a; } } } sub read_file { my ($input) = @_; open(FILE, $input) || die "open failed: $input"; my $line = 0; while() { next if /^#/; if (/^[Y.]\s+-?\d+\s+\S+\s+(\S+)/) { my @tests = split(/,/, $1); @tests = grep { !/^T_/ } @tests if $opt_t; my $i = 0; for my $a (@tests) { $solo{$a}++; $pair{"$a $_"}++ for @tests[(++$i) .. $#tests]; } } else { die "$prog: error in input format in $input\n"; } } close(FILE); }