#!/usr/bin/perl # This script is used to do a statistical comparative analysis of use Statistics::Distributions; use strict; my \$alpha = 0.95; # acceptable confidence in not making a type 1 error # i.e. assume that the means are different my \$lambda = 50; # desired lambda for TCR calculation if ( scalar(@ARGV) < 2 ) { print STDERR "Usage: compare-models [validate1] [validate2]\n"; exit 1; } my (@fp1, @fn1, @tcr1); # allow dir names to be specified, too, for brevity if (-f "\$ARGV[0]/validate") { \$ARGV[0] .= "/validate"; } if (-f "\$ARGV[1]/validate") { \$ARGV[1] .= "/validate"; } open (FILE, \$ARGV[0]) || die \$!; while () { my @x = split(/\s+/); push (@fp1, \$x[2] / (\$x[0] + \$x[2])); push (@fn1, \$x[3] / (\$x[1] + \$x[3])); push (@tcr1, \$x[1] / (\$x[3] + \$lambda * \$x[2])); } close (FILE); my (@fp2, @fn2, @tcr2); open (FILE, \$ARGV[1]) || die \$!; while () { my @x = split(/\s+/); push (@fp2, \$x[2] / (\$x[0] + \$x[2])); push (@fn2, \$x[3] / (\$x[1] + \$x[3])); push (@tcr2, \$x[1] / (\$x[3] + \$lambda * \$x[2])); } close (FILE); stat_analysis ("False positives", "pct", \@fp1, \@fp2); stat_analysis ("False negatives", "pct", \@fn1, \@fn2); stat_analysis ("TCR (lambda=\$lambda)", "lin", \@tcr1, \@tcr2); sub stat_analysis { my \$title = shift; my \$pct = shift; my \$s1 = shift; my \$s2 = shift; unless ( scalar(@\$s1) == scalar(@\$s1) ) { print STDERR "Can't compute stats for \$title. Samples are not paired.\n"; return; } # This is the number of degrees of freedom of the two sample sets (i.e. # the number of samples in each set). my \$dof = scalar(@{\$s1}); print "\$title:\n"; # Compute the mean and standard deviation of the first sample # mean = 1/n * sum(s[i]) my \$mean_s1 = 0; foreach my \$i (1..\$dof) { \$mean_s1 += \$\$s1[\$i]; } \$mean_s1 /= \$dof; # var = 1/(n-1) * sum((mean - s[i])^2) my \$var_s1 = 0; foreach my \$i (1..\$dof) { \$var_s1 += (\$mean_s1 - \$\$s1[\$i])**2; } \$var_s1 /= \$dof - 1; # std = sqrt(var) my \$std_s1 = sqrt(\$var_s1); # Compute the mean and standard deviation of the second sample # mean = 1/n * sum(s[i]) my \$mean_s2 = 0; foreach my \$i (1..\$dof) { \$mean_s2 += \$\$s2[\$i]; } \$mean_s2 /= \$dof; # var = 1/(n-1) * sum((mean - s[i])^2) my \$var_s2 = 0; foreach my \$i (1..\$dof) { \$var_s2 += (\$mean_s2 - \$\$s2[\$i])**2; } \$var_s2 /= \$dof - 1; # std = sqrt(var) my \$std_s2 = sqrt(\$var_s2); # SA developers like percentage points instead of probabilities. if ( \$pct eq "pct" ) { printf "\tSample 1: mean=%0.4f%% std=%0.4f\n",100*\$mean_s1,100*\$std_s1; printf "\tSample 2: mean=%0.4f%% std=%0.4f\n",100*\$mean_s2,100*\$std_s2; } else { printf "\tSample 1: mean=%0.4f std=%0.4f\n",\$mean_s1,\$std_s1; printf "\tSample 2: mean=%0.4f std=%0.4f\n",\$mean_s2,\$std_s2; } # Compute the mean of the differences between the two samples my \$mean_d = 0; foreach my \$i (1..\$dof) { \$mean_d += \$\$s1[\$i] - \$\$s2[\$i]; } \$mean_d /= \$dof; # Compute the variance of the differences between the two samples my \$var_d = 0; foreach my \$i (1..\$dof) { \$var_d += (\$mean_d - \$\$s1[\$i] + \$\$s2[\$i])**2; } \$var_d /= \$dof - 1; my \$std_d = sqrt(\$var_d); # To determine whether two samples are from the same distribution # (i.e. they have the same mean), we are going to use a paired sample # t-test. You can find more information about this in Tom Mitchell's # "Machine Learning" book. # Let t = mean / (var * sqrt(n)) my \$tstat; if ( \$var_d > 0 ) { \$tstat = \$mean_d / sqrt(\$var_d / \$dof); } else { \$tstat = 0; } # Now we find the critical value of t for the alpha% confidence # interval. my \$tcrit = Statistics::Distributions::tdistr (\$dof, (1-\$alpha)/2); # This is the probability that the two distributions are from different # means. my \$tprob = 1-Statistics::Distributions::tprob (\$dof, abs(\$tstat))/2; # If the t statistic is less than the critical value, this means that # we should accept the null hypothesis (the distributions have the same # mean), otherwise it should be accepted. if ( abs(\$tstat) < \$tcrit ) { printf "\tNot statistically significantly different (alpha=%0.4f)\n", \$alpha; } else { printf "\tStatistically significantly different with confidence %0.4f%%\n", 100*\$tprob; } # This displays an estimate of the confidence interval around the # estimated mean difference between the two samples. Bear in mind that # the t statistic is working on the local differences and not the # global difference. if ( \$pct eq "pct" ) { printf "\tEstimated difference: %0.4f%% +/- %0.4f\n", 100*\$mean_d, 100*\$std_d*\$tcrit; } else { printf "\tEstimated difference: %0.4f +/- %0.4f\n", \$mean_d, \$std_d*\$tcrit; } print "\n"; }