#!/usr/bin/perl

# This script is used to print some statistics about classification accuracy
# with a k-fold cross validation.
#
# Usage: model-statistics <results-file> [validate]
#
# Only the first argument is read by this script; further arguments are
# accepted and ignored.  "-" as the file name reads standard input.
#
# The file holds one line per fold, each with at least four
# whitespace-separated numeric counts x0 x1 x2 x3.  Going by the formulas
# below, x2 is treated as the false-positive count and x3 as the
# false-negative count; x0 and x1 are presumably the correctly classified
# messages of the two classes (TODO confirm against whatever produces the
# file).

use strict;
use warnings;

use Scalar::Util qw(looks_like_number);

my $lambda = 50;    # desired lambda for TCR calculation

# Run only when executed as a script, so that the subs below can also be
# loaded (e.g. with require) without touching @ARGV.
exit main(@ARGV) unless caller;

# Read the per-fold counts from the file named by the first argument and
# print the false-positive, false-negative and TCR statistics.
#
# Returns the process exit status: 1 after printing the usage message,
# 0 otherwise.  Dies if the file cannot be opened, holds no data lines, or
# contains a line that cannot be turned into rates.
sub main {
    my @args = @_;

    if ( scalar(@args) < 1 ) {
        print STDERR "Usage: model-statistics <results-file> [validate]\n";
        return 1;
    }

    my $path = $args[0];
    my $fh;
    if ( $path eq '-' ) {

        # The former two-argument open() read standard input for "-";
        # keep that working now that the three-argument form is used.
        $fh = \*STDIN;
    }
    else {
        open( $fh, '<', $path ) or die "Cannot open $path: $!\n";
    }

    my ( @fp1, @fn1, @tcr1 );
    while ( my $line = <$fh> ) {
        next unless $line =~ /\S/;    # blank lines carry no fold data

        my ( $fp, $fn, $tcr ) = _fold_rates( $line, $lambda );
        push( @fp1,  $fp );
        push( @fn1,  $fn );
        push( @tcr1, $tcr );
    }
    close($fh);

    die "No fold results found in $path\n" unless @fp1;

    stat_analysis( "False positives",       "pct", \@fp1 );
    stat_analysis( "False negatives",       "pct", \@fn1 );
    stat_analysis( "TCR (lambda=$lambda)", "lin", \@tcr1 );

    return 0;
}

# Turn one line of per-fold counts into a three-element list:
#   false-positive rate = x2 / (x0 + x2)
#   false-negative rate = x3 / (x1 + x3)
#   TCR                 = x1 / (x3 + lambda * x2)
#
# Dies when the line has fewer than four numeric fields or when any of the
# three denominators is zero.  The messages deliberately lack a trailing
# newline so that Perl appends the input line number of the last read.
#
# NOTE(review): the TCR formula is kept exactly as found; confirm it matches
# the intended definition of the total cost ratio.
sub _fold_rates {
    my ( $line, $tcr_lambda ) = @_;

    # split(' ') skips leading whitespace, so an indented line does not
    # yield an empty first field the way split(/\s+/) does.
    my @x = split( ' ', $line );
    if ( @x < 4 || grep { !looks_like_number($_) } @x[ 0 .. 3 ] ) {
        die "Expected four numeric counts per line";
    }

    my $fp_total = $x[0] + $x[2];
    my $fn_total = $x[1] + $x[3];
    my $tcr_cost = $x[3] + $tcr_lambda * $x[2];
    if ( $fp_total == 0 || $fn_total == 0 || $tcr_cost == 0 ) {
        die "Cannot compute rates: a denominator is zero";
    }

    return ( $x[2] / $fp_total, $x[3] / $fn_total, $x[1] / $tcr_cost );
}

# Print the mean and sample standard deviation of a set of samples.
#
#   $title - label printed in front of the statistics
#   $pct   - "pct" to scale both values by 100 (the mean gets a "%" sign);
#            anything else prints the raw values
#   $s1    - reference to the array of samples
#
# Prints "<title>: no samples" when the array is empty.
sub stat_analysis {
    my ( $title, $pct, $s1 ) = @_;

    my ( $mean_s1, $std_s1 ) = _mean_and_std($s1);
    if ( !defined $mean_s1 ) {
        print "$title: no samples\n";
        return;
    }

    # SA developers like percentage points instead of probabilities.
    if ( $pct eq "pct" ) {
        printf "%s: mean=%0.4f%% std=%0.4f\n", $title, 100 * $mean_s1,
          100 * $std_s1;
    }
    else {
        printf "%s: mean=%0.4f std=%0.4f\n", $title, $mean_s1, $std_s1;
    }

    return;
}

# Compute (mean, standard deviation) of the samples in the referenced array.
#
#   mean = 1/n * sum(s[i])
#   var  = 1/(n-1) * sum((mean - s[i])^2)
#   std  = sqrt(var)
#
# Returns an empty list when there are no samples.  With a single sample the
# n-1 divisor would be zero, so the deviation is reported as 0.
sub _mean_and_std {
    my ($samples) = @_;

    my $count = scalar @{$samples};
    return if $count == 0;

    # Iterate over the elements themselves: indexing 1..n skipped the first
    # sample and read one element past the end.
    my $mean = 0;
    $mean += $_ for @{$samples};
    $mean /= $count;

    return ( $mean, 0 ) if $count == 1;

    my $var = 0;
    $var += ( $mean - $_ )**2 for @{$samples};
    $var /= $count - 1;

    return ( $mean, sqrt($var) );
}

1;