#!/usr/bin/perl -w
#
# freqdiff - print frequency difference between two inputs
#
# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at:
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>

# Overview: reads two inputs (and optionally a third "scores" file), turns
# each input into a key => frequency table, and prints one line per key
# showing either the difference (input2 - input1) or the share of the total
# that came from input1, as a percentage.  Four input formats are recognised;
# see read_argv() for the details.

use strict;
use warnings;

use vars qw($opt_a $opt_b $opt_c $opt_d $opt_h $opt_p $opt_r $opt_l);
use Getopt::Std;
getopts("abcdhprl:");

my $prog = $0;
$prog =~ s@.*/@@;

# Print the usage message and exit with $status.  The message goes to STDERR
# for a non-zero status and to STDOUT otherwise.
#
# NOTE(review): the help text and the two argument checks that follow this
# sub were missing from the damaged copy this revision was made from.  They
# have been reconstructed from how each option is used further down; confirm
# the wording against the upstream script.
sub usage {
  my $status = shift;

  my $out = $status ? \*STDERR : \*STDOUT;
  print {$out} <<EOF;
usage: $prog [options] input1 input2 [scores]

 -a    also print entries whose difference is zero (difference mode)
 -b    also print the combined count of both inputs (percentage mode)
 -c    strip "#" comments and skip blank lines in the inputs
 -d    print the difference (input2 - input1) instead of a percentage
 -h    print this help
 -l n  only use the first n lines of each input
 -p    never switch to difference mode just because the inputs have
       almost the same number of lines
 -r    use relative frequencies (counts divided by the number of lines)
EOF
  exit($status);
}

usage(0) if $opt_h;
usage(1) unless $#ARGV > 0;    # two input files are required

# Shared with read_argv(): @line collects the number of lines counted in each
# input, $type the numeric format code of the most recently parsed line.
# $type starts at 0 ("nothing seen yet") so that empty inputs do not trigger
# uninitialized-value warnings in the comparisons below.
my @line;
my $type = 0;

my %one = read_argv(0);
my %two = read_argv(1);

# Inputs whose line counts are within 1% of each other are compared by plain
# difference unless -p is given.  An input with no countable lines cannot be
# compared this way (and used to cause a division by zero), so it is skipped.
if (!$opt_p && $line[1] && (abs($line[0] - $line[1]) / $line[1]) < 0.01) {
  $opt_d = 1;
}
$opt_d = 1 if ($type == 3);    # "scores" input is always compared by difference
$opt_a = 1 if ($type == 4);    # "line number" input always prints every entry

# Optional third argument: a file of "score NAME VALUE" lines.  When present,
# every output line is prefixed with the value for its key (1.0 if the key is
# not listed).
my %score;
if ($#ARGV > 1) {
  my $fh = open_input($ARGV[2]);
  while (<$fh>) {
    chomp;
    s/#.*//;
    my @field = split;
    if ($#field >= 2 && $field[0] eq "score") {
      $score{$field[1]} = $field[2];
    }
  }
  close($fh);
}

# Union of the keys of both inputs.
my @all = (keys %one);
foreach my $elem (keys %two) {
  if (!exists($one{$elem})) {
    push(@all, $elem);
  }
}

my %out;      # value printed for each key
my %count;    # combined count for each key, only filled in with -b
foreach my $elem (@all) {
  my $one = 0;
  my $two = 0;

  # a key missing from one "scores" input counts as 1.0 rather than 0
  if ($type == 3) {
    $one = 1.0;
    $two = 1.0;
  }
  if (exists($one{$elem})) {
    $one = $one{$elem};
    delete $one{$elem};
  }
  if (exists($two{$elem})) {
    $two = $two{$elem};
    delete $two{$elem};
  }
  if ($opt_d) {
    $out{$elem} = $two - $one;
  }
  else {
    my $total = $one + $two;
    $count{$elem} = $total if $opt_b;
    # both counts can be zero (or cancel out); report 0% instead of dying
    # with a division by zero
    $out{$elem} = $total ? $one / $total * 100.0 : 0.0;
  }
}

# Largest value first; ties are broken by the larger combined count (only
# available with -b) and finally by key name.
foreach my $elem (sort {
    $out{$b} <=> $out{$a}
      || (%count && ($count{$b} <=> $count{$a}))
      || $a cmp $b
  } keys %out)
{
  my $name = $elem;
  if (%score) {
    if (exists($score{$elem})) {
      $name = "$score{$elem}\t$name";
    }
    else {
      $name = "1.0\t$name";
    }
  }
  if ($type == 3) {
    printf "%.3f\t%s\n", $out{$elem}, $name;
  }
  elsif ($opt_d) {
    print "$out{$elem}\t$name\n" if ($out{$elem} || $opt_a);
  }
  else {
    if ($opt_b) {
      printf "%.2f\t%d\t%s\n", $out{$elem}, $count{$elem}, $name;
    }
    else {
      printf "%.2f\t%s\n", $out{$elem}, $name;
    }
  }
}

# Open $path for reading and return the filehandle; dies if that fails.
# A path of "-" reads standard input, as the former two-argument open did.
# Unlike that open, mode characters and pipes in $path are NOT interpreted.
sub open_input {
  my ($path) = @_;

  my $fh;
  if ($path eq '-') {
    open($fh, '<&', \*STDIN) || die "open failed: $path: $!";
  }
  else {
    open($fh, '<', $path) || die "open failed: $path: $!";
  }
  return $fh;
}

# Read the input named by $ARGV[$input] and return a hash of key => frequency.
#
# Each line is classified as one of four formats:
#   1  "sort | uniq -c":  a number followed by the key
#   2  "mass-check":      a line starting with "Y" or "." whose fourth field
#                         is a comma-separated list of keys; each one counts 1
#   3  "scores":          "score KEY VALUE"
#   4  anything else:     the whole line is the key and its position in the
#                         file determines its frequency
#
# Side effects: sets the file-level $type to the format of the last parsed
# line and stores the number of counted lines in $line[$input].  Because
# $type is shared, the consistency check below also dies when the second
# input uses a different format than the first.
sub read_argv {
  my ($input) = @_;

  my %freq;
  my $last = 0;
  my $fh = open_input($ARGV[$input]);
  my $line = 0;
  while (<$fh>) {
    if ($opt_c) {
      s/#.*//;
      next unless /\S/;
    }
    next if (/^OVERALL/);    # hit-frequencies header line
    $line++;
    # lines beyond the -l limit are still counted, just not parsed
    next if ($opt_l && $line > $opt_l);
    $last = $type;
    # "sort | uniq -c" format
    if (/^\s*(-?\d+|-?\d+\.\d+)\s+(.*)/) {
      $type = 1;
      $freq{$2} = $1;
    }
    # "mass-check" format
    elsif (/^[Y.]\s+-?\d+\s+\S+\s+(\S+)/) {
      $type = 2;
      foreach (split(/,/, $1)) {
        $freq{$_}++;
      }
    }
    # "scores" format
    elsif (/^score\s+(\S+)\s+(-?[\d.]+)/) {
      $type = 3;
      $freq{$1} = $2;
    }
    # line number is frequency
    else {
      $type = 4;
      chomp;
      $freq{$_} = $line;
    }
    if ($last && $last != $type) {
      die "$prog: inconsistent format in $ARGV[$input] (format $last then format $type)\n";
    }
  }
  close($fh);

  foreach my $key (keys %freq) {
    if ($type == 4) {
      # invert the line number so that earlier lines get higher values
      $freq{$key} = $line - $freq{$key} + 1;
    }
    if ($opt_r) {
      $freq{$key} /= $line;
    }
  }

  $line[$input] = $line;
  return %freq;
}