#!/usr/bin/perl -w
#
# Given a spam.log and nonspam.log from a "mass-check --bayes" run,
# draw a histogram of the score ranges.
#
# This now draws a detailed "zoom" view as well as the overall histogram,
# so the low-frequency FPs and FNs around the middle ground can be viewed.
# In addition, it does not show ham lines or spam lines, if those buckets
# got no hits.
#
# usage: draw-bayes-histogram [--spam=spam.log] [--nonspam=nonspam.log]
#		[--nocollapse] [--nozoom] [--buckets=N]   (N defaults to 25)
#
# or: draw-bayes-histogram spam.log nonspam.log   (backwards compatible)

use Getopt::Long;
use vars qw($opt_spam $opt_nonspam $opt_nocollapse $opt_nozoom
	$opt_buckets);

# Parse the command line.  GetOptions is called without explicit
# destinations, so it stores each value in the matching $opt_* package
# variable.  It returns false on a malformed command line (after printing
# its own diagnostic); in that case stop with a usage summary instead of
# carrying on with defaults.
GetOptions("spam=s", "nonspam=s", "nocollapse", "nozoom", "buckets=i")
  or die "usage: draw-bayes-histogram [--spam=spam.log] [--nonspam=nonspam.log]\n".
         "\t\t[--nocollapse] [--nozoom] [--buckets=N]\n";

# Choose a log file name: the --option value wins, then a positional
# argument that does not look like an option, then the built-in default.
# The positional argument is tested for truth before it is pattern-matched,
# so an absent argument never reaches the regex as undef (which used to
# trigger "uninitialized value" warnings under -w).
my $pick_log = sub {
  my ($from_option, $positional, $default) = @_;
  return $from_option if $from_option;
  return $positional  if $positional && $positional !~ /^\-/;
  return $default;
};

my $spam    = $pick_log->($opt_spam,    $ARGV[0], "spam.log");
my $nonspam = $pick_log->($opt_nonspam, $ARGV[1], "nonspam.log");

# Number of score buckets; a missing or zero --buckets falls back to 25.
# A negative count would produce a negative step size, so reject it here.
my $buckets = $opt_buckets || 25;
if ($buckets < 1) { die "--buckets must be a positive number\n"; }

my $zoomfactor = 20;	# magnification applied to counts in the zoom view
my $range_lo = 0.0;	# lowest score covered by the histogram
my $range_hi = 1.0;	# highest score covered by the histogram

# Build the bucket table.  @buckets holds the lower bound of every bucket
# in ascending order; %bux_sp and %bux_ns hold the spam and nonspam hit
# counts, keyed by that lower bound, all starting at zero.
#
# There are $buckets buckets of width $step spanning [$range_lo, $range_hi),
# plus one extra bucket starting at $range_hi itself, so that a score equal
# to the top of the range has somewhere to go.
%bux_sp = ();
%bux_ns = ();

my $step = ($range_hi - $range_lo) / $buckets;
my $i;
for ($i = 0; $i <= $buckets; $i++) {
  # Derive each bound from the integer index rather than by repeatedly
  # adding $step: accumulated rounding error otherwise decides whether the
  # top bucket exists and what value it gets.  The last bound is pinned to
  # exactly $range_hi.
  my $lower = ($i == $buckets) ? $range_hi : $range_lo + $i * $step;
  push (@buckets, $lower);
  $bux_ns{$lower} = $bux_sp{$lower} = 0;
}

# Read both logs and count how many scored lines fall into each bucket.
# Each log is paired with the counter hash it feeds, so the spam/nonspam
# decision does not depend on comparing file names (which misfiled every
# line when both options named the same file).
foreach my $source ([$spam, \%bux_sp], [$nonspam, \%bux_ns]) {
  my ($file, $counts) = @$source;

  open (my $in, '<', $file) or die "Could not open file '$file': $!";

  while (my $line = <$in>) {
    # Only result lines (first column "." or "Y") that carry a bayes=
    # field are of interest; the score runs up to whitespace or a comma.
    $line =~ /^(\.|Y)\s.+bayes=([^\s,]+)/ or next;
    my $score = $2+0;

    # Pick the highest bucket whose lower bound does not exceed the score
    # (@buckets is in ascending order).  A score below the first bound or
    # above the last is clamped to the nearest end bucket, so every matched
    # line is counted instead of vanishing under an undefined key.
    my $bucket_id = $buckets[0];
    foreach my $bucket (@buckets) {
      last if $score < $bucket;
      $bucket_id = $bucket;
    }

    $counts->{$bucket_id}++;
  }

  close ($in);
}

# Walk the buckets once, collecting for spam (_sp) and nonspam (_ns) both
# the grand total of hits and the size of the fullest bucket.
my ($max_sp, $max_ns, $tot_sp, $tot_ns) = (0, 0, 0, 0);
for my $lower_bound (@buckets) {
  my $spam_hits = $bux_sp{$lower_bound};
  my $ham_hits  = $bux_ns{$lower_bound};

  $tot_sp += $spam_hits;
  $max_sp = $spam_hits if $spam_hits > $max_sp;

  $tot_ns += $ham_hits;
  $max_ns = $ham_hits if $ham_hits > $max_ns;
}

# Width of the overall histogram in characters; without the zoom view
# there is room for ten more.
my $chars_in_line = $opt_nozoom ? 55 + 10 : 55;

# Hits represented by one output character, so that the fullest bucket
# spans the whole width.  Zero scales and totals are replaced by a tiny
# positive number because they are later used as divisors.
my $scale_sp = $max_sp / $chars_in_line || 0.000001;
my $scale_ns = $max_ns / $chars_in_line || 0.000001;
$tot_sp = 0.000001 unless $tot_sp;
$tot_ns = 0.000001 unless $tot_ns;

print STDOUT 
 "SCORE  NUMHIT   DETAIL     OVERALL HISTOGRAM  (. = ham, # = spam)\n";
# 0.000 (19.217%) ..........|....................

# Draw the bar for one count: one $glyph per $scale hits, rounded to the
# nearest character.  Unless --nozoom was given, the bar is preceded by a
# fixed 10-column "zoom" field showing the same count magnified by
# $zoomfactor (cut off at 10 characters) and a '|' separator.
my $render_bar = sub {
  my ($hits, $scale, $glyph) = @_;

  my $bar = $glyph x int (($hits / $scale) + .5);
  return $bar if $opt_nozoom;

  my $zoom = $glyph x int ((($hits * $zoomfactor) / $scale) + .5);
  return sprintf ("%-10s", substr ($zoom, 0, 10)) . '|' . $bar;
};

# For every bucket print the nonspam row, then the spam row.  Each row
# shows the bucket's lower bound, its share of that log's total as a
# percentage, and the bar.
foreach my $bucket (@buckets) {
  foreach my $row ([\%bux_ns, $scale_ns, $tot_ns, '.'],
                   [\%bux_sp, $scale_sp, $tot_sp, '#'])
  {
    my ($counts, $scale, $total, $glyph) = @$row;
    my $hits = $counts->{$bucket};

    # Rows without hits are collapsed away unless --nocollapse asks for
    # them.  (The test used to be "has hits AND NOT nocollapse", which made
    # --nocollapse suppress every row instead.)
    next if $hits == 0 && !$opt_nocollapse;

    printf STDOUT "%3.3f (%6.3f%%) %s\n", $bucket,
		(($hits / $total) * 100.0), $render_bar->($hits, $scale, $glyph);
  }
}

