#!/usr/bin/perl
#
# Generate a histogram from a set of data.
#
# Tom Moertel
# 1 April 1999
#
# $Id: histogram,v 1.6 2005/08/10 17:25:46 thor Exp $
#
# MODIFICATION HISTORY
#
# 2005-08-10  TGM  Stylistic tweaks.
# 2001-02-23  TGM  Added --all-integral option.
# 1999-04-01  TGM  Original.

use strict;
use warnings;

use Getopt::Long qw(GetOptions);
use List::Util qw(max);
use POSIX qw(ceil);
use Scalar::Util qw(looks_like_number);

my $BIN_COLS = 60;    # widest bar, in characters

# Parse the command line.  Recognised options are removed from @ARGV so
# that only input file names remain for <>.
#
# Returns: ($num_bins, $all_integral)
# Dies with a usage message on unknown options or a non-positive bin count.
sub parse_args {
    my $num_bins     = 10;
    my $all_integral = 0;

    GetOptions(
        'bins=i'       => \$num_bins,
        'all-integral' => \$all_integral,
    ) or die "usage: histogram [--all-integral | --bins N] [file...]\n";

    die "histogram: --bins must be a positive integer\n" if $num_bins < 1;

    return ($num_bins, $all_integral);
}

# Read every input file (or standard input when no files are named) and
# return the numeric tokens found, sorted into increasing order.
sub read_data {
    # <> is evaluated in list context on purpose: with $/ undefined each
    # read returns one whole file, so a scalar read would silently stop
    # after the first file.  Joining with a space keeps a number at the end
    # of one file from fusing with a number at the start of the next.
    my $text = join ' ', do { local $/; <> };

    # Split on anything that cannot be part of a number, then keep only the
    # tokens that really are numbers (drops "", "-", "1e", "1.2.3", ...).
    my @data = sort { $a <=> $b }
               grep { looks_like_number($_) }
               split /[^-0-9.eE]+/, $text;

    return @data;
}

# Count how many values fall into each bin.
#
#   $data     - array ref of numbers sorted into increasing order
#   $min      - lower edge of the first bin
#   $bin_inc  - width of each bin
#   $num_bins - number of bins (>= 1)
#
# Bin i covers [$min + i*$bin_inc, $min + (i+1)*$bin_inc).  Returns the
# list of per-bin counts.
sub count_bins {
    my ($data, $min, $bin_inc, $num_bins) = @_;

    my @bins     = (0) x $num_bins;
    my $last_bin = $num_bins - 1;
    my $bin_indx = 0;
    my $bin_top  = $min + $bin_inc;

    foreach my $val (@$data) {
        # The data is sorted, so the bin index only ever moves forward.
        # Clamping at the last bin guarantees termination and keeps the
        # index inside @bins even for a zero-width bin or rounding error.
        while ($bin_indx < $last_bin && $val >= $bin_top) {
            $bin_indx++;
            $bin_top = $min + $bin_inc * ($bin_indx + 1);
        }
        $bins[$bin_indx]++;
    }

    return @bins;
}

# Print the histogram: a "bin size" header, one line per bin (lower edge,
# count, bar of asterisks) and the total count.
sub print_histogram {
    my ($bins, $min, $bin_inc, $count) = @_;

    my $max_count = max @$bins;

    print "bin size: ", $bin_inc, "\n";
    foreach my $i (0 .. $#$bins) {
        my $bin_count = $bins->[$i];

        # Bars are drawn 1:1 until the tallest would exceed $BIN_COLS; then
        # they are scaled.  Multiplying before dividing keeps the tallest
        # bar at exactly $BIN_COLS instead of risking a round-up to 61.
        my $width = $max_count <= $BIN_COLS
                  ? $bin_count
                  : ceil($bin_count * $BIN_COLS / $max_count);

        # The label is computed from the index rather than accumulated, so
        # it matches the bin edges used while counting.
        printf "%11.4f %6d %s\n",
            $min + $i * $bin_inc, $bin_count, "*" x $width;
    }
    printf "%-12s %5d\n", "total count:", $count;

    return;
}

# Program entry point: parse options, read the data, bin it, print it.
sub main {
    my ($num_bins, $opt_all_int) = parse_args();

    my @data  = read_data();
    my $count = @data;

    if (!$count) {
        # Nothing to plot; say so instead of printing a table of undefs.
        warn "histogram: no numeric data found in input\n";
        printf "%-12s %5d\n", "total count:", $count;
        return;
    }

    # The data is sorted, so the extremes sit at the ends.
    my $min      = $data[0];
    my $max      = $data[-1];
    my $distance = $max - $min;

    my $bin_inc;
    if ($opt_all_int) {
        # One unit-wide bin per integer in the range; --bins is ignored.
        $num_bins = int($distance + .5) + 1;
        $bin_inc  = 1;
    }
    elsif ($num_bins == 1 || $distance == 0) {
        # A single requested bin, or all values identical: everything goes
        # into one bin.  (A zero range used to loop forever.)
        $num_bins = 1;
        $bin_inc  = $distance;
    }
    else {
        # $num_bins - 1 intervals span the range; the last bin starts at
        # the maximum value.
        $bin_inc = $distance / ($num_bins - 1);
    }

    my @bins = count_bins(\@data, $min, $bin_inc, $num_bins);
    print_histogram(\@bins, $min, $bin_inc, $count);

    return;
}

# Run only when executed as a script, so the file can also be loaded
# (e.g. by a test) without reading input.
main() unless caller;

1;

__END__

=head1 NAME

histogram.pl - generate an ASCII histogram of numeric data

=head1 SYNOPSIS

B<histogram> [B<--all-integral> | B<--bins> I<N>] [I<file>...]

=head1 DESCRIPTION

The program takes the numeric data in the supplied files (or from
standard input if no files are supplied) and generates an ASCII
histogram that represents the relative frequencies of the data.  The
histogram is printed to standard output.

The B<--all-integral> flag, if provided, tells B<histogram> that all of
the input data is integral and that discrete values should receive
individual bucketing.  This causes the appropriate number of buckets,
all with size 1, to be added to the histogram such that no two differing
values will share the same bucket.  Use this option when all of your
data are integers that you expect to be tightly grouped.  Otherwise, the
B<--bins> option may be used to set the desired number of bins, by
default 10.

=head1 AUTHOR

Tom Moertel E<lt>tom@moertel.comE<gt>
1 April 1999