#!/usr/bin/perl
#
# stats - computes summary statistics for a series of numbers.
#
# Tom Moertel
# 5 March 1999
#
# Docs at __END__.
#
# $Id: stats,v 1.8 2005/08/10 17:13:19 thor Exp $

use strict;
use warnings;

use POSIX qw(ceil floor);
use List::Util qw(sum);

# Sorted samples of the current run.  main() fills this in; percentile()
# falls back to it when the caller does not pass a data set explicitly.
my @data;

# Run the analysis only when executed as a program, so that the helpers
# below can be loaded with require/do without consuming any input.
main() unless caller;


# entry point ==========================================================

sub main {
    # Read ALL input.  In slurp mode <> hands back one whole file per
    # read, so it must be called in list context to pick up every file
    # named on the command line (a scalar-context read stops after the
    # first one).  Joining with "\n" keeps the last number of one file
    # from fusing with the first number of the next.
    my $text = do { local $/; join "\n", <> };

    # order the data now to make percentiles easy to compute
    @data = parse_numbers($text);

    unless (@data) {
        print STDERR "No data provided; end of analysis.\n";
        exit 1;
    }

    # compute and report descriptive statistics
    my $sum   = sum @data;
    my $count = @data;
    my $mean  = $sum / $count;

    print "count = ",   $count,           "\n";
    print "min = ",     $data[0],         "\n";
    print "10% cut = ", percentile(0.10), "\n";
    print "25% cut = ", percentile(0.25), "\n";
    print "median = ",  percentile(0.50), "\n";
    print "mean = ",    $mean,            "\n";
    print "75% cut = ", percentile(0.75), "\n";
    print "90% cut = ", percentile(0.90), "\n";
    print "max = ",     $data[-1],        "\n";

    # some stats make sense only when more than one sample is provided
    if ($count > 1) {
        my $numer   = sum( map { ($_ - $mean)**2 } @data );
        my $s2      = $numer / ($count - 1);    # sample variance
        my $s2pop   = $numer / $count;          # population variance
        my $stdev   = sqrt $s2;
        my $popstdv = sqrt $s2pop;

        print "var = ",     $s2,      "\n";
        print "stdev = ",   $stdev,   "\n";
        print "popvar = ",  $s2pop,   "\n";
        print "popstdv = ", $popstdv, "\n";
    }
    else {
        print "var = \n";
        print "stdev = \n";
        print "popvar = \n";
        print "popstdv = \n";
    }

    return;
}


# helpers ==============================================================

sub parse_numbers {

    # Given a string of text, return the numbers found in it, sorted in
    # ascending numeric order.  Returns an empty list when the text is
    # undefined or contains no numbers.
    #
    # The text is cut into tokens on every run of characters that cannot
    # be part of a number.  A token is kept only if it STARTS with a
    # number, and only that leading number is kept, which is the same
    # value Perl would have used for the whole token.  Tokens that merely
    # contain a digit somewhere (e.g. the "e3." left over from
    # "file3.txt") are dropped rather than being counted as 0.
    #
    # Note that "+" is a separator, so "1e+5" is read as the two values
    # 1 and 5.

    my ($text) = @_;
    return unless defined $text;

    my $leading_number = qr{
        \A
        (
            -?                          # optional sign
            (?: \d+ \.? \d* | \. \d+ )  # 12   12.   12.5   .5
            (?: [eE] -? \d+ )?          # optional exponent
        )
    }x;

    my @numbers;
    for my $token (split /[^-0-9.eE]+/, $text) {
        if (my ($number) = $token =~ $leading_number) {
            push @numbers, $number;
        }
    }

    my @sorted = sort { $a <=> $b } @numbers;
    return @sorted;
}

sub percentile {

    # given a portion P, return X such that the portion of
    # values in the data that are <= X is P
    #
    # P is a fraction between 0 and 1.  The optional second argument is
    # a reference to an array of numbers sorted in ascending order; when
    # it is omitted the samples read by main() are used.  When P falls
    # between two samples the result is linearly interpolated between
    # them.  Returns nothing when the data set is empty.

    my ($P, $sorted) = @_;
    $sorted = \@data unless defined $sorted;
    return unless @{$sorted};

    my $pos      = $P * $#{$sorted};    # fractional index into the data
    my $pos_low  = floor($pos);
    my $pos_high = ceil($pos);
    my $X_low    = $sorted->[$pos_low];
    my $X_high   = $sorted->[$pos_high];

    return $X_low + ($X_high - $X_low) * ($pos - $pos_low);
}

__END__

=head1 NAME

stats.pl - computes summary statistics for a set of numbers

=head1 SYNOPSIS

B<stats.pl> [I<file>...]

=head1 DESCRIPTION

If you pass in a set of whitespace-separated numbers, either in files
or via standard input, this program will compute various summary
statistics for the set.  The program will also attempt to ignore any
non-numeric garbage that may be in the input.

The output follows this format:

    count = 16773
    min = 1.1
    10% cut = 4.2
    25% cut = 5.3
    median = 6.3
    mean = 6.11835688308601
    75% cut = 7.2
    90% cut = 7.7
    max = 9.7
    var = 1.91423312161212
    stdev = 1.38355813813953
    popvar = 1.91411899574784
    popstdv = 1.38351689391486

=head1 NOTES

=over 4

=item median

If the number of elements in the input set is even, such that there is
no distinct median element, the median is computed as the average of
the two elements that together represent the median.  Thus the median
of the set {1, 2} is 1.5.

=item percentile "cut" scores

The percentile scores are computed as if the set of input data
represents a continuous distribution.  Thus if there is no distinct
element that sits exactly at the I<P>th percentile cut-off point, the
percentile score will be computed via linear interpolation on the two
elements that bracket the point.  (Note that this is consistent with
the computation of the median such that the median equals the 50th
percentile.)

=back

=head1 AUTHOR

Tom Moertel

    First release: 5 March 1999
    Overhauled:    26 July 2004
    Style tweaks:  10 August 2005

$Id: stats,v 1.8 2005/08/10 17:13:19 thor Exp $