#!/usr/bin/perl -w
#
# tabulate.pl - convert tagged data into tabular format
#
# Tom Moertel
# 27 April 1999
#
# Docs at __END__.
#
# $Id: tabulate,v 1.3 2001/09/05 18:50:46 thor Exp $

use strict;

# Parser state shared with dump_record():
my @tags;               # one [name, latest value] pair per tag, in the
                        # order in which the tags were first seen
my %tag_pos;            # tag name => index of that tag within @tags
my $tag_count = 0;      # number of distinct tags learned so far
my $last_pos  = -1;     # index of the most recently read tag (-1 = none)

while (<>) {

    # preprocess the line

    next unless /\S/;   # skip blanks
    chomp;              # trim line endings

    # Break the line into a tag-value pair.  Whitespace before the tag
    # and around the "=" is not part of either the tag or the value, so
    # an indented "Name = Fred" names the same tag as a flush-left one.

    my ($tag_name, $val) = /^\s*(.*?)\s*=\s*(.*)/;

    # A line without "=" is not a tag-value pair.  Skip it with a
    # diagnostic rather than treating it as a tag whose name is undef.

    unless (defined $tag_name) {
        warn "$0: skipping malformed line $. (no '=' found): $_\n";
        next;
    }

    # If we've encountered a new tag, learn it and remember its
    # ordinal position.

    my $pos = $tag_pos{$tag_name};      # find the tag's ordinal posn
    unless (defined $pos) {             # unknown tag?
        $pos = $tag_count++;            #  => yes, give it the next posn
        $tag_pos{$tag_name} = $pos;     #     remember posn
        push @tags, [$tag_name];        #     remember tag
    }

    # Tags appear in the same order within every record, so a tag whose
    # position is not beyond the previous tag's position must start a
    # new record.  Dump the record built up so far before overwriting
    # any of its values.

    dump_record() if $pos <= $last_pos;

    $tags[$pos][1] = $val;              # remember new value for tag
    $last_pos      = $pos;              # remember this tag's posn
}

# A record is only printed when the start of the next one is seen, so
# whatever record was still being accumulated when the input ran out
# has not been printed yet.  Print it now.

dump_record();


# helpers ====================================================================

# This part of the file is not reached at run time until every call to
# dump_record() has already been made, so an ordinary "my $x = 0;" here
# would run too late to initialise the counter.  The BEGIN block makes
# the initialisation happen at compile time, before the main program
# starts to run.

BEGIN {
    my $record_count = 0;       # number of records printed so far

    # dump_record()
    #
    # Print the current record, i.e. the latest value of every tag in
    # @tags, as one line of tab-separated columns.  The first call that
    # prints anything also prints a header line of tag names first.
    # Does nothing if no tags have been seen.  Takes no arguments and
    # returns nothing useful.

    sub dump_record {

        return unless @tags;    # empty data set: no header, no record

        # join() gets only the column values; the newline is a separate
        # argument to print so that no tab is emitted in front of it.

        print join("\t", map { $_->[0] } @tags), "\n"
            unless $record_count++;
        print join("\t", map { $_->[1] } @tags), "\n";

        return;
    }
}

__END__

=head1 NAME

tabulate.pl -- Convert tagged data into tabular data

=head1 SYNOPSIS

B<tabulate.pl> [I<files>] E<gt> I<output>

=head1 DESCRIPTION

This program takes files that contain lines of tagged data of the
form I<tag>=I<value>, where a series of such tag-value pairs
represents a record, and converts the data into tabular data, one
line per record.  If no files are provided, the program takes the
data from standard input.

The format of the output is tabular, with rows (records) separated
by line breaks and column values separated by tabs.

For example, given the following input:

    Name = Fred
    Job = Programmer
    Name = Sally
    Job = Manager

the program produces

    Name    Job
    Fred    Programmer
    Sally   Manager

The program requires that the order of the tags within each record
be the same throughout the entire set of data.  However, tag-value
pairs may be omitted, in which case the value associated with the
previous occurrence of the missing tag is used.  This behavior can
be used to group records.  For example:

    Department = I/S
    Name = Fred
    Job = Programmer
    Name = Sally
    Job = Manager
    Department = Sales
    Name = Kirk
    Job = Salesman
    Name = Bill
    Name = Jim

becomes:

    Department  Name    Job
    I/S         Fred    Programmer
    I/S         Sally   Manager
    Sales       Kirk    Salesman
    Sales       Bill    Salesman
    Sales       Jim     Salesman

Notice how B<tabulate.pl> determined record breaks automatically and
provided the missing Department and Job values when required.

=head1 AUTHOR

Tom Moertel E<lt>tom@moertel.comE<gt>
27 April 1999