Tying all this together, here is a computational linguistics program which analyses word usage in a text file, applied to the Project Gutenberg version of the complete works of Shakespeare:
#! /usr/bin/perl -w

use strict;
use Text::Wrap;

{
    my ($sourceFile, $wordFile, $plotFile, $line, $contents, $word,
        $wordCount, $totalCount, $vocabularyCount, $incrementalCount,
        $coverage, $frequency, $i, $sortedWords,
        @words, @sortedWords, %index, %wordsByFrequency);

    # open source file and store contents in a string
    $sourceFile = 'shakespeare.txt';
    open(SOURCE, "$sourceFile") or die("can't open $sourceFile");
    while ($line = <SOURCE>) {
        $contents .= $line;
    }
    close(SOURCE) or die("can't close $sourceFile");

    # open analysis file and write header
    $wordFile = 'words.txt';
    open(WORDS, ">$wordFile") or die("can't open $wordFile");
    print(WORDS "ANALYSIS OF \U$sourceFile\E\n\n");

    # open plot file and write header
    $plotFile = 'commands.gnuplot';
    open(PLOT, ">$plotFile") or die("can't open $plotFile");
    print(PLOT <<EOF);
set log y
set title "ANALYSIS OF \U$sourceFile\E"
set xlabel "word coverage"
set ylabel "most frequent words"
set term postscript color
set out "plot.ps"
plot '-' with lines
EOF

    # compact file and count words
    @words = ();
    $contents =~ s/-/ /g;                   # replace hyphens with spaces
    while ($contents =~ m/([^\s]+)/g) {     # extract space-separated words
        $word = $1;                         # must copy $1!
        if (($word !~ m/[A-Z][A-Z]+/) &&    # don't match eg 'LEAR'
            ($word !~ m/^[A-Z]+\.$/)) {     # don't match eg 'Lear.'
            $word =~ s/^\W+//;              # strip leading punctuation
            $word =~ s/\W+$//;              # strip trailing punctuation
            $word =~ s/'.*$//;              # strip everything after an apostrophe
            push(@words, $word);
        }
    }
    $totalCount = scalar(@words);
    print(WORDS "Total word count = $totalCount\n\n");

    # make word index
    %index = ();
    for ($i = 0; $i < $totalCount; $i++) {
        $words[$i] = "\l$words[$i]";        # lower-case first character
        if (($words[$i] !~ m/_/) &&         # no underscores
            ($words[$i] !~ m/\d/)) {        # no digits
            push(@{$index{$words[$i]}}, $i);
        }
    }
    $vocabularyCount = scalar(keys(%index));
    print(WORDS "Vocabulary count = $vocabularyCount\n\n");

    # make inverted word index
    %wordsByFrequency = ();
    foreach $word (keys(%index)) {
        $frequency = scalar(@{$index{$word}});
        push(@{$wordsByFrequency{$frequency}}, $word);
    }

    # record word usage
    $coverage = 0;
    $incrementalCount = 0;
    print(WORDS "Key: [incremental count] (word frequency) {word count}\n\n");
    foreach $frequency (sort reverseNumeric keys(%wordsByFrequency)) {
        @sortedWords = sort(@{$wordsByFrequency{$frequency}});
        $sortedWords = wrap('', '', join(' ', @sortedWords));
        $wordCount = scalar(@sortedWords);
        $incrementalCount += $wordCount;
        $coverage += $wordCount*$frequency/$totalCount;
        print(WORDS "[$incrementalCount] ($frequency) {$wordCount}:\n");
        print(WORDS "$sortedWords\n\n");
        print(PLOT "$coverage $incrementalCount\n");
    }
    print(PLOT "e\n");
    close(WORDS) or die("can't close $wordFile");
    close(PLOT) or die("can't close $plotFile");
    system("gnuplot $plotFile") == 0 or die("can't execute gnuplot");
}

# sub numeric:
#   For numeric sorting. Normal sorting is ASCII. Usage: 'sort numeric @list;'.
sub numeric () {
    $a <=> $b;
}

# sub reverseNumeric:
#   For reverse numeric sorting (highest number first). Normal sorting is
#   ASCII. Usage: 'sort reverseNumeric @list;'.
sub reverseNumeric () {
    $b <=> $a;
}
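The heart of the program is a pair of hashes of arrays: %index maps each word to the list of positions at which it occurs, and %wordsByFrequency inverts that, mapping each frequency to the words occurring that often. Here is a minimal, self-contained sketch of the same idea on a made-up six-word sample (everything in it outside the names taken from the listing above is illustrative):

#! /usr/bin/perl -w

use strict;

my @words = qw(to be or not to be);    # stand-in for the extracted word list

# word index: word => list of positions where it occurs
my %index;
for my $i (0 .. $#words) {
    push(@{$index{$words[$i]}}, $i);
}

# inverted index: frequency => list of words occurring that often
my %wordsByFrequency;
for my $word (keys(%index)) {
    push(@{$wordsByFrequency{scalar(@{$index{$word}})}}, $word);
}

# walk frequencies from highest to lowest, as the full program does
for my $frequency (sort { $b <=> $a } keys(%wordsByFrequency)) {
    print("$frequency: @{[ sort(@{$wordsByFrequency{$frequency}}) ]}\n");
}
# prints "2: be to" then "1: not or"

Note that only the length of each position list is ever used; the positions themselves are surplus here, though they would support concordance-style lookups if the program were extended.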
The program generates an analysis of word usage in words.txt and the following word coverage plot. It takes 7 seconds to process the 5 MB source file on my 64-bit Linux system at home.