#!/bin/sh
# wordfreq
#
# A fun demonstration of awk's associative arrays, which are very cool.
# Think of it as an array whose indices are actually words, in this case.
# We loop through each word in the line, and add it to an array:
# every instance of "the" gets incremented, as num[the]++.
# The final END statement loops through the array we filled up, using
# the variable "word" to see what things are where.  Analogous to shell
# scripting "for f in *.dat", and we pipe it through sort to put things
# in alphabetical order, and I pipe to grep to point out something...
#
# Usage: wordfreq [file]
#   file  text to analyse; defaults to essay.txt in the current directory.
#
# Exit status: 0 after printing both demonstrations, 1 (with a diagnostic
# on stderr and nothing on stdout) when the input file cannot be read.

# Read text on stdin and print one "word count" line per distinct
# whitespace-separated word.  awk's "for (word in num)" visits the keys in
# no particular order, so callers sort the result themselves.
count_words() {
  awk '
    { for (i = 1; i <= NF; i++) num[$i]++ }
    END { for (word in num) print word, num[word] }
  '
}

# Read text on stdin, change it to lower case, and turn periods, commas and
# bangs into spaces so that "Word", "word," and "word!" all count as "word".
normalize_text() {
  # Character classes instead of A-Z/a-z: range expressions depend on the
  # locale's collating order.  A single bracket expression replaces three
  # sed processes and avoids \, and \! (escaping an ordinary character is
  # undefined in POSIX basic regular expressions).
  tr '[:upper:]' '[:lower:]' | sed 's/[.,!]/ /g'
}

# Run both demonstrations against the file named by $1 (default essay.txt).
main() {
  # Fall back to the original demo file when no argument is given.
  set -- "${1:-essay.txt}"

  if [ ! -f "$1" ] || [ ! -r "$1" ]; then
    printf 'wordfreq: cannot read input file: %s\n' "$1" >&2
    return 1
  fi

  echo ""
  echo "Just using associative arrays"
  # The file is fed on stdin so an odd file name (leading "-", or one that
  # looks like an awk "var=value" operand) cannot be misinterpreted.
  count_words < "$1" | sort | grep "e" | head -7

  # See how the output depends on the case of the letters in the word?  And
  # punctuation marks are getting in the way?  We can take care of this!
  echo ""
  echo "Filtering, then using associative arrays"
  # read file, change case, kill periods, commas and bangs, then count
  normalize_text < "$1" | count_words | sort | grep "b"
  echo ""
}

main "$@"