# This script will search any file for occurrences of "bacon" (or any of
# its equivalents) using Penn Leary's methods as described in *The
# Cryptographic Shakespeare*.  To run this script, save it as a file
# "bacfind.sh" and set permissions by entering the command 
#             chmod u+x bacfind.sh
#
# Since this script is designed to search extremely large text files,
# (an ascii file of Shakespeare's works is about 5 megabytes in size)
# it is assumed that you will use your server's "/tmp" directory to keep
# the big file for the short time it takes this script to run.
# (Searching the works of Shakespeare using this script took about
# seven minutes, but there are ways to speed it up.)
#
# Now download a text to your server's "/tmp" directory and substitute the
# name of the file for "TMPFILE" in the first line of this script.
# 
# This script will translate the file into Leary's 21-letter alphabet,
# remove all punctuation and spaces, and then search for every line that
# contains a string that would count as an occurrence of "bacon" under
# Leary's rules, looking first for forwards "bacons" and then for backwards
# "bacons".  It then sends a count of the lines containing forwards and
# backwards "bacons", as well as the lines themselves, to a file in the
# directory you run the script from (normally your home directory) that
# will be called "bac.TMPFILE" where "TMPFILE" will be whatever filename
# you used in your server's "/tmp" directory.  All "/tmp" files used are
# then erased.
#
# This script is heavily commented; any lines beginning with "#" may be
# deleted.
# 
# Feel free to modify this script as you wish.
#
# Send questions or comments to Terry Ross at tross@mail.bcpl.lib.md.us
#
#------------------------------cut here-------------------------
# Name of the text file (inside "/tmp") to search.  Either replace TMPFILE
# here, or pass the bare file name (no directory part) as the first
# argument to the script; with no argument the name written here is used.
book=${1:-TMPFILE}
# Create the report file in the current directory, or empty it if it
# already exists.  ":" is a no-op command, so only the redirection acts.
: > "bac.$book"

# The script uses a chain of agrep and sed commands to translate the
# original text file into a file that may be easily and quickly searched
# (this translation is the slowest part of the procedure).
#
# Note on the patterns below: "{", "}" and "|" are written unescaped on
# purpose.  In a sed basic regular expression "\{" starts an interval and
# "\|" is not a literal bar (POSIX leaves it undefined; GNU sed treats it
# as alternation), so the escaped forms do not match the characters
# themselves.
#
# Stop early if the input cannot be read, instead of silently producing
# an empty translation.
#
if [ ! -r "/tmp/$book" ]; then
  echo "bacfind: cannot read /tmp/$book" >&2
  exit 1
fi
#
# the first command gathers all non-blank lines
#
agrep '.' "/tmp/$book" |
#
# then deletes all spaces at the head of each line
#
sed 's/^ *//' |
#
# then changes all multiple spaces to single spaces
#
sed 's/  */ /g' |
#
# then removes everything between "{" and "}" (generally editorial matter);
# "[^}]*" stops at the first "}" so that ordinary text lying between two
# separate bracketed notes on one line is kept
#
sed 's/{[^}]*}//g' |
#
# then removes everything between "[" and "]" (generally editorial matter)
#
sed 's/\[[^]]*\]//g' |
#
# then removes everything between "|" and "|"
#
sed 's/|[^|]*|//g' |
#
# then removes most punctuation as well as "0", "Z", and "X"; the "-" is
# placed last in the bracket expression so that it is taken literally
# rather than as a range, and "#" is used as the delimiter because "/" is
# one of the characters being removed
#
sed 's#[.,:;"!?_()&{}/0zZxX-]##g' |
#
# then removes apostrophes
#
sed "s/'//g" |
#
# then translates numerals into letters
#
sed 'y/123456789/abcdefghi/' |
#
# then we go through again and remove extra spaces
#
sed 's/^ *//' |
sed 's/  */ /g' |
#
# then we pass along all lines with alphabetical characters
#
agrep '[A-Za-z]' |
#
# then we strip every remaining character that is not a letter; this
# includes the spaces inside a line, so each line becomes one unbroken word
#
sed 's/[^A-Za-z]//g' |
#
# then we reformat the text to make fewer but longer lines to search
#
fmt -w 85 |
#
# then we get rid of all spaces and save the modified text in the "/tmp/"
# directory
#
sed 's/ //g' > "/tmp/ci$book"
#
# then we remove the original file from the "/tmp" directory -- but only
# when the translated copy exists and is not empty, so that a failed
# translation does not also cost you the text you downloaded
#
if [ -s "/tmp/ci$book" ]; then
  rm -- "/tmp/$book"
else
  echo "bacfind: /tmp/ci$book is missing or empty; keeping /tmp/$book" >&2
fi
#
#
# The next two commands do the real work of the script.  Using Penn
# Leary's methods, this one searches (ignoring case) for any string that
# begins with a "T" (a "T" in English is a "b" in the cipher that Leary
# thinks Bacon used), which is immediately followed by 1 or more of the
# English letters "aekqrs" (letters that become vowels in Bakish), which
# is immediately followed by one of "fmuvw" and then any number of
# "dfmuvw" (letters that in Bakish could have a hard "c" or "k" sound),
# which is immediately followed by 1 or more of "aekqrs" again, which is
# immediately followed by an "I" or "J" (either of which becomes "n" in
# Bakish).  The lines of the translated text that contain the pattern are
# saved to a temporary file in the current directory.
#
agrep -i 't[aekqrs]([aekqrs]*)[fmuvw]([dfmuvw]*)[aekqrs]([aekqrs]*)[ij]' \
  "/tmp/ci$book" > "bac.$book.tmp.forward"
#
# The next command searches for the same pattern spelled backwards: an "I"
# or "J", then 1 or more of "aekqrs", then any number of "dfmuvw" followed
# by one of "fmuvw", then 1 or more of "aekqrs", then a "T".  Lines that
# contain the backwards pattern are saved to a second temporary file in
# the current directory.
#
agrep -i '[ij][aekqrs]([aekqrs]*)([dfmuvw]*)[fmuvw][aekqrs]([aekqrs]*)t' \
  "/tmp/ci$book" > "bac.$book.tmp.backward"
#
# Now we remove the modified text file from the "/tmp" directory; the
# name is quoted so that a file name containing spaces or wildcard
# characters is removed as one file
#
rm -- "/tmp/ci$book"
#
# The rest of the script writes the report file "bac.$book" in the current
# directory: first the number of lines that matched forwards and
# backwards, then the matching lines themselves under the headings
# FORWARDS and BACKWARDS.  Everything is written through one redirection,
# which replaces any earlier contents of the report.  The two temporary
# files are deleted once the report has been written.
#
forward_count=$(agrep -c . "bac.$book.tmp.forward")
backward_count=$(agrep -c . "bac.$book.tmp.backward")
{
  echo "$forward_count Forwards"
  echo "$backward_count Backwards"
  echo ' '
  echo FORWARDS
  agrep . "bac.$book.tmp.forward"
  echo ' '
  echo BACKWARDS
  agrep . "bac.$book.tmp.backward"
  echo ' '
} > "bac.$book"
rm -- "bac.$book.tmp.forward" "bac.$book.tmp.backward"
#
# To see how many "bacons" were in the file you searched, use your text
# editor, or the "head", "more", or "less" command to look at the top of
# the file "bac.TMPFILE" (where "TMPFILE" is the name you used for the
# original file you searched).
#
# ------------------  end of bacfind.sh -------------------