docs/spellchk.ex

-- Spell check of a file. The default dictionary (EWORDS.TXT) or your own 
-- dictionary can be used.
-- The dictionary file:
--         - must have one word per line.
--         - must be all upper-case letters.
--         - does not have to be sorted, but performance-wise it helps.
-- You can sort a dictionary file by typing:
--     sort < yourDictionary > sortedDictionary
-- before using spellchk.
--
-- Usage:   eui spellchk yourFileName dictionaryFileName
--       or
--          eui spellchk yourFileName   <-- use default dictionary: WORDS.TXT
--       or
--          eui spellchk                <-- to enter file names as the response
--
-- Spell check is case-insensitive.

-- Written by Junko C. Miura of Rapid Deployment Software
-- Public Domain, use as you wish.

-- How it works:
-- 1. Each unique word in the user's document is stored in a hash table
--    in memory. 
-- 2. Each word of the dictionary is read in and looked up in the hash table.
--    When a word is matched, it is deleted from the hash table.
-- 3. The words that remain in the hash table after reading the whole
--    (51802-word) dictionary are the "bad" words that might be 
--    spelling mistakes.

without type_check

include wildcard.e

constant TRUE = 1
constant EOF = -1
constant KEYB = 0, DISPLAY = 1, ERROR = 2
constant INDEX_APOSTROPHE = 27, INDEX_BOGUS = 28

integer userFile, dictFile
sequence lookUpTable
sequence hashBucket, currentBucket

function delete_trailing_white(sequence name)
-- get rid of blanks, tabs, newlines at end of string
	while length(name) > 0 do
		if find(name[length(name)], "\n\r\t ") then
			name = name[1..length(name)-1]
		else
			exit
		end if
	end while
	return name
end function

function get_response(sequence query)
	-- It displays the query and returns the response after stripping the
	-- trailing white space. 
	sequence s
	
	puts(DISPLAY, query)
	s = gets(KEYB)
	puts(DISPLAY, '\n')
	return delete_trailing_white(s)
end function

sequence command

procedure openFiles()
-- get file names and open them 

	command = command_line()
	if length(command) = 3 then         -- if user typed only one file name
		command = append(command, "EWORDS.TXT")  -- assume default dictionary
	elsif length(command) < 3 then
		command = append(command, 
						 get_response("file name to be spell-checked? "))
		command = append(command,
						 get_response("dictionary name? (hit ENTER for " &
									  "the default (ie, EWORDS.TXT))"))  
		if length(command[4]) = 0 then
			command[4] = "EWORDS.TXT"
		end if
	end if
		
	userFile = open(command[3], "rb")
	if userFile = -1 then
		puts(ERROR, "couldn't open " & command[3] & '\n')
		abort(1)
	end if
	dictFile = open(command[4], "r")
	if dictFile = -1 then
		puts(ERROR, "couldn't open " & command[4] & '\n')
		abort(1)
	end if
end procedure

procedure initLookUpTable()
-- can be indexed with values from -1 to 255
	lookUpTable = repeat(INDEX_BOGUS, 257)
	for i = 'A' to 'Z' do                   -- 'A' to 'Z' map to 1 to 26
		lookUpTable[i + 2] = i - 'A' + 1
	end for
	for i = 'a' to 'z' do                   -- 'a' to 'z' map to 1 to 26
		lookUpTable[i + 2] = i - 'a' + 1
	end for
	lookUpTable['\'' + 2] = INDEX_APOSTROPHE
end procedure

function nextWord(integer fn)
-- read user's document to get the next alphabetic string. It only accepts 
-- letters that are registered as valid in the look-up-table.
	integer c
	sequence word

	word = ""
	while TRUE do
		c = getc(fn)
		if lookUpTable[c + 2] != INDEX_BOGUS then
			word = append(word, c)
		elsif length(word) > 0 then
			return word
		elsif c = EOF then
			return 0
		end if
	end while
end function

function calcHashValue(sequence word)
	-- Hash function based on the first two letters of a word.
	-- This hash function does not distribute the words evenly over
	-- the available buckets, like a good hash function should, but
	-- it has the advantage that the buckets are in sorted order.
	-- This helps when we read in the (hopefully) sorted dictionary. 
	integer i, j
			
	i = lookUpTable[word[1] + 2]
	if length(word) > 1 then
		j = lookUpTable[word[2] + 2]
	else                                -- blank is one of the bogus letters,
		j = INDEX_BOGUS                 -- but there are valid one-letter words
	end if
	return {i, j}
end function

procedure buildUserData()
-- Read words from user's document and insert them into the hash table
	object word 
	sequence bucket
	
	-- in below, 26 for 'A', 'a' to 'Z', 'z'. Plus '\'' and "bogus letters"
	-- including ' '.
	hashBucket = repeat(repeat({}, 28), 28)     
	
	while TRUE do
		word = nextWord(userFile)
		if sequence(word) then      -- not EOF yet
			bucket =calcHashValue(word)
			
			-- either (1)append the word in the list or (2)if the same 
			-- case-sensitive word is already in the list ignore the word
			if not find(word, hashBucket[bucket[1]][bucket[2]]) then
				hashBucket[bucket[1]][bucket[2]]
						= append(hashBucket[bucket[1]][bucket[2]], word)
			end if
		else                        -- EOF reached
			exit
		end if
	end while
end procedure

procedure goThroughDict()
-- read all words from the dictionary and look them up in the hash table
	object word
	sequence bucket, ub
	integer i
	
	currentBucket = {0, 0}          -- initialize to impossible value
	while TRUE do
		word = gets(dictFile)
		if sequence(word) then      -- not EOF yet
			-- word = word[1..length(word)-1]  -- remove \n
			word = delete_trailing_white(word)
			bucket = calcHashValue(word)
			
			-- treat all the same-spelling-but-different-case words as the
			-- same words (ie, case-insensitive) when spell checking
			if length(hashBucket[bucket[1]][bucket[2]]) > 0 then
				-- try to reuse the upper case bucket from last time
				if compare(currentBucket, bucket) != 0  then
					ub = upper(hashBucket[bucket[1]][bucket[2]])
					currentBucket = bucket
				end if
				while TRUE do
					i = find(word, ub)
					if i = 0 then
						exit
					else
						-- word found - mark as "deleted"
						hashBucket[bucket[1]][bucket[2]][i] = 0  
						ub[i] = 0
					end if
				end while
			end if
		else                        -- EOF reached
			exit
		end if
	end while
end procedure

procedure goThroughUserData()
-- Output the remaining words.
-- These "bad" words could not be found in the dictionary.
	integer outFile

	outFile = open("badwords.txt", "a")
	puts(outFile, "\n----------------\n" & "    " & command[3] & '\n')
	if outFile = -1 then
		puts(ERROR, "couldn't open badwords.txt\n")
		abort(1)
	end if
	for i = 1 to length(hashBucket) do
		for j = 1 to length(hashBucket[i]) do
			for k = 1 to length(hashBucket[i][j]) do
				if sequence(hashBucket[i][j][k]) then
					puts(outFile, hashBucket[i][j][k] & '\n')
				end if
			end for
		end for
	end for
	close(outFile)
end procedure

procedure displayHashTable()
-- Debug routine to see how many words hashed to each bucket.
-- Most buckets will be empty.
	for i = 1 to length(hashBucket) do
		for j = 1 to length(hashBucket[i]) do
			printf(1, "bucket (%2d, %2d) has %6d entries\n",
					  {i, j, length(hashBucket[i][j])})
		end for
	end for
end procedure


openFiles()

atom t
t = time()

initLookUpTable()
buildUserData()
-- displayHashTable()
goThroughDict()
goThroughUserData()