------------------------------------------------ -- Project: Spell Checker -- -- Subject: 433-141 Semester 1, 2001 -- -- Name : Huang Jian -- -- Enrolment number: 126432 -- ------------------------------------------------ import SPCLib import List -- The above module is supplied on the 141 web page. -- Make a copy of it in your directory, but do NOT -- make any changes to it. All of your changes are -- to be made in your copy of this file, SpellChecker.hs -- Some simple testing routines... test1 = doSpellCheck smalldict mary1 test2 = doSpellCheck smalldict mary2 test3 = doSpellCheck smalldict mary3 -- Version 0: This first one is stupid! It just says that every word -- in the input string is a spelling mistake. Really, the only reason -- we have given it to you is as a basis for the versions that you -- have to write... -- doSpellCheck :: SpellCheck -- doSpellCheck dict text -- = putStr result -- where -- result = presentNicely errors -- errors = [(num, w, noAnswers) | (num, w) <- wordlist] -- wordlist = prepareWords words text -- Version 1: reports words as errors if they are not in the dictionary, -- but still does not offer any alternatives. Hint -- you have to change -- one line compared to doSpellCheck, so make a copy of that function, -- comment out the original version, and then edit the copy --doSpellCheck :: SpellCheck --doSpellCheck dict text -- = putStr result -- where -- result = presentNicely errors -- errors = [(num, w, noAnswers) | (num, w) <- wordlist, -- not ( w `elem` smalldict)] -- wordlist = prepareWords words text -- Version 2: The programme is modified so that capital words are not -- reported as misspelling. To do this, convert all words into lower-case -- letters before being tested. --doSpellCheck :: SpellCheck --doSpellCheck dict text -- = putStr result -- where -- result = presentNicely errors -- errors = [(num, w, noAnswers) | (num, w) <- wordlist, -- not ((toLowAll w) `elem` smalldict)] -- where toLowAll w = [ toLower x | x <- w ] -- wordlist = prepareWords words text -- Version 3: A new fuction getWords is written to cut up a string into -- alphanumeric word. Modify doSpellCheck using this function to -- eliminate unnecessary error message. --doSpellCheck :: SpellCheck --doSpellCheck dict text -- = putStr result -- where -- result = presentNicely errors -- errors = [(num, w, noAnswers) | (num, w) <- wordlist, -- not ((toLowAll w) `elem` smalldict)] -- where -- toLowAll w = [ toLower x | x <- w ] -- wordlist = prepareWords getWords text getWords :: MyWord -> [MyWord] getWords str = words (combine str) combine :: MyWord -> MyWord combine [] = " " combine (x:xs) | isAlpha x || isDigit x = x : combine xs | otherwise = ' ' : combine xs -- Combine is defined to change all non-alphanumeric characters into space, -- then we can apply the prelude function words to pick up all alphanumeric -- words. I assummingly take a word with an apostrophy as two words. -- Version 4: A function makePlural is written and placed in the right -- place in the programme to match a word with its plural form in -- order to aviod unnecessary error message. --doSpellCheck :: SpellCheck --doSpellCheck dict text -- = putStr result -- where -- result = presentNicely errors -- errors = [(num, w, noAnswers) | (num, w) <- wordlist, -- (not ((toLowAll w) `elem` smalldict)) && -- (not ((toLowAll w) `elem` pluralSmalldict))] -- where -- toLowAll w = [ toLower x | x <- w ] -- pluralSmalldict = [ makePlural x | x <- smalldict] -- wordlist = prepareWords getWords text makePlural :: MyWord -> MyWord makePlural wd | isSuffix wd suf = changesuffix wd suffixset | otherwise = wd ++ "s" where suf = fst (unzip suffixset) -- This function works simply by comparing the suffix of a word with the -- list containing all the changing rules. If it appears to be in the list, -- then change the suffix accordingly. Otherwise, simply add 's' to it. suffixset :: [(MyWord,MyWord)] suffixset = [("lf","lves"),("y","ies"),("x","xes"),("s","ses")] -- To extend the function, simply add a new tuple to the list containing the -- new changing rule. isSuffix :: MyWord -> [MyWord] -> Bool isSuffix wd [] = False isSuffix wd (x:xs) | reverse (take (length x) (reverse wd)) == x = True | otherwise = isSuffix wd xs -- Find the suffix to a word. changesuffix :: MyWord -> [(MyWord,MyWord)] -> MyWord changesuffix wd [] = [] changesuffix wd ((a,b):xs) | reverse (take (length a) (reverse wd)) == a = reverse (drop (length a) (reverse wd)) ++ b | otherwise = changesuffix wd xs -- Change the suffix to its plural suffix. -- Version 5: Function getLongPrefixes is written to pick up one or more -- closest words of a given word and use it to provide users suggestions -- instead of displaying noAnswer message all the time. If there is no -- matching word exist, output an empty list. --doSpellCheck :: SpellCheck --doSpellCheck dict text -- = putStr result -- where -- result = presentNicely errors -- errors = [(num, w, (getLongPrefixes smalldict w)) | (num, w) <- wordlist, -- (not ((toLowAll w) `elem` smalldict)) && -- (not ((toLowAll w) `elem` pluralSmalldict))] -- where -- toLowAll w = [ toLower x | x <- w ] -- pluralSmalldict = [ makePlural x | x <- smalldict] -- wordlist = prepareWords getWords text getLongPrefixes :: Dictionary -> MyWord -> [MyWord] getLongPrefixes dict [] = [] getLongPrefixes dict [x] = [ y | y <- dict, isPrefix [toLower x] y] getLongPrefixes dict wd | findElem /= [] = findElem | otherwise = getLongPrefixes dict (dropLast lowWd) where findElem = [ y | y <- dict, isPrefix lowWd y] lowWd = toLowAll wd where toLowAll w = [ toLower x | x <- w ] -- The function works by taking every element in the word and check if it -- is the prefix of any words in the dictionary. If it is, output the words -- in the dictionary. If it is not, drop the last character in the word and -- try again, untill it becomes the prefix for some words in the dictionary. isPrefix :: MyWord -> MyWord -> Bool isPrefix [] _ = True isPrefix (x:xs) (y:ys) | length (x:xs) > length (y:ys) = False | x == y = and [True, isPrefix xs ys] | otherwise = False -- Check if a word is the prefix of another word. dropLast :: MyWord -> MyWord dropLast [x] = [] dropLast (x:xs) = x : dropLast (xs) -- Drop the last character in the word. -- Version 6: Function findGoodSuggestions is written to provide users more -- reasonable suggestion by using the minimal insert/delete edit distance -- method. Please be patient to test this version. doSpellCheck :: SpellCheck doSpellCheck dict text = putStr result where result = presentNicely errors errors = [(num, w, (findGoodSuggestions smalldict w)) | (num, w) <- wordlist, (not ((toLowAll w) `elem` smalldict)) && (not ((toLowAll w) `elem` pluralSmalldict))] where toLowAll w = [ toLower x | x <- w ] pluralSmalldict = [ makePlural x | x <- smalldict] wordlist = prepareWords getWords text findGoodSuggestions :: Dictionary -> MyWord -> [MyWord] findGoodSuggestions dict wd = [x | x <- dict, minEditDist x wd == samllestED dict wd] minEditDist :: MyWord -> MyWord -> Int minEditDist wd1 wd2 | sort wd1' == sort wd2 = editDistSameAlpha wd1' wd2 | commonAlpha wd1' wd2 == 0 = (length wd1') + (length wd2) | otherwise = (length wd1' - commonAlpha wd1' wd2) + ((length wd2) - commonAlpha wd1' wd2) + editDistSameAlpha comW1W2 comW2W1 where comW1W2 = getCommonAlpha wd1' wd2 comW2W1 = getCommonAlpha wd2 wd1' wd1' = toLowAll wd1 where toLowAll w = [ toLower x | x <- w ] -- The function works by make two words containing same Characters only, eg. -- "acbd" and "abcd". By doing this, take "amcnt" to "axcyt" as example, I find -- the common characters and calculate how many characters the first word need -- to drop to get that, which is 3 in this case. So the ED so far is 3. Then I -- check how many words I need to add to get the second word, which is 3 in this -- case. So the ED becomes 6. Then we call function editDistSameAlpha to calculate -- the ED of the common characters, and simply add is on. samllestED :: Dictionary -> MyWord -> Int samllestED dict wd = minimum [minEditDist x wd | x <- dict] -- Find the smallest edit distance of a given word within the given dictionary. editDistSameAlpha :: MyWord -> MyWord -> Int editDistSameAlpha wd1 [] = (length wd1) - 2 editDistSameAlpha [] wd1 = 0 editDistSameAlpha (x:xs) (y:ys) | (x:xs) == (y:ys) = 0 | reverse (x:xs) == (y:ys) = 2 * ((length (x:xs)) -1) | length (x:xs) < length (y:ys) = editDistSameAlpha (y:ys) (x:xs) | x == y = editDistSameAlpha ys xs | length (x:xs) == length (y:ys)= 2 + editDistSameAlpha (dropElem (x:xs) y) ys | otherwise = 2 + editDistSameAlpha (dropElem (x:xs) y) ys - abs (length (y:ys) - length (x:xs)) where dropElem :: MyWord -> Char -> MyWord dropElem wrd cha = [x | x <- wrd, x /= cha] -- Check the edit distance of two words with same characters. eg: "abcd" and -- "cbda". If they are exactly the same, ED will be 0. If one is the reverse -- of the other, ED is length of the words minus one and multiply by two, -- since all characters except one are changed. In all other cases, drop the -- character in the first word which equal to the first character in the -- second word, and simply add 2 to ED, then recall the function again. dropOne :: MyWord -> MyWord -> Int dropOne wd1 (y1:y2:ys) | sort wd1 == sort (y2:ys) = editDistSameAlpha wd1 (y2:ys) | otherwise = dropOne wd1 (y1:ys) commonAlpha :: MyWord -> MyWord -> Int commonAlpha wd1 wd2 = min (length [x | x <- wd1, x `elem` wd2]) (length [x | x <- wd2, x `elem` wd1]) -- Check how many characters in common do two words have. getCommonAlpha :: MyWord -> MyWord -> MyWord getCommonAlpha wd1 wd2 = [x | x <- wd1, x `elem` wd2] -- Get the common characters of two words.