minimorph-0.2.1.0/0000755000000000000000000000000013404437726012052 5ustar0000000000000000minimorph-0.2.1.0/minimorph.cabal0000644000000000000000000000716113404437726015045 0ustar0000000000000000name: minimorph -- The package version. See the Haskell package versioning policy (PVP) -- for standards guiding when and how versions should be incremented. -- http://www.haskell.org/haskellwiki/Package_versioning_policy -- PVP summary: +-+------- breaking API changes -- | | +----- minor or non-breaking API additions -- | | | +--- code changes with no API change version: 0.2.1.0 synopsis: English spelling functions with an emphasis on simplicity. description: A set of simplistic functions capturing the more regular parts of English spelling (for generation, not parsing). You will need to complement this with some account for irregular nouns/verbs. This package is not meant to provide anything resembling a full account of English morphology (something like Functional Morphology or sequor could be better suited). The main goal is to provide something cheap and cheerful with no learning curve, that you can use until your application calls for more robustness. See for a simple use case. homepage: https://github.com/Mikolaj/minimorph bug-reports: https://github.com/Mikolaj/minimorph/issues license: BSD3 license-file: LICENSE author: Eric Kow maintainer: Mikolaj Konarski category: Natural Language Processing build-type: Simple cabal-version: >=1.10 source-repository head type: git location: git://github.com/Mikolaj/minimorph.git library exposed-modules: NLP.Minimorph.English NLP.Minimorph.Number NLP.Minimorph.Util build-depends: base < 5 , text default-language: Haskell2010 if impl(ghc >= 8.0) { default-extensions: MonoLocalBinds, ScopedTypeVariables, OverloadedStrings BangPatterns, RecordWildCards, NamedFieldPuns, MultiWayIf, LambdaCase, DefaultSignatures, InstanceSigs, MonadFailDesugaring, StrictData, CPP ghc-options: -Wall -Wcompat -Worphans -Wincomplete-uni-patterns -Wincomplete-record-updates -Wmissing-home-modules -Widentities -Wredundant-constraints ghc-options: -Wall-missed-specialisations ghc-options: -fno-ignore-asserts -fexpose-all-unfoldings -fspecialise-aggressively } test-suite test-minimorph type: exitcode-stdio-1.0 main-is: test-minimorph.hs other-modules: NLP.Minimorph.EnglishTest hs-Source-Dirs: test build-depends: base < 5 , HUnit , minimorph , test-framework , test-framework-hunit , text default-language: Haskell2010 if impl(ghc >= 8.0) { default-extensions: MonoLocalBinds, ScopedTypeVariables, OverloadedStrings BangPatterns, RecordWildCards, NamedFieldPuns, MultiWayIf, LambdaCase, DefaultSignatures, InstanceSigs, MonadFailDesugaring, StrictData, CPP ghc-options: -Wall -Wcompat -Worphans -Wincomplete-uni-patterns -Wincomplete-record-updates -Wmissing-home-modules -Widentities -Wredundant-constraints ghc-options: -Wall-missed-specialisations ghc-options: -fno-ignore-asserts -fexpose-all-unfoldings -fspecialise-aggressively } minimorph-0.2.1.0/LICENSE0000644000000000000000000000311313404437726013055 0ustar0000000000000000Copyright (c) 2012-2015, Computational Linguistics Ltd. 2012-2019, Mikolaj Konarski All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Computational Linguistics Ltd. nor the names of other contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. minimorph-0.2.1.0/Setup.hs0000644000000000000000000000005613404437726013507 0ustar0000000000000000import Distribution.Simple main = defaultMain minimorph-0.2.1.0/NLP/0000755000000000000000000000000013404437726012503 5ustar0000000000000000minimorph-0.2.1.0/NLP/Minimorph/0000755000000000000000000000000013404437726014445 5ustar0000000000000000minimorph-0.2.1.0/NLP/Minimorph/English.hs0000644000000000000000000002100213404437726016365 0ustar0000000000000000{-# LANGUAGE OverloadedStrings, ViewPatterns #-} -- TODO : learn how to use Functional Morphology instead -- |Simple default rules for English morphology module NLP.Minimorph.English where import Data.Char (isSpace, isUpper, toLower) import Data.Text (Text) import qualified Data.Text as T import NLP.Minimorph.Util -- --------------------------------------------------------------------- -- ** Punctuation -- --------------------------------------------------------------------- -- | No Oxford commas, alas. -- -- > commas "and" "foo bar" == "foo and bar" -- > commas "and" "foo, bar, baz" == "foo, bar and baz" commas :: Text -> [Text] -> Text commas _ [] = "" commas _ [x] = x commas et xs = T.intercalate ", " (init xs) <+> et <+> last xs -- --------------------------------------------------------------------- -- ** Numbers -- --------------------------------------------------------------------- -- | > cardinal 0 == "zero" -- > cardinal 1 == "one" -- > cardinal 2 == "two" -- > cardinal 10 == "ten" -- > cardinal 11 == "11" cardinal :: Int -> Text cardinal n = case n of 0 -> "zero" 1 -> "one" 2 -> "two" 3 -> "three" 4 -> "four" 5 -> "five" 6 -> "six" 7 -> "seven" 8 -> "eight" 9 -> "nine" 10 -> "ten" _ -> tshow n -- | > ordinalNotSpelled 1 == "1st" -- > ordinalNotSpelled 2 == "2nd" -- > ordinalNotSpelled 11 == "11th" ordinalNotSpelled :: Int -> Text ordinalNotSpelled k = case abs k `rem` 100 of n | n > 3 && n < 21 -> k `suf` "th" | n `rem` 10 == 1 -> k `suf` "st" | n `rem` 10 == 2 -> k `suf` "nd" | n `rem` 10 == 3 -> k `suf` "rd" | otherwise -> k `suf` "th" where num `suf` s = tshow num <> s -- | > ordinal 1 == "first" -- > ordinal 2 == "second" -- > ordinal 3 == "third" -- > ordinal 11 == "11th" -- > ordinal 42 == "42nd" ordinal :: Int -> Text ordinal n = case n of 0 -> "zeroth" 1 -> "first" 2 -> "second" 3 -> "third" 4 -> "fourth" 5 -> "fifth" 6 -> "sixth" 7 -> "seventh" 8 -> "eighth" 9 -> "ninth" 10 -> "tenth" k -> ordinalNotSpelled k -- --------------------------------------------------------------------- -- ** Nouns and verbs -- --------------------------------------------------------------------- -- | Heuristics for English plural for an unknown noun. -- -- > defaultNounPlural "egg" == "eggs" -- > defaultNounPlural "patch" == "patches" -- > defaultNounPlural "boy" == "boys" -- > defaultNounPlural "spy" == "spies" -- > defaultNounPlural "thesis" == "theses" -- -- http://www.paulnoll.com/Books/Clear-English/English-plurals-1.html -- -- http://en.wikipedia.org/wiki/English_plural defaultNounPlural :: Text -> Text defaultNounPlural x | "is" `T.isSuffixOf` x = thesis | hasSibilantSuffix x = sibilant_o | hasCoSuffix x = sibilant_o | hasCySuffix x = y_final | "f" `T.isSuffixOf` x = f_final | otherwise = plain where plain = x <> "s" sibilant_o = x <> "es" y_final = T.init x <> "ies" f_final = T.init x <> "ves" thesis = tDropEnd 2 x <> "es" -- | Heuristics for 3rd person singular and past participle -- for an unknown regular verb. Doubling of final consonants -- can be handled via a table of (partially) irregular verbs. -- -- > defaultVerbStuff "walk" == ("walks", "walked") -- > defaultVerbStuff "push" == ("pushes", "pushed") -- > defaultVerbStuff "play" == ("plays", "played") -- > defaultVerbStuff "cry" == ("cries", "cried") defaultVerbStuff :: Text -> (Text, Text) defaultVerbStuff x | hasSibilantSuffix x = sibilant_o | hasCoSuffix x = sibilant_o | hasCySuffix x = y_final | "e" `T.isSuffixOf` x = e_final | otherwise = plain where plain = (x <> "s" , x <> "ed") sibilant_o = (x <> "es" , x <> "ed") e_final = (x <> "s" , x <> "d") y_final = (T.init x <> "ies", T.init x <> "ied") -- | Heuristics for a possesive form for an unknown noun. -- -- > defaultPossesive "pass" == "pass'" -- > defaultPossesive "SOS" == "SOS'" -- > defaultPossesive "Mr Blinkin'" == "Mr Blinkin's" -- > defaultPossesive "cry" == "cry's" defaultPossesive :: Text -> Text defaultPossesive t = case T.last t of 's' -> t <> "'" 'S' -> t <> "'" '\'' -> t <> "s" _ -> t <> "'s" -- --------------------------------------------------------------------- -- ** Determiners -- --------------------------------------------------------------------- anNumerals :: [Text] anNumerals = [ "11", "11th", "18", "18th" ] -- | > indefiniteDet "dog" == "a" -- > indefiniteDet "egg" == "an" -- > indefiniteDet "ewe" == "a" -- > indefiniteDet "ewok" == "an" -- > indefiniteDet "8th" == "an" indefiniteDet :: Text -> Text indefiniteDet t = if wantsAn t then "an" else "a" -- | True if the indefinite determiner for a word would normally be -- \'an\' as opposed to \'a\'. wantsAn :: Text -> Bool wantsAn t_ = if startsWithAcronym t_ then acronymWantsAn t_ else useAn0 || useAn1 where t = fst $ T.break isSep $ T.toLower t_ useAn0 = t `elem` anNumerals useAn1 = case T.uncons t of Just (h, "") -> isLetterWithInitialVowelSound h Just ('8',_) -> True Just ('u',_) -> hasVowel_U_Prefix t Just (h, _) -> isVowel h `butNot` hasSemivowelPrefix t Nothing -> False x `butNot` y = x && not y isSep c = isSpace c || c `elem` ("-" :: String) -- | Variant of 'wantsAn' that assumes the input string is pronounced -- one letter at a time. -- -- > wantsAn "x-ray" == False -- > acronymWantsAn "x-ray" == True -- -- Note that this won't do the right thing for words like \"SCUBA\". -- You really have to reserve it for those separate-letter acronyms. acronymWantsAn :: Text -> Bool acronymWantsAn (T.toLower -> t) = useAn0 || useAn1 where useAn0 = t `elem` anNumerals useAn1 = case T.uncons t of Just ('8',_) -> True Just (h,_) -> isLetterWithInitialVowelSound h Nothing -> False -- --------------------------------------------------------------------- -- ** Acronyms -- --------------------------------------------------------------------- -- | True if all upper case from second letter and up. -- -- > looksLikeAcronym "DNA" == True -- > looksLikeAcronym "tRNA" == True -- > looksLikeAcronym "x" == False -- > looksLikeAcronym "DnA" == False looksLikeAcronym :: Text -> Bool looksLikeAcronym "" = False looksLikeAcronym x = T.all isUpper (if T.length x > 1 then T.drop 1 x else x) -- | True if the first word (separating on either hyphen or space) -- looks like an acronym. startsWithAcronym :: Text -> Bool startsWithAcronym = looksLikeAcronym . firstWord where firstWord = fst . T.break isSep isSep c = isSpace c || c `elem` ("-" :: String) -- --------------------------------------------------------------------- -- ** Sounds -- --------------------------------------------------------------------- -- | Ends with a \'sh\' sound. hasSibilantSuffix :: Text -> Bool hasSibilantSuffix x = any (`T.isSuffixOf` x) ["x","s","ch","sh","z","j"] -- | Starts with a semivowel. hasSemivowelPrefix :: Text -> Bool hasSemivowelPrefix ls = any (`T.isPrefixOf` ls) ["y","w","eu","ewe"] -- | Starts with a vowel-y \'U\' sound hasVowel_U_Prefix :: Text -> Bool hasVowel_U_Prefix t = case T.unpack t of ['u'] -> False ['u',_] -> True ('u':c:v:_) -> not (isConsonant c && isVowel v) _ -> False -- | Last two letters are a consonant and \'y\'. hasCySuffix :: Text -> Bool hasCySuffix (T.unpack . tTakeEnd 2 -> [x, 'y']) = isConsonant x hasCySuffix _ = False -- | Last two letters are a consonant and \'o\'. hasCoSuffix :: Text -> Bool hasCoSuffix (T.unpack . tTakeEnd 2 -> [x, 'o']) = isConsonant x hasCoSuffix _ = False -- | Is a vowel. isVowel :: Char -> Bool isVowel = (`elem` ("aeiou" :: String)) . toLower -- | Letters that when pronounced independently in English sound like they -- begin with vowels. -- -- > isLetterWithInitialVowelSound 'r' == True -- > isLetterWithInitialVowelSound 'k' == False -- -- (In the above, \'r\' is pronounced \"are\", but \'k\' is pronounced -- \"kay\".) isLetterWithInitialVowelSound :: Char -> Bool isLetterWithInitialVowelSound = (`elem` ("aeiofhlmnrsx" :: String)) . toLower -- | Is a consonant. isConsonant :: Char -> Bool isConsonant = not . isVowel minimorph-0.2.1.0/NLP/Minimorph/Number.hs0000644000000000000000000000041613404437726016232 0ustar0000000000000000module NLP.Minimorph.Number where -- | Singular and Plural. data SingPlu a = SP { sg :: a , pl :: a } deriving (Show, Eq) data Number = Singular | Plural deriving (Eq, Show) fromSP :: Number -> SingPlu a -> a fromSP Singular = sg fromSP Plural = pl minimorph-0.2.1.0/NLP/Minimorph/Util.hs0000644000000000000000000000147513404437726015725 0ustar0000000000000000{-# LANGUAGE OverloadedStrings #-} -- | Text utility functions. module NLP.Minimorph.Util ( tTakeEnd, tDropEnd, (<>), (<+>), tshow ) where import Data.Monoid ((<>)) import Data.Text (Text) import qualified Data.Text as T -- | @tTakeEnd n t@ returns the last @n@ letters of @t@. tTakeEnd :: Int -> Text -> Text tTakeEnd n t = T.drop (T.length t - n) t -- | @tDropEnd n t@ drops the last @n@ letters of @t@. tDropEnd :: Int -> Text -> Text tDropEnd n x = T.take (T.length x - n) x infixr 6 <+> -- matches Monoid.<> -- | Separated by space unless one of them is empty (in which case just -- the non-empty one). (<+>) :: Text -> Text -> Text t1 <+> t2 | T.null t1 = t2 | T.null t2 = t1 | otherwise = t1 <> " " <> t2 -- | Show a value in `Text` format. tshow :: Show a => a -> Text tshow = T.pack . show minimorph-0.2.1.0/test/0000755000000000000000000000000013404437726013031 5ustar0000000000000000minimorph-0.2.1.0/test/test-minimorph.hs0000644000000000000000000000023013404437726016337 0ustar0000000000000000import Test.Framework import qualified NLP.Minimorph.EnglishTest main :: IO () main = defaultMain [ NLP.Minimorph.EnglishTest.suite ] minimorph-0.2.1.0/test/NLP/0000755000000000000000000000000013404437726013462 5ustar0000000000000000minimorph-0.2.1.0/test/NLP/Minimorph/0000755000000000000000000000000013404437726015424 5ustar0000000000000000minimorph-0.2.1.0/test/NLP/Minimorph/EnglishTest.hs0000644000000000000000000001140713404437726020214 0ustar0000000000000000{-# LANGUAGE OverloadedStrings #-} module NLP.Minimorph.EnglishTest where import Data.Text (Text) import qualified Data.Text as T import Test.Framework import Test.Framework.Providers.HUnit import Test.HUnit import NLP.Minimorph.English suite :: Test.Framework.Test suite = testGroup "NLP.Minimorph.English" [ t_defaultVerbStuff , t_defaultNounPlural , t_indefiniteDet , t_acronymWantsAn , t_ordinal , t_commas ] -- ---------------------------------------------------------------------- -- tests -- ---------------------------------------------------------------------- t_defaultVerbStuff :: Test.Framework.Test t_defaultVerbStuff = testGroup "defaultVerbStuff" (map tc verbs) where tc (pl, sg3, pastP) = testCase summary $ assertEqual summary (sg3, pastP) (defaultVerbStuff pl) where summary = T.unpack $ T.concat [ pl, " (", sg3, ", ", pastP, ")" ] t_defaultNounPlural :: Test.Framework.Test t_defaultNounPlural = testGroup "defaultNounPlural" (map tc nouns) where tc (sg, pl) = testCase summary $ assertEqual summary pl (defaultNounPlural sg) where summary = T.unpack $ T.concat [sg, " (", pl, ")"] t_indefiniteDet :: Test.Framework.Test t_indefiniteDet = testGroup "indefiniteDet" [ tc "eu" "a" "eukaryote" , tc "eu" "a" "Eukaryote" , tc "ewe" "a" "ewe" -- google 33k 'a ewe' vs 9k 'an ewe' , tc "ewok" "an" "ewok" , tc "ewok" "an" "Ewok" , tc "7th" "a" "7th" , tc "8th" "an" "8th" , tc "18th" "an" "18th" , tc "xylophone" "a" "xylophone" , tc "universe" "a" "universe" , tc "urge" "an" "urge" , tc "user" "a" "user" , tc "usher" "an" "usher" , tc "xylophone" "a" "xylophone" , tc "u-turn" "a" "u-turn" , tc "u" "a" "u" , tc "x-ray" "an" "x-ray" , tc "g-ray" "a" "g-ray" , tc "y-chromo" "a" "y-chromosome" , tc "x-chromo" "an" "x-chromosome" , tc "x chromo" "an" "x chromosome" , tc "18-fold" "an" "18-fold" , tc "-fold ending" "a" "-fold ending" , tc "'-fold' ending" "a" "'-fold' ending" , tc "\"-fold\" ending" "a" "\"-fold\" ending" , tc "mvp award" "an" "MVP award" , tc "UUCP user" "a" "UUCP user" ] where tc msg res inp = testCase summary $ assertEqual summary res (indefiniteDet inp) where summary = msg ++ " (" ++ T.unpack (T.unwords [res, inp]) ++ ")" t_acronymWantsAn :: Test.Framework.Test t_acronymWantsAn = testGroup "acronymWantsAn" [ tc "rgb" True "rgb" , tc "kml" False "kml" , tc "ac" True "ac" , tc "dc" False "dc" ] where tc msg res inp = testCase summary $ assertEqual summary res (acronymWantsAn inp) where summary = msg ++ " (" ++ T.unpack inp ++ ")" t_ordinal :: Test.Framework.Test t_ordinal = testGroup "ordinal" [ tc "12th" 12 , tc "42nd" 42 , tc "44th" 44 , tc "41st" 41 , tc "-3rd" (-3) ] where tc res inp = testCase (show inp ++ " => " ++ T.unpack res) $ assertEqual "" res (ordinal inp) t_commas :: Test.Framework.Test t_commas = testGroup "commas" [ tc "foo" ["foo"] , tc "foo and bar" ["foo","bar"] , tc "foo, bar and baz" ["foo","bar","baz"] , tc "foo, bar, baz and quux" ["foo","bar","baz","quux"] ] where tc res xs = testCase (show (length xs) ++ ": " ++ T.unpack res) $ assertEqual "" res (commas "and" xs) -- ---------------------------------------------------------------------- -- lexicon -- ---------------------------------------------------------------------- nouns :: [(Text,Text)] nouns = [ noun "star" "stars" , noun "egg" "eggs" , noun "patch" "patches" , noun "boy" "boys" , noun "spy" "spies" , noun "thesis" "theses" , noun "elf" "elves" , noun "ace" "aces" ] where noun s p = (s,p) detNouns :: [(Text,Text)] detNouns = [ noun "box" "boxes" , noun "cat" "cats" , noun "dog" "dogs" , noun "ant" "ants" , noun "egg" "eggs" ] where noun s p = (s,p) verbs :: [(Text,Text,Text)] verbs = [ verb "walk" "walks" "walked" , verb "push" "pushes" "pushed" , verb "pass" "passes" "passed" , verb "abuse" "abuses" "abused" , verb "banjo" "banjoes" "banjoed" , verb "play" "plays" "played" , verb "cry" "cries" "cried" , verb "goto" "gotoes" "gotoed" , verb "boo" "boos" "booed" ] where verb x y z = (x, y, z)