Last active
July 8, 2024 07:29
-
-
Save gaulinmp/7107e3bac5ea94af6c9d to your computer and use it in GitHub Desktop.
Singularize function from patterns codebase
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#### SINGULARIZE ######################################################### | |
# Adapted from Bermi Ferrer's Inflector for Python: | |
# http://www.bermi.org/inflector/ | |
# Copyright (c) 2006 Bermi Ferrer Martinez | |
# Permission is hereby granted, free of charge, to any person obtaining a copy | |
# of this software to deal in this software without restriction, including | |
# without limitation the rights to use, copy, modify, merge, publish, | |
# distribute, sublicense, and/or sell copies of this software, and to permit | |
# persons to whom this software is furnished to do so, subject to the following | |
# condition: | |
# | |
# THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
# OUT OF OR IN CONNECTION WITH THIS SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
# THIS SOFTWARE. | |
import re

# Ordered (pattern, replacement) pairs used by singularize(): rules are tried
# top to bottom and the first pattern that matches is applied, so specific
# suffixes must stay ahead of the more generic ones below them.
_singular_rules = [
    (r'(?i)(.)ae$', '\\1a'),
    (r'(?i)(.)itis$', '\\1itis'),
    (r'(?i)(.)eaux$', '\\1eau'),
    (r'(?i)(quiz)zes$', '\\1'),
    (r'(?i)(matr)ices$', '\\1ix'),
    (r'(?i)(ap|vert|ind)ices$', '\\1ex'),
    (r'(?i)^(ox)en', '\\1'),
    (r'(?i)(alias|status)es$', '\\1'),
    # NOTE(review): "[octop|vir]" is a character class (any one of those
    # characters), not an alternation. It is left untouched because words
    # such as "cacti" or "radii" currently rely on it.
    (r'(?i)([octop|vir])i$', '\\1us'),
    (r'(?i)(cris|ax|test)es$', '\\1is'),
    (r'(?i)(shoe)s$', '\\1'),
    (r'(?i)(o)es$', '\\1'),
    (r'(?i)(bus)es$', '\\1'),
    # NOTE(review): "[m|l]" is likewise a character class that also accepts
    # a literal "|".
    (r'(?i)([m|l])ice$', '\\1ouse'),
    (r'(?i)(x|ch|ss|sh)es$', '\\1'),
    (r'(?i)(m)ovies$', '\\1ovie'),
    (r'(?i)(.)ombies$', '\\1ombie'),
    (r'(?i)(s)eries$', '\\1eries'),
    (r'(?i)([^aeiouy]|qu)ies$', '\\1y'),
    # -f, -fe sometimes take -ves in the plural
    # (e.g., lives, wolves).
    (r"([aeo]l)ves$", "\\1f"),
    (r"([^d]ea)ves$", "\\1f"),
    (r"arves$", "arf"),
    (r"erves$", "erve"),
    (r"([nlw]i)ves$", "\\1fe"),
    (r'(?i)([lr])ves$', '\\1f'),
    (r"([aeo])ves$", "\\1ve"),
    (r'(?i)(sive)s$', '\\1'),
    (r'(?i)(tive)s$', '\\1'),
    (r'(?i)(hive)s$', '\\1'),
    (r'(?i)([^f])ves$', '\\1fe'),
    # -ses suffixes.
    (r'(?i)(^analy)ses$', '\\1sis'),
    # Only group 1 is re-emitted. Also emitting the nested letter group
    # duplicated a letter whenever "(a)naly" matched away from the start of
    # the word, e.g. "psychoanalyses" -> "psychoanalyasis".
    (r'(?i)((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$',
     '\\1sis'),
    (r'(?i)(.)opses$', '\\1opsis'),
    (r'(?i)(.)yses$', '\\1ysis'),
    (r'(?i)(h|d|r|o|n|b|cl|p)oses$', '\\1ose'),
    (r'(?i)(fruct|gluc|galact|lact|ket|malt|rib|sacchar|cellul)ose$',
     '\\1ose'),
    (r'(?i)(.)oses$', '\\1osis'),
    # -a
    (r'(?i)([ti])a$', '\\1um'),
    (r'(?i)(n)ews$', '\\1ews'),
    (r'(?i)([^s])s$', '\\1'),  # don't make ss singularize to s.
]
# For performance, compile the regular expressions only once:
_singular_rules = [
    (re.compile(pattern), replacement)
    for pattern, replacement in _singular_rules
]
# Words whose singular and plural forms are spelled the same; singularize()
# consults this set before applying any rule.
_singular_uninflected = {
    "bison", "debris", "headquarters", "pincers", "trout",
    "bream", "diabetes", "herpes", "pliers", "tuna",
    "breeches", "djinn", "high-jinks", "proceedings", "whiting",
    # A missing comma after "wildebeest" used to fuse it with "carp" into
    # the single bogus entry "wildebeestcarp".
    "britches", "eland", "homework", "rabies", "wildebeest",
    "carp", "elk", "innings", "salmon",
    "chassis", "flounder", "jackanapes", "scissors",
    "christmas", "gallows", "mackerel", "series",
    "clippers", "georgia", "measles", "shears",
    "cod", "graffiti", "mews", "species",
    "contretemps", "mumps", "swine",
    "corps", "news", "swiss",
    # Custom added from MD&A corpus
    "api", "mae", "sae", "basis", "india", "media",
}
# Mass nouns that have no plural; singularize() consults this set before
# applying any rule.
_singular_uncountable = {
    "advice", "equipment", "happiness", "luggage", "news", "software",
    "bread", "fruit", "information", "mathematics", "progress", "understanding",
    # A missing comma after "water" used to fuse it with "cheese" into the
    # single bogus entry "watercheese".
    "butter", "furniture", "ketchup", "mayonnaise", "research", "water",
    "cheese", "garbage", "knowledge", "meat", "rice",
    "electricity", "gravel", "love", "mustard", "sand",
}
# Singular nouns ending in "-ie"; singularize() tests each entry plus "s"
# against the end of the word so these are not turned into "-y".
# NOTE(review): "^pie" and "^tie" look like regex-style whole-word anchors;
# confirm that the consumer interprets the caret rather than comparing it
# literally.
# NOTE(review): "alergie" may be a misspelling; left as found.
_singular_ie = {
    "alergie", "cutie", "hoagie", "newbie", "softie", "veggie",
    "auntie", "doggie", "hottie", "nightie", "sortie", "weenie",
    "beanie", "eyrie", "indie", "oldie", "stoolie", "yuppie",
    # A missing comma after "zombie" used to fuse it with "bogie" into the
    # single bogus entry "zombiebogie".
    "birdie", "freebie", "junkie", "^pie", "sweetie", "zombie",
    "bogie", "goonie", "laddie", "pixie", "techie",
    "bombie", "groupie", "laramie", "quickie", "^tie",
    "collie", "hankie", "lingerie", "reverie", "toughie",
    "cookie", "hippie", "meanie", "rookie", "valkyrie",
}
# Irregular plural -> singular. singularize() matches each key against the
# END of the lower-cased word, so compounds ending in a key are covered too.
_singular_irregular = {
    "abuses": "abuse",
    "ads": "ad",
    "atlantes": "atlas",
    "atlases": "atlas",
    # Identity entry: keeps "analysis" from being rewritten by a later rule.
    "analysis": "analysis",
    "axes": "axe",
    "beeves": "beef",
    "brethren": "brother",
    # The "children" key was listed twice; one copy is enough.
    "children": "child",
    "corpora": "corpus",
    "corpuses": "corpus",
    "ephemerides": "ephemeris",
    "feet": "foot",
    "ganglia": "ganglion",
    "geese": "goose",
    "genera": "genus",
    "genii": "genie",
    "graffiti": "graffito",
    "helves": "helve",
    "kine": "cow",
    "leaves": "leaf",
    "loaves": "loaf",
    "men": "man",
    "mongooses": "mongoose",
    "monies": "money",
    "moves": "move",
    "mythoi": "mythos",
    "numena": "numen",
    "occipita": "occiput",
    "octopodes": "octopus",
    "opera": "opus",
    "opuses": "opus",
    "our": "my",
    "oxen": "ox",
    "penes": "penis",
    "penises": "penis",
    "people": "person",
    "sexes": "sex",
    "soliloquies": "soliloquy",
    "teeth": "tooth",
    "testes": "testis",
    "trilbys": "trilby",
    "turves": "turf",
    "zoa": "zoon",
}
# Prepositions that may follow the head noun of a hyphenated compound
# (e.g. "mothers-in-law"); singularize() checks the second hyphen-separated
# part of a word against this set.
_plural_prepositions = {
    "about", "above", "across", "after", "among", "around", "at", "athwart",
    "before", "behind", "below", "beneath", "beside", "besides", "between",
    "betwixt", "beyond", "but", "by",
    "during",
    "except",
    "for", "from",
    "in", "into",
    "near",
    "of", "off", "on", "onto", "out", "over",
    "since",
    "till", "to",
    "under", "until", "unto", "upon",
    "with",
}
def singularize(word, custom=None):
    """Return the singular of a given word.

    Stages are tried in this order and the first one that applies wins:
    ``custom`` overrides, hyphenated compounds, trailing-apostrophe
    possessives, uninflected/uncountable words, the ``-ie`` word list, the
    irregular table and finally the ordered regex rules.

    Args:
        word: The word to singularize.
        custom: Optional mapping from an exact word to its singular. It is
            checked before anything else and handed on to recursive calls.

    Returns:
        The singular form, or ``word`` unchanged when no stage applies.
    """
    if custom is None:
        custom = {}
    if word in custom:
        return custom[word]
    # Recurse compound words (e.g. mothers-in-law): only the head noun is
    # singularized. split() on a present separator always yields >= 2 parts.
    if "-" in word:
        parts = word.split("-")
        if parts[1] in _plural_prepositions:
            return singularize(parts[0], custom) + "-" + "-".join(parts[1:])
    # dogs' => dog's
    if word.endswith("'"):
        return singularize(word[:-1], custom) + "'s"
    lowered = word.lower()
    # NOTE(review): this checks whether a table entry ends with the word,
    # not the other way round, so any suffix of an entry (e.g. "ears" from
    # "shears") is returned unchanged. Kept as found because the intended
    # direction is unclear -- confirm before changing.
    if any(entry.endswith(lowered) for entry in _singular_uninflected):
        return word
    if any(entry.endswith(lowered) for entry in _singular_uncountable):
        return word
    for stem in _singular_ie:
        if stem.startswith("^"):
            # "^pie" / "^tie" are whole-word entries: "pies" -> "pie", while
            # "therapies" and "cities" must still reach the "-ies" rule.
            matched = lowered == stem[1:] + "s"
        else:
            matched = lowered.endswith(stem + "s")
        if matched:
            # Drop only the plural "s" so prefix and casing are preserved
            # (this branch used to hand back the lower-cased plural itself).
            return word[:-1]
    for plural, singular in _singular_irregular.items():
        if lowered.endswith(plural):
            return re.sub('(?i)' + plural + '$', singular, word)
    for pattern, replacement in _singular_rules:
        match = pattern.search(word)
        if match is None:
            continue
        # Strip back-references to groups that did not participate in the
        # match so they are never substituted.
        for index, group in enumerate(match.groups(), 1):
            if group is None:
                replacement = replacement.replace('\\' + str(index), '')
        return pattern.sub(replacement, word)
    return word
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment