# gaulinmp/singularize.py

## singularize.py
#### SINGULARIZE #########################################################
# Adapted from Bermi Ferrer's Inflector for Python:
# http://www.bermi.org/inflector/

# Copyright (c) 2006 Bermi Ferrer Martinez
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software to deal in this software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of this software, and to permit
# persons to whom this software is furnished to do so, subject to the following
# condition:
#
# THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THIS SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THIS SOFTWARE.

_singular_rules = [
    (r'(?i)(.)ae$', '\\1a'),
    (r'(?i)(.)itis$', '\\1itis'),
    (r'(?i)(.)eaux$', '\\1eau'),
    (r'(?i)(quiz)zes$', '\\1'),
    (r'(?i)(matr)ices$', '\\1ix'),
    (r'(?i)(ap|vert|ind)ices$', '\\1ex'),
    (r'(?i)^(ox)en', '\\1'),
    (r'(?i)(alias|status)es$', '\\1'),
    (r'(?i)([octop|vir])i$',  '\\1us'),
    (r'(?i)(cris|ax|test)es$', '\\1is'),
    (r'(?i)(shoe)s$', '\\1'),
    (r'(?i)(o)es$', '\\1'),
    (r'(?i)(bus)es$', '\\1'),
    (r'(?i)([m|l])ice$', '\\1ouse'),
    (r'(?i)(x|ch|ss|sh)es$', '\\1'),
    (r'(?i)(m)ovies$', '\\1ovie'),
    (r'(?i)(.)ombies$', '\\1ombie'),
    (r'(?i)(s)eries$', '\\1eries'),
    (r'(?i)([^aeiouy]|qu)ies$', '\\1y'),
    # -f, -fe sometimes take -ves in the plural
    # (e.g., lives, wolves).
    (r"([aeo]l)ves$", "\\1f"),
    (r"([^d]ea)ves$", "\\1f"),
    (r"arves$", "arf"),
    (r"erves$", "erve"),
    (r"([nlw]i)ves$", "\\1fe"),
    (r'(?i)([lr])ves$', '\\1f'),
    (r"([aeo])ves$", "\\1ve"),
    (r'(?i)(sive)s$', '\\1'),
    (r'(?i)(tive)s$', '\\1'),
    (r'(?i)(hive)s$', '\\1'),
    (r'(?i)([^f])ves$', '\\1fe'),
    # -ses suffixes.
    (r'(?i)(^analy)ses$', '\\1sis'),
    (r'(?i)((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$',
     '\\1\\2sis'),
    (r'(?i)(.)opses$', '\\1opsis'),
    (r'(?i)(.)yses$', '\\1ysis'),
    (r'(?i)(h|d|r|o|n|b|cl|p)oses$', '\\1ose'),
    (r'(?i)(fruct|gluc|galact|lact|ket|malt|rib|sacchar|cellul)ose$',
     '\\1ose'),
    (r'(?i)(.)oses$', '\\1osis'),
    # -a
    (r'(?i)([ti])a$', '\\1um'),
    (r'(?i)(n)ews$', '\\1ews'),
    (r'(?i)([^s])s$', '\\1'),  # don't make ss singularize to s.
]

# For performance, compile the regular expressions only once:
_singular_rules = [(re.compile(r[0]), r[1]) for r in _singular_rules]

_singular_uninflected = set((
    "bison", "debris", "headquarters", "pincers", "trout",
    "bream", "diabetes", "herpes", "pliers", "tuna",
    "breeches", "djinn", "high-jinks", "proceedings", "whiting",
    "britches", "eland", "homework", "rabies", "wildebeest"
    "carp", "elk", "innings", "salmon",
    "chassis", "flounder", "jackanapes", "scissors",
    "christmas", "gallows", "mackerel", "series",
    "clippers", "georgia", "measles", "shears",
    "cod", "graffiti", "mews", "species",
    "contretemps", "mumps", "swine",
    "corps", "news", "swiss",
    # Custom added from MD&A corpus
    "api", "mae", "sae", "basis", "india", "media",
))
_singular_uncountable = set((
    "advice", "equipment", "happiness", "luggage", "news", "software",
    "bread", "fruit", "information", "mathematics", "progress", "understanding",
    "butter", "furniture", "ketchup", "mayonnaise", "research", "water"
    "cheese", "garbage", "knowledge", "meat", "rice",
    "electricity", "gravel", "love", "mustard", "sand",
))
_singular_ie = set((
    "alergie", "cutie", "hoagie", "newbie", "softie", "veggie",
    "auntie", "doggie", "hottie", "nightie", "sortie", "weenie",
    "beanie", "eyrie", "indie", "oldie", "stoolie", "yuppie",
    "birdie", "freebie", "junkie", "^pie", "sweetie", "zombie"
    "bogie", "goonie", "laddie", "pixie", "techie",
    "bombie", "groupie", "laramie", "quickie", "^tie",
    "collie", "hankie", "lingerie", "reverie", "toughie",
    "cookie", "hippie", "meanie", "rookie", "valkyrie",
))
_singular_irregular = {
    "abuses": "abuse",
    "ads": "ad",
    "atlantes": "atlas",
    "atlases": "atlas",
    "analysis": "analysis",
    "axes": "axe",
    "beeves": "beef",
    "brethren": "brother",
    "children": "child",
    "children": "child",
    "corpora": "corpus",
    "corpuses": "corpus",
    "ephemerides": "ephemeris",
    "feet": "foot",
    "ganglia": "ganglion",
    "geese": "goose",
    "genera": "genus",
    "genii": "genie",
    "graffiti": "graffito",
    "helves": "helve",
    "kine": "cow",
    "leaves": "leaf",
    "loaves": "loaf",
    "men": "man",
    "mongooses": "mongoose",
    "monies": "money",
    "moves": "move",
    "mythoi": "mythos",
    "numena": "numen",
    "occipita": "occiput",
    "octopodes": "octopus",
    "opera": "opus",
    "opuses": "opus",
    "our": "my",
    "oxen": "ox",
    "penes": "penis",
    "penises": "penis",
    "people": "person",
    "sexes": "sex",
    "soliloquies": "soliloquy",
    "teeth": "tooth",
    "testes": "testis",
    "trilbys": "trilby",
    "turves": "turf",
    "zoa": "zoon",
}

_plural_prepositions = set((
    "about", "before", "during", "of", "till",
    "above", "behind", "except", "off", "to",
    "across", "below", "for", "on", "under",
    "after", "beneath", "from", "onto", "until",
    "among", "beside", "in", "out", "unto",
    "around", "besides", "into", "over", "upon",
    "at", "between", "near", "since", "with",
    "athwart", "betwixt", "beyond", "but", "by"
))


def singularize(word, custom=None):
    """Return the singular form of ``word``.

    Lookup order (the first hit wins):

    1. ``custom`` -- optional mapping of exact plural -> singular overrides.
    2. Hyphenated compounds whose second part is a preposition
       (``mothers-in-law``): only the first part is singularized.
    3. Plural possessives ending in an apostrophe (``dogs'`` -> ``dog's``).
    4. Uninflected and uncountable nouns: returned unchanged.
    5. Nouns whose singular ends in ``-ie`` (``cookies`` -> ``cookie``).
    6. Irregular plurals, matched as a case-insensitive suffix
       (``women`` -> ``woman``).
    7. The ordered regular-expression rules in ``_singular_rules``.

    A word that matches nothing is returned unchanged.

    :param word: the (possibly plural) word to singularize.
    :param custom: optional mapping consulted before any built-in rule.
    :return: the singular form as a string.
    """
    if custom is not None and word in custom:
        return custom[word]
    # Recurse compound words (e.g. mothers-in-law).
    if "-" in word:
        parts = word.split("-")
        if len(parts) > 1 and parts[1] in _plural_prepositions:
            return singularize(parts[0], custom) + "-" + "-".join(parts[1:])
    # dogs' => dog's
    if word.endswith("'"):
        return singularize(word[:-1], custom) + "'s"
    lowered = word.lower()
    # Exact membership. The previous test, ``entry.endswith(lowered)``, had
    # its operands reversed: any word that happened to be the tail of an
    # entry ("quarters" in "headquarters", "ears" in "shears") was treated
    # as uninflected and returned still plural.
    if lowered in _singular_uninflected or lowered in _singular_uncountable:
        return word
    for stem in _singular_ie:
        if stem.startswith("^"):
            # "^pie" / "^tie": the anchor means the whole word must match.
            hit = lowered == stem[1:] + "s"
        else:
            hit = lowered.endswith(stem + "s")
        if hit:
            # Drop only the plural "s" and keep the caller's casing (this
            # branch used to return the lower-cased plural unchanged).
            return word[:-1]
    # NOTE(review): suffix matching also fires inside unrelated words
    # (e.g. "hour" ends with the key "our") -- confirm whether exact
    # matching is wanted for some keys.
    for plural, singular in _singular_irregular.items():
        if lowered.endswith(plural):
            return word[:len(word) - len(plural)] + singular
    for pattern, replacement in _singular_rules:
        match = pattern.search(word)
        if match is None:
            continue
        # Strip back-references to groups that did not take part in the
        # match so they are never substituted.
        for number, captured in enumerate(match.groups(), start=1):
            if captured is None:
                replacement = replacement.replace("\\" + str(number), "")
        return pattern.sub(replacement, word)
    return word
	#### SINGULARIZE #########################################################
	# Adapted from Bermi Ferrer's Inflector for Python:
	# http://www.bermi.org/inflector/

	# Copyright (c) 2006 Bermi Ferrer Martinez
	# Permission is hereby granted, free of charge, to any person obtaining a copy
	# of this software to deal in this software without restriction, including
	# without limitation the rights to use, copy, modify, merge, publish,
	# distribute, sublicense, and/or sell copies of this software, and to permit
	# persons to whom this software is furnished to do so, subject to the following
	# condition:
	#
	# THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	# OUT OF OR IN CONNECTION WITH THIS SOFTWARE OR THE USE OR OTHER DEALINGS IN
	# THIS SOFTWARE.

	import re

	# Ordered (pattern, replacement) pairs. singularize() uses the first pattern
	# that matches, so specific rules have to stay ahead of the general ones.
	# The alternations below use a plain "|"; the escaped form "\|" matches a
	# literal pipe character and disabled every alternative.
	_singular_rules = [
	    (r'(?i)(.)ae$', '\\1a'),
	    (r'(?i)(.)itis$', '\\1itis'),
	    (r'(?i)(.)eaux$', '\\1eau'),
	    (r'(?i)(quiz)zes$', '\\1'),
	    (r'(?i)(matr)ices$', '\\1ix'),
	    (r'(?i)(ap|vert|ind)ices$', '\\1ex'),
	    (r'(?i)^(ox)en', '\\1'),
	    (r'(?i)(alias|status)es$', '\\1'),
	    # NOTE(review): "[octop|vir]" is a character class, not an alternation,
	    # so this fires on any of c, i, o, p, r, t, v or "|" before the final
	    # "i" (e.g. "safari" -> "safarus"). Kept because "cacti", "radii" and
	    # "foci" currently depend on it -- confirm before changing.
	    (r'(?i)([octop|vir])i$', '\\1us'),
	    (r'(?i)(cris|ax|test)es$', '\\1is'),
	    (r'(?i)(shoe)s$', '\\1'),
	    (r'(?i)(o)es$', '\\1'),
	    (r'(?i)(bus)es$', '\\1'),
	    # Was "[m\|l]": inside a character class the pipe is a literal character.
	    (r'(?i)([ml])ice$', '\\1ouse'),
	    (r'(?i)(x|ch|ss|sh)es$', '\\1'),
	    (r'(?i)(m)ovies$', '\\1ovie'),
	    (r'(?i)(.)ombies$', '\\1ombie'),
	    (r'(?i)(s)eries$', '\\1eries'),
	    (r'(?i)([^aeiouy]|qu)ies$', '\\1y'),
	    # -f, -fe sometimes take -ves in the plural
	    # (e.g., lives, wolves).
	    (r"([aeo]l)ves$", "\\1f"),
	    (r"([^d]ea)ves$", "\\1f"),
	    (r"arves$", "arf"),
	    (r"erves$", "erve"),
	    (r"([nlw]i)ves$", "\\1fe"),
	    (r'(?i)([lr])ves$', '\\1f'),
	    (r"([aeo])ves$", "\\1ve"),
	    (r'(?i)(sive)s$', '\\1'),
	    (r'(?i)(tive)s$', '\\1'),
	    (r'(?i)(hive)s$', '\\1'),
	    (r'(?i)([^f])ves$', '\\1fe'),
	    # -ses suffixes.
	    (r'(?i)(^analy)ses$', '\\1sis'),
	    # Group 1 already holds the whole stem; the old '\\1\\2sis' replacement
	    # re-inserted the nested "(a)" capture ("psychoanalyses" came out as
	    # "psychoanalyasis").
	    (r'(?i)((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$',
	     '\\1sis'),
	    (r'(?i)(.)opses$', '\\1opsis'),
	    (r'(?i)(.)yses$', '\\1ysis'),
	    (r'(?i)(h|d|r|o|n|b|cl|p)oses$', '\\1ose'),
	    (r'(?i)(fruct|gluc|galact|lact|ket|malt|rib|sacchar|cellul)ose$',
	     '\\1ose'),
	    (r'(?i)(.)oses$', '\\1osis'),
	    # -a
	    (r'(?i)([ti])a$', '\\1um'),
	    (r'(?i)(n)ews$', '\\1ews'),
	    (r'(?i)([^s])s$', '\\1'),  # don't make ss singularize to s.
	]

	# For performance, compile the regular expressions only once:
	_singular_rules = [
	    (re.compile(pattern), replacement)
	    for pattern, replacement in _singular_rules
	]

	# Nouns with identical singular and plural forms; singularize() returns
	# them unchanged. Every row needs its trailing comma: adjacent string
	# literals are silently concatenated ("wildebeest" "carp" used to become
	# the single entry "wildebeestcarp").
	_singular_uninflected = {
	    "bison", "debris", "headquarters", "pincers", "trout",
	    "bream", "diabetes", "herpes", "pliers", "tuna",
	    "breeches", "djinn", "high-jinks", "proceedings", "whiting",
	    "britches", "eland", "homework", "rabies", "wildebeest",
	    "carp", "elk", "innings", "salmon",
	    "chassis", "flounder", "jackanapes", "scissors",
	    "christmas", "gallows", "mackerel", "series",
	    "clippers", "georgia", "measles", "shears",
	    "cod", "graffiti", "mews", "species",
	    "contretemps", "mumps", "swine",
	    "corps", "news", "swiss",
	    # Custom added from MD&A corpus
	    "api", "mae", "sae", "basis", "india", "media",
	}

	# Mass nouns; singularize() returns them unchanged as well.
	_singular_uncountable = {
	    "advice", "equipment", "happiness", "luggage", "news", "software",
	    "bread", "fruit", "information", "mathematics", "progress", "understanding",
	    "butter", "furniture", "ketchup", "mayonnaise", "research", "water",
	    "cheese", "garbage", "knowledge", "meat", "rice",
	    "electricity", "gravel", "love", "mustard", "sand",
	}

	# Nouns whose singular ends in -ie. singularize() special-cases plurals
	# built from them so the generic "-ies" -> "-y" rule is not applied.
	# "^pie" and "^tie" carry a leading "^" (anchor-style marker) and are kept
	# verbatim.
	_singular_ie = {
	    "alergie", "cutie", "hoagie", "newbie", "softie", "veggie",
	    "auntie", "doggie", "hottie", "nightie", "sortie", "weenie",
	    "beanie", "eyrie", "indie", "oldie", "stoolie", "yuppie",
	    "birdie", "freebie", "junkie", "^pie", "sweetie", "zombie",
	    "bogie", "goonie", "laddie", "pixie", "techie",
	    "bombie", "groupie", "laramie", "quickie", "^tie",
	    "collie", "hankie", "lingerie", "reverie", "toughie",
	    "cookie", "hippie", "meanie", "rookie", "valkyrie",
	}

	# Irregular plural -> singular; singularize() matches the keys as
	# case-insensitive word suffixes.
	_singular_irregular = {
	    "abuses": "abuse",
	    "ads": "ad",
	    "atlantes": "atlas",
	    "atlases": "atlas",
	    "analysis": "analysis",
	    "axes": "axe",
	    "beeves": "beef",
	    "brethren": "brother",
	    "children": "child",
	    "corpora": "corpus",
	    "corpuses": "corpus",
	    "ephemerides": "ephemeris",
	    "feet": "foot",
	    "ganglia": "ganglion",
	    "geese": "goose",
	    "genera": "genus",
	    "genii": "genie",
	    "graffiti": "graffito",
	    "helves": "helve",
	    "kine": "cow",
	    "leaves": "leaf",
	    "loaves": "loaf",
	    "men": "man",
	    "mongooses": "mongoose",
	    "monies": "money",
	    "moves": "move",
	    "mythoi": "mythos",
	    "numena": "numen",
	    "occipita": "occiput",
	    "octopodes": "octopus",
	    "opera": "opus",
	    "opuses": "opus",
	    "our": "my",
	    "oxen": "ox",
	    "penes": "penis",
	    "penises": "penis",
	    "people": "person",
	    "sexes": "sex",
	    "soliloquies": "soliloquy",
	    "teeth": "tooth",
	    "testes": "testis",
	    "trilbys": "trilby",
	    "turves": "turf",
	    "zoa": "zoon",
	}

	# Prepositions that may follow the head noun of a hyphenated compound
	# (e.g. mothers-in-law); singularize() checks the second part against them.
	_plural_prepositions = {
	    "about", "before", "during", "of", "till",
	    "above", "behind", "except", "off", "to",
	    "across", "below", "for", "on", "under",
	    "after", "beneath", "from", "onto", "until",
	    "among", "beside", "in", "out", "unto",
	    "around", "besides", "into", "over", "upon",
	    "at", "between", "near", "since", "with",
	    "athwart", "betwixt", "beyond", "but", "by",
	}


	def singularize(word, custom=None):
	    """Return the singular form of ``word``.

	    Lookup order (the first hit wins):

	    1. ``custom`` -- optional mapping of exact plural -> singular overrides.
	    2. Hyphenated compounds whose second part is a preposition
	       (``mothers-in-law``): only the first part is singularized.
	    3. Plural possessives ending in an apostrophe (``dogs'`` -> ``dog's``).
	    4. Uninflected and uncountable nouns: returned unchanged.
	    5. Nouns whose singular ends in ``-ie`` (``cookies`` -> ``cookie``).
	    6. Irregular plurals, matched as a case-insensitive suffix
	       (``women`` -> ``woman``).
	    7. The ordered regular-expression rules in ``_singular_rules``.

	    A word that matches nothing is returned unchanged.

	    :param word: the (possibly plural) word to singularize.
	    :param custom: optional mapping consulted before any built-in rule.
	    :return: the singular form as a string.
	    """
	    if custom is not None and word in custom:
	        return custom[word]
	    # Recurse compound words (e.g. mothers-in-law).
	    if "-" in word:
	        parts = word.split("-")
	        if len(parts) > 1 and parts[1] in _plural_prepositions:
	            return singularize(parts[0], custom) + "-" + "-".join(parts[1:])
	    # dogs' => dog's
	    if word.endswith("'"):
	        return singularize(word[:-1], custom) + "'s"
	    lowered = word.lower()
	    # Exact membership. The previous test, ``entry.endswith(lowered)``, had
	    # its operands reversed: any word that happened to be the tail of an
	    # entry ("quarters" in "headquarters", "ears" in "shears") was treated
	    # as uninflected and returned still plural.
	    if lowered in _singular_uninflected or lowered in _singular_uncountable:
	        return word
	    for stem in _singular_ie:
	        if stem.startswith("^"):
	            # "^pie" / "^tie": the anchor means the whole word must match.
	            hit = lowered == stem[1:] + "s"
	        else:
	            hit = lowered.endswith(stem + "s")
	        if hit:
	            # Drop only the plural "s" and keep the caller's casing (this
	            # branch used to return the lower-cased plural unchanged).
	            return word[:-1]
	    # NOTE(review): suffix matching also fires inside unrelated words
	    # (e.g. "hour" ends with the key "our") -- confirm whether exact
	    # matching is wanted for some keys.
	    for plural, singular in _singular_irregular.items():
	        if lowered.endswith(plural):
	            return word[:len(word) - len(plural)] + singular
	    for pattern, replacement in _singular_rules:
	        match = pattern.search(word)
	        if match is None:
	            continue
	        # Strip back-references to groups that did not take part in the
	        # match so they are never substituted.
	        for number, captured in enumerate(match.groups(), start=1):
	            if captured is None:
	                replacement = replacement.replace("\\" + str(number), "")
	        return pattern.sub(replacement, word)
	    return word