kkirsche/soundex.py

## soundex.py
#!/usr/bin/env python
"""The following module is an implementation of American SOUNDEX.

SOUNDEX is a  phonetic algorithm for indexing names by sound, as pronounced in English.
The goal is for homophones to be encoded to the same representation so that they can be
matched despite minor differences in spelling.

This implements the algorithm as defined on:
https://en.wikipedia.org/wiki/Soundex#American%20Soundex

Definition:
The Soundex code for a name consists of a letter followed by three numerical digits:
the letter is the first letter of the name, and the digits encode the remaining
consonants. Consonants at a similar place of articulation share the same digit so, for
example, the labial consonants B, F, P, and V are each encoded as the number 1.

The correct value can be found as follows:
    1. Retain the first letter of the name and drop all other occurrences of:
        a, e, i, o, u, y, h, w.
    2. Replace consonants with digits as follows (after the first letter):
        b, f, p, v → 1
        c, g, j, k, q, s, x, z → 2
        d, t → 3
        l → 4
        m, n → 5
        r → 6
    3. If two or more letters with the same number are adjacent in the original name
        (before step 1), only retain the first letter; also two letters with the same
        number separated by 'h' or 'w' are coded as a single number, whereas such
        letters separated by a vowel are coded twice. This rule also applies to the
        first letter.
    4. If you have too few letters in your word that you can't assign three numbers,
        append with zeros until there are three numbers. If you have four or more
        numbers, retain only the first three.

I wrote this in an attempt to better understand how to write an algorithm like this
and to use it as a learning exercise in where my code could be improved or simplified
to write a more concise and performant algorithm.
"""
from re import compile
from unittest import TestCase, main

separator_pattern = r"[hw]?"
one_pattern = r"[bfpv]"
two_pattern = r"[cgjkqsxz]"
three_pattern = r"[dt]"
four_pattern = r"[l]"
five_pattern = r"[mn]"
six_pattern = r"[r]"

# key is the value to replace with
# value is the pattern to use to execute the replacement
replacements = {
    "1": compile(one_pattern),
    "2": compile(two_pattern),
    "3": compile(three_pattern),
    "4": compile(four_pattern),
    "5": compile(five_pattern),
    "6": compile(six_pattern),
}

duplicate_runs = [
    compile(f"{one_pattern}{separator_pattern}{one_pattern}"),
    compile(f"{two_pattern}{separator_pattern}{two_pattern}"),
    compile(f"{three_pattern}{separator_pattern}{three_pattern}"),
    compile(f"{four_pattern}{separator_pattern}{four_pattern}"),
    compile(f"{five_pattern}{separator_pattern}{five_pattern}"),
    compile(f"{six_pattern}{separator_pattern}{six_pattern}"),
]

removals = compile(r"[aeiouyhw]")
# . matches any character, then \1 is a back-reference to the match
duplicate_chars = compile(r"(.)\1")


def american_soundex(word: str) -> str:
    if len(word) == 0:
        raise ValueError("Invalid word")
    word = word.lower()
    letters = "".join(letter for letter in word if letter.isalpha())
    first_letter = letters[0]
    # step 1.a, retain the first letter
    result = letters[1:]
    if len(result) == 0:
        return f"{first_letter.upper()}000"

    # step 1.b, remove a, e, i, o, u, y, h, w
    result = removals.sub("", result)

    # step 2, replace letters with digits
    for repl, pattern in replacements.items():
        result = pattern.sub(repl, result)

    # step 3 clean up runs
    for pattern in duplicate_runs:
        if pattern.search(letters) is not None:
            result = duplicate_chars.sub(r"\1", result)

    for digit, pattern in replacements.items():
        if pattern.match(first_letter) is not None and result.startswith(digit):
            # If the saved letter's digit is the same as the resulting first digit,
            # remove the digit (keep the letter).
            result = result[1:]

    # step 4, if less than three digits, pad with zeros
    result = result.ljust(3, "0")[:3]
    return f"{first_letter.upper()}{result}"


class TestSoundex(TestCase):
    def test_robert(self) -> None:
        self.assertEqual(american_soundex("Robert"), "R163")

    def test_rupert(self) -> None:
        self.assertEqual(american_soundex("Rupert"), "R163")

    def test_rubin(self) -> None:
        self.assertEqual(american_soundex("Rubin"), "R150")

    def test_ashcraft(self) -> None:
        self.assertEqual(american_soundex("Ashcraft"), "A261")

    def test_ashcroft(self) -> None:
        self.assertEqual(american_soundex("Ashcroft"), "A261")

    def test_tymczak(self) -> None:
        self.assertEqual(american_soundex("Tymczak"), "T522")

    def test_pfister(self) -> None:
        self.assertEqual(american_soundex("Pfister"), "P236")

    def test_honeyman(self) -> None:
        self.assertEqual(american_soundex("Honeyman"), "H555")

    def test_ciondecks(self) -> None:
        self.assertEqual(american_soundex("ciondecks"), "C532")


if __name__ == "__main__":
    main()

## soundex_concise.py
# source: https://www.rosettacode.org/wiki/Soundex#Python
from itertools import groupby

def soundex(word):
   codes = ("bfpv","cgjkqsxz", "dt", "l", "mn", "r")
   soundDict = dict((ch, str(ix+1)) for ix,cod in enumerate(codes) for ch in cod)
   cmap2 = lambda kar: soundDict.get(kar, '9')
   sdx =  ''.join(cmap2(kar) for kar in word.lower())
   sdx2 = word[0].upper() + ''.join(k for k,g in list(groupby(sdx))[1:] if k!='9')
   sdx3 = sdx2[0:4].ljust(4,'0')
   return sdx3
	#!/usr/bin/env python
	"""The following module is an implementation of American SOUNDEX.

	SOUNDEX is a phonetic algorithm for indexing names by sound, as pronounced in English.
	The goal is for homophones to be encoded to the same representation so that they can be
	matched despite minor differences in spelling.

	This implements the algorithm as defined on:
	https://en.wikipedia.org/wiki/Soundex#American%20Soundex

	Definition:
	The Soundex code for a name consists of a letter followed by three numerical digits:
	the letter is the first letter of the name, and the digits encode the remaining
	consonants. Consonants at a similar place of articulation share the same digit so, for
	example, the labial consonants B, F, P, and V are each encoded as the number 1.

	The correct value can be found as follows:
	1. Retain the first letter of the name and drop all other occurrences of:
	a, e, i, o, u, y, h, w.
	2. Replace consonants with digits as follows (after the first letter):
	b, f, p, v → 1
	c, g, j, k, q, s, x, z → 2
	d, t → 3
	l → 4
	m, n → 5
	r → 6
	3. If two or more letters with the same number are adjacent in the original name
	(before step 1), only retain the first letter; also two letters with the same
	number separated by 'h' or 'w' are coded as a single number, whereas such
	letters separated by a vowel are coded twice. This rule also applies to the
	first letter.
	4. If you have too few letters in your word that you can't assign three numbers,
	append with zeros until there are three numbers. If you have four or more
	numbers, retain only the first three.

	I wrote this in an attempt to better understand how to write an algorithm like this
	and to use it as a learning exercise in where my code could be improved or simplified
	to write a more concise and performant algorithm.
	"""
	from re import compile
	from unittest import TestCase, main

	separator_pattern = r"[hw]?"
	one_pattern = r"[bfpv]"
	two_pattern = r"[cgjkqsxz]"
	three_pattern = r"[dt]"
	four_pattern = r"[l]"
	five_pattern = r"[mn]"
	six_pattern = r"[r]"

	# key is the value to replace with
	# value is the pattern to use to execute the replacement
	replacements = {
	"1": compile(one_pattern),
	"2": compile(two_pattern),
	"3": compile(three_pattern),
	"4": compile(four_pattern),
	"5": compile(five_pattern),
	"6": compile(six_pattern),
	}

	duplicate_runs = [
	compile(f"{one_pattern}{separator_pattern}{one_pattern}"),
	compile(f"{two_pattern}{separator_pattern}{two_pattern}"),
	compile(f"{three_pattern}{separator_pattern}{three_pattern}"),
	compile(f"{four_pattern}{separator_pattern}{four_pattern}"),
	compile(f"{five_pattern}{separator_pattern}{five_pattern}"),
	compile(f"{six_pattern}{separator_pattern}{six_pattern}"),
	]

	removals = compile(r"[aeiouyhw]")
	# . matches any character, then \1 is a back-reference to the match
	duplicate_chars = compile(r"(.)\1")


	def american_soundex(word: str) -> str:
	if len(word) == 0:
	raise ValueError("Invalid word")
	word = word.lower()
	letters = "".join(letter for letter in word if letter.isalpha())
	first_letter = letters[0]
	# step 1.a, retain the first letter
	result = letters[1:]
	if len(result) == 0:
	return f"{first_letter.upper()}000"

	# step 1.b, remove a, e, i, o, u, y, h, w
	result = removals.sub("", result)

	# step 2, replace letters with digits
	for repl, pattern in replacements.items():
	result = pattern.sub(repl, result)

	# step 3 clean up runs
	for pattern in duplicate_runs:
	if pattern.search(letters) is not None:
	result = duplicate_chars.sub(r"\1", result)

	for digit, pattern in replacements.items():
	if pattern.match(first_letter) is not None and result.startswith(digit):
	# If the saved letter's digit is the same as the resulting first digit,
	# remove the digit (keep the letter).
	result = result[1:]

	# step 4, if less than three digits, pad with zeros
	result = result.ljust(3, "0")[:3]
	return f"{first_letter.upper()}{result}"


	class TestSoundex(TestCase):
	def test_robert(self) -> None:
	self.assertEqual(american_soundex("Robert"), "R163")

	def test_rupert(self) -> None:
	self.assertEqual(american_soundex("Rupert"), "R163")

	def test_rubin(self) -> None:
	self.assertEqual(american_soundex("Rubin"), "R150")

	def test_ashcraft(self) -> None:
	self.assertEqual(american_soundex("Ashcraft"), "A261")

	def test_ashcroft(self) -> None:
	self.assertEqual(american_soundex("Ashcroft"), "A261")

	def test_tymczak(self) -> None:
	self.assertEqual(american_soundex("Tymczak"), "T522")

	def test_pfister(self) -> None:
	self.assertEqual(american_soundex("Pfister"), "P236")

	def test_honeyman(self) -> None:
	self.assertEqual(american_soundex("Honeyman"), "H555")

	def test_ciondecks(self) -> None:
	self.assertEqual(american_soundex("ciondecks"), "C532")


	if __name__ == "__main__":
	main()
	# source: https://www.rosettacode.org/wiki/Soundex#Python
	from itertools import groupby

	def soundex(word):
	codes = ("bfpv","cgjkqsxz", "dt", "l", "mn", "r")
	soundDict = dict((ch, str(ix+1)) for ix,cod in enumerate(codes) for ch in cod)
	cmap2 = lambda kar: soundDict.get(kar, '9')
	sdx = ''.join(cmap2(kar) for kar in word.lower())
	sdx2 = word[0].upper() + ''.join(k for k,g in list(groupby(sdx))[1:] if k!='9')
	sdx3 = sdx2[0:4].ljust(4,'0')
	return sdx3