Skip to content

Instantly share code, notes, and snippets.

@kkirsche
Last active July 30, 2021 17:44
Show Gist options
  • Save kkirsche/aa9d3855e6df0dbb23b1e967d6f47942 to your computer and use it in GitHub Desktop.
Save kkirsche/aa9d3855e6df0dbb23b1e967d6f47942 to your computer and use it in GitHub Desktop.
SOUNDEX implementation
#!/usr/bin/env python
"""The following module is an implementation of American SOUNDEX.
SOUNDEX is a phonetic algorithm for indexing names by sound, as pronounced in English.
The goal is for homophones to be encoded to the same representation so that they can be
matched despite minor differences in spelling.
This implements the algorithm as defined on:
https://en.wikipedia.org/wiki/Soundex#American%20Soundex
Definition:
The Soundex code for a name consists of a letter followed by three numerical digits:
the letter is the first letter of the name, and the digits encode the remaining
consonants. Consonants at a similar place of articulation share the same digit so, for
example, the labial consonants B, F, P, and V are each encoded as the number 1.
The correct value can be found as follows:
1. Retain the first letter of the name and drop all other occurrences of:
a, e, i, o, u, y, h, w.
2. Replace consonants with digits as follows (after the first letter):
b, f, p, v → 1
c, g, j, k, q, s, x, z → 2
d, t → 3
l → 4
m, n → 5
r → 6
3. If two or more letters with the same number are adjacent in the original name
(before step 1), only retain the first letter; also two letters with the same
number separated by 'h' or 'w' are coded as a single number, whereas such
letters separated by a vowel are coded twice. This rule also applies to the
first letter.
4. If you have too few letters in your word that you can't assign three numbers,
append with zeros until there are three numbers. If you have four or more
numbers, retain only the first three.
I wrote this in an attempt to better understand how to write an algorithm like this
and to use it as a learning exercise in where my code could be improved or simplified
to write a more concise and performant algorithm.
"""
from re import compile
from unittest import TestCase, main
separator_pattern = r"[hw]?"
one_pattern = r"[bfpv]"
two_pattern = r"[cgjkqsxz]"
three_pattern = r"[dt]"
four_pattern = r"[l]"
five_pattern = r"[mn]"
six_pattern = r"[r]"
# key is the value to replace with
# value is the pattern to use to execute the replacement
replacements = {
"1": compile(one_pattern),
"2": compile(two_pattern),
"3": compile(three_pattern),
"4": compile(four_pattern),
"5": compile(five_pattern),
"6": compile(six_pattern),
}
duplicate_runs = [
compile(f"{one_pattern}{separator_pattern}{one_pattern}"),
compile(f"{two_pattern}{separator_pattern}{two_pattern}"),
compile(f"{three_pattern}{separator_pattern}{three_pattern}"),
compile(f"{four_pattern}{separator_pattern}{four_pattern}"),
compile(f"{five_pattern}{separator_pattern}{five_pattern}"),
compile(f"{six_pattern}{separator_pattern}{six_pattern}"),
]
removals = compile(r"[aeiouyhw]")
# . matches any character, then \1 is a back-reference to the match
duplicate_chars = compile(r"(.)\1")
def american_soundex(word: str) -> str:
if len(word) == 0:
raise ValueError("Invalid word")
word = word.lower()
letters = "".join(letter for letter in word if letter.isalpha())
first_letter = letters[0]
# step 1.a, retain the first letter
result = letters[1:]
if len(result) == 0:
return f"{first_letter.upper()}000"
# step 1.b, remove a, e, i, o, u, y, h, w
result = removals.sub("", result)
# step 2, replace letters with digits
for repl, pattern in replacements.items():
result = pattern.sub(repl, result)
# step 3 clean up runs
for pattern in duplicate_runs:
if pattern.search(letters) is not None:
result = duplicate_chars.sub(r"\1", result)
for digit, pattern in replacements.items():
if pattern.match(first_letter) is not None and result.startswith(digit):
# If the saved letter's digit is the same as the resulting first digit,
# remove the digit (keep the letter).
result = result[1:]
# step 4, if less than three digits, pad with zeros
result = result.ljust(3, "0")[:3]
return f"{first_letter.upper()}{result}"
class TestSoundex(TestCase):
def test_robert(self) -> None:
self.assertEqual(american_soundex("Robert"), "R163")
def test_rupert(self) -> None:
self.assertEqual(american_soundex("Rupert"), "R163")
def test_rubin(self) -> None:
self.assertEqual(american_soundex("Rubin"), "R150")
def test_ashcraft(self) -> None:
self.assertEqual(american_soundex("Ashcraft"), "A261")
def test_ashcroft(self) -> None:
self.assertEqual(american_soundex("Ashcroft"), "A261")
def test_tymczak(self) -> None:
self.assertEqual(american_soundex("Tymczak"), "T522")
def test_pfister(self) -> None:
self.assertEqual(american_soundex("Pfister"), "P236")
def test_honeyman(self) -> None:
self.assertEqual(american_soundex("Honeyman"), "H555")
def test_ciondecks(self) -> None:
self.assertEqual(american_soundex("ciondecks"), "C532")
if __name__ == "__main__":
main()
# source: https://www.rosettacode.org/wiki/Soundex#Python
from itertools import groupby
def soundex(word):
codes = ("bfpv","cgjkqsxz", "dt", "l", "mn", "r")
soundDict = dict((ch, str(ix+1)) for ix,cod in enumerate(codes) for ch in cod)
cmap2 = lambda kar: soundDict.get(kar, '9')
sdx = ''.join(cmap2(kar) for kar in word.lower())
sdx2 = word[0].upper() + ''.join(k for k,g in list(groupby(sdx))[1:] if k!='9')
sdx3 = sdx2[0:4].ljust(4,'0')
return sdx3
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment