Last active
July 30, 2021 17:44
-
-
Save kkirsche/aa9d3855e6df0dbb23b1e967d6f47942 to your computer and use it in GitHub Desktop.
SOUNDEX implementation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
"""The following module is an implementation of American SOUNDEX. | |
SOUNDEX is a phonetic algorithm for indexing names by sound, as pronounced in English. | |
The goal is for homophones to be encoded to the same representation so that they can be | |
matched despite minor differences in spelling. | |
This implements the algorithm as defined on: | |
https://en.wikipedia.org/wiki/Soundex#American%20Soundex | |
Definition: | |
The Soundex code for a name consists of a letter followed by three numerical digits: | |
the letter is the first letter of the name, and the digits encode the remaining | |
consonants. Consonants at a similar place of articulation share the same digit so, for | |
example, the labial consonants B, F, P, and V are each encoded as the number 1. | |
The correct value can be found as follows: | |
1. Retain the first letter of the name and drop all other occurrences of: | |
a, e, i, o, u, y, h, w. | |
2. Replace consonants with digits as follows (after the first letter): | |
b, f, p, v → 1 | |
c, g, j, k, q, s, x, z → 2 | |
d, t → 3 | |
l → 4 | |
m, n → 5 | |
r → 6 | |
3. If two or more letters with the same number are adjacent in the original name | |
(before step 1), only retain the first letter; also two letters with the same | |
number separated by 'h' or 'w' are coded as a single number, whereas such | |
letters separated by a vowel are coded twice. This rule also applies to the | |
first letter. | |
4. If you have too few letters in your word that you can't assign three numbers, | |
append with zeros until there are three numbers. If you have four or more | |
numbers, retain only the first three. | |
I wrote this in an attempt to better understand how to write an algorithm like this | |
and to use it as a learning exercise in where my code could be improved or simplified | |
to write a more concise and performant algorithm. | |
""" | |
from re import compile | |
from unittest import TestCase, main | |
separator_pattern = r"[hw]?" | |
one_pattern = r"[bfpv]" | |
two_pattern = r"[cgjkqsxz]" | |
three_pattern = r"[dt]" | |
four_pattern = r"[l]" | |
five_pattern = r"[mn]" | |
six_pattern = r"[r]" | |
# key is the value to replace with | |
# value is the pattern to use to execute the replacement | |
replacements = { | |
"1": compile(one_pattern), | |
"2": compile(two_pattern), | |
"3": compile(three_pattern), | |
"4": compile(four_pattern), | |
"5": compile(five_pattern), | |
"6": compile(six_pattern), | |
} | |
duplicate_runs = [ | |
compile(f"{one_pattern}{separator_pattern}{one_pattern}"), | |
compile(f"{two_pattern}{separator_pattern}{two_pattern}"), | |
compile(f"{three_pattern}{separator_pattern}{three_pattern}"), | |
compile(f"{four_pattern}{separator_pattern}{four_pattern}"), | |
compile(f"{five_pattern}{separator_pattern}{five_pattern}"), | |
compile(f"{six_pattern}{separator_pattern}{six_pattern}"), | |
] | |
removals = compile(r"[aeiouyhw]") | |
# . matches any character, then \1 is a back-reference to the match | |
duplicate_chars = compile(r"(.)\1") | |
def american_soundex(word: str) -> str: | |
if len(word) == 0: | |
raise ValueError("Invalid word") | |
word = word.lower() | |
letters = "".join(letter for letter in word if letter.isalpha()) | |
first_letter = letters[0] | |
# step 1.a, retain the first letter | |
result = letters[1:] | |
if len(result) == 0: | |
return f"{first_letter.upper()}000" | |
# step 1.b, remove a, e, i, o, u, y, h, w | |
result = removals.sub("", result) | |
# step 2, replace letters with digits | |
for repl, pattern in replacements.items(): | |
result = pattern.sub(repl, result) | |
# step 3 clean up runs | |
for pattern in duplicate_runs: | |
if pattern.search(letters) is not None: | |
result = duplicate_chars.sub(r"\1", result) | |
for digit, pattern in replacements.items(): | |
if pattern.match(first_letter) is not None and result.startswith(digit): | |
# If the saved letter's digit is the same as the resulting first digit, | |
# remove the digit (keep the letter). | |
result = result[1:] | |
# step 4, if less than three digits, pad with zeros | |
result = result.ljust(3, "0")[:3] | |
return f"{first_letter.upper()}{result}" | |
class TestSoundex(TestCase): | |
def test_robert(self) -> None: | |
self.assertEqual(american_soundex("Robert"), "R163") | |
def test_rupert(self) -> None: | |
self.assertEqual(american_soundex("Rupert"), "R163") | |
def test_rubin(self) -> None: | |
self.assertEqual(american_soundex("Rubin"), "R150") | |
def test_ashcraft(self) -> None: | |
self.assertEqual(american_soundex("Ashcraft"), "A261") | |
def test_ashcroft(self) -> None: | |
self.assertEqual(american_soundex("Ashcroft"), "A261") | |
def test_tymczak(self) -> None: | |
self.assertEqual(american_soundex("Tymczak"), "T522") | |
def test_pfister(self) -> None: | |
self.assertEqual(american_soundex("Pfister"), "P236") | |
def test_honeyman(self) -> None: | |
self.assertEqual(american_soundex("Honeyman"), "H555") | |
def test_ciondecks(self) -> None: | |
self.assertEqual(american_soundex("ciondecks"), "C532") | |
if __name__ == "__main__": | |
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# source: https://www.rosettacode.org/wiki/Soundex#Python | |
from itertools import groupby | |
def soundex(word): | |
codes = ("bfpv","cgjkqsxz", "dt", "l", "mn", "r") | |
soundDict = dict((ch, str(ix+1)) for ix,cod in enumerate(codes) for ch in cod) | |
cmap2 = lambda kar: soundDict.get(kar, '9') | |
sdx = ''.join(cmap2(kar) for kar in word.lower()) | |
sdx2 = word[0].upper() + ''.join(k for k,g in list(groupby(sdx))[1:] if k!='9') | |
sdx3 = sdx2[0:4].ljust(4,'0') | |
return sdx3 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment