import re
from unidecode import unidecode
from ..utils import squeeze, translation, check_empty, check_str
from .phonetic_algorithm import PhoneticAlgorithm
class FuzzySoundex(PhoneticAlgorithm):
"""
Implementation of the "Fuzzy Soundex" algorithm.
[Reference]: http://wayback.archive.org/web/20100629121128/http://www.ir.iit.edu/publications/downloads/IEEESoundexV5.pdf
[Article]: Holmes, David and M. Catherine McCabe. "Improving Precision and Recall for Soundex Retrieval."
"""
def __init__(self):
super().__init__()
self.translations = translation(
'ABCDEFGHIJKLMNOPQRSTUVWXYZ',
'0193017-07745501769301-7-9'
)
self.rules = [
(r'CA', r'KA'),
(r'CC', r'KK'),
(r'CK', r'KK'),
(r'CE', r'SE'),
(r'CHL', r'KL'),
(r'CL', r'KL'),
(r'CHR', r'KR'),
(r'CR', r'KR'),
(r'CI', r'SI'),
(r'CO', r'KO'),
(r'CU', r'KU'),
(r'CY', r'SY'),
(r'DG', r'GG'),
(r'GH', r'HH'),
(r'MAC', r'MK'),
(r'MC', r'MK'),
(r'NST', r'NSS'),
(r'PF', r'FF'),
(r'PH', r'FF'),
(r'SCH', r'SSS'),
(r'TIO', r'SIO'),
(r'TIA', r'SIO'),
(r'TCH', r'CHH'),
]
self.set1 = ['CS', 'CZ', 'TS', 'TZ']
self.set2 = ['HR', 'WR']
self.set3 = ['KN', 'NG']
self.set4 = 'HWY'
def phonetics(self, word):
check_str(word)
check_empty(word)
word = unidecode(word).upper()
# Substitutions for beginnings
first_two, rest = word[:2], word[2:]
if first_two in self.set1:
word = 'SS' + rest
elif first_two == 'GN':
word = 'NN' + rest
elif first_two in self.set2:
word = 'RR' + rest
elif first_two == 'HW':
word = 'WW' + rest
elif first_two in self.set3:
word = 'NN' + rest
# Substitutions for endings
last_two, initial = word[-2:], word[0:-2]
if last_two == 'CH':
word = initial + 'KK'
elif last_two == 'NT':
word = initial + 'TT'
elif last_two == 'RT':
word = initial + 'RR'
elif word[-3:] == 'RDT':
word = word[0:-3] + 'RR'
# Applying the rules
for rule in self.rules:
word = re.sub(rule[0], rule[1], word)
# Catch the first letter
first_letter = word[0]
# Translating
code = ''.join(self.translations.get(char, char) for char in word)
# Removing hyphens
code = code.replace('-', '')
# Squeezing the code
code = squeeze(code)
# Dealing with initials
code = first_letter if code[0] in self.set4 \
else first_letter + code[1:]
# Dropping vowels
code = code.replace('0', '')
return code