~linuxgoose/linguistics-robin

ref: 81a0dc8236096147eb60a03e19b6585a3c9dd3c6 linguistics-robin/linguistics_robin/phonetics/fuzzy_soundex.py -rw-r--r-- 2.9 KiB
81a0dc82 — Jordan Robinson Create dependabot.yml 8 months ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import re
from unidecode import unidecode

from ..utils import squeeze, translation, check_empty, check_str
from .phonetic_algorithm import PhoneticAlgorithm


class FuzzySoundex(PhoneticAlgorithm):
    """
    Implementation of the "Fuzzy Soundex" algorithm.

    The code of a word is its (possibly substituted) first letter followed by
    the digits of the remaining letters, with consecutive duplicate digits
    collapsed and the vowel digit ``0`` removed.

    [Reference]: http://wayback.archive.org/web/20100629121128/http://www.ir.iit.edu/publications/downloads/IEEESoundexV5.pdf
    [Article]: Holmes, David and M. Catherine McCabe. "Improving Precision and Recall for Soundex Retrieval."
    """
    def __init__(self):
        super().__init__()

        # Letter -> digit table. '0' marks vowels (dropped at the very end),
        # '-' marks H, W and Y (dropped right after translation).
        self.translations = translation(
            'ABCDEFGHIJKLMNOPQRSTUVWXYZ',
            '0193017-07745501769301-7-9'
        )

        # N-gram substitutions applied, in this order, anywhere in the word.
        # Each pair is handed to ``re.sub`` as (pattern, replacement).
        self.rules = [
            (r'CA', r'KA'),
            (r'CC', r'KK'),
            (r'CK', r'KK'),
            (r'CE', r'SE'),
            (r'CHL', r'KL'),
            (r'CL', r'KL'),
            (r'CHR', r'KR'),
            (r'CR', r'KR'),
            (r'CI', r'SI'),
            (r'CO', r'KO'),
            (r'CU', r'KU'),
            (r'CY', r'SY'),
            (r'DG', r'GG'),
            (r'GH', r'HH'),
            (r'MAC', r'MK'),
            (r'MC', r'MK'),
            (r'NST', r'NSS'),
            (r'PF', r'FF'),
            (r'PH', r'FF'),
            (r'SCH', r'SSS'),
            (r'TIO', r'SIO'),
            (r'TIA', r'SIO'),
            (r'TCH', r'CHH'),
        ]

        # Leading digraphs rewritten to 'SS', 'RR' and 'NN' respectively.
        self.set1 = ['CS', 'CZ', 'TS', 'TZ']
        self.set2 = ['HR', 'WR']
        self.set3 = ['KN', 'NG']
        # Initials whose translation is '-', i.e. that leave no digit behind.
        self.set4 = 'HWY'

    def phonetics(self, word):
        """
        Return the Fuzzy Soundex code of ``word``.

        The word is transliterated with ``unidecode`` and upper-cased before
        encoding. Characters outside A-Z are not translated and pass through
        into the code unchanged.

        :param word: the string to encode.
        :return: the first letter of the normalised word followed by the
            digits of the rest of the word.
        """
        # Input validation is delegated to the shared helpers (presumably
        # they raise on non-string / empty input -- see ..utils).
        check_str(word)
        check_empty(word)

        word = unidecode(word).upper()

        # Substitutions for beginnings
        first_two, rest = word[:2], word[2:]

        if first_two in self.set1:
            word = 'SS' + rest
        elif first_two == 'GN':
            word = 'NN' + rest
        elif first_two in self.set2:
            word = 'RR' + rest
        elif first_two == 'HW':
            word = 'WW' + rest
        elif first_two in self.set3:
            word = 'NN' + rest

        # Substitutions for endings
        last_two, initial = word[-2:], word[0:-2]

        if last_two == 'CH':
            word = initial + 'KK'
        elif last_two == 'NT':
            word = initial + 'TT'
        elif last_two == 'RT':
            word = initial + 'RR'
        elif word[-3:] == 'RDT':
            word = word[0:-3] + 'RR'

        # Applying the rules
        for pattern, replacement in self.rules:
            word = re.sub(pattern, replacement, word)

        # Catch the first letter (after substitutions, so e.g. 'PH...' -> 'F')
        first_letter = word[0]

        # Translating
        code = ''.join(self.translations.get(char, char) for char in word)

        # Removing hyphens
        code = code.replace('-', '')

        # Squeezing the code
        code = squeeze(code)

        # Dealing with initials
        if first_letter in self.set4:
            # H, W and Y were translated to '-' and stripped above, so the
            # code holds no digit for the initial: every remaining digit
            # belongs to a later letter and must be kept ('HYDE' -> 'H3').
            # Testing the letter rather than code[0] also keeps words made
            # only of H/W/Y (empty code) from raising IndexError.
            code = first_letter + code
        else:
            # The leading digit stands for the initial itself; swap it for
            # the letter.
            code = first_letter + code[1:]

        # Dropping vowels
        code = code.replace('0', '')
        return code