~linuxgoose/linguistics-robin

ref: b339a32420b0a306ab7f595710bb47a21a6e6c66 linguistics-robin/linguistics_robin/phonetics/nysiis.py -rw-r--r-- 4.5 KiB
b339a324 — Jordan Fix to missing self._vowels 8 months ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
from typing import List
import string

from ..utils import check_str, check_empty
from .phonetic_algorithm import PhoneticAlgorithm


class NYSIIS(PhoneticAlgorithm):
    """
    The NYSIIS algorithm.

    Encodes a name as a New York State Identification and Intelligence
    System phonetic key. Keys longer than six characters are returned as
    the first six characters followed by the remainder in square brackets,
    e.g. ``ABCDEF[GH]``.

    [Reference]: https://en.wikipedia.org/wiki/New_York_State_Identification_and_Intelligence_System & http://www.dropby.com/NYSIIS.html
    [Authors]:
    """
    def __init__(self):
        super().__init__()

    # Lookup tables live on the class so they are built once, not on every call.
    # NOTE: the vowels must stay a collection of single characters (not the
    # string "AEIOU"), because an empty look-ahead slice is tested against it
    # and "" is a substring of every string.
    __vowels: frozenset[str] = frozenset("AEIOU")

    # Step 1 prefix rewrites; order matters ("KN" must be tried before "K").
    __prefix_rules: tuple[tuple[str, str], ...] = (
        ("MAC", "MCC"),
        ("KN", "NN"),
        ("K", "C"),
        ("PH", "FF"),
        ("PF", "FF"),
        ("SCH", "SSS"),
    )

    # Step 2 suffix rewrites; only the first matching rule is applied.
    __suffix_rules: tuple[tuple[str, str], ...] = (
        ("EE", "Y"),
        ("IE", "Y"),
        ("DT", "D"),
        ("RT", "D"),
        ("RD", "D"),
        ("NT", "D"),
        ("ND", "D"),
    )

    def __replaceAt(self, input : str, index : int, replace : str = "") -> str :
        """
        Return ``input`` with the text starting at ``index`` overwritten by ``replace``.

        The overwritten span is ``len(replace)`` characters long; when
        ``replace`` is empty a single character is removed instead.
        """
        return input[:index] + replace + input[(len(replace) or 1) + index:]

    def phonetics(self, word : str) -> str | None:
        """
        Return the NYSIIS key for ``word``.

        Leading/trailing whitespace is ignored and the name is upper-cased
        before encoding. Returns ``None`` when there is nothing to encode
        (``None``, empty, or whitespace-only input that the validators let
        through).
        """
        check_str(word)
        check_empty(word)

        # fail fast if there isn't an input value to code (code defensively)
        if not word:
            return None

        # strip surrounding whitespace and upper-case (the wiki algorithm
        # doesn't mention this as a first step, but every rule assumes it)
        code : str = word.strip().upper()
        if not code:
            return None

        # remembered for Step 10; taken from the cleaned name so that leading
        # whitespace in `word` can never end up as the key
        first_letter : str = code[0]

        # Step 1. Rewrite a leading letter group (first match only).
        for prefix, substitute in self.__prefix_rules:
            if code.startswith(prefix):
                code = substitute + code[len(prefix):]
                break

        # Step 2. Rewrite a trailing letter group (first match only, so an
        # already rewritten ending such as "NDT" -> "ND" is not rewritten again).
        for suffix, substitute in self.__suffix_rules:
            if code.endswith(suffix):
                code = code[:-len(suffix)] + substitute
                break

        # Steps 3-4. The first character is kept as is; scanning starts at
        # the second character.
        idx : int = 1

        while idx < len(code):
            current = code[idx]
            previous = code[idx - 1]

            # Step 5. (1)
            # only process letters, drop all other characters including spaces
            if current not in string.ascii_letters:
                code = self.__replaceAt(code, idx)
                # keep the current index: a new character has moved into it
                continue

            # Step 5. (2)
            if current in self.__vowels:
                if code[idx:idx + 2] == "EV":
                    code = self.__replaceAt(code, idx, "AF")
                else:
                    code = self.__replaceAt(code, idx, "A")

            # Step 5. (3)
            elif current == "Q":
                code = self.__replaceAt(code, idx, "G")
            elif current == "Z":
                code = self.__replaceAt(code, idx, "S")
            elif current == "M":
                code = self.__replaceAt(code, idx, "N")

            # Step 5. (4)
            # "KN" -> "NN" (only the K is rewritten); Step 6 collapses the pair
            elif current == "K":
                code = self.__replaceAt(code, idx, "N" if code[idx:idx + 2] == "KN" else "C")

            # Step 5. (5)
            elif code[idx:idx + 3] == "SCH":
                code = self.__replaceAt(code, idx, "SSS")
            elif code[idx:idx + 2] == "PH":
                code = self.__replaceAt(code, idx, "FF")

            # Step 5. (6)
            # the look-ahead slice is "" at the end of the name, which counts
            # as "not a vowel"
            elif current == "H" and (previous not in self.__vowels
                                     or code[idx + 1:idx + 2] not in self.__vowels):
                code = self.__replaceAt(code, idx, previous)

            # Step 5. (7)
            elif current == "W" and previous in self.__vowels:
                code = self.__replaceAt(code, idx, previous)

            # Step 6. Collapse a character that repeats the one before it.
            if code[idx] == code[idx - 1]:
                code = self.__replaceAt(code, idx, "")
                continue

            idx += 1

        # Step 7.
        if code.endswith("S"):
            code = code.removesuffix("S")

        # Step 8. A trailing "AY" becomes a single "Y".
        if code.endswith("AY"):
            code = code[:-2] + "Y"

        # Step 9.
        if code.endswith("A"):
            code = code.removesuffix("A")

        # Step 10. Ensure the output includes at minimum the first letter of the input
        if not code:
            code = first_letter

        return code[0:6] + f'[{code[6:]}]' if len(code) > 6 else code