~linuxgoose/linguistics-robin (7d73f1d3b498b8104119c7b1764b5c411e2544b1): linguistics_robin/phonetics/nysiis.py

from typing import List
import string

from ..utils import check_str, check_empty
from .phonetic_algorithm import PhoneticAlgorithm


class NYSIIS(PhoneticAlgorithm):
    """
    The NYSIIS (New York State Identification and Intelligence System)
    phonetic algorithm.

    The steps referenced in the comments below ("Step 1." ... "Step 10.")
    follow the numbering used in the Wikipedia description of the algorithm.

    [Reference]: https://en.wikipedia.org/wiki/New_York_State_Identification_and_Intelligence_System & http://www.dropby.com/NYSIIS.html
    [Authors]: 
    """
    def __init__(self):
        super().__init__()

    # Vowel list is defined at class level so it is not recreated on each call.
    # NOTE: it must stay a sequence of 1-character strings (not the string
    # "AEIOU"), because the code relies on `"" in __vowels` being False when
    # looking past the end of the name.
    __vowels : List[str] = ["A","E","I","O","U"]

    def __replaceAt(self, input : str, index : int, replace : str = "") -> str :
        """
        Return a copy of `input` with the text at `index` replaced.

        `len(replace)` characters starting at `index` are overwritten by
        `replace`. When `replace` is empty, exactly one character is removed.
        """
        return input[:index] + replace + input[(len(replace) or 1) + index:]

    def phonetics(self, word : str) -> str | None:
        """
        Compute the NYSIIS key of `word`.

        The input is first passed to the project helpers `check_str` and
        `check_empty` (defined in `..utils`; their exact behaviour is not
        visible from this file). It is then stripped and upper-cased before
        encoding.

        Returns the key. Keys longer than six characters are rendered as the
        first six characters followed by the remainder in square brackets,
        e.g. ``"ABCDEF[GH]"``. Returns ``None`` when there is nothing to
        encode (empty or whitespace-only input).
        """
        check_str(word)
        check_empty(word)

        # fail fast if there isn't an input value to code (code defensively)
        if not word :
            return None

        # strip surrounding whitespace and make the name uppercase
        # (the wiki algorithm doesn't mention this as a first step)
        key : str = word.strip().upper()
        if not key :
            return None

        # remembered for Step 10: the key must never end up empty
        first_letter : str = key[0]

        # Step 1. Only the first matching prefix is rewritten; "KN" must be
        # tested before "K".
        prefix_rules = (
            ("MAC", "MCC"),
            ("KN", "NN"),
            ("K", "C"),
            ("PH", "FF"),
            ("PF", "FF"),
            ("SCH", "SSS"),
        )
        for prefix, replacement in prefix_rules :
            if key.startswith(prefix) :
                key = replacement + key[len(prefix):]
                break

        # Step 2. Exactly one suffix rule applies (no cascading rewrites).
        if key.endswith(("EE", "IE")) :
            key = key[:-2] + "Y"
        elif key.endswith(("DT", "RT", "RD", "NT", "ND")) :
            key = key[:-2] + "D"

        # Steps 3-4. The first character is kept; scanning starts at the second.
        vowels = self.__vowels
        idx : int = 1

        while idx < len(key) :
            current = key[idx]
            previous = key[idx - 1]
            # empty string when `current` is the last character
            following = key[idx + 1:idx + 2]

            # Step 5. (1)
            # only process ASCII letters, drop every other character
            # (spaces, punctuation, digits, ...)
            if current not in string.ascii_letters :
                key = self.__replaceAt(key, idx)
                # keep the current index and re-examine the shifted character
                continue

            # Step 5. (2)
            if current in vowels :
                if key[idx:idx + 2] == "EV" :
                    key = self.__replaceAt(key, idx, "AF")
                else :
                    key = self.__replaceAt(key, idx, "A")

            # Step 5. (3)
            elif current == "Q" :
                key = self.__replaceAt(key, idx, "G")
            elif current == "Z" :
                key = self.__replaceAt(key, idx, "S")
            elif current == "M" :
                key = self.__replaceAt(key, idx, "N")

            # Step 5. (4)
            elif current == "K" :
                key = self.__replaceAt(key, idx, "N" if following == "N" else "C")

            # Step 5. (5)
            elif key[idx:idx + 3] == "SCH" :
                key = self.__replaceAt(key, idx, "SSS")
            elif key[idx:idx + 2] == "PH" :
                key = self.__replaceAt(key, idx, "FF")

            # Step 5. (6) "H" is kept only when it sits between two vowels
            elif current == "H" and (previous not in vowels or following not in vowels) :
                key = self.__replaceAt(key, idx, previous)

            # Step 5. (7)
            elif current == "W" and previous in vowels :
                key = self.__replaceAt(key, idx, previous)

            # Step 6. Collapse a character equal to the one before it, then
            # re-examine whatever moved into this position.
            if key[idx] == key[idx - 1] :
                key = self.__replaceAt(key, idx)
                continue

            idx += 1

        # Step 7.
        if key.endswith("S") :
            key = key[:-1]

        # Step 8. A trailing "AY" becomes a single "Y".
        if key.endswith("AY") :
            key = key[:-2] + "Y"

        # Step 9.
        if key.endswith("A") :
            key = key[:-1]

        # Step 10. Ensure the output includes at minimum the first letter of the input
        if not key :
            key = first_letter

        return f"{key[:6]}[{key[6:]}]" if len(key) > 6 else key