~linuxgoose/linguistics-robin

ee09466607e02c1dc37042cd7d8eb29bd904ef8e — Jordan Robinson 8 months ago 81a0dc8 + 7d73f1d v0.5.5
Merge pull request #18 from linuxgoose/1-add-nysiis-algorithm

Addition of NYSIIS
M README.md => README.md +1 -0
@@ 8,6 8,7 @@ Linguistics Robin is a Python linguistics collection that stemmed from a phoneti
 * Fuzzy Soundex
 * Lein
 * Matching Rating Approach
 * New York State Identification and Intelligence System (NYSIIS)
 
In addition, the following distance metrics:


M linguistics_robin/__init__.py => linguistics_robin/__init__.py +4 -3
@@ 1,9 1,10 @@
"""A Python 3 phonetics library."""
"""A Python 3 linguistics collection library."""
from .phonetics import (Soundex,
                        Metaphone,
                        MatchingRatingApproach,
                        FuzzySoundex,
                        Lein,
                        RefinedSoundex)
                        RefinedSoundex,
                        NYSIIS)

__version__ = '0.5.4'
__version__ = '0.5.5'

M linguistics_robin/phonetics/__init__.py => linguistics_robin/phonetics/__init__.py +1 -0
@@ 4,3 4,4 @@ from .mra import *
from .fuzzy_soundex import *
from .lein import *
from .refined_soundex import *
from .nysiis import *

A linguistics_robin/phonetics/nysiis.py => linguistics_robin/phonetics/nysiis.py +132 -0
@@ 0,0 1,132 @@
from typing import List
import string

from ..utils import check_str, check_empty
from .phonetic_algorithm import PhoneticAlgorithm


class NYSIIS(PhoneticAlgorithm):
    """
    The NYSIIS (New York State Identification and Intelligence System) algorithm.

    Encodes a name into an upper-case phonetic key. Keys longer than six
    characters are returned as the first six characters followed by the
    remainder in square brackets, e.g. ``CATARA[N]``.

    [Reference]: https://en.wikipedia.org/wiki/New_York_State_Identification_and_Intelligence_System & http://www.dropby.com/NYSIIS.html
    [Authors]: 
    """
    def __init__(self):
        super().__init__()

    # vowel list defined once on the class so it is not recreated inside loops
    __vowels : List[str] = ["A","E","I","O","U"]

    # Step 1 rewrites of the leading letters; "KN" must be tried before "K"
    __prefixes = (
        ("MAC", "MCC"),
        ("KN", "NN"),
        ("K", "C"),
        ("PH", "FF"),
        ("PF", "FF"),
        ("SCH", "SSS"),
    )

    # Step 2 endings that collapse to a single "D"
    __d_suffixes = ("DT", "RT", "RD", "NT", "ND")

    def __replaceAt(self, input : str, index : int, replace : str = "") -> str :
        """
        Return ``input`` with the text at ``index`` overwritten by ``replace``.

        ``len(replace)`` characters are overwritten; with the default empty
        ``replace`` exactly one character is removed instead.
        """
        return input[:index] + replace + input[(len(replace) or 1) + index:]

    def phonetics(self, word : str) -> str | None:
        """
        Return the NYSIIS key for ``word``.

        :param word: the name to encode; surrounding whitespace and letter
            case are ignored.
        :returns: the phonetic key, or ``None`` when ``word`` contains only
            whitespace (nothing to encode).
        :raises: whatever ``check_str`` / ``check_empty`` raise for
            non-string or empty input.
        """
        check_str(word)
        check_empty(word)

        # normalise: strip surrounding whitespace and upper-case
        # (the wiki algorithm doesn't mention this as a first step)
        code : str = word.strip().upper()

        # fail fast if nothing is left to encode (whitespace-only input)
        if not code:
            return None

        # remembered for Step 10, before any rewriting takes place
        first_letter : str = code[0]

        # Step 1. rewrite the leading letters (first matching rule only)
        for prefix, replacement in self.__prefixes:
            if code.startswith(prefix):
                code = replacement + code[len(prefix):]
                break

        # Step 2. rewrite the trailing letters; each rule is applied at most
        # once so that e.g. "...RDT" is not collapsed twice
        if code.endswith(("EE", "IE")):
            code = code[:-2] + "Y"
        if code.endswith(self.__d_suffixes):
            code = code[:-2] + "D"

        # Steps 3-4. the first character is kept; scan from the second one
        idx : int = 1

        while idx < len(code):
            char = code[idx]
            prev = code[idx - 1]

            # Step 5. (1)
            # only process letters, drop all other characters including spaces
            if char not in string.ascii_letters:
                code = self.__replaceAt(code, idx)
                # keep the current index: the next character has shifted into it
                continue

            # Step 5. (2) vowels become "A", except "EV" which becomes "AF"
            if char in self.__vowels:
                if code[idx:idx + 2] == "EV":
                    code = self.__replaceAt(code, idx, "AF")
                else:
                    code = self.__replaceAt(code, idx, "A")

            # Step 5. (3)
            elif char == "Q":
                code = self.__replaceAt(code, idx, "G")
            elif char == "Z":
                code = self.__replaceAt(code, idx, "S")
            elif char == "M":
                code = self.__replaceAt(code, idx, "N")

            # Step 5. (4) "K" becomes "N" before an "N", otherwise "C"
            elif char == "K":
                code = self.__replaceAt(code, idx, "N" if code[idx:idx + 2] == "KN" else "C")

            # NOTE(review): the Wikipedia description also rewrites a
            # mid-word "SCH" to "SSS" here; this implementation has never
            # done so - confirm whether that rule is wanted.

            # Step 5. (5)
            elif code[idx:idx + 2] == "PH":
                code = self.__replaceAt(code, idx, "FF")

            # Step 5. (6) "H" takes the preceding letter unless it sits
            # between two vowels; at the end of the string the slice is ""
            # and therefore counts as "not a vowel"
            elif char == "H" and (prev not in self.__vowels or code[idx + 1:idx + 2] not in self.__vowels):
                code = self.__replaceAt(code, idx, prev)

            # Step 5. (7) "W" after a vowel takes that vowel
            elif char == "W" and prev in self.__vowels:
                code = self.__replaceAt(code, idx, prev)

            # Step 6. collapse a letter that now repeats its predecessor
            if code[idx] == code[idx - 1]:
                code = self.__replaceAt(code, idx, "")
                continue

            idx += 1

        # Step 7. drop a trailing "S"
        if code.endswith("S"):
            code = code.removesuffix("S")

        # Step 8. a trailing "AY" becomes "Y"
        if code.endswith("AY"):
            code = code.removesuffix("AY") + "Y"

        # Step 9. drop a trailing "A"
        if code.endswith("A"):
            code = code.removesuffix("A")

        # Step 10. Ensure the output includes at minimum the first letter of the input
        if len(code) < 1:
            code = first_letter

        return code[0:6] + f'[{code[6:]}]' if len(code) > 6 else code

M linguistics_robin/utils.py => linguistics_robin/utils.py +31 -0
@@ 26,3 26,34 @@ def check_empty(word):
    """Throw exception at empty string input."""
    if not len(word):
        raise EmptyStringError('The given string is empty.')

class LazyString(str):
    """A ``str`` whose ``get`` accessor returns ``None`` rather than raising."""

    def get(self, idx, dist=None):
        """
        Return the character at ``idx``, or ``dist`` characters from ``idx``.

        ``None`` is returned when the string is empty, when ``idx`` lies
        outside ``0 .. len - 1``, or when a truthy ``dist`` would read past
        the end of the string. A falsy ``dist`` (``None`` or ``0``) selects
        the single character at ``idx``.
        """
        length = len(self)
        if length == 0 or not 0 <= idx < length:
            return None
        if not dist:
            return self[idx]
        stop = idx + dist
        return None if stop > length else self[idx:stop]


def startswith(word, matchwith):
    """
    Return True when ``word`` begins with every element of ``matchwith``.

    Elements are compared pairwise in order. A ``word`` that runs out before
    ``matchwith`` does is not a match; an empty ``matchwith`` always matches.
    """
    exhausted = object()  # sentinel that compares unequal to any real element
    remaining = iter(word)
    for expected in matchwith:
        if next(remaining, exhausted) != expected:
            return False
    return True


def endswith(word, matchwith):
    """
    Return True when ``word`` ends with every element of ``matchwith``.

    Both arguments must support ``len`` and slicing. Elements are compared
    pairwise from the end. A ``word`` shorter than ``matchwith`` is not a
    match; an empty ``matchwith`` always matches.
    """
    if len(word) < len(matchwith):
        # zip() would silently truncate to the shorter sequence and report
        # a match
        return False
    return all(have == want for have, want in zip(word[::-1], matchwith[::-1]))


def isvowel(c):
    """
    Return True when ``c`` is one of A, E, I, O, U or Y (case-insensitive).

    Falsy input (``None`` or ``""``) yields ``False`` rather than the
    argument itself, so the result is always a real ``bool``.
    """
    return bool(c) and c.upper() in {'A', 'E', 'I', 'O', 'U', 'Y'}


def isslavogermanic(s):
    """
    Return True when ``s`` contains "W", "K", "CZ" or "WITZ" (case-insensitive).

    Falsy input (``None`` or ``""``) yields ``False``.
    """
    if s:
        upper = s.upper()
        return any(marker in upper for marker in ("W", "K", "CZ", "WITZ"))
    return False
\ No newline at end of file

M tests/test_corner_cases.py => tests/test_corner_cases.py +13 -1
@@ 1,7 1,19 @@
import pytest
from linguistics_robin import Soundex, RefinedSoundex, FuzzySoundex
from linguistics_robin import Soundex, RefinedSoundex, FuzzySoundex, NYSIIS
from linguistics_robin.exceptions import EmptyStringError

def test_nysiis():
    """Check NYSIIS keys for very short inputs and the empty-string error."""
    encoder = NYSIIS()

    # input word -> expected key
    expected_by_word = {
        'bob': 'BAB',
        'aa': 'A',
        'b': 'B',
        'cat': 'CAT',
        's': 'S',
        'w': 'W',
    }
    for word, expected in expected_by_word.items():
        assert encoder.phonetics(word) == expected

    with pytest.raises(EmptyStringError):
        encoder.phonetics('')

def test_soundex():
    soundex = Soundex()

M tests/test_phonetics.py => tests/test_phonetics.py +17 -1
@@ 1,6 1,22 @@
from linguistics_robin import Metaphone, Soundex, MatchingRatingApproach,\
    FuzzySoundex, Lein, RefinedSoundex
    FuzzySoundex, Lein, RefinedSoundex, NYSIIS

def test_nysiis():
    """Check NYSIIS keys for a set of representative names."""
    # (expected key, input name)
    expected_codes = [
        ('STAD', 'stewart'),
        ('WALVAR[ANPTAN]', 'wolverhampton'),
        ('WALAN', 'William'),
        ('ZANAR', 'Zimmer'),
        ('JALAN', 'Jalen'),
        ('CARSAN', 'Carson'),
        ('CATARA[N]', 'Catherine'),
        ('CATARA[N]', 'Katherine'),
        ('LASXV', 'LouisXVI'),
    ]

    encoder = NYSIIS()
    for expected, name in expected_codes:
        assert encoder.phonetics(name) == expected

def test_metaphone():
    tests = [