~linuxgoose/linguistics-robin: Addition of NYSIIS

7 files changed, 199 insertions(+), 5 deletions(-)

M README.md
M linguistics_robin/__init__.py
M linguistics_robin/phonetics/__init__.py
A linguistics_robin/phonetics/nysiis.py
M linguistics_robin/utils.py
M tests/test_corner_cases.py
M tests/test_phonetics.py

M README.md => README.md +1 -0

@@ 8,6 8,7 @@ Linguistics Robin is a Python linguistics collection that stemmed from a phoneti
  * Fuzzy Soundex
  * Lein
  * Matching Rating Approach
+ * New York State Identification and Intelligence System (NYSIIS)
  
 In addition, the following distance metrics:

M linguistics_robin/__init__.py => linguistics_robin/__init__.py +4 -3

@@ 1,9 1,10 @@
-"""A Python 3 phonetics library."""
+"""A Python 3 linguistics collection library."""
 from .phonetics import (Soundex,
                         Metaphone,
                         MatchingRatingApproach,
                         FuzzySoundex,
                         Lein,
-                        RefinedSoundex)
+                        RefinedSoundex,
+                        NYSIIS)
 
-__version__ = '0.5.4'
+__version__ = '0.5.5'

M linguistics_robin/phonetics/__init__.py => linguistics_robin/phonetics/__init__.py +1 -0

@@ 4,3 4,4 @@ from .mra import *
 from .fuzzy_soundex import *
 from .lein import *
 from .refined_soundex import *
+from .nysiis import *

A linguistics_robin/phonetics/nysiis.py => linguistics_robin/phonetics/nysiis.py +132 -0

@@ 0,0 1,132 @@
+from typing import List
+import string
+
+from ..utils import check_str, check_empty
+from .phonetic_algorithm import PhoneticAlgorithm
+
+
+class NYSIIS(PhoneticAlgorithm):
+    """
+    The NYSIIS (New York State Identification and Intelligence System) algorithm.
+
+    Encodes a name as an upper-case phonetic key. Keys of up to six characters
+    are returned unchanged; for longer keys the first six characters are
+    followed by the remainder in square brackets, e.g. ``CATARA[N]``.
+
+    [Reference]: https://en.wikipedia.org/wiki/New_York_State_Identification_and_Intelligence_System & http://www.dropby.com/NYSIIS.html
+    [Authors]: 
+    """
+    def __init__(self):
+        super().__init__()
+
+    # Vowels recognised by the algorithm ("Y" is deliberately not included).
+    # Defined at class level so the list is not rebuilt on every call.
+    __vowels : List[str] = ["A","E","I","O","U"]
+
+    def __replaceAt(self, input : str, index : int, replace : str = "") -> str :
+        """
+        Return ``input`` with the text at ``index`` overwritten by ``replace``.
+
+        ``len(replace)`` characters starting at ``index`` are overwritten.
+        With an empty ``replace`` the single character at ``index`` is deleted.
+        """
+        return input[:index] + replace + input[(len(replace) or 1) + index:]
+
+    def phonetics(self, word : str) -> str | None:
+        """
+        Return the NYSIIS key for ``word``.
+
+        :param word: the name to encode; surrounding whitespace is ignored
+            and the name is upper-cased before encoding.
+        :return: the phonetic key, or ``None`` when ``word`` contains only
+            whitespace.
+        :raises EmptyStringError: from ``check_empty`` when ``word`` is empty.
+            ``check_str`` presumably rejects non-string input (defined in
+            ``utils``; not verified here).
+        """
+        check_str(word)
+        check_empty(word)
+
+        # Normalise: the reference algorithm works on upper-case letters only.
+        key = word.strip().upper()
+
+        # Nothing left to encode once whitespace is stripped.
+        if not key:
+            return None
+
+        # Step 1. Translate the first matching prefix only. Order matters:
+        # "KN" must be tested before the single "K".
+        prefix_rules = (
+            ("MAC", "MCC"),
+            ("KN", "NN"),
+            ("K", "C"),
+            ("PH", "FF"),
+            ("PF", "FF"),
+            ("SCH", "SSS"),
+        )
+        for prefix, replacement in prefix_rules:
+            if key.startswith(prefix):
+                key = replacement + key.removeprefix(prefix)
+                break
+
+        # Step 2. Translate at most one suffix. Applying the "D" rules
+        # repeatedly would wrongly turn e.g. BRANDT -> BRAND -> BRAD.
+        if key.endswith(("EE", "IE")):
+            key = key[:-2] + "Y"
+        elif key.endswith(("DT", "RT", "RD", "NT", "ND")):
+            key = key[:-2] + "D"
+
+        # Steps 3-4. The first character is kept as-is; scan from the second.
+        idx = 1
+
+        while idx < len(key):
+            current = key[idx]
+            previous = key[idx - 1]
+
+            # Step 5. (1)
+            # only process letters, drop all other characters including spaces
+            if current not in string.ascii_letters:
+                key = self.__replaceAt(key, idx)
+                # keep the current index and re-examine the character now there
+                continue
+
+            # Step 5. (2) vowels: "EV" -> "AF", any other vowel -> "A"
+            if current in self.__vowels:
+                if key[idx:idx + 2] == "EV":
+                    key = self.__replaceAt(key, idx, "AF")
+                else:
+                    key = self.__replaceAt(key, idx, "A")
+
+            # Step 5. (3)
+            elif current == "Q":
+                key = self.__replaceAt(key, idx, "G")
+            elif current == "Z":
+                key = self.__replaceAt(key, idx, "S")
+            elif current == "M":
+                key = self.__replaceAt(key, idx, "N")
+
+            # Step 5. (4) "KN" -> "N" (the K becomes N, the duplicate is
+            # collapsed by step 6 on the next iteration), otherwise "K" -> "C"
+            elif key[idx:idx + 2] == "KN":
+                key = self.__replaceAt(key, idx, "N")
+            elif current == "K":
+                key = self.__replaceAt(key, idx, "C")
+
+            # Step 5. (5) "SCH" -> "SSS", "PH" -> "FF"
+            elif key[idx:idx + 3] == "SCH":
+                key = self.__replaceAt(key, idx, "SSS")
+            elif key[idx:idx + 2] == "PH":
+                key = self.__replaceAt(key, idx, "FF")
+
+            # Step 5. (6) "H" not surrounded by vowels takes the previous
+            # character. The slice yields "" at the end of the key, which
+            # counts as "not a vowel".
+            elif current == "H" and (previous not in self.__vowels or key[idx + 1:idx + 2] not in self.__vowels):
+                key = self.__replaceAt(key, idx, previous)
+
+            # Step 5. (7) "W" after a vowel takes the previous character
+            elif current == "W" and previous in self.__vowels:
+                key = self.__replaceAt(key, idx, previous)
+
+            # Step 6. Collapse a character that repeats its predecessor and
+            # re-examine the character that slides into this position.
+            if key[idx] == key[idx - 1]:
+                key = self.__replaceAt(key, idx, "")
+                continue
+
+            idx += 1
+
+        # Step 7. Drop a single trailing "S".
+        key = key.removesuffix("S")
+
+        # Step 8. A trailing "AY" becomes "Y".
+        if key.endswith("AY"):
+            key = key[:-2] + "Y"
+
+        # Step 9. Drop a single trailing "A".
+        key = key.removesuffix("A")
+
+        # Step 10. Ensure the output includes at minimum the first letter of the input
+        if not key:
+            key = word.strip()[0].upper()
+
+        if len(key) > 6:
+            return f"{key[:6]}[{key[6:]}]"
+        return key

M linguistics_robin/utils.py => linguistics_robin/utils.py +31 -0

@@ 26,3 26,34 @@ def check_empty(word):
     """Throw exception at empty string input."""
     if not len(word):
         raise EmptyStringError('The given string is empty.')
+
+class LazyString(str):
+    """A ``str`` subclass with a bounds-tolerant ``get`` accessor."""
+
+    def get(self, idx, dist=None):
+        """Return the character at ``idx``, or ``dist`` characters from ``idx``.
+
+        ``None`` is returned instead of raising when the string is empty,
+        when ``idx`` lies outside ``0 .. len(self) - 1``, or when a truthy
+        ``dist`` would make the slice run past the end of the string.
+        """
+        size = len(self)
+        if size == 0 or not 0 <= idx < size:
+            return None
+        if not dist:
+            # No (or zero) distance requested: single-character lookup.
+            return self[idx]
+        stop = idx + dist
+        return self[idx:stop] if stop <= size else None
+
+
+def startswith(word, matchwith):
+    """Return True when the sequence ``word`` begins with ``matchwith``.
+
+    Elements are compared pairwise. A ``word`` shorter than ``matchwith``
+    can never start with it, so that case is rejected up front; ``zip``
+    alone would silently stop at the shorter input and report a match.
+    """
+    if len(matchwith) > len(word):
+        return False
+    return all(a == b for a, b in zip(word, matchwith))
+
+
+def endswith(word, matchwith):
+    """Return True when the sequence ``word`` ends with ``matchwith``.
+
+    Elements are compared pairwise from the end. A ``word`` shorter than
+    ``matchwith`` can never end with it, so that case is rejected up front;
+    ``zip`` alone would silently stop at the shorter input and report a match.
+    """
+    if len(matchwith) > len(word):
+        return False
+    return all(a == b for a, b in zip(word[::-1], matchwith[::-1]))
+
+
+def isvowel(c):
+    """Tell whether ``c`` is one of A, E, I, O, U or Y (case-insensitive).
+
+    A falsy ``c`` (``None`` or an empty string) is handed back unchanged
+    rather than converted to ``False``.
+    """
+    if not c:
+        return c
+    return c.upper() in {'A', 'E', 'I', 'O', 'U', 'Y'}
+
+
+def isslavogermanic(s):
+    """Return True when ``s`` contains a Slavo-Germanic marker.
+
+    The check is case-insensitive and looks for "W", "K", "CZ" or "WITZ"
+    anywhere in ``s``. An empty or ``None`` input yields ``False``.
+    """
+    if not s:
+        return False
+    s = s.upper()
+    # "WITZ" is already covered by "W"; it is kept so the marker list stays
+    # recognisable against the original rule set.
+    return any(marker in s for marker in ("W", "K", "CZ", "WITZ"))
\ No newline at end of file

M tests/test_corner_cases.py => tests/test_corner_cases.py +13 -1

@@ 1,7 1,19 @@
 import pytest
-from linguistics_robin import Soundex, RefinedSoundex, FuzzySoundex
+from linguistics_robin import Soundex, RefinedSoundex, FuzzySoundex, NYSIIS
 from linguistics_robin.exceptions import EmptyStringError
 
+def test_nysiis():
+    """Check NYSIIS keys for very short inputs and the empty-string error."""
+    encoder = NYSIIS()
+
+    # input name -> expected key
+    expected_keys = {
+        'bob': 'BAB',
+        'aa': 'A',
+        'b': 'B',
+        'cat': 'CAT',
+        's': 'S',
+        'w': 'W',
+    }
+    for name, expected in expected_keys.items():
+        assert encoder.phonetics(name) == expected
+
+    with pytest.raises(EmptyStringError):
+        encoder.phonetics('')
 
 def test_soundex():
     soundex = Soundex()

M tests/test_phonetics.py => tests/test_phonetics.py +17 -1

@@ 1,6 1,22 @@
 from linguistics_robin import Metaphone, Soundex, MatchingRatingApproach,\
-    FuzzySoundex, Lein, RefinedSoundex
+    FuzzySoundex, Lein, RefinedSoundex, NYSIIS
 
+def test_nysiis():
+    """Check NYSIIS keys for a set of representative names."""
+    # (expected key, input name)
+    cases = (
+        ('STAD', 'stewart'),
+        ('WALVAR[ANPTAN]', 'wolverhampton'),
+        ('WALAN', 'William'),
+        ('ZANAR', 'Zimmer'),
+        ('JALAN', 'Jalen'),
+        ('CARSAN', 'Carson'),
+        ('CATARA[N]', 'Catherine'),
+        ('CATARA[N]', 'Katherine'),
+        ('LASXV', 'LouisXVI'),
+    )
+
+    encoder = NYSIIS()
+    for expected, name in cases:
+        assert encoder.phonetics(name) == expected
 
 def test_metaphone():
     tests = [