From 7d73f1d3b498b8104119c7b1764b5c411e2544b1 Mon Sep 17 00:00:00 2001
From: Jordan <37647414+linuxgoose@users.noreply.github.com>
Date: Fri, 28 Mar 2025 00:13:00 +0000
Subject: [PATCH] Addition of NYSIIS

#1
---
 README.md                               |   1 +
 linguistics_robin/__init__.py           |   7 +-
 linguistics_robin/phonetics/__init__.py |   1 +
 linguistics_robin/phonetics/nysiis.py   | 132 ++++++++++++++++++++++++
 linguistics_robin/utils.py              |  31 ++++++
 tests/test_corner_cases.py              |  14 ++-
 tests/test_phonetics.py                 |  18 +++-
 7 files changed, 199 insertions(+), 5 deletions(-)
 create mode 100644 linguistics_robin/phonetics/nysiis.py

diff --git a/README.md b/README.md
index 87e2ac202678f925ebda952557de5a4c6c34f5e0..7d5e800b39f63f4646215b9dca38707e8de9c362 100644
--- a/README.md
+++ b/README.md
@@ -8,6 +8,7 @@ Linguistics Robin is a Python linguistics collection that stemmed from a phoneti
  * Fuzzy Soundex
  * Lein
  * Matching Rating Approach
+ * New York State Identification and Intelligence System (NYSIIS)
  
 In addition, the following distance metrics:
 
diff --git a/linguistics_robin/__init__.py b/linguistics_robin/__init__.py
index 1edd35d15f188a76de23088ff3c0aae6ac2e4d5f..4b4e65936585c1b518cd63bd72991cafd9256382 100644
--- a/linguistics_robin/__init__.py
+++ b/linguistics_robin/__init__.py
@@ -1,9 +1,10 @@
-"""A Python 3 phonetics library."""
+"""A Python 3 linguistics collection library."""
 from .phonetics import (Soundex,
                         Metaphone,
                         MatchingRatingApproach,
                         FuzzySoundex,
                         Lein,
-                        RefinedSoundex)
+                        RefinedSoundex,
+                        NYSIIS)
 
-__version__ = '0.5.4'
+__version__ = '0.5.5'
diff --git a/linguistics_robin/phonetics/__init__.py b/linguistics_robin/phonetics/__init__.py
index 9544b135c3d4203e498cc874081098a5aec23fa1..f07861aa056e13c66b286d6b58b405a9fb116928 100644
--- a/linguistics_robin/phonetics/__init__.py
+++ b/linguistics_robin/phonetics/__init__.py
@@ -4,3 +4,4 @@ from .mra import *
 from .fuzzy_soundex import *
 from .lein import *
 from .refined_soundex import *
+from .nysiis import *
diff --git a/linguistics_robin/phonetics/nysiis.py b/linguistics_robin/phonetics/nysiis.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ad0b2f1f65d25fadf1cd4ab0a434f18e86b0fa3
--- /dev/null
+++ b/linguistics_robin/phonetics/nysiis.py
@@ -0,0 +1,132 @@
+from typing import List
+import string
+
+from ..utils import check_str, check_empty
+from .phonetic_algorithm import PhoneticAlgorithm
+
+
+class NYSIIS(PhoneticAlgorithm):
+    """
+    The NYSIIS (New York State Identification and Intelligence System) algorithm.
+
+    [Reference]: https://en.wikipedia.org/wiki/New_York_State_Identification_and_Intelligence_System & http://www.dropby.com/NYSIIS.html
+    [Authors]: 
+    """
+    def __init__(self):
+        super().__init__()
+
+    # class-level vowel list so it is not rebuilt on every loop iteration
+    __vowels : List[str] = ["A","E","I","O","U"]
+
+    # overwrite len(replace) characters (at least one) at index; an empty replace deletes one character
+    def __replaceAt(self, input : str, index : int, replace : str = "") -> str :
+        return input[:index] + replace + input[(len(replace) or 1) + index:]
+
+    def phonetics(self, word : str) -> str | None:
+        """
+        Return the NYSIIS code of `word`, upper-cased.
+
+        The first six characters are returned as-is; anything beyond them is
+        appended inside square brackets (e.g. 'Catherine' -> 'CATARA[N]').
+        Characters other than ASCII letters after the first one are dropped.
+        An empty `word` is rejected by check_empty (EmptyStringError).
+        """
+        check_str(word)
+        check_empty(word)
+
+        # normalise first: the rules below are written for trimmed, upper-case
+        # input (the wiki description does not list this as an explicit step)
+        input : str = word.strip().upper()
+
+        # Step 1.
+        if input.startswith("MAC") :
+            input = "MCC" + input.removeprefix("MAC")
+        elif input.startswith("KN") :
+            input = "NN" + input.removeprefix("KN")
+        elif input.startswith("K") :
+            input = "C" + input.removeprefix("K")
+        elif input.startswith("PH") :
+            input = "FF" + input.removeprefix("PH")
+        elif input.startswith("PF") :
+            input = "FF" + input.removeprefix("PF")
+        elif input.startswith("SCH") :
+            input = "SSS" + input.removeprefix("SCH")
+
+        # Step 2.
+        if input.endswith("EE") :
+            input = input.removesuffix("EE") + "Y"
+        elif input.endswith("IE") :
+            input = input.removesuffix("IE") + "Y"
+        for item in ["DT","RT","RD","NT","ND"] :
+            if input.endswith(item) :
+                input = input.removesuffix(item) + "D"
+
+        # Steps 3-4. scanning starts at the second character; the first is only touched by Step 1
+        idx : int = 1
+
+        while idx < len(input) :
+
+            # Step 5. (1)
+            # only process letters, skip all other characters including spaces
+            if input[idx] not in string.ascii_letters :
+                input = self.__replaceAt(input,idx)
+                # keeps current index and restarts
+                continue
+
+            # Step 5. (2)
+            if input[idx] in self.__vowels :
+                if input[idx:idx+2] == "EV" :
+                    input = self.__replaceAt(input,idx,"EV")
+                else :
+                    input = self.__replaceAt(input,idx,"A")
+
+            # Step 5. (3)
+            elif input[idx] == "Q" :
+                input = self.__replaceAt(input,idx,"G")
+            elif input[idx] == "Z" :
+                input = self.__replaceAt(input,idx,"S")
+            elif input[idx] == "M" :
+                input = self.__replaceAt(input,idx,"N")
+
+            # Step 5. (4)
+            elif input[idx:idx+2] == "KN" :
+                input = self.__replaceAt(input,idx,"N")
+            elif input[idx] ==  "K" :
+                input = self.__replaceAt(input,idx,"C")
+
+            # Step 5. (5)
+            elif input[idx:idx+2] == "PH" :
+                input = self.__replaceAt(input,idx,"FF")
+
+            # Step 5. (6) H survives only between two vowels; a missing next letter counts as a non-vowel
+            elif input[idx] == "H" and (input[idx - 1] not in self.__vowels or input[idx+1:idx+2] not in self.__vowels) :
+                input = self.__replaceAt(input,idx,input[idx - 1])
+
+            # Step 5. (7)
+            elif input[idx] == "W" and input[idx - 1]  in self.__vowels :
+                input = self.__replaceAt(input,idx,input[idx - 1])
+
+            # Step 6. drop the current letter if it now repeats the previous one, then re-examine this index
+            if input[idx] == input[idx - 1] :
+                input = self.__replaceAt(input,idx,"")
+                continue
+
+            idx += 1
+
+        # Step 7.
+        if input.endswith("S") :
+            input = input.removesuffix("S")
+
+        # Step 8. a trailing "AY" collapses to "Y"
+        if input.endswith("AY") :
+            input = input.removesuffix("AY") + "Y"
+
+        # Step 9.
+        if input.endswith("A") :
+            input = input.removesuffix("A")
+
+        # Step 10. Ensure the output includes at minimum the first letter of the input
+        if len(input) < 1:
+            input = word[0].upper()
+
+        return input[0:6] + f'[{input[6:]}]' if len(input) > 6 else input
diff --git a/linguistics_robin/utils.py b/linguistics_robin/utils.py
index 2f73929e3ef6fe8a921b6df4f741085c0edcf361..4bcc7f1ec15fec4f56ae5228182d94306904b257 100644
--- a/linguistics_robin/utils.py
+++ b/linguistics_robin/utils.py
@@ -26,3 +26,34 @@ def check_empty(word):
     """Throw exception at empty string input."""
     if not len(word):
         raise EmptyStringError('The given string is empty.')
+
+class LazyString(str):
+    def get(self, idx, dist=None):
+        # Bounds-checked read: yields None where plain indexing would fail.
+        size = len(self)
+        if size == 0 or idx < 0 or idx >= size:
+            return None
+        if not dist:
+            return self[idx]
+        stop = idx + dist
+        # A window that would run past the end is rejected, not truncated.
+        return self[idx:stop] if stop <= size else None
+
+
+def startswith(word, matchwith):  # True only when word is at least as long as matchwith and begins with it
+    return len(word) >= len(matchwith) and all(a == b for a, b in zip(word, matchwith))
+
+
+def endswith(word, matchwith):  # True only when word is at least as long as matchwith and ends with it
+    return len(word) >= len(matchwith) and all(a == b for a, b in zip(word[::-1], matchwith[::-1]))
+
+
+def isvowel(c):  # a falsy c is handed back unchanged; otherwise tests c, upper-cased, against A, E, I, O, U, Y
+    return c.upper() in ('A', 'E', 'I', 'O', 'U', 'Y') if c else c
+
+
+def isslavogermanic(s):
+    # Case-insensitive scan for the markers W, K, CZ and WITZ; falsy input is never a match.
+    upper = s.upper() if s else ""
+    markers = ("W", "K", "CZ", "WITZ")
+    return any(marker in upper for marker in markers)
\ No newline at end of file
diff --git a/tests/test_corner_cases.py b/tests/test_corner_cases.py
index 49660ca9ab4167cf11e92f39da15d8935db127c7..503350d98d8c425522120afc215b3675853d0900 100644
--- a/tests/test_corner_cases.py
+++ b/tests/test_corner_cases.py
@@ -1,7 +1,19 @@
 import pytest
-from linguistics_robin import Soundex, RefinedSoundex, FuzzySoundex
+from linguistics_robin import Soundex, RefinedSoundex, FuzzySoundex, NYSIIS
 from linguistics_robin.exceptions import EmptyStringError
 
+def test_nysiis():
+    encoder = NYSIIS()
+    # Very short inputs, keyed by input word with the expected code as value.
+    expected = {
+        'bob': 'BAB', 'aa': 'A', 'b': 'B',
+        'cat': 'CAT', 's': 'S', 'w': 'W',
+    }
+    for word, code in expected.items():
+        assert encoder.phonetics(word) == code
+
+    with pytest.raises(EmptyStringError):
+        encoder.phonetics('')
 
 def test_soundex():
     soundex = Soundex()
diff --git a/tests/test_phonetics.py b/tests/test_phonetics.py
index 14e938eff8a19d34b99b24a87b0da5c408a89a58..1662ec0d603565af7c2d7e542997d2dfb042dd09 100644
--- a/tests/test_phonetics.py
+++ b/tests/test_phonetics.py
@@ -1,6 +1,22 @@
 from linguistics_robin import Metaphone, Soundex, MatchingRatingApproach,\
-    FuzzySoundex, Lein, RefinedSoundex
+    FuzzySoundex, Lein, RefinedSoundex, NYSIIS
 
+def test_nysiis():
+    # (expected code, input word) pairs
+    cases = (
+        ('STAD', 'stewart'),
+        ('WALVAR[ANPTAN]', 'wolverhampton'),
+        ('WALAN', 'William'),
+        ('ZANAR', 'Zimmer'),
+        ('JALAN', 'Jalen'),
+        ('CARSAN', 'Carson'),
+        ('CATARA[N]', 'Catherine'),
+        ('CATARA[N]', 'Katherine'),
+        ('LASXV', 'LouisXVI'),
+    )
+    encoder = NYSIIS()
+    for expected, word in cases:
+        assert encoder.phonetics(word) == expected
 
 def test_metaphone():
     tests = [