M README.md => README.md +1 -0
@@ 8,6 8,7 @@ Linguistics Robin is a Python linguistics collection that stemmed from a phoneti
* Fuzzy Soundex
* Lein
* Matching Rating Approach
+ * New York State Identification and Intelligence System (NYSIIS)
In addition, the following distance metrics:
M linguistics_robin/__init__.py => linguistics_robin/__init__.py +4 -3
@@ 1,9 1,10 @@
-"""A Python 3 phonetics library."""
+"""A Python 3 linguistics collection library."""
from .phonetics import (Soundex,
Metaphone,
MatchingRatingApproach,
FuzzySoundex,
Lein,
- RefinedSoundex)
+ RefinedSoundex,
+ NYSIIS)
-__version__ = '0.5.4'
+__version__ = '0.5.5'
M linguistics_robin/phonetics/__init__.py => linguistics_robin/phonetics/__init__.py +1 -0
@@ 4,3 4,4 @@ from .mra import *
from .fuzzy_soundex import *
from .lein import *
from .refined_soundex import *
+from .nysiis import *
A linguistics_robin/phonetics/nysiis.py => linguistics_robin/phonetics/nysiis.py +132 -0
@@ 0,0 1,132 @@
+from typing import List
+import string
+
+from ..utils import check_str, check_empty
+from .phonetic_algorithm import PhoneticAlgorithm
+
+
+class NYSIIS(PhoneticAlgorithm):
+ """
+ The NYSIIS algorithm.
+
+ [Reference]: https://en.wikipedia.org/wiki/New_York_State_Identification_and_Intelligence_System & http://www.dropby.com/NYSIIS.html
+ [Authors]:
+ """
+ def __init__(self):
+ super().__init__()
+
+ # vowel array defined outside of function as not to recreate when used in loops
+ __vowels : List[str ]= ["A","E","I","O","U"]
+
+ # Python should have this functionality built in, replacing at an index
+ def __replaceAt(self, input : str, index : int, replace : str = "") -> str :
+ return input[:index] + replace + input[(len(replace) or 1) + index:]
+
+ def phonetics(self, word : str) -> str | None:
+ check_str(word)
+ check_empty(word)
+
+ input : str = word
+
+ # fail fast if there isn't an input value to code (code defensively)
+ if (input is None or not input) :
+ return None
+
+ # strip leading and trailing whitespace
+ input = input.strip()
+
+ # make input uppercase (wiki algorithm doen't mention this as first step)
+ input = input.upper()
+
+ # Step 1.
+ if input.startswith("MAC") :
+ input = "MCC" + input.removeprefix("MAC")
+ elif input.startswith("KN") :
+ input = "NN" + input.removeprefix("KN")
+ elif input.startswith("K") :
+ input = "C" + input.removeprefix("K")
+ elif input.startswith("PH") :
+ input = "FF" + input.removeprefix("PH")
+ elif input.startswith("PF") :
+ input = "FF" + input.removeprefix("PF")
+ elif input.startswith("SCH") :
+ input = "SSS" + input.removeprefix("SCH")
+
+ # Step 2.
+ if input.endswith("EE") :
+ input = input.removesuffix("EE") + "Y"
+ elif input.endswith("IE") :
+ input = input.removesuffix("IE") + "Y"
+ for item in ["DT","RT","RD","NT","ND"] :
+ if input.endswith(item) :
+ input = input.removesuffix(item) + "D"
+
+ # Steps 3-4.
+ idx : int = 1
+
+ while idx < len(input) :
+
+ # Step 5. (1)
+ # only process letters, skip all other characters including spaces
+ if input[idx] not in string.ascii_letters :
+ input = self.self.__replaceAt(input,idx)
+ # keeps current index and restarts
+ continue
+
+ # Step 5. (2)
+ if input[idx] in self.__vowels :
+ if input[idx:idx+2] == "EV" :
+ input = self.__replaceAt(input,idx,"EV")
+ else :
+ input = self.__replaceAt(input,idx,"A")
+
+ # Step 5. (3)
+ elif input[idx] == "Q" :
+ input = self.__replaceAt(input,idx,"G")
+ elif input[idx] == "Z" :
+ input = self.__replaceAt(input,idx,"S")
+ elif input[idx] == "M" :
+ input = self.__replaceAt(input,idx,"N")
+
+ # Step 5. (4)
+ elif input[idx:idx+2] == "KN" :
+ input = self.__replaceAt(input,idx,"N")
+ elif input[idx] == "K" :
+ input = self.__replaceAt(input,idx,"C")
+
+ # Step 5. (5)
+ elif input[idx:idx+2] == "PH" :
+ input = self.__replaceAt(input,idx,"FF")
+
+ # Step 5. (6)
+ elif input[idx] == "H" and (input[idx - 1] not in self.__vowels or input[idx:idx+1] not in self.__vowels) :
+ input = self.__replaceAt(input,idx,input[idx - 1])
+
+ # Step 5. (7)
+ elif input[idx] == "W" and input[idx - 1] in self.__vowels :
+ input = self.__replaceAt(input,idx,input[idx - 1])
+
+ # Step 6.
+ if input[idx] == input[idx - 1] :
+ input = self.__replaceAt(input,idx,"")
+ continue
+
+ idx += 1
+
+ # Step 7.
+ if input.endswith("S") :
+ input = input.removesuffix("S")
+
+ # Step 8.
+ if input.endswith("AY") :
+ input = self.__replaceAt(input,idx,"AY") + "Y"
+
+ # Step 9.
+ if input.endswith("A") :
+ input = input.removesuffix("A")
+
+ # Step 10. Ensure the output includes at minimum the first letter of the input
+ if len(input) < 1:
+ input = word[0].upper()
+
+ return input[0:6] + f'[{input[6:]}]' if len(input) > 6 else input
M linguistics_robin/utils.py => linguistics_robin/utils.py +31 -0
@@ 26,3 26,34 @@ def check_empty(word):
"""Throw exception at empty string input."""
if not len(word):
raise EmptyStringError('The given string is empty.')
+
+class LazyString(str):
+ def get(self, idx, dist=None):
+ if not self:
+ return None
+ if idx < 0 or idx >= len(self):
+ return None
+ if dist:
+ if idx + dist > len(self):
+ return None
+ return self[idx:idx+dist]
+ return self[idx]
+
+
+def startswith(word, matchwith):
+ return all(map(lambda x: x[0] == x[1], zip(word, matchwith)))
+
+
+def endswith(word, matchwith):
+ return all(map(lambda x: x[0] == x[1], zip(word[::-1], matchwith[::-1])))
+
+
+def isvowel(c):
+ return c and c.upper() in {'A', 'E', 'I', 'O', 'U', 'Y'}
+
+
+def isslavogermanic(s):
+ if not s:
+ return False
+ s = s.upper()
+ return "W" in s or "K" in s or "CZ" in s or "WITZ" in s<
\ No newline at end of file
M tests/test_corner_cases.py => tests/test_corner_cases.py +13 -1
@@ 1,7 1,19 @@
import pytest
-from linguistics_robin import Soundex, RefinedSoundex, FuzzySoundex
+from linguistics_robin import Soundex, RefinedSoundex, FuzzySoundex, NYSIIS
from linguistics_robin.exceptions import EmptyStringError
+def test_nysiis():
+ nysiis = NYSIIS()
+
+ assert nysiis.phonetics('bob') == 'BAB'
+ assert nysiis.phonetics('aa') == 'A'
+ assert nysiis.phonetics('b') == 'B'
+ assert nysiis.phonetics('cat') == 'CAT'
+ assert nysiis.phonetics('s') == 'S'
+ assert nysiis.phonetics('w') == 'W'
+
+ with pytest.raises(EmptyStringError):
+ nysiis.phonetics('')
def test_soundex():
soundex = Soundex()
M tests/test_phonetics.py => tests/test_phonetics.py +17 -1
@@ 1,6 1,22 @@
from linguistics_robin import Metaphone, Soundex, MatchingRatingApproach,\
- FuzzySoundex, Lein, RefinedSoundex
+ FuzzySoundex, Lein, RefinedSoundex, NYSIIS
+def test_nysiis():
+ tests = [
+ ('STAD', 'stewart'),
+ ('WALVAR[ANPTAN]', 'wolverhampton'),
+ ('WALAN', 'William'),
+ ('ZANAR', 'Zimmer'),
+ ('JALAN', 'Jalen'),
+ ('CARSAN', 'Carson'),
+ ('CATARA[N]', 'Catherine'),
+ ('CATARA[N]', 'Katherine'),
+ ('LASXV', 'LouisXVI'),
+ ]
+
+ nysiis = NYSIIS()
+ for test in tests:
+ assert nysiis.phonetics(test[1]) == test[0]
def test_metaphone():
tests = [