From 7d73f1d3b498b8104119c7b1764b5c411e2544b1 Mon Sep 17 00:00:00 2001 From: Jordan <37647414+linuxgoose@users.noreply.github.com> Date: Fri, 28 Mar 2025 00:13:00 +0000 Subject: [PATCH] Addition of NYSIIS #1 --- README.md | 1 + linguistics_robin/__init__.py | 7 +- linguistics_robin/phonetics/__init__.py | 1 + linguistics_robin/phonetics/nysiis.py | 132 ++++++++++++++++++++++++ linguistics_robin/utils.py | 31 ++++++ tests/test_corner_cases.py | 14 ++- tests/test_phonetics.py | 18 +++- 7 files changed, 199 insertions(+), 5 deletions(-) create mode 100644 linguistics_robin/phonetics/nysiis.py diff --git a/README.md b/README.md index 87e2ac202678f925ebda952557de5a4c6c34f5e0..7d5e800b39f63f4646215b9dca38707e8de9c362 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,7 @@ Linguistics Robin is a Python linguistics collection that stemmed from a phoneti * Fuzzy Soundex * Lein * Matching Rating Approach + * New York State Identification and Intelligence System (NYSIIS) In addition, the following distance metrics: diff --git a/linguistics_robin/__init__.py b/linguistics_robin/__init__.py index 1edd35d15f188a76de23088ff3c0aae6ac2e4d5f..4b4e65936585c1b518cd63bd72991cafd9256382 100644 --- a/linguistics_robin/__init__.py +++ b/linguistics_robin/__init__.py @@ -1,9 +1,10 @@ -"""A Python 3 phonetics library.""" +"""A Python 3 linguistics collection library.""" from .phonetics import (Soundex, Metaphone, MatchingRatingApproach, FuzzySoundex, Lein, - RefinedSoundex) + RefinedSoundex, + NYSIIS) -__version__ = '0.5.4' +__version__ = '0.5.5' diff --git a/linguistics_robin/phonetics/__init__.py b/linguistics_robin/phonetics/__init__.py index 9544b135c3d4203e498cc874081098a5aec23fa1..f07861aa056e13c66b286d6b58b405a9fb116928 100644 --- a/linguistics_robin/phonetics/__init__.py +++ b/linguistics_robin/phonetics/__init__.py @@ -4,3 +4,4 @@ from .mra import * from .fuzzy_soundex import * from .lein import * from .refined_soundex import * +from .nysiis import * diff --git 
class NYSIIS(PhoneticAlgorithm):
    """
    The NYSIIS algorithm.

    Produces an upper-case phonetic code. A code longer than six characters is
    returned as its first six characters followed by the remainder in square
    brackets (for example ``'CATARA[N]'``).

    [Reference]: https://en.wikipedia.org/wiki/New_York_State_Identification_and_Intelligence_System & http://www.dropby.com/NYSIIS.html
    [Authors]:
    """
    def __init__(self):
        super().__init__()

    # Class-level so the tables are not rebuilt on every call.
    # Kept as a list rather than a str: the empty look-ahead slice at the end
    # of a word must NOT count as a vowel, and `"" in "AEIOU"` would be True.
    __vowels: List[str] = ["A", "E", "I", "O", "U"]

    # Step 1 prefix rewrites. First match wins, so "KN" must precede "K".
    __prefixes = (
        ("MAC", "MCC"),
        ("KN", "NN"),
        ("K", "C"),
        ("PH", "FF"),
        ("PF", "FF"),
        ("SCH", "SSS"),
    )

    # Step 2 suffix rewrites. Only the first match is applied.
    __suffixes = (
        ("EE", "Y"),
        ("IE", "Y"),
        ("DT", "D"),
        ("RT", "D"),
        ("RD", "D"),
        ("NT", "D"),
        ("ND", "D"),
    )

    def __replaceAt(self, input: str, index: int, replace: str = "") -> str:
        """
        Return `input` with the text at `index` replaced by `replace`.

        `len(replace)` characters are overwritten; an empty `replace` deletes
        the single character at `index`.
        """
        return input[:index] + replace + input[(len(replace) or 1) + index:]

    def phonetics(self, word: str) -> str | None:
        """
        Return the NYSIIS code of `word`.

        :param word: the name to encode; surrounding whitespace is ignored and
            case does not matter.
        :return: the upper-case code, with anything past the sixth character
            wrapped in square brackets, or None when `word` contains only
            whitespace.
        :raises EmptyStringError: from `check_empty` when `word` is empty.
            `check_str` also validates the argument first (presumably
            rejecting non-string input; confirm against utils).
        """
        check_str(word)
        check_empty(word)

        # Normalise before anything else: every rule below matches upper-case
        # text (the wiki algorithm doesn't mention this as a first step).
        code: str = word.strip().upper()

        # Nothing left to encode once whitespace is stripped. This guard has
        # to run after strip(); before it, the validators above have already
        # rejected empty input and the guard could never fire.
        if not code:
            return None

        # Step 10 falls back to the first character of the cleaned input, not
        # of the raw argument, so leading whitespace can never be returned.
        first_char: str = code[0]

        # Step 1. Rewrite the leading letters.
        for prefix, substitute in self.__prefixes:
            if code.startswith(prefix):
                code = substitute + code.removeprefix(prefix)
                break

        # Step 2. Rewrite the trailing letters. Stop at the first match:
        # without the break a result such as "ARND" (from "ARNDT") would be
        # rewritten a second time by the "ND" rule.
        for suffix, substitute in self.__suffixes:
            if code.endswith(suffix):
                code = code.removesuffix(suffix) + substitute
                break

        vowels = self.__vowels

        # Steps 3-4. The first character is kept as is; scan from the second.
        idx: int = 1

        while idx < len(code):
            current = code[idx]
            previous = code[idx - 1]

            # Step 5. (1)
            # Only letters are encoded; drop anything else (spaces, digits,
            # punctuation) and re-examine the character that slides into idx.
            if current not in string.ascii_letters:
                code = self.__replaceAt(code, idx)
                continue

            # Step 5. (2) "EV" becomes "AF", any other vowel becomes "A".
            if current in vowels:
                if code[idx:idx + 2] == "EV":
                    code = self.__replaceAt(code, idx, "AF")
                else:
                    code = self.__replaceAt(code, idx, "A")

            # Step 5. (3)
            elif current == "Q":
                code = self.__replaceAt(code, idx, "G")
            elif current == "Z":
                code = self.__replaceAt(code, idx, "S")
            elif current == "M":
                code = self.__replaceAt(code, idx, "N")

            # Step 5. (4) Only the "K" is overwritten; the resulting "NN" is
            # collapsed by Step 6 on the next iteration.
            elif code[idx:idx + 2] == "KN":
                code = self.__replaceAt(code, idx, "N")
            elif current == "K":
                code = self.__replaceAt(code, idx, "C")

            # Step 5. (5)
            elif code[idx:idx + 2] == "PH":
                code = self.__replaceAt(code, idx, "FF")

            # Step 5. (6) "H" survives only between two vowels. The look-ahead
            # slice is "" at the end of the word, which is not a vowel.
            elif current == "H" and (
                previous not in vowels or code[idx + 1:idx + 2] not in vowels
            ):
                code = self.__replaceAt(code, idx, previous)

            # Step 5. (7)
            elif current == "W" and previous in vowels:
                code = self.__replaceAt(code, idx, previous)

            # Step 6. Collapse a repeated letter and stay on the same index.
            if code[idx] == code[idx - 1]:
                code = self.__replaceAt(code, idx, "")
                continue

            idx += 1

        # Step 7.
        if code.endswith("S"):
            code = code.removesuffix("S")

        # Step 8. A trailing "AY" becomes "Y".
        if code.endswith("AY"):
            code = code.removesuffix("AY") + "Y"

        # Step 9.
        if code.endswith("A"):
            code = code.removesuffix("A")

        # Step 10. Ensure the output includes at minimum the first letter of the input
        if not code:
            code = first_char

        return code[0:6] + f'[{code[6:]}]' if len(code) > 6 else code
def test_nysiis():
    """Check the NYSIIS code produced for a handful of sample names."""
    # Each entry is (expected code, input word).
    samples = (
        ('STAD', 'stewart'),
        ('WALVAR[ANPTAN]', 'wolverhampton'),
        ('WALAN', 'William'),
        ('ZANAR', 'Zimmer'),
        ('JALAN', 'Jalen'),
        ('CARSAN', 'Carson'),
        ('CATARA[N]', 'Catherine'),
        ('CATARA[N]', 'Katherine'),
        ('LASXV', 'LouisXVI'),
    )

    encoder = NYSIIS()
    for expected, name in samples:
        assert encoder.phonetics(name) == expected