From e7a9c042c34cb3bfe0dd9a4f0afa04c2f82b6491 Mon Sep 17 00:00:00 2001
From: Jordan <37647414+linuxgoose@users.noreply.github.com>
Date: Fri, 28 Mar 2025 20:41:11 +0000
Subject: [PATCH] Implementation of Caverphone 2 algorithm

---
 linguistics_robin/phonetics/__init__.py    |   1 +
 linguistics_robin/phonetics/caverphone2.py | 161 +++++++++++++++++++++
 tests/test_corner_cases.py                 |  13 +-
 tests/test_phonetics.py                    |  20 ++-
 4 files changed, 193 insertions(+), 2 deletions(-)
 create mode 100644 linguistics_robin/phonetics/caverphone2.py

diff --git a/linguistics_robin/phonetics/__init__.py b/linguistics_robin/phonetics/__init__.py
index 90a3307dd93b93e9b16a24d19858df9e34d843d9..4d806f4e446b14929faa3c77a43abc1c957d8023 100644
--- a/linguistics_robin/phonetics/__init__.py
+++ b/linguistics_robin/phonetics/__init__.py
@@ -7,3 +7,4 @@ from .refined_soundex import *
 from .nysiis import *
 from .doublemetaphone import *
 from .caverphone import *
+from .caverphone2 import *
diff --git a/linguistics_robin/phonetics/caverphone2.py b/linguistics_robin/phonetics/caverphone2.py
new file mode 100644
index 0000000000000000000000000000000000000000..5a11b03ad7548dedaebfa12e6e4354e8a8954927
--- /dev/null
+++ b/linguistics_robin/phonetics/caverphone2.py
@@ -0,0 +1,161 @@
+from .phonetic_algorithm import PhoneticAlgorithm
+from ..utils import check_str, check_empty
+from typing import List, Tuple
+import string
+
+# Vowels as listed in the specification, including its extra characters
+# ("æ","ā","ø"). Step 3 strips everything outside a-z, so only the ASCII
+# vowels can match in practice.
+# NOTE: the single leading underscore is deliberate. A "__vowels" global is
+# name-mangled to "_Caverphone2__vowels" when referenced inside the class
+# body, which raises NameError at call time.
+_vowels : List[str] = ["a","e","i","o","u","æ","ā","ø"]
+
+class Caverphone2(PhoneticAlgorithm):
+    """
+    Caverphone 2.0 phonetic encoding.
+
+    Every code is exactly ten characters long: shorter results are padded
+    with "1" and longer ones are truncated.
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def phonetics(self, word):
+        """
+        Return the ten character Caverphone 2.0 code for ``word``.
+
+        ``word`` is passed to check_empty and check_str first; anything
+        those helpers raise (the tests expect EmptyStringError for '')
+        propagates to the caller.
+        """
+        # Step 1.
+        check_empty(word)
+        check_str(word)
+
+        # Step 2.
+        code = word.lower()
+
+        # Step 3. Keep only a-z.
+        code = "".join(ch for ch in code if ch in string.ascii_lowercase)
+
+        # Step 4.
+        code = code.removesuffix("e")
+
+        # Step 5. (1-6) At most one of these prefixes is rewritten.
+        prefix_replace : List[Tuple[str, str]] = [
+            ("cough","cou2f"),("rough","rou2f"),("tough","tou2f"),
+            ("enough","enou2f"),("trough","trou2f"),("gn","2n"),
+        ]
+        for old, new in prefix_replace :
+            if code.startswith(old) :
+                # removeprefix, not lstrip: lstrip strips a character set.
+                code = new + code.removeprefix(old)
+                break
+
+        # Step 6.
+        if code.endswith("mb") :
+            code = code.removesuffix("mb") + "m2"
+
+        # Step 7. (1-17) Applied in order, e.g. "ci" is handled before "c".
+        step_7_replace : List[Tuple[str, str]] = [
+            ("cq","2q"),("ci","si"),("ce","se"),("cy","sy"),("tch","2ch"),
+            ("c","k"),("q","k"),("x","k"),("v","f"),("dg","2g"),
+            ("tio","sio"),("tia","sia"),("d","t"),("ph","fh"),("b","p"),
+            ("sh","s2"),("z","s"),
+        ]
+        for old, new in step_7_replace :
+            code = code.replace(old, new)
+
+        # Step 7. (18-19) A leading vowel becomes "A", all others "3".
+        code = "".join(
+            ("A" if index == 0 else "3") if char in _vowels else char
+            for index, char in enumerate(code)
+        )
+
+        # Step 7. (20)
+        code = code.replace("j","y")
+
+        # Step 7. (21-22)
+        if code.startswith("y3") :
+            code = "Y3" + code.removeprefix("y3")
+        elif code.startswith("y") :
+            code = "A" + code.removeprefix("y")
+
+        # Step 7. (23)
+        code = code.replace("y","3")
+
+        # Step 7. (24)
+        code = code.replace("3gh3","3kh3")
+
+        # Step 7. (25)
+        code = code.replace("gh","22")
+
+        # Step 7. (26)
+        code = code.replace("g","k")
+
+        # Step 7. (27-33) Collapse each run of s/t/p/k/f/m/n into a single
+        # upper-case letter.
+        collapse : str = "stpkfmn"
+        output : List[str] = []
+        for char in code :
+            if char in collapse :
+                char = char.upper()
+                if output and output[-1] == char :
+                    continue
+            output.append(char)
+        code = "".join(output)
+
+        # Step 7. (34)
+        code = code.replace("w3","W3")
+
+        # Step 7. (35)
+        code = code.replace("wh3","Wh3")
+
+        # Step 7. (36)
+        if code.endswith("w") :
+            code = code.removesuffix("w") + "3"
+
+        # Step 7. (37)
+        code = code.replace("w","2")
+
+        # Step 7. (38) A leading h becomes "A".
+        if code.startswith("h") :
+            code = "A" + code.removeprefix("h")
+
+        # Step 7. (39)
+        code = code.replace("h","2")
+
+        # Step 7. (40)
+        code = code.replace("r3","R3")
+
+        # Step 7. (41)
+        if code.endswith("r") :
+            code = code.removesuffix("r") + "3"
+
+        # Step 7. (42)
+        code = code.replace("r","2")
+
+        # Step 7. (43)
+        code = code.replace("l3","L3")
+
+        # Step 7. (44)
+        if code.endswith("l") :
+            code = code.removesuffix("l") + "3"
+
+        # Step 7. (45)
+        code = code.replace("l","2")
+
+        # Step 8.
+        code = code.replace("2","")
+
+        # Step 9.
+        if code.endswith("3") :
+            code = code.removesuffix("3") + "A"
+
+        # Step 10.
+        code = code.replace("3","")
+
+        # Steps 11-12. Pad with "1" and keep exactly ten characters.
+        return (code + "1" * 10)[:10]
diff --git a/tests/test_corner_cases.py b/tests/test_corner_cases.py
index d02776363a607989a88ee00ee06c71299b5c1cc0..4c2d9e15e818449ba3846ad5efae9600f2db42c9 100644
--- a/tests/test_corner_cases.py
+++ b/tests/test_corner_cases.py
@@ -1,6 +1,6 @@
 import pytest
 from linguistics_robin import Soundex, RefinedSoundex, FuzzySoundex, NYSIIS, \
-    Metaphone, DoubleMetaphone, Caverphone
+    Metaphone, DoubleMetaphone, Caverphone, Caverphone2
 from linguistics_robin.exceptions import EmptyStringError
 
 def test_caverphone():
@@ -14,6 +14,17 @@ def test_caverphone():
     with pytest.raises(EmptyStringError):
         caverphone.phonetics('')
 
+def test_caverphone2():
+    caverphone = Caverphone2()
+
+    assert caverphone.phonetics('Thompson') == 'TMPSN11111'
+    assert caverphone.phonetics('Lee') == 'LA11111111'
+    assert caverphone.phonetics('Stevenson') == 'STFNSN1111'
+    assert caverphone.phonetics('Peter') == 'PTA1111111'
+
+    with pytest.raises(EmptyStringError):
+        caverphone.phonetics('')
+
 def test_doublemetaphone():
     dm = DoubleMetaphone()
 
diff --git a/tests/test_phonetics.py b/tests/test_phonetics.py
index 107611232c551ebcda8e464905eca36ed43a33db..e4c5451060b7c615a09f39e04f3e26ef00021e75 100644
--- a/tests/test_phonetics.py
+++ b/tests/test_phonetics.py
@@ -1,5 +1,6 @@
 from linguistics_robin import Metaphone, Soundex, MatchingRatingApproach,\
-    FuzzySoundex, Lein, RefinedSoundex, NYSIIS, DoubleMetaphone, Caverphone
+    FuzzySoundex, Lein, RefinedSoundex, NYSIIS, DoubleMetaphone, Caverphone,\
+    Caverphone2
 
 def test_caverphone():
     tests = [
@@ -12,6 +13,23 @@ def test_caverphone():
     for test in tests:
         assert caverphone.phonetics(test[1]) == test[0]
 
+def test_caverphone2():
+    tests = [
+        ('TMPSN11111', 'Thompson'),
+        ('LA11111111', 'Lee'),
+        ('STFNSN1111', 'Stevenson'),
+        ('PTA1111111', 'Peter'),
+        ('RTA1111111', 'Ready'),
+        ('APA1111111', 'Able'),
+        ('SSA1111111', 'social'),
+        ('KLN1111111', 'Karleen'),
+        ('TTA1111111', 'Tudor'),
+    ]
+
+    caverphone = Caverphone2()
+    for test in tests:
+        assert caverphone.phonetics(test[1]) == test[0]
+
 def test_nysiis():
     tests = [
         ('STAD', 'stewart'),