~linuxgoose/linguistics-robin: Implementation of Caverphone 2 algorithm

4 files changed, 188 insertions(+), 2 deletions(-)

M linguistics_robin/phonetics/__init__.py
A linguistics_robin/phonetics/caverphone2.py
M tests/test_corner_cases.py
M tests/test_phonetics.py

M linguistics_robin/phonetics/__init__.py => linguistics_robin/phonetics/__init__.py +1 -0

@@ 7,3 7,4 @@ from .refined_soundex import *
 from .nysiis import *
 from .doublemetaphone import *
 from .caverphone import *
+from .caverphone2 import *

A linguistics_robin/phonetics/caverphone2.py => linguistics_robin/phonetics/caverphone2.py +156 -0

@@ 0,0 1,156 @@
+from .phonetic_algorithm import PhoneticAlgorithm
+from ..utils import check_str, check_empty
+from typing import List
+import string
+    
+# Vowels recognised by the specification, including its additional characters
+# ("æ","ā","ø"). The constant has a single leading underscore on purpose: a
+# double-underscore global referenced inside a class body is name-mangled (to
+# _Caverphone2__vowels) and raises NameError when the method runs.
+_VOWELS = frozenset("aeiouæāø")
+
+# Step 5 rewrites: only a leading match is replaced and the first match wins.
+_PREFIX_RULES = (
+    ("cough", "cou2f"), ("rough", "rou2f"), ("tough", "tou2f"),
+    ("enough", "enou2f"), ("trough", "trou2f"), ("gn", "2n"),
+)
+
+# Step 7 (1-17) rewrites: applied to every occurrence, in this order.
+_STEP_7_RULES = (
+    ("cq", "2q"), ("ci", "si"), ("ce", "se"), ("cy", "sy"), ("tch", "2ch"),
+    ("c", "k"), ("q", "k"), ("x", "k"), ("v", "f"), ("dg", "2g"),
+    ("tio", "sio"), ("tia", "sia"), ("d", "t"), ("ph", "fh"), ("b", "p"),
+    ("sh", "s2"), ("z", "s"),
+)
+
+# Step 7 (27-33): a run of any of these letters becomes one upper-case letter.
+_COLLAPSIBLE = frozenset("stpkfmn")
+
+
+class Caverphone2(PhoneticAlgorithm):
+    """Phonetic encoder implementing the Caverphone 2 algorithm.
+
+    Names are reduced to a fixed-width, 10-character code; shorter codes are
+    right-padded with "1".
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def phonetics(self, word):
+        """Return the 10-character Caverphone 2 code for ``word``.
+
+        ``word`` is validated with ``check_empty`` and ``check_str`` first
+        (presumably raising for empty or non-string input; the test suite
+        expects ``EmptyStringError`` for ``''``). Characters outside a-z are
+        dropped after lower-casing, so the non-ASCII entries of ``_VOWELS``
+        cannot currently be reached.
+        """
+        # Step 1.
+        check_empty(word)
+        check_str(word)
+
+        # Steps 2-3: lower-case, then keep only a-z.
+        code = "".join(c for c in word.lower() if c in string.ascii_lowercase)
+
+        # Step 4: drop a single trailing "e".
+        code = code.removesuffix("e")
+
+        # Step 5. (1-6) Slice the prefix off: str.lstrip() strips a character
+        # set, not a prefix (it would reduce the rest of "toughguy" to "y").
+        for prefix, replacement in _PREFIX_RULES:
+            if code.startswith(prefix):
+                code = replacement + code[len(prefix):]
+                break
+
+        # Step 6.
+        if code.endswith("mb"):
+            code = code.removesuffix("mb") + "m2"
+
+        # Step 7. (1-17)
+        for old, new in _STEP_7_RULES:
+            code = code.replace(old, new)
+
+        # Step 7. (18-19) Leading vowel -> "A", every other vowel -> "3".
+        code = "".join(
+            ("A" if pos == 0 else "3") if c in _VOWELS else c
+            for pos, c in enumerate(code)
+        )
+
+        # Step 7. (20)
+        code = code.replace("j", "y")
+
+        # Step 7. (21-22) Leading "y3" -> "Y3"; any other leading "y" -> "A".
+        if code.startswith("y3"):
+            code = "Y3" + code.removeprefix("y3")
+        elif code.startswith("y"):
+            code = "A" + code.removeprefix("y")
+
+        # Step 7. (23)
+        code = code.replace("y", "3")
+
+        # Step 7. (24)
+        code = code.replace("3gh3", "3kh3")
+
+        # Step 7. (25)
+        code = code.replace("gh", "22")
+
+        # Step 7. (26)
+        code = code.replace("g", "k")
+
+        # Step 7. (27-33) Collapse runs of s/t/p/k/f/m/n to one capital.
+        collapsed = []
+        for c in code:
+            if c in _COLLAPSIBLE:
+                c = c.upper()
+                if collapsed and collapsed[-1] == c:
+                    continue
+            collapsed.append(c)
+        code = "".join(collapsed)
+
+        # Step 7. (34)
+        code = code.replace("w3", "W3")
+
+        # Step 7. (35)
+        code = code.replace("wh3", "Wh3")
+
+        # Step 7. (36)
+        if code.endswith("w"):
+            code = code.removesuffix("w") + "3"
+
+        # Step 7. (37)
+        code = code.replace("w", "2")
+
+        # Step 7. (38) Only an initial "h" becomes "A".
+        if code.startswith("h"):
+            code = "A" + code.removeprefix("h")
+
+        # Step 7. (39)
+        code = code.replace("h", "2")
+
+        # Step 7. (40)
+        code = code.replace("r3", "R3")
+
+        # Step 7. (41-42)
+        if code.endswith("r"):
+            code = code.removesuffix("r") + "3"
+        code = code.replace("r", "2")
+
+        # Step 7. (43-45)
+        code = code.replace("l3", "L3")
+        if code.endswith("l"):
+            code = code.removesuffix("l") + "3"
+        code = code.replace("l", "2")
+
+        # Step 8.
+        code = code.replace("2", "")
+
+        # Step 9.
+        if code.endswith("3"):
+            code = code.removesuffix("3") + "A"
+
+        # Step 10.
+        code = code.replace("3", "")
+
+        # Steps 11-12: pad with "1", then keep exactly ten characters.
+        return (code + "1" * 10)[:10]
\ No newline at end of file

M tests/test_corner_cases.py => tests/test_corner_cases.py +12 -1

@@ 1,6 1,6 @@
 import pytest
 from linguistics_robin import Soundex, RefinedSoundex, FuzzySoundex, NYSIIS, \
-    Metaphone, DoubleMetaphone, Caverphone
+    Metaphone, DoubleMetaphone, Caverphone, Caverphone2
 from linguistics_robin.exceptions import EmptyStringError
 
 def test_caverphone():


@@ 14,6 14,17 @@ def test_caverphone():
     with pytest.raises(EmptyStringError):
         caverphone.phonetics('')
 
+def test_caverphone2():
+    """Spot-check Caverphone2 codes and its rejection of an empty string."""
+    encoder = Caverphone2()
+    expected = {'Thompson': 'TMPSN11111', 'Lee': 'LA11111111',
+                'Stevenson': 'STFNSN1111', 'Peter': 'PTA1111111'}
+    for name, code in expected.items():
+        assert encoder.phonetics(name) == code
+
+    with pytest.raises(EmptyStringError):
+        encoder.phonetics('')
+
 def test_doublemetaphone():
     dm = DoubleMetaphone()

M tests/test_phonetics.py => tests/test_phonetics.py +19 -1

@@ 1,5 1,6 @@
 from linguistics_robin import Metaphone, Soundex, MatchingRatingApproach,\
-    FuzzySoundex, Lein, RefinedSoundex, NYSIIS, DoubleMetaphone, Caverphone
+    FuzzySoundex, Lein, RefinedSoundex, NYSIIS, DoubleMetaphone, Caverphone,\
+    Caverphone2
 
 def test_caverphone():
     tests = [


@@ 12,6 13,23 @@ def test_caverphone():
     for test in tests:
         assert caverphone.phonetics(test[1]) == test[0]
 
+def test_caverphone2():
+    """Check Caverphone2 against a table of names and their expected codes."""
+    expected_by_name = {
+        'Thompson': 'TMPSN11111',
+        'Lee': 'LA11111111',
+        'Stevenson': 'STFNSN1111',
+        'Peter': 'PTA1111111',
+        'Ready': 'RTA1111111',
+        'Able': 'APA1111111',
+        'social': 'SSA1111111',
+        'Karleen': 'KLN1111111',
+        'Tudor': 'TTA1111111',
+    }
+    encoder = Caverphone2()
+    for name, code in expected_by_name.items():
+        assert encoder.phonetics(name) == code
+
 def test_nysiis():
     tests = [
         ('STAD', 'stewart'),