From b56acc024f683b0c52d05ac296789fb3e8772a2d Mon Sep 17 00:00:00 2001
From: Jordan <37647414+linuxgoose@users.noreply.github.com>
Date: Fri, 28 Mar 2025 20:33:19 +0000
Subject: [PATCH] Implementation of Caverphone 1 algorithm

---
 README.md                                 |   1 +
 linguistics_robin/phonetics/__init__.py   |   1 +
 linguistics_robin/phonetics/caverphone.py | 153 ++++++++++++++++++++++
 tests/test_corner_cases.py                |  14 +-
 tests/test_phonetics.py                   |  13 +-
 5 files changed, 180 insertions(+), 2 deletions(-)
 create mode 100644 linguistics_robin/phonetics/caverphone.py

diff --git a/README.md b/README.md
index a1ee1676b1d03e72236968d78dbf5c054cc422b5..47e5246d69f501a1fc44d75a39a862c57f9d12d5 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,7 @@ Linguistics Robin is a Python linguistics collection that stemmed from a phoneti
  * Lein
  * Matching Rating Approach
  * New York State Identification and Intelligence System (NYSIIS)
+ * Caverphone
  
 In addition, the following distance metrics:
 
diff --git a/linguistics_robin/phonetics/__init__.py b/linguistics_robin/phonetics/__init__.py
index 3eaa7ea9948b648700ed208be5917e5e33780765..90a3307dd93b93e9b16a24d19858df9e34d843d9 100644
--- a/linguistics_robin/phonetics/__init__.py
+++ b/linguistics_robin/phonetics/__init__.py
@@ -6,3 +6,4 @@ from .lein import *
 from .refined_soundex import *
 from .nysiis import *
 from .doublemetaphone import *
+from .caverphone import *
diff --git a/linguistics_robin/phonetics/caverphone.py b/linguistics_robin/phonetics/caverphone.py
new file mode 100644
index 0000000000000000000000000000000000000000..21e0c1c861d2cf12e3d018b0f664fc7d2562c27e
--- /dev/null
+++ b/linguistics_robin/phonetics/caverphone.py
@@ -0,0 +1,153 @@
+from collections import OrderedDict
+import re
+from .phonetic_algorithm import PhoneticAlgorithm
+from ..utils import check_str, check_empty
+
+# order of rules is very important
+# this + ordered dict guarantees iteration order
+def add_to(od, tups):
+    for tup in tups:
+        od.update({tup[0]: tup[1]})
+    return od
+
+r3 = OrderedDict()
+kv3 = [("cough", "cou2f"),
+       ("rough", "rou2f"),
+       ("tough", "tou2f"),
+       ("enough", "enou2f"),
+       ("gn", "2n"),
+       ("mb", "m2")]
+r3 = add_to(r3, kv3)
+
+r4 = OrderedDict()
+kv4 = [("cq", "2q"),
+       ("ci", "si"),
+       ("ce", "se"),
+       ("cy", "sy"),
+       ("tch", "2ch"),
+       ("c", "k"),
+       ("q", "k"),
+       ("x", "k"),
+       ("v", "f"),
+       ("dg", "2g"),
+       ("tio", "sio"),
+       ("tia", "sia"),
+       ("d", "t"),
+       ("ph", "fh"),
+       ("b", "p"),
+       ("sh", "s2"),
+       ("z", "s")]
+r4 = add_to(r4, kv4)
+
+r6 = OrderedDict()
+kv6 = [("j", "y"),
+       ("^y3", "Y3"),
+       ("^y", "A"),
+       ("y", "3"),
+       ("3gh3", "3kh3"),
+       ("gh", "22"),
+       ("g", "k"),
+       ("s+", "S"),
+       ("t+", "T"),
+       ("p+", "P"),
+       ("k+", "K"),
+       ("f+", "F"),
+       ("m+", "M"),
+       ("n+", "N"),
+       ("w3", "W3"),
+       ("wh3", "Wh3"),
+       ("w$", "3"),
+       ("w", "2"),
+       ("^h", "A"),
+       ("h", "2"),
+       ("r3", "R3"),
+       ("r$", "3"),
+       ("r", "2"),
+       ("l3", "L3"),
+       ("l$", "3"),
+       ("l", "2")]
+r6 = add_to(r6, kv6)
+
+
+# x in dict is O(1)
+vowels = {"a":None, "e":None, "i":None, "o":None, "u":None}
+
+
+class Caverphone(PhoneticAlgorithm):
+    """
+    Original writeup by David Hood, with tests and Python code:
+    http://caversham.otago.ac.nz/files/working/ctp150804.pdf
+    See this site for more details, and related algorithms:
+    http://ntz-develop.blogspot.ca/2011/03/phonetic-algorithms.html
+    Example output
+    Maclaverty: MKLFTA
+    """
+    
+    def __init__(self):
+        super().__init__()
+
+    def phonetics(self, word):
+        check_str(word)
+        check_empty(word)
+
+        inp = word
+
+        # step 1. lower
+        s1 = inp.lower()
+        s = s1[::-1]
+
+        # step 2. remove end e
+        if s[0] == "e":
+            s2 = ""
+            is_end_e = True
+            for n in range(len(s)):
+                if s[n] == "e" and is_end_e:
+                    continue
+                is_end_e = False
+                s2 += s[n]
+            s2 = s2[::-1] + "e"
+        else:
+            s2 = s1
+
+        # step 3. tranform beginning of word
+        s3 = s2
+        for k in r3.keys():
+            if s2[:len(k)] == k:
+                s3 = r3[s2[:len(k)]] + s2[len(k):]
+
+        # step 4. more replacements
+        s4 = s3
+        for k in r4.keys():
+            s4 = s4.replace(k, r4[k])
+
+        # step 5. vowel at beginning with A
+        s5 = ""
+        for n in range(len(s4)):
+            if n == 0 and s4[n] in vowels:
+                s5 += "A"
+            elif s4[n] in vowels:
+                s5 += "3"
+            else:
+                s5 += s4[n]
+
+        # step 6. more replacements
+        s6 = s5
+        for k in r6.keys():
+            if "^" in k:
+                if k[1:] == s6[:len(k[1:])]:
+                    s6 = r6[k] + s6[len(k[1:]):]
+            elif "$" in k:
+                if k[:-1] == s6[-len(k[:-1]):]:
+                    s6 = s6[:-len(k[:-1])] + r6[k]
+            elif "+" in k:
+                s6 = re.sub(k, k.upper().replace("+", ""), s6)
+            else:
+                s6 = s6.replace(k, r6[k])
+
+        # step 7. if last is 3, replace with A and remove all 2, 3
+        s7 = s6
+        if s7[-1] == "3":
+            s7 = s7[:-1] + "A"
+        s7 = s7.replace("2", "")
+        s7 = s7.replace("3", "")
+        return s7
\ No newline at end of file
diff --git a/tests/test_corner_cases.py b/tests/test_corner_cases.py
index 09fdde6b67263d9532f7fef72adff26303123f3f..d02776363a607989a88ee00ee06c71299b5c1cc0 100644
--- a/tests/test_corner_cases.py
+++ b/tests/test_corner_cases.py
@@ -1,7 +1,19 @@
 import pytest
-from linguistics_robin import Soundex, RefinedSoundex, FuzzySoundex, NYSIIS, Metaphone, DoubleMetaphone
+from linguistics_robin import Soundex, RefinedSoundex, FuzzySoundex, NYSIIS, \
+    Metaphone, DoubleMetaphone, Caverphone
 from linguistics_robin.exceptions import EmptyStringError
 
+def test_caverphone():
+    caverphone = Caverphone()
+
+    assert caverphone.phonetics('maurice') == 'MRSA'
+    assert caverphone.phonetics('bob') == 'PP'
+    assert caverphone.phonetics('walter') == 'WTA'
+    assert caverphone.phonetics('Maclaverty') == 'MKLFTA'
+
+    with pytest.raises(EmptyStringError):
+        caverphone.phonetics('')
+
 def test_doublemetaphone():
     dm = DoubleMetaphone()
 
diff --git a/tests/test_phonetics.py b/tests/test_phonetics.py
index 4edfc888f7b7a290cc176b9921cfa54ee18d6d0c..107611232c551ebcda8e464905eca36ed43a33db 100644
--- a/tests/test_phonetics.py
+++ b/tests/test_phonetics.py
@@ -1,5 +1,16 @@
 from linguistics_robin import Metaphone, Soundex, MatchingRatingApproach,\
-    FuzzySoundex, Lein, RefinedSoundex, NYSIIS, DoubleMetaphone
+    FuzzySoundex, Lein, RefinedSoundex, NYSIIS, DoubleMetaphone, Caverphone
+
+def test_caverphone():
+    tests = [
+        ('MRSA', 'maurice'),
+        ('WTA', 'walter'),
+        ('MKLFTA', 'Maclaverty'),
+    ]
+
+    caverphone = Caverphone()
+    for test in tests:
+        assert caverphone.phonetics(test[1]) == test[0]
 
 def test_nysiis():
     tests = [