def add_to(od, tups):
    """Insert ``(key, value)`` pairs into ``od`` one by one, in order.

    The order of the replacement rules is very important for the phonetic
    tables built with this helper: feeding the pairs sequentially into an
    ordered mapping guarantees that later iteration follows the order in
    which the rules were listed.

    Args:
        od: Mutable mapping to fill (modified in place).
        tups: Iterable of indexable pairs; element 0 is the key and
            element 1 is the value.  A key that occurs more than once
            keeps its first position and takes the last value.

    Returns:
        The same ``od`` object, to allow ``table = add_to(table, pairs)``.
    """
    for tup in tups:
        # Plain item assignment; no need to build a temporary one-entry
        # dict per pair just to call ``update``.
        od[tup[0]] = tup[1]
    return od
# Rule tables.  The order of the rules inside every table is very important,
# so each table is an OrderedDict built from an explicitly ordered list.
# The digits "2" and "3" are internal placeholder markers that are removed
# again at the very end of the encoding.

# step 3: irregular spellings, replaced only at the beginning of the word
kv3 = [("cough", "cou2f"),
       ("rough", "rou2f"),
       ("tough", "tou2f"),
       ("enough", "enou2f"),
       ("gn", "2n")]
r3 = OrderedDict(kv3)

# step 4: plain substring replacements, applied everywhere in the word
kv4 = [("cq", "2q"),
       ("ci", "si"),
       ("ce", "se"),
       ("cy", "sy"),
       ("tch", "2ch"),
       ("c", "k"),
       ("q", "k"),
       ("x", "k"),
       ("v", "f"),
       ("dg", "2g"),
       ("tio", "sio"),
       ("tia", "sia"),
       ("d", "t"),
       ("ph", "fh"),
       ("b", "p"),
       ("sh", "s2"),
       ("z", "s")]
r4 = OrderedDict(kv4)

# step 6: every key is a small regular expression
#   "^..." matches only at the beginning of the word
#   "...$" matches only at the end of the word
#   "x+"   matches a run of one or more "x"
#   anything else is a literal substring
kv6 = [("j", "y"),
       ("^y3", "Y3"),
       ("^y", "A"),
       ("y", "3"),
       ("3gh3", "3kh3"),
       ("gh", "22"),
       ("g", "k"),
       ("s+", "S"),
       ("t+", "T"),
       ("p+", "P"),
       ("k+", "K"),
       ("f+", "F"),
       ("m+", "M"),
       ("n+", "N"),
       ("w3", "W3"),
       ("wh3", "Wh3"),
       ("w$", "3"),
       ("w", "2"),
       ("^h", "A"),
       ("h", "2"),
       ("r3", "R3"),
       ("r$", "3"),
       ("r", "2"),
       ("l3", "L3"),
       ("l$", "3"),
       ("l", "2")]
r6 = OrderedDict(kv6)


# x in dict is O(1)
vowels = {"a": None, "e": None, "i": None, "o": None, "u": None}


class Caverphone(PhoneticAlgorithm):
    """
    Caverphone phonetic encoding.

    Original writeup by David Hood, with tests and Python code:
    http://caversham.otago.ac.nz/files/working/ctp150804.pdf
    See this site for more details, and related algorithms:
    http://ntz-develop.blogspot.ca/2011/03/phonetic-algorithms.html

    The code is returned as produced by the rules below; no fixed-length
    padding or truncation is applied.

    Example output
    Maclaverty: MKLFTA
    maurice:    MRSA
    """

    def __init__(self):
        super().__init__()

    def phonetics(self, word):
        """Return the Caverphone code of ``word``.

        Args:
            word: The name to encode.  It is validated with ``check_str``
                and ``check_empty`` first; the project's tests expect an
                ``EmptyStringError`` for ``''``.  What ``check_str``
                raises for a non-string is defined outside this module.

        Returns:
            The code as a string of upper-case letters.  It can be the
            empty string, e.g. when ``word`` contains no letter a-z.
        """
        check_str(word)
        check_empty(word)

        # step 1. lower-case and keep only the letters a-z.  Dropping every
        # other character also keeps digits in the input from being
        # mistaken for the internal "2"/"3" markers used further down.
        code = "".join(ch for ch in word.lower() if "a" <= ch <= "z")
        if not code:
            return ""

        # step 2. collapse a trailing run of "e" into a single "e"
        if code.endswith("e"):
            code = code.rstrip("e") + "e"

        # step 3. transform the beginning of the word.  The prefixes are
        # mutually exclusive, so the first hit is the only possible one.
        for prefix, replacement in r3.items():
            if code.startswith(prefix):
                code = replacement + code[len(prefix):]
                break

        # "mb" is only rewritten when it ends the word (silent b)
        if code.endswith("mb"):
            code = code[:-2] + "m2"

        # step 4. more replacements
        for old, new in r4.items():
            code = code.replace(old, new)

        # step 5. a vowel at the beginning becomes "A", any other vowel "3"
        code = "".join(
            ("A" if pos == 0 else "3") if ch in vowels else ch
            for pos, ch in enumerate(code)
        )

        # step 6. more replacements; the keys are regular expressions, so
        # the anchors and "+" runs are handled by re.sub directly
        for pattern, replacement in r6.items():
            code = re.sub(pattern, replacement, code)

        # step 7. if last is 3, replace with A and remove all 2, 3
        if code.endswith("3"):
            code = code[:-1] + "A"
        return code.replace("2", "").replace("3", "")
FuzzySoundex, Lein, RefinedSoundex, NYSIIS, DoubleMetaphone, Caverphone + +def test_caverphone(): + tests = [ + ('MRSA', 'maurice'), + ('WTA', 'walter'), + ('MKLFTA', 'Maclaverty'), + ] + + caverphone = Caverphone() + for test in tests: + assert caverphone.phonetics(test[1]) == test[0] def test_nysiis(): tests = [