M README.md => README.md +1 -0
@@ 10,6 10,7 @@ Linguistics Robin is a Python linguistics collection that stemmed from a phoneti
* Lein
* Matching Rating Approach
* New York State Identification and Intelligence System (NYSIIS)
+ * Caverphone
In addition, the following distance metrics:
M linguistics_robin/__init__.py => linguistics_robin/__init__.py +4 -2
@@ 6,6 6,8 @@ from .phonetics import (Soundex,
Lein,
RefinedSoundex,
NYSIIS,
- DoubleMetaphone)
+ DoubleMetaphone,
+ Caverphone,
+ Caverphone2)
-__version__ = '0.5.6'
+__version__ = '0.5.7'
M linguistics_robin/phonetics/__init__.py => linguistics_robin/phonetics/__init__.py +2 -0
@@ 6,3 6,5 @@ from .lein import *
from .refined_soundex import *
from .nysiis import *
from .doublemetaphone import *
+from .caverphone import *
+from .caverphone2 import *
A linguistics_robin/phonetics/caverphone.py => linguistics_robin/phonetics/caverphone.py +153 -0
@@ 0,0 1,153 @@
+from collections import OrderedDict
+import re
+from .phonetic_algorithm import PhoneticAlgorithm
+from ..utils import check_str, check_empty
+
+# Rule order matters: the tables below are applied top to bottom, so the
+# pairs are loaded into an ordered dict one at a time to pin iteration order.
+def add_to(od, tups):
+    """Store each pair from ``tups`` in ``od`` (item 0 -> item 1) and return ``od``."""
+    for pair in tups:
+        od[pair[0]] = pair[1]
+    return od
+
+# Step 3 table: rewrites tried against the start of the word, in this order.
+kv3 = [
+    ("cough", "cou2f"),
+    ("rough", "rou2f"),
+    ("tough", "tou2f"),
+    ("enough", "enou2f"),
+    ("gn", "2n"),
+    ("mb", "m2"),
+]
+r3 = add_to(OrderedDict(), kv3)
+
+# Step 4 table: plain substring replacements applied everywhere, in this order.
+kv4 = [
+    ("cq", "2q"),
+    ("ci", "si"),
+    ("ce", "se"),
+    ("cy", "sy"),
+    ("tch", "2ch"),
+    ("c", "k"),
+    ("q", "k"),
+    ("x", "k"),
+    ("v", "f"),
+    ("dg", "2g"),
+    ("tio", "sio"),
+    ("tia", "sia"),
+    ("d", "t"),
+    ("ph", "fh"),
+    ("b", "p"),
+    ("sh", "s2"),
+    ("z", "s"),
+]
+r4 = add_to(OrderedDict(), kv4)
+
+# Step 6 table. Key markers are interpreted by Caverphone.phonetics:
+#   "^..." matches only at the start of the string,
+#   "...$" matches only at the end of the string,
+#   "x+"   is used as a regular expression for a run of the same letter,
+#   anything else is a plain substring replacement.
+kv6 = [
+    ("j", "y"),
+    ("^y3", "Y3"),
+    ("^y", "A"),
+    ("y", "3"),
+    ("3gh3", "3kh3"),
+    ("gh", "22"),
+    ("g", "k"),
+    ("s+", "S"),
+    ("t+", "T"),
+    ("p+", "P"),
+    ("k+", "K"),
+    ("f+", "F"),
+    ("m+", "M"),
+    ("n+", "N"),
+    ("w3", "W3"),
+    ("wh3", "Wh3"),
+    ("w$", "3"),
+    ("w", "2"),
+    ("^h", "A"),
+    ("h", "2"),
+    ("r3", "R3"),
+    ("r$", "3"),
+    ("r", "2"),
+    ("l3", "L3"),
+    ("l$", "3"),
+    ("l", "2"),
+]
+r6 = add_to(OrderedDict(), kv6)
+
+
+# Vowel lookup table; a dict is used so that ``x in vowels`` is a hash lookup.
+vowels = dict.fromkeys("aeiou")
+
+
+class Caverphone(PhoneticAlgorithm):
+    """
+    Caverphone phonetic encoding.
+
+    Original writeup by David Hood, with tests and Python code:
+    http://caversham.otago.ac.nz/files/working/ctp150804.pdf
+    See this site for more details, and related algorithms:
+    http://ntz-develop.blogspot.ca/2011/03/phonetic-algorithms.html
+
+    Example output
+    Maclaverty: MKLFTA
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def phonetics(self, word):
+        """
+        Return the Caverphone code for ``word``.
+
+        The word is first passed through ``check_str`` and ``check_empty``,
+        then run through the seven steps below. Intermediate strings use
+        "2" and "3" as placeholders; both are stripped before returning.
+        """
+        check_str(word)
+        check_empty(word)
+
+        # step 1. lower-case
+        lowered = word.lower()
+
+        # step 2. collapse any run of trailing "e" into a single "e"
+        if lowered.endswith("e"):
+            trimmed = lowered.rstrip("e") + "e"
+        else:
+            trimmed = lowered
+
+        # step 3. rewrite the start of the word; every rule is tested against
+        # the step-2 string, so the last matching rule decides the result
+        current = trimmed
+        for prefix, replacement in r3.items():
+            if trimmed.startswith(prefix):
+                current = replacement + trimmed[len(prefix):]
+
+        # step 4. ordered substring replacements
+        for old, new in r4.items():
+            current = current.replace(old, new)
+
+        # step 5. leading vowel becomes "A", every other vowel becomes "3"
+        current = "".join(
+            ("A" if position == 0 else "3") if letter in vowels else letter
+            for position, letter in enumerate(current)
+        )
+
+        # step 6. ordered rules; "^", "$" and "+" in a key select how it is applied
+        for rule, replacement in r6.items():
+            if "^" in rule:
+                prefix = rule[1:]
+                if current.startswith(prefix):
+                    current = replacement + current[len(prefix):]
+            elif "$" in rule:
+                suffix = rule[:-1]
+                if current.endswith(suffix):
+                    current = current[:-len(suffix)] + replacement
+            elif "+" in rule:
+                # a run of the same letter collapses to its upper-case form
+                current = re.sub(rule, rule.upper().replace("+", ""), current)
+            else:
+                current = current.replace(rule, replacement)
+
+        # step 7. a trailing "3" becomes "A", then drop every "2" and "3"
+        if current.endswith("3"):
+            current = current[:-1] + "A"
+        return current.replace("2", "").replace("3", "")
\ No newline at end of file
A linguistics_robin/phonetics/caverphone2.py => linguistics_robin/phonetics/caverphone2.py +156 -0
@@ 0,0 1,156 @@
+from .phonetic_algorithm import PhoneticAlgorithm
+from ..utils import check_str, check_empty
+from typing import List
+import string
+
+# Vowels, plus the additional characters named in the specification
+# ("æ", "ā", "ø").
+# NOTE: this name deliberately has a single leading underscore. A name that
+# starts with two underscores is name-mangled when referenced inside a class
+# body (``__vowels`` would be looked up as ``_Caverphone2__vowels``) and the
+# lookup fails with NameError.
+_VOWELS = frozenset("aeiouæāø")
+
+# Step 5 (1-6): word-start rewrites; at most one is applied.
+_START_RULES = (
+    ("cough", "cou2f"),
+    ("rough", "rou2f"),
+    ("tough", "tou2f"),
+    ("enough", "enou2f"),
+    ("trough", "trou2f"),
+    ("gn", "2n"),
+)
+
+# Step 7 (1-17): substring replacements, applied everywhere in this order.
+_REPLACE_RULES = (
+    ("cq", "2q"),
+    ("ci", "si"),
+    ("ce", "se"),
+    ("cy", "sy"),
+    ("tch", "2ch"),
+    ("c", "k"),
+    ("q", "k"),
+    ("x", "k"),
+    ("v", "f"),
+    ("dg", "2g"),
+    ("tio", "sio"),
+    ("tia", "sia"),
+    ("d", "t"),
+    ("ph", "fh"),
+    ("b", "p"),
+    ("sh", "s2"),
+    ("z", "s"),
+)
+
+# Step 7 (27-33): letters whose runs collapse to one upper-case letter.
+_RUN_LETTERS = frozenset("stpkfmn")
+
+# Steps 11-12: every code is exactly this long, padded with "1".
+_CODE_LENGTH = 10
+
+
+class Caverphone2(PhoneticAlgorithm):
+    """
+    Caverphone 2.0 phonetic encoding.
+
+    Produces a fixed-length, ten-character code padded with "1".
+
+    Example output
+    Thompson: TMPSN11111
+    Lee: LA11111111
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def phonetics(self, word):
+        """
+        Return the ten-character Caverphone 2.0 code for ``word``.
+
+        The word is first passed through ``check_empty`` and ``check_str``.
+        Intermediate strings use "2" and "3" as placeholders; both are
+        removed before the code is padded and cut to ten characters.
+        """
+        # Step 1.
+        check_empty(word)
+        check_str(word)
+
+        # Steps 2-3. lower-case, then keep only the letters a-z
+        code = "".join(
+            ch for ch in word.lower() if ch in string.ascii_lowercase
+        )
+
+        # Step 4. drop a final "e"
+        code = code.removesuffix("e")
+
+        # Step 5. (1-6) rewrite the word start; slicing removes exactly the
+        # matched prefix (str.lstrip would strip a *set* of characters)
+        for prefix, replacement in _START_RULES:
+            if code.startswith(prefix):
+                code = replacement + code[len(prefix):]
+                break
+
+        # Step 6.
+        if code.endswith("mb"):
+            code = code[:-2] + "m2"
+
+        # Step 7. (1-17)
+        for old, new in _REPLACE_RULES:
+            code = code.replace(old, new)
+
+        # Step 7. (18-19) leading vowel -> "A", every other vowel -> "3"
+        code = "".join(
+            ("A" if index == 0 else "3") if ch in _VOWELS else ch
+            for index, ch in enumerate(code)
+        )
+
+        # Step 7. (20)
+        code = code.replace("j", "y")
+
+        # Step 7. (21-22) leading "y3" -> "Y3", otherwise leading "y" -> "A"
+        if code.startswith("y3"):
+            code = "Y3" + code[2:]
+        elif code.startswith("y"):
+            code = "A" + code[1:]
+
+        # Step 7. (23-26)
+        code = code.replace("y", "3")
+        code = code.replace("3gh3", "3kh3")
+        code = code.replace("gh", "22")
+        code = code.replace("g", "k")
+
+        # Step 7. (27-33) collapse runs of s/t/p/k/f/m/n to one capital
+        collapsed = []
+        for ch in code:
+            if ch in _RUN_LETTERS:
+                ch = ch.upper()
+                if collapsed and collapsed[-1] == ch:
+                    continue
+            collapsed.append(ch)
+        code = "".join(collapsed)
+
+        # Step 7. (34-37)
+        code = code.replace("w3", "W3")
+        code = code.replace("wh3", "Wh3")
+        if code.endswith("w"):
+            code = code[:-1] + "3"
+        code = code.replace("w", "2")
+
+        # Step 7. (38-39) a leading "h" becomes "A"; every other "h" -> "2"
+        if code.startswith("h"):
+            code = "A" + code[1:]
+        code = code.replace("h", "2")
+
+        # Step 7. (40-42)
+        code = code.replace("r3", "R3")
+        if code.endswith("r"):
+            code = code[:-1] + "3"
+        code = code.replace("r", "2")
+
+        # Step 7. (43-45)
+        code = code.replace("l3", "L3")
+        if code.endswith("l"):
+            code = code[:-1] + "3"
+        code = code.replace("l", "2")
+
+        # Step 8.
+        code = code.replace("2", "")
+
+        # Step 9.
+        if code.endswith("3"):
+            code = code[:-1] + "A"
+
+        # Step 10.
+        code = code.replace("3", "")
+
+        # Steps 11-12. pad with "1" and keep only the first ten characters
+        return (code + "1" * _CODE_LENGTH)[:_CODE_LENGTH]
\ No newline at end of file
M tests/test_corner_cases.py => tests/test_corner_cases.py +24 -1
@@ 1,7 1,30 @@
import pytest
-from linguistics_robin import Soundex, RefinedSoundex, FuzzySoundex, NYSIIS, Metaphone, DoubleMetaphone
+from linguistics_robin import Soundex, RefinedSoundex, FuzzySoundex, NYSIIS, \
+ Metaphone, DoubleMetaphone, Caverphone, Caverphone2
from linguistics_robin.exceptions import EmptyStringError
+def test_caverphone():
+    """Caverphone returns the expected codes and rejects an empty string."""
+    encoder = Caverphone()
+    expected_codes = {
+        'maurice': 'MRSA',
+        'bob': 'PP',
+        'walter': 'WTA',
+        'Maclaverty': 'MKLFTA',
+    }
+    for name, code in expected_codes.items():
+        assert encoder.phonetics(name) == code
+
+    with pytest.raises(EmptyStringError):
+        encoder.phonetics('')
+
+def test_caverphone2():
+    """Caverphone2 returns the expected codes and rejects an empty string."""
+    encoder = Caverphone2()
+    expected_codes = {
+        'Thompson': 'TMPSN11111',
+        'Lee': 'LA11111111',
+        'Stevenson': 'STFNSN1111',
+        'Peter': 'PTA1111111',
+    }
+    for name, code in expected_codes.items():
+        assert encoder.phonetics(name) == code
+
+    with pytest.raises(EmptyStringError):
+        encoder.phonetics('')
+
def test_doublemetaphone():
dm = DoubleMetaphone()
M tests/test_phonetics.py => tests/test_phonetics.py +30 -1
@@ 1,5 1,34 @@
from linguistics_robin import Metaphone, Soundex, MatchingRatingApproach,\
- FuzzySoundex, Lein, RefinedSoundex, NYSIIS, DoubleMetaphone
+ FuzzySoundex, Lein, RefinedSoundex, NYSIIS, DoubleMetaphone, Caverphone,\
+ Caverphone2
+
+def test_caverphone():
+    """Each name encodes to its expected Caverphone code."""
+    cases = [
+        ('maurice', 'MRSA'),
+        ('walter', 'WTA'),
+        ('Maclaverty', 'MKLFTA'),
+    ]
+
+    encoder = Caverphone()
+    for name, expected in cases:
+        assert encoder.phonetics(name) == expected
+
+def test_caverphone2():
+    """Each name encodes to its expected ten-character Caverphone2 code."""
+    cases = [
+        ('Thompson', 'TMPSN11111'),
+        ('Lee', 'LA11111111'),
+        ('Stevenson', 'STFNSN1111'),
+        ('Peter', 'PTA1111111'),
+        ('Ready', 'RTA1111111'),
+        ('Able', 'APA1111111'),
+        ('social', 'SSA1111111'),
+        ('Karleen', 'KLN1111111'),
+        ('Tudor', 'TTA1111111'),
+    ]
+
+    encoder = Caverphone2()
+    for name, expected in cases:
+        assert encoder.phonetics(name) == expected
def test_nysiis():
tests = [