~linuxgoose/linguistics-robin

cd2ed9b16b07e593d8989a8fae7245cf1403f49f — Jordan Robinson 8 months ago 4df2332 + bea35cd v0.5.7
Merge pull request #22 from linuxgoose/10-add-caverphone1-and-caverphone2

10 add caverphone1 and caverphone2
M README.md => README.md +1 -0
@@ 10,6 10,7 @@ Linguistics Robin is a Python linguistics collection that stemmed from a phoneti
 * Lein
 * Matching Rating Approach
 * New York State Identification and Intelligence System (NYSIIS)
 * Caverphone
 
In addition, the following distance metrics:


M linguistics_robin/__init__.py => linguistics_robin/__init__.py +4 -2
@@ 6,6 6,8 @@ from .phonetics import (Soundex,
                        Lein,
                        RefinedSoundex,
                        NYSIIS,
                        DoubleMetaphone)
                        DoubleMetaphone,
                        Caverphone,
                        Caverphone2)

__version__ = '0.5.6'
__version__ = '0.5.7'

M linguistics_robin/phonetics/__init__.py => linguistics_robin/phonetics/__init__.py +2 -0
@@ 6,3 6,5 @@ from .lein import *
from .refined_soundex import *
from .nysiis import *
from .doublemetaphone import *
from .caverphone import *
from .caverphone2 import *

A linguistics_robin/phonetics/caverphone.py => linguistics_robin/phonetics/caverphone.py +153 -0
@@ 0,0 1,153 @@
from collections import OrderedDict
import re
from .phonetic_algorithm import PhoneticAlgorithm
from ..utils import check_str, check_empty

# order of rules is very important
# this + ordered dict guarantees iteration order
def add_to(od, tups):
    """
    Store every pair from ``tups`` in the mapping ``od`` and return ``od``.

    Each element of ``tups`` is indexed as ``(key, value)``; pairs are
    inserted in the order they are given, and a repeated key overwrites
    the value stored for it earlier.
    """
    for pair in tups:
        od[pair[0]] = pair[1]
    return od

# Rules consumed by step 3 of Caverphone.phonetics.
kv3 = [
    ("cough", "cou2f"),
    ("rough", "rou2f"),
    ("tough", "tou2f"),
    ("enough", "enou2f"),
    ("gn", "2n"),
    ("mb", "m2"),
]
r3 = add_to(OrderedDict(), kv3)

# Rules consumed by step 4: plain substring replacements, applied in
# exactly this order.
kv4 = [
    ("cq", "2q"),
    ("ci", "si"),
    ("ce", "se"),
    ("cy", "sy"),
    ("tch", "2ch"),
    ("c", "k"),
    ("q", "k"),
    ("x", "k"),
    ("v", "f"),
    ("dg", "2g"),
    ("tio", "sio"),
    ("tia", "sia"),
    ("d", "t"),
    ("ph", "fh"),
    ("b", "p"),
    ("sh", "s2"),
    ("z", "s"),
]
r4 = add_to(OrderedDict(), kv4)

# Rules consumed by step 6.  Caverphone.phonetics treats a key containing
# "^" as anchored to the start of the word, a key containing "$" as
# anchored to the end, and a key containing "+" as a regular expression;
# every other key is a plain substring replacement.
kv6 = [
    ("j", "y"),
    ("^y3", "Y3"),
    ("^y", "A"),
    ("y", "3"),
    ("3gh3", "3kh3"),
    ("gh", "22"),
    ("g", "k"),
    ("s+", "S"),
    ("t+", "T"),
    ("p+", "P"),
    ("k+", "K"),
    ("f+", "F"),
    ("m+", "M"),
    ("n+", "N"),
    ("w3", "W3"),
    ("wh3", "Wh3"),
    ("w$", "3"),
    ("w", "2"),
    ("^h", "A"),
    ("h", "2"),
    ("r3", "R3"),
    ("r$", "3"),
    ("r", "2"),
    ("l3", "L3"),
    ("l$", "3"),
    ("l", "2"),
]
r6 = add_to(OrderedDict(), kv6)


# Membership test on a dict is O(1); only the keys matter.
vowels = dict.fromkeys("aeiou")


class Caverphone(PhoneticAlgorithm):
    """
    Caverphone phonetic encoder.

    Original writeup by David Hood, with tests and Python code:
    http://caversham.otago.ac.nz/files/working/ctp150804.pdf
    See this site for more details, and related algorithms:
    http://ntz-develop.blogspot.ca/2011/03/phonetic-algorithms.html
    Example output
    Maclaverty: MKLFTA
    """

    def __init__(self):
        super().__init__()

    def phonetics(self, word):
        """
        Return the Caverphone code of ``word``.

        The word is validated with ``check_str`` and ``check_empty``,
        lower-cased, rewritten with the module-level rule tables ``r3``,
        ``r4`` and ``r6``, and finally stripped of the "2" and "3"
        placeholder digits.  The result is returned as-is (no padding).
        """
        check_str(word)
        check_empty(word)

        # step 1. lower
        s1 = word.lower()

        # step 2. collapse a run of trailing e's into a single e
        if s1.endswith("e"):
            s2 = s1.rstrip("e") + "e"
        else:
            s2 = s1

        # step 3. transform beginning of word
        # The r3 table also carries the "mb" rule, but Caverphone defines
        # that rule for names that END in "mb", so it is skipped here and
        # applied to the end of the word just below.
        s3 = s2
        for k, v in r3.items():
            if k != "mb" and s2.startswith(k):
                s3 = v + s2[len(k):]
        if s3.endswith("mb"):
            s3 = s3[:-len("mb")] + "m2"

        # step 4. more replacements (order of the table matters)
        s4 = s3
        for k, v in r4.items():
            s4 = s4.replace(k, v)

        # step 5. a vowel in first position becomes A, any other vowel 3
        s5 = "".join(
            ("A" if n == 0 else "3") if ch in vowels else ch
            for n, ch in enumerate(s4)
        )

        # step 6. more replacements
        s6 = s5
        for k, v in r6.items():
            if "^" in k:
                # rule anchored at the start of the word
                head = k[1:]
                if s6.startswith(head):
                    s6 = v + s6[len(head):]
            elif "$" in k:
                # rule anchored at the end of the word
                tail = k[:-1]
                if s6.endswith(tail):
                    s6 = s6[:-len(tail)] + v
            elif "+" in k:
                # run of the same letter collapses to one capital letter
                s6 = re.sub(k, v, s6)
            else:
                s6 = s6.replace(k, v)

        # step 7. if last is 3, replace with A and remove all 2, 3
        s7 = s6
        if s7.endswith("3"):
            s7 = s7[:-1] + "A"
        return s7.replace("2", "").replace("3", "")
\ No newline at end of file

A linguistics_robin/phonetics/caverphone2.py => linguistics_robin/phonetics/caverphone2.py +156 -0
@@ 0,0 1,156 @@
from .phonetic_algorithm import PhoneticAlgorithm
from ..utils import check_str, check_empty
from typing import List
import string
    
# Vowels, plus the additional characters named in the specification
# ("æ", "ā", "ø").  The name deliberately has a SINGLE leading underscore:
# a double-underscore name used inside a class body is mangled (``__vowels``
# would be looked up as ``_Caverphone2__vowels``) and would never resolve to
# this module-level variable.
_VOWELS: List[str] = ["a", "e", "i", "o", "u", "æ", "ā", "ø"]

# Step 5 (1-6): (prefix, replacement) pairs; only the first match is applied.
_PREFIX_RULES = (
    ("cough", "cou2f"),
    ("rough", "rou2f"),
    ("tough", "tou2f"),
    ("enough", "enou2f"),
    ("trough", "trou2f"),
    ("gn", "2n"),
)

# Step 7 (1-17): substring replacements, applied everywhere, in this order.
_SUBSTITUTIONS = (
    ("cq", "2q"),
    ("ci", "si"),
    ("ce", "se"),
    ("cy", "sy"),
    ("tch", "2ch"),
    ("c", "k"),
    ("q", "k"),
    ("x", "k"),
    ("v", "f"),
    ("dg", "2g"),
    ("tio", "sio"),
    ("tia", "sia"),
    ("d", "t"),
    ("ph", "fh"),
    ("b", "p"),
    ("sh", "s2"),
    ("z", "s"),
)

# Step 7 (27-33): letters whose runs collapse into one capital letter.
_COLLAPSIBLE = "stpkfmn"


class Caverphone2(PhoneticAlgorithm):
    """
    Caverphone 2.0 phonetic encoder.

    ``phonetics`` always returns a 10-character code: the encoded name,
    padded on the right with "1" and cut to ten characters.
    Example output
    Thompson: TMPSN11111
    """

    def __init__(self):
        super().__init__()

    def phonetics(self, word):
        """
        Return the 10-character Caverphone 2.0 code of ``word``.

        ``word`` is validated with ``check_empty`` and ``check_str``;
        whatever those helpers raise propagates to the caller.
        """
        # Step 1.
        check_empty(word)
        check_str(word)

        # Step 2.
        code = word.lower()

        # Step 3. keep only the letters a-z
        code = "".join(ch for ch in code if ch in string.ascii_lowercase)

        # Step 4.
        if code.endswith("e"):
            code = code.removesuffix("e")

        # Step 5. (1-6)
        # removeprefix() drops exactly the matched prefix; lstrip() would
        # strip every leading character belonging to the prefix's letters.
        for prefix, replacement in _PREFIX_RULES:
            if code.startswith(prefix):
                code = replacement + code.removeprefix(prefix)
                break

        # Step 6.
        if code.endswith("mb"):
            code = code.removesuffix("mb") + "m2"

        # Step 7. (1-17)
        for old, new in _SUBSTITUTIONS:
            code = code.replace(old, new)

        # Step 7. (18-19) leading vowel -> A, every other vowel -> 3
        code = "".join(
            ("A" if index == 0 else "3") if char in _VOWELS else char
            for index, char in enumerate(code)
        )

        # Step 7. (20)
        code = code.replace("j", "y")

        # Step 7. (21-22)
        if code.startswith("y3"):
            code = "Y3" + code.removeprefix("y3")
        elif code.startswith("y"):
            code = "A" + code.removeprefix("y")

        # Step 7. (23)
        code = code.replace("y", "3")

        # Step 7. (24)
        code = code.replace("3gh3", "3kh3")

        # Step 7. (25)
        code = code.replace("gh", "22")

        # Step 7. (26)
        code = code.replace("g", "k")

        # Step 7. (27-33) collapse runs of s/t/p/k/f/m/n into one capital
        pieces: List[str] = []
        for char in code:
            if char in _COLLAPSIBLE:
                char = char.upper()
                if pieces and pieces[-1] == char:
                    continue
            pieces.append(char)
        code = "".join(pieces)

        # Step 7. (34)
        code = code.replace("w3", "W3")

        # Step 7. (35)
        code = code.replace("wh3", "Wh3")

        # Step 7. (36)
        if code.endswith("w"):
            code = code.removesuffix("w") + "3"

        # Step 7. (37)
        code = code.replace("w", "2")

        # Step 7. (38) an h at the START of the name becomes A
        if code.startswith("h"):
            code = "A" + code.removeprefix("h")

        # Step 7. (39)
        code = code.replace("h", "2")

        # Step 7. (40)
        code = code.replace("r3", "R3")

        # Step 7. (41)
        if code.endswith("r"):
            code = code.removesuffix("r") + "3"

        # Step 7. (42)
        code = code.replace("r", "2")

        # Step 7. (43)
        code = code.replace("l3", "L3")

        # Step 7. (44)
        if code.endswith("l"):
            code = code.removesuffix("l") + "3"

        # Step 7. (45)
        code = code.replace("l", "2")

        # Step 8.
        code = code.replace("2", "")

        # Step 9.
        if code.endswith("3"):
            code = code.removesuffix("3") + "A"

        # Step 10.
        code = code.replace("3", "")

        # Steps 11-12. pad with 1s, then keep only the first ten characters
        return code.ljust(10, "1")[:10]
\ No newline at end of file

M tests/test_corner_cases.py => tests/test_corner_cases.py +24 -1
@@ 1,7 1,30 @@
import pytest
from linguistics_robin import Soundex, RefinedSoundex, FuzzySoundex, NYSIIS, Metaphone, DoubleMetaphone
from linguistics_robin import Soundex, RefinedSoundex, FuzzySoundex, NYSIIS, \
    Metaphone, DoubleMetaphone, Caverphone, Caverphone2
from linguistics_robin.exceptions import EmptyStringError

def test_caverphone():
    """Caverphone returns the known codes and rejects the empty string."""
    encoder = Caverphone()
    expected_codes = (
        ('maurice', 'MRSA'),
        ('bob', 'PP'),
        ('walter', 'WTA'),
        ('Maclaverty', 'MKLFTA'),
    )

    for name, code in expected_codes:
        assert encoder.phonetics(name) == code

    with pytest.raises(EmptyStringError):
        encoder.phonetics('')

def test_caverphone2():
    """Caverphone2 returns the known codes and rejects the empty string."""
    encoder = Caverphone2()
    expected_codes = (
        ('Thompson', 'TMPSN11111'),
        ('Lee', 'LA11111111'),
        ('Stevenson', 'STFNSN1111'),
        ('Peter', 'PTA1111111'),
    )

    for name, code in expected_codes:
        assert encoder.phonetics(name) == code

    with pytest.raises(EmptyStringError):
        encoder.phonetics('')

def test_doublemetaphone():
    dm = DoubleMetaphone()


M tests/test_phonetics.py => tests/test_phonetics.py +30 -1
@@ 1,5 1,34 @@
from linguistics_robin import Metaphone, Soundex, MatchingRatingApproach,\
    FuzzySoundex, Lein, RefinedSoundex, NYSIIS, DoubleMetaphone
    FuzzySoundex, Lein, RefinedSoundex, NYSIIS, DoubleMetaphone, Caverphone,\
    Caverphone2

def test_caverphone():
    """Each (expected code, input word) pair must round-trip through Caverphone."""
    cases = (
        ('MRSA', 'maurice'),
        ('WTA', 'walter'),
        ('MKLFTA', 'Maclaverty'),
    )

    encoder = Caverphone()
    for expected, name in cases:
        assert encoder.phonetics(name) == expected

def test_caverphone2():
    """Each (expected code, input word) pair must round-trip through Caverphone2."""
    cases = (
        ('TMPSN11111', 'Thompson'),
        ('LA11111111', 'Lee'),
        ('STFNSN1111', 'Stevenson'),
        ('PTA1111111', 'Peter'),
        ('RTA1111111', 'Ready'),
        ('APA1111111', 'Able'),
        ('SSA1111111', 'social'),
        ('KLN1111111', 'Karleen'),
        ('TTA1111111', 'Tudor'),
    )

    encoder = Caverphone2()
    for expected, name in cases:
        assert encoder.phonetics(name) == expected

def test_nysiis():
    tests = [