~linuxgoose/linguistics-robin

ea93ccc64cd583a5b879879ad06fb6c6f51404a2 — Lilykos 9 years ago dfd0ff9
Algorithm implementation:
* Added Soundex, Fuzzy Soundex, Metaphone, Lein, MRA.
* Added tests for every algorithm.
* Updated Licence.
* Added and tested some utils.
A LICENSE.rst => LICENSE.rst +21 -0
@@ 0,0 1,21 @@
The MIT License (MIT)

Copyright (c) 2016 Guillaume Plique (Yomguithereal)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
\ No newline at end of file

M flit.ini => flit.ini +3 -0
@@ 3,6 3,9 @@ module=pyphonetics
author=Lilykos
author-email=lilykosk@gmail.com
home-page=http://github.com/Lilykos/pyphonetics
requires=unidecode
    pytest
requires-python= >=3
description-file=README.rst
classifiers=Intended Audience :: Developers
    Programming Language :: Python :: 3

M pyphonetics/__init__.py => pyphonetics/__init__.py +2 -2
@@ 1,4 1,4 @@
"""A Python 3 phonetics library."""
from .phonetics import *
from .phonetics import Soundex, Metaphone, MatchingRatingApproach, FuzzySoundex, Lein

__version__ = '0.1'
__version__ = '0.2'

A pyphonetics/exceptions.py => pyphonetics/exceptions.py +6 -0
@@ 0,0 1,6 @@
class UnicodeException(Exception):
    """Raised by the phonetic algorithms when the given word is not a ``str``."""


class WrongLengthException(Exception):
    """Raised when two sequences that must have equal lengths do not."""

M pyphonetics/phonetics/__init__.py => pyphonetics/phonetics/__init__.py +4 -0
@@ 1,1 1,5 @@
from .soundex import *
from .metaphone import *
from .mra import *
from .fuzzy_soundex import *
from .lein import *
\ No newline at end of file

A pyphonetics/phonetics/fuzzy_soundex.py => pyphonetics/phonetics/fuzzy_soundex.py +112 -0
@@ 0,0 1,112 @@
import re
from unidecode import unidecode

from ..utils import squeeze, translation
from ..exceptions import UnicodeException
from .phonetic_algorithm import PhoneticAlgorithm


class FuzzySoundex(PhoneticAlgorithm):
    """
    Implementation of the "Fuzzy Soundex" algorithm.

    [Reference]: http://wayback.archive.org/web/20100629121128/http://www.ir.iit.edu/publications/downloads/IEEESoundexV5.pdf
    [Article]: Holmes, David and M. Catherine McCabe. "Improving Precision and Recall for Soundex Retrieval."
    """
    def __init__(self):
        # Letter -> code table. '0' marks the vowels (dropped at the very
        # end) and '-' marks H, W and Y (removed right after translation).
        self.translations = translation(
            'ABCDEFGHIJKLMNOPQRSTUVWXYZ',
            '0193017-07745501769301-7-9'
        )

        # Ordered (pattern, replacement) substitutions applied anywhere in
        # the word. Order matters: later rules see the output of earlier ones.
        self.rules = [
            (r'CA', r'KA'),
            (r'CC', r'KK'),
            (r'CK', r'KK'),
            (r'CE', r'SE'),
            (r'CHL', r'KL'),
            (r'CL', r'KL'),
            (r'CHR', r'KR'),
            (r'CR', r'KR'),
            (r'CI', r'SI'),
            (r'CO', r'KO'),
            (r'CU', r'KU'),
            (r'CY', r'SY'),
            (r'DG', r'GG'),
            (r'GH', r'HH'),
            (r'MAC', r'MK'),
            (r'MC', r'MK'),
            (r'NST', r'NSS'),
            (r'PF', r'FF'),
            (r'PH', r'FF'),
            (r'SCH', r'SSS'),
            (r'TIO', r'SIO'),
            (r'TIA', r'SIO'),
            (r'TCH', r'CHH'),
        ]

        # Two-letter prefixes rewritten to 'SS', 'RR' and 'NN' respectively.
        self.set1 = ['CS', 'CZ', 'TS', 'TZ']
        self.set2 = ['HR', 'WR']
        self.set3 = ['KN', 'NG']
        # Letters consulted when re-attaching the initial (see phonetics()).
        self.set4 = list('HWY')

    def phonetics(self, word):
        """
        Return the Fuzzy Soundex code of ``word``.

        The code is the first letter of the rewritten word followed by the
        digits of the remaining letters. An empty string is returned when
        ``word`` is empty or transliterates to nothing.

        Raises ``UnicodeException`` when ``word`` is not a ``str``.
        """
        if not isinstance(word, str):
            raise UnicodeException('Expected a unicode string!')

        word = unidecode(word).upper()

        # Checked after transliteration: unidecode may map some characters
        # to nothing, and the indexing below needs at least one character.
        if not word:
            return ''

        # Substitutions for beginnings
        first_two, rest = word[:2], word[2:]

        if first_two in self.set1:
            word = 'SS' + rest
        elif first_two == 'GN':
            word = 'NN' + rest
        elif first_two in self.set2:
            word = 'RR' + rest
        elif first_two == 'HW':
            word = 'WW' + rest
        elif first_two in self.set3:
            word = 'NN' + rest

        # Substitutions for endings
        last_two, initial = word[-2:], word[0:-2]

        if last_two == 'CH':
            word = initial + 'KK'
        elif last_two == 'NT':
            word = initial + 'TT'
        elif last_two == 'RT':
            word = initial + 'RR'
        elif word[-3:] == 'RDT':
            word = word[0:-3] + 'RR'

        # Applying the rules
        for pattern, replacement in self.rules:
            word = re.sub(pattern, replacement, word)

        # Catch the first letter
        first_letter = word[0]

        # Translating (characters without a table entry are kept verbatim)
        code = ''.join(self.translations.get(char, char) for char in word)

        # Removing hyphens, then squeezing runs of identical codes
        code = squeeze(code.replace('-', ''))

        # Dealing with initials. The code may be empty here (e.g. for "H",
        # whose only letter translates to a removed hyphen), so guard the
        # index instead of crashing with an IndexError.
        # NOTE(review): H, W and Y are always translated to '-' and removed,
        # so for alphabetic input ``code[0]`` can never be in ``set4`` and
        # this first branch looks unreachable. Other implementations appear
        # to test the word's first letter instead; confirm the intent
        # before changing any output.
        if code and code[0] in self.set4:
            code = first_letter
        else:
            code = first_letter + code[1:]

        # Dropping vowels
        return code.replace('0', '')

A pyphonetics/phonetics/lein.py => pyphonetics/phonetics/lein.py +43 -0
@@ 0,0 1,43 @@
import re
from unidecode import unidecode

from ..utils import squeeze, translation
from ..exceptions import UnicodeException
from .phonetic_algorithm import PhoneticAlgorithm


class Lein(PhoneticAlgorithm):
    """
    The Lein name coding procedure.

    [Reference]: http://naldc.nal.usda.gov/download/27833/PDF
    """
    def __init__(self):
        # Consonant -> digit table; letters that are absent from the table
        # are kept verbatim by phonetics().
        self.translations = translation(
            'DTMNLRBFPVCJKGQSXZ',
            '112233444455555555'
        )

        # Right-pad with zeros and cut to exactly four characters.
        self.pad = lambda code: '{}0000'.format(code)[:4]

    def phonetics(self, word):
        """
        Return the four-character Lein code of ``word``.

        The code is the first letter followed by the digits of the next
        consonants, zero-padded. An empty string is returned when ``word``
        contains no ASCII letter after transliteration.

        Raises ``UnicodeException`` when ``word`` is not a ``str``.
        """
        if not isinstance(word, str):
            raise UnicodeException('Expected a unicode string!')

        word = unidecode(word).upper()

        # Keep letters only. The previous pattern (r'[^A-Z]\s') removed a
        # non-letter only when it was followed by whitespace, so spaces and
        # punctuation leaked into the code ("De Vos" gave 'D 45', not 'D450').
        word = re.sub(r'[^A-Z]', r'', word)

        # Nothing to encode; also protects the indexing below.
        if not word:
            return ''

        # Keep the 1st letter
        first, code = word[0], word[1:]

        # Drop vowels and Y, W & H
        code = re.sub(r'[AEIOUYWH]', r'', code)

        # Drop consecutive duplicates and truncate to 4 chars
        code = squeeze(code)[0: 4]

        # Translations
        code = ''.join(self.translations.get(char, char) for char in code)

        return self.pad(first + code)

A pyphonetics/phonetics/metaphone.py => pyphonetics/phonetics/metaphone.py +53 -0
@@ 0,0 1,53 @@
import re
from unidecode import unidecode

from ..exceptions import UnicodeException
from .phonetic_algorithm import PhoneticAlgorithm


class Metaphone(PhoneticAlgorithm):
    """
    The metaphone algorithm.

    [Reference]: https://en.wikipedia.org/wiki/Metaphone
    [Author]: Lawrence Philips, 1990
    """
    def __init__(self):
        # Ordered (pattern, replacement) pairs applied to the lower-cased
        # word. Patterns are lower-case and replacements upper-case, so a
        # letter that has already been encoded is not matched again by a
        # later rule.
        #
        # Every replacement is a raw string: a plain '\1' is the control
        # character chr(1), not a back-reference to the captured group.
        self.rules = [
            (r'[^a-z]', r''),
            (r'([bcdfhjklmnpqrstvwxyz])\1+', r'\1'),
            (r'^ae', r'E'),
            (r'^[gkp]n', r'N'),
            (r'^wr', r'R'),
            (r'^x', r'S'),
            (r'^wh', r'W'),
            (r'mb$', r'M'),
            (r'(?!^)sch', r'SK'),
            (r'th', r'0'),
            (r't?ch|sh', r'X'),
            (r'c(?=ia)', r'X'),
            (r'[st](?=i[ao])', r'X'),
            (r's?c(?=[iey])', r'S'),
            (r'[cq]', r'K'),
            (r'dg(?=[iey])', r'J'),
            (r'd', r'T'),
            (r'g(?=h[^aeiou])', r''),
            (r'gn(ed)?', r'N'),
            (r'([^g]|^)g(?=[iey])', r'\1J'),
            (r'g+', r'K'),
            (r'ph', r'F'),
            (r'([aeiou])h(?=\b|[^aeiou])', r'\1'),
            (r'[wy](?![aeiou])', r''),
            (r'z', r'S'),
            (r'v', r'F'),
            (r'(?!^)[aeiou]+', r'')
        ]

    def phonetics(self, word):
        """
        Return the upper-cased Metaphone code of ``word``.

        Raises ``UnicodeException`` when ``word`` is not a ``str``.
        """
        if not isinstance(word, str):
            raise UnicodeException('Expected a unicode string!')

        code = unidecode(word).lower()
        for pattern, replacement in self.rules:
            code = re.sub(pattern, replacement, code)
        return code.upper()

A pyphonetics/phonetics/mra.py => pyphonetics/phonetics/mra.py +33 -0
@@ 0,0 1,33 @@
import re
from unidecode import unidecode

from ..utils import squeeze
from ..exceptions import UnicodeException
from .phonetic_algorithm import PhoneticAlgorithm


class MatchingRatingApproach(PhoneticAlgorithm):
    """
    Functions related to the computation of the Match Rating Approach codex.

    [Reference]: https://en.wikipedia.org/wiki/Match_rating_approach
    [Article]: Moore, G B.; Kuhns, J L.; Treffzs, J L.; Montgomery, C A. (Feb 1, 1977).
        Accessing Individual Records from Personal Data Files Using Nonunique Identifiers.
        US National Institute of Standards and Technology. p. 17. NIST SP - 500-2.
    """
    def phonetics(self, word):
        """
        Return the Match Rating Approach codex of ``word``.

        The codex holds at most six letters: the first three and the last
        three of the reduced word. An empty string is returned when ``word``
        contains no ASCII letter after transliteration.

        Raises ``UnicodeException`` when ``word`` is not a ``str``.
        """
        if not isinstance(word, str):
            raise UnicodeException('Expected a unicode string!')

        codex = unidecode(word).upper()
        codex = re.sub(r'[^A-Z]', r'', codex)

        # Nothing to encode; also protects the ``codex[0]`` access below.
        if not codex:
            return ''

        # Dropping non-leading vowels
        codex = codex[0] + re.sub(r'[AEIOU]', r'', codex[1:])

        # Collapsing runs of identical consecutive letters
        codex = squeeze(codex)

        # Up to six letters are returned unchanged; longer codexes keep
        # only their first three and last three letters.
        if len(codex) <= 6:
            return codex
        return codex[:3] + codex[-3:]

A pyphonetics/phonetics/phonetic_algorithm.py => pyphonetics/phonetics/phonetic_algorithm.py +12 -0
@@ 0,0 1,12 @@
class PhoneticAlgorithm:
    """
    Common base class that gives every phonetic algorithm
    of the package the same two-method API.
    """
    def phonetics(self, word):
        """Return the phonetic code of ``word``; this base version returns ``None``."""
        return None

    def sounds_like(self, word1, word2):
        """Return ``True`` when both words produce the same phonetic code, else ``False``."""
        first_code = self.phonetics(word1)
        second_code = self.phonetics(word2)
        return first_code == second_code

M pyphonetics/phonetics/soundex.py => pyphonetics/phonetics/soundex.py +66 -0
@@ 0,0 1,66 @@
import re
from unidecode import unidecode

from ..utils import translation, squeeze
from ..exceptions import UnicodeException
from .phonetic_algorithm import PhoneticAlgorithm


class Soundex(PhoneticAlgorithm):
    """
    The Soundex algorithm.

    [Reference]: https://en.wikipedia.org/wiki/Soundex
    [Authors]: Robert C. Russel, Margaret King Odell
    """
    def __init__(self, refined=False):
        # Letter -> code tables. '0' marks vowels and Y; 'D' marks W and H,
        # which are discarded while the code is being built.
        self.translations = translation(
            'AEIOUYWHBPFVCSKGJQXZDTLMNR',
            '000000DD111122222222334556'
        )

        self.refined_translations = translation(
            'AEIOUYWHBPFVCKSGJQXZDTLMNR',
            '000000DD112233344555667889'
        )

        # Selects the refined variant in phonetics().
        self.refined = refined
        # Right-pad with zeros and cut to exactly four characters.
        self.pad = lambda code: '{}0000'.format(code)[:4]

    def phonetics(self, word):
        """
        Return the Soundex code of ``word``.

        The refined variant is used when the instance was created with
        ``refined=True``. An empty string is returned when ``word``
        contains no ASCII letter after transliteration.

        Raises ``UnicodeException`` when ``word`` is not a ``str``.
        """
        if not isinstance(word, str):
            raise UnicodeException('Expected a unicode string!')

        word = unidecode(word).upper()
        word = re.sub(r'[^A-Z]', r'', word)

        # Nothing to encode; also protects the ``word[0]`` access below.
        if not word:
            return ''

        first_letter = word[0]

        if self.refined:
            return self._refined_soundex(first_letter, word)
        return self._soundex(first_letter, word)

    #
    # Private methods Simple/Refined Soundex
    #
    def _soundex(self, first_letter, word):
        """Soundex algorithm: first letter plus digits, padded to 4 characters."""
        codes = (self.translations[char] for char in word)
        tail = ''.join(code for code in codes if code != 'D')

        # Dropping the leading code when it belongs to the first letter.
        # ``startswith`` (not ``tail[0]``) keeps this safe when the word
        # holds only H/W and ``tail`` is therefore empty.
        # NOTE(review): only one leading code is dropped before squeezing,
        # so 'Pfister' yields 'P123' (the value pinned by this project's
        # tests) rather than the commonly cited 'P236'; confirm the intent.
        if tail.startswith(self.translations[first_letter]):
            tail = tail[1:]

        code = squeeze(tail).replace('0', '')
        return self.pad(first_letter + code)

    def _refined_soundex(self, first_letter, word):
        """Refined Soundex algorithm: first letter plus the squeezed codes of every letter."""
        codes = (self.refined_translations[char] for char in word)
        tail = ''.join(code for code in codes if code != 'D')
        return first_letter + squeeze(tail)

A pyphonetics/utils.py => pyphonetics/utils.py +15 -0
@@ 0,0 1,15 @@
from itertools import groupby

from .exceptions import WrongLengthException


def translation(first, second):
    """
    Create an index of mapped letters (zip to dict).

    Each item of ``first`` becomes a key whose value is the item at the same
    position in ``second``.

    Raises ``WrongLengthException`` when the two sequences differ in length.
    """
    if len(first) != len(second):
        # The message used to be a copy of the "unicode string" error used
        # elsewhere; it now describes the actual problem.
        raise WrongLengthException(
            'Expected two sequences of equal length, got {} and {}.'.format(
                len(first), len(second)
            )
        )
    return dict(zip(first, second))


def squeeze(word):
    """Collapse every run of identical consecutive items of ``word`` into a single one."""
    return ''.join(key for key, _run in groupby(word))

A tests/__init__.py => tests/__init__.py +0 -0
A tests/test_phonetics.py => tests/test_phonetics.py +189 -0
@@ 0,0 1,189 @@
from pyphonetics import Metaphone, Soundex, MatchingRatingApproach, FuzzySoundex, Lein


def test_metaphone():
    """Metaphone must produce the expected code for each sample word."""
    metaphone = Metaphone()

    # (expected code, input word)
    cases = [
        ('TSKRMNXN', 'discrimination'),
        ('HL', 'hello'),
        ('TRT', 'droid'),
        ('HPKRT', 'hypocrite'),
        ('WL', 'well'),
        ('AM', 'am'),
        ('S', 'say'),
        ('FSNT', 'pheasant'),
        ('KT', 'god'),
    ]

    for expected, word in cases:
        assert metaphone.phonetics(word) == expected


def test_soundex():
    """Plain Soundex must produce the expected code for each sample name."""
    soundex = Soundex()

    # (expected code, input name)
    cases = [
        ('R163', 'Rupert'),
        ('R163', 'Robert'),
        ('R150', 'Rubin'),
        ('A261', 'Ashcroft'),
        ('A261', 'Ashcraft'),
        ('T522', 'Tymczak'),
        ('P123', 'Pfister'),
        ('A536', 'Andrew'),
        ('W252', 'Wozniak'),
        ('C423', 'Callister'),
        ('H400', 'Hello'),
        ('M635', 'Martin'),
        ('B656', 'Bernard'),
        ('F600', 'Faure'),
        ('P620', 'Perez'),
        ('G620', 'Gros'),
        ('C120', 'Chapuis'),
        ('B600', 'Boyer'),
        ('G360', 'Gauthier'),
        ('R000', 'Rey'),
        ('B634', 'Barthélémy'),
        ('H560', 'Henry'),
        ('M450', 'Moulin'),
        ('R200', 'Rousseau'),
    ]

    for expected, name in cases:
        assert soundex.phonetics(name) == expected


def test_soundex_refined():
    """Refined Soundex must produce the expected code for each sample word."""
    soundex = Soundex(refined=True)

    # (expected code, input word)
    cases = [
        ('T6036084', 'testing'),
        ('T6036084', 'TESTING'),
        ('T60', 'The'),
        ('Q503', 'quick'),
        ('B1908', 'brown'),
        ('F205', 'fox'),
        ('J408106', 'jumped'),
        ('O0209', 'over'),
        ('L7050', 'lazy'),
        ('D6043', 'dogs'),
    ]

    for expected, word in cases:
        assert soundex.phonetics(word) == expected
        
        
def test_soundex_homophones():
    """Every name inside one group must share a single Soundex code."""
    soundex = Soundex()

    groups = [
        ('Braz', 'Broz'),
        ('Caren', 'Caron', 'Carren', 'Charon', 'Corain', 'Coram', 'Corran',
         'Corrin', 'Corwin', 'Curran', 'Curreen', 'Currin', 'Currom', 'Currum', 'Curwen'),
        ('Hairs', 'Hark', 'Hars', 'Hayers', 'Heers', 'Hiers'),
        ('Lambard', 'Lambart', 'Lambert', 'Lambird', 'Lampaert', 'Lampard',
         'Lampart', 'Lamperd', 'Lampert', 'Lamport', 'Limbert', 'Lombard'),
        ('Nolton', 'Noulton'),
    ]

    for group in groups:
        # A set keeps one entry per distinct code, so size 1 means "all equal".
        distinct_codes = {soundex.phonetics(name) for name in group}
        assert len(distinct_codes) == 1


def test_mra():
    """The Match Rating Approach must produce the expected codex for each name."""
    mra = MatchingRatingApproach()

    # (expected codex, input name)
    cases = [
        ('BYRN', 'Byrne'),
        ('BRN', 'Boern'),
        ('SMTH', 'Smith'),
        ('SMYTH', 'Smyth'),
        ('CTHRN', 'Catherine'),
        ('KTHRYN', 'Kathryn'),
    ]

    for expected, name in cases:
        assert mra.phonetics(name) == expected


def test_fuzzy_soundex():
    """Fuzzy Soundex must produce the expected code for each sample word."""
    fuzzy = FuzzySoundex()

    # (input word, expected code)
    cases = [
        ('', ''),
        ('Kristen', 'K6935'),
        ('Krissy', 'K69'),
        ('Christen', 'K6935'),
        ('peter', 'P36'),
        ('pete', 'P3'),
        ('pedro', 'P36'),
        ('stephen', 'S315'),
        ('steve', 'S31'),
        ('smith', 'S53'),
        ('smythe', 'S53'),
        ('gail', 'G4'),
        ('gayle', 'G4'),
        ('guillaume', 'G45'),
        ('christine', 'K6935'),
        ('christina', 'K6935'),
        ('kristina', 'K6935'),
        ('Wight', 'W3'),
        ('Hardt', 'H6'),
        ('Knight', 'N3'),
        ('Czech', 'S7'),
        ('Tsech', 'S7'),
        ('gnomic', 'N59'),
        ('Wright', 'R3'),
        ('Hrothgar', 'R376'),
        ('Hwaet', 'W3'),
        ('Grant', 'G63'),
        ('Hart', 'H6'),
    ]

    for word, expected in cases:
        assert fuzzy.phonetics(word) == expected


def test_lein():
    """Lein must produce the expected code for each sample name."""
    lein = Lein()

    # Names that must all be encoded as 'D450'.
    d450_names = (
        'Dabbs', 'Daves', 'Davies', 'Davis', 'Debaca', 'Debose', 'Debus',
        'Defazio', 'Defigh', 'Deveaux', 'Devese', 'Devies', 'Devos',
        'Dipiazza', 'Divish', 'Dobak', 'Dobbs', 'Dobis', 'Dobish', 'Dobosh',
        'Doepke', 'Dopps', 'Doubek', 'Doviak', 'Dubbs', 'Dubke', 'Dubois',
        'Duboise', 'Dubose', 'Dubs', 'Dubukey', 'Dubus', 'Dufek', 'Duffek',
        'Dupas', 'Dupois', 'Dupuis',
    )

    # (input name, expected code)
    cases = [('Guillaume', 'G320')]
    cases.extend((name, 'D450') for name in d450_names)
    cases.append(('Arlène', 'A332'))
    cases.append(('Lüdenscheidt', 'L125'))

    for name, expected in cases:
        assert lein.phonetics(name) == expected
\ No newline at end of file

A tests/test_utils.py => tests/test_utils.py +16 -0
@@ 0,0 1,16 @@
from pyphonetics.utils import squeeze, translation


def test_squeeze():
    """squeeze() must collapse runs of identical consecutive characters."""
    # (input, expected output)
    cases = [
        ('test', 'test'),
        ('hello yellow', 'helo yelow'),
        ('112345566', '123456'),
    ]

    for raw, expected in cases:
        assert squeeze(raw) == expected


def test_translation():
    """translation() must pair the items of both sequences position by position."""
    keys = ['a', 'b', 'c']
    values = [1, 2, 3]
    expected = {'a': 1, 'b': 2, 'c': 3}
    assert translation(keys, values) == expected
\ No newline at end of file