~linuxgoose/linguistics-robin: Algorithm implementation:

15 files changed, 575 insertions(+), 2 deletions(-)

A LICENSE.rst
M flit.ini
M pyphonetics/__init__.py
A pyphonetics/exceptions.py
M pyphonetics/phonetics/__init__.py
A pyphonetics/phonetics/fuzzy_soundex.py
A pyphonetics/phonetics/lein.py
A pyphonetics/phonetics/metaphone.py
A pyphonetics/phonetics/mra.py
A pyphonetics/phonetics/phonetic_algorithm.py
M pyphonetics/phonetics/soundex.py
A pyphonetics/utils.py
A tests/__init__.py
A tests/test_phonetics.py
A tests/test_utils.py

A LICENSE.rst => LICENSE.rst +21 -0

@@ 0,0 1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2016 Guillaume Plique (Yomguithereal)
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
\ No newline at end of file

M flit.ini => flit.ini +3 -0

@@ 3,6 3,9 @@ module=pyphonetics
 author=Lilykos
 author-email=lilykosk@gmail.com
 home-page=http://github.com/Lilykos/pyphonetics
+requires=unidecode
+    pytest
+requires-python= >=3
 description-file=README.rst
 classifiers=Intended Audience :: Developers
     Programming Language :: Python :: 3

M pyphonetics/__init__.py => pyphonetics/__init__.py +2 -2

@@ 1,4 1,4 @@
 """A Python 3 phonetics library."""
-from .phonetics import *
+from .phonetics import Soundex, Metaphone, MatchingRatingApproach, FuzzySoundex, Lein
 
-__version__ = '0.1'
+__version__ = '0.2'

A pyphonetics/exceptions.py => pyphonetics/exceptions.py +6 -0

@@ 0,0 1,6 @@
+class UnicodeException(Exception):
+    """Raised by the phonetic algorithms when the input is not a `str`."""
+
+
+class WrongLengthException(Exception):
+    """Raised when two sequences that must have equal lengths do not."""

M pyphonetics/phonetics/__init__.py => pyphonetics/phonetics/__init__.py +4 -0

@@ 1,1 1,5 @@
 from .soundex import *
+from .metaphone import *
+from .mra import *
+from .fuzzy_soundex import *
+from .lein import *<
\ No newline at end of file

A pyphonetics/phonetics/fuzzy_soundex.py => pyphonetics/phonetics/fuzzy_soundex.py +112 -0

@@ 0,0 1,112 @@
+import re
+from unidecode import unidecode
+
+from ..utils import squeeze, translation
+from ..exceptions import UnicodeException
+from .phonetic_algorithm import PhoneticAlgorithm
+
+
+class FuzzySoundex(PhoneticAlgorithm):
+    """
+    Implementation of the "Fuzzy Soundex" algorithm.
+
+    [Reference]: http://wayback.archive.org/web/20100629121128/http://www.ir.iit.edu/publications/downloads/IEEESoundexV5.pdf
+    [Article]: Holmes, David and M. Catherine McCabe. "Improving Precision and Recall for Soundex Retrieval."
+    """
+    def __init__(self):
+        # Letter -> code table. '-' marks letters that are removed from the
+        # code outright; '0' marks letters that survive squeezing and are
+        # only dropped as the very last step.
+        self.translations = translation(
+            'ABCDEFGHIJKLMNOPQRSTUVWXYZ',
+            '0193017-07745501769301-7-9'
+        )
+
+        # Letter-group rewrites applied, in this order, anywhere in the word.
+        self.rules = [
+            (r'CA', r'KA'),
+            (r'CC', r'KK'),
+            (r'CK', r'KK'),
+            (r'CE', r'SE'),
+            (r'CHL', r'KL'),
+            (r'CL', r'KL'),
+            (r'CHR', r'KR'),
+            (r'CR', r'KR'),
+            (r'CI', r'SI'),
+            (r'CO', r'KO'),
+            (r'CU', r'KU'),
+            (r'CY', r'SY'),
+            (r'DG', r'GG'),
+            (r'GH', r'HH'),
+            (r'MAC', r'MK'),
+            (r'MC', r'MK'),
+            (r'NST', r'NSS'),
+            (r'PF', r'FF'),
+            (r'PH', r'FF'),
+            (r'SCH', r'SSS'),
+            (r'TIO', r'SIO'),
+            (r'TIA', r'SIO'),
+            (r'TCH', r'CHH'),
+        ]
+
+        # Two-letter prefixes that are rewritten before the rules above run.
+        self.set1 = ['CS', 'CZ', 'TS', 'TZ']  # -> 'SS'
+        self.set2 = ['HR', 'WR']              # -> 'RR'
+        self.set3 = ['KN', 'NG']              # -> 'NN'
+        # Initials that translate to '-' and therefore contribute no digit.
+        self.set4 = list('HWY')
+
+    def phonetics(self, word):
+        """
+        Return the Fuzzy Soundex code of `word`.
+
+        The word is transliterated to ASCII and upper-cased, prefix, suffix
+        and in-word substitutions are applied, and the letters are then
+        translated to digits. The (possibly rewritten) first letter is kept
+        as the leading character of the code.
+
+        Returns an empty string for empty input.
+        Raises UnicodeException if `word` is not a `str`.
+        """
+        if not isinstance(word, str):
+            raise UnicodeException('Expected a unicode string!')
+
+        word = unidecode(word).upper()
+
+        # Checked after transliteration so that input which transliterates
+        # to nothing is covered as well as the plain empty string.
+        if not word:
+            return ''
+
+        # Substitutions for beginnings
+        first_two, rest = word[:2], word[2:]
+
+        if first_two in self.set1:
+            word = 'SS' + rest
+        elif first_two == 'GN':
+            word = 'NN' + rest
+        elif first_two in self.set2:
+            word = 'RR' + rest
+        elif first_two == 'HW':
+            word = 'WW' + rest
+        elif first_two in self.set3:
+            word = 'NN' + rest
+
+        # Substitutions for endings
+        last_two, initial = word[-2:], word[:-2]
+
+        if last_two == 'CH':
+            word = initial + 'KK'
+        elif last_two == 'NT':
+            word = initial + 'TT'
+        elif last_two == 'RT':
+            word = initial + 'RR'
+        elif word[-3:] == 'RDT':
+            word = word[:-3] + 'RR'
+
+        # Applying the rules
+        for pattern, replacement in self.rules:
+            word = re.sub(pattern, replacement, word)
+
+        # Catch the first letter (after all substitutions)
+        first_letter = word[0]
+
+        # Translating; characters without a table entry are kept as they are
+        code = ''.join(self.translations.get(char, char) for char in word)
+
+        # Removing hyphens
+        code = code.replace('-', '')
+
+        # Squeezing the code
+        code = squeeze(code)
+
+        # Dealing with initials. H, W and Y translate to '-' and were removed
+        # above, so for those initials every remaining digit belongs to a
+        # later letter and must be kept. For any other initial the first
+        # digit encodes the initial itself and is replaced by the letter.
+        # Slicing (rather than indexing) keeps this safe for an empty code.
+        if first_letter in self.set4:
+            code = first_letter + code
+        else:
+            code = first_letter + code[1:]
+
+        # Dropping vowels
+        return code.replace('0', '')

A pyphonetics/phonetics/lein.py => pyphonetics/phonetics/lein.py +43 -0

@@ 0,0 1,43 @@
+import re
+from unidecode import unidecode
+
+from ..utils import squeeze, translation
+from ..exceptions import UnicodeException
+from .phonetic_algorithm import PhoneticAlgorithm
+
+
+class Lein(PhoneticAlgorithm):
+    """
+    The Lein name coding procedure.
+
+    [Reference]: http://naldc.nal.usda.gov/download/27833/PDF
+    """
+    def __init__(self):
+        # Consonant -> digit table applied to the letters after the first.
+        self.translations = translation(
+            'DTMNLRBFPVCJKGQSXZ',
+            '112233444455555555'
+        )
+
+        # Right-pad with zeros and cut to exactly four characters.
+        self.pad = lambda code: '{}0000'.format(code)[:4]
+
+    def phonetics(self, word):
+        """
+        Return the four-character Lein code of `word`.
+
+        The code is the first letter of the word followed by up to three
+        digits, padded with zeros.
+
+        Returns an empty string when `word` contains no letters.
+        Raises UnicodeException if `word` is not a `str`.
+        """
+        if not isinstance(word, str):
+            raise UnicodeException('Expected a unicode string!')
+
+        word = unidecode(word).upper()
+        # Keep letters only, so spaces, digits and punctuation cannot leak
+        # into the code.
+        word = re.sub(r'[^A-Z]', r'', word)
+
+        # Nothing left to encode
+        if not word:
+            return ''
+
+        # Keep the 1st letter
+        first, code = word[0], word[1:]
+
+        # Drop vowels and Y, W & H
+        code = re.sub(r'[AEIOUYWH]', r'', code)
+
+        # Drop consecutive duplicates and truncate to 4 chars
+        code = squeeze(code)[:4]
+
+        # Translations
+        code = ''.join(self.translations.get(char, char) for char in code)
+
+        return self.pad(first + code)

A pyphonetics/phonetics/metaphone.py => pyphonetics/phonetics/metaphone.py +53 -0

@@ 0,0 1,53 @@
+import re
+from unidecode import unidecode
+
+from ..exceptions import UnicodeException
+from .phonetic_algorithm import PhoneticAlgorithm
+
+
+class Metaphone(PhoneticAlgorithm):
+    """
+    The metaphone algorithm.
+
+    [Reference]: https://en.wikipedia.org/wiki/Metaphone
+    [Author]: Lawrence Philips, 1990
+    """
+    def __init__(self):
+        # Ordered (pattern, replacement) pairs. Patterns match the lower-cased
+        # input while replacements are upper-case, so letters that were
+        # already encoded are not matched again by later rules.
+        # Every replacement is a raw string: in a plain string '\1' is the
+        # control character \x01, not a reference to group 1.
+        self.rules = [
+            (r'[^a-z]', r''),
+            (r'([bcdfhjklmnpqrstvwxyz])\1+', r'\1'),
+            (r'^ae', r'E'),
+            (r'^[gkp]n', r'N'),
+            (r'^wr', r'R'),
+            (r'^x', r'S'),
+            (r'^wh', r'W'),
+            (r'mb$', r'M'),
+            (r'(?!^)sch', r'SK'),
+            (r'th', r'0'),
+            (r't?ch|sh', r'X'),
+            (r'c(?=ia)', r'X'),
+            (r'[st](?=i[ao])', r'X'),
+            (r's?c(?=[iey])', r'S'),
+            (r'[cq]', r'K'),
+            (r'dg(?=[iey])', r'J'),
+            (r'd', r'T'),
+            (r'g(?=h[^aeiou])', r''),
+            (r'gn(ed)?', r'N'),
+            (r'([^g]|^)g(?=[iey])', r'\1J'),
+            (r'g+', r'K'),
+            (r'ph', r'F'),
+            (r'([aeiou])h(?=\b|[^aeiou])', r'\1'),
+            (r'[wy](?![aeiou])', r''),
+            (r'z', r'S'),
+            (r'v', r'F'),
+            (r'(?!^)[aeiou]+', r'')
+        ]
+
+    def phonetics(self, word):
+        """
+        Return the upper-cased Metaphone key of `word`.
+
+        The word is transliterated to ASCII, lower-cased and run through
+        `self.rules` in order.
+
+        Raises UnicodeException if `word` is not a `str`.
+        """
+        if not isinstance(word, str):
+            raise UnicodeException('Expected a unicode string!')
+
+        code = unidecode(word).lower()
+        for pattern, replacement in self.rules:
+            code = re.sub(pattern, replacement, code)
+        return code.upper()

A pyphonetics/phonetics/mra.py => pyphonetics/phonetics/mra.py +33 -0

@@ 0,0 1,33 @@
+import re
+from unidecode import unidecode
+
+from ..utils import squeeze
+from ..exceptions import UnicodeException
+from .phonetic_algorithm import PhoneticAlgorithm
+
+
+class MatchingRatingApproach(PhoneticAlgorithm):
+    """
+    Functions related to the computation of the Match Rating Approach codex.
+
+    [Reference]: https://en.wikipedia.org/wiki/Match_rating_approach
+    [Article]: Moore, G B.; Kuhns, J L.; Treffzs, J L.; Montgomery, C A. (Feb 1, 1977).
+        Accessing Individual Records from Personal Data Files Using Nonunique Identifiers.
+        US National Institute of Standards and Technology. p. 17. NIST SP - 500-2.
+    """
+    def phonetics(self, word):
+        """
+        Return the Match Rating Approach codex of `word`.
+
+        The codex keeps the first letter, drops every later vowel, collapses
+        runs of identical letters and, when more than six letters remain,
+        keeps only the first three and the last three.
+
+        Returns an empty string when `word` contains no letters.
+        Raises UnicodeException if `word` is not a `str`.
+        """
+        if not isinstance(word, str):
+            raise UnicodeException('Expected a unicode string!')
+
+        codex = unidecode(word).upper()
+        codex = re.sub(r'[^A-Z]', r'', codex)
+
+        # Nothing left to encode
+        if not codex:
+            return ''
+
+        # Dropping non-leading vowels
+        codex = codex[0] + re.sub(r'[AEIOU]', r'', codex[1:])
+
+        # Dropping consecutive duplicate letters
+        codex = squeeze(codex)
+
+        # Returning the codex: short ones are returned whole
+        if len(codex) <= 6:
+            return codex
+        return codex[:3] + codex[-3:]

A pyphonetics/phonetics/phonetic_algorithm.py => pyphonetics/phonetics/phonetic_algorithm.py +12 -0

@@ 0,0 1,12 @@
+class PhoneticAlgorithm:
+    """
+    The main Phonetic Algorithm class, to ensure a unified API
+    for all the included algorithms.
+    """
+    def phonetics(self, word):
+        """
+        Get the phonetic representation of the word.
+
+        Subclasses must override this method. The base implementation raises
+        NotImplementedError instead of silently returning None, which would
+        make `sounds_like` report every pair of words as a match.
+        """
+        raise NotImplementedError(
+            '{} does not implement phonetics()'.format(type(self).__name__)
+        )
+
+    def sounds_like(self, word1, word2):
+        """Compare the phonetic representations of 2 words, and return a boolean value."""
+        return self.phonetics(word1) == self.phonetics(word2)

M pyphonetics/phonetics/soundex.py => pyphonetics/phonetics/soundex.py +66 -0

@@ 0,0 1,66 @@
+import re
+from unidecode import unidecode
+
+from ..utils import translation, squeeze
+from ..exceptions import UnicodeException
+from .phonetic_algorithm import PhoneticAlgorithm
+
+
+class Soundex(PhoneticAlgorithm):
+    """
+    The Soundex algorithm.
+
+    [Reference]: https://en.wikipedia.org/wiki/Soundex
+    [Authors]: Robert C. Russel, Margaret King Odell
+    """
+    def __init__(self, refined=False):
+        # Letter -> code tables. '0' marks the vowels and Y; 'D' marks
+        # W and H, which are discarded before the code is built.
+        self.translations = translation(
+            'AEIOUYWHBPFVCSKGJQXZDTLMNR',
+            '000000DD111122222222334556'
+        )
+
+        self.refined_translations = translation(
+            'AEIOUYWHBPFVCKSGJQXZDTLMNR',
+            '000000DD112233344555667889'
+        )
+
+        # When True, `phonetics` returns the refined (unpadded) code.
+        self.refined = refined
+        # Right-pad with zeros and cut to exactly four characters.
+        self.pad = lambda code: '{}0000'.format(code)[:4]
+
+    def phonetics(self, word):
+        """
+        Return the Soundex code of `word` (refined if `self.refined` is set).
+
+        Returns an empty string when `word` contains no letters.
+        Raises UnicodeException if `word` is not a `str`.
+        """
+        if not isinstance(word, str):
+            raise UnicodeException('Expected a unicode string!')
+
+        word = unidecode(word).upper()
+        word = re.sub(r'[^A-Z]', r'', word)
+
+        # Nothing left to encode
+        if not word:
+            return ''
+
+        first_letter = word[0]
+
+        if self.refined:
+            return self._refined_soundex(first_letter, word)
+        return self._soundex(first_letter, word)
+
+    #
+    # Private methods Simple/Refined Soundex
+    #
+    def _soundex(self, first_letter, word):
+        """Soundex algorithm: first letter plus digits, padded to 4 chars."""
+        codes = (self.translations[char] for char in word)
+        tail = ''.join(code for code in codes if code != 'D')
+
+        # Dropping first code's letter if duplicate. Slicing instead of
+        # indexing keeps this safe when the word is made only of H/W and
+        # the tail is therefore empty.
+        if tail[:1] == self.translations[first_letter]:
+            tail = tail[1:]
+
+        code = squeeze(tail).replace('0', '')
+        return self.pad(first_letter + code)
+
+    def _refined_soundex(self, first_letter, word):
+        """Refined Soundex algorithm: first letter plus the squeezed codes."""
+        codes = (self.refined_translations[char] for char in word)
+        tail = ''.join(code for code in codes if code != 'D')
+        return first_letter + squeeze(tail)

A pyphonetics/utils.py => pyphonetics/utils.py +15 -0

@@ 0,0 1,15 @@
+from itertools import groupby
+
+from .exceptions import WrongLengthException
+
+
+def translation(first, second):
+    """
+    Create an index of mapped letters (zip to dict).
+
+    Each item of `first` becomes a key whose value is the item at the same
+    position in `second`.
+
+    Raises WrongLengthException if the two sequences differ in length.
+    """
+    if len(first) != len(second):
+        raise WrongLengthException(
+            'Expected sequences of equal length, got {} and {}!'.format(
+                len(first), len(second)
+            )
+        )
+    return dict(zip(first, second))
+
+
+def squeeze(word):
+    """Collapse every run of identical consecutive items into a single one."""
+    return ''.join(key for key, _run in groupby(word))

A tests/__init__.py => tests/__init__.py +0 -0

A tests/test_phonetics.py => tests/test_phonetics.py +189 -0

@@ 0,0 1,189 @@
+from pyphonetics import Metaphone, Soundex, MatchingRatingApproach, FuzzySoundex, Lein
+
+
+def test_metaphone():
+    """Each word must encode to the expected Metaphone key."""
+    expectations = [
+        ('TSKRMNXN', 'discrimination'),
+        ('HL', 'hello'),
+        ('TRT', 'droid'),
+        ('HPKRT', 'hypocrite'),
+        ('WL', 'well'),
+        ('AM', 'am'),
+        ('S', 'say'),
+        ('FSNT', 'pheasant'),
+        ('KT', 'god')
+    ]
+
+    encoder = Metaphone()
+    for expected, word in expectations:
+        assert encoder.phonetics(word) == expected
+
+
+def test_soundex():
+    """Each name must encode to the expected four-character Soundex code."""
+    expectations = [
+        ('R163', 'Rupert'),
+        ('R163', 'Robert'),
+        ('R150', 'Rubin'),
+        ('A261', 'Ashcroft'),
+        ('A261', 'Ashcraft'),
+        ('T522', 'Tymczak'),
+        ('P123', 'Pfister'),
+        ('A536', 'Andrew'),
+        ('W252', 'Wozniak'),
+        ('C423', 'Callister'),
+        ('H400', 'Hello'),
+        ('M635', 'Martin'),
+        ('B656', 'Bernard'),
+        ('F600', 'Faure'),
+        ('P620', 'Perez'),
+        ('G620', 'Gros'),
+        ('C120', 'Chapuis'),
+        ('B600', 'Boyer'),
+        ('G360', 'Gauthier'),
+        ('R000', 'Rey'),
+        ('B634', 'Barthélémy'),
+        ('H560', 'Henry'),
+        ('M450', 'Moulin'),
+        ('R200', 'Rousseau')
+    ]
+
+    encoder = Soundex()
+    for expected, name in expectations:
+        assert encoder.phonetics(name) == expected
+
+
+def test_soundex_refined():
+    """Each word must encode to the expected refined Soundex code."""
+    expectations = [
+        ('T6036084', 'testing'),
+        ('T6036084', 'TESTING'),
+        ('T60', 'The'),
+        ('Q503', 'quick'),
+        ('B1908', 'brown'),
+        ('F205', 'fox'),
+        ('J408106', 'jumped'),
+        ('O0209', 'over'),
+        ('L7050', 'lazy'),
+        ('D6043', 'dogs')
+    ]
+
+    encoder = Soundex(refined=True)
+    for expected, word in expectations:
+        assert encoder.phonetics(word) == expected
+        
+        
+def test_soundex_homophones():
+    """All names within one group must share a single Soundex code."""
+    groups = [
+        ('Braz', 'Broz'),
+        ('Caren', 'Caron', 'Carren', 'Charon', 'Corain', 'Coram', 'Corran',
+         'Corrin', 'Corwin', 'Curran', 'Curreen', 'Currin', 'Currom', 'Currum', 'Curwen'),
+        ('Hairs', 'Hark', 'Hars', 'Hayers', 'Heers', 'Hiers'),
+        ('Lambard', 'Lambart', 'Lambert', 'Lambird', 'Lampaert', 'Lampard',
+         'Lampart', 'Lamperd', 'Lampert', 'Lamport', 'Limbert', 'Lombard'),
+        ('Nolton', 'Noulton')
+    ]
+
+    encoder = Soundex()
+    for group in groups:
+        # A set holds each distinct code once, so one element means all agree
+        codes = {encoder.phonetics(name) for name in group}
+        assert len(codes) == 1
+
+
+def test_mra():
+    """Each name must encode to the expected Match Rating Approach codex."""
+    expectations = [
+        ('BYRN', 'Byrne'),
+        ('BRN', 'Boern'),
+        ('SMTH', 'Smith'),
+        ('SMYTH', 'Smyth'),
+        ('CTHRN', 'Catherine'),
+        ('KTHRYN', 'Kathryn')
+    ]
+
+    encoder = MatchingRatingApproach()
+    for expected, name in expectations:
+        assert encoder.phonetics(name) == expected
+
+
+def test_fuzzy_soundex():
+    """Each word must encode to the expected Fuzzy Soundex code."""
+    expectations = [
+        ('', ''),
+        ('Kristen', 'K6935'),
+        ('Krissy', 'K69'),
+        ('Christen', 'K6935'),
+        ('peter', 'P36'),
+        ('pete', 'P3'),
+        ('pedro', 'P36'),
+        ('stephen', 'S315'),
+        ('steve', 'S31'),
+        ('smith', 'S53'),
+        ('smythe', 'S53'),
+        ('gail', 'G4'),
+        ('gayle', 'G4'),
+        ('guillaume', 'G45'),
+        ('christine', 'K6935'),
+        ('christina', 'K6935'),
+        ('kristina', 'K6935'),
+        ('Wight', 'W3'),
+        ('Hardt', 'H6'),
+        ('Knight', 'N3'),
+        ('Czech', 'S7'),
+        ('Tsech', 'S7'),
+        ('gnomic', 'N59'),
+        ('Wright', 'R3'),
+        ('Hrothgar', 'R376'),
+        ('Hwaet', 'W3'),
+        ('Grant', 'G63'),
+        ('Hart', 'H6')
+    ]
+
+    encoder = FuzzySoundex()
+    for word, expected in expectations:
+        assert encoder.phonetics(word) == expected
+
+
+def test_lein():
+    """Each name must encode to the expected Lein code."""
+    expectations = [
+        ('Guillaume', 'G320'),
+        ('Dabbs', 'D450'),
+        ('Daves', 'D450'),
+        ('Davies', 'D450'),
+        ('Davis', 'D450'),
+        ('Debaca', 'D450'),
+        ('Debose', 'D450'),
+        ('Debus', 'D450'),
+        ('Defazio', 'D450'),
+        ('Defigh', 'D450'),
+        ('Deveaux', 'D450'),
+        ('Devese', 'D450'),
+        ('Devies', 'D450'),
+        ('Devos', 'D450'),
+        ('Dipiazza', 'D450'),
+        ('Divish', 'D450'),
+        ('Dobak', 'D450'),
+        ('Dobbs', 'D450'),
+        ('Dobis', 'D450'),
+        ('Dobish', 'D450'),
+        ('Dobosh', 'D450'),
+        ('Doepke', 'D450'),
+        ('Dopps', 'D450'),
+        ('Doubek', 'D450'),
+        ('Doviak', 'D450'),
+        ('Dubbs', 'D450'),
+        ('Dubke', 'D450'),
+        ('Dubois', 'D450'),
+        ('Duboise', 'D450'),
+        ('Dubose', 'D450'),
+        ('Dubs', 'D450'),
+        ('Dubukey', 'D450'),
+        ('Dubus', 'D450'),
+        ('Dufek', 'D450'),
+        ('Duffek', 'D450'),
+        ('Dupas', 'D450'),
+        ('Dupois', 'D450'),
+        ('Dupuis', 'D450'),
+        ('Arlène', 'A332'),
+        ('Lüdenscheidt', 'L125')
+    ]
+
+    encoder = Lein()
+    for name, expected in expectations:
+        assert encoder.phonetics(name) == expected
\ No newline at end of file

A tests/test_utils.py => tests/test_utils.py +16 -0

@@ 0,0 1,16 @@
+from pyphonetics.utils import squeeze, translation
+
+
+def test_squeeze():
+    """Runs of identical consecutive characters must collapse to one."""
+    expectations = [
+        ('test', 'test'),
+        ('hello yellow', 'helo yelow'),
+        ('112345566', '123456')
+    ]
+
+    for text, expected in expectations:
+        assert squeeze(text) == expected
+
+
+def test_translation():
+    """Two equal-length sequences must be zipped into a dict."""
+    mapping = translation(['a', 'b', 'c'], [1, 2, 3])
+    assert mapping == {'a': 1, 'b': 2, 'c': 3}
\ No newline at end of file