~linuxgoose/linguistics-robin: Added Levenshtein, Hamming distance, removed half-done algorithms.

19 files changed, 174 insertions(+), 271 deletions(-)

M README.md
M README.rst
M pyphonetics/__init__.py
A pyphonetics/distance_metrics/__init__.py
A pyphonetics/distance_metrics/hamming.py
A pyphonetics/distance_metrics/levenshtein.py
M pyphonetics/exceptions.py
M pyphonetics/phonetics/__init__.py
D pyphonetics/phonetics/daitch_mokotoff.py
M pyphonetics/phonetics/fuzzy_soundex.py
M pyphonetics/phonetics/lein.py
M pyphonetics/phonetics/metaphone.py
M pyphonetics/phonetics/mra.py
M pyphonetics/phonetics/phonetic_algorithm.py
M pyphonetics/phonetics/refined_soundex.py
M pyphonetics/phonetics/soundex.py
M pyphonetics/utils.py
A tests/test_distances.py
M tests/test_phonetics.py

M README.md => README.md +16 -0

@@ 8,6 8,11 @@ Pyphonetics is a Python 3 library for phonetic algorithms. Right now, the follow
  * Fuzzy Soundex
  * Lein
  * Matching Rating Approach
+ 
+In addition, the following distance metrics are supported:
+
+ * Hamming
+ * Levenshtein
 
 More will be added in the future.
 


@@ 38,6 43,17 @@ The same API applies to every algorithm, e.g:
 'TSKRMNXN'
 ```
 
+You can also use the `distance(word1, word2, metric='levenshtein')` method to find the distance between 2 phonetic representations.
+
+```python
+>>> from pyphonetics import RefinedSoundex
+>>> rs = RefinedSoundex()
+>>> rs.distance('Rupert', 'Robert')
+0
+>>> rs.distance('assign', 'assist', metric='hamming')
+2
+```
+
 ## Credits
 
 The module was largely based on the implementation of phonetic algorithms found in the [Talisman.js](https://github.com/Yomguithereal/talisman) Node NLP library.=
\ No newline at end of file

M README.rst => README.rst +10 -1

@@ 39,7 39,16 @@ The same API applies to every algorithm, e.g:
     >>> metaphone.phonetics('discrimination')
     'TSKRMNXN'
 
+You can also use the ``distance(word1, word2, metric='levenshtein')`` method to find the distance between 2 phonetic representations.
+
+    >>> from pyphonetics import RefinedSoundex
+    >>> rs = RefinedSoundex()
+    >>> rs.distance('Rupert', 'Robert')
+    0
+    >>> rs.distance('assign', 'assist', metric='hamming')
+    2
+
 Credits
 =======
 
-The module was largely based on the implementation of phonetic algorithms found in the [Talisman.js](https://github.com/Yomguithereal/talisman) Node NLP library.>
\ No newline at end of file
+The module was largely based on the implementation of phonetic algorithms found in the Talisman.js (https://github.com/Yomguithereal/talisman) Node NLP library.
\ No newline at end of file

M pyphonetics/__init__.py => pyphonetics/__init__.py +2 -4

@@ 4,8 4,6 @@ from .phonetics import (Soundex,
                         MatchingRatingApproach,
                         FuzzySoundex,
                         Lein,
-                        RefinedSoundex,
-                        # DaitchMokotoff
-                        )
+                        RefinedSoundex)
 
-__version__ = '0.3.1'
+__version__ = '0.4.1'

A pyphonetics/distance_metrics/__init__.py => pyphonetics/distance_metrics/__init__.py +2 -0

@@ 0,0 1,2 @@
+from .levenshtein import *
+from .hamming import *

A pyphonetics/distance_metrics/hamming.py => pyphonetics/distance_metrics/hamming.py +16 -0

@@ 0,0 1,16 @@
+from ..exceptions import WrongLengthException
+
+
+def hamming_distance(word1, word2):
+    """
+    Computes the Hamming distance.
+
+    [Reference]: https://en.wikipedia.org/wiki/Hamming_distance
+    [Article]: Hamming, Richard W. (1950), "Error detecting and error correcting codes",
+        Bell System Technical Journal 29 (2): 147–160
+    """
+    from operator import ne
+    if len(word1) != len(word2):
+        raise WrongLengthException('The words need to be of the same length!')
+
+    return sum(map(ne, word1, word2))

A pyphonetics/distance_metrics/levenshtein.py => pyphonetics/distance_metrics/levenshtein.py +29 -0

@@ 0,0 1,29 @@
+def levenshtein_distance(word1, word2):
+    """
+    Computes the Levenshtein distance.
+
+    [Reference]: https://en.wikipedia.org/wiki/Levenshtein_distance
+    [Article]: Levenshtein, Vladimir I. (February 1966). "Binary codes capable of correcting deletions,
+        insertions, and reversals". Soviet Physics Doklady 10 (8): 707–710.
+    [Implementation]: https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#Python
+    """
+    if len(word1) < len(word2):
+        return levenshtein_distance(word2, word1)
+
+    if len(word2) == 0:
+        return len(word1)
+
+    previous_row = list(range(len(word2) + 1))
+
+    for i, char1 in enumerate(word1):
+        current_row = [i + 1]
+
+        for j, char2 in enumerate(word2):
+            insertions = previous_row[j + 1] + 1
+            deletions = current_row[j] + 1
+            substitutions = previous_row[j] + (char1 != char2)
+
+            current_row.append(min(insertions, deletions, substitutions))
+
+        previous_row = current_row
+    return previous_row[-1]

M pyphonetics/exceptions.py => pyphonetics/exceptions.py +4 -0

@@ 4,3 4,7 @@ class UnicodeException(Exception):
 
 class WrongLengthException(Exception):
     pass
+
+
+class DistanceMetricError(Exception):
+    pass

M pyphonetics/phonetics/__init__.py => pyphonetics/phonetics/__init__.py +0 -1

@@ 4,4 4,3 @@ from .mra import *
 from .fuzzy_soundex import *
 from .lein import *
 from .refined_soundex import *
-# from .daitch_mokotoff import *

D pyphonetics/phonetics/daitch_mokotoff.py => pyphonetics/phonetics/daitch_mokotoff.py +0 -201

@@ 1,201 0,0 @@
-import re
-from unidecode import unidecode
-
-from ..exceptions import UnicodeException
-from .phonetic_algorithm import PhoneticAlgorithm
-
-
-class DaitchMokotoff(PhoneticAlgorithm):
-    """
-    The Daitch-Mokotoff Soundex.
-
-    [Reference]: https://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex
-    [Note]: For the (RS|RZ) part, the original algo says (94, 4) but most implementations
-        drop it to only (94). This implementation follows the original algo.
-    """
-    def __init__(self):
-        self.rules = {
-            'A': [
-                [r'^(AI|AJ|AY)', 0, 1, None],
-                [r'^AU', 0, 7, None],
-                [None, 0, None, None]
-            ],
-            'Ą': [
-                [None, None, None, [6, None]]
-            ],
-            'B': [
-                [None, 7, 7, 7]
-            ],
-            'C': [
-                [r'^CHS', 5, 54, 54],
-                [r'^CH', [5, 4], [5, 4], [5, 4]],
-                [r'^CK', [5, 45], [5, 45], [5, 45]],
-                [r'^(CSZ|CZS|CZ|CS)', 4, 4, 4],
-                [None, [5, 4], [5, 4], [5, 4]]
-            ],
-            'D': [
-                [r'^(DRZ|DRS|DSH|DSZ|DZH|DZS|DS|DZ)', 4, 4, 4],
-                [r'^(DT|D)', 3, 3, 3]
-            ],
-            'E': [
-                [r'^(EI|EJ|EY)', 0, 1, None],
-                [r'^EU', 1, 1, None],
-                [None, 0, None, None],
-            ],
-            'Ę': [
-                [None, None, None, [6, None]]
-            ],
-            'F': [
-                [r'^(FB|F)', 7, 7, 7]
-            ],
-            'G': [
-                [None, 5, 5, 5]
-            ],
-            'H': [
-                [None, 5, 5, None]
-            ],
-            'I': [
-                [r'^(IA|IE|IO|IU)', 1, None, None],
-                [None, 0, None, None]
-            ],
-            'J': [
-                [None, [1, 4], [None, 4], [None, 4]]
-            ],
-            'K': [
-                [r'^KS', 5, 54, 54],
-                [r'^(KH|K)', 5, 5, 5]
-            ],
-            'L': [
-                [None, 8, 8, 8]
-            ],
-            'M': [
-                ['MNNM', None, 66, 66],
-                ['MN', 6, 6, 6]
-            ],
-            'N': [
-                ['MNNM', None, 66, 66],
-                ['MN', 6, 6, 6],
-            ],
-            'O': [
-                [r'^(OI|OJ|OY)', 0, 1, None],
-                [None, 0, None, None]
-            ],
-            'P': [
-                [r'^(PF|PH|P)', 7, 7, 7]
-            ],
-            'Q': [
-                [None, 5, 5, 5]
-            ],
-            'R': [
-                [r'^(RZ|RS)', [94, 4], [94, 4], [94, 4]],
-                [None, 9, 9, 9]
-            ],
-            'S': [
-                [r'^(SCHTSCH|SCHTSH|SCHTCH|SHTCH|SHCH|SHTSH)', 2, 4, 4],
-                [r'^SCH', 4, 4, 4],
-                [r'^(SHT|SCHT|SCHD)', 2, 43, 43],
-                [r'^SH', 4, 4, 4],
-                [r'^(STCH|STSCH|SC|STRZ|STRS|STSH)', 2, 4, 4],
-                [r'^ST', 2, 43, 43],
-                [r'^(SZCZ|SZCS)', 2, 4, 4],
-                [r'^(SZT|SHD|SZD|SD)', 2, 43, 43],
-                [r'^(SZ|S)', 4, 4, 4]
-            ],
-            'T': [
-                [r'^(TCH|TTCH|TTSCH)', 4, 4, 4],
-                [r'^TH', 3, 3, 3],
-                [r'^(TRZ|TRS|TSCH|TSH|TS|TTS|TTSZ|TC|TZ|TTZ|TZS|TSZ)', 4, 4, 4],
-                [None, 3, 3, 3]
-            ],
-            'Ţ': [
-                [None, [3, 4], [3, 4], [3, 4]]
-            ],
-            'U': [
-                [r'^(UI|UJ|UY)', 0, 1, None],
-                [r'^(UE|U)', 0, None, None]
-            ],
-            'V': [
-                [None, 7, 7, 7]
-            ],
-            'W': [
-                [None, 7, 7, 7]
-            ],
-            'X': [
-                [None, 5, 54, 54]
-            ],
-            'Y': [
-                [None, 1, None, None]
-            ],
-            'Z': [
-                [r'^(ZHDZH|ZDZH|ZDZ)', 2, 4, 4],
-                [r'^(ZHD|ZD)', 2, 43, 43],
-                [r'^(ZSCH|ZSH|ZH|ZS|Z)', 4, 4, 4]
-            ],
-        }
-        self.pad = lambda code: '{}000000'.format(code)[:6]
-        self.vowels = 'AEIOUY'
-
-    def _permutations(self, code):
-        codes = ['']
-
-        for current_part in code:
-
-            if isinstance(current_part, dict):
-                # Double the codes
-                for item in codes:
-                    codes.append(item)
-
-                # Fill the nodes
-                length = len(codes)
-                for i in range(length):
-                    s = current_part[0] if i < length/2 else current_part[1]
-                    codes[i] = codes[i] + s if s is not None else codes[i]
-
-            else:
-                for i in range(len(codes)):
-                    codes[i] += current_part
-
-    def phonetics(self, word):
-        if not isinstance(word, str):
-            raise UnicodeException('Expected a unicode string!')
-
-        code = []
-        word = unidecode(word).upper()
-        current = re.sub(r'[^A-ZĄĘŢ]', r'', word)
-
-        start = True
-        last_pattern = ''
-
-        while len(current):
-            first_letter = current[0]
-            rules = self.rules[first_letter]
-
-            for rule in rules:
-                pattern, if_first_letter,\
-                    vowel_next, usual = rule
-
-                match = re.match(pattern, current) if pattern else [first_letter]
-                if match:
-                    if isinstance(match, list):
-                        offset = len(match[0])
-                    else:
-                        offset = len(pattern)
-
-                    correct_code = usual
-
-                    if start:
-                        correct_code = if_first_letter
-                    elif current[offset] in self.vowels:
-                        correct_code = vowel_next
-
-                    if last_pattern != pattern and correct_code is not None:
-                        code.append(correct_code)
-
-                    last_pattern = pattern or first_letter
-                    current = current[offset:]
-                    break
-
-            start = False
-
-        return map(self.pad, self._permutations(code))
-

M pyphonetics/phonetics/fuzzy_soundex.py => pyphonetics/phonetics/fuzzy_soundex.py +5 -5

@@ 14,6 14,8 @@ class FuzzySoundex(PhoneticAlgorithm):
     [Article]: Holmes, David and M. Catherine McCabe. "Improving Precision and Recall for Soundex Retrieval."
     """
     def __init__(self):
+        super().__init__()
+
         self.translations = translation(
             'ABCDEFGHIJKLMNOPQRSTUVWXYZ',
             '0193017-07745501769301-7-9'


@@ 48,7 50,7 @@ class FuzzySoundex(PhoneticAlgorithm):
         self.set1 = ['CS', 'CZ', 'TS', 'TZ']
         self.set2 = ['HR', 'WR']
         self.set3 = ['KN', 'NG']
-        self.set4 = list('HWY')
+        self.set4 = 'HWY'
 
     def phonetics(self, word):
         if not isinstance(word, str):


@@ 102,10 104,8 @@ class FuzzySoundex(PhoneticAlgorithm):
         code = squeeze(code)
 
         # Dealing with initials
-        if code[0] in self.set4:
-            code = first_letter
-        else:
-            code = first_letter + code[1:]
+        code = first_letter if code[0] in self.set4 \
+            else first_letter + code[1:]
 
         # Dropping vowels
         code = code.replace('0', '')

M pyphonetics/phonetics/lein.py => pyphonetics/phonetics/lein.py +3 -2

@@ 13,6 13,8 @@ class Lein(PhoneticAlgorithm):
     [Reference]: http://naldc.nal.usda.gov/download/27833/PDF
     """
     def __init__(self):
+        super().__init__()
+
         self.translations = translation(
             'DTMNLRBFPVCJKGQSXZ',
             '112233444455555555'


@@ 37,7 39,6 @@ class Lein(PhoneticAlgorithm):
         code = squeeze(code)[0: 4]
 
         # Translations
-        backup = code
-        code = ''.join(self.translations.get(char, char) for char in backup)
+        code = ''.join(self.translations.get(char, char) for char in code)
 
         return self.pad(first + code)

M pyphonetics/phonetics/metaphone.py => pyphonetics/phonetics/metaphone.py +2 -0

@@ 13,6 13,8 @@ class Metaphone(PhoneticAlgorithm):
     [Author]: Lawrence Philips, 1990
     """
     def __init__(self):
+        super().__init__()
+
         self.rules = [
             (r'[^a-z]', r''),
             (r'([bcdfhjklmnpqrstvwxyz])\1+', r'\1'),

M pyphonetics/phonetics/mra.py => pyphonetics/phonetics/mra.py +3 -0

@@ 15,6 15,9 @@ class MatchingRatingApproach(PhoneticAlgorithm):
         Accessing Individual Records from Personal Data Files Using Nonunique Identifiers.
         US National Institute of Standards and Technology. p. 17. NIST SP - 500-2.
     """
+    def __init__(self):
+        super().__init__()
+
     def phonetics(self, word):
         if not isinstance(word, str):
             raise UnicodeException('Expected a unicode string!')

M pyphonetics/phonetics/phonetic_algorithm.py => pyphonetics/phonetics/phonetic_algorithm.py +18 -0

@@ 1,8 1,18 @@
+from ..distance_metrics import levenshtein_distance, hamming_distance
+from ..exceptions import DistanceMetricError
+
+
 class PhoneticAlgorithm:
     """
     The main Phonetic Algorithm class, to ensure a unified API
     for all the included algorithms.
     """
+    def __init__(self):
+        self.distances = {
+            'levenshtein': levenshtein_distance,
+            'hamming': hamming_distance,
+        }
+
     def phonetics(self, word):
         """Get the phonetic representation of the word."""
         pass


@@ 10,3 20,11 @@ class PhoneticAlgorithm:
     def sounds_like(self, word1, word2):
         """Compare the phonetic representations of 2 words, and return a boolean value."""
         return self.phonetics(word1) == self.phonetics(word2)
+
+    def distance(self, word1, word2, metric='levenshtein'):
+        """Get the distance between the phonetic representations of 2 words, using one of the supported metrics."""
+        if metric in self.distances:
+            distance_func = self.distances[metric]
+            return distance_func(self.phonetics(word1), self.phonetics(word2))
+        else:
+            raise DistanceMetricError('Distance metric not supported! Choose from levenshtein, hamming.')

M pyphonetics/phonetics/refined_soundex.py => pyphonetics/phonetics/refined_soundex.py +3 -2

@@ 14,6 14,8 @@ class RefinedSoundex(PhoneticAlgorithm):
     [Authors]: Robert C. Russel, Margaret King Odell
     """
     def __init__(self):
+        super().__init__()
+
         self.translations = translation(
             'AEIOUYWHBPFVCKSGJQXZDTLMNR',
             '000000DD112233344555667889'


@@ 30,5 32,4 @@ class RefinedSoundex(PhoneticAlgorithm):
         tail = ''.join(self.translations[char] for char in word
                        if self.translations[char] != 'D')
 
-        code = squeeze(tail)
-        return first_letter + code
+        return first_letter + squeeze(tail)

M pyphonetics/phonetics/soundex.py => pyphonetics/phonetics/soundex.py +2 -0

@@ 14,6 14,8 @@ class Soundex(PhoneticAlgorithm):
     [Authors]: Robert C. Russel, Margaret King Odell
     """
     def __init__(self):
+        super().__init__()
+
         self.translations = translation(
             'AEIOUYWHBPFVCSKGJQXZDTLMNR',
             '000000DD111122222222334556'

M pyphonetics/utils.py => pyphonetics/utils.py +2 -2

@@ 6,10 6,10 @@ from .exceptions import WrongLengthException
 def translation(first, second):
     """Create an index of mapped letters (zip to dict)."""
     if len(first) != len(second):
-        raise WrongLengthException('Expected a unicode string!')
+        raise WrongLengthException('The lists are not of the same length!')
     return dict(zip(first, second))
 
 
 def squeeze(word):
-    """squeeze the given sequence by dropping consecutive duplicates."""
+    """Squeeze the given sequence by dropping consecutive duplicates."""
     return ''.join(x[0] for x in groupby(word))

A tests/test_distances.py => tests/test_distances.py +57 -0

@@ 0,0 1,57 @@
+from pyphonetics.distance_metrics import levenshtein_distance, hamming_distance
+
+
+def test_levenshtein():
+    tests = [
+        (('b', 'o', 'o', 'k'), ('b', 'a', 'c', 'k'), 2),
+        ('book', 'back', 2),
+        ('hello', 'helo', 1),
+        ('good sir', 'baal', 8),
+        ('say', 'shiver', 5),
+        ('feature', 'get-project-features', 13),
+        ('example', 'samples', 3),
+        ('sturgeon', 'urgently', 6),
+        ('levenshtein', 'frankenstein', 6),
+        ('distance', 'difference', 5),
+        ('a', 'b', 1),
+        ('ab', 'ac', 1),
+        ('ac', 'bc', 1),
+        ('abc', 'axc', 1),
+        ('xabxcdxxefxgx', '1ab2cd34ef5g6', 6),
+        ('a', '', 1),
+        ('ab', 'a', 1),
+        ('ab', 'b', 1),
+        ('abc', 'ac', 1),
+        ('xabxcdxxefxgx', 'abcdefg', 6),
+        ('', 'a', 1),
+        ('a', 'ab', 1),
+        ('b', 'ab', 1),
+        ('ac', 'abc', 1),
+        ('abcdefg', 'xabxcdxxefxgx', 6),
+        ('', '', 0),
+        ('a', 'a', 0),
+        ('abc', 'abc', 0),
+        ('', '', 0),
+        ('a', '', 1),
+        ('', 'a', 1),
+        ('abc', '', 3),
+        ('', 'abc', 3)
+    ]
+
+    for test in tests:
+        assert levenshtein_distance(test[0], test[1]) == test[2]
+
+
+def test_hamming():
+    tests = [
+        ('1011101', '1001001', 2),
+        ('2143896', '2233796', 3),
+        ('ramer', 'cases', 3),
+        ('abc', 'abc', 0),
+        ('abc', 'abd', 1),
+        ('night', 'nacht', 2),
+        ((0, 1, 0, 1), (1, 2, 0, 1), 2)
+    ]
+
+    for test in tests:
+        assert hamming_distance(test[0], test[1]) == test[2]

M tests/test_phonetics.py => tests/test_phonetics.py +0 -53

@@ 188,56 188,3 @@ def test_lein():
     lein = Lein()
     for test in tests:
         assert lein.phonetics(test[0]) == test[1]
-
-
-# def test_daitch_mokotoff():
-#     tests = [
-#         ['Alpert', ['087930']],
-#         ['Breuer', ['791900']],
-#         ['Golden', ['583600']],
-#         ['Haber', ['579000']],
-#         ['Manheim', ['665600']],
-#         ['Topf', ['370000']],
-#         ['Kleinman', ['586660']],
-#         ['Peters', ['739400', '734000']],
-#         ['Peterson', ['739460', '734600']],
-#         ['Moskowitz', ['645740']],
-#         ['Moskovitz', ['645740']],
-#         ['Auerbach', ['097500', '097400']],
-#         ['Ohrbach', ['097500', '097400']],
-#         ['Uhrbach', ['097500', '097400']],
-#         ['Lipshitz', ['874400']],
-#         ['Lippszyc', ['874500', '874400']],
-#         ['Lewinsky', ['876450']],
-#         ['Levinsky', ['876450']],
-#         ['Szlamawicz', ['486740']],
-#         ['Shlamovitz', ['486740']],
-#         ['Jackson', ['154600', '454600', '145460', '445460']],
-#         ['Jackson-Jackson', ['154654', '454654', '145465', '445465',
-#                              '154645', '454645', '145464', '445464', '154644', '454644']],
-#         ['augsburg', ['054795']],
-#         ['halberstadt', ['587943', '587433']],
-#         ['mannheim', ['665600']],
-#         ['chernowitz', ['596740', '496740']],
-#         ['cherkassy', ['595400', '495400']],
-#         ['berlin', ['798600']],
-#         ['mintz', ['664000']],
-#         ['eisenstadt', ['046433']],
-#         ['izenstadt', ['046433']],
-#         ['lewin', ['876000']],
-#         ['levine', ['876000']],
-#         ['szlachter', ['485390', '484390']],
-#         ['chelm', ['586000', '486000']],
-#         ['chelmie', ['586000', '486000']],
-#         ['chelma', ['586000', '486000']],
-#         ['helm', ['586000']],
-#         ['daitch', ['340000']],
-#         ['levy', ['870000']],
-#         ['mokotoff', ['653700']],
-#         ['chajackachac', ['515550', '415550', '514555', '414555', '515450', '415450', '514545',
-#                           '414545', '515540', '415540','514554', '414554', '515440', '415440', '514544', '414544']]
-#     ]
-#
-#     dm = DaitchMokotoff()
-#     for test in tests:
-#         assert dm.phonetics(test[0]) == test[1]