~linuxgoose/linguistics-robin

ca52233076f58ce9a213ac62dfe5c47afd198def — Lilykos 9 years ago ea93ccc
Updated README with instructions of use.
Added the skeleton for DaitchMokotoff.
A README.md => README.md +43 -0
@@ 0,0 1,43 @@
# Pyphonetics

Pyphonetics is a Python 3 library for phonetic algorithms. Right now, the following algorithms are implemented and supported:

 * Soundex
 * Metaphone
 * Refined Soundex
 * Fuzzy Soundex
 * Lein
 * Matching Rating Approach

More will be added in the future.

## Installation

The module is available on PyPI; install it with `pip install pyphonetics`.


## Usage

```python
>>> from pyphonetics import Soundex
>>> soundex = Soundex()
>>> soundex.phonetics('Rupert')
'R163'
>>> soundex.phonetics('Robert')
'R163'
>>> soundex.sounds_like('Robert', 'Rupert')
True
```

The same API applies to every algorithm, e.g.:

```python
>>> from pyphonetics import Metaphone
>>> metaphone = Metaphone()
>>> metaphone.phonetics('discrimination')
'TSKRMNXN'
```

## Credits

The module was largely based on the implementation of phonetic algorithms found in the [Talisman.js](https://github.com/Yomguithereal/talisman) Node NLP library.
\ No newline at end of file

M README.rst => README.rst +41 -1
@@ 2,4 2,44 @@
Pyphonetics
===========

A Python 3 phonetics library.
\ No newline at end of file
Pyphonetics is a Python 3 library for phonetic algorithms. Right now, the following algorithms are implemented and supported:

 * Soundex
 * Metaphone
 * Refined Soundex
 * Fuzzy Soundex
 * Lein
 * Matching Rating Approach

More will be added in the future.

Installation
************

The module is available on PyPI; install it with ``pip install pyphonetics``.


Usage
*****

    >>> from pyphonetics import Soundex
    >>> soundex = Soundex()
    >>> soundex.phonetics('Rupert')
    'R163'
    >>> soundex.phonetics('Robert')
    'R163'
    >>> soundex.sounds_like('Robert', 'Rupert')
    True


The same API applies to every algorithm, e.g.:

    >>> from pyphonetics import Metaphone
    >>> metaphone = Metaphone()
    >>> metaphone.phonetics('discrimination')
    'TSKRMNXN'

Credits
=======

The module was largely based on the implementation of phonetic algorithms found in the `Talisman.js <https://github.com/Yomguithereal/talisman>`_ Node NLP library.
\ No newline at end of file

M pyphonetics/__init__.py => pyphonetics/__init__.py +9 -2
@@ 1,4 1,11 @@
"""A Python 3 phonetics library."""
from .phonetics import Soundex, Metaphone, MatchingRatingApproach, FuzzySoundex, Lein
from .phonetics import (Soundex,
                        Metaphone,
                        MatchingRatingApproach,
                        FuzzySoundex,
                        Lein,
                        RefinedSoundex,
                        # DaitchMokotoff
                        )

__version__ = '0.2'
__version__ = '0.3.1'

M pyphonetics/phonetics/__init__.py => pyphonetics/phonetics/__init__.py +3 -1
@@ 2,4 2,6 @@ from .soundex import *
from .metaphone import *
from .mra import *
from .fuzzy_soundex import *
from .lein import *
\ No newline at end of file
from .lein import *
from .refined_soundex import *
# from .daitch_mokotoff import *

A pyphonetics/phonetics/daitch_mokotoff.py => pyphonetics/phonetics/daitch_mokotoff.py +201 -0
@@ 0,0 1,201 @@
import re
from unidecode import unidecode

from ..exceptions import UnicodeException
from .phonetic_algorithm import PhoneticAlgorithm


class DaitchMokotoff(PhoneticAlgorithm):
    """
    The Daitch-Mokotoff Soundex.

    Unlike the other algorithms in this package, a single word may map to
    several codes, so `phonetics` returns a list of six-character strings.

    [Reference]: https://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex
    [Note]: For the (RS|RZ) part, the original algo says (94, 4) but most implementations
        drop it to only (94). This implementation follows the original algo.
    """
    def __init__(self):
        # Rules are keyed by the first letter of the remaining input. Each
        # rule is [pattern, code_at_start, code_before_vowel, code_otherwise]:
        #   * pattern is a regex anchored at the start of the remaining input,
        #     or None to consume exactly the one leading letter;
        #   * a code is an int, None (emit nothing), or a two-item list of
        #     alternatives (the word then yields one result per alternative).
        # Rules are tried in order, so longer patterns must come first.
        #
        # NOTE(review): `phonetics` runs the word through `unidecode` before
        # matching, which presumably transliterates 'Ą', 'Ę' and 'Ţ' to plain
        # ASCII, so those three entries are likely never reached -- confirm.
        self.rules = {
            'A': [
                [r'^(AI|AJ|AY)', 0, 1, None],
                [r'^AU', 0, 7, None],
                [None, 0, None, None]
            ],
            'Ą': [
                [None, None, None, [6, None]]
            ],
            'B': [
                [None, 7, 7, 7]
            ],
            'C': [
                [r'^CHS', 5, 54, 54],
                [r'^CH', [5, 4], [5, 4], [5, 4]],
                [r'^CK', [5, 45], [5, 45], [5, 45]],
                [r'^(CSZ|CZS|CZ|CS)', 4, 4, 4],
                [None, [5, 4], [5, 4], [5, 4]]
            ],
            'D': [
                [r'^(DRZ|DRS|DSH|DSZ|DZH|DZS|DS|DZ)', 4, 4, 4],
                [r'^(DT|D)', 3, 3, 3]
            ],
            'E': [
                [r'^(EI|EJ|EY)', 0, 1, None],
                [r'^EU', 1, 1, None],
                [None, 0, None, None],
            ],
            'Ę': [
                [None, None, None, [6, None]]
            ],
            'F': [
                [r'^(FB|F)', 7, 7, 7]
            ],
            'G': [
                [None, 5, 5, 5]
            ],
            'H': [
                [None, 5, 5, None]
            ],
            'I': [
                [r'^(IA|IE|IO|IU)', 1, None, None],
                [None, 0, None, None]
            ],
            'J': [
                [None, [1, 4], [None, 4], [None, 4]]
            ],
            'K': [
                [r'^KS', 5, 54, 54],
                [r'^(KH|K)', 5, 5, 5]
            ],
            'L': [
                [None, 8, 8, 8]
            ],
            # The M/N patterns used to be the unanchored literals 'MNNM' and
            # 'MN', which never match an ordinary M or N and left the main
            # loop spinning forever. They are regexes (not None) on purpose:
            # the repeated-pattern check in `phonetics` then collapses doubled
            # letters such as the "NN" in "MANNHEIM".
            'M': [
                [r'^MN', None, 66, 66],
                [r'^M', 6, 6, 6]
            ],
            'N': [
                [r'^NM', None, 66, 66],
                [r'^N', 6, 6, 6],
            ],
            'O': [
                [r'^(OI|OJ|OY)', 0, 1, None],
                [None, 0, None, None]
            ],
            'P': [
                [r'^(PF|PH|P)', 7, 7, 7]
            ],
            'Q': [
                [None, 5, 5, 5]
            ],
            'R': [
                [r'^(RZ|RS)', [94, 4], [94, 4], [94, 4]],
                [None, 9, 9, 9]
            ],
            'S': [
                [r'^(SCHTSCH|SCHTSH|SCHTCH|SHTCH|SHCH|SHTSH)', 2, 4, 4],
                [r'^SCH', 4, 4, 4],
                [r'^(SHT|SCHT|SCHD)', 2, 43, 43],
                [r'^SH', 4, 4, 4],
                [r'^(STCH|STSCH|SC|STRZ|STRS|STSH)', 2, 4, 4],
                [r'^ST', 2, 43, 43],
                [r'^(SZCZ|SZCS)', 2, 4, 4],
                [r'^(SZT|SHD|SZD|SD)', 2, 43, 43],
                [r'^(SZ|S)', 4, 4, 4]
            ],
            'T': [
                [r'^(TCH|TTCH|TTSCH)', 4, 4, 4],
                [r'^TH', 3, 3, 3],
                [r'^(TRZ|TRS|TSCH|TSH|TS|TTS|TTSZ|TC|TZ|TTZ|TZS|TSZ)', 4, 4, 4],
                [None, 3, 3, 3]
            ],
            'Ţ': [
                [None, [3, 4], [3, 4], [3, 4]]
            ],
            'U': [
                [r'^(UI|UJ|UY)', 0, 1, None],
                [r'^(UE|U)', 0, None, None]
            ],
            'V': [
                [None, 7, 7, 7]
            ],
            'W': [
                [None, 7, 7, 7]
            ],
            'X': [
                [None, 5, 54, 54]
            ],
            'Y': [
                [None, 1, None, None]
            ],
            'Z': [
                [r'^(ZHDZH|ZDZH|ZDZ)', 2, 4, 4],
                [r'^(ZHD|ZD)', 2, 43, 43],
                [r'^(ZSCH|ZSH|ZH|ZS|Z)', 4, 4, 4]
            ],
        }
        # Right-pad with zeros, then cut to exactly six characters.
        self.pad = lambda code: '{}000000'.format(code)[:6]
        self.vowels = 'AEIOUY'

    def _permutations(self, code):
        """
        Expand a sequence of code parts into every concrete code string.

        :param code: list whose items are either a single code (int) or a
            two-item list of alternative codes, where None means "emit
            nothing for this alternative".
        :return: list of unpadded code strings. Each list-valued part doubles
            the result: the first half takes the first alternative and the
            second half takes the second one.
        """
        codes = ['']

        for part in code:
            if isinstance(part, list):
                # Build a new list rather than appending to `codes` while
                # iterating over it, which would never terminate.
                codes = [
                    prefix if alternative is None else prefix + str(alternative)
                    for alternative in part
                    for prefix in codes
                ]
            else:
                # Codes are stored as ints in the rule table.
                codes = [prefix + str(part) for prefix in codes]

        return codes

    def phonetics(self, word):
        """
        Compute the Daitch-Mokotoff code(s) of `word`.

        :param word: the string to encode.
        :return: list of distinct six-character codes, in the order they are
            generated. A word with no encodable letters yields ['000000'].
        :raises UnicodeException: if `word` is not a `str`.
        """
        if not isinstance(word, str):
            raise UnicodeException('Expected a unicode string!')

        code = []
        word = unidecode(word).upper()
        current = re.sub(r'[^A-ZĄĘŢ]', r'', word)

        start = True
        last_pattern = ''

        while current:
            first_letter = current[0]

            for pattern, if_first_letter, vowel_next, usual in self.rules[first_letter]:
                if pattern is None:
                    # Catch-all rule: consumes just the leading letter.
                    offset = 1
                else:
                    match = re.match(pattern, current)
                    if match is None:
                        continue
                    # Length of the matched text, not of the regex source.
                    offset = match.end()

                if start:
                    correct_code = if_first_letter
                elif offset < len(current) and current[offset] in self.vowels:
                    # Bounds check first: the match may end the word.
                    correct_code = vowel_next
                else:
                    correct_code = usual

                # The same regex firing twice in a row is coded only once.
                if last_pattern != pattern and correct_code is not None:
                    code.append(correct_code)

                last_pattern = pattern or first_letter
                current = current[offset:]
                break
            else:
                # No rule matched. Every letter currently has a fallback rule,
                # but drop the letter anyway so a future table edit cannot
                # turn this into an endless loop.
                last_pattern = first_letter
                current = current[1:]

            start = False

        # Truncating to six characters can make distinct expansions collide;
        # keep the first occurrence of each code.
        seen = set()
        unique_codes = []
        for padded in map(self.pad, self._permutations(code)):
            if padded not in seen:
                seen.add(padded)
                unique_codes.append(padded)

        return unique_codes


M pyphonetics/phonetics/metaphone.py => pyphonetics/phonetics/metaphone.py +8 -8
@@ 33,14 33,14 @@ class Metaphone(PhoneticAlgorithm):
            (r'd', r'T'),
            (r'g(?=h[^aeiou])', r''),
            (r'gn(ed)?', r'N'),
            (r'([^g]|^)g(?=[iey])', '\1J'),
            (r'g+', 'K'),
            (r'ph', 'F'),
            (r'([aeiou])h(?=\b|[^aeiou])', '\1'),
            (r'[wy](?![aeiou])', ''),
            (r'z', 'S'),
            (r'v', 'F'),
            (r'(?!^)[aeiou]+', '')
            (r'([^g]|^)g(?=[iey])', r'\1J'),
            (r'g+', r'K'),
            (r'ph', r'F'),
            (r'([aeiou])h(?=\b|[^aeiou])', r'\1'),
            (r'[wy](?![aeiou])', r''),
            (r'z', r'S'),
            (r'v', r'F'),
            (r'(?!^)[aeiou]+', r'')
        ]

    def phonetics(self, word):

A pyphonetics/phonetics/refined_soundex.py => pyphonetics/phonetics/refined_soundex.py +34 -0
@@ 0,0 1,34 @@
import re
from unidecode import unidecode

from ..utils import translation, squeeze
from ..exceptions import UnicodeException
from .phonetic_algorithm import PhoneticAlgorithm


class RefinedSoundex(PhoneticAlgorithm):
    """
    The Refined Soundex algorithm.

    [Reference]: https://en.wikipedia.org/wiki/Soundex
    [Authors]: Robert C. Russell, Margaret King Odell
    """
    def __init__(self):
        # Letter -> code table. 'W' and 'H' map to the marker 'D', which
        # `phonetics` drops instead of emitting.
        self.translations = translation(
            'AEIOUYWHBPFVCKSGJQXZDTLMNR',
            '000000DD112233344555667889'
        )

    def phonetics(self, word):
        """
        Compute the Refined Soundex code of `word`.

        :param word: the string to encode.
        :return: the first letter of the normalised word followed by the
            squeezed codes of all its letters (the first letter's own code
            included), or '' when the word has no A-Z letters left after
            normalisation.
        :raises UnicodeException: if `word` is not a `str`.
        """
        if not isinstance(word, str):
            raise UnicodeException('Expected a unicode string!')

        word = unidecode(word).upper()
        word = re.sub(r'[^A-Z]', r'', word)

        # Nothing left to encode (e.g. '' or '123'); without this guard the
        # word[0] lookup below raises an opaque IndexError.
        if not word:
            return ''

        # Look each letter up once, then discard the 'D' markers.
        digits = (self.translations[char] for char in word)
        tail = ''.join(digit for digit in digits if digit != 'D')

        # `squeeze` is a project helper; presumably it collapses runs of
        # repeated characters -- see ..utils.
        return word[0] + squeeze(tail)

M pyphonetics/phonetics/soundex.py => pyphonetics/phonetics/soundex.py +2 -29
@@ 13,18 13,11 @@ class Soundex(PhoneticAlgorithm):
    [Reference]: https://en.wikipedia.org/wiki/Soundex
    [Authors]: Robert C. Russel, Margaret King Odell
    """
    def __init__(self, refined=False):
    def __init__(self):
        self.translations = translation(
            'AEIOUYWHBPFVCSKGJQXZDTLMNR',
            '000000DD111122222222334556'
        )

        self.refined_translations = translation(
            'AEIOUYWHBPFVCKSGJQXZDTLMNR',
            '000000DD112233344555667889'
        )

        self.refined = refined
        self.pad = lambda code: '{}0000'.format(code)[:4]

    def phonetics(self, word):


@@ 35,19 28,7 @@ class Soundex(PhoneticAlgorithm):
        word = re.sub(r'[^A-Z]', r'', word)

        first_letter = word[0]

        if self.refined:
            return self._refined_soundex(first_letter, word)
        else:
            return self._soundex(first_letter, word)

    #
    # Private methods Simple/Refined Soundex
    #
    def _soundex(self, first_letter, word):
        """Soundex algorithm."""
        tail = ''.join(self.translations[char]
                       for char in word
        tail = ''.join(self.translations[char] for char in word
                       if self.translations[char] != 'D')

        # Dropping first code's letter if duplicate


@@ 56,11 37,3 @@ class Soundex(PhoneticAlgorithm):

        code = squeeze(tail).replace('0', '')
        return self.pad(first_letter + code)

    def _refined_soundex(self, first_letter, word):
        """Refined Soundex algorithm."""
        tail = ''.join(self.refined_translations[char]
                       for char in word
                       if self.refined_translations[char] != 'D')
        code = squeeze(tail)
        return first_letter + code

M tests/test_phonetics.py => tests/test_phonetics.py +57 -3
@@ 1,4 1,5 @@
from pyphonetics import Metaphone, Soundex, MatchingRatingApproach, FuzzySoundex, Lein
from pyphonetics import Metaphone, Soundex, MatchingRatingApproach,\
    FuzzySoundex, Lein, RefinedSoundex


def test_metaphone():


@@ 66,7 67,7 @@ def test_soundex_refined():
        ('D6043', 'dogs')
    ]

    soundex = Soundex(refined=True)
    soundex = RefinedSoundex()
    for test in tests:
        assert soundex.phonetics(test[1]) == test[0]
        


@@ 186,4 187,57 @@ def test_lein():

    lein = Lein()
    for test in tests:
        assert lein.phonetics(test[0]) == test[1]
\ No newline at end of file
        assert lein.phonetics(test[0]) == test[1]


# def test_daitch_mokotoff():
#     tests = [
#         ['Alpert', ['087930']],
#         ['Breuer', ['791900']],
#         ['Golden', ['583600']],
#         ['Haber', ['579000']],
#         ['Manheim', ['665600']],
#         ['Topf', ['370000']],
#         ['Kleinman', ['586660']],
#         ['Peters', ['739400', '734000']],
#         ['Peterson', ['739460', '734600']],
#         ['Moskowitz', ['645740']],
#         ['Moskovitz', ['645740']],
#         ['Auerbach', ['097500', '097400']],
#         ['Ohrbach', ['097500', '097400']],
#         ['Uhrbach', ['097500', '097400']],
#         ['Lipshitz', ['874400']],
#         ['Lippszyc', ['874500', '874400']],
#         ['Lewinsky', ['876450']],
#         ['Levinsky', ['876450']],
#         ['Szlamawicz', ['486740']],
#         ['Shlamovitz', ['486740']],
#         ['Jackson', ['154600', '454600', '145460', '445460']],
#         ['Jackson-Jackson', ['154654', '454654', '145465', '445465',
#                              '154645', '454645', '145464', '445464', '154644', '454644']],
#         ['augsburg', ['054795']],
#         ['halberstadt', ['587943', '587433']],
#         ['mannheim', ['665600']],
#         ['chernowitz', ['596740', '496740']],
#         ['cherkassy', ['595400', '495400']],
#         ['berlin', ['798600']],
#         ['mintz', ['664000']],
#         ['eisenstadt', ['046433']],
#         ['izenstadt', ['046433']],
#         ['lewin', ['876000']],
#         ['levine', ['876000']],
#         ['szlachter', ['485390', '484390']],
#         ['chelm', ['586000', '486000']],
#         ['chelmie', ['586000', '486000']],
#         ['chelma', ['586000', '486000']],
#         ['helm', ['586000']],
#         ['daitch', ['340000']],
#         ['levy', ['870000']],
#         ['mokotoff', ['653700']],
#         ['chajackachac', ['515550', '415550', '514555', '414555', '515450', '415450', '514545',
#                           '414545', '515540', '415540','514554', '414554', '515440', '415440', '514544', '414544']]
#     ]
#
#     dm = DaitchMokotoff()
#     for test in tests:
#         assert dm.phonetics(test[0]) == test[1]

M tests/test_utils.py => tests/test_utils.py +1 -1
@@ 13,4 13,4 @@ def test_squeeze():


def test_translation():
    assert translation(['a', 'b', 'c'], [1, 2, 3]) == {'a': 1, 'b': 2, 'c': 3}
\ No newline at end of file
    assert translation(['a', 'b', 'c'], [1, 2, 3]) == {'a': 1, 'b': 2, 'c': 3}