~linuxgoose/linguistics-robin

ref: ba110fe71c3e4a7ef4cfec4dd7b23547b127c37c linguistics-robin/linguistics_robin/phonetics/soundex.py -rw-r--r-- 1.1 KiB
ba110fe7 — Jordan Robinson Merge pull request #8 from linuxgoose/6-rebrand---linguistics-robin 8 months ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import re
from unidecode import unidecode

from ..utils import translation, squeeze, check_str, check_empty
from .phonetic_algorithm import PhoneticAlgorithm


class Soundex(PhoneticAlgorithm):
    """
    The Soundex algorithm.

    [Reference]: https://en.wikipedia.org/wiki/Soundex
    [Authors]: Robert C. Russel, Margaret King Odell
    """
    def __init__(self):
        super().__init__()

        self.translations = translation(
            'AEIOUYWHBPFVCSKGJQXZDTLMNR',
            '000000DD111122222222334556'
        )
        self.pad = lambda code: '{}0000'.format(code)[:4]

    def phonetics(self, word):
        check_str(word)
        check_empty(word)

        word = unidecode(word).upper()
        word = re.sub(r'[^A-Z]', r'', word)

        first_letter = word[0]
        tail = ''.join(self.translations[char] for char in word
                       if self.translations[char] != 'D')

        # Dropping first code's letter if duplicate
        if len(tail):
            if tail[0] == self.translations[first_letter]:
                tail = tail[1:]

        code = squeeze(tail).replace('0', '')
        return self.pad(first_letter + code)