~linuxgoose/linguistics-robin

ref: cd2ed9b16b07e593d8989a8fae7245cf1403f49f linguistics-robin/linguistics_robin/phonetics/soundex.py -rw-r--r-- 1.5 KiB
cd2ed9b1 — Jordan Robinson Merge pull request #22 from linuxgoose/10-add-caverphone1-and-caverphone2 8 months ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import re
from unidecode import unidecode

from ..utils import translation, squeeze, check_str, check_empty
from .phonetic_algorithm import PhoneticAlgorithm


class Soundex(PhoneticAlgorithm):
    """
    The Soundex algorithm (American variant).

    Encodes a word as its first letter followed by digits describing the
    consonant classes of the remaining letters, padded with zeros or
    truncated to exactly four characters (e.g. "Robert" -> "R163").

    [Reference]: https://en.wikipedia.org/wiki/Soundex
    [Authors]: Robert C. Russell, Margaret King Odell
    """
    def __init__(self):
        super().__init__()

        # Letter -> code table, indexed by upper-case letter in `phonetics`.
        #   '0'      vowels and Y: kept while collapsing repeats (so they
        #            separate equal consonant codes), then removed.
        #   'D'      H and W: dropped before coding, so they do NOT separate
        #            equal consonant codes.
        #   '1'-'6'  the Soundex consonant classes.
        self.translations = translation(
            'AEIOUYWHBPFVCSKGJQXZDTLMNR',
            '000000DD111122222222334556'
        )
        # Right-pad with zeros and cut to the fixed four-character length.
        self.pad = lambda code: '{}0000'.format(code)[:4]

    def phonetics(self, word):
        """
        Return the four-character Soundex code of *word*.

        The word is transliterated to ASCII, upper-cased and stripped of
        everything that is not a letter A-Z before being encoded.

        :param word: the string to encode.
        :return: the first letter followed by three digits, e.g. ``'R163'``.
        :raises: whatever ``check_str`` / ``check_empty`` raise for invalid
            input. NOTE(review): ``check_empty`` is assumed to raise on an
            empty string - confirm against ``utils``.
        """
        check_str(word)
        check_empty(word)

        word = unidecode(word).upper()
        word = re.sub(r'[^A-Z]', r'', word)
        # Input such as "123" has no letters left at this point; re-validate
        # so it is rejected like an empty string instead of failing with an
        # IndexError on word[0] below.
        check_empty(word)

        first_letter = word[0]
        first_code = self.translations[first_letter]

        # Code every letter (the first one included), skipping H and W.
        codes = (self.translations[char] for char in word)
        tail = ''.join(code for code in codes if code != 'D')

        # AMERICAN SOUNDEX RULE: drop the leading codes that repeat the first
        # letter's own code. When the first letter is H or W its code is 'D',
        # which never occurs in `tail`, so nothing is stripped and the codes
        # of the following letters are all kept.
        tail = tail.lstrip(first_code)

        # Collapse adjacent repeats first (presumably what `squeeze` does -
        # confirm against utils), then remove the vowel placeholders.
        code = squeeze(tail).replace('0', '')
        return self.pad(first_letter + code)