~linuxgoose/linguistics-robin

ref: 729f20b8b8e838f3b19501fc2480fb8e27f34fa9 linguistics-robin/linguistics_robin/phonetics/lein.py -rw-r--r-- 1.1 KiB
729f20b8 — Jordan Update caverphone2.py 8 months ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import re
from unidecode import unidecode

from ..utils import squeeze, translation, check_str, check_empty
from .phonetic_algorithm import PhoneticAlgorithm


class Lein(PhoneticAlgorithm):
    """
    The Lein name coding procedure.

    [Reference]: http://naldc.nal.usda.gov/download/27833/PDF
    """
    def __init__(self):
        super().__init__()

        self.translations = translation(
            'DTMNLRBFPVCJKGQSXZ',
            '112233444455555555'
        )

        self.pad = lambda code: '{}0000'.format(code)[:4]

    def phonetics(self, word):
        check_str(word)
        check_empty(word)

        word = unidecode(word).upper()
        word = re.sub(r'[^A-Z]\s', r'', word)

        # Keep the 1st letter
        first, code = word[0], word[1:]

        # Drop vowels and Y, W & H
        code = re.sub(r'[AEIOUYWH]', r'', code)

        # Drop consecutive duplicates and truncate to 4 chars
        code = squeeze(code)[0: 4]

        # Translations
        code = ''.join(self.translations.get(char, char) for char in code)

        return self.pad(first + code)