~linuxgoose/linguistics-robin

ref: ad3fdc410fa157522dd1e00f4a49210e9daa456f linguistics-robin/linguistics_robin/phonetics/lein.py -rw-r--r-- 1.1 KiB
ad3fdc41 — Jordan Robinson Merge pull request #24 from linuxgoose/23-caverphone-2-__vowels-undefined-error 8 months ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import re
from unidecode import unidecode

from ..utils import squeeze, translation, check_str, check_empty
from .phonetic_algorithm import PhoneticAlgorithm


class Lein(PhoneticAlgorithm):
    """
    The Lein name coding procedure.

    Produces a four-character code: the first letter of the name followed
    by digits encoding the consonants after it, right-padded with zeros.

    [Reference]: http://naldc.nal.usda.gov/download/27833/PDF
    """
    def __init__(self):
        super().__init__()

        # Letters and their digit codes, paired position by position
        # (assumes `translation` builds a mapping from the two strings;
        # it is consumed through `.get` in `phonetics`).
        self.translations = translation(
            'DTMNLRBFPVCJKGQSXZ',
            '112233444455555555'
        )

        # Right-pad with zeros, then clip to exactly four characters.
        self.pad = lambda code: '{}0000'.format(code)[:4]

    def phonetics(self, word):
        """
        Return the four-character Lein code of *word*.

        The input is validated with ``check_str`` and ``check_empty``,
        passed through ``unidecode``, upper-cased and stripped of every
        character that is not a letter A-Z before it is encoded. Input
        that contains no letters at all is handed to ``check_empty``
        again, so it is treated the same way as an empty string.
        """
        check_str(word)
        check_empty(word)

        word = unidecode(word).upper()
        # Strip everything that is not a letter. The earlier pattern only
        # matched a non-letter directly followed by whitespace, so spaces,
        # apostrophes, hyphens and digits leaked into the resulting code.
        word = re.sub(r'[^A-Z]', '', word)

        # Cleaning can leave nothing behind (e.g. digits-only input).
        # NOTE(review): check_empty presumably raises on '' - confirm.
        check_empty(word)

        # Keep the 1st letter. Slicing (not indexing) so this line can
        # never raise IndexError, whatever check_empty does above.
        first, code = word[:1], word[1:]

        # Drop vowels and Y, W & H
        code = re.sub(r'[AEIOUYWH]', '', code)

        # Drop consecutive duplicates and truncate to 4 chars
        code = squeeze(code)[:4]

        # Translations; letters without a table entry are kept unchanged
        code = ''.join(self.translations.get(char, char) for char in code)

        return self.pad(first + code)