~linuxgoose/linguistics-robin

ref: e134204c6ef0360b26476f1825a05e5cc0f9bd36 linguistics-robin/linguistics_robin/phonetics/mra.py -rw-r--r-- 1.1 KiB
e134204c — Jordan Robinson Merge pull request #9 from linuxgoose/7-soundex-incorrect-calculation 8 months ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import re
from unidecode import unidecode

from ..utils import squeeze, check_empty, check_str
from .phonetic_algorithm import PhoneticAlgorithm


class MatchingRatingApproach(PhoneticAlgorithm):
    """
    Functions related to the computation of the Match Rating Approach codex.

    The codex is derived from a word by upper-casing its ASCII
    transliteration, keeping only the letters A-Z, removing every vowel
    except a leading one, squeezing the result, and finally keeping at most
    six characters (the first three and the last three).

    [Reference]: https://en.wikipedia.org/wiki/Match_rating_approach
    [Article]: Moore, G B.; Kuhns, J L.; Treffzs, J L.; Montgomery, C A. (Feb 1, 1977).
        Accessing Individual Records from Personal Data Files Using Nonunique Identifiers.
        US National Institute of Standards and Technology. p. 17. NIST SP - 500-2.
    """
    def __init__(self):
        super().__init__()

    def phonetics(self, word):
        """
        Return the Match Rating Approach codex of ``word``.

        :param word: the string to encode.
        :return: the codex, an upper-case string of at most six letters.
        :raises: whatever ``check_str`` / ``check_empty`` raise for invalid
            input (presumably a non-string or empty ``word`` -- confirm
            against ``utils``). The same ``check_empty`` validation is
            applied once more after non-letters have been stripped.
        """
        check_str(word)
        check_empty(word)

        # Transliterate to ASCII, then keep only the letters A-Z
        codex = unidecode(word).upper()
        codex = re.sub(r'[^A-Z]', r'', codex)

        # A non-empty input made only of digits, punctuation or whitespace
        # has nothing left at this point; validate again so the caller gets
        # the module's usual empty-input handling rather than a bare
        # IndexError from ``codex[0]`` below.
        # NOTE(review): relies on check_empty rejecting '' -- confirm.
        check_empty(codex)

        # Dropping non-leading vowels (the first letter is always kept)
        codex = codex[0] + re.sub(r'[AEIOU]', r'', codex[1:])

        # Dropping consecutive consonants
        # (presumably squeeze collapses runs of a repeated letter -- confirm)
        codex = squeeze(codex)

        # Returning the codex: anything longer than six letters is reduced
        # to its first three plus its last three letters.
        if len(codex) > 6:
            codex = codex[:3] + codex[-3:]
        return codex