~linuxgoose/linguistics-robin (e134204c6ef0360b26476f1825a05e5cc0f9bd36): linguistics_robin/phonetics/mra.py

import re
from unidecode import unidecode

from ..utils import squeeze, check_empty, check_str
from .phonetic_algorithm import PhoneticAlgorithm


class MatchingRatingApproach(PhoneticAlgorithm):
    """
    Functions related to the computation of the Match Rating Approach codex.

    The codex of a word is built in four steps:
      1. transliterate to ASCII (``unidecode``), upper-case, keep only A-Z;
      2. drop every vowel (A, E, I, O, U) except one in leading position;
      3. pass the result through ``squeeze`` to collapse repeated letters;
      4. keep at most six letters: the first three and the last three.

    [Reference]: https://en.wikipedia.org/wiki/Match_rating_approach
    [Article]: Moore, G B.; Kuhns, J L.; Treffzs, J L.; Montgomery, C A. (Feb 1, 1977).
        Accessing Individual Records from Personal Data Files Using Nonunique Identifiers.
        US National Institute of Standards and Technology. p. 17. NIST SP - 500-2.
    """
    def __init__(self):
        super().__init__()

    def phonetics(self, word):
        """
        Return the Match Rating Approach codex of ``word``.

        :param word: the string to encode.
        :return: an upper-case string of at most six letters.
        :raises: whatever ``check_str`` / ``check_empty`` (from ``..utils``)
            raise for a non-string or empty input. The emptiness check is
            also applied after non-letters have been stripped.
        """
        check_str(word)
        check_empty(word)

        # Normalise to plain upper-case ASCII letters.
        letters = re.sub(r'[^A-Z]', r'', unidecode(word).upper())

        # Input such as "123" or "!!!" is non-empty but has no letters left at
        # this point; indexing letters[0] below would fail with a bare
        # IndexError. Reuse the same guard as for the raw input instead.
        # NOTE(review): assumes check_empty raises on "" - confirm in ..utils.
        check_empty(letters)

        # Dropping non-leading vowels (the first letter is always kept).
        codex = letters[0] + re.sub(r'[AEIOU]', r'', letters[1:])

        # Dropping consecutive duplicate letters.
        # NOTE(review): relies on squeeze collapsing runs of identical
        # adjacent characters - confirm in ..utils.
        codex = squeeze(codex)

        # Returning the codex: short codices are returned whole, longer ones
        # are reduced to their first three and last three letters.
        if len(codex) <= 6:
            return codex
        return codex[:3] + codex[-3:]