~linuxgoose/linguistics-robin

efe2269fb215344e2199c1301f18d3652e774fb9 — Ilias Koutsakis 5 years ago 7f55ccc
general updates

* added exception type for empty string
* updated checks in algorithms
* updated tests
* moved to new flit publishing method using pyproject.toml
A .gitignore => .gitignore +198 -0
@@ 0,0 1,198 @@
# Created by .ignore support plugin (hsz.mobi)
### VirtualEnv template
# Virtualenv
# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
.Python
[Bb]in
[Ii]nclude
[Ll]ib
[Ll]ib64
[Ll]ocal
[Ss]cripts
pyvenv.cfg
.venv
pip-selfcheck.json

### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

### JetBrains template
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839

# User-specific stuff
.idea/

# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn.  Uncomment if using
# auto-import.
# .idea/artifacts
# .idea/compiler.xml
# .idea/modules.xml
# .idea/*.iml
# .idea/modules
# *.iml
# *.ipr

# CMake
cmake-build-*/

# Mongo Explorer plugin
.idea/**/mongoSettings.xml

# File-based project format
*.iws

# IntelliJ
out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Cursive Clojure plugin
.idea/replstate.xml

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties

# Editor-based Rest Client
.idea/httpRequests

# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser


M LICENSE.rst => LICENSE.rst +1 -1
@@ 1,6 1,6 @@
The MIT License (MIT)

Copyright (c) 2016 Guillaume Plique (Yomguithereal)
Copyright (c) 2020 Ilias Koutsakis

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal

D README.rst => README.rst +0 -54
@@ 1,54 0,0 @@
===========
Pyphonetics
===========

Pyphonetics is a Python 3 library for phonetic algorithms. Right now, the following algorithms are implemented and supported:

 * Soundex
 * Metaphone
 * Refined Soundex
 * Fuzzy Soundex
 * Lein
 * Matching Rating Approach

More will be added in the future.

Installation
************

The module is available on PyPI; install it with ``pip install pyphonetics``.


Usage
*****

    >>> from pyphonetics import Soundex
    >>> soundex = Soundex()
    >>> soundex.phonetics('Rupert')
    'R163'
    >>> soundex.phonetics('Robert')
    'R163'
    >>> soundex.sounds_like('Robert', 'Rupert')
    True


The same API applies to every algorithm, e.g.:

    >>> from pyphonetics import Metaphone
    >>> metaphone = Metaphone()
    >>> metaphone.phonetics('discrimination')
    'TSKRMNXN'

You can also use the ``distance(word1, word2, metric='levenshtein')`` method to find the distance between two phonetic representations.

    >>> from pyphonetics import RefinedSoundex
    >>> rs = RefinedSoundex()
    >>> rs.distance('Rupert', 'Robert')
    0
    >>> rs.distance('assign', 'assist', metric='hamming')
    2

Credits
=======

The module was largely based on the implementation of phonetic algorithms found in the Talisman.js (https://github.com/Yomguithereal/talisman) Node NLP library.
\ No newline at end of file

D flit.ini => flit.ini +0 -18
@@ 1,18 0,0 @@
[metadata]
module=pyphonetics
author=Lilykos
author-email=lilykosk@gmail.com
home-page=http://github.com/Lilykos/pyphonetics
requires=unidecode
    pytest
requires-python= >=3
description-file=README.rst
classifiers=Intended Audience :: Developers
    Programming Language :: Python :: 3
    Topic :: Software Development :: Libraries :: Python Modules

# If you want command line scripts, this is how to declare them.
# If not, you can leave this section out completely.
# [scripts]
# # foobar:main means the script will do: from foobar import main; main()
# foobar=foobar:main
\ No newline at end of file

M pyphonetics/__init__.py => pyphonetics/__init__.py +1 -1
@@ 6,4 6,4 @@ from .phonetics import (Soundex,
                        Lein,
                        RefinedSoundex)

__version__ = '0.4.1'
__version__ = '0.5'

M pyphonetics/exceptions.py => pyphonetics/exceptions.py +4 -0
@@ 8,3 8,7 @@ class WrongLengthException(Exception):

class DistanceMetricError(Exception):
    """Exception type for distance-metric errors.

    NOTE(review): presumably raised for an unsupported ``metric`` name in
    ``distance(...)`` -- confirm at the raise site, which is not visible here.
    """


class EmptyStringError(Exception):
    """Exception type signalling that an empty string was given as input."""

M pyphonetics/phonetics/fuzzy_soundex.py => pyphonetics/phonetics/fuzzy_soundex.py +3 -7
@@ 1,8 1,7 @@
import re
from unidecode import unidecode

from ..utils import squeeze, translation
from ..exceptions import UnicodeException
from ..utils import squeeze, translation, check_empty, check_str
from .phonetic_algorithm import PhoneticAlgorithm




@@ 53,11 52,8 @@ class FuzzySoundex(PhoneticAlgorithm):
        self.set4 = 'HWY'

    def phonetics(self, word):
        if not isinstance(word, str):
            raise UnicodeException('Expected a unicode string!')

        if not word:
            return ''
        check_str(word)
        check_empty(word)

        word = unidecode(word).upper()


M pyphonetics/phonetics/lein.py => pyphonetics/phonetics/lein.py +3 -4
@@ 1,8 1,7 @@
import re
from unidecode import unidecode

from ..utils import squeeze, translation
from ..exceptions import UnicodeException
from ..utils import squeeze, translation, check_str, check_empty
from .phonetic_algorithm import PhoneticAlgorithm




@@ 23,8 22,8 @@ class Lein(PhoneticAlgorithm):
        self.pad = lambda code: '{}0000'.format(code)[:4]

    def phonetics(self, word):
        if not isinstance(word, str):
            raise UnicodeException('Expected a unicode string!')
        check_str(word)
        check_empty(word)

        word = unidecode(word).upper()
        word = re.sub(r'[^A-Z]\s', r'', word)

M pyphonetics/phonetics/metaphone.py => pyphonetics/phonetics/metaphone.py +3 -3
@@ 1,7 1,7 @@
import re
from unidecode import unidecode

from ..exceptions import UnicodeException
from ..utils import check_str, check_empty
from .phonetic_algorithm import PhoneticAlgorithm




@@ 46,8 46,8 @@ class Metaphone(PhoneticAlgorithm):
        ]

    def phonetics(self, word):
        if not isinstance(word, str):
            raise UnicodeException('Expected a unicode string!')
        check_str(word)
        check_empty(word)

        code = unidecode(word).lower()
        for item in self.rules:

M pyphonetics/phonetics/mra.py => pyphonetics/phonetics/mra.py +3 -4
@@ 1,8 1,7 @@
import re
from unidecode import unidecode

from ..utils import squeeze
from ..exceptions import UnicodeException
from ..utils import squeeze, check_empty, check_str
from .phonetic_algorithm import PhoneticAlgorithm




@@ 19,8 18,8 @@ class MatchingRatingApproach(PhoneticAlgorithm):
        super().__init__()

    def phonetics(self, word):
        if not isinstance(word, str):
            raise UnicodeException('Expected a unicode string!')
        check_str(word)
        check_empty(word)

        codex = unidecode(word).upper()
        codex = re.sub(r'[^A-Z]', r'', codex)

M pyphonetics/phonetics/refined_soundex.py => pyphonetics/phonetics/refined_soundex.py +3 -4
@@ 1,8 1,7 @@
import re
from unidecode import unidecode

from ..utils import translation, squeeze
from ..exceptions import UnicodeException
from ..utils import translation, squeeze, check_str, check_empty
from .phonetic_algorithm import PhoneticAlgorithm




@@ 22,8 21,8 @@ class RefinedSoundex(PhoneticAlgorithm):
        )

    def phonetics(self, word):
        if not isinstance(word, str):
            raise UnicodeException('Expected a unicode string!')
        check_str(word)
        check_empty(word)

        word = unidecode(word).upper()
        word = re.sub(r'[^A-Z]', r'', word)

M pyphonetics/phonetics/soundex.py => pyphonetics/phonetics/soundex.py +6 -6
@@ 1,8 1,7 @@
import re
from unidecode import unidecode

from ..utils import translation, squeeze
from ..exceptions import UnicodeException
from ..utils import translation, squeeze, check_str, check_empty
from .phonetic_algorithm import PhoneticAlgorithm




@@ 23,8 22,8 @@ class Soundex(PhoneticAlgorithm):
        self.pad = lambda code: '{}0000'.format(code)[:4]

    def phonetics(self, word):
        if not isinstance(word, str):
            raise UnicodeException('Expected a unicode string!')
        check_str(word)
        check_empty(word)

        word = unidecode(word).upper()
        word = re.sub(r'[^A-Z]', r'', word)


@@ 34,8 33,9 @@ class Soundex(PhoneticAlgorithm):
                       if self.translations[char] != 'D')

        # Dropping first code's letter if duplicate
        if tail[0] == self.translations[first_letter]:
            tail = tail[1:]
        if len(tail):
            if tail[0] == self.translations[first_letter]:
                tail = tail[1:]

        code = squeeze(tail).replace('0', '')
        return self.pad(first_letter + code)

M pyphonetics/utils.py => pyphonetics/utils.py +14 -1
@@ 1,6 1,7 @@
from itertools import groupby

from .exceptions import WrongLengthException
from .exceptions import WrongLengthException, UnicodeException, \
    EmptyStringError


def translation(first, second):


@@ 13,3 14,15 @@ def translation(first, second):
def squeeze(word):
    """Collapse every run of consecutive equal items in ``word`` to one item.

    The surviving items are joined into a single string, so ``'aabbbc'``
    becomes ``'abc'`` while non-adjacent repeats (``'abab'``) are kept.
    """
    # groupby yields (key, run) pairs; the key is the item each run shares.
    run_heads = (key for key, _run in groupby(word))
    return ''.join(run_heads)


def check_str(word):
    """Validate that ``word`` is a ``str``; return ``None`` when it is.

    Raises:
        UnicodeException: if ``word`` is not a ``str`` instance.
    """
    if isinstance(word, str):
        return
    raise UnicodeException('Expected a unicode string!')


def check_empty(word):
    """Validate that ``word`` has a non-zero length; return ``None`` if so.

    Raises:
        EmptyStringError: if ``len(word)`` is zero.
    """
    if len(word) > 0:
        return
    raise EmptyStringError('The given string is empty.')

A pyproject.toml => pyproject.toml +19 -0
@@ 0,0 1,19 @@
[build-system]
# Only what is needed to *build* the distribution belongs here; test tools
# such as pytest do not.
# NOTE(review): "unidecode" is kept because the package imports it at import
# time, which matters if flit has to import the module to read its version --
# confirm whether it can be dropped for the flit_core version in use.
requires = [
    "flit_core >=2,<3",
    "unidecode"
]
build-backend = "flit_core.buildapi"

[tool.flit.metadata]
module = "pyphonetics"
author = "Lilykos"
author-email = "ilias.koutsakis@gmail.com"
home-page = "https://github.com/Lilykos/pyphonetics"
# Runtime dependencies: the phonetics modules import unidecode, so it must be
# declared in the package metadata to be installed alongside the package.
requires = [
    "unidecode"
]
# Carried over from the previous flit.ini configuration.
requires-python = ">=3"
classifiers = [
    "License :: OSI Approved :: MIT License",
    "Programming Language :: Python :: 3",
    "Intended Audience :: Developers",
    "Topic :: Software Development :: Libraries :: Python Modules"
]

[tool.flit.metadata.requires-extra]
# Only needed to run the test suite.
test = [
    "pytest"
]

A tests/test_corner_cases.py => tests/test_corner_cases.py +30 -0
@@ 0,0 1,30 @@
import pytest
from pyphonetics import Soundex, RefinedSoundex, FuzzySoundex
from pyphonetics.exceptions import EmptyStringError


def test_soundex():
    """Soundex corner cases: one-letter words and the empty string."""
    encoder = Soundex()

    # A single letter has no tail to encode, so the code is zero-padded.
    for word, expected in (('h', 'H000'), ('d', 'D000')):
        assert encoder.phonetics(word) == expected

    with pytest.raises(EmptyStringError):
        encoder.phonetics('')


def test_refined_soundex():
    """Refined Soundex corner cases: one-letter words and the empty string."""
    encoder = RefinedSoundex()

    for word, expected in (('h', 'H'), ('d', 'D6')):
        assert encoder.phonetics(word) == expected

    with pytest.raises(EmptyStringError):
        encoder.phonetics('')


def test_fuzzy_soundex():
    """Fuzzy Soundex must reject the empty string with EmptyStringError."""
    encoder = FuzzySoundex()

    with pytest.raises(EmptyStringError):
        encoder.phonetics('')

M tests/test_phonetics.py => tests/test_phonetics.py +0 -1
@@ 106,7 106,6 @@ def test_mra():

def test_fuzzy_soundex():
    tests = [
        ('', ''),
        ('Kristen', 'K6935'),
        ('Krissy', 'K69'),
        ('Christen', 'K6935'),