A LICENSE.rst => LICENSE.rst +21 -0
@@ 0,0 1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2016 Guillaume Plique (Yomguithereal)
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.<
\ No newline at end of file
M flit.ini => flit.ini +3 -0
@@ 3,6 3,9 @@ module=pyphonetics
author=Lilykos
author-email=lilykosk@gmail.com
home-page=http://github.com/Lilykos/pyphonetics
+requires=unidecode
+ pytest
+requires-python= >=3
description-file=README.rst
classifiers=Intended Audience :: Developers
Programming Language :: Python :: 3
M pyphonetics/__init__.py => pyphonetics/__init__.py +2 -2
@@ 1,4 1,4 @@
"""A Python 3 phonetics library."""
-from .phonetics import *
+from .phonetics import Soundex, Metaphone, MatchingRatingApproach, FuzzySoundex, Lein
-__version__ = '0.1'
+__version__ = '0.2'
A pyphonetics/exceptions.py => pyphonetics/exceptions.py +6 -0
@@ 0,0 1,6 @@
+class UnicodeException(Exception):
+    """Raised when an algorithm is handed something other than a `str`."""
+
+
+class WrongLengthException(Exception):
+    """Raised by `translation` when its two sequences differ in length."""
M pyphonetics/phonetics/__init__.py => pyphonetics/phonetics/__init__.py +4 -0
@@ 1,1 1,5 @@
from .soundex import *
+from .metaphone import *
+from .mra import *
+from .fuzzy_soundex import *
+from .lein import *<
\ No newline at end of file
A pyphonetics/phonetics/fuzzy_soundex.py => pyphonetics/phonetics/fuzzy_soundex.py +124 -0
@@ 0,0 1,124 @@
+import re
+from unidecode import unidecode
+
+from ..utils import squeeze, translation
+from ..exceptions import UnicodeException
+from .phonetic_algorithm import PhoneticAlgorithm
+
+
+class FuzzySoundex(PhoneticAlgorithm):
+    """
+    Implementation of the "Fuzzy Soundex" algorithm.
+
+    [Reference]: http://wayback.archive.org/web/20100629121128/http://www.ir.iit.edu/publications/downloads/IEEESoundexV5.pdf
+    [Article]: Holmes, David and M. Catherine McCabe. "Improving Precision and Recall for Soundex Retrieval."
+    """
+    def __init__(self):
+        # Letter -> code table: '0' marks the vowels (dropped at the very
+        # end) and '-' marks H, W and Y (dropped right after translation).
+        self.translations = translation(
+            'ABCDEFGHIJKLMNOPQRSTUVWXYZ',
+            '0193017-07745501769301-7-9'
+        )
+
+        # Letter-group rewrites, applied in order over the whole word.
+        self.rules = [
+            (r'CA', r'KA'),
+            (r'CC', r'KK'),
+            (r'CK', r'KK'),
+            (r'CE', r'SE'),
+            (r'CHL', r'KL'),
+            (r'CL', r'KL'),
+            (r'CHR', r'KR'),
+            (r'CR', r'KR'),
+            (r'CI', r'SI'),
+            (r'CO', r'KO'),
+            (r'CU', r'KU'),
+            (r'CY', r'SY'),
+            (r'DG', r'GG'),
+            (r'GH', r'HH'),
+            (r'MAC', r'MK'),
+            (r'MC', r'MK'),
+            (r'NST', r'NSS'),
+            (r'PF', r'FF'),
+            (r'PH', r'FF'),
+            (r'SCH', r'SSS'),
+            (r'TIO', r'SIO'),
+            (r'TIA', r'SIO'),
+            (r'TCH', r'CHH'),
+        ]
+
+        # Two-letter prefixes rewritten to 'SS', 'RR' and 'NN' respectively.
+        self.set1 = ['CS', 'CZ', 'TS', 'TZ']
+        self.set2 = ['HR', 'WR']
+        self.set3 = ['KN', 'NG']
+        # A code starting with one of these collapses to the first letter.
+        self.set4 = list('HWY')
+
+    def phonetics(self, word):
+        """
+        Return the Fuzzy Soundex code of `word`.
+
+        Gives '' when `word` is empty or transliterates to nothing.
+        Raises UnicodeException when `word` is not a `str`.
+        """
+        if not isinstance(word, str):
+            raise UnicodeException('Expected a unicode string!')
+
+        word = unidecode(word).upper()
+        if not word:
+            return ''
+
+        # Substitutions for beginnings
+        first_two, rest = word[:2], word[2:]
+
+        if first_two in self.set1:
+            word = 'SS' + rest
+        elif first_two == 'GN':
+            word = 'NN' + rest
+        elif first_two in self.set2:
+            word = 'RR' + rest
+        elif first_two == 'HW':
+            word = 'WW' + rest
+        elif first_two in self.set3:
+            word = 'NN' + rest
+
+        # Substitutions for endings
+        if word.endswith('CH'):
+            word = word[:-2] + 'KK'
+        elif word.endswith('NT'):
+            word = word[:-2] + 'TT'
+        elif word.endswith('RT'):
+            word = word[:-2] + 'RR'
+        elif word.endswith('RDT'):
+            word = word[:-3] + 'RR'
+
+        # Applying the rules
+        for pattern, replacement in self.rules:
+            word = re.sub(pattern, replacement, word)
+
+        # Catch the first letter
+        first_letter = word[0]
+
+        # Translating
+        code = ''.join(self.translations.get(char, char) for char in word)
+
+        # Removing hyphens
+        code = code.replace('-', '')
+
+        # Squeezing the code
+        code = squeeze(code)
+
+        # A word made only of H, W and Y translates to nothing; answer with
+        # the bare initial instead of failing on code[0] below.
+        if not code:
+            return first_letter
+
+        # Dealing with initials
+        if code[0] in self.set4:
+            code = first_letter
+        else:
+            code = first_letter + code[1:]
+
+        # Dropping vowels
+        return code.replace('0', '')
A pyphonetics/phonetics/lein.py => pyphonetics/phonetics/lein.py +54 -0
@@ 0,0 1,54 @@
+import re
+from unidecode import unidecode
+
+from ..utils import squeeze, translation
+from ..exceptions import UnicodeException
+from .phonetic_algorithm import PhoneticAlgorithm
+
+
+class Lein(PhoneticAlgorithm):
+    """
+    The Lein name coding procedure.
+
+    [Reference]: http://naldc.nal.usda.gov/download/27833/PDF
+    """
+    def __init__(self):
+        # Consonant -> digit table; characters missing from it are kept as-is.
+        self.translations = translation(
+            'DTMNLRBFPVCJKGQSXZ',
+            '112233444455555555'
+        )
+
+        # Right-pad with zeros, then cut to exactly four characters.
+        self.pad = lambda code: '{}0000'.format(code)[:4]
+
+    def phonetics(self, word):
+        """
+        Return the four-character Lein code of `word`.
+
+        Gives '' when `word` holds no ASCII letter after transliteration.
+        Raises UnicodeException when `word` is not a `str`.
+        """
+        if not isinstance(word, str):
+            raise UnicodeException('Expected a unicode string!')
+
+        # Keep letters only (the same clean-up Soundex and the Match Rating
+        # Approach apply), so punctuation such as the apostrophe in
+        # "O'Brien" never reaches the code.
+        word = re.sub(r'[^A-Z]', r'', unidecode(word).upper())
+        if not word:
+            return ''
+
+        # Keep the 1st letter
+        first, code = word[0], word[1:]
+
+        # Drop vowels and Y, W & H
+        code = re.sub(r'[AEIOUYWH]', r'', code)
+
+        # Drop consecutive duplicates and truncate to 4 chars
+        code = squeeze(code)[0:4]
+
+        # Translations
+        code = ''.join(self.translations.get(char, char) for char in code)
+
+        return self.pad(first + code)
A pyphonetics/phonetics/metaphone.py => pyphonetics/phonetics/metaphone.py +63 -0
@@ 0,0 1,63 @@
+import re
+from unidecode import unidecode
+
+from ..exceptions import UnicodeException
+from .phonetic_algorithm import PhoneticAlgorithm
+
+
+class Metaphone(PhoneticAlgorithm):
+    """
+    The metaphone algorithm.
+
+    [Reference]: https://en.wikipedia.org/wiki/Metaphone
+    [Author]: Lawrence Philips, 1990
+    """
+    def __init__(self):
+        # Ordered (pattern, replacement) pairs. The input is lowercased and
+        # the replacements are uppercase, so later patterns, which target
+        # lowercase letters, leave already-coded output alone.
+        # Every entry is a raw string: in a plain literal '\1' is the control
+        # character chr(1), not a reference to the first group.
+        self.rules = [
+            (r'[^a-z]', r''),
+            (r'([bcdfhjklmnpqrstvwxyz])\1+', r'\1'),
+            (r'^ae', r'E'),
+            (r'^[gkp]n', r'N'),
+            (r'^wr', r'R'),
+            (r'^x', r'S'),
+            (r'^wh', r'W'),
+            (r'mb$', r'M'),
+            (r'(?!^)sch', r'SK'),
+            (r'th', r'0'),
+            (r't?ch|sh', r'X'),
+            (r'c(?=ia)', r'X'),
+            (r'[st](?=i[ao])', r'X'),
+            (r's?c(?=[iey])', r'S'),
+            (r'[cq]', r'K'),
+            (r'dg(?=[iey])', r'J'),
+            (r'd', r'T'),
+            (r'g(?=h[^aeiou])', r''),
+            (r'gn(ed)?', r'N'),
+            (r'([^g]|^)g(?=[iey])', r'\1J'),
+            (r'g+', r'K'),
+            (r'ph', r'F'),
+            (r'([aeiou])h(?=\b|[^aeiou])', r'\1'),
+            (r'[wy](?![aeiou])', r''),
+            (r'z', r'S'),
+            (r'v', r'F'),
+            (r'(?!^)[aeiou]+', r'')
+        ]
+
+    def phonetics(self, word):
+        """
+        Return the Metaphone code of `word`, in uppercase.
+
+        Raises UnicodeException when `word` is not a `str`.
+        """
+        if not isinstance(word, str):
+            raise UnicodeException('Expected a unicode string!')
+
+        code = unidecode(word).lower()
+        for pattern, replacement in self.rules:
+            code = re.sub(pattern, replacement, code)
+        return code.upper()
A pyphonetics/phonetics/mra.py => pyphonetics/phonetics/mra.py +44 -0
@@ 0,0 1,44 @@
+import re
+from unidecode import unidecode
+
+from ..utils import squeeze
+from ..exceptions import UnicodeException
+from .phonetic_algorithm import PhoneticAlgorithm
+
+
+class MatchingRatingApproach(PhoneticAlgorithm):
+    """
+    Functions related to the computation of the Match Rating Approach codex.
+
+    [Reference]: https://en.wikipedia.org/wiki/Match_rating_approach
+    [Article]: Moore, G B.; Kuhns, J L.; Treffzs, J L.; Montgomery, C A. (Feb 1, 1977).
+               Accessing Individual Records from Personal Data Files Using Nonunique Identifiers.
+               US National Institute of Standards and Technology. p. 17. NIST SP - 500-2.
+    """
+    def phonetics(self, word):
+        """
+        Return the Match Rating Approach codex of `word`.
+
+        Gives '' when `word` holds no ASCII letter after transliteration.
+        Raises UnicodeException when `word` is not a `str`.
+        """
+        if not isinstance(word, str):
+            raise UnicodeException('Expected a unicode string!')
+
+        codex = unidecode(word).upper()
+        codex = re.sub(r'[^A-Z]', r'', codex)
+
+        # Nothing left to encode; codex[0] below would raise IndexError.
+        if not codex:
+            return ''
+
+        # Dropping non-leading vowels
+        codex = codex[0] + re.sub(r'[AEIOU]', r'', codex[1:])
+
+        # Dropping consecutive duplicate letters
+        codex = squeeze(codex)
+
+        # Codes longer than six letters keep their first and last three
+        if len(codex) > 6:
+            return codex[:3] + codex[-3:]
+        return codex
A pyphonetics/phonetics/phonetic_algorithm.py => pyphonetics/phonetics/phonetic_algorithm.py +20 -0
@@ 0,0 1,20 @@
+class PhoneticAlgorithm:
+    """
+    The main Phonetic Algorithm class, to ensure a unified API
+    for all the included algorithms.
+    """
+    def phonetics(self, word):
+        """
+        Get the phonetic representation of the word.
+
+        Every concrete algorithm overrides this method; the base version
+        raises NotImplementedError so that a subclass which forgets to do so
+        fails loudly instead of making sounds_like() match every pair.
+        """
+        raise NotImplementedError(
+            '{} must implement phonetics()'.format(type(self).__name__)
+        )
+
+    def sounds_like(self, word1, word2):
+        """Compare the phonetic representations of 2 words, and return a boolean value."""
+        return self.phonetics(word1) == self.phonetics(word2)
M pyphonetics/phonetics/soundex.py => pyphonetics/phonetics/soundex.py +77 -0
@@ 0,0 1,77 @@
+import re
+from unidecode import unidecode
+
+from ..utils import translation, squeeze
+from ..exceptions import UnicodeException
+from .phonetic_algorithm import PhoneticAlgorithm
+
+
+class Soundex(PhoneticAlgorithm):
+    """
+    The Soundex algorithm.
+
+    [Reference]: https://en.wikipedia.org/wiki/Soundex
+    [Authors]: Robert C. Russel, Margaret King Odell
+    """
+    def __init__(self, refined=False):
+        # In both tables '0' stands for the vowels and Y, while 'D' flags W
+        # and H, which are discarded before any other processing.
+        self.translations = translation(
+            'AEIOUYWHBPFVCSKGJQXZDTLMNR',
+            '000000DD111122222222334556'
+        )
+
+        self.refined_translations = translation(
+            'AEIOUYWHBPFVCKSGJQXZDTLMNR',
+            '000000DD112233344555667889'
+        )
+
+        # When true, phonetics() returns the refined, unpadded code.
+        self.refined = refined
+        # Right-pad with zeros, then cut to exactly four characters.
+        self.pad = lambda code: '{}0000'.format(code)[:4]
+
+    def phonetics(self, word):
+        """
+        Return the Soundex code of `word` (refined when `self.refined`).
+
+        Gives '' when `word` holds no ASCII letter after transliteration.
+        Raises UnicodeException when `word` is not a `str`.
+        """
+        if not isinstance(word, str):
+            raise UnicodeException('Expected a unicode string!')
+
+        word = unidecode(word).upper()
+        word = re.sub(r'[^A-Z]', r'', word)
+
+        # Nothing left to encode; word[0] below would raise IndexError.
+        if not word:
+            return ''
+
+        first_letter = word[0]
+
+        if self.refined:
+            return self._refined_soundex(first_letter, word)
+        return self._soundex(first_letter, word)
+
+    #
+    # Private methods Simple/Refined Soundex
+    #
+    def _soundex(self, first_letter, word):
+        """Soundex algorithm."""
+        codes = (self.translations[char] for char in word)
+        tail = ''.join(code for code in codes if code != 'D')
+
+        # Dropping first code's letter if duplicate. startswith() rather
+        # than tail[0]: a word made only of H and W leaves tail empty.
+        if tail.startswith(self.translations[first_letter]):
+            tail = tail[1:]
+
+        code = squeeze(tail).replace('0', '')
+        return self.pad(first_letter + code)
+
+    def _refined_soundex(self, first_letter, word):
+        """Refined Soundex algorithm."""
+        codes = (self.refined_translations[char] for char in word)
+        tail = ''.join(code for code in codes if code != 'D')
+        return first_letter + squeeze(tail)
A pyphonetics/utils.py => pyphonetics/utils.py +15 -0
@@ 0,0 1,15 @@
+from itertools import groupby
+
+from .exceptions import WrongLengthException
+
+
+def translation(first, second):
+    """Zip `first` onto `second` as a dict; raise WrongLengthException on a length mismatch."""
+    if len(first) != len(second):
+        raise WrongLengthException('Expected two sequences of the same length!')
+    return dict(zip(first, second))
+
+
+def squeeze(word):
+    """Collapse every run of identical consecutive items in `word` to one, joined as a string."""
+    return ''.join(key for key, _run in groupby(word))
A tests/__init__.py => tests/__init__.py +0 -0
A tests/test_phonetics.py => tests/test_phonetics.py +189 -0
@@ 0,0 1,189 @@
+from pyphonetics import Metaphone, Soundex, MatchingRatingApproach, FuzzySoundex, Lein
+
+
+def test_metaphone():
+    """Each word must encode to its listed Metaphone code."""
+    cases = [
+        ('TSKRMNXN', 'discrimination'),
+        ('HL', 'hello'),
+        ('TRT', 'droid'),
+        ('HPKRT', 'hypocrite'),
+        ('WL', 'well'),
+        ('AM', 'am'),
+        ('S', 'say'),
+        ('FSNT', 'pheasant'),
+        ('KT', 'god')
+    ]
+    encoder = Metaphone()
+    for expected, word in cases:
+        assert encoder.phonetics(word) == expected
+
+
+def test_soundex():
+    """Each name must encode to its listed four-character Soundex code."""
+    cases = [
+        ('R163', 'Rupert'),
+        ('R163', 'Robert'),
+        ('R150', 'Rubin'),
+        ('A261', 'Ashcroft'),
+        ('A261', 'Ashcraft'),
+        ('T522', 'Tymczak'),
+        ('P123', 'Pfister'),
+        ('A536', 'Andrew'),
+        ('W252', 'Wozniak'),
+        ('C423', 'Callister'),
+        ('H400', 'Hello'),
+        ('M635', 'Martin'),
+        ('B656', 'Bernard'),
+        ('F600', 'Faure'),
+        ('P620', 'Perez'),
+        ('G620', 'Gros'),
+        ('C120', 'Chapuis'),
+        ('B600', 'Boyer'),
+        ('G360', 'Gauthier'),
+        ('R000', 'Rey'),
+        ('B634', 'Barthélémy'),
+        ('H560', 'Henry'),
+        ('M450', 'Moulin'),
+        ('R200', 'Rousseau')
+    ]
+    encoder = Soundex()
+    for expected, name in cases:
+        assert encoder.phonetics(name) == expected
+
+
+def test_soundex_refined():
+    """Each word must encode to its listed refined Soundex code."""
+    cases = [
+        ('T6036084', 'testing'),
+        ('T6036084', 'TESTING'),
+        ('T60', 'The'),
+        ('Q503', 'quick'),
+        ('B1908', 'brown'),
+        ('F205', 'fox'),
+        ('J408106', 'jumped'),
+        ('O0209', 'over'),
+        ('L7050', 'lazy'),
+        ('D6043', 'dogs')
+    ]
+    encoder = Soundex(refined=True)
+    for expected, word in cases:
+        assert encoder.phonetics(word) == expected
+
+
+def test_soundex_homophones():
+    """All the names within one group must share a single Soundex code."""
+    groups = [
+        ('Braz', 'Broz'),
+        ('Caren', 'Caron', 'Carren', 'Charon', 'Corain', 'Coram', 'Corran',
+         'Corrin', 'Corwin', 'Curran', 'Curreen', 'Currin', 'Currom', 'Currum', 'Curwen'),
+        ('Hairs', 'Hark', 'Hars', 'Hayers', 'Heers', 'Hiers'),
+        ('Lambard', 'Lambart', 'Lambert', 'Lambird', 'Lampaert', 'Lampard',
+         'Lampart', 'Lamperd', 'Lampert', 'Lamport', 'Limbert', 'Lombard'),
+        ('Nolton', 'Noulton')
+    ]
+    encoder = Soundex()
+    for group in groups:
+        codes = {encoder.phonetics(name) for name in group}
+        assert len(codes) == 1  # one distinct code per group
+
+
+def test_mra():
+    """Each name must encode to its listed Match Rating Approach codex."""
+    cases = [
+        ('BYRN', 'Byrne'),
+        ('BRN', 'Boern'),
+        ('SMTH', 'Smith'),
+        ('SMYTH', 'Smyth'),
+        ('CTHRN', 'Catherine'),
+        ('KTHRYN', 'Kathryn')
+    ]
+    encoder = MatchingRatingApproach()
+    for expected, name in cases:
+        assert encoder.phonetics(name) == expected
+
+
+def test_fuzzy_soundex():
+    """Each word (listed first) must encode to its Fuzzy Soundex code."""
+    cases = [
+        ('', ''),
+        ('Kristen', 'K6935'),
+        ('Krissy', 'K69'),
+        ('Christen', 'K6935'),
+        ('peter', 'P36'),
+        ('pete', 'P3'),
+        ('pedro', 'P36'),
+        ('stephen', 'S315'),
+        ('steve', 'S31'),
+        ('smith', 'S53'),
+        ('smythe', 'S53'),
+        ('gail', 'G4'),
+        ('gayle', 'G4'),
+        ('guillaume', 'G45'),
+        ('christine', 'K6935'),
+        ('christina', 'K6935'),
+        ('kristina', 'K6935'),
+        ('Wight', 'W3'),
+        ('Hardt', 'H6'),
+        ('Knight', 'N3'),
+        ('Czech', 'S7'),
+        ('Tsech', 'S7'),
+        ('gnomic', 'N59'),
+        ('Wright', 'R3'),
+        ('Hrothgar', 'R376'),
+        ('Hwaet', 'W3'),
+        ('Grant', 'G63'),
+        ('Hart', 'H6')
+    ]
+    encoder = FuzzySoundex()
+    for word, expected in cases:
+        assert encoder.phonetics(word) == expected
+
+
+def test_lein():
+    """Each name (listed first) must encode to its Lein code."""
+    cases = [
+        ('Guillaume', 'G320'),
+        ('Dabbs', 'D450'),
+        ('Daves', 'D450'),
+        ('Davies', 'D450'),
+        ('Davis', 'D450'),
+        ('Debaca', 'D450'),
+        ('Debose', 'D450'),
+        ('Debus', 'D450'),
+        ('Defazio', 'D450'),
+        ('Defigh', 'D450'),
+        ('Deveaux', 'D450'),
+        ('Devese', 'D450'),
+        ('Devies', 'D450'),
+        ('Devos', 'D450'),
+        ('Dipiazza', 'D450'),
+        ('Divish', 'D450'),
+        ('Dobak', 'D450'),
+        ('Dobbs', 'D450'),
+        ('Dobis', 'D450'),
+        ('Dobish', 'D450'),
+        ('Dobosh', 'D450'),
+        ('Doepke', 'D450'),
+        ('Dopps', 'D450'),
+        ('Doubek', 'D450'),
+        ('Doviak', 'D450'),
+        ('Dubbs', 'D450'),
+        ('Dubke', 'D450'),
+        ('Dubois', 'D450'),
+        ('Duboise', 'D450'),
+        ('Dubose', 'D450'),
+        ('Dubs', 'D450'),
+        ('Dubukey', 'D450'),
+        ('Dubus', 'D450'),
+        ('Dufek', 'D450'),
+        ('Duffek', 'D450'),
+        ('Dupas', 'D450'),
+        ('Dupois', 'D450'),
+        ('Dupuis', 'D450'),
+        ('Arlène', 'A332'),
+        ('Lüdenscheidt', 'L125')
+    ]
+    encoder = Lein()
+    for name, expected in cases:
+        assert encoder.phonetics(name) == expected<
\ No newline at end of file
A tests/test_utils.py => tests/test_utils.py +16 -0
@@ 0,0 1,16 @@
+from pyphonetics.utils import squeeze, translation
+
+
+def test_squeeze():
+    """squeeze() must collapse runs of repeated characters and nothing else."""
+    cases = [
+        ('test', 'test'),
+        ('hello yellow', 'helo yelow'),
+        ('112345566', '123456')
+    ]
+    for raw, expected in cases:
+        assert squeeze(raw) == expected
+
+
+def test_translation():
+    assert translation(['a', 'b', 'c'], [1, 2, 3]) == dict(a=1, b=2, c=3)<
\ No newline at end of file