A README.md => README.md +43 -0
@@ 0,0 1,43 @@
+# Pyphonetics
+
+Pyphonetics is a Python 3 library for phonetic algorithms. Right now, the following algorithms are implemented and supported:
+
+ * Soundex
+ * Metaphone
+ * Refined Soundex
+ * Fuzzy Soundex
+ * Lein
+ * Match Rating Approach
+
+More will be added in the future.
+
+## Installation
+
+The module is available on PyPI; install it with `pip install pyphonetics`.
+
+
+## Usage
+
+```python
+>>> from pyphonetics import Soundex
+>>> soundex = Soundex()
+>>> soundex.phonetics('Rupert')
+'R163'
+>>> soundex.phonetics('Robert')
+'R163'
+>>> soundex.sounds_like('Robert', 'Rupert')
+True
+```
+
+The same API applies to every algorithm, e.g.:
+
+```python
+>>> from pyphonetics import Metaphone
+>>> metaphone = Metaphone()
+>>> metaphone.phonetics('discrimination')
+'TSKRMNXN'
+```
+
+## Credits
+
+The module was largely based on the implementation of phonetic algorithms found in the [Talisman.js](https://github.com/Yomguithereal/talisman) Node NLP library.<
\ No newline at end of file
M README.rst => README.rst +41 -1
@@ 2,4 2,44 @@
Pyphonetics
===========
-A Python 3 phonetics library.>
\ No newline at end of file
+Pyphonetics is a Python 3 library for phonetic algorithms. Right now, the following algorithms are implemented and supported:
+
+ * Soundex
+ * Metaphone
+ * Refined Soundex
+ * Fuzzy Soundex
+ * Lein
+ * Match Rating Approach
+
+More will be added in the future.
+
+Installation
+************
+
+The module is available on PyPI; install it with ``pip install pyphonetics``.
+
+
+Usage
+*****
+
+ >>> from pyphonetics import Soundex
+ >>> soundex = Soundex()
+ >>> soundex.phonetics('Rupert')
+ 'R163'
+ >>> soundex.phonetics('Robert')
+ 'R163'
+ >>> soundex.sounds_like('Robert', 'Rupert')
+ True
+
+
+The same API applies to every algorithm, e.g.:
+
+ >>> from pyphonetics import Metaphone
+ >>> metaphone = Metaphone()
+ >>> metaphone.phonetics('discrimination')
+ 'TSKRMNXN'
+
+Credits
+*******
+
+The module was largely based on the implementation of phonetic algorithms found in the `Talisman.js <https://github.com/Yomguithereal/talisman>`_ Node NLP library.<
\ No newline at end of file
M pyphonetics/__init__.py => pyphonetics/__init__.py +9 -2
@@ 1,4 1,11 @@
"""A Python 3 phonetics library."""
-from .phonetics import Soundex, Metaphone, MatchingRatingApproach, FuzzySoundex, Lein
+from .phonetics import (Soundex,
+ Metaphone,
+ MatchingRatingApproach,
+ FuzzySoundex,
+ Lein,
+ RefinedSoundex,
+ # DaitchMokotoff
+ )
-__version__ = '0.2'
+__version__ = '0.3.1'
M pyphonetics/phonetics/__init__.py => pyphonetics/phonetics/__init__.py +3 -1
@@ 2,4 2,6 @@ from .soundex import *
from .metaphone import *
from .mra import *
from .fuzzy_soundex import *
-from .lein import *>
\ No newline at end of file
+from .lein import *
+from .refined_soundex import *
+# from .daitch_mokotoff import *
A pyphonetics/phonetics/daitch_mokotoff.py => pyphonetics/phonetics/daitch_mokotoff.py +201 -0
@@ 0,0 1,201 @@
+import re
+from unidecode import unidecode
+
+from ..exceptions import UnicodeException
+from .phonetic_algorithm import PhoneticAlgorithm
+
+
+class DaitchMokotoff(PhoneticAlgorithm):
+ """
+ The Daitch-Mokotoff Soundex.
+
+ [Reference]: https://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex
+ [Note]: For the (RS|RZ) part, the original algo says (94, 4) but most implementations
+ drop it to only (94). This implementation follows the original algo.
+ """
+ def __init__(self):
+ self.rules = {
+ 'A': [
+ [r'^(AI|AJ|AY)', 0, 1, None],
+ [r'^AU', 0, 7, None],
+ [None, 0, None, None]
+ ],
+ 'Ą': [
+ [None, None, None, [6, None]]
+ ],
+ 'B': [
+ [None, 7, 7, 7]
+ ],
+ 'C': [
+ [r'^CHS', 5, 54, 54],
+ [r'^CH', [5, 4], [5, 4], [5, 4]],
+ [r'^CK', [5, 45], [5, 45], [5, 45]],
+ [r'^(CSZ|CZS|CZ|CS)', 4, 4, 4],
+ [None, [5, 4], [5, 4], [5, 4]]
+ ],
+ 'D': [
+ [r'^(DRZ|DRS|DSH|DSZ|DZH|DZS|DS|DZ)', 4, 4, 4],
+ [r'^(DT|D)', 3, 3, 3]
+ ],
+ 'E': [
+ [r'^(EI|EJ|EY)', 0, 1, None],
+ [r'^EU', 1, 1, None],
+ [None, 0, None, None],
+ ],
+ 'Ę': [
+ [None, None, None, [6, None]]
+ ],
+ 'F': [
+ [r'^(FB|F)', 7, 7, 7]
+ ],
+ 'G': [
+ [None, 5, 5, 5]
+ ],
+ 'H': [
+ [None, 5, 5, None]
+ ],
+ 'I': [
+ [r'^(IA|IE|IO|IU)', 1, None, None],
+ [None, 0, None, None]
+ ],
+ 'J': [
+ [None, [1, 4], [None, 4], [None, 4]]
+ ],
+ 'K': [
+ [r'^KS', 5, 54, 54],
+ [r'^(KH|K)', 5, 5, 5]
+ ],
+ 'L': [
+ [None, 8, 8, 8]
+ ],
+ 'M': [
+ ['MNNM', None, 66, 66],
+ ['MN', 6, 6, 6]
+ ],
+ 'N': [
+ ['MNNM', None, 66, 66],
+ ['MN', 6, 6, 6],
+ ],
+ 'O': [
+ [r'^(OI|OJ|OY)', 0, 1, None],
+ [None, 0, None, None]
+ ],
+ 'P': [
+ [r'^(PF|PH|P)', 7, 7, 7]
+ ],
+ 'Q': [
+ [None, 5, 5, 5]
+ ],
+ 'R': [
+ [r'^(RZ|RS)', [94, 4], [94, 4], [94, 4]],
+ [None, 9, 9, 9]
+ ],
+ 'S': [
+ [r'^(SCHTSCH|SCHTSH|SCHTCH|SHTCH|SHCH|SHTSH)', 2, 4, 4],
+ [r'^SCH', 4, 4, 4],
+ [r'^(SHT|SCHT|SCHD)', 2, 43, 43],
+ [r'^SH', 4, 4, 4],
+ [r'^(STCH|STSCH|SC|STRZ|STRS|STSH)', 2, 4, 4],
+ [r'^ST', 2, 43, 43],
+ [r'^(SZCZ|SZCS)', 2, 4, 4],
+ [r'^(SZT|SHD|SZD|SD)', 2, 43, 43],
+ [r'^(SZ|S)', 4, 4, 4]
+ ],
+ 'T': [
+ [r'^(TCH|TTCH|TTSCH)', 4, 4, 4],
+ [r'^TH', 3, 3, 3],
+ [r'^(TRZ|TRS|TSCH|TSH|TS|TTS|TTSZ|TC|TZ|TTZ|TZS|TSZ)', 4, 4, 4],
+ [None, 3, 3, 3]
+ ],
+ 'Ţ': [
+ [None, [3, 4], [3, 4], [3, 4]]
+ ],
+ 'U': [
+ [r'^(UI|UJ|UY)', 0, 1, None],
+ [r'^(UE|U)', 0, None, None]
+ ],
+ 'V': [
+ [None, 7, 7, 7]
+ ],
+ 'W': [
+ [None, 7, 7, 7]
+ ],
+ 'X': [
+ [None, 5, 54, 54]
+ ],
+ 'Y': [
+ [None, 1, None, None]
+ ],
+ 'Z': [
+ [r'^(ZHDZH|ZDZH|ZDZ)', 2, 4, 4],
+ [r'^(ZHD|ZD)', 2, 43, 43],
+ [r'^(ZSCH|ZSH|ZH|ZS|Z)', 4, 4, 4]
+ ],
+ }
+ self.pad = lambda code: '{}000000'.format(code)[:6]
+ self.vowels = 'AEIOUY'
+
+ def _permutations(self, code):
+ codes = ['']
+
+ for current_part in code:
+
+ if isinstance(current_part, dict):
+ # Double the codes
+ for item in codes:
+ codes.append(item)
+
+ # Fill the nodes
+ length = len(codes)
+ for i in range(length):
+ s = current_part[0] if i < length/2 else current_part[1]
+ codes[i] = codes[i] + s if s is not None else codes[i]
+
+ else:
+ for i in range(len(codes)):
+ codes[i] += current_part
+
+ def phonetics(self, word):
+ if not isinstance(word, str):
+ raise UnicodeException('Expected a unicode string!')
+
+ code = []
+ word = unidecode(word).upper()
+ current = re.sub(r'[^A-ZĄĘŢ]', r'', word)
+
+ start = True
+ last_pattern = ''
+
+ while len(current):
+ first_letter = current[0]
+ rules = self.rules[first_letter]
+
+ for rule in rules:
+ pattern, if_first_letter,\
+ vowel_next, usual = rule
+
+ match = re.match(pattern, current) if pattern else [first_letter]
+ if match:
+ if isinstance(match, list):
+ offset = len(match[0])
+ else:
+ offset = len(pattern)
+
+ correct_code = usual
+
+ if start:
+ correct_code = if_first_letter
+ elif current[offset] in self.vowels:
+ correct_code = vowel_next
+
+ if last_pattern != pattern and correct_code is not None:
+ code.append(correct_code)
+
+ last_pattern = pattern or first_letter
+ current = current[offset:]
+ break
+
+ start = False
+
+ return map(self.pad, self._permutations(code))
+
M pyphonetics/phonetics/metaphone.py => pyphonetics/phonetics/metaphone.py +8 -8
@@ 33,14 33,14 @@ class Metaphone(PhoneticAlgorithm):
(r'd', r'T'),
(r'g(?=h[^aeiou])', r''),
(r'gn(ed)?', r'N'),
- (r'([^g]|^)g(?=[iey])', '\1J'),
- (r'g+', 'K'),
- (r'ph', 'F'),
- (r'([aeiou])h(?=\b|[^aeiou])', '\1'),
- (r'[wy](?![aeiou])', ''),
- (r'z', 'S'),
- (r'v', 'F'),
- (r'(?!^)[aeiou]+', '')
+ (r'([^g]|^)g(?=[iey])', r'\1J'),
+ (r'g+', r'K'),
+ (r'ph', r'F'),
+ (r'([aeiou])h(?=\b|[^aeiou])', r'\1'),
+ (r'[wy](?![aeiou])', r''),
+ (r'z', r'S'),
+ (r'v', r'F'),
+ (r'(?!^)[aeiou]+', r'')
]
def phonetics(self, word):
A pyphonetics/phonetics/refined_soundex.py => pyphonetics/phonetics/refined_soundex.py +34 -0
@@ 0,0 1,34 @@
+import re
+from unidecode import unidecode
+
+from ..utils import translation, squeeze
+from ..exceptions import UnicodeException
+from .phonetic_algorithm import PhoneticAlgorithm
+
+
+class RefinedSoundex(PhoneticAlgorithm):
+ """
+ The Refined Soundex algorithm.
+
+ [Reference]: https://en.wikipedia.org/wiki/Soundex
+ [Authors]: Robert C. Russel, Margaret King Odell
+ """
+ def __init__(self):
+ self.translations = translation(
+ 'AEIOUYWHBPFVCKSGJQXZDTLMNR',
+ '000000DD112233344555667889'
+ )
+
+ def phonetics(self, word):
+ if not isinstance(word, str):
+ raise UnicodeException('Expected a unicode string!')
+
+ word = unidecode(word).upper()
+ word = re.sub(r'[^A-Z]', r'', word)
+
+ first_letter = word[0]
+ tail = ''.join(self.translations[char] for char in word
+ if self.translations[char] != 'D')
+
+ code = squeeze(tail)
+ return first_letter + code
M pyphonetics/phonetics/soundex.py => pyphonetics/phonetics/soundex.py +2 -29
@@ 13,18 13,11 @@ class Soundex(PhoneticAlgorithm):
[Reference]: https://en.wikipedia.org/wiki/Soundex
[Authors]: Robert C. Russel, Margaret King Odell
"""
- def __init__(self, refined=False):
+ def __init__(self):
self.translations = translation(
'AEIOUYWHBPFVCSKGJQXZDTLMNR',
'000000DD111122222222334556'
)
-
- self.refined_translations = translation(
- 'AEIOUYWHBPFVCKSGJQXZDTLMNR',
- '000000DD112233344555667889'
- )
-
- self.refined = refined
self.pad = lambda code: '{}0000'.format(code)[:4]
def phonetics(self, word):
@@ 35,19 28,7 @@ class Soundex(PhoneticAlgorithm):
word = re.sub(r'[^A-Z]', r'', word)
first_letter = word[0]
-
- if self.refined:
- return self._refined_soundex(first_letter, word)
- else:
- return self._soundex(first_letter, word)
-
- #
- # Private methods Simple/Refined Soundex
- #
- def _soundex(self, first_letter, word):
- """Soundex algorithm."""
- tail = ''.join(self.translations[char]
- for char in word
+ tail = ''.join(self.translations[char] for char in word
if self.translations[char] != 'D')
# Dropping first code's letter if duplicate
@@ 56,11 37,3 @@ class Soundex(PhoneticAlgorithm):
code = squeeze(tail).replace('0', '')
return self.pad(first_letter + code)
-
- def _refined_soundex(self, first_letter, word):
- """Refined Soundex algorithm."""
- tail = ''.join(self.refined_translations[char]
- for char in word
- if self.refined_translations[char] != 'D')
- code = squeeze(tail)
- return first_letter + code
M tests/test_phonetics.py => tests/test_phonetics.py +57 -3
@@ 1,4 1,5 @@
-from pyphonetics import Metaphone, Soundex, MatchingRatingApproach, FuzzySoundex, Lein
+from pyphonetics import Metaphone, Soundex, MatchingRatingApproach,\
+ FuzzySoundex, Lein, RefinedSoundex
def test_metaphone():
@@ 66,7 67,7 @@ def test_soundex_refined():
('D6043', 'dogs')
]
- soundex = Soundex(refined=True)
+ soundex = RefinedSoundex()
for test in tests:
assert soundex.phonetics(test[1]) == test[0]
@@ 186,4 187,57 @@ def test_lein():
lein = Lein()
for test in tests:
- assert lein.phonetics(test[0]) == test[1]>
\ No newline at end of file
+ assert lein.phonetics(test[0]) == test[1]
+
+
+# def test_daitch_mokotoff():
+# tests = [
+# ['Alpert', ['087930']],
+# ['Breuer', ['791900']],
+# ['Golden', ['583600']],
+# ['Haber', ['579000']],
+# ['Manheim', ['665600']],
+# ['Topf', ['370000']],
+# ['Kleinman', ['586660']],
+# ['Peters', ['739400', '734000']],
+# ['Peterson', ['739460', '734600']],
+# ['Moskowitz', ['645740']],
+# ['Moskovitz', ['645740']],
+# ['Auerbach', ['097500', '097400']],
+# ['Ohrbach', ['097500', '097400']],
+# ['Uhrbach', ['097500', '097400']],
+# ['Lipshitz', ['874400']],
+# ['Lippszyc', ['874500', '874400']],
+# ['Lewinsky', ['876450']],
+# ['Levinsky', ['876450']],
+# ['Szlamawicz', ['486740']],
+# ['Shlamovitz', ['486740']],
+# ['Jackson', ['154600', '454600', '145460', '445460']],
+# ['Jackson-Jackson', ['154654', '454654', '145465', '445465',
+# '154645', '454645', '145464', '445464', '154644', '454644']],
+# ['augsburg', ['054795']],
+# ['halberstadt', ['587943', '587433']],
+# ['mannheim', ['665600']],
+# ['chernowitz', ['596740', '496740']],
+# ['cherkassy', ['595400', '495400']],
+# ['berlin', ['798600']],
+# ['mintz', ['664000']],
+# ['eisenstadt', ['046433']],
+# ['izenstadt', ['046433']],
+# ['lewin', ['876000']],
+# ['levine', ['876000']],
+# ['szlachter', ['485390', '484390']],
+# ['chelm', ['586000', '486000']],
+# ['chelmie', ['586000', '486000']],
+# ['chelma', ['586000', '486000']],
+# ['helm', ['586000']],
+# ['daitch', ['340000']],
+# ['levy', ['870000']],
+# ['mokotoff', ['653700']],
+# ['chajackachac', ['515550', '415550', '514555', '414555', '515450', '415450', '514545',
+# '414545', '515540', '415540','514554', '414554', '515440', '415440', '514544', '414544']]
+# ]
+#
+# dm = DaitchMokotoff()
+# for test in tests:
+# assert dm.phonetics(test[0]) == test[1]
M tests/test_utils.py => tests/test_utils.py +1 -1
@@ 13,4 13,4 @@ def test_squeeze():
def test_translation():
- assert translation(['a', 'b', 'c'], [1, 2, 3]) == {'a': 1, 'b': 2, 'c': 3}>
\ No newline at end of file
+ assert translation(['a', 'b', 'c'], [1, 2, 3]) == {'a': 1, 'b': 2, 'c': 3}