syllabipy-0.2/0000755000175000017500000000000013062034444011455 5ustar jdgjdgsyllabipy-0.2/MANIFEST0000644000175000017500000000022213062034444012602 0ustar jdgjdg# file GENERATED by distutils, do NOT edit setup.cfg setup.py syllabipy/__init__.py syllabipy/legalipy.py syllabipy/sonoripy.py syllabipy/util.py syllabipy-0.2/README.md0000644000175000017500000000166213062034444012741 0ustar jdgjdg# syllabipy Collection of syllabification algorithms designed to be universal, aimed at low-resource languages without look-up techniques. No dependencies. LegaliPy and SonoriPy have been incorporated into the [Talisman](https://github.com/Yomguithereal/talisman) NLP library for JavaScript. ## installation syllabipy can be installed with `pip`: ~~~ $ pip install syllabipy ~~~ ### LegaliPy To get legal onsets for variable `text`: ~~~ >>> from syllabipy.legalipy import getOnsets >>> getOnsets(text) ~~~ To syllabify a word: ~~~ >>> from syllabipy.legalipy import LegaliPy >>> LegaliPy(word, getOnsets(text)) ~~~ Command line usage to syllabify a text file: ~~~ $ python legalipy.py text.txt ~~~ ### SonoriPy To syllabify a word: ~~~ >>> from syllabipy.sonoripy import SonoriPy >>> SonoriPy("justification") ['jus', 'ti', 'fi', 'ca', 'tion'] ~~~ Command line usage to syllabify a text file: ~~~ $ python sonoripy.py text.txt ~~~syllabipy-0.2/setup.py0000644000175000017500000000071413062034444013171 0ustar jdgjdgfrom distutils.core import setup setup( name='syllabipy', packages=['syllabipy'], version='0.1', description='Universal syllabification algorithms', author='Christopher Hench', author_email='chris.l.hench@gmail.com', url='https://github.com/henchc/syllabipy', download_url='https://github.com/henchc/syllabipy/archive/0.1.tar.gz', keywords=['syllabification', 'onset', 'sonority', 'ssp', 'legality'], classifiers=[], ) syllabipy-0.2/.gitignore0000644000175000017500000000202513062034444013444 0ustar jdgjdg# Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python env/ build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ *.egg-info/ .installed.cfg *.egg # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *,cover .hypothesis/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # IPython Notebook .ipynb_checkpoints # pyenv .python-version # celery beat schedule file celerybeat-schedule # dotenv .env # virtualenv venv/ ENV/ # Spyder project settings .spyderproject # Rope project settings .ropeproject syllabipy-0.2/syllabipy/0000755000175000017500000000000013062034444013465 5ustar jdgjdgsyllabipy-0.2/syllabipy/sonoripy.py0000644000175000017500000001710413062034444015724 0ustar jdgjdgfrom __future__ import unicode_literals # for python2 compatibility # -*- coding: utf-8 -*- # created at UC Berkeley 2015 # Authors: Christopher Hench & Alex Estes © 2016 import codecs import sys from syllabipy.util import cleantext from datetime import datetime def SonoriPy(word, IPA=False): ''' This program syllabifies words based on the Sonority Sequencing Principle (SSP) >>> SonoriPy("justification") ['jus', 'ti', 'fi', 'ca', 'tion'] ''' def no_syll_no_vowel(ss): ''' cannot be a syllable without a vowel ''' nss = [] front = "" for i, syll in enumerate(ss): # if following syllable doesn't have vowel, # add it to the current one if not any(char in vowels for char in syll): if len(nss) == 0: front += syll else: nss = nss[:-1] + [nss[-1] + syll] else: if len(nss) == 0: nss.append(front + syll) else: nss.append(syll) return nss # SONORITY HIERARCHY, MODIFY FOR LANGUAGE BELOW # categories should be collapsed into more general groups vowels = 'aeiouyàáâäæãåāèéêëēėęîïíīįìôöòóœøōõûüùúūůÿ' approximates = '' nasals = 'lmnrw' fricatives = 'zvsf' affricates = '' stops = 'bcdgtkpqxhj' # SONORITY HIERARCHY for IPà if IPA: # categories can be collapsed into more general groups vowelcount = 0 # if vowel count is 1, syllable is automatically 1 sylset = [] # to collect letters and corresponding values for letter in word.strip(".:;?!)('" + '"'): if letter.lower() in 'aɔʌã': sylset.append((letter, 9)) vowelcount += 1 # to check for monosyllabic words elif letter.lower() in 'eéẽɛøoõ': sylset.append((letter, 8)) vowelcount += 1 # to check for monosyllabic words elif letter.lower() in 'iu': sylset.append((letter, 7)) vowelcount += 1 # to check for monosyllabic words elif letter.lower() in 'jwɥh': sylset.append((letter, 6)) elif letter.lower() in 'rl': sylset.append((letter, 5)) elif letter.lower() in 'mn': sylset.append((letter, 4)) elif letter.lower() in 'zvðʒ': sylset.append((letter, 3)) elif letter.lower() in 'sfθʃ': sylset.append((letter, 2)) elif letter.lower() in 'bdg': sylset.append((letter, 1)) elif letter.lower() in 'ptkx': sylset.append((letter, 0)) else: sylset.append((letter, 0)) # assign numerical values to phonemes (characters) vowelcount = 0 # if vowel count is 1, syllable is automatically 1 sylset = [] # to collect letters and corresponding values for letter in word: if letter.lower() in vowels: sylset.append((letter, 5)) vowelcount += 1 elif letter.lower() in approximates: sylset.append((letter, 4)) elif letter.lower() in nasals: sylset.append((letter, 3)) elif letter.lower() in fricatives: sylset.append((letter, 2)) elif letter.lower() in affricates: sylset.append((letter, 1)) elif letter.lower() in stops: sylset.append((letter, 0)) else: sylset.append((letter, 0)) # SSP syllabification follows final_sylset = [] if vowelcount == 1: # finalize word immediately if monosyllabic final_sylset.append(word) if vowelcount != 1: syllable = '' # prepare empty syllable to build upon for i, tup in enumerate(sylset): if i == 0: # if it's the first letter, append automatically syllable += tup[0] else: # add whatever is left at end of word, last letter if i == len(sylset) - 1: syllable += tup[0] final_sylset.append(syllable) # MAIN ALGORITHM BELOW # these cases DO NOT trigger syllable breaks elif (i < len(sylset) - 1) and tup[1] < sylset[i + 1][1] and \ tup[1] > sylset[i - 1][1]: syllable += tup[0] elif (i < len(sylset) - 1) and tup[1] > sylset[i + 1][1] and \ tup[1] < sylset[i - 1][1]: syllable += tup[0] elif (i < len(sylset) - 1) and tup[1] > sylset[i + 1][1] and \ tup[1] > sylset[i - 1][1]: syllable += tup[0] elif (i < len(sylset) - 1) and tup[1] > sylset[i + 1][1] and \ tup[1] == sylset[i - 1][1]: syllable += tup[0] elif (i < len(sylset) - 1) and tup[1] == sylset[i + 1][1] and \ tup[1] > sylset[i - 1][1]: syllable += tup[0] elif (i < len(sylset) - 1) and tup[1] < sylset[i + 1][1] and \ tup[1] == sylset[i - 1][1]: syllable += tup[0] # these cases DO trigger syllable break # if phoneme value is equal to value of preceding AND following # phoneme elif (i < len(sylset) - 1) and tup[1] == sylset[i + 1][1] and \ tup[1] == sylset[i - 1][1]: syllable += tup[0] # append and break syllable BEFORE appending letter at # index in new syllable final_sylset.append(syllable) syllable = "" # if phoneme value is less than preceding AND following value # (trough) elif (i < len(sylset) - 1) and tup[1] < sylset[i + 1][1] and \ tup[1] < sylset[i - 1][1]: # append and break syllable BEFORE appending letter at # index in new syllable final_sylset.append(syllable) syllable = "" syllable += tup[0] # if phoneme value is less than preceding value AND equal to # following value elif (i < len(sylset) - 1) and tup[1] == sylset[i + 1][1] and \ tup[1] < sylset[i - 1][1]: syllable += tup[0] # append and break syllable BEFORE appending letter at # index in new syllable final_sylset.append(syllable) syllable = "" final_sylset = no_syll_no_vowel(final_sylset) return (final_sylset) # command line usage if __name__ == '__main__': print("\n\nSonoriPy-ing...\n") sfile = sys.argv[1] # input text file to syllabify with open(sfile, 'r', encoding='utf-8') as f: text = f.read() sylls = [SonoriPy(w) for w in cleantext(text).split()] toprint = "" for word in sylls: for syll in word: if syll != word[-1]: toprint += syll toprint += "-" else: toprint += syll toprint += " " fmt = '%Y/%m/%d %H:%M:%S' date = "SonoriPyed on " + str(datetime.now().strftime(fmt)) finalwrite = date + "\n\n" + toprint with open('SonoriPyed.txt', 'w', encoding='utf-8') as f: f.write(finalwrite) print("\nResults saved to SonoriPyed.txt\n\n") syllabipy-0.2/syllabipy/util.py0000644000175000017500000000075313062034444015021 0ustar jdgjdgfrom __future__ import unicode_literals # for python2 compatibility # -*- coding: utf-8 -*- # created at UC Berkeley 2015 # Authors: Christopher Hench & Alex Estes © 2016 import string def cleantext(text): ''' cleans text of numbers, punctuation, and other non-syllabifiable characters ''' text = ''.join([x for x in text if not x.isdigit()]) text = ''.join( [x for x in text if x not in string.punctuation + '»«˃˂〈〉♦•—¿·']) return text syllabipy-0.2/syllabipy/__init__.py0000644000175000017500000000000013062034444015564 0ustar jdgjdgsyllabipy-0.2/syllabipy/legalipy.py0000644000175000017500000001172213062034444015650 0ustar jdgjdgfrom __future__ import unicode_literals # for python2 compatibility # -*- coding: utf-8 -*- # created at UC Berkeley 2015 # Authors: Christopher Hench & Alex Estes © 2016 import codecs import sys from datetime import datetime from syllabipy.util import cleantext from collections import Counter def getOnsets(text, threshold=.0002, clean=True): ''' takes text and yields list of onsets and words ''' vowels = 'aeiouyàáâäæãåāèéêëēėęîïíīįìôöòóœøōõûüùúūůÿ' if clean: tokens = cleantext(text).split() else: tokens = text.split() onsets = [] for word in tokens: word = word.lower() onset = "" for letter in word: if letter not in vowels: # onset is everying up to first vowel onset += letter else: break onsets.append(onset) onsets = [x for x in onsets if x != ''] # get rid of empty onsets # now remove onsets caused by errors, i.e. less than .02% of onsets freq = Counter(onsets) total_onsets = 0 for k, v in freq.items(): total_onsets += v onsets = [] for k, v in freq.items(): if (v / total_onsets) > threshold: onsets.append(k) return onsets def LegaliPy(word, onsets): ''' This function syllabifies any text in any language solely on the Onset Maximization principle (Principle of Legality) ''' longest_onset = len(max(onsets, key=len)) vowels = 'aeiouyàáâäæãåāèéêëēėęîïíīįìôöòóœøōõûüùúūůÿ' vowelcount = 0 revword = word[::-1] # reverse word to build onsets from back syllset = [] for letter in revword: if letter.lower() in vowels: vowelcount += 1 else: pass if vowelcount == 1: # monosyllabic syllset.append(revword) # begin main algorithm elif vowelcount > 1: syll = "" # to build syllable # following binaries trigger different routes onsetbinary = 0 newsyllbinary = 1 for letter in revword: if newsyllbinary == 1: # if we have a new syllable if letter.lower() not in vowels: syll += letter else: syll += letter newsyllbinary = 0 continue elif newsyllbinary == 0: # no longer new syllable if syll == "": # creates last syllable syll += letter elif (letter.lower() in onsets and syll[-1].lower() in vowels): syll += letter onsetbinary = 1 elif (letter.lower() + syll[-1].lower() in [ons[-2:] for ons in onsets] and syll[-2].lower() in vowels): syll += letter onsetbinary = 1 elif (letter.lower() + syll[-2:][::-1].lower() in [ons[-3:] for ons in onsets] and syll[-3].lower() in vowels): syll += letter onsetbinary = 1 elif (letter + syll[-3:][::-1].lower() in [ons[-4:] for ons in onsets] and syll[-4].lower() in vowels): syll += letter onsetbinary = 1 # order is important for following two conditionals # syllable doesn't end in vowel (onset not yet found) elif letter.lower() in vowels and onsetbinary == 0: syll += letter # syllable ends in vowel, onset found, restart syllable elif letter.lower() in vowels and onsetbinary == 1: syllset.append(syll) syll = letter else: syllset.append(syll) syll = letter newsyllbinary = 1 syllset.append(syll) # reverse syllset then reverse syllables syllset = [syll[::-1] for syll in syllset][::-1] return (syllset) # command line program if __name__ == '__main__': print("\n\nLegaliPy-ing...\n") sfile = sys.argv[1] # input text file to syllabify with open(sfile, 'r', encoding='utf-8') as f: text = f.read() onsets = getOnsets(text) toprintl = [] for token in cleantext(text).split(): toprintl.append(LegaliPy(token, onsets)) toprint = "" for word in toprintl: for syll in word: if syll != word[-1]: toprint += syll toprint += "-" else: toprint += syll toprint += " " onsetprint = (" - ".join([x for x in onsets]) + '\n\n') prologue = "Following onsets > .02 percent deemed 'legal':\n" fmt = '%Y/%m/%d %H:%M:%S' date = "LegaliPyed on " + str(datetime.now().strftime(fmt)) finalwrite = date + "\n\n" + prologue + onsetprint + toprint with open('LegaliPyed.txt', 'w', encoding='utf-8') as f: f.write(finalwrite) print("\nResults saved to LegaliPyed.txt\n\n") syllabipy-0.2/LICENSE0000644000175000017500000000211313062034444012457 0ustar jdgjdgThe MIT License (MIT) Copyright (c) 2016 Alex Estes and Christopher Hench Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. syllabipy-0.2/setup.cfg0000644000175000017500000000004713062034444013277 0ustar jdgjdg[metadata] description-file = README.md