khmerconverter v.1.4/0000755000175000001440000000000010652347056013406 5ustar k-dauserskhmerconverter v.1.4/TODO.TXT0000644000175000001440000000025610627165305014554 0ustar k-dausers- file filters for dialogs - include FK2 (font differ from FK1) - add non-breaking-space with some more characters (khan) -ឲ in Limon is mapped to wrong character khmerconverter v.1.4/modules/0000755000175000001440000000000010652347121015047 5ustar k-dauserskhmerconverter v.1.4/modules/legacyReorder.py0000644000175000001440000005141710627160564020226 0ustar k-dausers#!/usr/bin/python # -*- coding: utf8 -*- # Khmer Lnicode fonts to Khmer Legacy Conversion # (c) 2006 The WordForge Foundation, all rights reserved. # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public License # as published by the Free Software Foundation; either version 2.1 # of the License, or (at your option) any later version. # # See the LICENSE file for more details. # # Developed by: # Hok Kakada (hokkakada@khmeros.info) # Keo Sophon (keosophon@khmeros.info) # San Titvirak (titvirak@khmeros.info) # Seth Chanratha (sethchanratha@khmeros.info) # # This module reorder unicode string accordding unicode order import unittest # important character to test in order to form a cluster SRAAA = unichr(0x17B6) SRAE = unichr(0x17C1) SRAOE = unichr(0x17BE) SRAOO = unichr(0x17C4) SRAYA = unichr(0x17BF) SRAIE = unichr(0x17C0) SRAAU = unichr(0x17C5) SRAII = unichr(0x17B8) SRAU = unichr(0x17BB) TRIISAP = unichr(0x17CA) MUUSIKATOAN = unichr(0x17C9) SAMYOKSANNYA = unichr(0x17D0) LA = unichr(0x17A1) NYO = unichr(0x1789) BA = unichr(0x1794) YO = unichr(0x1799) SA = unichr(0x179F) COENG = unichr(0x17D2) CORO = unichr(0x17D2) + unichr(0x179A) CONYO = unichr(0x17D2) + unichr(0x1789) SRAOM = unichr(0x17C6) MARK = unichr(0x17EA) #TODO: think about another relacement for the dotted circle; DOTCIRCLE = u'' # possible combination for sra E sraEcombining = { SRAOE:SRAII, 
SRAYA:SRAYA, SRAIE:SRAIE, SRAOO:SRAAA, SRAAU:SRAAU } CC_RESERVED = 0 CC_CONSONANT = 1 # Consonant of type 1 or independent vowel CC_CONSONANT2 = 2 # Consonant of type 2 CC_CONSONANT3 = 3 # Consonant of type 3 CC_ZERO_WIDTH_NJ_MARK = 4 # Zero Width non joiner character (0x200C) CC_CONSONANT_SHIFTER = 5 CC_ROBAT = 6 # Khmer special diacritic accent -treated differently in state table CC_COENG = 7 # Subscript consonant combining character CC_DEPENDENT_VOWEL = 8 CC_SIGN_ABOVE = 9 CC_SIGN_AFTER = 10 CC_ZERO_WIDTH_J_MARK = 11 # Zero width joiner character CC_COUNT = 12 # This is the number of character classes CF_CLASS_MASK = 0x0000FFFF CF_CONSONANT = 0x01000000 # flag to speed up comparing CF_SPLIT_VOWEL = 0x02000000 # flag for a split vowel -> the first part is added in front of the syllable CF_DOTTED_CIRCLE = 0x04000000 # add a dotted circle if a character with this flag is the first in a # syllable CF_COENG = 0x08000000 # flag to speed up comparing CF_SHIFTER = 0x10000000 # flag to speed up comparing CF_ABOVE_VOWEL = 0x20000000 # flag to speed up comparing # position flags CF_POS_BEFORE = 0x00080000 CF_POS_BELOW = 0x00040000 CF_POS_ABOVE = 0x00020000 CF_POS_AFTER = 0x00010000 CF_POS_MASK = 0x000f0000 # simple classes, they are used in the state table (in this file) to control the length of a syllable # they are also used to know where a character should be placed (location in reference to the base character) # and also to know if a character, when independently displayed, should be displayed with a dotted-circle to # indicate error in syllable construction _xx = CC_RESERVED _sa = CC_SIGN_ABOVE | CF_DOTTED_CIRCLE | CF_POS_ABOVE _sp = CC_SIGN_AFTER | CF_DOTTED_CIRCLE| CF_POS_AFTER _c1 = CC_CONSONANT | CF_CONSONANT _c2 = CC_CONSONANT2 | CF_CONSONANT _c3 = CC_CONSONANT3 | CF_CONSONANT _rb = CC_ROBAT | CF_POS_ABOVE | CF_DOTTED_CIRCLE _cs = CC_CONSONANT_SHIFTER | CF_DOTTED_CIRCLE | CF_SHIFTER _dl = CC_DEPENDENT_VOWEL | CF_POS_BEFORE | CF_DOTTED_CIRCLE _db = 
CC_DEPENDENT_VOWEL | CF_POS_BELOW | CF_DOTTED_CIRCLE _da = CC_DEPENDENT_VOWEL | CF_POS_ABOVE | CF_DOTTED_CIRCLE | CF_ABOVE_VOWEL _dr = CC_DEPENDENT_VOWEL | CF_POS_AFTER | CF_DOTTED_CIRCLE _co = CC_COENG | CF_COENG | CF_DOTTED_CIRCLE # split vowel _va = _da | CF_SPLIT_VOWEL _vr = _dr | CF_SPLIT_VOWEL # Character class tables # _xx character does not combine into syllable, such as numbers, puntuation marks, non-Khmer signs... # _sa Sign placed above the base # _sp Sign placed after the base # _c1 Consonant of type 1 or independent vowel (independent vowels behave as type 1 consonants) # _c2 Consonant of type 2 (only RO) # _c3 Consonant of type 3 # _rb Khmer sign robat u17CC. combining mark for subscript consonants # _cd Consonant-shifter # _dl Dependent vowel placed before the base (left of the base) # _db Dependent vowel placed below the base # _da Dependent vowel placed above the base # _dr Dependent vowel placed behind the base (right of the base) # _co Khmer combining mark COENG u17D2, combines with the consonant or independent vowel following # it to create a subscript consonant or independent vowel # _va Khmer split vowel in wich the first part is before the base and the second one above the base # _vr Khmer split vowel in wich the first part is before the base and the second one behind (right of) the base khmerCharClasses = [ _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c1, _c1, # 1780 - 178F _c1, _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c2, _c1, _c1, _c1, _c3, _c3, # 1790 - 179F _c1, _c3, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, # 17A0 - 17AF _c1, _c1, _c1, _c1, _dr, _dr, _dr, _da, _da, _da, _da, _db, _db, _db, _va, _vr, # 17B0 - 17BF _vr, _dl, _dl, _dl, _vr, _vr, _sa, _sp, _sp, _cs, _cs, _sa, _rb, _sa, _sa, _sa, # 17C0 - 17CF _sa, _sa, _co, _sa, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _sa, _xx, _xx, # 17D0 - 17DF ] #khmerStateTable[][CC_COUNT] = khmerStateTable = [ # xx c1 c2 c3 zwnj cs rb co 
dv sa sp zwj [ 1, 2, 2, 2, 1, 1, 1, 6, 1, 1, 1, 2], # 0 - ground state [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], # 1 - exit state (or sign to the right of the # syllable) [-1, -1, -1, -1, 3, 4, 5, 6, 16, 17, 1, -1], # 2 - Base consonant [-1, -1, -1, -1, -1, 4, -1, -1, 16, -1, -1, -1], # 3 - First ZWNJ before a register shifter # It can only be followed by a shifter or a vowel [-1, -1, -1, -1, 15, -1, -1, 6, 16, 17, 1, 14], # 4 - First register shifter [-1, -1, -1, -1, -1, -1, -1, -1, 20, -1, 1, -1], # 5 - Robat [-1, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1], # 6 - First Coeng [-1, -1, -1, -1, 12, 13, -1, 10, 16, 17, 1, 14], # 7 - First consonant of type 1 after coeng [-1, -1, -1, -1, 12, 13, -1, -1, 16, 17, 1, 14], # 8 - First consonant of type 2 after coeng [-1, -1, -1, -1, 12, 13, -1, 10, 16, 17, 1, 14], # 9 - First consonant or type 3 after ceong [-1, 11, 11, 11, -1, -1, -1, -1, -1, -1, -1, -1], # 10 - Second Coeng (no register shifter before) [-1, -1, -1, -1, 15, -1, -1, -1, 16, 17, 1, 14], # 11 - Second coeng consonant (or ind. 
vowel) no # register shifter before [-1, -1, -1, -1, -1, 13, -1, -1, 16, -1, -1, -1], # 12 - Second ZWNJ before a register shifter [-1, -1, -1, -1, 15, -1, -1, -1, 16, 17, 1, 14], # 13 - Second register shifter [-1, -1, -1, -1, -1, -1, -1, -1, 16, -1, -1, -1], # 14 - ZWJ before vowel [-1, -1, -1, -1, -1, -1, -1, -1, 16, -1, -1, -1], # 15 - ZWNJ before vowel [-1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 1, 18], # 16 - dependent vowel [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 18], # 17 - sign above [-1, -1, -1, -1, -1, -1, -1, 19, -1, -1, -1, -1], # 18 - ZWJ after vowel [-1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1], # 19 - Third coeng [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1] # 20 - dependent vowel after a Robat ] def getCharClass(uniChar): """ input one unicode character; output an integer which is the Khmer type of the character or 0 """ if (type(uniChar) != unicode): raise TypeError('only accept unicode character') if (len(uniChar) != 1): raise TypeError('only accept one character, but ' + str(len(uniChar)) + ' chars found.') ch = ord(uniChar[0]) if (ch >= 0x1780): ch -= 0x1780 if (ch < len(khmerCharClasses)): return khmerCharClasses[ch] return 0 def reorder(sin): """ Given an input string of unicode cluster to reorder. The return is the visual based cluster (legacy style) string. 
""" if (type(sin) != unicode): raise TypeError('only accept unicode string') cursor = 0 state = 0 charCount = len(sin) result = u'' while (cursor < charCount): reserved = '' signAbove = '' signAfter = '' base = '' robat = '' shifter = '' vowelBefore = '' vowelBelow = '' vowelAbove = '' vowelAfter = '' coeng = False cluster = '' coeng1 = '' coeng2 = '' shifterAfterCoeng = False while (cursor < charCount): curChar = sin[cursor] kChar = getCharClass(curChar) charClass = kChar & CF_CLASS_MASK state = khmerStateTable[state][charClass] if (state < 0): break ## collect variable for cluster here if (kChar == _xx): reserved = curChar elif (kChar == _sa): # Sign placed above the base signAbove = curChar elif (kChar == _sp): # Sign placed after the base signAfter = curChar elif (kChar == _c1) or (kChar == _c2) or (kChar == _c3): # Consonant if (coeng): if (not coeng1): coeng1 = COENG + curChar else: coeng2 = COENG + curChar coeng = False else: base = curChar elif (kChar == _rb): # Khmer sign robat u17CC robat = curChar elif (kChar == _cs): # Consonant-shifter if (coeng1): shifterAfterCoeng = True shifter = curChar elif (kChar == _dl): # Dependent vowel placed before the base vowelBefore = curChar elif (kChar == _db): # Dependent vowel placed below the base vowelBelow = curChar elif (kChar == _da): # Dependent vowel placed above the base vowelAbove = curChar elif (kChar == _dr): # Dependent vowel placed behind the base vowelAfter = curChar elif (kChar == _co): # Khmer combining mark COENG coeng = True elif (kChar == _va): # Khmer split vowel, see _da vowelBefore = SRAE vowelAbove = sraEcombining[curChar] elif (kChar == _vr): # Khmer split vowel, see _dr vowelBefore = SRAE vowelAfter = sraEcombining[curChar] cursor += 1 # end of while (a cluster has found) # logic of vowel # determine if right side vowel should be marked if (coeng1 and vowelBelow): vowelBelow = MARK + vowelBelow elif ((base == LA or base == NYO) and vowelBelow): vowelBelow = MARK + vowelBelow elif (coeng1 and 
vowelBefore and vowelAfter): vowelAfter = MARK + vowelAfter # logic when cluster has coeng # should coeng be located on left side coengBefore = '' if (coeng1 == CORO): coengBefore = coeng1 coeng1 = '' elif (coeng2 == CORO): coengBefore = MARK + coeng2 coeng2 = '' if (coeng1 or coeng2): # NYO must change to other form when there is coeng if (base == NYO): base = MARK + base # coeng NYO must be marked if (coeng1 == CONYO): coeng1 = MARK + coeng1 if (coeng1 and coeng2): coeng2 = MARK + coeng2 # logic of shifter with base character if (base and shifter): # special case apply to BA only if (vowelAbove) and (base == BA) and (shifter == TRIISAP): vowelAbove = MARK + vowelAbove elif (vowelAbove): shifter = MARK + shifter elif (signAbove == SAMYOKSANNYA) and (shifter == MUUSIKATOAN): shifter = MARK + shifter elif (signAbove and vowelAfter): shifter = MARK + shifter elif (signAbove): signAbove = MARK + signAbove # add another mark to shifter if (coeng1) and (vowelAbove or signAbove): shifter = MARK + shifter if (base == LA or base == NYO): shifter = MARK + shifter # uncomplete coeng if (coeng and not coeng1): coeng1 = COENG elif (coeng and not coeng2): coeng2 = MARK + COENG # render DOTCIRCLE for standalone sign or vowel if (not base) and (vowelBefore or coengBefore or robat or shifter or coeng1 or coeng2 or vowelAfter or vowelBelow or vowelAbove or signAbove or signAfter): base = DOTCIRCLE # place of shifter shifter1 = '' shifter2 = '' if (shifterAfterCoeng): shifter2 = shifter else: shifter1 = shifter specialCaseBA = False if (base == BA) and ((vowelAfter == SRAAA) or (vowelAfter == SRAAU) or (vowelAfter == MARK + SRAAA) or (vowelAfter == MARK + SRAAU)): # SRAAA or SRAAU will get a MARK if there is coeng, redefine to last char vowelAfter = vowelAfter[-1] specialCaseBA = True if (coeng1) and (coeng1[-1] in [BA, YO, SA]): specialCaseBA = False # cluster formation if (specialCaseBA): cluster = vowelBefore + coengBefore + base + vowelAfter + robat + shifter1 + coeng1 + coeng2 
+ shifter2 + vowelBelow + vowelAbove + signAbove + signAfter else: cluster = vowelBefore + coengBefore + base + robat + shifter1 + coeng1 + coeng2 + shifter2 + vowelBelow + vowelAbove + vowelAfter + signAbove + signAfter result += cluster + reserved state = 0 # end of while return result class TestReordering(unittest.TestCase): def testKhmerType(self): # make sure the types are correct self.assertEqual(getCharClass(unichr(0x177F)), 0) self.assertEqual(getCharClass(unichr(0x1780)), _c1) self.assertEqual(getCharClass(unichr(0x1790)), _c1) self.assertEqual(getCharClass(unichr(0x17A0)), _c1) self.assertEqual(getCharClass(unichr(0x17B0)), _c1) self.assertEqual(getCharClass(unichr(0x17C0)), _vr) self.assertEqual(getCharClass(unichr(0x17D0)), _sa) self.assertEqual(getCharClass(unichr(0x17D4)), 0) self.assertEqual(getCharClass(unichr(0x17ff)), 0) def testReordering(self): # low vowel under coeng go deeper self.assertEqual(reorder(u'ខ្នុ'), u'ខ្ន' + MARK + u'ុ') self.assertEqual(reorder(u'ត្រូ'), u'្រត' + MARK + u'ូ') self.assertEqual(reorder(u'ព្យួ'), u'ព្យ' + MARK + u'ួ') # vowel under LA or NYO go deeper self.assertEqual(reorder(u'ឡូ'), u'ឡ' + MARK + u'ូ') self.assertEqual(reorder(u'ញួ'), u'ញ' + MARK + u'ួ') # mark vowel after when there is coeng self.assertEqual(reorder(u'ក្បៀ'), u'េក្ប' + MARK + u'ៀ') # coeng RO must on left side self.assertEqual(reorder(u'ក្រ'), u'្រក') self.assertEqual(reorder(u'ស្ត្រ'), MARK + u'្រស្ត') # mark NYO when there is coeng self.assertEqual(reorder(u'ញ្ជ'), MARK + u'ញ្ជ') # coeng NYO under NYO is marked self.assertEqual(reorder(u'ញ្ញ'), MARK + u'ញ' + MARK + u'្ញ') # coeng NYO under other is normal self.assertEqual(reorder(u'ជ្ញ'), u'ជ្ញ') # coeng1 and coeng2, mark coeng2 self.assertEqual(reorder(u'ក្ស្ម'), u'ក្ស' + MARK + u'្ម') # PA has no modification self.assertEqual(reorder(u'ប៉'), u'ប៉') # special case BA TRISSAP, mark vowel above self.assertEqual(reorder(u'ប៊ី'), u'ប៊' + MARK + u'ី') # base and shifter and vowel above, mark shifter 
self.assertEqual(reorder(u'ប៉ី'), u'ប' + MARK + u'៉ី') self.assertEqual(reorder(u'ស៊ី'), u'ស' + MARK + u'៊ី') # base and muusikatoan and samyok-sannya, mark shifter self.assertEqual(reorder(u'នំប៉័ង'), u'នំប' + MARK + u'៉' + u'័ង') # shifter and sign above and vowel after, mark shifter self.assertEqual(reorder(u'ស៊ាំ'), u'ស' + MARK + u'៊' + u'ាំ') # shifter and sign above, mark sign self.assertEqual(reorder(u'អ៊ំ'), u'អ៊' + MARK + u'ំ') # double mark shifter when there is ceong and sign or vowel above self.assertEqual(reorder(u'ប្ប៉័ង'), u'ប្ប' + MARK + MARK + u'៉' + u'័ង') # uncomplete coeng is still keep self.assertEqual(reorder(u'ក្'), u'ក្') self.assertEqual(reorder(u'ក្ក្'), u'ក្ក' + MARK + u'្') # render standalone vowel or sign with DOTCIRCLE self.assertEqual(reorder(u'ា'), DOTCIRCLE + u'ា') self.assertEqual(reorder(u'េ'), u'េ' + DOTCIRCLE) self.assertEqual(reorder(u'ើ'), u'េ' + DOTCIRCLE + u'ី') self.assertEqual(reorder(u'ំ'), DOTCIRCLE + u'ំ') self.assertEqual(reorder(u'ោះ'), u'េ' + DOTCIRCLE + u'ា' + DOTCIRCLE + u'ះ') # shifter is after ceong self.assertEqual(reorder(u'ន្ស៊ី'), u'ន្ស' + MARK + MARK + u'៊ី') # special case BA and sra A, get alway near to each other self.assertEqual(reorder(u'ប្រា'), u'្របា') self.assertEqual(reorder(u'ប្ដា'), u'បា្ដ') self.assertEqual(reorder(u'ប៉ា'), u'បា៉') self.assertEqual(reorder(u'ប្រៅ'), u'េ្របៅ') self.assertEqual(reorder(u'ប្ដៅ'), u'េបៅ្ដ') self.assertEqual(reorder(u'ប៉ៅ'), u'េបៅ៉') # except there is coeng between them self.assertEqual(reorder(u'ប្បា'), u'ប្បា') # other test of prevention # simple rendering self.assertEqual(reorder(u'គេ'), u'េគ') self.assertEqual(reorder(u'គោ'), u'េគា') self.assertEqual(reorder(u'កៅ'), u'េកៅ') self.assertEqual(reorder(u'លើ'), u'េលី') self.assertEqual(reorder(u'បៀ'), u'េបៀ') self.assertEqual(reorder(u'តឿ'), u'េតឿ') self.assertEqual(reorder(u'កាំ'), u'កាំ') # reorder of more than one cluster self.assertEqual(reorder(u'កាប់គោ'), u'កាប់េគា') self.assertEqual(reorder(u'ខាងលើ'), 
u'ខាងេលី') self.assertEqual(reorder(u'ចំពោះ'), u'ចំេពាះ') # mix with english text self.assertEqual(reorder(u'កកុះwelcomeកុម្ភៈ'), u'កកុះwelcomeកុម្ភៈ') # two shifter or 3 vowel or 4 sign self.assertEqual(reorder(u'៊៊'), DOTCIRCLE + u'៊' + DOTCIRCLE + u'៊') self.assertEqual(reorder(u'ាិី'), DOTCIRCLE + u'ា' + DOTCIRCLE + u'ិ' + DOTCIRCLE + u'ី') self.assertEqual(reorder(u'ំះ័'), DOTCIRCLE + u'ំ' + DOTCIRCLE + u'ះ' + DOTCIRCLE + u'័') # muusikatoan not convert when vowel is not high self.assertEqual(reorder(u'ម៉្ងៃ'), u'ៃម៉្ង') # two coengs with vowel that place on left and right (some bigger than normal) self.assertEqual(reorder(u'កញ្ច្រៀវ'), u'កេ' + MARK + u'្រ' + MARK + u'ញ្ច' + MARK + u'ៀវ') self.assertEqual(reorder(u'កញ្ច្រោង'), u'កេ' + MARK + u'្រ' + MARK + u'ញ្ច' + MARK + u'ាង') # vowel which under coeng go one step deeper self.assertEqual(reorder(u'ប្ដូ'), u'ប្ដ' + MARK + u'ូ') # don't break the sign self.assertEqual(reorder(u'ចុះ'), u'ចុះ') self.assertEqual(reorder(u'នុ៎ះ'), u'នុ៎ះ') # change sign OM, not shifter self.assertEqual(reorder(u'អ៊ុំ'), u'អ៊ុ' + MARK + u'ំ' ) # this is two cluster self.assertEqual(reorder(u'ាក'), DOTCIRCLE + u'ាក') if __name__ == '__main__': unittest.main() khmerconverter v.1.4/modules/legacyConvertHTML.py0000644000175000001440000002537210627160441020724 0ustar k-dausers#!/usr/bin/python # -*- coding: utf8 -*- # Khmer Unicode to Legacy fonts Conversion # (c) 2006 The WordForge Foundation, all rights reserved. # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public License # as published by the Free Software Foundation; either version 2.1 # of the License, or (at your option) any later version. # # See the LICENSE file for more details. 
# # Developed by: # Hok Kakada (hokkakada@khmeros.info) # Keo Sophon (keosophon@khmeros.info) # San Titvirak (titvirak@khmeros.info) # Seth Chanratha (sethchanratha@khmeros.info) # # This module convertes an *.html file from Unicode to legacy Khmer format import sys import os import codecs from legacyReorder import * from legacyConverter import * from FontDataXML import * import unittest import StringIO EURO = unichr(0x20AC) LF = unichr(13) CR = unichr(10) ZWSP = unichr(0x200B) ZWNJ = unichr(0x200C) ZWJ = unichr(0x200D) LEGALCHAR = u" %!@«»,;:(){}*+=-\\/$?" + EURO + LF + CR + ZWNJ + ZWSP + ZWJ STARTKHMER = u"«»" + ZWNJ + ZWSP + ZWJ MINUNIC = 0x1780 MAXUNIC = 0x17FF def convertHTML(inputFile, outputFile, outputFont): """converts Khmer Unicode HTML file to Khmer Legacy HTML file inputfilename: Khmer Unicode HTML file outputfilename: Khmer Legacy HTML file outputFont: font to use for the output in a tag """ if (inputFile == outputFile): raise TypeError('input file and output file must be different!') fd = FontData() if (not fd.isConvertable(outputFont)): raise TypeError('unknown output font ' + outputFont + '!') encode = findEncode(inputFile) try: htmlData = codecs.open(inputFile, encoding = encode) except IOError: raise IOError('Cannot open file "' + inputFile + '" for reading!') try: fout = codecs.open(outputFile, encoding = encode, mode = "w") except IOError: raise IOError('Cannot open file "' + outputFile + '" for writing!') convert(htmlData, fout, outputFont) htmlData.close() fout.close() def convert(finobj, foutobj, outputFont): '''gets input and output as file-like object, and get fontType it analyzes, converts the unicode to legacy and then produce the legacy output.''' fd = FontData() data = fd.unicodeData(outputFont) fontName = fd.defaultFont(outputFont) bodyFound = False # not found insideTag = True insideKhmer = False insideComment = False validChar = False unic = u'' keep = u'' for line in finobj: i = -1 if (not bodyFound): found = line.lower().find(''): 
insideComment = False keep += '-->' i += 2 else: keep += currChar continue if (line[i : i+4] == '" outputFont = 'abc' finobj = StringIO.StringIO(data) foutobj = StringIO.StringIO() convert(finobj, foutobj, outputFont) self.assertEqual(foutobj.getvalue(), u"") # comment in two lines data = u"" outputFont = 'abc' finobj = StringIO.StringIO(data) foutobj = StringIO.StringIO() convert(finobj, foutobj, outputFont) self.assertEqual(foutobj.getvalue(), u"") # comment with unicode inside data = u"" outputFont = 'abc' finobj = StringIO.StringIO(data) foutobj = StringIO.StringIO() convert(finobj, foutobj, outputFont) self.assertEqual(foutobj.getvalue(), u"") # comment with start comment inside data = u"" outputFont = 'abc' finobj = StringIO.StringIO(data) foutobj = StringIO.StringIO() convert(finobj, foutobj, outputFont) self.assertEqual(foutobj.getvalue(), u"") if __name__ == '__main__': unittest.main() khmerconverter v.1.4/modules/test-inherit.xml0000644000175000001440000000110610542134320020177 0ustar k-dausers ]> khmerconverter v.1.4/modules/legacyConvertText.py0000644000175000001440000000552210627160472021103 0ustar k-dausers#!/usr/bin/python # -*- coding: utf8 -*- # Khmer Unicode to Khmer Legacy fonts Conversion # (c) 2006 The WordForge Foundation, all rights reserved. # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public License # as published by the Free Software Foundation; either version 2.1 # of the License, or (at your option) any later version. # # See the LICENSE file for more details. # # Developed by: # Hok Kakada (hokkakada@khmeros.info) # Keo Sophon (keosophon@khmeros.info) # San Titvirak (titvirak@khmeros.info) # Seth Chanratha (sethchanratha@khmeros.info) # # This module creates a Text file in Khmer unicode format from legacy # input file. 
from FontDataXML import FontData import legacyReorder import legacyConverter import unittest import tempfile import os def convertTxtFile(inputFile, outputFile, outputFont): """ This function creates plain text file from the khmer unicode to legacy. """ if (inputFile == outputFile): raise TypeError('Input file and output file must be different!') fd = FontData() if (not fd.isConvertable(outputFont)): raise TypeError('Unknown output font ' + outputFont + ' !') try: fileIn = open(inputFile, 'r') except IOError: raise IOError('Cannot open file "' + inputFile + '" for reading!') try: fileOut = open(outputFile, 'w') except IOError: raise IOError('Cannot open file "' + outputFile + '" for writing!') data = fd.unicodeData(outputFont) # reading line by line from the input file, until end of file. for line in fileIn: result = line.decode('utf-8') result = legacyReorder.reorder(result) result = legacyConverter.converter(result, data) fileOut.write(result) fileIn.close() fileOut.close() class TestConvertTxt(unittest.TestCase): def testSameFile(self): # same file raise error self.assertRaises(TypeError, convertTxtFile, 'file1', 'file1', 'fontname') def testNotFound(self): # raise error when file is unreadable self.assertRaises(TypeError, convertTxtFile, 'file', 'file1', 'fontname') def testConversion(self): handle, filename = tempfile.mkstemp() tmpFile = open(filename, 'w') tmpFile.write(u'កខគ'.encode('utf-8')) tmpFile.close() # create a usable filename for output tmpFile = tempfile.TemporaryFile() outputFilename = tmpFile.name tmpFile.close() convertTxtFile(filename, outputFilename, 'abc') tmpFile = open(outputFilename, 'r') result = tmpFile.readline() tmpFile.close() os.remove(filename) os.remove(outputFilename) self.assertEqual(result.decode('utf-8'), 'kxK') if __name__ == '__main__': unittest.main() khmerconverter v.1.4/modules/test-doubleunicode2.xml0000644000175000001440000000125510542134320021445 0ustar k-dausers ]> khmerconverter 
v.1.4/modules/unicodeReorder.py0000644000175000001440000004170110627161427020402 0ustar k-dausers#!/usr/bin/python # -*- coding: utf8 -*- # Khmer Legacy to Khmer Unicode Conversion and Vice Versa # (c) 2006 The WordForge Foundation, all rights reserved. # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public License # as published by the Free Software Foundation; either version 2.1 # of the License, or (at your option) any later version. # # See the LICENSE file for more details. # # Developed by: # Hok Kakada (hokkakada@khmeros.info) # Keo Sophon (keosophon@khmeros.info) # San Titvirak (titvirak@khmeros.info) # Seth Chanratha (sethchanratha@khmeros.info) # # This program takes input as unordered khmer unicode string and produce # an organized khmer unicode string based on the rule: # baseCharacter [+ [Robat/Shifter] + [Coeng*] + [Shifter] + [Vowel] + [Sign]] import unittest BASE = 1 VOWEL = 2 SHIFTER = 4 # is shifter (muusekatoan or triisap) characer COENG = 8 SIGN = 16 LEFT = 32 # vowel appear on left side of base WITHE = 64 # vowel can be combined with SRA-E WITHU = 128 # vowel can be combined with SRA-U POSRAA = 256 # can be under PO SraA MUUS = 512 # shifter place on specific character TRII = 1024 # shifter place on specific character ROBAT = 2048 # is robat character # important character to test in order to form a cluster RO = unichr(0x179A) PO = unichr(0x1796) SRAAA = unichr(0x17B6) SRAE = unichr(0x17C1) SRAOE = unichr(0x17BE) SRAOO = unichr(0x17C4) SRAYA = unichr(0x17BF) SRAIE = unichr(0x17C0) SRAAU = unichr(0x17C5) SRAII = unichr(0x17B8) SRAU = unichr(0x17BB) TRIISAP = unichr(0x17CA) MUUSIKATOAN = unichr(0x17C9) SA = unichr(0x179F) SAMYOKSANNYA = unichr(0x17D0) NYO = unichr(0x1789) ZWSP = unichr(0x200B) # possible combination for sra E sraEcombining = { SRAII:SRAOE, SRAYA:SRAYA, SRAIE:SRAIE, SRAAA:SRAOO, SRAAU:SRAAU } # list of khmer character in unicode table (start from 1780) 
KHMERCHAR = [ BASE, # ក 0x1780 BASE, # ខ BASE, # គ BASE, # ឃ BASE, # ង BASE, # ច BASE, # ឆ BASE, # ជ BASE, # ឈ BASE + MUUS, # ញ BASE, # ដ BASE, # ឋ BASE, # ឌ BASE, # ឍ BASE, # ណ BASE + POSRAA, # ត BASE, # ថ 0x1790 BASE, # ទ BASE, # ធ BASE + POSRAA, # ន BASE + MUUS, # ប BASE, # ផ BASE, # ព BASE + POSRAA, # ភ BASE, # ម BASE + POSRAA, # យ BASE + POSRAA, # រ BASE + POSRAA, # ល BASE + POSRAA, # វ BASE, # BASE, # BASE + TRII, # ស BASE, # ហ 0x17A0 BASE, # ឡ BASE + TRII, # អ BASE, # អ BASE, # អា BASE, # ឥ BASE, # ឦ BASE, # ឧ BASE, # BASE, # ឩ BASE, # ឪ BASE, # ឫ BASE, # ឬ BASE, # ឭ BASE, # ឮ BASE, # ឯ BASE, # 0x17B0 BASE, # BASE, # ឲ BASE, # 0, 0, # VOWEL + WITHE + WITHU, # ា VOWEL + WITHU, # ិ VOWEL + WITHE + WITHU, # ី VOWEL + WITHU, # ឹ VOWEL + WITHU, # ឺ VOWEL, # ុ VOWEL, # ូ VOWEL, # ួ VOWEL + WITHU, # ើ VOWEL + WITHE, # ឿ VOWEL + WITHE, # ៀ 0x17C0 VOWEL + LEFT, # េ VOWEL + LEFT, # ែ VOWEL + LEFT, # ៃ VOWEL, # ោ VOWEL + WITHE, # ៅ SIGN + WITHU, # ំ SIGN, # ះ SIGN, # ៈ SHIFTER, # ៉ SHIFTER, # ៊ SIGN, # ់ ROBAT, # ៌ SIGN, # ៍ SIGN, # SIGN, # ៏ SIGN + WITHU, # ័​​ 0x17D0 SIGN, # COENG, # ្ SIGN # ] def khmerType(uniChar): """input one unicode character; output an integer which is the Khmer type of the character or 0""" if (type(uniChar) != unicode): raise TypeError('only accept one character') if (len(uniChar) != 1): raise TypeError('only accept one character, but ' + str(len(uniChar)) + ' chars found.') ch = ord(uniChar[0]) if (ch >= 0x1780): ch -= 0x1780 if (ch < len(KHMERCHAR)): return KHMERCHAR[ch] return 0 def reorder(sin): """ take khmer unicode string in visual-based cluster and return the rule-based cluster based on: baseCharacter [+ [Robat/Shifter] + [Coeng*] + [Shifter] + [Vowel] + [Sign]] and if the input is not unicode, return what it is input. 
""" if (type(sin) != unicode): raise TypeError('only accept unicode string') result = u'' sinLimit = len(sin)-1 i = -1 while i < sinLimit: # flush cluster baseChar = '' robat = '' shifter1 = '' shifter2 = '' coeng1 = '' coeng2 = '' vowel = '' poSraA = False sign = '' keep = '' cluster = '' while i < sinLimit: i += 1 sinType = khmerType(sin[i]) if (sinType & BASE): if (baseChar): # second baseChar -> end of cluster i -= 1 # continue with the found character break baseChar = sin[i] keep = '' continue elif (sinType & ROBAT): if (robat): # second robat -> end of cluster i -= 1 # continue with the found character break robat = sin[i] keep = '' continue elif (sinType & SHIFTER): if (shifter1): # second shifter -> end of cluster i -= 1 # continue with the found character break shifter1 = sin[i] keep = '' continue elif (sinType & SIGN): if (sign): # second sign -> end of cluster i -= 1 # continue with the found character break sign = sin[i] keep = '' continue elif (sinType & COENG): if (i == sinLimit): coeng1 = sin[i] break # if it is coeng RO (and consonent is not blank), it must belong to next cluster # so finish this cluster if ((sin[i+1] == RO) and (baseChar)): i -= 1 break # no coeng yet so dump coeng to coeng1 if (coeng1 == ''): coeng1 = sin[i : i+2] i += 1 keep = '' # coeng1 is coeng RO, the cluster can have two coeng, dump coeng to coeng2 elif (coeng1[1] == RO): coeng2 = sin[i : i+2] i += 1 keep = '' else: i -= 1 break elif (sinType & VOWEL): if (vowel == ''): # if it is sra E ES AI (and consonent is not blank), it must belong to next cluster, # so finish this cluster if ((sinType & LEFT) and (baseChar)): i -= 1 break # give vowel a value found in the unorganized cluster vowel = sin[i] keep = '' elif ((baseChar == PO) and (not poSraA) and ((sin[i] == SRAAA) or (vowel == SRAAA))): poSraA = True if vowel == SRAAA: vowel = sin[i] keep = '' else: # test if sra E is follow by sin which could combine with the following if (vowel == SRAE) and (sinType & WITHE): # give 
vowel a real sra by eleminate leading sra E vowel = sraEcombining[sin[i]] keep = '' # test if vowel can be combine with sin[i] (e.g. sra U and sra I or vice versa) elif ((vowel == SRAU and (sinType & WITHU)) or ((khmerType(vowel) & WITHU) and sin[i] == SRAU)): # vowel is not Sra I, II, Y, YY, transfer value from sin[i] to vowel if (not(khmerType(vowel) & WITHU)): vowel = sin[i] # select shifter1 base on specific consonants if (baseChar and (khmerType(baseChar) & TRII)): shifter1 = TRIISAP else: shifter1 = MUUSIKATOAN # examine if shifter1 should move shifter2 (base on coeng SA) elif (vowel == SRAE) and (sin[i] == SRAU): if (baseChar and (khmerType(baseChar) & TRII)): shifter1 = TRIISAP else: shifter1 = MUUSIKATOAN else: # sign can't be combine -> end of cluster i -= 1 # continue with the found character break else: # other than khmer -> end of cluster # continue with the next character if (sin[i] == ZWSP): # avoid breaking of cluster if meet zwsp # and move zwsp to end of cluster keep = ZWSP else: keep = sin[i] break # end of while loop # Organization of a cluster: if ((vowel == SRAU) and (sign) and (khmerType(sign) & WITHU)): # samyoksanha + sraU --> MUUS + samyoksanha if (sign == SAMYOKSANNYA): vowel = '' shifter1 = MUUSIKATOAN # examine if shifter1 should move shifter2 (base on coeng) if (shifter1 and coeng1): if (khmerType(coeng1[1]) & TRII): shifter2 = TRIISAP shifter1 = '' elif (khmerType(coeng1[1]) & MUUS): shifter2 = MUUSIKATOAN shifter1 = '' # examine if PO + sraA > NYO, this case can only determin # here since it need all element # coeng2 is priority (if coeng2 exist, coeng1 is always coRO) underPoSraA = coeng2 or coeng1 if (len(underPoSraA) == 2): underPoSraA = khmerType(underPoSraA[1]) & POSRAA # test if coeng is allow under PO + SRAA if ((poSraA and (not underPoSraA) and vowel) or ((baseChar == PO) and (vowel == SRAAA) and (not underPoSraA))): # change baseChar to letter NYO baseChar = NYO if ((vowel == SRAAA) and (not poSraA)): vowel = '' # PO + SraA 
+ SraE if ((poSraA) and (vowel == SRAE)): # PO + sraA is not NYO and there is leading sraE they should be recombined vowel = SRAOO # Rule of cluster # if there are two coeng, ceong1 is always coRO so put it after coeng2 cluster = baseChar + robat + shifter1 + coeng2 + coeng1 + shifter2 + vowel + sign result = result + cluster + keep return result class TestReordering(unittest.TestCase): def testKhmerType(self): # make sure the types are correct self.assertEqual(khmerType(unichr(0x177F)), 0) self.assertEqual(khmerType(unichr(0x1780)) & BASE, BASE) self.assertEqual(khmerType(unichr(0x17B6)), VOWEL + WITHE + WITHU) self.assertEqual(khmerType(unichr(0x17C9)), SHIFTER) self.assertEqual(khmerType(unichr(0x17CB)), SIGN) self.assertEqual(khmerType(unichr(0x17D4)), 0) self.assertEqual(khmerType(unichr(0x17ff)), 0) def testReorder(self): # make sure it output correctly self.assertEqual(reorder(u'កករ'), u'កករ') # make sure it reorder SHIFTER self.assertEqual(reorder(u'បា៉'), u'ប៉ា') self.assertEqual(reorder(u'បូ៊'), u'ប៊ូ') # make sure sra A + OM produce same as sra OM + A self.assertEqual(reorder(u'របំា'), u'របាំ') self.assertEqual(reorder(u'របាំ'), u'របាំ') # make sure ceong go where is suppose to be self.assertEqual(reorder(u'្រកដាស្របដាល់កណ្ដាល'), u'ក្រដាសប្រដាល់កណ្ដាល') # correct leading sra E if there are allow sra (e.g. 
sra A) self.assertEqual(reorder(u'បេង្គាល ខាងេលី េសៀវេភៅ'), u'បង្គោល ខាងលើ សៀវភៅ') # correct muus of triisap self.assertEqual(reorder(u'សីុបុីអុី'), u'ស៊ីប៉ីអ៊ី') # correct use of shifter on coeng SA self.assertEqual(reorder(u'ន្សីុ'), u'ន្ស៊ី') # sra E is at the right place self.assertEqual(reorder(u'េគ្របែឡងគ្នា'), u'គេប្រឡែងគ្នា') # case PO + a + coeng self.assertEqual(reorder(u'បពា្ញា'), u'បញ្ញា') self.assertEqual(reorder(u'បព្ជាី'), u'បញ្ជី') self.assertEqual(reorder(u'កេ្រព្ជាាង'), u'កញ្ជ្រោង') # english text self.assertEqual(reorder(u'this is english text'), u'this is english text') self.assertEqual(reorder(u'ចំេពាះ'), u'ចំពោះ') self.assertEqual(reorder(u'្របឹក្សាធម្មនុពា្ញ'), u'ប្រឹក្សាធម្មនុញ្ញ') self.assertEqual(reorder(u'ៃហប៊ី'), u'ហៃប៊ី') self.assertEqual(reorder(u'បូពា៌'), u'បូព៌ា') self.assertEqual(reorder(u'បានេស្នី'), u'បានស្នើ') self.assertEqual(reorder(u'្K្ក្េ'), u'្K្ក្េ') self.assertEqual(reorder(u'្'), u'្') # ignore the zero width space seperation self.assertEqual(reorder(u'រ'+unichr(0x200B)+u'ដ្ឋ'+unichr(0x200B)+u'ាភិប'+unichr(0x200B) +u'ាល'), u'រ'+unichr(0x200B)+u'ដ្ឋាភិបាល') self.assertEqual(reorder(u'េ្របី'), u'ប្រើ') self.assertEqual(reorder(u'ប្បុ័ង'), u'ប្ប៉័ង') self.assertEqual(reorder(u'ប្ប័ុង'), u'ប្ប៉័ង') self.assertEqual(reorder(u'េសីុប'), u'ស៊ើប') self.assertEqual(reorder(u'េបីុង'), u'ប៉ើង') self.assertEqual(reorder(u'េសុីប'), u'ស៊ើប') self.assertEqual(reorder(u'កំុេជា'), u'កុំជោ') def testShifter(self): self.assertEqual(reorder(u'៊៊'), u'៊៊') # TWO SHIFTER def testSign(self): self.assertEqual(reorder(u'ះះ'), u'ះះ') # TWO SIGN def testAllCase(self): # resturn two cluster if two bases input self.assertEqual(reorder(u'កង'), u'កង') # resturn two cluster if two robats input self.assertEqual(reorder(u'៌៌'), u'៌៌') # resturn two cluster if two shifters input self.assertEqual(reorder(u'៉៊'), u'៉៊') # ceong ro is belong to the next cluster self.assertEqual(reorder(u'ប្រក'), u'បក្រ') # two ceongs: coeng ro is the sencode ceong 
# -*- coding: utf8 -*-

# Khmer Unicode to Khmer Legacy fonts Conversion
# (c) 2006 The WordForge Foundation, all rights reserved.
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public License
# as published by the Free Software Foundation; either version 2.1
# of the License, or (at your option) any later version.
#
# See the LICENSE file for more details.
#
# Developed by:
#       Jens Herden (jens@khmeros.info)

import string
import sys
import unittest
from types import *
from xml.dom.minidom import parse

# Python 2.3 only has sets as a module
try:
    foo = set()
    del(foo)
except NameError:
    from sets import Set as set

MAXUNI = 0x7f       # length of unicode table
MAXLEG = 0x100      # length of legacy table
MAXLENGTH = 10      # maximum length of allowed unicode replacement
LEGSEP = ";"        # separator for legacy attributes


def beautify(fontname):
    """Return fontname in lowercase with dashes and underscores replaced by spaces."""
    return fontname.lower().replace("-", " ").replace("_", " ")


class FontData:
    """
    Reads the font data from an XML file into a DOM tree but creates the
    data structures for the individual fonts only on demand.

    All state is kept in class attributes, so every instance shares the
    data loaded by the most recent readXML() call.
    """

    # cache for the font data, keyed by font type
    legacyFontData = None
    unicodeFontData = None
    # maps beautified font names to font types, which are the keys of
    # legacyFontData & unicodeFontData
    fontNames = None
    # maps font types to DOM tree elements for reading on demand
    fontElements = None
    # maps font types to the font type they inherit from
    parents = None

    class Error(Exception):
        """Base class for exceptions raised by this class."""
        pass

    class XMLDataError(Error):
        """Raised for errors in the internal structure of the XML file."""
        pass

    class FontNotFoundError(Error):
        """Raised when the wanted font is not available."""
        pass

    def __init__(self):
        """Read "fontdata.xml" into the class variables unless already loaded."""
        # only read if variables are empty
        if (not FontData.fontNames):
            self.readXML("fontdata.xml")

    def listFontTypes(self):
        """Return the sorted list of distinct font types, e.g. "abc-zwsp"."""
        fonttypes = list(set(FontData.fontNames.values()))
        fonttypes.sort()
        return fonttypes

    def listFontNames(self):
        """Return the sorted list of all known (beautified) font names."""
        names = list(FontData.fontNames.keys())
        names.sort()
        return names

    def listFontNamesForType(self, fonttype):
        """
        Return the sorted list of font names that map to fonttype.

        The entry that is merely the beautified type name itself is left out.
        """
        nameList = []
        for name, knownType in FontData.fontNames.items():
            if ((knownType == fonttype) and (beautify(knownType) != name)):
                nameList.append(name)
        nameList.sort()
        return nameList

    def typeForFontname(self, fontname):
        """
        Return the font type for fontname.

        Raises FontNotFoundError if the beautified name is unknown.
        """
        name = beautify(fontname)
        if (name not in FontData.fontNames):
            raise self.FontNotFoundError("Font: " + name + " is unknown.")
        return FontData.fontNames[name]

    def isConvertable(self, fontname):
        """Return True if fontname is known, else return False."""
        try:
            self.typeForFontname(fontname)
        except (self.FontNotFoundError, AttributeError, TypeError):
            # unknown name, a non-string argument or no data loaded yet:
            # all of these simply mean "not convertable"
            return False
        return True

    def defaultFont(self, fonttype):
        """
        Return the "default" attribute of the font element for fonttype.

        Falls back to fonttype itself when the type is unknown or the
        element declares no default.
        """
        if (fonttype not in FontData.fontElements):
            return fonttype
        element = FontData.fontElements[fonttype]
        fontname = element.getAttribute("default")
        if (fontname):
            return fontname
        return fonttype

    def unicodeData(self, fontname):
        """
        Return the (list of dicts, table) pair used to convert from unicode.

        Raises FontNotFoundError for an unknown fontname and XMLDataError
        when the XML defines a mapping twice.
        """
        fonttype = self.typeForFontname(fontname)
        # read if data not available
        if (fonttype not in FontData.unicodeFontData):
            self.__readUnicodeData(fonttype)
        return FontData.unicodeFontData[fonttype]

    def legacyData(self, fontname):
        """
        Return the [dict, table] pair used to convert from legacy encoding.

        Raises FontNotFoundError for an unknown fontname and XMLDataError
        when the XML defines a mapping twice.
        """
        fonttype = self.typeForFontname(fontname)
        # read if data not available
        if (fonttype not in FontData.legacyFontData):
            self.__readLegacyData(fonttype)
        return FontData.legacyFontData[fonttype]

    # List and Check Encoding
    encodingData = ["cp1252", "utf-8", "latin-1", "iso-8859-1"]

    def listEncodingTypes(self):
        """Return the list of encoding descriptions for display."""
        return ["Plain Text (cp1252)", "Plain Text (latin-1/iso-8859-1)", "Unicode (utf-8)"]

    def canDecode(self, encoding):
        """Return True if encoding (case-insensitive) is in encodingData."""
        return encoding.lower() in self.encodingData

    # convert from other encoding to cp1252
    def changeEncoding(self, sin, encoding):
        """
        Re-encode the byte string sin from encoding to cp1252.

        sin is returned untouched when encoding is cp1252 or is not listed
        in encodingData.  Raises TypeError("Codecs Error") when sin cannot
        be decoded with encoding or the result cannot be encoded as cp1252.
        """
        if (self.canDecode(encoding) and encoding.lower() != 'cp1252'):
            try:
                sin = sin.decode(encoding)
                sin = sin.encode('cp1252')
            except UnicodeError:
                # UnicodeError covers both the decode and the encode failure
                raise TypeError("Codecs Error")
        return sin

    def __decodeLegacy(self, attribute):
        """
        Convert a legacy attribute such as "0x61;98" into the string "ab".

        The pieces are parsed as integer literals (base detected from the
        prefix); anything else raises ValueError instead of being evaluated.
        """
        s = ''
        for piece in attribute.split(LEGSEP):
            if len(piece) > 0:
                s += chr(int(piece, 0))
        return s

    def readXML(self, filename):
        """
        Parse filename and rebuild the name/type/parent indexes.

        The file is looked up as given and then below "modules/".
        Raises IOError when it cannot be opened and XMLDataError when it
        contains no fonts, a font type twice, or an unknown parent.
        """
        try:
            datasource = open(filename)
        except IOError:
            try:
                datasource = open('modules/' + filename)
            except IOError:
                raise IOError('Cannot open ' + filename + ' for reading!')

        try:
            FontData.dom = parse(datasource)
        finally:
            # the DOM is fully built by parse(), the file is no longer needed
            datasource.close()

        FontData.fontNames = dict()
        FontData.fontElements = dict()
        FontData.legacyFontData = dict()
        FontData.unicodeFontData = dict()
        FontData.parents = dict()

        fonts = FontData.dom.getElementsByTagName("font")
        if (len(fonts) == 0):
            raise self.XMLDataError("no Fonts found in " + filename)

        for font in fonts:
            fonttype = font.getAttribute("type").lower()
            if (fonttype in FontData.fontElements):
                raise self.XMLDataError("Font: " + fonttype + " is defined twice in " + filename)

            inherit = font.getAttribute("inherit").lower()
            if (inherit):
                # a parent must be declared before the font inheriting from it
                if (inherit not in FontData.fontElements):
                    raise self.XMLDataError("Font " + fonttype + " can not inherit unkown font " + inherit + " in " + filename)
                # map font to parent
                FontData.parents[fonttype] = inherit

            # map name to element
            FontData.fontElements[fonttype] = font
            hidden = (font.getAttribute("hidden").lower() == 'true')
            if (not hidden):
                # add default fonttype to known fontnames
                FontData.fontNames[beautify(fonttype)] = fonttype
                # add alias names
                for alias in font.getElementsByTagName("alias"):
                    FontData.fontNames[beautify(alias.getAttribute("name"))] = fonttype

    def __readUnicodeData(self, fonttype):
        """Read the unicode data for one font from the DOM tree into the cache."""
        if (fonttype not in FontData.fontElements):
            raise self.FontNotFoundError("Font: " + fonttype + " is unknown.")

        font = FontData.fontElements[fonttype]
        # check and resolve inheritance
        if (fonttype in FontData.parents):
            parent = FontData.parents[fonttype]
            # do we need to load the data?
            if (parent not in FontData.unicodeFontData):
                self.__readUnicodeData(parent)
            # copy variables from parent so the child can not alter them
            unicodeDicts = [d.copy() for d in FontData.unicodeFontData[parent][0]]
            unicodeTable = list(FontData.unicodeFontData[parent][1])
        else:
            # init variables
            unicodeDicts = list()
            unicodeTable = [""] * MAXUNI

        maps = font.getElementsByTagName("maps")
        if (len(maps) > 0):
            self.__readGlobalUni(maps[0], unicodeTable, unicodeDicts)
            self.__readFromUnicode(maps[0], unicodeDicts)

        FontData.unicodeFontData[fonttype] = (unicodeDicts, unicodeTable)

    def __readLegacyData(self, fonttype):
        """Read the legacy data for one font from the DOM tree into the cache."""
        if (fonttype not in FontData.fontElements):
            raise self.FontNotFoundError("Font: " + fonttype + " is unknown.")

        font = FontData.fontElements[fonttype]
        # check and resolve inheritance
        if (fonttype in FontData.parents):
            parent = FontData.parents[fonttype]
            # do we need to load the data?
            if (parent not in FontData.legacyFontData):
                self.__readLegacyData(parent)
            # copy variables from parent so the child can not alter them
            legacyDict = FontData.legacyFontData[parent][0].copy()
            legacyTable = list(FontData.legacyFontData[parent][1])
        else:
            # init variables: by default every code maps to itself
            legacyDict = dict()
            legacyTable = [unichr(i) for i in range(MAXLEG)]

        maps = font.getElementsByTagName("maps")
        if (len(maps) > 0):
            self.__readGlobal(maps[0], legacyTable, legacyDict)
            self.__readToUnicode(maps[0], legacyDict)

        FontData.legacyFontData[fonttype] = [legacyDict, legacyTable]

    def __readToUnicode(self, element, legacyDict):
        """Read the <tounicode> replacements of element into legacyDict."""
        maps = element.getElementsByTagName("tounicode")
        if (len(maps) < 1):
            return

        for entry in maps[0].getElementsByTagName("map"):
            uni = entry.getAttribute("unicode")
            legacy = self.__decodeLegacy(entry.getAttribute("legacy").encode("cp1252"))
            l = len(legacy)
            if (l > 0 and l < MAXLENGTH):
                if (legacy not in legacyDict):
                    legacyDict[legacy] = uni
                else:
                    raise self.XMLDataError("Legacy character " + legacy + " defined twice in toUnicode.")

    def __readFromUnicode(self, element, unicodeDicts):
        """Read the <fromunicode> replacements of element into unicodeDicts."""
        maps = element.getElementsByTagName("fromunicode")
        if (len(maps) < 1):
            return

        for entry in maps[0].getElementsByTagName("map"):
            uni = entry.getAttribute("unicode")
            legacy = self.__decodeLegacy(entry.getAttribute("legacy"))
            l = len(uni)
            if (l > 0 and l < MAXLENGTH):
                self.__addToUniData(uni, legacy, unicodeDicts)

    def __readGlobalUni(self, element, unicodeTable, unicodeDicts):
        """
        Read the <global> replacements of element for the unicode direction.

        Single characters in the range 0x1780 .. 0x1780 + MAXUNI - 1 go into
        unicodeTable, everything else into unicodeDicts.
        """
        maps = element.getElementsByTagName("global")
        if (len(maps) < 1):
            return

        for entry in maps[0].getElementsByTagName("map"):
            uni = entry.getAttribute("unicode")
            legacy = self.__decodeLegacy(entry.getAttribute("legacy"))
            l = len(uni)
            if (l == 1):
                i = ord(uni) - 0x1780
                if (i >= 0 and i < MAXUNI):
                    if (unicodeTable[i] == ""):
                        unicodeTable[i] = legacy
                    else:
                        raise self.XMLDataError("Unicode character " + hex(ord(uni)) + " defined twice in global.")
                else:
                    # single character outside of the table range
                    self.__addToUniData(uni, legacy, unicodeDicts)
            elif (l > 1 and l < MAXLENGTH):
                self.__addToUniData(uni, legacy, unicodeDicts)

    def __readGlobal(self, element, legacyTable, legacyDict):
        """
        Read the <global> replacements of element for the legacy direction.

        Single legacy characters go into legacyTable, longer sequences
        into legacyDict.
        """
        maps = element.getElementsByTagName("global")
        if (len(maps) < 1):
            return

        for entry in maps[0].getElementsByTagName("map"):
            legacy = self.__decodeLegacy(entry.getAttribute("legacy").encode("cp1252"))
            uni = entry.getAttribute("unicode")
            l = len(legacy)
            if (l == 1):
                i = ord(legacy)
                if (i >= 0 and i < MAXLEG):
                    # a slot still holding its own code has not been mapped yet
                    if (legacyTable[i] == unichr(i)):
                        legacyTable[i] = uni
                    else:
                        raise self.XMLDataError("Legacy character " + hex(i) + " defined twice in global.")
            elif (l > 0 and l < MAXLENGTH):
                if (legacy not in legacyDict):
                    legacyDict[legacy] = uni
                else:
                    raise self.XMLDataError("Legacy character " + legacy + " defined twice in global.")

    def __addToUniData(self, unicode, legacy, data):
        """
        Put the unicode to legacy mapping into data[len(unicode) - 1].

        data is a list of dicts indexed by key length minus one; it gets
        new dicts appended if needed.  Raises XMLDataError if the unicode
        string is already present.  Strings that are empty or not shorter
        than MAXLENGTH are ignored.
        """
        l = len(unicode)
        # sanity check
        if (l > 0 and l < MAXLENGTH):
            # make sure we have enough dict's
            while (len(data) < l):
                data.append(dict())
            # insert into dict
            if (unicode not in data[l - 1]):
                data[l - 1][unicode] = legacy
            else:
                raise self.XMLDataError("Unicode string " + unicode + " already in datastructure.")
self.assertRaises(self.dataClass.FontNotFoundError, self.dataClass.legacyData, "hidden") # do we get for all fonts data? fonts = self.dataClass.listFontNames() for font in fonts: data = self.dataClass.legacyData(font) self.assertEqual(len(data), 2) self.assertEqual(type(data[0]), DictType) self.assertEqual(type(data[1]), ListType) def testLegacyData2(self): # test specific fonts; # abc-zwsp & abc-3 inherit from abc for font in ['abc', 'abc-3', 'abc-zwsp']: data = self.dataClass.legacyData(font) self.assertEqual(data[0]['b' + chr(255)], u"ឫ") self.assertEqual(data[1][ord("a")], u"កក") self.assertEqual(data[1][ord("b")], u"ស") self.assertEqual(data[1][ord("c")], unichr(0x200B)) for font in ['abc-3', 'abc-zwsp']: data = self.dataClass.legacyData(font) self.assertEqual(data[1][ord("1")], u"១") self.assertEqual(data[0]['1?'.encode('cp1252')], "") # do we get for all fonts data? fonts = self.dataClass.listFontNames() for font in fonts: data = self.dataClass.legacyData(font) self.assertEqual(len(data), 2) self.assertEqual(type(data[0]), DictType) self.assertEqual(type(data[1]), ListType) def testUnicodeData(self): self.assertRaises(self.dataClass.FontNotFoundError, self.dataClass.unicodeData, "unkownFontName%%%%%") # the font 'hidden' is in the XML but should not be visible self.assertRaises(self.dataClass.FontNotFoundError, self.dataClass.unicodeData, "hidden") # do we get for all fonts data? 
fonts = self.dataClass.listFontNames() for font in fonts: data = self.dataClass.unicodeData(font) self.assertEqual(len(data), 2) self.assertEqual(type(data[0]), ListType) for d in data[0]: self.assertEqual(type(d), DictType) self.assertEqual(type(data[1]), ListType) def testUnicodeData2(self): # abc-zwsp & abc-3 inherit from abc for font in ['abc', 'abc-3', 'abc-zwsp']: data = self.dataClass.unicodeData(font) self.assertEqual(data[0][0][unichr(0x200B)], "c") self.assertEqual(data[0][1][u"កក"], "a") self.assertEqual(data[0][2][u"ខ្រ"], "__") self.assertEqual(data[1][ord(u"ស") - 0x1780], "b") for font in ['abc-3', 'abc-zwsp']: data = self.dataClass.legacyData(font) self.assertEqual(data[1][ord("1")], u"១") self.assertEqual(data[0]['1?'.encode('cp1252')], "") def testIsConvertable(self): self.failIf(self.dataClass.isConvertable("unkownFontName%%%%%%")) for font in self.dataClass.listFontNames(): self.assert_(self.dataClass.isConvertable(font)) self.assert_(self.dataClass.isConvertable(font.upper())) self.assert_(self.dataClass.isConvertable(font.replace(" ", "-"))) self.assert_(self.dataClass.isConvertable(font.replace(" ", "_"))) def testAddToUniData(self): unicode = u"abcDEFG" legacy = "yes" data = list() self.dataClass._FontData__addToUniData(unicode, legacy, data) self.assertEqual(type(data), ListType) self.assertEqual(len(data), len(unicode)) self.assertEqual(data[len(unicode) - 1][unicode], legacy) def testFontdataxml(self): """ test that all data can be read without error """ self.dataClass.readXML("fontdata.xml") fonts = self.dataClass.listFontNames() for font in fonts: print font self.dataClass.unicodeData(font) self.dataClass.legacyData(font) def testListFontNamesForType(self): """ test that list of font names is correct """ self.dataClass.readXML("fontdata.xml") # the type should not be in the list of fonts for this type for fontType in self.dataClass.listFontTypes(): fontList = self.dataClass.listFontNamesForType(fontType) for font in fontList: 
self.assert_(not font == fontType) if __name__ == '__main__': unittest.main() khmerconverter v.1.4/modules/test-doublelegacy.xml0000644000175000001440000000124310542134320021176 0ustar k-dausers ]> khmerconverter v.1.4/modules/test-fontdata.xml0000644000175000001440000000353610542134320020346 0ustar k-dausers ]> khmerconverter v.1.4/modules/test-nofonts.xml0000644000175000001440000000071510542134320020230 0ustar k-dausers ]> khmerconverter v.1.4/modules/test_fontdata.py0000644000175000001440000000410110627161443020257 0ustar k-dausers#!/usr/bin/python # -*- coding: utf8 -*- # Khmer Unicode to Khmer Legacy fonts Conversion # (c) 2006 The WordForge Foundation, all rights reserved. # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public License # as published by the Free Software Foundation; either version 2.1 # of the License, or (at your option) any later version. # # See the LICENSE file for more details. # # Developed by: # Jens Herden (jens@khmeros.info) import unittest from FontDataXML import FontData # testing the content of fontdata.xml class TestFontData(unittest.TestCase): dataClass = FontData() def setUp(self): self.dataClass.readXML("fontdata.xml") def testABCDataLegacy(self): data = self.dataClass.legacyData("abc") self.assertEqual(data[1][0xb2], unichr(0x201c)) self.assertEqual(data[1][0xb3], unichr(0x201d)) def testABCDataUnicode(self): data = self.dataClass.unicodeData("abc") self.assertEqual(data[0][0][unichr(0x201c)], chr(0xb2)) self.assertEqual(data[0][0][unichr(0x201d)], chr(0xb3)) self.assertEqual(data[0][0][u'«'], chr(0xb2)) self.assertEqual(data[0][0][u'»'], chr(0xb3)) def testLimonDataLegacy(self): data = self.dataClass.legacyData("limon") self.assertEqual(data[1][0x7b], unichr(0x201c)) self.assertEqual(data[1][0x7d], unichr(0x201d)) def testLimonDataUnicode(self): data = self.dataClass.unicodeData("limon") self.assertEqual(data[0][0][unichr(0x201c)], chr(0x7b)) 
self.assertEqual(data[0][0][unichr(0x201d)], chr(0x7d)) self.assertEqual(data[0][0][u'«'], chr(0x7b)) self.assertEqual(data[0][0][u'»'], chr(0x7d)) self.assertEqual(data[0][0][u'ឲ'], chr(0x5b)) ## def testDumpLimon(self): ## data = self.dataClass.unicodeData("limon") ## print "data: ", data[1] ## print "data0: ", data[0][0] ## print "data1: ", data[0][1] ## print "data2: ", data[0][2] if __name__ == '__main__': unittest.main() khmerconverter v.1.4/modules/test-doubleunicode.xml0000644000175000001440000000124710542134320021364 0ustar k-dausers ]> khmerconverter v.1.4/modules/converterGUI.py0000644000175000001440000003235510627162066020012 0ustar k-dausers#Khmer converter # (c) 2006 The WordForge Foundation, all rights reserved. # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public License # as published by the Free Software Foundation; either version 2.1 # of the License, or (at your option) any later version. # # See the LICENSE file for more details. 
# # Developed by: # Hok Kakada (hokkakada@khmeros.info) # Keo Sophon (keosophon@khmeros.info) # San Titvirak (titvirak@khmeros.info) # Seth Chanratha (sethchanratha@khmeros.info) # # This module create a graphical user interface using basically on Tix module from Tix import * from mimetypes import MimeTypes import tkMessageBox import FontDataXML import os import tkFont import __version__ # Python 2.3 only has sets as a module try: foo = set() del(foo) except: from sets import Set as set # constants TOUNICODE = 'unicode' TOLEGACY = 'legacy' TYPETEXT = 'Plain Text' TYPEODT = 'OpenOffice.org Writer (*.odt)' TYPEHTML = 'Web Page, HTML' CODEISO = 'Plain Text (latin-1/iso-8859-1)' CODEUTF = 'Unicode (utf-8)' DEFAULTFONTSIZE = 11 # default value def setDefault(): directionVar.set(TOUNICODE) cmbDocType.pick(0) cmbEncoding.pick(0) cmbFontInput.pick(INDEXABCZWSP) cmbFontOutput.pick(INDEXKHMEROS) docTypeVar.set(TYPETEXT) fntInput.entry.delete(0, END) outputFileVar.set('') spnSize['state'] = NORMAL fontSizeVar.set(DEFAULTFONTSIZE) spnSize['state'] = DISABLED chkSize.deselect() evUnicodeClick() checkStatus() def checkConvertible(): if (inputFileVar.get() and outputFileVar.get()): btnConvert['state'] = NORMAL else: btnConvert['state'] = DISABLED def checkStatus(): cmbEncoding['state'] = DISABLED cmbEncoding.set_silent(' ') if (directionVar.get() == TOUNICODE): cmbFontInput['state'] = NORMAL cmbFontInput.pick(INDEXABCZWSP) if (docTypeVar.get() == TYPEODT): cmbFontOutput['state'] = NORMAL cmbFontOutput.pick(INDEXKHMEROS) cmbFontInput['state'] = DISABLED cmbFontInput.set_silent(' ') chkSize['state'] = NORMAL elif (docTypeVar.get() == TYPETEXT): cmbEncoding['state'] = NORMAL cmbEncoding.pick(0) cmbFontOutput['state'] = DISABLED cmbFontOutput.set_silent(' ') chkSize['state'] = DISABLED spnSize['state'] = DISABLED chkSize.deselect() elif (docTypeVar.get() == TYPEHTML): cmbFontOutput['state'] = DISABLED cmbFontOutput.set_silent(' ') chkSize['state'] = DISABLED spnSize['state'] = 
def evConvert():
    """
    Handler of the Convert button.

    Collects the settings from the widgets, asks before overwriting an
    existing output file, runs the converter matching direction and
    document type, and reports success or failure in a message box.
    """
    btnConvert.focus_set()

    # translate the displayed encoding description into a codec name;
    # anything else (including the blank entry) means cp1252
    codecs = {CODEISO: 'iso-8859-1', CODEUTF: 'utf-8'}
    encoding = codecs.get(encodingVar.get(), 'cp1252')

    # the output font has priority over the input font
    if (cmbFontOutput['state'] == NORMAL):
        fontVar = fontOutVar
    else:
        fontVar = fontInVar
    font = fontVar.get().lstrip()

    # no font size override while the size control is disabled
    fontSize = None
    if (spnSize['state'] != DISABLED):
        fontSize = fontSizeVar.get()

    toUnicode = (directionVar.get() == TOUNICODE)
    inputFile = inputFileVar.get()
    docType = docTypeVar.get()
    outputFile = outputFileVar.get()

    # never overwrite an existing file without asking
    if (os.path.exists(outputFile)):
        answer = tkMessageBox.askquestion('Warning', 'The output file does already exist!\n\nDo you want to overwrite?')
        if (answer == 'no'):
            return

    try:
        # converter modules are imported on demand, only the one needed
        if (docType == TYPEODT):
            if (toUnicode):
                import unicodeConvertOdt
                converter = unicodeConvertOdt.unicodeConvertOdt()
            else:
                import legacyConvertOdt
                converter = legacyConvertOdt.legacyConvertOdt()
            converter.convertOdtFile(inputFile, outputFile, font, fontSize)
        elif (docType == TYPEHTML):
            if (toUnicode):
                import unicodeConvertHTML
                unicodeConvertHTML.convertHTMLFile(inputFile, outputFile, font)
            else:
                import legacyConvertHTML
                legacyConvertHTML.convertHTML(inputFile, outputFile, font)
        else:
            if (toUnicode):
                import unicodeConvertText
                unicodeConvertText.convertTxtFile(inputFile, outputFile, font, encoding)
            else:
                import legacyConvertText
                legacyConvertText.convertTxtFile(inputFile, outputFile, font)
    except Exception:
        tkMessageBox.showerror('Error', sys.exc_info()[1])
    else:
        tkMessageBox.showinfo('Information', 'Conversion successful!')
FONTSIZE = tkFont.Font(family="serif", size=DEFAULTFONTSIZE, weight="normal") #default color bgcol = "light blue" # create frame frmDirection = LabelFrame(frmTop, border=1, borderwidth=1, relief=GROOVE, label ='Conversion direction', labelside=TOP, bg=bgcol) frmDirection.label.configure(font = FONTSIZE) frmInput = LabelFrame(frmTop, border=1, borderwidth=1, relief=GROOVE, label='Input', labelside=TOP, bg=bgcol) frmInput.label.configure(font = FONTSIZE) frmOutput = LabelFrame(frmTop, border=1, borderwidth=1, relief=GROOVE, label='Output', labelside=TOP, bg=bgcol) frmOutput.label.configure(font = FONTSIZE) frmButton = Frame(frmTop, border=1, borderwidth=1) # create some widgets inside # widgets for direction directionVar = StringVar() radUnicode = Radiobutton(frmDirection.frame, text="Legacy to Unicode", font=FONTSIZE, command=evUnicodeClick, value=TOUNICODE, variable=directionVar) radLegacy = Radiobutton(frmDirection.frame, text="Unicode to Legacy", font=FONTSIZE, command=evLegacyClick, value=TOLEGACY, variable=directionVar) # widgets for input inputFileVar = StringVar() frmInputSub = Frame(frmInput.frame) lblInput = Label(frmInputSub, text='File:', font=FONTSIZE) fntInput = FileEntry(frmInputSub, dialogtype = 'tk_getOpenFile', width = 50, command=evSetInput, variable=inputFileVar) fntInput.entry.configure(font=FONTSIZE) docTypeVar = StringVar() cmbDocType = ComboBox(frmInput.frame, label="Document Type:", dropdown=1, editable=1, variable=docTypeVar, options='listbox.height 3', command=evDocumentTypeClick) cmbDocType.entry.configure(width=30, font=FONTSIZE) cmbDocType.label.configure(font = FONTSIZE) fontInVar = StringVar() cmbFontInput = ComboBox(frmInput.frame, label="Font:", dropdown=1, editable=1, variable=fontInVar) cmbFontInput.entry.configure(width=30, font = FONTSIZE) cmbFontInput.label.configure(font = FONTSIZE) encodingVar = StringVar() cmbEncoding = ComboBox(frmInput.frame, label = "Encoding:", dropdown=1, editable=1, variable=encodingVar, 
options='listbox.height 3' ) cmbEncoding.entry.configure(width=30, font=FONTSIZE) cmbEncoding.label.configure(font=FONTSIZE) # widgets for output outputFileVar = StringVar() frmOutputSub = Frame(frmOutput.frame) lblOutput = Label(frmOutputSub, text='File:', font=FONTSIZE) fntOutput = FileEntry(frmOutputSub, dialogtype='tk_getSaveFile', command=evSetOutput, variable=outputFileVar) fntOutput.entry.configure(font=FONTSIZE) fontOutVar = StringVar() cmbFontOutput = ComboBox(frmOutput.frame, label="Font:", dropdown = 1, editable=1, variable=fontOutVar) cmbFontOutput.entry.configure(width=30, font=FONTSIZE) cmbFontOutput.label.configure(font=FONTSIZE) frmSize = Frame(frmOutput.frame) chkSizeVar = IntVar() chkSize = Checkbutton(frmSize, text='Override size', variable=chkSizeVar, command=evChkSize, font=FONTSIZE) fontSizeVar = IntVar() spnSize = Control(frmSize, label='Size:', min=1, max=100, variable=fontSizeVar, value=DEFAULTFONTSIZE) spnSize.tk_focusFollowsMouse() spnSize.label.configure(font=FONTSIZE) spnSize.entry.configure(font=FONTSIZE, justify=RIGHT) #widgets for buttons btnConvert = Button(frmButton, text='Convert', command=evConvert, width=7, font = FONTSIZE) btnReset = Button(frmButton, text='Reset', command=evResetClick, width=7, font = FONTSIZE) btnHelp = Button(frmButton, text='Help', command=evHelp, width=7, font = FONTSIZE) btnQuit = Button(frmButton, text='Quit', command=evQuit, width=7, font = FONTSIZE) # pack the widgets frmTop.pack(fill=BOTH, expand=1) frmDirection.pack(fill=BOTH, expand=1, padx=10, pady=10) frmInput.pack(fill=BOTH, expand=1, padx=10) frmOutput.pack(fill=BOTH, expand=1, padx=10, pady=10) frmButton.pack(fill=BOTH, expand=1, padx=5) # direction radUnicode.pack(anchor=W, padx=10) radLegacy.pack(anchor=W, padx=10) # input frmInputSub.pack(anchor=W, fill=X, expand=1, padx=10, pady=5) lblInput.pack(side=LEFT, anchor=W) fntInput.pack(side=LEFT, anchor=W, fill=X, expand=1) cmbDocType.pack(anchor=E, padx=10, pady=5) cmbFontInput.pack(anchor=E, 
padx=10, pady=5) cmbEncoding.pack(anchor=E, padx=10, pady=5) # output frmOutputSub.pack(anchor=W, fill=X, expand=1, padx=10, pady=5) lblOutput.pack(side=LEFT, anchor=W) fntOutput.pack(side=LEFT, anchor=W, fill=X, expand=1) cmbFontOutput.pack(anchor=E, padx=10, pady=5) frmSize.pack(side=RIGHT, padx=10, pady=5) chkSize.pack(side=LEFT, anchor=E, padx=10, pady=5) spnSize.pack(side=LEFT, anchor=E) # buttons btnConvert.pack(side=LEFT, padx=5, anchor=NW) btnReset.pack(side=LEFT, padx=5, anchor=NW) btnHelp.pack(side=LEFT, padx=5, anchor=NW) btnQuit.pack(side=LEFT, padx=5, anchor=NW) # set value for all combo box fd = FontDataXML.FontData() legacyFontList = [] for font in fd.listFontTypes(): legacyFontList.append(font) for fontName in fd.listFontNamesForType(font): legacyFontList.append(" " + fontName) unicodeFontList = ['Khmer OS', 'Khmer OS Bokor', 'Khmer OS Battambang', 'Khmer OS Content', 'Khmer OS Fasthand', 'Khmer OS Freehand','Khmer OS Metal Chrieng', 'Khmer OS Muol', 'Khmer OS Muol Light', 'Khmer OS Muol Pali', 'Khmer OS SiemReap', 'Khmer OS System'] encodingList = fd.listEncodingTypes() documentList = [TYPETEXT, TYPEODT, TYPEHTML] for font in legacyFontList: cmbFontInput.insert(END, font) for font in unicodeFontList: cmbFontOutput.insert(END, font) for encoding in encodingList: cmbEncoding.insert(END, encoding) for documenttype in documentList: cmbDocType.insert(END, documenttype) # get index for abc-zwsp INDEXABCZWSP = 0 for i in range(len(legacyFontList)): if (legacyFontList[i] == 'abc-zwsp'): INDEXABCZWSP = i break # get index for khmer os INDEXKHMEROS = 0 for i in range(len(unicodeFontList)): if (unicodeFontList[i] == 'Khmer OS'): INDEXKHMEROS = i break evResetClick() # set the loop running top.mainloop() khmerconverter v.1.4/modules/legacyConvertOdt.py0000644000175000001440000002505710627160453020711 0ustar k-dausers#!/usr/bin/python # -*- coding: utf8 -*- # Khmer Unicode to Legacy fonts Conversion # (c) 2006 The WordForge Foundation, all rights reserved. 
class legacyConvertOdt:
    """Convert an OpenOffice.org Writer (.odt) file from Khmer Unicode to a legacy font.

    The converter copies every member of the input zip archive to the output
    archive; 'content.xml' and 'styles.xml' are parsed with minidom, runs of
    Khmer text are converted and wrapped in a 'text:span' that uses the
    KHMERSTYLE text style, and the legacy font is declared in the document.
    """

    def __init__(self):
        self.CONTENTXML = 'content.xml'
        self.STYLESXML = 'styles.xml'
        self.fd = FontData()
        self.outputFont = "ABC-TEXT-05"
        self.outputFontSize = None
        # conversion tables for the current output font
        self.data = self.fd.unicodeData(self.outputFont)

    def convertOdtFile(self, inputFileName, outputFileName, outputFont, outputFontSize = None):
        """This function converts OpenOffice.org Writer file.

        inputFileName : name of input file to convert
        outputFileName : name of output file.
        outputFont : legacy output font name or font type.
        outputFontSize : force the font size the output file will use. value = None to ignore.

        Raises TypeError for an unknown font, for identical input and output
        names, or when the archive lacks content.xml or styles.xml.
        Raises IOError when one of the files cannot be opened.
        """
        if (not self.fd.isConvertable(outputFont)):
            raise TypeError('unknown output font ' + outputFont + '!')

        if (inputFileName == outputFileName):
            raise TypeError('input file and output file must be different!')

        try:
            # read zip file (.odt)
            zipIn = zipfile.ZipFile(inputFileName, "r")
        except IOError:
            raise IOError('Cannot open file "' + inputFileName + '" for reading!')

        # from here on the input archive must be closed on every exit path
        try:
            names = zipIn.namelist()
            # both parts are required; test each name on its own
            # ("(a and b) in names" would only look for b)
            if (self.CONTENTXML not in names) or (self.STYLESXML not in names):
                raise TypeError('Input file "' + inputFileName + '" is not an odt file!')

            try:
                # create new zip file (.odt)
                zipOut = zipfile.ZipFile(outputFileName, "w", DEFLATED)
            except IOError:
                raise IOError('Cannot open file "' + outputFileName + '" for writing!')

            try:
                # get data for the font
                self.outputFont = self.fd.defaultFont(outputFont)
                self.data = self.fd.unicodeData(self.outputFont)
                if (outputFontSize):
                    self.outputFontSize = str(outputFontSize) + 'pt'

                for name in names:
                    fdata = zipIn.read(name)
                    # only content.xml and styles.xml are converted
                    if (name == self.CONTENTXML):
                        fdata = self.processContent(fdata)
                    elif (name == self.STYLESXML):
                        fdata = self.processStyle(fdata)
                    # processContent returns a unicode string; the archive
                    # stores bytes, so encode it as UTF-8
                    if isinstance(fdata, type(u'')):
                        fdata = fdata.encode('utf-8')
                    zipOut.writestr(name, fdata)
            finally:
                zipOut.close()
        finally:
            zipIn.close()

    def processContent(self, xmlData):
        """Convert the text of a content.xml document to legacy encoding.

        input: xml data string to parse
        return: xml data as returned by toxml() (no explicit encoding), where
                Khmer text is converted to the legacy font
        """
        self.xmldoc = minidom.parseString(xmlData)
        officeNode = self.xmldoc.getElementsByTagName('office:text')
        officeAutoStylesNode = self.xmldoc.getElementsByTagName('office:automatic-styles')[0]
        officeFontFaceDecls = self.xmldoc.getElementsByTagName('office:font-face-decls')[0]
        # add font information
        self.addFontInfo(officeAutoStylesNode, officeFontFaceDecls)
        # go through office node and convert to legacy.
        self.goThru(officeNode, self.convertIfUnicode)
        return self.xmldoc.toxml()

    def processStyle(self, xmldata):
        """Add the legacy font information to styles.xml and convert its text.

        @param xmldata: xml string to parse.
        return: the modified document serialized with toxml('utf-8')
        """
        self.xmldoc = minidom.parseString(xmldata)
        officeAutoStylesNode = self.xmldoc.getElementsByTagName('office:automatic-styles')[0]
        officeFontFaceDecls = self.xmldoc.getElementsByTagName('office:font-face-decls')[0]
        officeMasterStylesNode = self.xmldoc.getElementsByTagName('office:master-styles')
        # go through node, replace font, and convert data to legacy.
        self.addFontInfo(officeAutoStylesNode, officeFontFaceDecls)
        self.goThru(officeMasterStylesNode, self.convertIfUnicode)
        return self.xmldoc.toxml('utf-8')

    def goThru (self, nodelist, function):
        """go through nodelist and call function with child node as argument.

        @param nodelist: dom's node list.
        @param function: function to call, child argument will be provided by goThru.

        The children are snapshotted before function is called: function may
        replace a child (convertIfUnicode swaps a text node for several new
        nodes), and walking the live list would visit the freshly created
        nodes and convert already converted text a second time.
        """
        for node in nodelist:
            if node.hasChildNodes():
                children = list(node.childNodes)
                for child in children:
                    function(child)
                self.goThru (children, function)

    def addFontInfo(self, autoStyleNode, declsNode):
        """Declare the output font and add the KHMERSTYLE text style.

        @param autoStyleNode: node that receives the new 'style:style' element.
        @param declsNode: node that receives the new 'style:font-face' element.
        """
        # add font declaration
        styleFontFaceNode = self.xmldoc.createElement('style:font-face')
        styleFontFaceNode.setAttribute('style:name', self.outputFont)
        styleFontFaceNode.setAttribute('svg:font-family', self.outputFont)
        declsNode.appendChild(styleFontFaceNode)

        # add font style
        styleNode = self.xmldoc.createElement('style:style')
        styleNode.setAttribute('style:family', 'text')
        styleNode.setAttribute('style:name', KHMERSTYLE)
        styleTextPropNode = self.xmldoc.createElement('style:text-properties')
        styleTextPropNode.setAttribute('style:font-name', self.outputFont)
        if (self.outputFontSize):
            styleTextPropNode.setAttribute('fo:font-size', self.outputFontSize)
        styleNode.appendChild(styleTextPropNode)
        autoStyleNode.appendChild(styleNode)

    def convertIfUnicode(self, node):
        """
        take Khmer Unicode data out of current node, convert it and put
        it in a new 'text:span' node which uses the KHMERSTYLE style.
        Text outside the Khmer range is kept as plain text nodes.
        Returns the node unchanged when there is nothing to convert.
        """
        if not node.nodeValue:
            return node
        # comments and processing instructions also carry a nodeValue;
        # only real character data may be split and replaced
        if node.nodeType not in (node.TEXT_NODE, node.CDATA_SECTION_NODE):
            return node

        sin = node.data
        newNode = self.xmldoc.createDocumentFragment()
        cursor = 0
        charCount = len(sin)
        while (cursor < charCount):
            khmStr = u''
            othStr = u''
            # collect one run of either Khmer or other characters
            while (cursor < charCount):
                char = sin[cursor]
                val = ord(char)
                # in khmer range; space and zero-width characters only
                # continue a Khmer run, they never start one (INKHMER)
                if ((val >= MINUNIC) and (val <= MAXUNIC)) or (char in STARTKHMER) or (len(khmStr) > 0 and char in INKHMER):
                    if (othStr):
                        break
                    khmStr += char
                # in other range
                else:
                    if (khmStr):
                        break
                    othStr += char
                cursor += 1
            # end of while (khmer string or other string found)
            if (khmStr):
                # convert khmer text
                khmStr = legacyReorder.reorder(khmStr)
                khmStr = legacyConverter.converter(khmStr, self.data)
                khmStr = khmStr.decode('cp1252')
                # add new khmer node
                khmNode = self.xmldoc.createElement('text:span')
                khmNode.setAttribute('text:style-name', KHMERSTYLE)
                # add data
                txtNode = self.xmldoc.createTextNode(khmStr)
                khmNode.appendChild(txtNode)
                newNode.appendChild(khmNode)
            elif (othStr):
                txtNode = self.xmldoc.createTextNode(othStr)
                newNode.appendChild(txtNode)

        node.parentNode.replaceChild(newNode, node)
\ u"" + \ fontDeclClose + \ autoStyleOpen + \ "" + \ autoStyleClose + \ contentOpen + \ "kxKabcc q Cxyz" + \ contentClose self.assertEqual(legacyConvertOdt().processContent(myXml.encode('utf-8')), convertedXml) if __name__ == '__main__': unittest.main() khmerconverter v.1.4/modules/test-doublefonts.xml0000644000175000001440000000101110542134320021054 0ustar k-dausers ]> khmerconverter v.1.4/modules/help.py0000644000175000001440000000677710627160410016366 0ustar k-dausers# Khmer converter # (c) 2006 The WordForge Foundation, all rights reserved. # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public License # as published by the Free Software Foundation; either version 2.1 # of the License, or (at your option) any later version. # # See the LICENSE file for more details. # # Developed by: # Hok Kakada (hokkakada@khmeros.info) # Keo Sophon (keosophon@khmeros.info) # San Titvirak (titvirak@khmeros.info) # Seth Chanratha (sethchanratha@khmeros.info) # # This module create a class for displaying help text from Tix import * import sys import __version__ class HelpText: def __init__(self, parent): self.root = Toplevel(parent) self.root.title('Help') if sys.platform[:4] in "win32": self.root.wm_iconbitmap("converter.ico") self.help = ScrolledText (self.root, scrollbar = 'y') self.help.pack(fill = BOTH, expand = 1) self.help.text['font'] = 'serif 12' self.help.subwidget_list['text'].insert(END, """Khmer Converter Version: """ + __version__.ver + """ Copyright (c) 2006 by The WordForge Foundation (All Rights Reserved) This program converts plain text, OpenOffice.org Writer (odt) or HTML File from legacy to unicode format or vice versa. The currently supported legacy fonts are ABC, ABC-ZWSP, Baidok, FK, Khek, Limon, and Truth. It is supported to run on Linux and Windows platform. 
But it should work on any platform that runs Python as well.\n\n\n""") self.help.subwidget_list['text'].insert(END, """USAGE: 1. Choose Conversion direction: * Legacy to Unicode: convert old Khmer font file (ABC, Limon,...) to Khmer Unicode file * Unicode to Legacy: convert Khmer unicode file to old Khmer font file (ABC, Limon, ...) 2. Choose Input: * File: file to convert * Document Type: - Plain text: just the normal text file. - OpenOffice.org Writer: for OpenOffice Writer file with extension .odt - HTML: for web page file. * Font: old fonts of the input file * Encoding: - Plain Text (cp1252): for file with Khmer legacy character. - Plain Text (latin-1/iso-8859-1): for file with Khmer legacy character. - UTF-8 : for file with unicode character. 3. Choose Ouput: * File: result file after conversion. * Font (for OpenOffice.Org writer): font for output file. * Override size (OpenOffice.Org writer): force converter to use specific size for output file, leave unchecked to use font size according to input file. 4. Click "Convert" button * A message box will confirm you whether it is successful or not.\n\n\n""") self.help.subwidget_list['text'].insert(END, """AUTHORS - Hok Kakada (hokkakada@khmeros.info)\n - Keo Sophon (keosophon@khmeros.info)\n - San Titvirak (titvirak@khmeros.info)\n - Seth Chanratha (sethchanratha@khmeros.info)\n\n\n""") self.help.subwidget_list['text'].insert(END, """LICENSE This program is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. 
def convertHTMLFile(inputFile, outputFile, fontType):
    """converts Khmer Legacy HTML file to Khmer Unicode HTML file

    inputFile: name of the Khmer Legacy HTML file to convert.
    outputFile: name of the Khmer Unicode HTML file to write (UTF-8).
    fontType: font for the conversion

    Raises TypeError when both names are equal or the font is not
    convertible, IOError when one of the files cannot be opened.
    """
    if (inputFile == outputFile):
        raise TypeError('input file and output file must not be the same!')

    fd = FontData()
    if (not fd.isConvertable(fontType)):
        raise TypeError('unknown output font ' + fontType + '!')

    encode = findEncode(inputFile)

    try:
        # TODO: open file with encoding, i.e. codecs.open(inputFile, encoding = encode)
        htmlData = open(inputFile)
    except IOError:
        raise IOError('Cannot open file "' + inputFile + '" for reading!')

    # the input handle is open now: release it on every exit path
    try:
        try:
            fout = codecs.open(outputFile, encoding = "utf-8", mode = "w")
        except IOError:
            raise IOError('Cannot open file "' + outputFile + '" for writing!')

        try:
            convert(htmlData, fout, fontType, encode)
        finally:
            fout.close()
    finally:
        htmlData.close()
foutobj.write(headPart[ : foundCharset+8] + 'utf-8' + headPart[foundCharset+8+len(encode) : ]) while (i < len(line) - 1): i += 1 val = ord(line[i]) currChar = unichr(val) if (insideComment): if (line[i : i+3] == '-->'): insideComment = False keep += '-->' i += 2 else: keep += currChar continue if (line[i : i+4] == '' finobj = StringIO.StringIO(data) foutobj = StringIO.StringIO() convert(finobj, foutobj, 'abc','iso-8859-1') self.assertEqual(foutobj.getvalue(), u'' + self.METALF + 'sala') def testConversion(self): # convert one character data ='salak' finobj = StringIO.StringIO(data) foutobj = StringIO.StringIO() convert(finobj, foutobj, 'abc', 'iso-8859-1') self.assertEqual(foutobj.getvalue(), u'' + self.METALF + u'salaក') # convert two character data ='salakx' finobj = StringIO.StringIO(data) foutobj = StringIO.StringIO() convert(finobj, foutobj, 'abc', 'iso-8859-1') self.assertEqual(foutobj.getvalue(), u'' + self.METALF + u'salaកខ') def testEntity(self): # test character with value less than 0xFF data ='salak' finobj = StringIO.StringIO(data) foutobj = StringIO.StringIO() convert(finobj, foutobj, 'abc', 'iso-8859-1') self.assertEqual(foutobj.getvalue(), u'' + self.METALF + u'salaក') # test ក data ='salaក' finobj = StringIO.StringIO(data) foutobj = StringIO.StringIO() convert(finobj, foutobj, 'abc', 'iso-8859-1') self.assertEqual(foutobj.getvalue(), u'' + self.METALF + u'salaក') # test ក data ='salaក' finobj = StringIO.StringIO(data) foutobj = StringIO.StringIO() convert(finobj, foutobj, 'abc', 'iso-8859-1') self.assertEqual(foutobj.getvalue(), u'' + self.METALF + u'salaក') # test entities with no ; data ='salaកក' finobj = StringIO.StringIO(data) foutobj = StringIO.StringIO() convert(finobj, foutobj, 'abc', 'iso-8859-1') self.assertEqual(foutobj.getvalue(), u'' + self.METALF + u'salaកក') # test © data ='sala©' finobj = StringIO.StringIO(data) foutobj = StringIO.StringIO() convert(finobj,foutobj,'abc','iso-8859-1') self.assertEqual(foutobj.getvalue(), u'' + 
self.METALF + u'sala្ច') # test © data ='sala©' finobj = StringIO.StringIO(data) foutobj = StringIO.StringIO() convert(finobj,foutobj,'abc','iso-8859-1') self.assertEqual(foutobj.getvalue(), u'' + self.METALF + u'salaចៀ័ផយ') # test & data ='sala&' finobj = StringIO.StringIO(data) foutobj = StringIO.StringIO() convert(finobj,foutobj,'abc','iso-8859-1') self.assertEqual(foutobj.getvalue(), u'' + self.METALF + 'sala' + unichr(0x17d0) + '') if __name__ == '__main__': unittest.main() khmerconverter v.1.4/modules/unicodeConvertText.py0000644000175000001440000000626310627160664021273 0ustar k-dausers#!/usr/bin/python # -*- coding: utf8 -*- # Khmer Legacy fonts to Khmer Unicode Conversion # (c) 2006 The WordForge Foundation, all rights reserved. # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public License # as published by the Free Software Foundation; either version 2.1 # of the License, or (at your option) any later version. # # See the LICENSE file for more details. # # Developed by: # Hok Kakada (hokkakada@khmeros.info) # Keo Sophon (keosophon@khmeros.info) # San Titvirak (titvirak@khmeros.info) # Seth Chanratha (sethchanratha@khmeros.info) # # This module creates a Text file in Khmer unicode format from legacy # input file. 
def convertTxtFile(inputFileName, outputFileName, fontType, encoding):
    """
    converts Khmer legacy plain text file and produce a unicode output file

    inputFileName: Legacy plain text file
    outputFileName: Khmer Unicode plain text file (written as UTF-8)
    fontType: type "abc" or font name "ABC-TEXT-5"
    encoding: cp1252, utf-8, iso-8859-1

    Raises TypeError when both names are equal or the encoding cannot be
    decoded, IOError when one of the files cannot be opened.
    """
    if (inputFileName == outputFileName):
        raise TypeError('input file and output file must not be the same!')

    fd = FontData()
    if (not fd.canDecode(encoding)):
        raise TypeError('unknown encoding!')

    try:
        fin = open(inputFileName, "r")
    except IOError:
        raise IOError('Cannot open file "' + inputFileName + '" for reading!')

    # the input handle is open now: release it on every exit path
    try:
        try:
            fout = open(outputFileName, "w")
        except IOError:
            raise IOError('Cannot open file "' + outputFileName + '" for writing!')

        try:
            data = fd.legacyData(fontType)
            # reading line by line from the input file, until end of file.
            for line in fin:
                sin = fd.changeEncoding(line, encoding)
                result = unicodeProcess.process(sin, data)
                bufout = unicodeReorder.reorder(result)
                fout.write(bufout.encode('utf-8'))
        finally:
            fout.close()
    finally:
        fin.close()
def converter(sin, data):
    '''Translate a reordered (legacy style) unicode string to a legacy font string.

    sin  : reordered unicode string based on legacy style
    data : font data for the conversion; data[0] is a sequence of
           dictionaries where the entry at index k maps sequences of k+1
           characters (an entry may be None), data[1] is a list of
           replacements indexed by code point minus 0x1780
    returns the legacy string; code points that are unknown and not
    below 0x7f are dropped
    '''
    tables = data[0]
    singles = data[1]
    singleCount = len(singles)
    total = len(sin)

    out = ''
    pos = 0
    while pos < total:
        # try the longest character sequences first
        matched = False
        width = len(tables)
        while width > 0:
            table = tables[width - 1]
            if table is not None:
                chunk = sin[pos : pos + width]
                if chunk in table:
                    out += table[chunk]
                    pos += width
                    matched = True
                    break
            width -= 1
        if matched:
            continue

        # no sequence matched: fall back to the single character list
        char = sin[pos]
        offset = ord(char) - 0x1780
        if 0 <= offset < singleCount:
            out += singles[offset]
        elif ord(char) < 0x7f:
            # keep ascii characters
            out += char.encode('cp1252')
        pos += 1

    return out
Musekatoan (U long) MARK + MARK +u'៊':chr(0xFA), # Trisap (U long) } condenseData6 = { u'ខ្ញ'+ MARK + u'ុំ':chr(0xB4) # ខ្ញុំ one code point in limon } replaceData = ['k', 'x', 'K', 'X'] #dicts = [condenseData1, condenseData2, condenseData3] dicts = [condenseData1, condenseData2, condenseData3, None, None, condenseData6] data = [dicts, replaceData] def setUp(self): pass def testConversion(self): self.assertEqual(converter(unichr(0x200b), self.data), chr(0x20)) # in dict1 self.assertEqual(converter(unichr(0x200c), self.data), "") self.assertEqual(converter(u'បា', self.data), 'BAA') # in dict2 self.assertEqual(converter(u'្ក', self.data), 'Cok') self.assertEqual(converter(u'្ស'+ self.MARK + self.MARK + u'៊' + self.MARK + u'ី', self.data), 'Cos' + chr(0xFA) + 'I') # in dict3 self.assertEqual(converter(u'ខ្ញ'+ self.MARK + u'ុំ',self.data), chr(0xB4)) # in dict6 self.assertEqual(converter(u'ក', self.data), 'k') # in list self.assertEqual(converter(u'ខ', self.data), 'x') self.assertEqual(converter(u'ឃ', self.data), 'X') def testNoConversion(self): # keep characters we do not know self.assertEqual(converter(u'?', self.data), '?') # neither in dict nor in list self.assertEqual(converter(u'\n', self.data), '\n') self.assertEqual(converter(u'', self.data), '') # remove unknown unicode character self.assertEqual(converter(unichr(255), self.data), '') self.assertEqual(converter(unichr(0x1980), self.data), '') def testConvertLongFirst(self): # convert longer match first # 123: A,1234: Z # 1234 => Z... 
class unicodeConvertOdt:
    """Convert an OpenOffice.org Writer (.odt) file from legacy Khmer fonts to Unicode.

    styles.xml is processed first so that the styles using a legacy font are
    known (self.convertibleStyle maps a style name to its legacy font type)
    before the text in content.xml is converted.
    """

    def __init__(self):
        self.CONTENTXML = 'content.xml'
        self.STYLESXML = 'styles.xml'
        # style name -> legacy font type, filled by replaceFont
        self.convertibleStyle = {}
        self.fd = FontData()
        self.outputFont = "Khmer OS"
        self.outputFontSize = None

    def convertOdtFile(self, inputFileName, outputFileName, outputFont = None, outputFontSize = None):
        """This function convert OpenOffice.Org writer file

        inputFileName: the name of file you want to convert.
        outputFileName: the result file name.
        outputFont: font name to override. default value is Khmer OS.
        outputFontSize: a value to override font size in odt file, value = None to ignore.

        Raises TypeError for identical input and output names or when the
        archive lacks content.xml or styles.xml, IOError when one of the
        files cannot be opened.
        """
        # keep the default font ("Khmer OS") when the caller gives none
        if (outputFont):
            self.outputFont = outputFont
        if (outputFontSize):
            self.outputFontSize = str(outputFontSize) + 'pt'

        if (inputFileName == outputFileName):
            raise TypeError('input file and output file must be different!')

        try:
            # read zip file (.odt)
            zipIn = zipfile.ZipFile(inputFileName, "r")
        except IOError:
            raise IOError('Cannot open file "' + inputFileName + '" for reading!')

        # from here on the input archive must be closed on every exit path
        try:
            names = zipIn.namelist()
            # both parts are required; test each name on its own
            # ("(a and b) in names" would only look for b)
            if (self.CONTENTXML not in names) or (self.STYLESXML not in names):
                raise TypeError('Input file "' + inputFileName + '" is not an odt file!')

            try:
                # create new zip file (.odt)
                zipOut = zipfile.ZipFile(outputFileName, "w", DEFLATED)
            except IOError:
                raise IOError('Cannot open file "' + outputFileName + '" for writing!')

            try:
                zipOut.debug = 3
                contentXml = None
                for name in names:
                    fdata = zipIn.read(name)
                    if (name == self.CONTENTXML):
                        # read data to contentXml for later processing.
                        contentXml = fdata
                        continue
                    elif (name == self.STYLESXML):
                        fdata = self.processStyle(fdata)
                    zipOut.writestr(name, fdata)

                # process the content.xml only after already read the styles.xml.
                fdata = self.processContent(contentXml)
                zipOut.writestr(self.CONTENTXML, fdata)
            finally:
                zipOut.close()
        finally:
            zipIn.close()

    def processContent(self, xmldata):
        """change font name and size, convert data to unicode in xmldata

        @param xmldata: xml string to parse.
        return: the modified document serialized with toxml('utf-8')
        """
        self.xmldoc = minidom.parseString(xmldata)
        officeNode = self.xmldoc.getElementsByTagName('office:text')
        officeDocContentNode = self.xmldoc.getElementsByTagName('office:document-content')
        # go through node, replace font, and convert data to unicode.
        self.goThru(officeDocContentNode, self.replaceFont)
        self.goThru(officeNode, self.convertIfLegacy)
        return self.xmldoc.toxml('utf-8')

    def processStyle(self, xmldata):
        """change font name and size, convert data to unicode in xmldata

        @param xmldata: xml string to parse.
        return: the modified document serialized with toxml('utf-8')
        """
        self.xmldoc = minidom.parseString(xmldata)
        officeDocStylesNode = self.xmldoc.getElementsByTagName('office:document-styles')
        # go through node, replace font, and convert data to unicode.
        self.goThru(officeDocStylesNode, self.replaceFont)
        self.goThru(officeDocStylesNode, self.convertIfLegacy)
        return self.xmldoc.toxml('utf-8')

    def goThru (self, nodelist, function):
        """go through nodelist and call function with child node as argument.

        @param nodelist: dom's node list.
        @param function: function to call, child argument will be provided by goThru.

        The children are snapshotted first because function may replace a
        child node while the list is being walked.
        """
        for node in nodelist:
            if node.hasChildNodes():
                children = list(node.childNodes)
                for child in children:
                    function(child)
                self.goThru (children, function)

    def replaceFont(self, node):
        """look for node which has "style:font-name" attribute and change its value to fontName.

        Styles that use a legacy font are remembered in self.convertibleStyle
        so that convertIfLegacy knows which text has to be converted.
        """
        if (not hasattr(node, "getAttribute")):
            return

        fontName = node.getAttribute('style:font-name')
        fontType = None
        if (fontName):
            try:
                fontType = self.fd.typeForFontname(fontName)
            except Exception:
                # not a known legacy font: leave the attribute alone
                pass

        if (fontType and hasattr(node.parentNode, "getAttribute")):
            # add name to convertible list
            self.convertibleStyle[unicode(node.parentNode.getAttribute('style:name'))] = fontType
            node.removeAttribute('style:font-name')
            node.setAttribute('style:font-name-complex', self.outputFont)
            if (self.outputFontSize):
                node.setAttribute('style:font-size-complex', self.outputFontSize)

        styleName = node.getAttribute('style:name')
        if (styleName):
            # if node's parent style is also convertible, node is also convertible.
            # search in child if child also has style:font-name (which will override parent)
            # then will not add to convertible list.
            if node.hasChildNodes():
                for child in node.childNodes:
                    # text nodes have no attribute API: test for it first
                    if (hasattr(child, "getAttribute")) and (child.hasAttribute('style:font-name')):
                        fontName = child.getAttribute('style:font-name')
                        try:
                            fontType = self.fd.typeForFontname(fontName)
                        except Exception:
                            return

            parentStyleName = node.getAttribute('style:parent-style-name')
            if parentStyleName in self.convertibleStyle:
                # add to convertible style
                self.convertibleStyle[styleName] = self.convertibleStyle[parentStyleName]

            # a style:name that is itself a legacy font name (font
            # declaration) is renamed to the output font
            try:
                fontType = self.fd.typeForFontname(styleName)
            except Exception:
                return
            self.convertibleStyle[styleName] = fontType
            node.setAttribute('style:name', self.outputFont)
            node.setAttribute('svg:font-family', self.outputFont)

    def convertIfLegacy(self, node):
        """look the node for information of legacy font and convert to unicode,
        otherwise return False.

        @param node: node to look to and convert if necessary.
        """
        if (not node.nodeValue):
            return False
        # comments and processing instructions also carry a nodeValue;
        # only real character data may be converted
        if node.nodeType not in (node.TEXT_NODE, node.CDATA_SECTION_NODE):
            return False

        parent = getattr(node, "parentNode", None)
        if (parent is None) or (not hasattr(parent, "getAttribute")):
            return False

        # if node don have font specified, but it's under parent that in convertible list
        # do also convert node.
        styleName = parent.getAttribute(u'text:style-name')
        grandParent = getattr(parent, "parentNode", None)
        if (grandParent is not None) and hasattr(grandParent, "getAttribute"):
            parentStyleName = grandParent.getAttribute(u'text:style-name')
        else:
            # e.g. the document node: there is no style to inherit from
            parentStyleName = None

        if (styleName in self.convertibleStyle):
            style = styleName
        elif (parentStyleName is not None) and (parentStyleName in self.convertibleStyle):
            style = parentStyleName
        else:
            return False

        # legacy font data's referal.
        fontname = self.convertibleStyle[style]
        sin = node.data
        try:
            sin = sin.encode('cp1252')
        except UnicodeEncodeError:
            # some characters are not legacy data: convert the cp1252
            # encodable runs and keep the other characters untouched
            result = u''
            part = ''
            for char in sin:
                try:
                    tmpChar = char.encode('cp1252')
                except UnicodeEncodeError:
                    if (part):
                        part = unicodeProcess.process(part, self.fd.legacyData(fontname))
                        result += unicodeReorder.reorder(part)
                        part = ''
                    result += char
                else:
                    part += tmpChar
            if (part):
                part = unicodeProcess.process(part, self.fd.legacyData(fontname))
                result += unicodeReorder.reorder(part)
            sin = result
        else:
            sin = unicodeProcess.process(sin, self.fd.legacyData(fontname))
            sin = unicodeReorder.reorder(sin)

        newtext = self.xmldoc.createTextNode(sin) # create text of Node
        node.parentNode.replaceChild(newtext, node)
def process(sin, data):
    """convert from legacy to unicode

    sin : string input as legacy encoding (byte string, not unicode)
    data: list for legacy to unicode conversion; data[0] is a dictionary
          mapping legacy character combinations to their unicode
          replacement, data[1] is a list of unicode replacements indexed by
          the legacy character value
    return value: unicode string

    Raises TypeError when data does not have that shape or when sin is a
    unicode string.
    """
    if (data is None
            or not isinstance(data, list)
            or len(data) != 2
            or not isinstance(data[0], dict)
            or not isinstance(data[1], list)):
        raise TypeError("Wrong data for conversion.")

    if isinstance(sin, type(u'')):
        raise TypeError("Input must not be Unicode string.")

    condenseData = data[0] # dictionary with character combinations and replacements
    replaceData = data[1]  # list with character replacement values

    # Try longer combinations first so the result does not depend on the
    # dictionary's iteration order when one key is a prefix of another.
    # Empty keys are skipped: they would match without consuming input.
    keys = sorted([key for key in condenseData if key], key=len, reverse=True)

    sout = u''
    listLength = len(replaceData)
    i = 0
    end = len(sin)
    while (i < end):
        for key in keys:
            if (key == sin[i : i+len(key)]):
                sout += condenseData[key]
                i += len(key)
                break
        else:
            # no combination matched: convert the single character
            n = ord(sin[i : i+1])
            if (n < listLength):
                sout += replaceData[n]
            else:
                sout += unichr(n)
            i += 1

    return sout
u"កគ*") def testInvalid(self): # make sure conversions does not break self.assertEqual(process(unichr(255).encode('cp1252'), self.data), unichr(255)) self.assertEqual(process(unichr(len(self.data[1])).encode('cp1252'), self.data), unichr(len(self.data[1]))) def testTypeError(self): #make sure module will raise TypeError when data is wrong self.assertRaises(TypeError, process,'sala', None) self.assertRaises(TypeError, process,'sala', 1) def testCondense(self): self.assertEqual(process('12'.encode('cp1252'), self.data), u"_") self.assertEqual(process('1212'.encode('cp1252'), self.data), u"__") self.assertEqual(process('12x12'.encode('cp1252'), self.data), u"_x_") self.assertEqual(process(u'b¤'.encode('cp1252'), self.data), u"ឬ") self.assertEqual(process(u'b¤B£B¤'.encode('cp1252'), self.data), u"ឬឭឮ") self.assertEqual(process('abcd', self.data), u"") if __name__ == '__main__': unittest.main() khmerconverter v.1.4/LICENSE.TXT0000644000175000001440000006447010542134322015070 0ustar k-dausers GNU LESSER GENERAL PUBLIC LICENSE Version 2.1, February 1999 Copyright (C) 1991, 1999 Free Software Foundation, Inc. 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. [This is the first released version of the Lesser GPL. It also counts as the successor of the GNU Library Public License, version 2, hence the version number 2.1.] Preamble The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public Licenses are intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This license, the Lesser General Public License, applies to some specially designated software packages--typically libraries--of the Free Software Foundation and other authors who decide to use it. 
You can use it too, but we suggest you first think carefully about whether this license or the ordinary General Public License is the better strategy to use in any particular case, based on the explanations below. When we speak of free software, we are referring to freedom of use, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish); that you receive source code or can get it if you want it; that you can change the software and use pieces of it in new free programs; and that you are informed that you can do these things. To protect your rights, we need to make restrictions that forbid distributors to deny you these rights or to ask you to surrender these rights. These restrictions translate to certain responsibilities for you if you distribute copies of the library or if you modify it. For example, if you distribute copies of the library, whether gratis or for a fee, you must give the recipients all the rights that we gave you. You must make sure that they, too, receive or can get the source code. If you link other code with the library, you must provide complete object files to the recipients, so that they can relink them with the library after making changes to the library and recompiling it. And you must show them these terms so they know their rights. We protect your rights with a two-step method: (1) we copyright the library, and (2) we offer you this license, which gives you legal permission to copy, distribute and/or modify the library. To protect each distributor, we want to make it very clear that there is no warranty for the free library. Also, if the library is modified by someone else and passed on, the recipients should know that what they have is not the original version, so that the original author's reputation will not be affected by problems that might be introduced by others. 
Finally, software patents pose a constant threat to the existence of any free program. We wish to make sure that a company cannot effectively restrict the users of a free program by obtaining a restrictive license from a patent holder. Therefore, we insist that any patent license obtained for a version of the library must be consistent with the full freedom of use specified in this license. Most GNU software, including some libraries, is covered by the ordinary GNU General Public License. This license, the GNU Lesser General Public License, applies to certain designated libraries, and is quite different from the ordinary General Public License. We use this license for certain libraries in order to permit linking those libraries into non-free programs. When a program is linked with a library, whether statically or using a shared library, the combination of the two is legally speaking a combined work, a derivative of the original library. The ordinary General Public License therefore permits such linking only if the entire combination fits its criteria of freedom. The Lesser General Public License permits more lax criteria for linking other code with the library. We call this license the "Lesser" General Public License because it does Less to protect the user's freedom than the ordinary General Public License. It also provides other free software developers Less of an advantage over competing non-free programs. These disadvantages are the reason we use the ordinary General Public License for many libraries. However, the Lesser license provides advantages in certain special circumstances. For example, on rare occasions, there may be a special need to encourage the widest possible use of a certain library, so that it becomes a de-facto standard. To achieve this, non-free programs must be allowed to use the library. A more frequent case is that a free library does the same job as widely used non-free libraries. 
In this case, there is little to gain by limiting the free library to free software only, so we use the Lesser General Public License. In other cases, permission to use a particular library in non-free programs enables a greater number of people to use a large body of free software. For example, permission to use the GNU C Library in non-free programs enables many more people to use the whole GNU operating system, as well as its variant, the GNU/Linux operating system. Although the Lesser General Public License is Less protective of the users' freedom, it does ensure that the user of a program that is linked with the Library has the freedom and the wherewithal to run that program using a modified version of the Library. The precise terms and conditions for copying, distribution and modification follow. Pay close attention to the difference between a "work based on the library" and a "work that uses the library". The former contains code derived from the library, whereas the latter must be combined with the library in order to run. GNU LESSER GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License Agreement applies to any software library or other program which contains a notice placed by the copyright holder or other authorized party saying it may be distributed under the terms of this Lesser General Public License (also called "this License"). Each licensee is addressed as "you". A "library" means a collection of software functions and/or data prepared so as to be conveniently linked with application programs (which use some of those functions and data) to form executables. The "Library", below, refers to any such software library or work which has been distributed under these terms. 
A "work based on the Library" means either the Library or any derivative work under copyright law: that is to say, a work containing the Library or a portion of it, either verbatim or with modifications and/or translated straightforwardly into another language. (Hereinafter, translation is included without limitation in the term "modification".) "Source code" for a work means the preferred form of the work for making modifications to it. For a library, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the library. Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running a program using the Library is not restricted, and output from such a program is covered only if its contents constitute a work based on the Library (independent of the use of the Library in a tool for writing it). Whether that is true depends on what the Library does and what the program that uses the Library does. 1. You may copy and distribute verbatim copies of the Library's complete source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and distribute a copy of this License along with the Library. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. You may modify your copy or copies of the Library or any portion of it, thus forming a work based on the Library, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) The modified work must itself be a software library. 
b) You must cause the files modified to carry prominent notices stating that you changed the files and the date of any change. c) You must cause the whole of the work to be licensed at no charge to all third parties under the terms of this License. d) If a facility in the modified Library refers to a function or a table of data to be supplied by an application program that uses the facility, other than as an argument passed when the facility is invoked, then you must make a good faith effort to ensure that, in the event an application does not supply such function or table, the facility still operates, and performs whatever part of its purpose remains meaningful. (For example, a function in a library to compute square roots has a purpose that is entirely well-defined independent of the application. Therefore, Subsection 2d requires that any application-supplied function or table used by this function must be optional: if the application does not supply it, the square root function must still compute square roots.) These requirements apply to the modified work as a whole. If identifiable sections of that work are not derived from the Library, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Library, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Library. 
In addition, mere aggregation of another work not based on the Library with the Library (or with a work based on the Library) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. You may opt to apply the terms of the ordinary GNU General Public License instead of this License to a given copy of the Library. To do this, you must alter all the notices that refer to this License, so that they refer to the ordinary GNU General Public License, version 2, instead of to this License. (If a newer version than version 2 of the ordinary GNU General Public License has appeared, then you can specify that version instead if you wish.) Do not make any other change in these notices. Once this change is made in a given copy, it is irreversible for that copy, so the ordinary GNU General Public License applies to all subsequent copies and derivative works made from that copy. This option is useful when you wish to copy part of the code of the Library into a program that is not a library. 4. You may copy and distribute the Library (or a portion or derivative of it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange. If distribution of object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place satisfies the requirement to distribute the source code, even though third parties are not compelled to copy the source along with the object code. 5. A program that contains no derivative of any portion of the Library, but is designed to work with the Library by being compiled or linked with it, is called a "work that uses the Library". 
Such a work, in isolation, is not a derivative work of the Library, and therefore falls outside the scope of this License. However, linking a "work that uses the Library" with the Library creates an executable that is a derivative of the Library (because it contains portions of the Library), rather than a "work that uses the library". The executable is therefore covered by this License. Section 6 states terms for distribution of such executables. When a "work that uses the Library" uses material from a header file that is part of the Library, the object code for the work may be a derivative work of the Library even though the source code is not. Whether this is true is especially significant if the work can be linked without the Library, or if the work is itself a library. The threshold for this to be true is not precisely defined by law. If such an object file uses only numerical parameters, data structure layouts and accessors, and small macros and small inline functions (ten lines or less in length), then the use of the object file is unrestricted, regardless of whether it is legally a derivative work. (Executables containing this object code plus portions of the Library will still fall under Section 6.) Otherwise, if the work is a derivative of the Library, you may distribute the object code for the work under the terms of Section 6. Any executables containing that work also fall under Section 6, whether or not they are linked directly with the Library itself. 6. As an exception to the Sections above, you may also combine or link a "work that uses the Library" with the Library to produce a work containing portions of the Library, and distribute that work under terms of your choice, provided that the terms permit modification of the work for the customer's own use and reverse engineering for debugging such modifications. 
You must give prominent notice with each copy of the work that the Library is used in it and that the Library and its use are covered by this License. You must supply a copy of this License. If the work during execution displays copyright notices, you must include the copyright notice for the Library among them, as well as a reference directing the user to the copy of this License. Also, you must do one of these things: a) Accompany the work with the complete corresponding machine-readable source code for the Library including whatever changes were used in the work (which must be distributed under Sections 1 and 2 above); and, if the work is an executable linked with the Library, with the complete machine-readable "work that uses the Library", as object code and/or source code, so that the user can modify the Library and then relink to produce a modified executable containing the modified Library. (It is understood that the user who changes the contents of definitions files in the Library will not necessarily be able to recompile the application to use the modified definitions.) b) Use a suitable shared library mechanism for linking with the Library. A suitable mechanism is one that (1) uses at run time a copy of the library already present on the user's computer system, rather than copying library functions into the executable, and (2) will operate properly with a modified version of the library, if the user installs one, as long as the modified version is interface-compatible with the version that the work was made with. c) Accompany the work with a written offer, valid for at least three years, to give the same user the materials specified in Subsection 6a, above, for a charge no more than the cost of performing this distribution. d) If distribution of the work is made by offering access to copy from a designated place, offer equivalent access to copy the above specified materials from the same place. 
e) Verify that the user has already received a copy of these materials or that you have already sent this user a copy. For an executable, the required form of the "work that uses the Library" must include any data and utility programs needed for reproducing the executable from it. However, as a special exception, the materials to be distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. It may happen that this requirement contradicts the license restrictions of other proprietary libraries that do not normally accompany the operating system. Such a contradiction means you cannot use both them and the Library together in an executable that you distribute. 7. You may place library facilities that are a work based on the Library side-by-side in a single library together with other library facilities not covered by this License, and distribute such a combined library, provided that the separate distribution of the work based on the Library and of the other library facilities is otherwise permitted, and provided that you do these two things: a) Accompany the combined library with a copy of the same work based on the Library, uncombined with any other library facilities. This must be distributed under the terms of the Sections above. b) Give prominent notice with the combined library of the fact that part of it is a work based on the Library, and explaining where to find the accompanying uncombined form of the same work. 8. You may not copy, modify, sublicense, link with, or distribute the Library except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense, link with, or distribute the Library is void, and will automatically terminate your rights under this License. 
However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 9. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Library or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Library (or any work based on the Library), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Library or works based on it. 10. Each time you redistribute the Library (or any work based on the Library), the recipient automatically receives a license from the original licensor to copy, distribute, link with or modify the Library subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties with this License. 11. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Library at all. For example, if a patent license would not permit royalty-free redistribution of the Library by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Library. 
If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply, and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system which is implemented by public license practices. Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 12. If the distribution and/or use of the Library is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Library under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 13. The Free Software Foundation may publish revised and/or new versions of the Lesser General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Library specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. 
If the Library does not specify a license version number, you may choose any version ever published by the Free Software Foundation. 14. If you wish to incorporate parts of the Library into other free programs whose distribution conditions are incompatible with these, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 
END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Libraries If you develop a new library, and you want it to be of the greatest possible use to the public, we recommend making it free software that everyone can redistribute and change. You can do so by permitting redistribution under these terms (or, alternatively, under the terms of the ordinary General Public License). To apply these terms, attach the following notices to the library. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. Copyright (C) This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA Also add information on how to contact you by electronic and paper mail. You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the library, if necessary. Here is a sample; alter the names: Yoyodyne, Inc., hereby disclaims all copyright interest in the library `Frob' (a library for tweaking knobs) written by James Random Hacker. , 1 April 1990 Ty Coon, President of Vice That's all there is to it! 
#!/usr/bin/python
# -*- coding: utf8 -*-

# Khmer Legacy to Khmer Unicode Conversion and Vice Versa
# (c) 2006 The WordForge Foundation, all rights reserved.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public License
# as published by the Free Software Foundation; either version 2.1
# of the License, or (at your option) any later version.
#
# See the LICENSE file for more details.
#
# Developed by:
#       Hok Kakada (hokkakada@khmeros.info)
#       Keo Sophon (keosophon@khmeros.info)
#       San Titvirak (titvirak@khmeros.info)
#       Seth Chanratha (sethchanratha@khmeros.info)
#
# This program creates a Plain Text, OpenOffice.org Writer (odt), or HTML file
# in Khmer Unicode/Legacy format from Legacy/Unicode input file respectively.
#
# Exit status: 0 after a successful conversion or after listing fonts or
# codecs, 1 if the arguments are wrong or a file check fails.

from optparse import OptionParser
from modules import FontDataXML
import __version__
import sys
import os
import time

# show "python " in the usage text when started through the interpreter
if (sys.argv[0].endswith('py')):
    py = 'python '
else:
    py = ''

usage = py + """%prog [OPTION] input [output]
   or: """ + py + """%prog [OPTION]\n
Font encoding converter for Khmer text.
Converts between legacy and Unicode in both directions.
Currently supported file formats are:
    Plain Text
    OpenOffice writer document
    HTML"""

# NOTE(review): the sentence about the license ends in "License ." -- it
# looks as if a URL is missing there; confirm against the released file.
strVersion = "%prog Version" + __version__.ver + """ \n
Copyright (C) 2006 The WordForge Foundation. www.khmeros.info.
This is free software. You may redistribute copies of it under the terms of
the GNU Lesser General Public License .
There is NO WARRANTY, to the extent permitted by law.
Written by Hok Kakada, Keo Sophon, San Titvirak, Seth Chanratha.
"""

parser = OptionParser(usage = usage, version = strVersion)
parser.add_option("-l", "--list", action="store_true", dest="listFont", default=False,
                  help="list all supported fonts")
parser.add_option("-c", "--codec", action="store_true", dest="listcodectypes", default=False,
                  help="list all supported codecs for text files")
parser.add_option("-o", "--oldfont", action="store_true", dest="oldfont", default=False,
                  help="convert from unicode to old fonts (legacy)")
parser.add_option("-e", "--encoding", dest="encoding", action="store", type="string",
                  help="codec for the input file, default is 'cp1252'",
                  metavar="codec", default="cp1252")
parser.add_option("-f", "--font", dest="font", action="store", type="string",
                  help="fontname for output encoding, default is 'abc-zwsp'",
                  metavar="fontname", default="abc-zwsp")
parser.add_option("-s", "--size", dest="fontSize", action="store", type="int",
                  help="force the program to use specific size for khmer font",
                  metavar="value", default=None)
parser.add_option("-t", "--timer", action="store_true", dest="showtimer", default=False,
                  help="print the needed time for the conversion")

(options, args) = parser.parse_args()
argc = len(args)

fd = FontDataXML.FontData()

# print all codec type
if (options.listcodectypes):
    print('Supported input encodings: %s' % (fd.listEncodingTypes(),))
    sys.exit()

# print all font names
if (options.listFont):
    print('Supported fonts:')
    for line in fd.listFontNames():
        print(line)
    sys.exit()

# started without any argument: hand over to the graphical user interface
# (presumably importing the module is what brings the GUI up)
if (len(sys.argv) == 1):
    from modules import converterGUI
    sys.exit()

if (argc == 0):
    sys.stderr.write("Please enter a file name or a legal option!\nUse the --help option for more info.\n")
    sys.exit(1)

inputFileName = args[0]
if not os.path.exists(inputFileName):
    sys.stderr.write(inputFileName + ' does not exist!\n')
    sys.exit(1)

if (argc < 2):
    # output file is in the same folder as input file
    (path, filename) = os.path.split(inputFileName)
    outputFileName = os.path.join(path, 'converted-' + filename)
else:
    # user gave the output file name herself
    outputFileName = args[1]

if (inputFileName == outputFileName):
    sys.stderr.write("Input file and output file must be different!\n")
    sys.exit(1)

(path, filename) = os.path.split(outputFileName)
# check if output folder entered by user exist
if (path and not os.path.exists(path)):
    sys.stderr.write('The path does not exist!\n')
    sys.exit(1)

# check if output file already exist
if (os.path.exists(outputFileName)):
    sys.stderr.write('The output file is already existed!\n')
    sys.exit(1)

if (options.showtimer):
    # wall clock time at the start of the conversion
    startTime = time.time()

isOdt = inputFileName.endswith('.odt')
isHtml = inputFileName.endswith('.htm') or inputFileName.endswith('.html')

# convert from unicode to legacy
if (options.oldfont):
    if (isOdt):
        from modules import legacyConvertOdt
        converter = legacyConvertOdt.legacyConvertOdt()
        converter.convertOdtFile(inputFileName, outputFileName, options.font, options.fontSize)
    elif (isHtml):
        from modules import legacyConvertHTML
        legacyConvertHTML.convertHTML(inputFileName, outputFileName, options.font)
    else:
        from modules import legacyConvertText
        legacyConvertText.convertTxtFile(inputFileName, outputFileName, options.font)
# convert from legacy to unicode
else:
    if (isOdt):
        from modules import unicodeConvertOdt
        converter = unicodeConvertOdt.unicodeConvertOdt()
        converter.convertOdtFile(inputFileName, outputFileName, options.font, options.fontSize)
    elif (isHtml):
        from modules import unicodeConvertHTML
        unicodeConvertHTML.convertHTMLFile(inputFileName, outputFileName, options.font)
    else:
        from modules import unicodeConvertText
        unicodeConvertText.convertTxtFile(inputFileName, outputFileName, options.font, options.encoding)

if (options.showtimer):
    # report the time spent between the two measurements
    print(">>> Total conversion time: %s seconds" % (time.time() - startTime))
ʦ>]|$$HHll>](|2<FU$mHl*>?]T|i~$Hl>>]]||$Hl>*]?|Ti~ٓ$Hl>]|(2<FU$mHl>]|$$HHll>](|2<FUm$Hl*>?]T|i~$Hl>>]]||$Hl>*]?|Ti~ٓ$Hlڐ>]|(2<FUm$Hlʹ>]|$$HHll>]|(2<FU$mHl>*]?|Ti~$Hl>>]]||$Hl*>?]T|i~$Hlڐ>](|2<FUm$Hlʹ,,,999EEERRR___lllxxx( @ʦ>]|$$HHll>](|2<FU$mHl*>?]T|i~$Hl>>]]||$Hl>*]?|Ti~ٓ$Hl>]|(2<FU$mHl>]|$$HHll>](|2<FUm$Hl*>?]T|i~$Hl>>]]||$Hl>*]?|Ti~ٓ$Hlڐ>]|(2<FUm$Hlʹ>]|$$HHll>]|(2<FU$mHl>*]?|Ti~$Hl>>]]||$Hl*>?]T|i~$Hlڐ>](|2<FUm$Hlʹ,,,999EEERRR___lllxxxpC AAAAAAAAAAA0Ax(0` ʦ>]|$$HHll>](|2<FU$mHl*>?]T|i~$Hl>>]]||$Hl>*]?|Ti~ٓ$Hl>]|(2<FU$mHl>]|$$HHll>](|2<FUm$Hl*>?]T|i~$Hl>>]]||$Hl>*]?|Ti~ٓ$Hlڐ>]|(2<FUm$Hlʹ>]|$$HHll>]|(2<FU$mHl>*]?|Ti~$Hl>>]]||$Hl*>?]T|i~$Hlڐ>](|2<FUm$Hlʹ,,,999EEERRR___lllxxx???xp`@p@ @ @@@@@@@@@@@@@@@@@@@@@@@ @@ @@<~?????khmerconverter v.1.4/README.TXT0000644000175000001440000000525310627165250014745 0ustar k-dausers khmerConverter Copyright (c) 2006 by The WordForge Foundation (All Rights Reserved) www.khmeros.info Version: 1.4 Date: 1 June 2007 Developed by: Hok Kakada (hokkakada@khmeros.info) Keo Sophon (keosophon@khmeros.info) San Titvirak (titvirak@khmeros.info) Seth Chanratha (sethchanratha@khmeros.info) # This program creates a Plain Text, OpenOffice.org Writer (odt), or HTML file # in Khmer Unicode/Legacy format from Legacy/Unicode input file respectively. # Currently it supports legacy font types: ABC, ABC-ZWSP, Baidok, Fk, Khek, Limon, Truth USAGE: ====== On Windows Platform: ==================== khmerconverter.exe inputFile [outputFile] [OPTION] inputFile --> converted-inputFile or: khmerconverter.exe [OPTION] or: khmerconverter.exe On Linux and all other Platforms: ================================= python khmerconverter.py inputFile [outputFile] [OPTION] inputFile --> converted-inputFile or: python khmerconverter.py [OPTION] or: python khmerconverter.py Requirements: - Tix installed (for GUI mode) Running it without any option brings up the graphical user interface. 
OPTION: ======= --version show program's version number and exit -h, --help show this help message and exit -l, --list list all supported fonts -c, --codec list all supported codecs for text files -o, --oldfont convert from unicode to old fonts (legacy) -e codec, --encoding=codec codec for the input file, default is 'cp1252' -f fontname, --font=fontname fontname for output encoding, default is 'abc-zwsp' -s value, --size=value force the program to use specific size for khmer font -t, --timer print the needed time for the conversion IMPORTANT NOTE: If the result of the converted odt file (OpenOffice.org Writer) is not correct, there might be problems with complex formatting that breaks Khmer clusters. You might have to follow these steps to solve the problem: 1. Select the section that is not correct, then choose default formatting: (a) "right clicking" or (b) "format menu" then "default formatting" 2. Change the selection to your desired font, for instance "Limon S1" 3. Save and try to convert again. IMPORTANT CHANGES: ================= 1. Added more KhmerOS fonts: KhmerOScontent.ttf, KhmerOSmuollight.ttf, KhmerOSmuolpali.ttf, KhmerOSsiemreap.ttf. 2. Fixed the bug that when we type in the desired size in spinsize box then click on convert immediately, the size didn't change. khmerconverter v.1.4/__version__.py0000644000175000001440000000016210627154467016244 0ustar k-dausers#!/usr/bin/env python # -*- coding: utf-8 -*- """this file contains the version of KhmerConverter""" ver = "1.4" khmerconverter v.1.4/CHANGELOG.TXT0000644000175000001440000000233310645647015015277 0ustar k-dausers31 July 2007: (release version 1.4) - Changed the focus from using tab to mouse - Add more support for Khmer Unicode fonts, such as Khmer OS Content, Khmer OS Muollight, Khmer OS Muol Pali, Khmer OS Siemreap 30 January 2007: (release version 1.3) - Improved more complex styles of ODT files - The last change in each widget is not used when we click on convert.
- update README file 21 December 2006: (release version 1.2) - Improved fontdata.xml: removing duplicated legacy entries. - Add one test case: test-doublelegacy3.xml - Add more alias names for Khek - Improving complex styles of ODT files 01 December 2006: (release version 1.1) - Conversion of ODT files improved: - Header/footer is now also convertible. - Complex styles of the document are searched and converted. - Add "Khmer OS Battambang" to converter GUI. - improved converterGUI to run with python2.3 01 August 2006: - 4 unittests had already been fixed in the following files: 1. modules/legacyReorder.py 2. modules/legacyConvertHTML.py 3. modules/legacyConvertOdt.py 4. modules/unicodeConvertHTML.py 10 June 2006: - release of version 1.0 khmerconverter v.1.4/create-exe.py0000644000175000001440000000314110627160153015772 0ustar k-dausers#!/usr/bin/python # -*- coding: utf8 -*- # Khmer Legacy to Khmer Unicode Conversion and Vice Versa # (c) 2006 The WordForge Foundation, all rights reserved. # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public License # as published by the Free Software Foundation; either version 2.1 # of the License, or (at your option) any later version. # # See the LICENSE file for more details. # # Developed by: # Hok Kakada (hokkakada@khmeros.info) # Keo Sophon (keosophon@khmeros.info) # San Titvirak (titvirak@khmeros.info) # Seth Chanratha (sethchanratha@khmeros.info) # # This program creates a Plain Text, OpenOffice.org Writer (odt), or HTML file # in Khmer Unicode/Legacy format from Legacy/Unicode input file respectively.
# # create an executable file on Windows # command: python setup.py # Requirements: distutils package and py2exe installer import glob import os import sys from distutils.core import setup import py2exe def files(folder): for path in glob.glob(folder+'/*'): if os.path.isfile(path): yield path data_files=[ ('.', glob.glob(sys.prefix+'/DLLs/tix81*.dll')), ('tcl/tix8.1', files(sys.prefix+'/tcl/tix8.1')), ('tcl/tix8.1/bitmaps', files(sys.prefix+'/tcl/tix8.1/bitmaps')), ('tcl/tix8.1/pref', files(sys.prefix+'/tcl/tix8.1/pref')), ] setup( script_args=['py2exe'], data_files=data_files, windows=[{"script": "khmerconverter.py", "icon_resources": [(1, "converter.ico")]}], packages = ['modules'] ) khmerconverter v.1.4/__init__.py0000644000175000001440000000017010627155170015511 0ustar k-dausers"""Khmer Legacy to Khmer Unicode Conversion and Vice Versa # (c) 2006 The WordForge Foundation, all rights reserved."""