Vaše názori posjelajťe na adresu ludevit @ juls.savba.sk
'''
def form(text='', nfkd='none'):
text = cgi.escape(text)
r = u'''
'''
return r
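# A rough sketch of the kind of form this CGI expects: a 'text' area and an
# optional 'nfkd' checkbox (the field names come from the cgi.FieldStorage
# reads further down, the submit label from the intro text; the markup itself
# is an assumption, not the original):
#   <form method="post" action="">
#     <textarea name="text" rows="10" cols="60"></textarea>
#     <input type="checkbox" name="nfkd" value="1"> NFKD
#     <input type="submit" value="prelo&#382;">
#   </form>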
def format_translation(text):
if not text:
return ''
r = '''
%s
''' % cgi.escape(text).replace('\n', ' ')
return r
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
print "Content-type: text/html" # HTML is following
print # blank line, end of headers
print header()
f = cgi.FieldStorage()
text = f.getfirst("text", "kokot")
do_nfkd = f.getfirst("nfkd", False)
nfkd = 'none'
if do_nfkd:
nfkd = 'hack'
if text:
writelog(text)
try:
text = unicode(text, 'utf-8')[:40000] # safeguard
translator = Translator(tables_ludevit.table_voc, tables_ludevit.table_ort, tables_ludevit.postprocess)
t = translator.translate_text(text, nfkd)
except UnicodeDecodeError:
t = u'''
Text ňebou v UTF-8 koduvaní. Možno váš browser ňepodporuje
UTF-8. Všetki modernje browseri toto podporujú, skúsťe novú versiu. Ibažebi
sťe k stránke ňepristupovali z običajnjeho počítača, ale z dajakjeho
inšjeho zarjaďeňja, napriklad z PDA, kďe browseri často UTF-8 aňi žjadne
Slovenskje písmeni ňepodporujú. To je nám lúto.
'''
print format_translation(t)
else:
t = u'''
Toto je automatickí prekladač textu zo spisovnej Slovenčini do štúrovskej.
Napíšťe krátki text v spisovnom nárečí so správnou diakritikou a klikňiťe na «prelož».
'''
print format_translation(t)
print form(text='', nfkd=nfkd)
print footer()
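# For reference, the translation step above boils down to these calls (a
# minimal standalone sketch using only names that appear in these scripts;
# the sample input string is an arbitrary example):
#
#   from ludevit_trans.translator import Translator
#   from ludevit_trans import tables_ludevit
#   translator = Translator(tables_ludevit.table_voc, tables_ludevit.table_ort, tables_ludevit.postprocess)
#   print translator.translate_text(u'Dobré ráno', 'none').encode('utf-8')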
ludevit-9.2/examples/www-simple/why_nfkd.html
ludevít
Kďe sa zvuki mekko vislovujú takjeto sa zmekčujúcou čjarkou viznačujú, ale
písmeni „d“ a „t“ ju v dobe modernej inakšje označujú, značka táto skoro ako
dlhá čjarka má podobu.
Abi sme historickú vernosť zachovali, normalizujeme tjeto dve písmeni na
unicodovskí „NFKD“ spuosob, to značí že zmekčujúce čjarki sú ako samostatnje
kombinujúce písmeni (combining characters, kombinierende diakritische
Zeichen) reprezentovanje, keď sa s predchádzajúcou písmenou vjažu, v
browseroch zrjedka bíva úplná podpora kombinujúcich písmen, a tak sa často
písmena nad predchádzajúcou ňezmeňená zobrazí, čo vizerá temer ako puovodní
historickí spuosob písaňja. Žjal, mnoho browserou alebo tjeto čjarki zle
zobrazuje, alebo naopak tak ako má čjarku s písmenou skombinuje dobre a ináč
zobrazí, a teda tento spuosob ňje vždi dobrje vísledki dáva.
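In Unicode terms, the page above says that the soft consonants ď and ť are normalized to NFKD form, so the softening mark becomes a separate combining character after the base letter; browsers with incomplete support for combining characters then render it in a way that resembles the historical long-comma diacritic. A small Python illustration (not part of the original page):

import unicodedata
for ch in u'\u010f\u0165':                      # d with caron, t with caron
    print repr(ch), '->', repr(unicodedata.normalize('NFKD', ch))
# u'\u010f' -> u'd\u030c'   (d + U+030C COMBINING CARON)
# u'\u0165' -> u't\u030c'   (t + U+030C COMBINING CARON)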
ludevit-9.2/examples/www/converthtml.py
#!/usr/bin/python
import sys, re, urlparse, urllib
class NullParser:
"just copies data"
def __init__(self):
self.output_buffer = ''
def feed(self, data):
self.output_buffer += data
def close(self):
pass
def pull(self):
r = self.output_buffer
self.output_buffer = ''
return r
class BasicParser:
def __init__(self):
self.in_tag = False
self.current_tag = ''
self.current_data = ''
def feed(self, data):
self.process(data)
def process(self, txt):
for c in txt:
if self.in_tag:
self.current_tag += c
if c=='>':
self.process_tag(self.current_tag)
self.in_tag = False
self.current_tag = ''
else:
assert not self.current_tag
if c != '<':
self.current_data += c
else:
self.process_data(self.current_data)
self.in_tag = True
self.current_data = ''
self.current_tag = c # i.e., <
def close(self):
        if self.in_tag: # open < at the end of the document
assert self.current_tag
self.process_tag(self.current_tag)
else:
self.process_data(self.current_data)
def process_tag(self, tagstr):
"to be subclassed"
return tagstr
def process_data(self, datastr):
"to be subclassed"
return datastr
class CopyParser(BasicParser):
def __init__(self):
self.output_buffer = ''
BasicParser.__init__(self)
def process_tag(self, tagstr):
"to be subclassed"
self.output_buffer += tagstr
def process_data(self, datastr):
"to be subclassed"
self.output_buffer += datastr
def pull(self):
r = self.output_buffer
self.output_buffer = ''
return r
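# A tiny usage sketch (illustration only): CopyParser splits the stream into
# tags and data and reassembles it unchanged, so pull() returns the input.
#   p = CopyParser()
#   p.feed('<p>hello</p> <b>world')
#   p.close()
#   assert p.pull() == '<p>hello</p> <b>world'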
class CopyAndModifyParser(CopyParser):
def __init__(self):
self.in_script = False
self.in_style = False
CopyParser.__init__(self)
    def process_tag(self, tagstr):
        newtag = self.modify_tag(tagstr)
        self.output_buffer += newtag
        if re.match(r'(?is)<style(\s|>)', tagstr):
            self.in_style = True
        elif re.match(r'(?is)</style(\s|>)', tagstr):
            self.in_style = False
        elif re.match(r'(?is)<script(\s|>)', tagstr):
            self.in_script = True
        elif re.match(r'(?is)</script(\s|>)', tagstr):
            self.in_script = False
    def process_data(self, datastr):
        if self.in_script or self.in_style:
            # do not modify data inside <script> or <style> tags...
            newdata = datastr
        else:
            newdata = self.modify_data(datastr)
        self.output_buffer += newdata
def modify_tag(self, tagstr):
"to be subclassed"
return tagstr
def modify_data(self, datastr):
"to be subclassed"
return datastr
# inspired by feedparser by Mark Pilgrim
relative_uris = {
'a': ('href',),
'applet': ('codebase',),
'area': ('href',),
'blockquote': ('cite',),
'body': ('background',),
'del': ('cite',),
'form': ('action',),
'frame': ('longdesc', 'src'),
'iframe': ('longdesc', 'src'),
'head': ('profile',),
'img': ('longdesc', 'src', 'usemap'),
'input': ('src', 'usemap'),
'ins': ('cite',),
'link': ('href',),
'object': ('classid', 'codebase', 'data', 'usemap'),
'q': ('cite',),
'script': ('src',)
}
_urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
def _urljoin(base, uri):
uri = _urifixer.sub(r'\1\3', uri)
return urlparse.urljoin(base, uri)
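# e.g. _urljoin('http://www.sme.sk/sekcia/', 'clanok.html') gives
# 'http://www.sme.sk/sekcia/clanok.html'; _urifixer only strips stray slashes
# right after the scheme ('http:////host/x' -> 'http://host/x')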
def get_uri_tag_value(tagstr, k):
"try to get value of 'k' attribute from a given html tag"
m = re.search(r"""\b%s\=['"](.*?)['"]""" % k, tagstr, re.I+re.S)
if not m: # not in quotes? hmm...
m = re.search(r"""\b%s\=(.*?)\s(?=\>)""" % k, tagstr, re.I+re.S)
if not m: # nothing found
return None
return m.start(), m.end(), m.group(1)
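# e.g. get_uri_tag_value('<a href="clanok.html">', 'href') returns a
# (start, end, 'clanok.html') triple spanning the href attribute inside the
# tag string; it returns None when the attribute is not present at all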
class ModifyHrefParser(CopyAndModifyParser):
# also rewrite href's to go through our cgi script
def __init__(self, cgi_url, base_url):
self.cgi_url = cgi_url
self.base_url = base_url
CopyAndModifyParser.__init__(self)
def modify_tag(self, tagstr):
"NOT to be subclassed"
if not self.cgi_url:
return tagstr
m = re.search(r'<([A-Za-z]+?)\b', tagstr, re.S)
if m:
tag = m.group(1)
if tag.lower()=='a':
spanval = get_uri_tag_value(tagstr, 'href')
if spanval: # found, we need to replace the reference
start, end, val = spanval
url = _urljoin(self.base_url, val)
newval = self.cgi_url+'?'+'url='+urllib.quote_plus(url, safe='')
tagstr = tagstr[:start]+''+'href'+'="'+newval+'"'+tagstr[end:]
return tagstr
class MyParser(ModifyHrefParser):
def modify_data(self, datastr):
return re.sub('[a-z]', 'a', datastr)
if __name__=='__main__':
filehandle = open('a.html', 'r')
    parser = MyParser(cgi_url='', base_url='http://www.sme.sk') # empty cgi_url, so hrefs are left unrewritten in this test
while True:
data = filehandle.read(1000) # read in the data in chunks
if not data: break # we've reached the end of the file - python could do with a do:...while syntax...
parser.feed(data)
        sys.stdout.write(parser.pull()) # you can output data whilst processing by calling pull() after each feed()
    #processedfile = parser.close() # or all in one go using close
    parser.close() # even when pulling incrementally you still need a final close
sys.stdout.write( parser.pull())
filehandle.close()
ludevit-9.2/examples/www/fetch.py
#!/usr/bin/python
import urllib2
import re
import socket
from htmlentitydefs import name2codepoint
# one minute default timeout for everything - ugly, but urllib2 does not expose timeout API for sockets...
socket.setdefaulttimeout(60)
from ludevit_trans.translator import Translator
from ludevit_trans import tables_ludevit
from converthtml import ModifyHrefParser, NullParser
# read page in chunks of this size
CHUNKSIZE = 5000
# size of the first chunk, used to guess charset and add base url
FIRSTCHUNKSIZE = 5000
BASE_CGI='http://www.juls.savba.sk/ludevit/'
def _replace_entity(m):
s = m.group(1)
if s[0] == u'#':
s = s[1:]
try:
if s[0] in u'xX':
c = int(s[1:], 16)
else:
c = int(s)
return unichr(c)
except ValueError:
return m.group(0)
else:
try:
return unichr(name2codepoint[s])
except (ValueError, KeyError):
return m.group(0)
_entity_re = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
def unescape_entities(s):
return _entity_re.sub(_replace_entity, s)
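# e.g. unescape_entities(u'&amp; &#233; &eacute; &#xE9;') == u'& \xe9 \xe9 \xe9'
# (named, decimal and hexadecimal references are handled; anything unknown is
# left untouched, see the fallbacks returning m.group(0) above)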
class LudevitParser(ModifyHrefParser):
def __init__(self, encoding, cgi_url, base_url):
self.encoding = encoding
ModifyHrefParser.__init__(self, cgi_url, base_url)
self.translator = Translator(tables_ludevit.table_voc, tables_ludevit.table_ort, tables_ludevit.postprocess)
def modify_data(self, datastr):
# re-encoding here is slow, as compared with encoding the whole chunk
# before feeding it to the parser
# however, this deals better with the rare case of non-ascii characters
# in href's URLs...
txt = datastr.decode(self.encoding, 'replace')
txt = unescape_entities(txt)
tran = self.translator.translate_text(txt)
tran = tran.encode(self.encoding, 'xmlcharrefreplace')
return tran
def guess_charset_from_meta(txt):
charset = None
m = re.search(r'meta\s*http-equiv\="Content-Type"\s*content\="text/html;\s*charset\=(.+?)"', txt, re.I+re.S)
if m:
charset = m.group(1)
return charset
def guess_if_base(txt):
"find out if there is a BASE URL in the html page"
return re.search(r'base\s*href\=', txt, re.I+re.S)
def report_error(text):
headers = 'Content-Type: text/plain\r\n'
body = 'An error has occurred: ' + text +'\nOops.\n'
return headers, body
def prepare_page(url, user_agent):
"open url, read some bytes (to guess charset)"
do_add_base_url = False
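    # returns a 6-tuple (do_translate, charset, headers, first_chunk,
    # opened urllib2 response or None, do_add_base_url), unpacked in
    # translate_page() below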
# sanity checks
if len(url)>512:
charset = 'us-ascii'
headers, first_chunk = report_error('Overlong URL')
f = None
return False, charset, headers, first_chunk, f, do_add_base_url
protocol = url[:10]
if ':' not in protocol:
charset = 'us-ascii'
headers, first_chunk = report_error('Invalid protocol')
f = None
return False, charset, headers, first_chunk, f, do_add_base_url
protocol = protocol.split(':')[0]
protocol = protocol.lower()
if protocol not in ['http', 'https', 'ftp', 'gopher']:
charset = 'us-ascii'
headers, first_chunk = report_error('Unsupported protocol')
f = None
return False, charset, headers, first_chunk, f, do_add_base_url
req = urllib2.Request(url)
req.add_header('User-Agent', user_agent)
try:
f = urllib2.urlopen(req)
except urllib2.HTTPError:
# redir to original url
headers = 'Location: %s\r\n' % url
charset = 'us-ascii'
first_chunk = ''
f = None
return False, charset, headers, first_chunk, f, do_add_base_url
except urllib2.URLError, exc:
charset = 'us-ascii'
headers, first_chunk = report_error(str(exc))
f = None
return False, charset, headers, first_chunk, f, do_add_base_url
resp_info = f.info()
del resp_info['Content-Length']
headers = ''.join(resp_info.headers)
ct = f.info().get('Content-Type', '')
do_translate = ct.lower().startswith('text')
if not do_translate:
return False, None, headers, '', f, do_add_base_url
charset_from_headers = None
if ct:
fields = ct.split(';')
for field in fields:
fs = field.strip()
if fs.lower().startswith('charset='):
charset_from_headers = fs[len('charset='):].strip().lower()
break
first_chunk = f.read(FIRSTCHUNKSIZE)
charset_from_meta = guess_charset_from_meta(first_chunk)
charset = charset_from_headers or charset_from_meta
# if charset_from_meta and charset_from_headers and (charset_from_meta != charset_from_headers):
    # we should honour the charset from the headers, as per the http standard
# this code was clever, but e.g. www.nku.gov.sk fails the test
#if 'windows-1250' in [charset_from_meta, charset_from_headers]:
# charset = 'windows-1250'
#elif 'iso-8859-2' in [charset_from_meta, charset_from_headers]:
# charset = 'iso-8859-2'
# there could be cp852 or MacRoman2 test here, but who uses such encodings nowadays?
# some do, but let's assume they do not differ in headers and meta...
# else:
# charset = charset_from_meta # not standard conforming, but probably better
# fallback, if everything failed
if not charset:
charset = 'windows-1250'
if ct.lower().startswith('text/html') and not guess_if_base(first_chunk): # if a base url is present in the original html, do not add another one...
do_add_base_url = True
return True, charset, headers, first_chunk, f, do_add_base_url
def add_base_url(chunk, base):
    "try to find the <head> element and add a <base> tag into it"
    bastag = '<base href="%s">'%base
    headtag = '<head>'+bastag+'</head>'
    if re.search(r'<head.*?>', chunk, re.I+re.S):
        r = re.sub(r'(?i)(<head.*?>)', r'\1'+bastag, chunk)
    # try to add head
    elif re.search(r'<html.*?>', chunk, re.I+re.S):
        r = re.sub(r'(?i)(<html.*?>)', r'\1'+headtag, chunk)
    elif chunk.startswith('<!DOCTYPE') or chunk.startswith('<!doctype'):
        r = re.sub(r'(?i)(<!DOCTYPE.*?>)', r'\1'+headtag, chunk)
    else:
        # no <head>, no <html>, ho ... just add it
        r = headtag+chunk
    return r
def translate_page(url, user_agent):
pp = prepare_page(url, user_agent)
do_translate, charset, headers, chunk, f, do_add_base_url = pp
yield headers
yield '\r\n'
if do_translate:
if do_add_base_url:
# use the page as base url
base = f.geturl() # in case of redirect
chunk = add_base_url(chunk, base)
else:
base = ''
parser = LudevitParser(charset, BASE_CGI, base)
else:
parser = NullParser()
    while f: # f could be None to signal that the url was not successfully opened
newchunk = f.read(CHUNKSIZE)
if not newchunk:
break
chunk += newchunk
# we have to be careful not to tear utf-8 characters apart...
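        # only feed the parser once the buffer ends in an ASCII byte; a trailing
        # byte >= 0x80 could be the middle of a multi-byte UTF-8 sequence, so it
        # is kept until the next read completes it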
if ord(chunk[-1])<128:
parser.feed(chunk)
chunk = ''
yield parser.pull()
parser.feed(chunk)
parser.close()
yield parser.pull()
if __name__=='__main__':
import sys
url = sys.argv[1]
user_agent = sys.argv[2]
for c in translate_page(url, user_agent):
sys.stdout.write(c)
ludevit-9.2/examples/www/index.cgi
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import cgi, sys, codecs, time, random, os
from ludevit_trans.translator import Translator
from ludevit_trans import tables_ludevit
from fetch import translate_page
logdir='/var/log/ludevit/'
def writelog(text):
"write text to a logfile, text is a plain 8-bit string, not unicode"
if not logdir or not text:
return
remote_addr = os.environ.get('REMOTE_ADDR', '')
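    # one file per request: <logdir>/YYYY-MM-DD/YYYYMMDD_HHMMSS_<two hex digits>,
    # the first line is the client address (REMOTE_ADDR), the rest is the text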
logdir_now = os.path.join(logdir, time.strftime('%Y-%m-%d', time.gmtime()))
try:
if not os.path.exists(logdir_now):
os.makedirs(logdir_now)
fname = time.strftime('%Y%m%d_%H%M%S', time.gmtime())+'_%02x'%random.randint(0,0xff)
fname = os.path.join(logdir_now, fname)
f = file(fname, 'w')
f.write(remote_addr+'\n')
f.write(text)
f.close()
except (IOError, OSError):
pass
return
def get_user_agent():
agent = os.environ.get('HTTP_USER_AGENT', 'Speccy/82 [en] (ZX Spectrum; U)')
return agent
def header():
r = u'''
ludevít
'''
return r
def footer():
return u'''