289 lines
7.5 KiB
Python
289 lines
7.5 KiB
Python
|
# -*- coding: utf-8 -*-
|
|||
|
# -*- test-case-name: pytils.test.test_translit -*-
|
|||
|
# pytils - simple processing for russian strings
|
|||
|
# Copyright (C) 2006-2007 Yury Yurevich
|
|||
|
#
|
|||
|
# http://www.pyobject.ru/projects/pytils/
|
|||
|
#
|
|||
|
# This program is free software; you can redistribute it and/or
|
|||
|
# modify it under the terms of the GNU General Public License
|
|||
|
# as published by the Free Software Foundation, version 2
|
|||
|
# of the License.
|
|||
|
#
|
|||
|
# This program is distributed in the hope that it will be useful,
|
|||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|||
|
# GNU General Public License for more details.
|
|||
|
"""
|
|||
|
Simple transliteration
|
|||
|
"""
|
|||
|
|
|||
|
# __id__ = __revision__ = "$Id: translit.py 102 2007-07-12 12:33:36Z the.pythy $"
|
|||
|
#__url__ = "$URL: https://pythy.googlecode.com/svn/tags/pytils/0_2_2/pytils/translit.py $"
|
|||
|
|
|||
|
import re
|
|||
|
import sys
|
|||
|
|
|||
|
|
|||
|
# Ordered table of (source, transliteration) pairs.
#
# translify() replaces the first element of every pair with the second one,
# detranslify() replaces the second element with the first one.  Both walk
# the table from top to bottom and apply the replacements one after another,
# so the order of the entries matters: longer Latin combinations are listed
# before the single letters they contain.
TRANSTABLE = (
    # punctuation and typographic symbols
    (u"'", u"'"),
    (u'"', u'"'),
    (u"‘", u"'"),
    (u"’", u"'"),
    (u"«", u'"'),
    (u"»", u'"'),
    (u"–", u"-"),
    (u"…", u"..."),
    (u"№", u"#"),
    ## upper case
    # three-letter replacements
    (u"Щ", u"Sch"),
    # when translating Russian->English the first replacement is used,
    # i.e. Sch
    # but when translating English->Russian both the SCH and the Sch
    # variants are accepted
    (u"Щ", u"SCH"),
    # two-letter replacements
    (u"Ё", u"Yo"),
    (u"Ё", u"YO"),
    (u"Ж", u"Zh"),
    (u"Ж", u"ZH"),
    (u"Ц", u"Ts"),
    (u"Ц", u"TS"),
    (u"Ч", u"Ch"),
    (u"Ч", u"CH"),
    (u"Ш", u"Sh"),
    (u"Ш", u"SH"),
    (u"Ы", u"Yi"),
    (u"Ы", u"YI"),
    (u"Ю", u"Yu"),
    (u"Ю", u"YU"),
    (u"Я", u"Ya"),
    (u"Я", u"YA"),
    (u"ИЙ", u"IY"),
    # single-letter replacements
    (u"А", u"A"),
    (u"Б", u"B"),
    (u"В", u"V"),
    (u"Г", u"G"),
    (u"Д", u"D"),
    (u"Е", u"E"),
    (u"З", u"Z"),
    (u"И", u"I"),
    (u"Й", u"J"),
    (u"К", u"K"),
    (u"Л", u"L"),
    (u"М", u"M"),
    (u"Н", u"N"),
    (u"О", u"O"),
    (u"П", u"P"),
    (u"Р", u"R"),
    (u"С", u"S"),
    (u"Т", u"T"),
    (u"У", u"U"),
    (u"Ф", u"F"),
    (u"Х", u"H"),
    (u"Э", u"E"),
    (u"Ъ", u"`"),
    (u"Ы", u"Y"),
    (u"Ь", u"'"),
    ## lower case
    # three-letter replacements
    (u"щ", u"sch"),
    # two-letter replacements
    (u"ё", u"yo"),
    (u"ж", u"zh"),
    (u"ц", u"ts"),
    (u"ч", u"ch"),
    (u"ш", u"sh"),
    (u"ы", u"yi"),
    (u"ю", u"yu"),
    (u"я", u"ya"),
    (u"я", u"ja"),
    (u"ий", u"iy"),
    # single-letter replacements
    (u"а", u"a"),
    (u"б", u"b"),
    (u"в", u"v"),
    (u"г", u"g"),
    (u"д", u"d"),
    (u"е", u"e"),
    (u"з", u"z"),
    (u"и", u"i"),
    (u"й", u"j"),
    (u"к", u"k"),
    (u"л", u"l"),
    (u"м", u"m"),
    (u"н", u"n"),
    (u"о", u"o"),
    (u"п", u"p"),
    (u"р", u"r"),
    (u"с", u"s"),
    (u"т", u"t"),
    (u"у", u"u"),
    (u"ф", u"f"),
    (u"х", u"h"),
    (u"э", u"e"),
    (u"ъ", u"`"),
    (u"ь", u"'"),
    # to make the English alphabet complete (it is used by slugify),
    # add the English letters and the digits that are not covered
    # by the pairs above; they map to themselves
    (u"c", u"c"),
    (u"q", u"q"),
    (u"y", u"y"),
    (u"x", u"x"),
    (u"w", u"w"),
    (u"1", u"1"),
    (u"2", u"2"),
    (u"3", u"3"),
    (u"4", u"4"),
    (u"5", u"5"),
    (u"6", u"6"),
    (u"7", u"7"),
    (u"8", u"8"),
    (u"9", u"9"),
    (u"0", u"0"),
    ) #: Translation table

# First elements of all pairs (this also includes the punctuation, the digits
# and the Latin letters that the table maps to themselves).
RU_ALPHABET = [x[0] for x in TRANSTABLE] #: Russian alphabet that we can translate
# Second elements of all pairs, including the multi-letter combinations.
EN_ALPHABET = [x[1] for x in TRANSTABLE] #: English alphabet that we can detransliterate
# Concatenation of both lists; slugify keeps only characters found here.
ALPHABET = RU_ALPHABET + EN_ALPHABET #: Alphabet that we can (de)transliterate
|
|||
|
|
|||
|
|
|||
|
def translify(in_string):
    """
    Transliterate russian text into its latin representation

    Every pair of L{TRANSTABLE} is applied in table order: each
    occurrence of the first element is replaced by the second one.
    The result is then converted to a plain C{str}.

    @param in_string: input string
    @type in_string: C{unicode}

    @return: transliterated string
    @rtype: C{str}

    @raise TypeError: when in_string is not C{unicode}
    @raise ValueError: when string doesn't transliterate completely
    """
    if not isinstance(in_string, unicode):
        raise TypeError("Argument must be unicode, not %s" % type(in_string))

    result = in_string
    for source, replacement in TRANSTABLE:
        result = result.replace(source, replacement)

    # str() fails if any character outside of the table (and outside of
    # ascii) survived the replacements above
    try:
        return str(result)
    except UnicodeEncodeError:
        raise ValueError("Unicode string doesn't transliterate completely, "
                         "is it russian?")
|
|||
|
|
|||
|
|
|||
|
def detranslify(in_string):
    """
    Detranslify: convert transliterated (latin) text back to russian

    The input is converted to C{unicode} first, then every pair of
    L{TRANSTABLE} is applied in table order: each occurrence of the
    second element of the pair is replaced by the first one.

    @param in_string: input string
    @type in_string: C{basestring}

    @return: detransliterated string
    @rtype: C{unicode}

    @raise TypeError: when in_string neither C{str}, no C{unicode}
    @raise ValueError: if in_string is C{str}, but it isn't ascii
    """
    if not isinstance(in_string, basestring):
        raise TypeError("Argument must be basestring, not %s" % type(in_string))

    # convert to unicode
    try:
        text = unicode(in_string)
    except UnicodeDecodeError:
        raise ValueError("We expects when in_string is str type,"
                         "it is an ascii, but now it isn't. Use unicode "
                         "in this case.")

    # the table is stored as (russian, latin), here it is walked backwards
    for cyrillic, latin in TRANSTABLE:
        text = text.replace(latin, cyrillic)

    return text
|
|||
|
|
|||
|
|
|||
|
def slugify(in_string):
    """
    Prepare string for slug (i.e. URL or file/dir name)

    The string is lower-cased, ampersands are spelled out as "and",
    runs of whitespace and hyphens are collapsed into a single hyphen,
    characters that are not in L{ALPHABET} are dropped and the rest is
    transliterated with L{translify}.

    @param in_string: input string
    @type in_string: C{basestring}

    @return: slug-string
    @rtype: C{str}

    @raise TypeError: when in_string isn't C{unicode} or C{str}
    @raise ValueError: if in_string is C{str}, but it isn't ascii
    """
    if not isinstance(in_string, basestring):
        raise TypeError("Argument must be basestring, not %s" % type(in_string))
    try:
        u_in_string = unicode(in_string).lower()
    except UnicodeDecodeError:
        raise ValueError("We expects when in_string is str type,"
                         "it is an ascii, but now it isn't. Use unicode "
                         "in this case.")
    # convert & to "and"
    # NOTE(review): the first alternative matches the literal text "&;",
    # which looks like an HTML-entity-decoded "&amp;" -- confirm against
    # the upstream source before changing the pattern.
    u_in_string = re.sub(r'\&\;|\&', ' and ', u_in_string)
    # replace spaces by hyphen
    u_in_string = re.sub(r'[-\s]+', '-', u_in_string)
    # remove symbols that not in alphabet
    u_in_string = u''.join([symb for symb in u_in_string if symb in ALPHABET])
    # translify it
    out_string = translify(u_in_string)
    # remove non-alpha (quotes, backticks, dots and "#" produced by the table)
    return re.sub(r'[^\w\s-]', '', out_string).strip().lower()
|
|||
|
|
|||
|
|
|||
|
def dirify(in_string):
    """
    Alias for L{slugify}

    @param in_string: input string
    @type in_string: C{basestring}

    @return: slug-string
    @rtype: C{str}

    @raise TypeError: when in_string isn't C{unicode} or C{str}
    @raise ValueError: if in_string is C{str}, but it isn't ascii
    """
    # hand the slug back to the caller; without the return statement
    # the alias would always evaluate to None
    return slugify(in_string)
|
|||
|
|
|||
|
|
|||
|
def provide_unicode(stext, encoding, default=u"неизвестно"):
|
|||
|
"""
|
|||
|
Provide Unicode from text
|
|||
|
|
|||
|
@param stext: text
|
|||
|
@type stext: C{str}
|
|||
|
|
|||
|
@param encoding: encoding if input text
|
|||
|
@type encoding: C{str}
|
|||
|
|
|||
|
@return: C{unicode}
|
|||
|
"""
|
|||
|
try:
|
|||
|
utext = str(stext).decode(encoding)
|
|||
|
except UnicodeDecodeError, err:
|
|||
|
utext = default % {'error': err, 'value': u""}
|
|||
|
return utext
|
|||
|
|
|||
|
|
|||
|
def provide_str(utext, encoding, default="unknown"):
|
|||
|
"""
|
|||
|
Provide text from Unicode
|
|||
|
|
|||
|
@param utext: unicode text
|
|||
|
@type utext: C{unicode}
|
|||
|
|
|||
|
@param encoding: encoding of output text
|
|||
|
@type encoding: C{str}
|
|||
|
|
|||
|
@return: C{str}
|
|||
|
"""
|
|||
|
try:
|
|||
|
stext = unicode(utext).encode(encoding)
|
|||
|
except UnicodeEncodeError, err:
|
|||
|
stext = default % {'error': err, 'value': ""}
|
|||
|
return stext
|