58 lines
1.6 KiB
Python
58 lines
1.6 KiB
Python
|
# -*- coding: utf-8 -*-
|
||
|
|
||
|
import re

try:  # Python 2
    import htmlentitydefs
except ImportError:  # Python 3 moved the entity tables into html.entities
    import html.entities as htmlentitydefs
|
||
|
|
||
|
# Pre-compiled patterns used by the text-cleaning code below.
# Raw string literals keep regex escapes such as \s and \w from being read
# as (invalid) string escapes by the Python parser.
RE = {
    # Two or more consecutive spaces.
    'space': re.compile(r'[ ]{2,}', re.U | re.S),
    # Two or more consecutive newlines.
    'cl': re.compile(r'[\n]{2,}', re.U | re.S),
    # A line-break tag: <br>, <br/>, < br / >.  HTML tag names are
    # case-insensitive, so <BR> must be recognised as well.
    'br': re.compile(r'<\s*br[\s/]*>', re.U | re.S | re.I),
    # An opening tag, tag-free text, and a closing tag.
    'inner': re.compile(r'<[^>]*>[^<]+<\s*/[^>]*>', re.U | re.S),
    # Any single tag.
    'html': re.compile(r'<[^>]*>', re.U | re.S),
    # A named (&amp;), decimal (&#38;) or hexadecimal (&#x26;) reference.
    'entity': re.compile(r'&#?\w+;', re.U),
}
|
||
|
|
||
|
# Characters to substitute in cleaned text, mapped to their replacements.
# Both sides are unicode literals: on Python 2 a byte-string key holding a
# non-ASCII character cannot be passed to unicode.replace() without an
# implicit ASCII decode, which raises UnicodeDecodeError.
UNSUPPORT = {
    u'\u2014': u'-',  # EM DASH -> HYPHEN-MINUS
}
|
||
|
|
||
|
|
||
|
try:
    _unichr = unichr  # Python 2: code point -> unicode character
except NameError:
    _unichr = chr  # Python 3: chr() already returns text


class Clear(object):
    """Turn HTML fragments into plain text.

    The methods rely on the module-level ``RE`` pattern table and the
    ``UNSUPPORT`` character-replacement table.
    """

    def text(self, text, inner=False):
        """Return *text* with markup removed, keeping line breaks.

        Steps, in order: apply the ``UNSUPPORT`` replacements and turn
        carriage returns into newlines; replace ``RE['br']`` matches with a
        newline; when *inner* is true, delete every ``RE['inner']`` match
        (an opening tag, its tag-free text and a closing tag); delete all
        remaining tags; decode character references via :meth:`char`;
        collapse runs of spaces and runs of newlines to a single one; strip
        leading and trailing whitespace.
        """
        text = self._unsupport(text).replace(u'\r', u'\n')
        text = RE['br'].sub(u'\n', text)
        if inner:
            # Must run before the generic tag removal, otherwise the tags
            # that delimit the text to drop are already gone.
            text = RE['inner'].sub(u'', text)
        text = RE['html'].sub(u'', text)
        text = self.char(text)
        text = RE['space'].sub(u' ', text)
        return RE['cl'].sub(u'\n', text).strip()

    def string(self, text, space=u''):
        """Return :meth:`text` output on one line.

        Every newline left in the cleaned text is replaced by *space*
        (empty by default) and the result is stripped again.
        """
        return self.text(text).replace(u'\n', space).strip()

    def char(self, text):
        """Decode character references in *text* and apply ``UNSUPPORT``.

        The replacements are applied before decoding (as before) and once
        more afterwards, so that a character produced by a reference such
        as ``&mdash;`` or ``&#8212;`` is substituted as well.
        """
        decoded = RE['entity'].sub(self._unescape, self._unsupport(text))
        return self._unsupport(decoded)

    def _unsupport(self, text):
        """Return *text* with every ``UNSUPPORT`` key replaced by its value."""
        # items() exists on both Python 2 and 3 (iteritems() is 2-only).
        for tag, value in UNSUPPORT.items():
            text = text.replace(tag, value)
        return text

    def _unescape(self, m):
        """Return the character for one ``RE['entity']`` match *m*.

        Handles decimal (``&#65;``), hexadecimal (``&#x41;`` / ``&#X41;``)
        and named (``&amp;``) references.  A reference that cannot be
        decoded is returned unchanged.
        """
        text = m.group(0)
        if text[:2] == u"&#":
            digits, base = text[2:-1], 10
            if digits[:1] in (u"x", u"X"):
                digits, base = digits[1:], 16
            try:
                return _unichr(int(digits, base))
            except (ValueError, OverflowError):
                # ValueError: not a number or outside the code point range;
                # OverflowError: number too large for the C-level conversion.
                return text
        try:
            return _unichr(htmlentitydefs.name2codepoint[text[1:-1]])
        except KeyError:
            # Unknown entity name: leave the reference as written.
            return text
|