plugin.video.torrenter/resources/scrapers/html.py

58 lines
1.6 KiB
Python
Raw Permalink Normal View History

2015-01-09 14:11:21 +03:00
# -*- coding: utf-8 -*-
import re
import htmlentitydefs
# Pre-compiled patterns used by Clear, keyed by purpose.
RE = {
    # Run of two or more spaces.
    'space': re.compile(r'[ ]{2,}', re.U | re.S),
    # Run of two or more newlines.
    'cl': re.compile(r'\n{2,}', re.U | re.S),
    # <br>, <br/>, < br /> ...; HTML tag names are case-insensitive, so
    # <BR> must be recognised as a line break as well.
    'br': re.compile(r'<\s*br[\s/]*>', re.U | re.S | re.I),
    # A tag, followed by non-empty text containing no '<', followed by a
    # closing tag (the tag names are not required to match).
    'inner': re.compile(r'<[^>]*>[^<]+<\s*/[^>]*>', re.U | re.S),
    # Any single tag.
    'html': re.compile(r'<[^>]*>', re.U | re.S),
    # Named (&amp;) or numeric (&#38; / &#x26;) character reference.
    'entity': re.compile(r'&#?\w+;', re.U),
}
# Literal substrings that Clear._unsupport replaces with plain text before
# the generic character-reference decoding runs.
# NOTE(review): decoded numerically, &#151; would become U+0097; it is
# presumably meant as the Windows-1252 em dash, hence the '-' -- confirm.
UNSUPPORT = {'&#151;': '-'}
class Clear:
    """Turn HTML fragments into plain text.

    Uses the module-level ``RE`` pattern table and the ``UNSUPPORT``
    replacement table.
    """

    def text(self, text, inner=False):
        """Strip markup from *text* and return plain, possibly multi-line text.

        Carriage returns and ``<br>`` tags (as matched by ``RE['br']``)
        become newlines, remaining tags are removed, character references
        are decoded, and runs of spaces or of newlines are collapsed to one.

        :param text: HTML fragment; the ``u''`` literals used here suggest a
            unicode string is expected.
        :param inner: when true, everything matched by ``RE['inner']`` (a
            tag, its text and a closing tag) is removed before the remaining
            tags are stripped.
        :return: the cleaned text with leading/trailing whitespace stripped.
        """
        text = self._unsupport(text).replace(u'\r', u'\n')
        text = RE['br'].sub(u'\n', text)
        if inner:
            text = RE['inner'].sub(u'', text)
        text = RE['html'].sub(u'', text)
        # References are decoded only after the tags are gone, so escaped
        # markup such as "&lt;b&gt;" ends up as literal text, not as a tag.
        text = self.char(text)
        text = RE['space'].sub(u' ', text)
        return RE['cl'].sub(u'\n', text).strip()

    def string(self, text, space=u''):
        """Return ``self.text(text)`` flattened onto a single line.

        :param text: HTML fragment, as for :meth:`text`.
        :param space: replacement for each remaining newline; the default
            removes newlines without inserting anything.
        :return: the flattened text, stripped of surrounding whitespace.
        """
        return self.text(text).replace(u'\n', space).strip()

    def char(self, text):
        """Decode the character references found in *text*.

        ``UNSUPPORT`` replacements are applied first; every ``RE['entity']``
        match is then passed through :meth:`_unescape`.
        """
        return RE['entity'].sub(self._unescape, self._unsupport(text))

    def _unsupport(self, text):
        """Replace every ``UNSUPPORT`` key occurring in *text* by its value."""
        for tag, value in UNSUPPORT.iteritems():
            text = text.replace(tag, value)
        return text

    def _unescape(self, m):
        """Regex callback: return the character for one matched reference.

        :param m: match object produced by ``RE['entity']``.
        :return: the decoded character, or the matched text unchanged when
            the name is unknown or the number is malformed or out of range.
        """
        ref = m.group(0)
        if ref[:2] == u"&#":
            try:
                # The hexadecimal marker may be written "x" or "X".
                if ref[:3].lower() == u"&#x":
                    return unichr(int(ref[3:-1], 16))
                return unichr(int(ref[2:-1]))
            except (ValueError, OverflowError):
                # ValueError: not a number, or outside the range unichr()
                # accepts. OverflowError: too large for a C integer. In both
                # cases the reference is left as written instead of
                # aborting the whole clean-up.
                return ref
        try:
            return unichr(htmlentitydefs.name2codepoint[ref[1:-1]])
        except KeyError:
            # Unknown entity name: keep the original text.
            return ref