plugin.video.torrenter/resources/scrapers/html.py

# -*- coding: utf-8 -*-

import re
import htmlentitydefs

# Precompiled patterns used by Clear to reduce an HTML fragment to plain text.
# Raw string literals keep the regex escapes (\s, \w) away from Python's own
# string-escape processing.
RE = {
    # Run of two or more spaces.
    'space': re.compile(r'[ ]{2,}', re.U | re.S),
    # Run of two or more newlines.
    'cl': re.compile(r'[\n]{2,}', re.U | re.S),
    # <br>, <br/>, < br / > ...  HTML tag names are case-insensitive, so
    # <BR> has to be recognised as a line break too.
    'br': re.compile(r'<\s*br[\s/]*>', re.U | re.S | re.I),
    # A tag, the tag-free text that follows it, and the closing tag after
    # that text.
    'inner': re.compile(r'<[^>]*>[^<]+<\s*/[^>]*>', re.U | re.S),
    # Any single tag.
    'html': re.compile(r'<[^>]*>', re.U | re.S),
    # Named (&amp;), decimal (&#38;) or hexadecimal (&#x26;) reference.
    'entity': re.compile(r'&#?\w+;', re.U),
}

# Literal substrings that are replaced before any entity decoding happens.
# NOTE(review): 151 is the em dash slot in Windows-1252 but a control
# character in Unicode, which is presumably why it is mapped to a plain
# hyphen here -- confirm.
UNSUPPORT = {
    '&#151;': '-',
}


class Clear:
    """Convert HTML fragments into plain text.

    The methods rely on the module-level ``RE`` patterns and the
    ``UNSUPPORT`` replacement table; no instance state is used.
    """

    def text(self, text, inner=False):
        """Return ``text`` with markup removed and references decoded.

        Carriage returns and ``<br>`` tags become newlines, the remaining
        tags are dropped, character references are decoded, and runs of
        spaces or newlines are collapsed to a single one.

        :param text: HTML fragment to clean.
        :param inner: when true, matches of ``RE['inner']`` (a tag, its
            text and the following closing tag) are removed before the
            remaining tags are stripped.
        :return: the cleaned text without leading/trailing whitespace.
        """
        text = self._unsupport(text).replace(u'\r', u'\n')
        text = RE['br'].sub(u'\n', text)
        if inner:
            text = RE['inner'].sub(u'', text)
        text = RE['html'].sub(u'', text)
        # Decode references only after the tags are gone, so that a decoded
        # "&lt;" is never mistaken for the start of a tag.
        text = self.char(text)
        text = RE['space'].sub(u' ', text)
        return RE['cl'].sub(u'\n', text).strip()

    def string(self, text, space=u''):
        """Return the cleaned ``text`` as a single line.

        :param text: HTML fragment to clean (see :meth:`text`).
        :param space: replacement for every newline left after cleaning.
        :return: the cleaned, stripped one-line string.
        """
        return self.text(text).replace(u'\n', space).strip()

    def char(self, text):
        """Decode the character references found in ``text``.

        The ``UNSUPPORT`` replacements are applied first; every match of
        ``RE['entity']`` is then passed through :meth:`_unescape`.
        """
        return RE['entity'].sub(self._unescape, self._unsupport(text))

    def _unsupport(self, text):
        """Apply every literal ``UNSUPPORT`` replacement to ``text``."""
        for tag, value in UNSUPPORT.iteritems():
            text = text.replace(tag, value)
        return text

    def _unescape(self, m):
        """Substitution callback for ``RE['entity']``.

        :param m: match object whose whole match is one ``&...;`` reference.
        :return: the referenced character, or the matched text unchanged
            when the reference is unknown, malformed or out of range.
        """
        text = m.group(0)
        if text[:2] == u"&#":
            # Numeric reference; the hexadecimal marker may be "x" or "X".
            if text[2:3] in (u"x", u"X"):
                digits, base = text[3:-1], 16
            else:
                digits, base = text[2:-1], 10
            try:
                return unichr(int(digits, base))
            except (ValueError, OverflowError):
                # ValueError: not a number, or outside the code point range.
                # OverflowError: too large for unichr() to accept at all.
                return text
        try:
            return unichr(htmlentitydefs.name2codepoint[text[1:-1]])
        except KeyError:
            # Unknown entity name: leave the reference as written.
            return text
lol why? 2015-01-09 14:11:21 +03:00			`# -*- coding: utf-8 -*-`

			`import re`
			`import htmlentitydefs`

			`RE = {`
			`'space': re.compile('[ ]{2,}', re.U \| re.S),`
			`'cl': re.compile('[\n]{2,}', re.U \| re.S),`
			`'br': re.compile('<\s*br[\s/]*>', re.U \| re.S),`
			`'inner': re.compile('<[^>]*>[^<]+<\s*/[^>]*>', re.U \| re.S),`
			`'html': re.compile('<[^>]*>', re.U \| re.S),`
			`'entity': re.compile('&#?\w+;', re.U)`
			`}`

			`UNSUPPORT = {`
			`'&#151;': '-'`
			`}`


			`class Clear:`
			`def text(self, text, inner=False):`
			`text = self._unsupport(text).replace(u'\r', u'\n')`
			`text = RE['br'].sub(u'\n', text)`
			`if inner:`
			`text = RE['inner'].sub(u'', text)`
			`text = RE['html'].sub(u'', text)`
			`text = self.char(text)`
			`text = RE['space'].sub(u' ', text)`
			`return RE['cl'].sub(u'\n', text).strip()`

			`def string(self, text, space=u''):`
			`return self.text(text).replace(u'\n', space).strip()`

			`def char(self, text):`
			`return RE['entity'].sub(self._unescape, self._unsupport(text))`

			`def _unsupport(self, text):`
			`for tag, value in UNSUPPORT.iteritems():`
			`text = text.replace(tag, value)`
			`return text`

			`def _unescape(self, m):`
			`text = m.group(0)`
			`if text[:2] == u"&#":`
			`try:`
			`if text[:3] == u"&#x":`
			`return unichr(int(text[3:-1], 16))`
			`else:`
			`return unichr(int(text[2:-1]))`
			`except ValueError:`
			`pass`
			`else:`
			`try:`
			`text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])`
			`except KeyError:`
			`pass`
			`return text`