unicode with charset
parent
1e4d1843cb
commit
5001d68da9
|
@ -3,6 +3,7 @@ from base64 import b64encode, b64decode, encodestring, decodestring, urlsafe_b64
|
|||
import lybmods.lybcfg as lybcfg
|
||||
from lxml import etree
|
||||
from chardet import detect
|
||||
import re
|
||||
|
||||
SESSION_KEY = '_cp_username'
|
||||
mdict = {
|
||||
|
@ -73,13 +74,17 @@ def ctype(c_t):
|
|||
def htfile_tounicode(htfile):
|
||||
if type(htfile) is str:
|
||||
return htfile
|
||||
ct = etree.HTML(htfile).xpath('//meta/@http-equiv')
|
||||
html = etree.HTML(htfile)
|
||||
ct = html.xpath('//meta/@http-equiv')
|
||||
|
||||
enc = detect(htfile)['encoding']
|
||||
if ct != []:
|
||||
c_t = ct[0].getparent().attrib['content']
|
||||
meta_elem = ct[0].getparent()
|
||||
c_t = meta_elem.attrib['content']
|
||||
if 'charset' in c_t:
|
||||
enc = c_t.split('charset')[1].strip().split('=')[1].strip().split(' ')[0]
|
||||
return str(htfile, enc, 'ignore')
|
||||
meta_elem.attrib['content'] = re.sub('charset=[^;" ]+', '', meta_elem.attrib['content'])
|
||||
return str(etree.tounicode(html, method='html', pretty_print = True), enc, 'ignore')
|
||||
|
||||
def getbin(sess, hhash):
|
||||
cur = sess.db.cursor()
|
||||
|
|
Loading…
Reference in New Issue