unicode with charset

master
inpos 2017-12-26 15:28:16 +03:00
parent 1e4d1843cb
commit 5001d68da9
1 changed file with 8 additions and 3 deletions

View File

@ -3,6 +3,7 @@ from base64 import b64encode, b64decode, encodestring, decodestring, urlsafe_b64
import lybmods.lybcfg as lybcfg import lybmods.lybcfg as lybcfg
from lxml import etree from lxml import etree
from chardet import detect from chardet import detect
import re
SESSION_KEY = '_cp_username' SESSION_KEY = '_cp_username'
mdict = { mdict = {
@ -73,13 +74,17 @@ def ctype(c_t):
def htfile_tounicode(htfile):
    """Return an HTML document as text, without its meta charset declaration.

    Args:
        htfile: The document, either already-decoded text (``str``) or raw
            ``bytes``.

    Returns:
        ``htfile`` itself when it is already a ``str``. Otherwise the
        document parsed with ``lxml.etree`` and serialised back to a ``str``
        (``method='html'``, pretty-printed). If the first ``<meta>`` element
        carrying an ``http-equiv`` attribute declares a ``charset=...`` in
        its ``content``, that token is removed from the serialised output.
    """
    if isinstance(htfile, str):
        return htfile

    html = etree.HTML(htfile)
    ct = html.xpath('//meta/@http-equiv')

    charset_declared = False
    if ct:
        # Attribute results from xpath() expose getparent(); this is the
        # <meta> element that owns the first http-equiv attribute.
        meta_elem = ct[0].getparent()
        # .get() because an http-equiv meta is not guaranteed to have a
        # "content" attribute (e.g. a malformed tag).
        c_t = meta_elem.attrib.get('content', '')
        if 'charset' in c_t:
            charset_declared = True
            # The result is returned as text, so a byte-level charset
            # declaration no longer describes it; strip the token.
            # NOTE(review): relies on the parser having honoured the declared
            # charset while decoding the bytes above - confirm for the
            # encodings this application actually receives.
            meta_elem.attrib['content'] = re.sub('charset=[^;" ]+', '', c_t)

    if not charset_declared:
        # No declaration in the document: fall back to chardet's guess and
        # re-parse with that encoding forced on the parser.
        guessed = detect(htfile)['encoding']
        if guessed:
            try:
                parser = etree.HTMLParser(encoding=guessed)
            except LookupError:
                # The parser does not know this encoding name; keep the
                # tree from the first parse.
                parser = None
            if parser is not None:
                html = etree.HTML(htfile, parser=parser)

    # etree.tounicode() already returns text. Passing that through
    # str(..., enc, 'ignore') raised TypeError ("decoding str is not
    # supported"), so the serialised document is returned directly.
    return etree.tounicode(html, method='html', pretty_print=True)
def getbin(sess, hhash): def getbin(sess, hhash):
cur = sess.db.cursor() cur = sess.db.cursor()