From 5001d68da93f12d9e3476fee7fe1bf7ffa4fdd6e Mon Sep 17 00:00:00 2001 From: inpos Date: Tue, 26 Dec 2017 15:28:16 +0300 Subject: [PATCH] unicode with charset --- lybmods/lybtools.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/lybmods/lybtools.py b/lybmods/lybtools.py index 43c4974..094b636 100644 --- a/lybmods/lybtools.py +++ b/lybmods/lybtools.py @@ -3,6 +3,7 @@ from base64 import b64encode, b64decode, encodestring, decodestring, urlsafe_b64 import lybmods.lybcfg as lybcfg from lxml import etree from chardet import detect +import re SESSION_KEY = '_cp_username' mdict = { @@ -73,13 +74,17 @@ def ctype(c_t): def htfile_tounicode(htfile): if type(htfile) is str: return htfile - ct = etree.HTML(htfile).xpath('//meta/@http-equiv') + html = etree.HTML(htfile) + ct = html.xpath('//meta/@http-equiv') + enc = detect(htfile)['encoding'] if ct != []: - c_t = ct[0].getparent().attrib['content'] + meta_elem = ct[0].getparent() + c_t = meta_elem.attrib['content'] if 'charset' in c_t: enc = c_t.split('charset')[1].strip().split('=')[1].strip().split(' ')[0] - return str(htfile, enc, 'ignore') + meta_elem.attrib['content'] = re.sub('charset=[^;" ]+', '', meta_elem.attrib['content']) + return str(etree.tounicode(html, method='html', pretty_print = True), enc, 'ignore') def getbin(sess, hhash): cur = sess.db.cursor()