unicode with charset
parent
1e4d1843cb
commit
5001d68da9
|
@ -3,6 +3,7 @@ from base64 import b64encode, b64decode, encodestring, decodestring, urlsafe_b64
|
||||||
import lybmods.lybcfg as lybcfg
|
import lybmods.lybcfg as lybcfg
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
from chardet import detect
|
from chardet import detect
|
||||||
|
import re
|
||||||
|
|
||||||
SESSION_KEY = '_cp_username'
|
SESSION_KEY = '_cp_username'
|
||||||
mdict = {
|
mdict = {
|
||||||
|
@ -73,13 +74,17 @@ def ctype(c_t):
|
||||||
def htfile_tounicode(htfile):
    """Return the HTML document *htfile* as text, without its charset declaration.

    Args:
        htfile: The document, either already-decoded text (``str``) or the
            raw bytes of an HTML file.

    Returns:
        ``htfile`` itself when it is already ``str``.  Otherwise the document
        is parsed with lxml, any ``charset=...`` parameter found in a
        ``<meta http-equiv=...>`` ``content`` attribute is removed, and the
        tree is serialized back to text (``method='html'``, pretty-printed).
        If lxml yields no tree at all, the bytes are decoded directly with
        the chardet-detected encoding, ignoring undecodable bytes.
    """
    # Local import keeps this block self-contained; only the stdlib is used.
    import re

    if isinstance(htfile, str):
        return htfile

    # chardet reports None when it cannot make a guess; fall back to UTF-8 so
    # the name can always be handed to a decoder/parser.
    enc = detect(htfile)['encoding'] or 'utf-8'

    html = etree.HTML(htfile)
    if html is None:
        # NOTE(review): presumably only happens for input with no parseable
        # markup (e.g. empty bytes) - confirm against the lxml version in use.
        return str(htfile, enc, 'ignore')

    # Matches an optional leading ';', the charset parameter and its value,
    # with or without quotes, so removing the match leaves no dangling "; ".
    charset_re = r';?\s*charset\s*=\s*["\']?([^;"\'\s]+)["\']?'

    has_declared_charset = False
    # Look at every http-equiv meta, not just the first one: a "refresh" meta
    # may precede the Content-Type one.
    for meta_elem in html.xpath('//meta[@http-equiv]'):
        c_t = meta_elem.get('content', '')
        if re.search(charset_re, c_t, re.IGNORECASE) is None:
            continue
        has_declared_charset = True
        # The returned value is text, so a byte-level charset declaration
        # would be stale; drop it from the attribute.
        meta_elem.set('content', re.sub(charset_re, '', c_t, flags=re.IGNORECASE))
        break

    if not has_declared_charset:
        # Nothing declared in the document: re-parse with the detected
        # encoding forced on the parser instead of relying on its default.
        # NOTE(review): when a charset *is* declared we rely on the first
        # parse having honoured it - confirm for the lxml/libxml2 in use.
        try:
            reparsed = etree.HTML(htfile, etree.HTMLParser(encoding=enc))
        except LookupError:
            # Encoding name not known to the parser; keep the first parse.
            reparsed = None
        if reparsed is not None:
            html = reparsed

    # tounicode() already returns text; decoding it a second time (as the
    # previous code did with str(..., enc, 'ignore')) raises TypeError.
    return etree.tounicode(html, method='html', pretty_print=True)
|
||||||
|
|
||||||
def getbin(sess, hhash):
|
def getbin(sess, hhash):
|
||||||
cur = sess.db.cursor()
|
cur = sess.db.cursor()
|
||||||
|
|
Loading…
Reference in New Issue