'''Этот модуль предназначен для конвертации в C-код и последующей компиляции в исполняюмую библиотеку. Здесь находятся тяжёлые функции.''' import cherrypy from lxml import etree from lybmods import lybtools from urllib.request import FancyURLopener from urllib.parse import urlencode from urllib.parse import urljoin, parse_qs, urlsplit from lybmods.lybclasses import Cat from lybmods import lybhtdata from base64 import decodestring import re import hashlib class URLOpener(FancyURLopener): version = 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0.6) Gecko/20100101 Firefox/10.0.6 Iceweasel/10.0.6' def html(self, catid, url, htfile, md = None, sub = False, ins = False): htfile = lybtools.htfile_tounicode(htfile) body = etree.ElementTree(etree.HTML(htfile).xpath('//body')[0]) sessdata = cherrypy.session strip_tags = ["script", "link"] etree.strip_elements(body, *strip_tags, with_tail=False) chg_tags = ["body", "a", "form", "input", "noscript"] etree.strip_tags(body, *chg_tags) etree.strip_tags(body, etree.Comment) #safe_tags = ['img'] for elem in body.xpath('//*'): if elem.tag == 'body': elem.tag = 'old-body' attr = elem.attrib if elem.tag in chg_tags: etree.strip_attributes(elem, *attr) if "class" in attr: etree.strip_attributes(elem, "class") if "id" in attr: etree.strip_attributes(elem, "id") if "onclick" in attr: etree.strip_attributes(elem, "onclick") if "style" in attr: attr['style'] = re.sub('url\(.+\)', 'url()', attr['style']) #if elem.tag not in safe_tags and (elem.text is None or elem.text.strip() == '') and elem.getchildren() == []: # elem.getparent().remove(elem) # continue if "src" in attr: m = re.search('data:(\S+);base64,(.+)', attr['src']) if not m: srcurl = urljoin(url, attr['src']) srcobjquery = urlsplit(srcurl)[3] srcqdict = parse_qs(srcobjquery) if 'lybsrcobj' in list(srcqdict.keys()): ohash = srcqdict['lybsrcobj'][0] srcquerydata = {'lybsrcobj': ohash} srcquery = urlencode(srcquerydata) # if ins: page = '/getobj?' # else: # page = '/edit/tmpstore?' # if ohash not in sessdata: # if md: # cat = Cat(int(catid)) # doco = cat[int(md)] # sessdata[ohash] = doco[ohash] elem.set('src', page + srcquery) continue try: srcu = URLOpener().open(srcurl) except: continue if srcu.code >= 400: continue srcdata = srcu.read() cont_type = srcu.headers['Content-Type'] srcftype = cont_type and lybtools.ctype(srcu.headers['Content-Type']) or 'none' else: srcdata = decodestring(m.group(2).encode('utf-8')) srcftype = m.group(1) srchashname = hashlib.sha1(srcdata).hexdigest() if srcftype == 'text/html': if elem.tag == 'img': continue srcdata = self.html(catid, srcu.url, srcdata, sub = True) if srchashname not in sessdata: sessdata[srchashname] = {'body': srcdata, 'type': srcftype} srcquerydata = {'lybsrcobj': srchashname} srcquery = urlencode(srcquerydata) if ins: page = '/getobj?' else: page = '/edit/tmpstore?' elem.set('src', page + srcquery) etree.strip_tags(body, 'old-body') ht_ml = etree.tounicode(body, method='html', pretty_print = True) if not sub and not ins: return self.ne(catid, url=url, html=ht_ml, md=md) else: return ht_ml def insert_doc(self, catid = 0, url = None, html = None, doc_name = '', md = None): catid = int(catid) doc_name = doc_name.strip() cat = Cat(catid) catname = cat.name if md: md = int(md) modify_doc = '' else: modify_doc = '' html = self.html(catid, '/', html, ins = True) if doc_name == '': return self.root.buildhtml('Архив [Укажи имя документа]', lybhtdata.nicedit_html1 + lybhtdata.nicedit_html2.format(docname=doc_name, url=url, modify_doc=modify_doc, textarea=lybhtdata.nicedit_textarea.format(input_html=html), catid=catid, catname = catname)) did = cat.docidbyname(doc_name) if did and did != md: return self.root.buildhtml('Архив [Документ с таким именем в разделе "' + cat.name + '" существует]', lybhtdata.nicedit_html1 + lybhtdata.nicedit_html2.format(docname=doc_name, url=url, modify_doc=modify_doc, textarea=lybhtdata.nicedit_textarea.format(input_html=html), catid=catid, catname = catname)) if md: cat[md] = { 'name': doc_name, 'body': html} did = md else: did = cat.insert({ 'name': doc_name, 'body': html}) xdata = etree.HTML(html) if md: new_bin_list = [ parse_qs(urlsplit(x)[3])['lybsrcobj'][0] for x in xdata.xpath('//@src') if 'lybsrcobj' in parse_qs(urlsplit(x)[3])] old_bin_list = cat[did].bins list_to_del = list(set(old_bin_list) - set(new_bin_list)) for bhash in list_to_del: del cat[did][bhash] for src in xdata.xpath('//@src'): srcquery = urlsplit(src)[3] try: src_obj = parse_qs(srcquery)['lybsrcobj'][0] except: continue try: obj = cherrypy.session[src_obj] except KeyError: continue cat[did][src_obj] = obj cherrypy.session.clear() args = {'doc':did, 'catid':catid} return self.root.get_doc(**args) def ne(self, catid, url = None, html = None, md = None, docname = "Новый документ"): html = lybhtdata.nicedit_textarea.format(input_html=html) xhtml = etree.HTML(html).xpath('//textarea[@id=\'nicedit-js-area\']')[0] if md: docname = Cat(int(catid))[int(md)].name modify_doc = '' else: modify_doc = '' return self.root.path(int(catid), lcat = True) + '

' + lybhtdata.nicedit_html1 + lybhtdata.nicedit_html2.format(url=url, catid=catid, catname = Cat(int(catid)).name, docname = docname, modify_doc=modify_doc, textarea=etree.tounicode(xhtml, method="html", pretty_print=True) )