File: //lib/python3.6/site-packages/textile/utils.py
from __future__ import unicode_literals
import six
try:
import regex as re
except ImportError:
import re
from six.moves import urllib, html_parser
urlparse = urllib.parse.urlparse
HTMLParser = html_parser.HTMLParser
from collections import OrderedDict
from xml.etree import ElementTree
from textile.regex_strings import valign_re_s, halign_re_s
def decode_high(text):
"""Decode encoded HTML entities."""
h = HTMLParser()
text = '&#{0};'.format(text)
return h.unescape(text)
def encode_high(text):
"""Encode the text so that it is an appropriate HTML entity."""
return ord(text)
def encode_html(text, quotes=True):
"""Return text that's safe for an HTML attribute."""
a = (
('&', '&'),
('<', '<'),
('>', '>'))
if quotes:
a = a + (("'", '''),
('"', '"'))
for k, v in a:
text = text.replace(k, v)
return text
def generate_tag(tag, content, attributes=None):
"""Generate a complete html tag using the ElementTree module. tag and
content are strings, the attributes argument is a dictionary. As
a convenience, if the content is ' /', a self-closing tag is generated."""
content = six.text_type(content)
# In PY2, ElementTree tostringlist only works with bytes, not with
# unicode().
enc = 'unicode'
if six.PY2:
enc = 'UTF-8'
if not tag:
return content
element = ElementTree.Element(tag, attrib=attributes)
# FIXME: Kind of an ugly hack. There *must* be a cleaner way. I tried
# adding text by assigning it to element_tag.text. That results in
# non-ascii text being html-entity encoded. Not bad, but not entirely
# matching php-textile either.
element_tag = ElementTree.tostringlist(element, encoding=enc,
method='html')
if six.PY2:
element_tag = [v.decode(enc) for v in element_tag]
element_tag.insert(len(element_tag) - 1, content)
element_text = ''.join(element_tag)
return element_text
def has_raw_text(text):
"""checks whether the text has text not already enclosed by a block tag"""
# The php version has orders the below list of tags differently. The
# important thing to note here is that the pre must occur before the p or
# else the regex module doesn't properly match pre-s. It only matches the
# p in pre.
r = re.compile(r'<(pre|p|blockquote|div|form|table|ul|ol|dl|h[1-6])[^>]*?>.*</\1>',
re.S).sub('', text.strip()).strip()
r = re.compile(r'<(hr|br)[^>]*?/>').sub('', r)
return '' != r
def is_rel_url(url):
"""Identify relative urls."""
(scheme, netloc) = urlparse(url)[0:2]
return not scheme and not netloc
def is_valid_url(url):
parsed = urlparse(url)
if parsed.scheme == '':
return True
return False
def list_type(list_string):
listtypes = {
list_string.startswith('*'): 'u',
list_string.startswith('#'): 'o',
(not list_string.startswith('*') and not list_string.startswith('#')):
'd'
}
return listtypes.get(True, False)
def normalize_newlines(string):
out = string.strip()
out = re.sub(r'\r\n?', '\n', out)
out = re.compile(r'^[ \t]*\n', flags=re.M).sub('\n', out)
out = re.sub(r'"$', '" ', out)
return out
def parse_attributes(block_attributes, element=None, include_id=True, restricted=False):
vAlign = {'^': 'top', '-': 'middle', '~': 'bottom'}
hAlign = {'<': 'left', '=': 'center', '>': 'right', '<>': 'justify'}
style = []
aclass = ''
lang = ''
colspan = ''
rowspan = ''
block_id = ''
span = ''
width = ''
result = OrderedDict()
if not block_attributes:
return result
matched = block_attributes
if element == 'td':
m = re.search(r'\\(\d+)', matched)
if m:
colspan = m.group(1)
m = re.search(r'/(\d+)', matched)
if m:
rowspan = m.group(1)
if element == 'td' or element == 'tr':
m = re.search(r'(^{0})'.format(valign_re_s), matched)
if m:
style.append("vertical-align:{0}".format(vAlign[m.group(1)]))
if not restricted:
m = re.search(r'\{([^}]*)\}', matched)
if m:
style.extend(m.group(1).rstrip(';').split(';'))
matched = matched.replace(m.group(0), '')
m = re.search(r'\[([^\]]+)\]', matched, re.U)
if m:
lang = m.group(1)
matched = matched.replace(m.group(0), '')
m = re.search(r'\(([^()]+)\)', matched, re.U)
if m:
aclass = m.group(1)
matched = matched.replace(m.group(0), '')
m = re.search(r'([(]+)', matched)
if m:
style.append("padding-left:{0}em".format(len(m.group(1))))
matched = matched.replace(m.group(0), '')
m = re.search(r'([)]+)', matched)
if m:
style.append("padding-right:{0}em".format(len(m.group(1))))
matched = matched.replace(m.group(0), '')
m = re.search(r'({0})'.format(halign_re_s), matched)
if m:
style.append("text-align:{0}".format(hAlign[m.group(1)]))
m = re.search(r'^(.*)#(.*)$', aclass)
if m:
block_id = m.group(2)
aclass = m.group(1)
if element == 'col':
pattern = r'(?:\\(\d+)\.?)?\s*(\d+)?'
csp = re.match(pattern, matched)
span, width = csp.groups()
if colspan:
result['colspan'] = colspan
if style:
# Previous splits that created style may have introduced extra
# whitespace into the list elements. Clean it up.
style = [x.strip() for x in style]
result['style'] = '{0};'.format("; ".join(style))
if aclass:
result['class'] = aclass
if block_id and include_id:
result['id'] = block_id
if lang:
result['lang'] = lang
if rowspan:
result['rowspan'] = rowspan
if span:
result['span'] = span
if width:
result['width'] = width
return result
def pba(block_attributes, element=None, include_id=True, restricted=False):
"""Parse block attributes."""
attrs = parse_attributes(block_attributes, element, include_id, restricted)
if not attrs:
return ''
result = ' '.join(['{0}="{1}"'.format(k, v) for k, v in attrs.items()])
return ' {0}'.format(result)