# filters.py - mozsearch

# Enable keyboard shortcuts

# mako/filters.py

# Copyright 2006-2020 the Mako authors and contributors <see AUTHORS file>

# This module is part of Mako and is released under

# the MIT License: http://www.opensource.org/licenses/mit-license.php

import codecs

import re

from mako import compat

from mako.compat import codepoint2name

from mako.compat import name2codepoint

from mako.compat import quote_plus

from mako.compat import unquote_plus

# Replacement text for each character rewritten by xml_escape().
xml_escapes = {
    "&": "&amp;",
    ">": "&gt;",
    "<": "&lt;",
    '"': "&#34;",  # also &quot; in html-only
    "'": "&#39;",  # also &apos; in html-only
}

# XXX: &quot; is valid in HTML and XML

#      &apos; is not valid HTML, but is valid XML

def legacy_html_escape(s):
    """Legacy HTML escape for non-unicode mode.

    Rewrites ``&``, ``>``, ``<``, ``"`` and ``'`` in ``s`` as character
    references and returns the result.
    """
    # "&" is handled first so the ampersands introduced by the later
    # substitutions are not escaped a second time.
    substitutions = (
        ("&", "&amp;"),
        (">", "&gt;"),
        ("<", "&lt;"),
        ('"', "&#34;"),
        ("'", "&#39;"),
    )
    for raw, escaped in substitutions:
        s = s.replace(raw, escaped)
    return s

# Pick the HTML escape implementation: markupsafe's when that package can be
# imported, otherwise the pure-Python legacy_html_escape defined in this file.
try:
    import markupsafe
except ImportError:
    html_escape = legacy_html_escape
else:
    html_escape = markupsafe.escape

def xml_escape(string):
    """Return ``string`` with XML-significant characters escaped.

    Every ampersand, angle bracket, double quote and single quote is
    replaced by its entry in the module-level ``xml_escapes`` table.
    """

    def _replacement(match):
        # The pattern only matches characters that are keys of xml_escapes.
        return xml_escapes[match.group()]

    return re.sub(r'([&<"\'>])', _replacement, string)

def url_escape(string):
    """URL-quote ``string`` after encoding it as UTF-8.

    The text is first turned into UTF-8 octets; those octets are then
    passed to ``quote_plus`` and its result is returned.
    """
    utf8_octets = string.encode("utf8")
    return quote_plus(utf8_octets)

def legacy_url_escape(string):
    """Legacy URL escape: hand ``string`` to ``quote_plus`` as-is.

    Unlike ``url_escape``, no UTF-8 encoding step is applied beforehand.
    """
    escaped = quote_plus(string)
    return escaped

def url_unescape(string):
    """Reverse URL quoting by passing ``string`` through ``unquote_plus``.

    A non-ASCII byte-string result is decoded as UTF-8.  A result that is
    already text is returned unchanged.
    """
    text = unquote_plus(string)
    # Only byte strings can be decoded.  A text result (which is what
    # Python 3's unquote_plus returns) has no ``decode`` method, so decoding
    # unconditionally raised AttributeError for every non-ASCII value.
    if isinstance(text, bytes) and not is_ascii_str(text):
        text = text.decode("utf8")
    return text

def trim(string):
    """Return ``string`` with leading and trailing whitespace removed."""
    stripped = string.strip()
    return stripped

class Decode(object):
    """Factory of decoding filters, looked up by attribute name.

    Accessing any attribute (for example ``decode.utf8``) returns a
    function that converts its argument to text, using the attribute name
    as the encoding when the argument is a byte string.
    """

    def __getattr__(self, key):
        def decode(x):
            # Byte strings are decoded with the encoding named by ``key``.
            if isinstance(x, compat.binary_type):
                return compat.text_type(x, encoding=key)
            # Text needs no conversion.
            if isinstance(x, compat.text_type):
                return x
            # Anything else is stringified first, then handled as above.
            return decode(str(x))

        return decode


# Shared instance through which the decoders are accessed.
decode = Decode()

# Matches a string made up entirely of code points 0x00-0x7F (or empty).
_ASCII_re = re.compile(r"\A[\x00-\x7f]*\Z")


def is_ascii_str(text):
    """Tell whether ``text`` is a ``str`` holding only ASCII characters.

    The result is meant to be used for its truth value: ``False`` when
    ``text`` is not a ``str``, otherwise the outcome of the regex match
    (a match object, or ``None`` when a non-ASCII character is present).
    """
    if not isinstance(text, str):
        return False
    return _ASCII_re.match(text)

################################################################

class XMLEntityEscaper(object):
    """Convert between characters and XML/HTML character references.

    :param codepoint2name: mapping of integer code point to entity name
    :param name2codepoint: mapping of entity name to integer code point
    """

    def __init__(self, codepoint2name, name2codepoint):
        # Table keyed by code point, as consumed by ``translate()`` in
        # escape_entities() and by the lookup in __escape().
        self.codepoint2entity = {
            c: compat.text_type("&%s;" % n)
            for c, n in codepoint2name.items()
        }
        self.name2codepoint = name2codepoint

    def escape_entities(self, text):
        """Replace characters with their character entity references.

        Only characters corresponding to a named entity are replaced.
        """
        return compat.text_type(text).translate(self.codepoint2entity)

    def __escape(self, m):
        # Substitution callback for escape(): use the named entity when one
        # exists, otherwise a hexadecimal numeric character reference.
        codepoint = ord(m.group())
        try:
            return self.codepoint2entity[codepoint]
        except (KeyError, IndexError):
            return "&#x%X;" % codepoint

    # Characters handled by escape(): the four markup characters plus
    # everything outside the ASCII range.
    __escapable = re.compile(r'["&<>]|[^\x00-\x7f]')

    def escape(self, text):
        """Replace characters with their character references.

        Replace characters by their named entity references.
        Non-ASCII characters, if they do not have a named entity reference,
        are replaced by numerical character references.

        The return value is guaranteed to be ASCII (it is the result of
        encoding the escaped text with the ``ascii`` codec).
        """
        return self.__escapable.sub(
            self.__escape, compat.text_type(text)
        ).encode("ascii")

    # XXX: This regexp will not match all valid XML entity names__.
    # (It punts on details involving CombiningChars and Extenders.)
    #
    # .. __: http://www.w3.org/TR/2000/REC-xml-20001006#NT-EntityRef
    #
    # Hex digits are accepted in either case, so the uppercase references
    # produced by escape() ("&#x%X;") can be unescaped again.
    __characterrefs = re.compile(
        r"""& (?:
                                          \#(\d+)
                                          | \#x([\da-fA-F]+)
                                          | ( (?!\d) [:\w] [-.:\w]+ )
                                          ) ;""",
        re.X | re.UNICODE,
    )

    def __unescape(self, m):
        # Substitution callback for unescape().  Exactly one of the three
        # groups is set: decimal reference, hex reference, or entity name.
        dval, hval, name = m.groups()
        if dval:
            codepoint = int(dval)
        elif hval:
            codepoint = int(hval, 16)
        else:
            # Unknown names map to U+FFFD = "REPLACEMENT CHARACTER"
            codepoint = self.name2codepoint.get(name, 0xFFFD)
        return chr(codepoint)

    def unescape(self, text):
        """Unescape character references.

        All character references (both entity references and numerical
        character references) are unescaped.
        """
        return self.__characterrefs.sub(self.__unescape, text)

# Module-level escaper built from the two entity tables imported from
# mako.compat at the top of this file.
_html_entities_escaper = XMLEntityEscaper(codepoint2name, name2codepoint)

# Filter functions exposed by this module, bound to that escaper's methods.
html_entities_escape = _html_entities_escaper.escape_entities

html_entities_unescape = _html_entities_escaper.unescape

def htmlentityreplace_errors(ex):
    """An encoding error handler.

    This python codecs error handler replaces unencodable
    characters with HTML entities, or, if no HTML entity exists for
    the character, XML character references::

        >>> u'The cost was \u20ac12.'.encode('latin1', 'htmlentityreplace')
        'The cost was &euro;12.'

    :param ex: the exception instance handed to the error handler
    :return: ``(replacement_text, ex.end)`` for a ``UnicodeEncodeError``
    :raises: ``ex`` itself, re-raised, for any other kind of error
    """
    if isinstance(ex, UnicodeEncodeError):
        # Handle encoding errors
        bad_text = ex.object[ex.start : ex.end]
        text = _html_entities_escaper.escape(bad_text)
        # escape() returns an ASCII-encoded byte string.  Decode it
        # explicitly: passing bytes to the text constructor on Python 3
        # produces its repr ("b'&euro;'") instead of the entity text.
        if isinstance(text, bytes):
            text = text.decode("ascii")
        return (text, ex.end)
    raise ex


codecs.register_error("htmlentityreplace", htmlentityreplace_errors)

# TODO: options to make this dynamic per-compilation will be added in a later

# release

# Default filter table: short filter name -> string naming the filter.
# Entries of the form "filters.<name>" refer to functions in this module.
DEFAULT_ESCAPES = {
    "x": "filters.xml_escape",
    "h": "filters.html_escape",
    "u": "filters.url_escape",
    "trim": "filters.trim",
    "entity": "filters.html_entities_escape",
    "unicode": "unicode",
    "decode": "decode",
    "str": "str",
    "n": "n",
}

# When compat reports Python 3, the "unicode" filter name maps to ``str``.
if compat.py3k:
    DEFAULT_ESCAPES["unicode"] = "str"

# Variant of the default table in which the HTML and URL filters point at
# their legacy implementations; every other entry is copied unchanged.
NON_UNICODE_ESCAPES = dict(
    DEFAULT_ESCAPES,
    h="filters.legacy_html_escape",
    u="filters.legacy_url_escape",
)