Source code

Revision control

Copy as Markdown

Other Tools

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import re
try:
from html import unescape as html_unescape
except ImportError:
from HTMLParser import HTMLParser
html_parser = HTMLParser()
html_unescape = html_parser.unescape
from .base import (
Entity, Comment, Junk,
Parser
)
class DTDEntityMixin:
    """Mixin providing DTD-specific value handling for entity classes.

    It reads ``self.raw_val`` and delegates to the ``value_position``
    implementation of the next class in the MRO, so it has to be combined
    with a class that provides both.
    """

    @property
    def val(self):
        """Return the raw value with HTML character references unescaped.

        The raw value is passed through ``html_unescape``, so named
        (``&amp;``), decimal (``&#38;``) and hexadecimal (``&#x26;`` or
        ``&#x0026;``) references are replaced by the characters they
        stand for. Unknown entities are left intact.
        """
        raw = self.raw_val
        return html_unescape(raw)

    def value_position(self, offset=0):
        """Return the ``(line, column)`` position of ``offset`` in the value.

        A non-tuple ``offset`` is forwarded unchanged to the parent
        implementation. A ``(line, column)`` tuple is treated as a
        position relative to the start of the value, with the first line
        numbered 1, and is translated using the parent's position of the
        value start.
        """
        if not isinstance(offset, tuple):
            return super().value_position(offset)

        # DTDChecker already returns tuples of (line, col) positions
        rel_line, rel_col = offset
        start_line, start_col = super().value_position()
        if rel_line == 1:
            # Still on the line where the value starts: the column is
            # relative to the start of the value.
            column = start_col + rel_col
        else:
            # On a later line the reported column is used as-is.
            column = rel_col
        return start_line + (rel_line - 1), column
class DTDEntity(DTDEntityMixin, Entity):
    """Entity type created by the DTD parser.

    Combines the generic ``Entity`` with ``DTDEntityMixin``; it defines
    no members of its own.
    """
class DTDParser(Parser):
    """Parser for DTD files consisting of ``<!ENTITY ...>`` declarations.

    The class attributes build the regular expressions out of the
    character classes defined by the XML 1.0 grammar. Both character
    classes include the supplementary planes (code points above U+FFFF),
    so comments and entity names containing such characters are matched.
    """

    # XML "Char" minus "-":
    # #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
    CharMinusDash = (
        '\x09\x0A\x0D\u0020-\u002C\u002E-\uD7FF\uE000-\uFFFD'
        '\U00010000-\U0010FFFF'
    )
    XmlComment = '<!--(?:-?[%s])*?-->' % CharMinusDash
    # NameStartChar ::= ":" | [A-Z] | "_" | [a-z] |
    #   [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] |
    #   [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] |
    #   [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] |
    #   [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
    NameStartChar = (
        ':A-Z_a-z\xC0-\xD6\xD8-\xF6\xF8-\u02FF'
        '\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F'
        '\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD'
        '\U00010000-\U000EFFFF'
    )
    # NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 |
    #   [#x0300-#x036F] | [#x203F-#x2040]
    NameChar = NameStartChar + r'\-\.0-9' + '\xB7\u0300-\u036F\u203F-\u2040'
    Name = '[' + NameStartChar + '][' + NameChar + ']*'
    # <!ENTITY key "value"> with either quote style.
    # NOTE(review): the single-quoted alternative makes the closing quote
    # optional (\'?), while createEntity always trims one character from
    # each end of the match - confirm whether that leniency is intended.
    reKey = re.compile('<!ENTITY[ \t\r\n]+(?P<key>' + Name + ')[ \t\r\n]+'
                       '(?P<val>\"[^\"]*\"|\'[^\']*\'?)[ \t\r\n]*>',
                       re.DOTALL | re.M)
    # add BOM to DTDs, details in bug 435002
    reHeader = re.compile('^\ufeff')
    reComment = re.compile('<!--(?P<val>-?[%s])*?-->' % CharMinusDash,
                           re.S)
    # Parameter entity declaration followed by its reference, optionally
    # trailed by comments on the same line:
    #   <!ENTITY % foo SYSTEM "url">
    #   %foo;
    rePE = re.compile('<!ENTITY[ \t\r\n]+%[ \t\r\n]+(?P<key>' + Name + ')'
                      '[ \t\r\n]+SYSTEM[ \t\r\n]+'
                      '(?P<val>\"[^\"]*\"|\'[^\']*\')[ \t\r\n]*>[ \t\r\n]*'
                      '%' + Name + ';'
                      '(?:[ \t]*(?:' + XmlComment + '[ \t\r\n]*)*\n?)?')

    class Comment(Comment):
        """DTD comment whose value is the text between the delimiters."""

        @property
        def val(self):
            """Return the comment text, computing and caching it once."""
            if self._val_cache is None:
                # Strip "<!--" and "-->" to comment contents
                self._val_cache = self.all[4:-3]
            return self._val_cache

    def getNext(self, ctx, offset):
        '''
        Overload Parser.getNext to special-case ParsedEntities.
        Just check for a parsed entity if that method claims junk.

        <!ENTITY % foo SYSTEM "url">
        %foo;

        A byte order mark at the very start of the content is skipped
        before parsing. Returns whatever Parser.getNext returned, or a
        DTDEntity when that result was junk or None and a parameter
        entity declaration matches at the offset.
        '''
        if offset == 0 and self.reHeader.match(ctx.contents):
            # Start parsing after the BOM character.
            offset += 1
        entity = Parser.getNext(self, ctx, offset)
        unparsed = entity is None or (entity and isinstance(entity, Junk))
        if unparsed:
            m = self.rePE.match(ctx.contents, offset)
            if m:
                # No comment or white space is attached here, and the
                # value span still includes the surrounding quotes.
                entity = DTDEntity(
                    ctx, None, None, m.span(), m.span('key'), m.span('val'))
        return entity

    def createEntity(self, ctx, m, current_comment, white_space):
        """Build a DTDEntity from a match with 'key' and 'val' groups.

        The matched value includes its quotes (presumably a ``reKey``
        match), so the value span is narrowed by one character on each
        side to cover only the text between them.
        """
        val_start, val_end = m.span('val')
        valspan = (val_start + 1, val_end - 1)
        return DTDEntity(ctx, current_comment, white_space,
                         m.span(), m.span('key'), valspan)