# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import re
import bisect
import codecs
from collections import Counter
from compare_locales.keyedtuple import KeyedTuple
from compare_locales.paths import File
__constructors = []


# The allowed capabilities for the Parsers. They define the exact strategy
# used by ContentComparer.merge.
#
# The values are distinct bits so that capabilities can be combined with
# ``|`` (Parser.capabilities is ``CAN_SKIP | CAN_MERGE``).
# NOTE(review): the constant definitions were missing below their comments;
# names and values restored here should be confirmed against upstream.

# Don't perform any merging
CAN_NONE = 0
# Copy the entire reference file
CAN_COPY = 1
# Remove broken entities from localization
# Without CAN_MERGE, en-US is not good to use for localization.
CAN_SKIP = 2
# Add missing and broken entities from the reference to localization
# This effectively means that en-US is good to use for localized files.
CAN_MERGE = 4
class Entry:
    '''
    Abstraction layer for a localizable entity.
    Currently supported are grammars of the form:

    1: entity definition
    2: entity key (name)
    3: entity value

    <!ENTITY key "value">

    <--- definition ---->

    An Entry does not store text itself. It keeps ``(start, end)`` offset
    pairs and slices ``ctx.contents`` on demand.
    '''

    def __init__(
        self, ctx, pre_comment, inner_white, span, key_span, val_span
    ):
        """Store the parsing context and the spans of this entry.

        :param ctx: object providing ``contents`` (the text that the spans
            index into) and ``linecol(pos)``.
        :param pre_comment: entry preceding this one, or None. Only its
            ``span`` is read here.
        :param inner_white: stored as-is, not read by this class.
        :param span: ``(start, end)`` offsets of the whole definition.
        :param key_span: ``(start, end)`` offsets of the key.
        :param val_span: ``(start, end)`` offsets of the value, or None.
        """
        self.ctx = ctx
        self.span = span
        self.key_span = key_span
        self.val_span = val_span
        self.pre_comment = pre_comment
        self.inner_white = inner_white

    def position(self, offset=0):
        """Get the 1-based line and column of the character
        with given offset into the Entity.

        If offset is negative, return the end of the Entity.
        """
        if offset < 0:
            pos = self.span[1]
        else:
            pos = self.span[0] + offset
        return self.ctx.linecol(pos)

    def value_position(self, offset=0):
        """Get the 1-based line and column of the character
        with given offset into the value.

        If offset is negative, return the end of the value.
        """
        assert self.val_span is not None
        if offset < 0:
            pos = self.val_span[1]
        else:
            pos = self.val_span[0] + offset
        return self.ctx.linecol(pos)

    def _span_start(self):
        """Return the start offset, including a pre_comment if there is one.
        """
        start = self.span[0]
        # Subclasses with their own __init__ may not set pre_comment at all.
        if hasattr(self, 'pre_comment') and self.pre_comment is not None:
            start = self.pre_comment.span[0]
        return start

    @property
    def all(self):
        """Text from the start of the pre_comment (if any) to the end of
        the entry.
        """
        start = self._span_start()
        end = self.span[1]
        return self.ctx.contents[start:end]

    @property
    def key(self):
        """Text covered by key_span."""
        return self.ctx.contents[self.key_span[0]:self.key_span[1]]

    @property
    def raw_val(self):
        """Text covered by val_span, or None if there is no val_span."""
        if self.val_span is None:
            return None
        return self.ctx.contents[self.val_span[0]:self.val_span[1]]

    @property
    def val(self):
        """The value of this entry. Here the same as raw_val."""
        return self.raw_val

    def __repr__(self):
        return self.key

    re_br = re.compile('<br[ \t\r\n]*/?>', re.U)
    re_sgml = re.compile(r'</?\w+.*?>', re.U | re.M)

    def count_words(self):
        """Count the words in an English string.
        Replace a couple of xml markup to make that safer, too.
        """
        # <br> separates words, other tags are dropped.
        value = self.re_br.sub('\n', self.val)
        value = self.re_sgml.sub('', value)
        return len(value.split())

    def equals(self, other):
        """Return True if key and val of both entries are equal."""
        return self.key == other.key and self.val == other.val
class StickyEntry(Entry):
    """Subclass of Entry to use for syntax fragments
    which should always be overwritten in the serializer.
    """
    pass
class Entity(Entry):
    """An Entry that carries a localizable key/value pair."""

    @property
    def localized(self):
        '''Is this entity localized.

        Always true for monolingual files.
        In bilingual files, this is a dynamic property.
        '''
        return True

    def unwrap(self):
        """Return the literal value to be used by tools.
        """
        return self.raw_val

    def wrap(self, raw_val):
        """Create literal entity based on reference and raw value.

        This is used by the serialization logic.

        The text before the value (including a pre_comment) and the text
        after the value are kept; only the value itself is replaced.
        """
        start = self._span_start()
        literal = (
            self.ctx.contents[start:self.val_span[0]] +
            raw_val +
            self.ctx.contents[self.val_span[1]:self.span[1]]
        )
        return LiteralEntity(self.key, raw_val, literal)
class LiteralEntity(Entity):
    """Subclass of Entity to represent entities without context slices.

    It's storing string literals for key, raw_val and all instead of spans.
    """

    def __init__(self, key, val, all):
        """Store the literals; context and all spans are set to None."""
        super().__init__(None, None, None, None, None, None)
        self._key = key
        self._raw_val = val
        self._all = all

    @property
    def key(self):
        """The literal key passed to the constructor."""
        return self._key

    @property
    def raw_val(self):
        """The literal value passed to the constructor."""
        return self._raw_val

    @property
    def all(self):
        """The literal full text passed to the constructor."""
        return self._all
class PlaceholderEntity(LiteralEntity):
    """Subclass of Entity to be removed in merges.
    """

    def __init__(self, key):
        """Create a placeholder for key with an empty value and a fixed
        marker text as its full content.
        """
        super().__init__(key, "", "\nplaceholder\n")
class Comment(Entry):
    """Entry for a comment. It has no key and its value is its full text."""

    def __init__(self, ctx, span):
        """Store context and span.

        Entry.__init__ is intentionally not called: a comment has neither
        key_span nor pre_comment.
        """
        self.ctx = ctx
        self.span = span
        self.val_span = None
        self._val_cache = None

    @property
    def key(self):
        """Comments have no key; always None."""
        return None

    @property
    def val(self):
        """The text of the comment, computed once and then cached."""
        if self._val_cache is None:
            self._val_cache = self.all
        return self._val_cache

    def __repr__(self):
        return self.all
class OffsetComment(Comment):
    '''Helper for file formats that have a constant number of leading
    chars to strip from comments.
    Offset defaults to 1
    '''
    comment_offset = 1

    @property
    def val(self):
        """The comment text with the first comment_offset characters
        removed from every line. Computed once and then cached.
        """
        if self._val_cache is None:
            # splitlines(True) keeps the line endings in the joined result.
            self._val_cache = ''.join(
                line[self.comment_offset:]
                for line in self.all.splitlines(True)
            )
        return self._val_cache
class Junk:
    '''
    An almost-Entity, representing junk data that we didn't parse.
    This way, we can signal bad content as stuff we don't understand.
    And then either fix that, or report real bugs in localizations.
    '''
    # Class-wide counter, used to give each Junk a distinct key.
    junkid = 0

    def __init__(self, ctx, span):
        """Store context and span, and generate a key from the running
        counter and the span offsets.
        """
        self.ctx = ctx
        self.span = span
        self.__class__.junkid += 1
        self.key = '_junk_%d_%d-%d' % (self.__class__.junkid, span[0], span[1])

    def position(self, offset=0):
        """Get the 1-based line and column of the character
        with given offset into the Entity.

        If offset is negative, return the end of the Entity.
        """
        if offset < 0:
            pos = self.span[1]
        else:
            pos = self.span[0] + offset
        return self.ctx.linecol(pos)

    @property
    def all(self):
        """The unparsed text covered by span."""
        return self.ctx.contents[self.span[0]:self.span[1]]

    @property
    def raw_val(self):
        """Same as all."""
        return self.all

    @property
    def val(self):
        """Same as all."""
        return self.all

    def error_message(self):
        """Return a message quoting the unparsed text and giving its start
        and end as line and column.
        """
        params = (self.val,) + self.position() + self.position(-1)
        return (
            'Unparsed content "%s" from line %d column %d'
            ' to line %d column %d' % params
        )

    def __repr__(self):
        return self.key
class Whitespace(Entry):
    '''Entity-like object representing an empty file with whitespace,
    if allowed
    '''

    def __init__(self, ctx, span):
        """Store context and use the one span as span, key and value."""
        self.ctx = ctx
        self.span = self.key_span = self.val_span = span

    def __repr__(self):
        return self.raw_val
class BadEntity(ValueError):
    '''Raised when the parser can't create an Entity for a found match.
    '''
    pass
class Parser:
    """Base class for regular-expression driven parsers.

    reKey and reComment are None here and are matched against the content
    in getNext(), so they need to be set to compiled patterns before
    parsing. The default createEntity() reads the groups ``key`` and
    ``val`` from a reKey match.
    """
    capabilities = CAN_SKIP | CAN_MERGE
    reWhitespace = re.compile('[ \t\r\n]+', re.M)
    # Class used by getNext() to create comments.
    Comment = Comment
    # NotImplementedError would be great, but also tedious
    reKey = reComment = None

    class Context:
        "Fixture for content and line numbers"

        def __init__(self, contents):
            self.contents = contents
            # cache split lines
            self._lines = None

        def linecol(self, position):
            "Returns 1-based line and column numbers."
            if self._lines is None:
                # Offsets right after each newline, i.e. where the lines
                # after the first one start.
                nl = re.compile('\n', re.M)
                self._lines = [m.end()
                               for m in nl.finditer(self.contents)]

            line_offset = bisect.bisect(self._lines, position)
            line_start = self._lines[line_offset - 1] if line_offset else 0
            col_offset = position - line_start

            return line_offset + 1, col_offset + 1

    def __init__(self):
        # Keep an encoding that is already set, e.g. on the class.
        if not hasattr(self, 'encoding'):
            self.encoding = 'utf-8'
        self.ctx = None

    def readFile(self, file):
        '''Read contents from disk, with universal_newlines'''
        if isinstance(file, File):
            file = file.fullpath
        # newline=None makes text mode translate all line endings to '\n'.
        with open(
            file,
            encoding=self.encoding, errors='replace',
            newline=None
        ) as f:
            self.readUnicode(f.read())

    def readContents(self, contents):
        '''Read contents and create parsing context.

        contents are in native encoding, but with normalized line endings.
        '''
        (contents, _) = codecs.getdecoder(self.encoding)(contents, 'replace')
        self.readUnicode(contents)

    def readUnicode(self, contents):
        """Create the parsing context for already decoded contents."""
        self.ctx = self.Context(contents)

    def parse(self):
        """Return the entries of this parser as a KeyedTuple."""
        return KeyedTuple(self)

    def __iter__(self):
        return self.walk(only_localizable=True)

    def walk(self, only_localizable=False):
        """Yield the entries of the loaded content in order.

        Entity and Junk are always yielded. Anything else returned by
        getNext() is only yielded if only_localizable is false.
        """
        if not self.ctx:
            # loading file failed, or we just didn't load anything
            return
        ctx = self.ctx
        contents = ctx.contents

        next_offset = 0
        while next_offset < len(contents):
            entity = self.getNext(ctx, next_offset)

            if isinstance(entity, (Entity, Junk)):
                yield entity
            elif not only_localizable:
                yield entity

            next_offset = entity.span[1]

    def getNext(self, ctx, offset):
        '''Parse the next fragment.

        Parse comments first, then white-space.
        If an entity follows, create that entity with such pre_comment and
        inner white-space. If not, emit comment or white-space as standalone.
        It's OK that this might parse whitespace more than once.
        Comments are associated with entities if they're not separated by
        blank lines. Multiple consecutive comments are joined.
        '''
        # Junk starts where this fragment started, not after a comment
        # or white-space that was matched on the way.
        junk_offset = offset
        m = self.reComment.match(ctx.contents, offset)
        if m:
            current_comment = self.Comment(ctx, m.span())
            if offset < 2 and 'License' in current_comment.val:
                # Heuristic. An early comment with "License" is probably
                # a license header, and should be standalone.
                # Not glueing ourselves to offset == 0 as we might have
                # skipped a BOM.
                return current_comment
            offset = m.end()
        else:
            current_comment = None
        m = self.reWhitespace.match(ctx.contents, offset)
        if m:
            white_space = Whitespace(ctx, m.span())
            offset = m.end()
            if (
                current_comment is not None
                and white_space.raw_val.count('\n') > 1
            ):
                # standalone comment
                # return the comment, and reparse the whitespace next time
                return current_comment
            if current_comment is None:
                return white_space
        else:
            white_space = None
        m = self.reKey.match(ctx.contents, offset)
        if m:
            try:
                return self.createEntity(ctx, m, current_comment, white_space)
            except BadEntity:
                # fall through to Junk, probably
                pass
        if current_comment is not None:
            return current_comment
        if white_space is not None:
            return white_space
        return self.getJunk(ctx, junk_offset, self.reKey, self.reComment)

    def getJunk(self, ctx, offset, *expressions):
        """Create a Junk from offset up to the earliest following match of
        any of the expressions, or up to the end of the content.
        """
        junkend = None
        for exp in expressions:
            m = exp.search(ctx.contents, offset)
            if m:
                # Truthiness is on purpose: a match at position 0 counts
                # as "no end found yet".
                junkend = min(junkend, m.start()) if junkend else m.start()
        return Junk(ctx, (offset, junkend or len(ctx.contents)))

    def createEntity(self, ctx, m, current_comment, white_space):
        """Create an Entity from the reKey match m, spanning the whole
        match, with the ``key`` and ``val`` groups as key and value.
        """
        return Entity(
            ctx, current_comment, white_space,
            m.span(), m.span('key'), m.span('val')
        )

    @classmethod
    def findDuplicates(cls, entities):
        """Yield a message for each key that occurs more than once."""
        found = Counter(entity.key for entity in entities)
        for entity_id, cnt in found.items():
            if cnt > 1:
                yield f'{entity_id} occurs {cnt} times'