# merge.py - mozsearch

# Enable keyboard shortcuts

# This Source Code Form is subject to the terms of the Mozilla Public

# License, v. 2.0. If a copy of the MPL was not distributed with this

# file, You can obtain one at http://mozilla.org/MPL/2.0/.

'''Merge resources across channels.

Merging resources is done over a series of parsed resources, or source

strings.

The nomenclature is that the resources are ordered from newest to oldest.

The generated file structure is taken from the newest file, and then the

next-newest, etc. The values of the returned entities are taken from the

newest to the oldest resource, too.

In merge_resources, there's an option to choose the values from oldest

to newest instead.

'''

from collections import OrderedDict, defaultdict

from codecs import encode

from functools import reduce

from compare_locales import parser as cl

from compare_locales.parser.base import StickyEntry

from compare_locales.compare.utils import AddRemove

class MergeNotSupportedError(ValueError):
    '''Raised by merge_channels when no parser is found for a file name.'''

def merge_channels(name, resources):
    '''Merge several versions of one file and return the encoded result.

    `name` is handed to `cl.getParser` to pick the parser for the file
    format. `resources` is forwarded unchanged to `merge_resources`, which
    expects it to be ordered from newest to oldest.

    The merged entities are serialized with `serialize_legacy_resource`
    and encoded with the parser's `encoding`.

    Raises MergeNotSupportedError if `cl.getParser` raises UserWarning
    for `name`.
    '''
    try:
        parser = cl.getParser(name)
    except UserWarning as err:
        # Chain explicitly so the traceback reports the parser lookup
        # failure as the cause, not as an unrelated error during handling.
        raise MergeNotSupportedError(
            f'Unsupported file format ({name}).') from err

    entities = merge_resources(parser, resources)
    return encode(serialize_legacy_resource(entities), parser.encoding)

def merge_resources(parser, resources, keep_newest=True):
    '''Merge parsed or unparsed resources into one sequence of entities.

    `resources` is ordered from newest to oldest. Each item is either a
    `bytes` object, which is parsed with `parser`, or an iterable of
    already parsed entities.

    The layout of the result follows the newest resource first and is then
    filled in from the next one, and so on. Values also come from the
    newest resource, unless `keep_newest` is False, in which case the
    oldest value wins.

    Returns the values view of the merged OrderedDict. An empty
    `resources` raises TypeError, because the fold has no initial value.
    '''
    def keyed(entity, seen_comments):
        # Build the (key, entity) pair used for the ordered mapping.
        if isinstance(entity, cl.Comment):
            # Identical comments are told apart by their running count;
            # AddRemove will de-duplicate identical comments that share
            # the same (value, index) key.
            seen_comments[entity.val] += 1
            occurrence = seen_comments[entity.val]
            return ((entity.val, occurrence), entity)
        if isinstance(entity, cl.Whitespace):
            # The instance itself is the key, so every whitespace entry is
            # unique. Adjacent whitespace is folded into the longer one
            # later, when merge_two prunes its result.
            return (entity, entity)
        return (entity.key, entity)

    def to_ordered(resource):
        # Counts how often each identical comment was seen in this resource.
        seen_comments = defaultdict(int)
        if isinstance(resource, bytes):
            parser.readContents(resource)
            parsed = parser.walk()
        else:
            parsed = resource
        return OrderedDict(
            keyed(entity, seen_comments) for entity in parsed)

    def fold(merged, older):
        return merge_two(merged, older, keep_newer=keep_newest)

    merged = reduce(fold, (to_ordered(item) for item in resources))
    return merged.values()

def merge_two(newer, older, keep_newer=True):
    '''Merge two OrderedDicts of entities into a new OrderedDict.

    The keys of `newer` and `older` are fed to AddRemove as the left and
    right side, and the result follows the key order AddRemove yields.

    For keys present in both dicts the entity from `newer` is used by
    default. With `keep_newer` set to False, the entity from `older` is
    used instead (see get_older_entity for the exception).
    '''
    diff = AddRemove()
    diff.set_left(newer.keys())
    diff.set_right(older.keys())

    pick = get_newer_entity if keep_newer else get_older_entity

    kept = []
    # Iterating AddRemove gives 2-tuples; only the key part is needed here.
    for _, key in diff:
        entity = pick(newer, older, key)
        if entity is None:
            # None stands for a duplicated comment; leave it out.
            continue
        if kept and isinstance(entity, cl.Whitespace):
            last_entity = kept[-1][1]
            if isinstance(last_entity, cl.Whitespace):
                # Two whitespace entries in a row collapse into one,
                # keeping whichever has the longer content.
                if len(entity.all) > len(last_entity.all):
                    kept[-1] = (entity, entity)
                continue
        kept.append((key, entity))

    return OrderedDict(kept)

def get_newer_entity(newer, older, key):
    '''Return the entity for `key`, preferring `newer` over `older`.

    Falls back to `older` only when `newer` has no entry for `key` or
    stores None for it. Returns None if neither mapping has an entity.
    '''
    candidate = newer.get(key)
    return older.get(key) if candidate is None else candidate

def get_older_entity(newer, older, key):
    '''Return the entity for `key`, preferring `older` over `newer`.

    The entry from `newer` is used instead when `older` has no entity for
    `key` (or stores None), or when the older entity is a StickyEntry.
    '''
    candidate = older.get(key)
    if candidate is not None and not isinstance(candidate, StickyEntry):
        return candidate
    # No usable older version: fall back to whatever newer has.
    return newer.get(key)

def serialize_legacy_resource(entities):
    '''Concatenate the `all` attribute of each entity into one string.'''
    chunks = [item.all for item in entities]
    return "".join(chunks)