expectedtree.py - mozsearch

Enable keyboard shortcuts

# mypy: allow-untyped-defs

from math import log

from collections import defaultdict

class Node:

    def __init__(self, prop, value):

        self.prop = prop

        self.value = value

        self.parent = None

        self.children = set()

        # Populated for leaf nodes

        self.run_info = set()

        self.result_values = defaultdict(int)

    def add(self, node):

        self.children.add(node)

        node.parent = self

    def __iter__(self):

        yield self

        for node in self.children:

            yield from node

    def __len__(self):

        return 1 + sum(len(item) for item in self.children)

def entropy(results):

    """This is basically a measure of the uniformity of the values in results

    based on the shannon entropy"""

    result_counts = defaultdict(int)

    total = float(len(results))

    for values in results.values():

        # Not sure this is right, possibly want to treat multiple values as

        # distinct from multiple of the same value?

        for value in values:

            result_counts[value] += 1

    entropy_sum = 0

    for count in result_counts.values():

        prop = float(count) / total

        entropy_sum -= prop * log(prop, 2)

    return entropy_sum

def split_results(prop, results):

    """Split a dictionary of results into a dictionary of dictionaries where

    each sub-dictionary has a specific value of the given property"""

    by_prop = defaultdict(dict)

    for run_info, value in results.items():

        by_prop[run_info[prop]][run_info] = value

    return by_prop

def build_tree(properties, dependent_props, results, tree=None):

    """Build a decision tree mapping properties to results

    :param properties: - A list of run_info properties to consider

                         in the tree

    :param dependent_props: - A dictionary mapping property name

                              to properties that should only be considered

                              after the properties in the key. For example

                              {"os": ["version"]} means that "version" won't

                              be used until after os.

    :param results: Dictionary mapping run_info to set of results

    :tree: A Node object to use as the root of the (sub)tree"""

    if tree is None:

        tree = Node(None, None)

    prop_index = {prop: i for i, prop in enumerate(properties)}

    all_results = defaultdict(int)

    for result_values in results.values():

        for result_value, count in result_values.items():

            all_results[result_value] += count

    # If there is only one result we are done

    if not properties or len(all_results) == 1:

        for value, count in all_results.items():

            tree.result_values[value] += count

        tree.run_info |= set(results.keys())

        return tree

    results_partitions = []

    remove_properties = set()

    for prop in properties:

        result_sets = split_results(prop, results)

        if len(result_sets) == 1:

            # If this property doesn't partition the space then just remove it

            # from the set to consider

            remove_properties.add(prop)

            continue

        new_entropy = 0.

        results_sets_entropy = []

        for prop_value, result_set in result_sets.items():

            results_sets_entropy.append((entropy(result_set), prop_value, result_set))

            new_entropy += (float(len(result_set)) / len(results)) * results_sets_entropy[-1][0]

        results_partitions.append((new_entropy,

                                   prop,

                                   results_sets_entropy))

    # In the case that no properties partition the space

    if not results_partitions:

        for value, count in all_results.items():

            tree.result_values[value] += count

        tree.run_info |= set(results.keys())

        return tree

    # split by the property with the highest entropy

    results_partitions.sort(key=lambda x: (x[0], prop_index[x[1]]))

    _, best_prop, sub_results = results_partitions[0]

    # Create a new set of properties that can be used

    new_props = properties[:prop_index[best_prop]] + properties[prop_index[best_prop] + 1:]

    new_props.extend(dependent_props.get(best_prop, []))

    if remove_properties:

        new_props = [item for item in new_props if item not in remove_properties]

    for _, prop_value, results_sets in sub_results:

        node = Node(best_prop, prop_value)

        tree.add(node)

        build_tree(new_props, dependent_props, results_sets, node)

    return tree