SuffixTree.sys.mjs

comm-central/mailnews/db/gloda/modules/SuffixTree.sys.mjs (file symbol)

Enable keyboard shortcuts

Revision control

Copy as Markdown

Other Tools

HG Web

/* This Source Code Form is subject to the terms of the Mozilla Public

 * License, v. 2.0. If a copy of the MPL was not distributed with this

 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/**

 * Given a list of strings and a corresponding map of items that those strings

 *  correspond to, build a suffix tree.

*/

export function MultiSuffixTree(aStrings, aItems) {

  if (aStrings.length != aItems.length) {

    throw new Error("Array lengths need to be the same.");

  let s = "";

  const offsetsToItems = [];

  let lastLength = 0;

  for (let i = 0; i < aStrings.length; i++) {

    s += aStrings[i];

    offsetsToItems.push(lastLength, s.length, aItems[i]);

    lastLength = s.length;

  this._construct(s);

  this._offsetsToItems = offsetsToItems;

  this._numItems = aItems.length;

/**

 * @class

*/

function State(aStartIndex, aEndIndex, aSuffix) {

  this.start = aStartIndex;

  this.end = aEndIndex;

  this.suffix = aSuffix;

/**

 * Since objects are basically hash-tables anyways, we simply create an

 *  attribute whose name is the first letter of the edge string.  (So, the

 *  edge string can conceptually be a multi-letter string, but since we would

 *  split it were there any ambiguity, it's okay to just use the single letter.)

 *  This avoids having to update the attribute name or worry about tripping our

 *  implementation up.

*/

State.prototype = {

  get isExplicit() {

    // our end is not inclusive...

    return this.end <= this.start;

},

  get isImplicit() {

    // our end is not inclusive...

    return this.end > this.start;

},

  get length() {

    return this.end - this.start;

},

  toString() {

    return (

      "[Start: " +

      this.start +

      " End: " +

      this.end +

      (this.suffix ? " non-null suffix]" : " null suffix]")

);

},

};

/**

 * Suffix tree implemented using Ukkonen's algorithm.

 * @class

*/

export function SuffixTree(aStr) {

  this._construct(aStr);

/**

 * States are

*/

SuffixTree.prototype = {

/**

   * Find all items matching the provided substring.

*/

  findMatches(aSubstring) {

    const results = [];

    let state = this._root;

    let index = 0;

    const end = aSubstring.length;

    while (index < end) {

      state = state[aSubstring[index]];

      // bail if there was no edge

      if (state === undefined) {

        return results;

      // bail if the portion of the edge we traversed is not equal to that

      //  portion of our pattern

      const actualTraverseLength = Math.min(state.length, end - index);

      if (

        this._str.substring(state.start, state.start + actualTraverseLength) !=

        aSubstring.substring(index, index + actualTraverseLength)

) {

        return results;

      index += state.length;

    // state should now be the node which itself and all its children match...

    // The delta is to adjust us to the offset of the last letter of our match;

    //  the edge we traversed to get here may have found us traversing more

    //  than we wanted.

    // index - end captures the over-shoot of the edge traversal,

    // index - end + 1 captures the fact that we want to find the last letter

    //  that matched, not just the first letter beyond it

    // However, if this state is a leaf node (end == 'infinity'), then 'end'

    //  isn't describing an edge at all and we want to avoid accounting for it.

/*

    if (state.end != this._infinity)

      //delta = index - end + 1;

      delta = end - (index - state.length);

    else */

    const delta = index - state.length - end + 1;

    this._resultGather(state, results, {}, end, delta, true);

    return results;

},

  _resultGather(aState, aResults, aPresence, aPatLength, aDelta) {

    // find the item that this state originated from based on the state's

    //  start character.  offsetToItem holds [string start index, string end

    //  index (exclusive), item reference].  So we want to binary search to

    //  find the string whose start/end index contains the state's start index.

    let low = 0;

    let high = this._numItems - 1;

    let mid, stringStart, stringEnd;

    const patternLast = aState.start - aDelta;

    while (low <= high) {

      mid = low + Math.floor((high - low) / 2); // excessive, especially with js nums

      stringStart = this._offsetsToItems[mid * 3];

      const startDelta = stringStart - patternLast;

      stringEnd = this._offsetsToItems[mid * 3 + 1];

      const endDelta = stringEnd - patternLast;

      if (startDelta > 0) {

        high = mid - 1;

      } else if (endDelta <= 0) {

        low = mid + 1;

      } else {

        break;

    // - The match occurred completely inside a source string.  Success.

    // - The match spans more than one source strings, and is therefore not

    //   a match.

    // at this point, we have located the origin string that corresponds to the

    //  start index of this state.

    // - The match terminated with the end of the preceding string, and does

    //   not match us at all.  We, and potentially our children, are merely

    //   serving as a unique terminal.

    // - The

    const patternFirst = patternLast - (aPatLength - 1);

    if (patternFirst >= stringStart) {

      if (!(stringStart in aPresence)) {

        aPresence[stringStart] = true;

        aResults.push(this._offsetsToItems[mid * 3 + 2]);

    // bail if we had it coming OR

    // if the result terminates at/part-way through this state, meaning any

    //  of its children are not going to be actual results, just hangers

    //  on.

/*

    if (bail || (end <= aState.end)) {

dump("  bailing! (bail was: " + bail + ")\n");

      return;

*/

    // process our children...

    for (const key in aState) {

      // edges have attributes of length 1...

      if (key.length == 1) {

        const statePrime = aState[key];

        this._resultGather(

          statePrime,

          aResults,

          aPresence,

          aPatLength,

          aDelta + aState.length, // (alreadyAdjusted ? 0 : aState.length),

          false

);

},

/**

   * Given a reference 'pair' of a state and a string (may be 'empty'=explicit,

   *  which means no work to do and we return immediately) follow that state

   *  (and then the successive states)'s transitions until we run out of

   *  transitions.  This happens either when we find an explicit state, or

   *  find ourselves partially along an edge (conceptually speaking).  In

   *  the partial case, we return the state prior to the edge traversal.

   * (The information about the 'edge' is contained on its target State;

   *  we can do this because a state is only referenced by one other state.)

*/

  _canonize(aState, aStart, aEnd) {

    if (aEnd <= aStart) {

      return [aState, aStart];

    let statePrime;

    // we treat an aState of null as 'bottom', which has transitions for every

    //  letter in the alphabet to 'root'.  rather than create all those

    //  transitions, we special-case here.

    if (aState === null) {

      statePrime = this._root;

    } else {

      statePrime = aState[this._str[aStart]];

    while (statePrime.length <= aEnd - aStart) {

      // (no 1 adjustment required)

      aStart += statePrime.length;

      aState = statePrime;

      if (aStart < aEnd) {

        statePrime = aState[this._str[aStart]];

    return [aState, aStart];

},

/**

   * Given a reference 'pair' whose state may or may not be explicit (and for

   *  which we will perform the required splitting to make it explicit), test

   *  whether it already possesses a transition corresponding to the provided

   *  character.

   * @returns A list of: whether we had to make it explicit, the (potentially)

   *    new explicit state.

*/

  _testAndSplit(aState, aStart, aEnd, aChar) {

    if (aStart < aEnd) {

      // it's not explicit

      const statePrime = aState[this._str[aStart]];

      const length = aEnd - aStart;

      if (aChar == this._str[statePrime.start + length]) {

        return [true, aState];

      // do splitting... aState -> rState -> statePrime

      const rState = new State(statePrime.start, statePrime.start + length);

      aState[this._str[statePrime.start]] = rState;

      statePrime.start += length;

      rState[this._str[statePrime.start]] = statePrime;

      return [false, rState];

    // it's already explicit

    if (aState === null) {

      // bottom case... shouldn't happen, but hey.

      return [true, aState];

    return [aChar in aState, aState];

},

  _update(aState, aStart, aIndex) {

    let oldR = this._root;

    const textAtIndex = this._str[aIndex]; // T sub i (0-based corrected...)

    // because of the way we store the 'end' value as a one-past form, we do

    //  not need to subtract 1 off of aIndex.

    let [endPoint, rState] = this._testAndSplit(

      aState,

      aStart,

      aIndex, // no -1

      textAtIndex

);

    while (!endPoint) {

      const rPrime = new State(aIndex, this._infinity);

      rState[textAtIndex] = rPrime;

      if (oldR !== this._root) {

        oldR.suffix = rState;

      oldR = rState;

      [aState, aStart] = this._canonize(aState.suffix, aStart, aIndex); // no -1

      [endPoint, rState] = this._testAndSplit(

        aState,

        aStart,

        aIndex, // no -1

        textAtIndex

);

    if (oldR !== this._root) {

      oldR.suffix = aState;

    return [aState, aStart];

},

  _construct(aStr) {

    this._str = aStr;

    // just needs to be longer than the string.

    this._infinity = aStr.length + 1;

    // this._bottom = new State(0, -1, null);

    this._root = new State(-1, 0, null); // null === bottom

    let state = this._root;

    let start = 0;

    for (let i = 0; i < aStr.length; i++) {

      [state, start] = this._update(state, start, i); // treat as flowing -1...

      [state, start] = this._canonize(state, start, i + 1); // 1-length string

},

  dump(aState, aIndent, aKey) {

    if (aState === undefined) {

      aState = this._root;

    if (aIndent === undefined) {

      aIndent = "";

      aKey = ".";

    if (aState.isImplicit) {

      let snip;

      if (aState.length > 10) {

        snip =

          this._str.slice(

            aState.start,

            Math.min(aState.start + 10, this._str.length)

          ) + "...";

      } else {

        snip = this._str.slice(

          aState.start,

          Math.min(aState.end, this._str.length)

);

      dump(

        aIndent +

          aKey +

          ":" +

          snip +

          "(" +

          aState.start +

          ":" +

          aState.end +

          ")\n"

);

    } else {

      dump(

        aIndent +

          aKey +

          ": (explicit:" +

          aState.start +

          ":" +

          aState.end +

          ")\n"

);

    const nextIndent = aIndent + "  ";

    const keys = Object.keys(aState).filter(c => c.length == 1);

    for (const key of keys) {

      this.dump(aState[key], nextIndent, key);

},

};

MultiSuffixTree.prototype = SuffixTree.prototype;