Revision control

Copy as Markdown

Other Tools

/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
/**
* Given a list of strings and a corresponding map of items that those strings
* correspond to, build a suffix tree.
*/
export function MultiSuffixTree(aStrings, aItems) {
if (aStrings.length != aItems.length) {
throw new Error("Array lengths need to be the same.");
}
let s = "";
const offsetsToItems = [];
let lastLength = 0;
for (let i = 0; i < aStrings.length; i++) {
s += aStrings[i];
offsetsToItems.push(lastLength, s.length, aItems[i]);
lastLength = s.length;
}
this._construct(s);
this._offsetsToItems = offsetsToItems;
this._numItems = aItems.length;
}
/**
* @class
*/
function State(aStartIndex, aEndIndex, aSuffix) {
this.start = aStartIndex;
this.end = aEndIndex;
this.suffix = aSuffix;
}
/**
* Since objects are basically hash-tables anyways, we simply create an
* attribute whose name is the first letter of the edge string. (So, the
* edge string can conceptually be a multi-letter string, but since we would
* split it were there any ambiguity, it's okay to just use the single letter.)
* This avoids having to update the attribute name or worry about tripping our
* implementation up.
*/
State.prototype = {
get isExplicit() {
// our end is not inclusive...
return this.end <= this.start;
},
get isImplicit() {
// our end is not inclusive...
return this.end > this.start;
},
get length() {
return this.end - this.start;
},
toString() {
return (
"[Start: " +
this.start +
" End: " +
this.end +
(this.suffix ? " non-null suffix]" : " null suffix]")
);
},
};
/**
* Suffix tree implemented using Ukkonen's algorithm.
*
* @class
*/
export function SuffixTree(aStr) {
this._construct(aStr);
}
/**
* States are
*/
SuffixTree.prototype = {
/**
* Find all items matching the provided substring.
*/
findMatches(aSubstring) {
const results = [];
let state = this._root;
let index = 0;
const end = aSubstring.length;
while (index < end) {
state = state[aSubstring[index]];
// bail if there was no edge
if (state === undefined) {
return results;
}
// bail if the portion of the edge we traversed is not equal to that
// portion of our pattern
const actualTraverseLength = Math.min(state.length, end - index);
if (
this._str.substring(state.start, state.start + actualTraverseLength) !=
aSubstring.substring(index, index + actualTraverseLength)
) {
return results;
}
index += state.length;
}
// state should now be the node which itself and all its children match...
// The delta is to adjust us to the offset of the last letter of our match;
// the edge we traversed to get here may have found us traversing more
// than we wanted.
// index - end captures the over-shoot of the edge traversal,
// index - end + 1 captures the fact that we want to find the last letter
// that matched, not just the first letter beyond it
// However, if this state is a leaf node (end == 'infinity'), then 'end'
// isn't describing an edge at all and we want to avoid accounting for it.
/*
if (state.end != this._infinity)
//delta = index - end + 1;
delta = end - (index - state.length);
else */
const delta = index - state.length - end + 1;
this._resultGather(state, results, {}, end, delta, true);
return results;
},
_resultGather(aState, aResults, aPresence, aPatLength, aDelta) {
// find the item that this state originated from based on the state's
// start character. offsetToItem holds [string start index, string end
// index (exclusive), item reference]. So we want to binary search to
// find the string whose start/end index contains the state's start index.
let low = 0;
let high = this._numItems - 1;
let mid, stringStart, stringEnd;
const patternLast = aState.start - aDelta;
while (low <= high) {
mid = low + Math.floor((high - low) / 2); // excessive, especially with js nums
stringStart = this._offsetsToItems[mid * 3];
const startDelta = stringStart - patternLast;
stringEnd = this._offsetsToItems[mid * 3 + 1];
const endDelta = stringEnd - patternLast;
if (startDelta > 0) {
high = mid - 1;
} else if (endDelta <= 0) {
low = mid + 1;
} else {
break;
}
}
// - The match occurred completely inside a source string. Success.
// - The match spans more than one source strings, and is therefore not
// a match.
// at this point, we have located the origin string that corresponds to the
// start index of this state.
// - The match terminated with the end of the preceding string, and does
// not match us at all. We, and potentially our children, are merely
// serving as a unique terminal.
// - The
const patternFirst = patternLast - (aPatLength - 1);
if (patternFirst >= stringStart) {
if (!(stringStart in aPresence)) {
aPresence[stringStart] = true;
aResults.push(this._offsetsToItems[mid * 3 + 2]);
}
}
// bail if we had it coming OR
// if the result terminates at/part-way through this state, meaning any
// of its children are not going to be actual results, just hangers
// on.
/*
if (bail || (end <= aState.end)) {
dump(" bailing! (bail was: " + bail + ")\n");
return;
}
*/
// process our children...
for (const key in aState) {
// edges have attributes of length 1...
if (key.length == 1) {
const statePrime = aState[key];
this._resultGather(
statePrime,
aResults,
aPresence,
aPatLength,
aDelta + aState.length, // (alreadyAdjusted ? 0 : aState.length),
false
);
}
}
},
/**
* Given a reference 'pair' of a state and a string (may be 'empty'=explicit,
* which means no work to do and we return immediately) follow that state
* (and then the successive states)'s transitions until we run out of
* transitions. This happens either when we find an explicit state, or
* find ourselves partially along an edge (conceptually speaking). In
* the partial case, we return the state prior to the edge traversal.
* (The information about the 'edge' is contained on its target State;
* we can do this because a state is only referenced by one other state.)
*/
_canonize(aState, aStart, aEnd) {
if (aEnd <= aStart) {
return [aState, aStart];
}
let statePrime;
// we treat an aState of null as 'bottom', which has transitions for every
// letter in the alphabet to 'root'. rather than create all those
// transitions, we special-case here.
if (aState === null) {
statePrime = this._root;
} else {
statePrime = aState[this._str[aStart]];
}
while (statePrime.length <= aEnd - aStart) {
// (no 1 adjustment required)
aStart += statePrime.length;
aState = statePrime;
if (aStart < aEnd) {
statePrime = aState[this._str[aStart]];
}
}
return [aState, aStart];
},
/**
* Given a reference 'pair' whose state may or may not be explicit (and for
* which we will perform the required splitting to make it explicit), test
* whether it already possesses a transition corresponding to the provided
* character.
*
* @returns A list of: whether we had to make it explicit, the (potentially)
* new explicit state.
*/
_testAndSplit(aState, aStart, aEnd, aChar) {
if (aStart < aEnd) {
// it's not explicit
const statePrime = aState[this._str[aStart]];
const length = aEnd - aStart;
if (aChar == this._str[statePrime.start + length]) {
return [true, aState];
}
// do splitting... aState -> rState -> statePrime
const rState = new State(statePrime.start, statePrime.start + length);
aState[this._str[statePrime.start]] = rState;
statePrime.start += length;
rState[this._str[statePrime.start]] = statePrime;
return [false, rState];
}
// it's already explicit
if (aState === null) {
// bottom case... shouldn't happen, but hey.
return [true, aState];
}
return [aChar in aState, aState];
},
_update(aState, aStart, aIndex) {
let oldR = this._root;
const textAtIndex = this._str[aIndex]; // T sub i (0-based corrected...)
// because of the way we store the 'end' value as a one-past form, we do
// not need to subtract 1 off of aIndex.
let [endPoint, rState] = this._testAndSplit(
aState,
aStart,
aIndex, // no -1
textAtIndex
);
while (!endPoint) {
const rPrime = new State(aIndex, this._infinity);
rState[textAtIndex] = rPrime;
if (oldR !== this._root) {
oldR.suffix = rState;
}
oldR = rState;
[aState, aStart] = this._canonize(aState.suffix, aStart, aIndex); // no -1
[endPoint, rState] = this._testAndSplit(
aState,
aStart,
aIndex, // no -1
textAtIndex
);
}
if (oldR !== this._root) {
oldR.suffix = aState;
}
return [aState, aStart];
},
_construct(aStr) {
this._str = aStr;
// just needs to be longer than the string.
this._infinity = aStr.length + 1;
// this._bottom = new State(0, -1, null);
this._root = new State(-1, 0, null); // null === bottom
let state = this._root;
let start = 0;
for (let i = 0; i < aStr.length; i++) {
[state, start] = this._update(state, start, i); // treat as flowing -1...
[state, start] = this._canonize(state, start, i + 1); // 1-length string
}
},
dump(aState, aIndent, aKey) {
if (aState === undefined) {
aState = this._root;
}
if (aIndent === undefined) {
aIndent = "";
aKey = ".";
}
if (aState.isImplicit) {
let snip;
if (aState.length > 10) {
snip =
this._str.slice(
aState.start,
Math.min(aState.start + 10, this._str.length)
) + "...";
} else {
snip = this._str.slice(
aState.start,
Math.min(aState.end, this._str.length)
);
}
dump(
aIndent +
aKey +
":" +
snip +
"(" +
aState.start +
":" +
aState.end +
")\n"
);
} else {
dump(
aIndent +
aKey +
": (explicit:" +
aState.start +
":" +
aState.end +
")\n"
);
}
const nextIndent = aIndent + " ";
const keys = Object.keys(aState).filter(c => c.length == 1);
for (const key of keys) {
this.dump(aState[key], nextIndent, key);
}
},
};
MultiSuffixTree.prototype = SuffixTree.prototype;